Refactor review pipeline around normalized items

This commit is contained in:
ben
2026-03-20 11:27:46 -04:00
parent 607c51038a
commit 9104781b93
6 changed files with 512 additions and 361 deletions

View File

@@ -3,7 +3,6 @@ from pathlib import Path
import click
import build_observed_products
import build_purchases
import review_products
from layer_helpers import read_csv_rows, write_csv_rows
@@ -29,33 +28,36 @@ def build_status_summary(
purchases,
resolutions,
):
enriched_rows = giant_enriched + costco_enriched
observed_rows = build_observed_products.build_observed_products(enriched_rows)
normalized_rows = giant_enriched + costco_enriched
queue_rows = review_products.build_review_queue(purchases, resolutions)
queue_ids = {row["normalized_item_id"] for row in queue_rows}
unresolved_purchase_rows = [
row
for row in purchases
if row.get("observed_product_id")
and not row.get("canonical_product_id")
if row.get("normalized_item_id")
and not row.get("catalog_id")
and row.get("is_fee") != "true"
and row.get("is_discount_line") != "true"
and row.get("is_coupon_line") != "true"
]
excluded_rows = [
row
for row in purchases
if row.get("resolution_action") == "exclude"
]
linked_purchase_rows = [row for row in purchases if row.get("canonical_product_id")]
excluded_rows = [row for row in purchases if row.get("resolution_action") == "exclude"]
linked_purchase_rows = [row for row in purchases if row.get("catalog_id")]
distinct_normalized_items = {
row["normalized_item_id"] for row in normalized_rows if row.get("normalized_item_id")
}
linked_normalized_items = {
row["normalized_item_id"] for row in purchases if row.get("normalized_item_id") and row.get("catalog_id")
}
summary = [
{"stage": "raw_orders", "count": len(giant_orders) + len(costco_orders)},
{"stage": "raw_items", "count": len(giant_items) + len(costco_items)},
{"stage": "enriched_items", "count": len(enriched_rows)},
{"stage": "observed_products", "count": len(observed_rows)},
{"stage": "review_queue_observed_products", "count": len(queue_rows)},
{"stage": "canonical_linked_purchase_rows", "count": len(linked_purchase_rows)},
{"stage": "normalized_items", "count": len(normalized_rows)},
{"stage": "distinct_normalized_items", "count": len(distinct_normalized_items)},
{"stage": "review_queue_normalized_items", "count": len(queue_rows)},
{"stage": "linked_normalized_items", "count": len(linked_normalized_items)},
{"stage": "linked_purchase_rows", "count": len(linked_purchase_rows)},
{"stage": "final_purchase_rows", "count": len(purchases)},
{"stage": "unresolved_purchase_rows", "count": len(unresolved_purchase_rows)},
{"stage": "excluded_purchase_rows", "count": len(excluded_rows)},
@@ -65,8 +67,7 @@ def build_status_summary(
[
row
for row in unresolved_purchase_rows
if row.get("observed_product_id")
not in {queue_row["observed_product_id"] for queue_row in queue_rows}
if row.get("normalized_item_id") not in queue_ids
]
),
},
@@ -105,7 +106,7 @@ def main(
read_rows_if_exists(costco_items_csv),
read_rows_if_exists(costco_enriched_csv),
read_rows_if_exists(purchases_csv),
read_rows_if_exists(resolutions_csv),
[build_purchases.normalize_resolution_row(row) for row in read_rows_if_exists(resolutions_csv)],
)
write_csv_rows(summary_csv, summary_rows, SUMMARY_FIELDS)
summary_json_path = Path(summary_json)