diff --git a/README.md b/README.md index 41cb5a2..09dcd8a 100644 --- a/README.md +++ b/README.md @@ -6,21 +6,15 @@ Run each script step-by-step from the terminal. ## What It Does -1. `scrape_giant.py`: download Giant orders and items -2. `enrich_giant.py`: normalize Giant line items -3. `scrape_costco.py`: download Costco orders and items -4. `enrich_costco.py`: normalize Costco line items +1. `collect_giant_web.py`: download Giant orders and items +2. `normalize_giant_web.py`: normalize Giant line items +3. `collect_costco_web.py`: download Costco orders and items +4. `normalize_costco_web.py`: normalize Costco line items 5. `build_purchases.py`: combine retailer outputs into one purchase table 6. `review_products.py`: review unresolved product matches in the terminal 7. `report_pipeline_status.py`: show how many rows survive each stage 8. `analyze_purchases.py`: write chart-ready analysis CSVs from the purchase table -Active refactor entrypoints: -- `collect_giant_web.py` -- `collect_costco_web.py` -- `normalize_giant_web.py` -- `normalize_costco_web.py` - ## Requirements - Python 3.10+ @@ -65,13 +59,20 @@ data/ collected_items.csv normalized_items.csv review/ + catalog.csv review_queue.csv review_resolutions.csv product_links.csv - purchases.csv pipeline_status.csv pipeline_status.json - catalog.csv + analysis/ + purchases.csv + comparison_examples.csv + item_price_over_time.csv + spend_by_visit.csv + items_per_visit.csv + category_spend_over_time.csv + retailer_store_breakdown.csv ``` ## Run Order @@ -122,21 +123,21 @@ Costco: - `data/costco-web/normalized_items.csv` preserves raw totals and matched net discount fields Combined: -- `data/review/purchases.csv` -- `data/review/analysis/item_price_over_time.csv` -- `data/review/analysis/spend_by_visit.csv` -- `data/review/analysis/items_per_visit.csv` -- `data/review/analysis/category_spend_over_time.csv` -- `data/review/analysis/retailer_store_breakdown.csv` +- `data/analysis/purchases.csv` +- `data/analysis/comparison_examples.csv` +- `data/analysis/item_price_over_time.csv` +- `data/analysis/spend_by_visit.csv` +- `data/analysis/items_per_visit.csv` +- `data/analysis/category_spend_over_time.csv` +- `data/analysis/retailer_store_breakdown.csv` - `data/review/review_queue.csv` - `data/review/review_resolutions.csv` - `data/review/product_links.csv` -- `data/review/comparison_examples.csv` - `data/review/pipeline_status.csv` - `data/review/pipeline_status.json` -- `data/catalog.csv` +- `data/review/catalog.csv` -`data/review/purchases.csv` is the main analysis artifact. It is designed to support both: +`data/analysis/purchases.csv` is the main analysis artifact. It is designed to support both: - item-level price analysis - visit-level analysis such as spend by visit, items per visit, category spend by visit, and retailer/store breakdown @@ -164,9 +165,7 @@ The review step is intentionally conservative: ## Notes - This project is designed around fragile retailer scraping flows, so the code favors explicit retailer-specific steps over heavy abstraction. -- `scrape_giant.py`, `scrape_costco.py`, `enrich_giant.py`, and `enrich_costco.py` are now legacy-compatible entrypoints; prefer the `collect_*` and `normalize_*` scripts for active work. - Costco discount rows are preserved for auditability and also matched back to purchased items during enrichment. -- `validate_cross_retailer_flow.py` is a proof/check script, not a required production step. ## Test diff --git a/analyze_purchases.py b/analyze_purchases.py index 88676ea..a8e5846 100644 --- a/analyze_purchases.py +++ b/analyze_purchases.py @@ -241,8 +241,8 @@ def build_retailer_store_rows(purchase_rows): @click.command() -@click.option("--purchases-csv", default="data/review/purchases.csv", show_default=True) -@click.option("--output-dir", default="data/review/analysis", show_default=True) +@click.option("--purchases-csv", default="data/analysis/purchases.csv", show_default=True) +@click.option("--output-dir", default="data/analysis", show_default=True) def main(purchases_csv, output_dir): purchase_rows = read_csv_rows(purchases_csv) output_path = Path(output_dir) diff --git a/build_canonical_layer.py b/build_canonical_layer.py deleted file mode 100644 index d16addf..0000000 --- a/build_canonical_layer.py +++ /dev/null @@ -1,220 +0,0 @@ -import click -import re - -from layer_helpers import read_csv_rows, representative_value, stable_id, write_csv_rows - - -CANONICAL_FIELDS = [ - "canonical_product_id", - "canonical_name", - "product_type", - "brand", - "variant", - "size_value", - "size_unit", - "pack_qty", - "measure_type", - "normalized_quantity", - "normalized_quantity_unit", - "notes", - "created_at", - "updated_at", -] - -CANONICAL_DROP_TOKENS = {"CT", "COUNT", "COUNTS", "DOZ", "DOZEN", "DOZ.", "PACK"} - -LINK_FIELDS = [ - "observed_product_id", - "canonical_product_id", - "link_method", - "link_confidence", - "review_status", - "reviewed_by", - "reviewed_at", - "link_notes", -] - - -def to_float(value): - try: - return float(value) - except (TypeError, ValueError): - return None - - -def normalized_quantity(row): - size_value = to_float(row.get("representative_size_value")) - pack_qty = to_float(row.get("representative_pack_qty")) or 1.0 - size_unit = row.get("representative_size_unit", "") - measure_type = row.get("representative_measure_type", "") - - if size_value is not None and size_unit: - return format(size_value * pack_qty, "g"), size_unit - - if row.get("representative_pack_qty") and measure_type == "count": - return row["representative_pack_qty"], "count" - - if measure_type == "each": - return "1", "each" - - return "", "" - - -def auto_link_rule(observed_row): - if ( - observed_row.get("is_fee") == "true" - or observed_row.get("is_discount_line") == "true" - or observed_row.get("is_coupon_line") == "true" - ): - return "", "", "" - - if observed_row.get("representative_upc"): - return ( - "exact_upc", - f"upc={observed_row['representative_upc']}", - "high", - ) - - if ( - observed_row.get("representative_name_norm") - and observed_row.get("representative_size_value") - and observed_row.get("representative_size_unit") - ): - return ( - "exact_name_size", - "|".join( - [ - f"name={observed_row['representative_name_norm']}", - f"size={observed_row['representative_size_value']}", - f"unit={observed_row['representative_size_unit']}", - f"pack={observed_row['representative_pack_qty']}", - f"measure={observed_row['representative_measure_type']}", - ] - ), - "high", - ) - - return "", "", "" - - -def clean_canonical_name(name): - tokens = [] - for token in re.sub(r"[^A-Z0-9\s]", " ", (name or "").upper()).split(): - if token.isdigit(): - continue - if token in CANONICAL_DROP_TOKENS: - continue - if re.fullmatch(r"\d+(?:PK|PACK)", token): - continue - if re.fullmatch(r"\d+DZ", token): - continue - tokens.append(token) - return " ".join(tokens).strip() - - -def canonical_row_for_group(canonical_product_id, group_rows, link_method): - quantity_value, quantity_unit = normalized_quantity( - { - "representative_size_value": representative_value( - group_rows, "representative_size_value" - ), - "representative_size_unit": representative_value( - group_rows, "representative_size_unit" - ), - "representative_pack_qty": representative_value( - group_rows, "representative_pack_qty" - ), - "representative_measure_type": representative_value( - group_rows, "representative_measure_type" - ), - } - ) - return { - "canonical_product_id": canonical_product_id, - "canonical_name": clean_canonical_name( - representative_value(group_rows, "representative_name_norm") - ) - or representative_value(group_rows, "representative_name_norm"), - "product_type": "", - "brand": representative_value(group_rows, "representative_brand"), - "variant": representative_value(group_rows, "representative_variant"), - "size_value": representative_value(group_rows, "representative_size_value"), - "size_unit": representative_value(group_rows, "representative_size_unit"), - "pack_qty": representative_value(group_rows, "representative_pack_qty"), - "measure_type": representative_value(group_rows, "representative_measure_type"), - "normalized_quantity": quantity_value, - "normalized_quantity_unit": quantity_unit, - "notes": f"auto-linked via {link_method}", - "created_at": "", - "updated_at": "", - } - - -def build_canonical_layer(observed_rows): - canonical_rows = [] - link_rows = [] - groups = {} - - for observed_row in sorted(observed_rows, key=lambda row: row["observed_product_id"]): - link_method, group_key, confidence = auto_link_rule(observed_row) - if not group_key: - continue - - canonical_product_id = stable_id("gcan", f"{link_method}|{group_key}") - groups.setdefault(canonical_product_id, {"method": link_method, "rows": []}) - groups[canonical_product_id]["rows"].append(observed_row) - link_rows.append( - { - "observed_product_id": observed_row["observed_product_id"], - "canonical_product_id": canonical_product_id, - "link_method": link_method, - "link_confidence": confidence, - "review_status": "", - "reviewed_by": "", - "reviewed_at": "", - "link_notes": "", - } - ) - - for canonical_product_id, group in sorted(groups.items()): - canonical_rows.append( - canonical_row_for_group( - canonical_product_id, group["rows"], group["method"] - ) - ) - - return canonical_rows, link_rows - - -@click.command() -@click.option( - "--observed-csv", - default="giant_output/products_observed.csv", - show_default=True, - help="Path to observed product rows.", -) -@click.option( - "--canonical-csv", - default="giant_output/products_canonical.csv", - show_default=True, - help="Path to canonical product output.", -) -@click.option( - "--links-csv", - default="giant_output/product_links.csv", - show_default=True, - help="Path to observed-to-canonical link output.", -) -def main(observed_csv, canonical_csv, links_csv): - observed_rows = read_csv_rows(observed_csv) - canonical_rows, link_rows = build_canonical_layer(observed_rows) - write_csv_rows(canonical_csv, canonical_rows, CANONICAL_FIELDS) - write_csv_rows(links_csv, link_rows, LINK_FIELDS) - click.echo( - f"wrote {len(canonical_rows)} canonical rows to {canonical_csv} and " - f"{len(link_rows)} links to {links_csv}" - ) - - -if __name__ == "__main__": - main() diff --git a/build_observed_products.py b/build_observed_products.py deleted file mode 100644 index 807a6a3..0000000 --- a/build_observed_products.py +++ /dev/null @@ -1,172 +0,0 @@ -from collections import defaultdict - -import click - -from layer_helpers import ( - compact_join, - distinct_values, - first_nonblank, - read_csv_rows, - representative_value, - stable_id, - write_csv_rows, -) - - -OUTPUT_FIELDS = [ - "observed_product_id", - "retailer", - "observed_key", - "representative_retailer_item_id", - "representative_upc", - "representative_item_name", - "representative_name_norm", - "representative_brand", - "representative_variant", - "representative_size_value", - "representative_size_unit", - "representative_pack_qty", - "representative_measure_type", - "representative_image_url", - "is_store_brand", - "is_fee", - "is_discount_line", - "is_coupon_line", - "first_seen_date", - "last_seen_date", - "times_seen", - "example_order_id", - "example_item_name", - "raw_name_examples", - "normalized_name_examples", - "example_prices", - "distinct_item_names_count", - "distinct_retailer_item_ids_count", - "distinct_upcs_count", -] - - -def build_observed_key(row): - if row.get("upc"): - return "|".join( - [ - row["retailer"], - f"upc={row['upc']}", - f"name={row['item_name_norm']}", - ] - ) - - if row.get("retailer_item_id"): - return "|".join( - [ - row["retailer"], - f"retailer_item_id={row['retailer_item_id']}", - f"name={row['item_name_norm']}", - f"discount={row.get('is_discount_line', 'false')}", - f"coupon={row.get('is_coupon_line', 'false')}", - ] - ) - - return "|".join( - [ - row["retailer"], - f"name={row['item_name_norm']}", - f"size={row['size_value']}", - f"unit={row['size_unit']}", - f"pack={row['pack_qty']}", - f"measure={row['measure_type']}", - f"store_brand={row['is_store_brand']}", - f"fee={row['is_fee']}", - ] - ) - - -def build_observed_products(rows): - grouped = defaultdict(list) - for row in rows: - grouped[build_observed_key(row)].append(row) - - observed_rows = [] - for observed_key, group_rows in sorted(grouped.items()): - ordered = sorted( - group_rows, - key=lambda row: (row["order_date"], row["order_id"], int(row["line_no"])), - ) - observed_rows.append( - { - "observed_product_id": stable_id("gobs", observed_key), - "retailer": ordered[0]["retailer"], - "observed_key": observed_key, - "representative_retailer_item_id": representative_value( - ordered, "retailer_item_id" - ), - "representative_upc": representative_value(ordered, "upc"), - "representative_item_name": representative_value(ordered, "item_name"), - "representative_name_norm": representative_value( - ordered, "item_name_norm" - ), - "representative_brand": representative_value(ordered, "brand_guess"), - "representative_variant": representative_value(ordered, "variant"), - "representative_size_value": representative_value(ordered, "size_value"), - "representative_size_unit": representative_value(ordered, "size_unit"), - "representative_pack_qty": representative_value(ordered, "pack_qty"), - "representative_measure_type": representative_value( - ordered, "measure_type" - ), - "representative_image_url": first_nonblank(ordered, "image_url"), - "is_store_brand": representative_value(ordered, "is_store_brand"), - "is_fee": representative_value(ordered, "is_fee"), - "is_discount_line": representative_value( - ordered, "is_discount_line" - ), - "is_coupon_line": representative_value(ordered, "is_coupon_line"), - "first_seen_date": ordered[0]["order_date"], - "last_seen_date": ordered[-1]["order_date"], - "times_seen": str(len(ordered)), - "example_order_id": ordered[0]["order_id"], - "example_item_name": ordered[0]["item_name"], - "raw_name_examples": compact_join( - distinct_values(ordered, "item_name"), limit=4 - ), - "normalized_name_examples": compact_join( - distinct_values(ordered, "item_name_norm"), limit=4 - ), - "example_prices": compact_join( - distinct_values(ordered, "line_total"), limit=4 - ), - "distinct_item_names_count": str( - len(distinct_values(ordered, "item_name")) - ), - "distinct_retailer_item_ids_count": str( - len(distinct_values(ordered, "retailer_item_id")) - ), - "distinct_upcs_count": str(len(distinct_values(ordered, "upc"))), - } - ) - - observed_rows.sort(key=lambda row: row["observed_product_id"]) - return observed_rows - - -@click.command() -@click.option( - "--items-enriched-csv", - default="giant_output/items_enriched.csv", - show_default=True, - help="Path to enriched Giant item rows.", -) -@click.option( - "--output-csv", - default="giant_output/products_observed.csv", - show_default=True, - help="Path to observed product output.", -) -def main(items_enriched_csv, output_csv): - rows = read_csv_rows(items_enriched_csv) - observed_rows = build_observed_products(rows) - write_csv_rows(output_csv, observed_rows, OUTPUT_FIELDS) - click.echo(f"wrote {len(observed_rows)} rows to {output_csv}") - - -if __name__ == "__main__": - main() diff --git a/build_purchases.py b/build_purchases.py index 29e7415..7904fd9 100644 --- a/build_purchases.py +++ b/build_purchases.py @@ -440,10 +440,10 @@ def build_comparison_examples(purchase_rows): @click.option("--giant-orders-csv", default="data/giant-web/collected_orders.csv", show_default=True) @click.option("--costco-orders-csv", default="data/costco-web/collected_orders.csv", show_default=True) @click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True) -@click.option("--catalog-csv", default="data/catalog.csv", show_default=True) +@click.option("--catalog-csv", default="data/review/catalog.csv", show_default=True) @click.option("--links-csv", default="data/review/product_links.csv", show_default=True) -@click.option("--output-csv", default="data/review/purchases.csv", show_default=True) -@click.option("--examples-csv", default="data/review/comparison_examples.csv", show_default=True) +@click.option("--output-csv", default="data/analysis/purchases.csv", show_default=True) +@click.option("--examples-csv", default="data/analysis/comparison_examples.csv", show_default=True) def main( giant_items_enriched_csv, costco_items_enriched_csv, diff --git a/build_review_queue.py b/build_review_queue.py deleted file mode 100644 index b0432a2..0000000 --- a/build_review_queue.py +++ /dev/null @@ -1,175 +0,0 @@ -from collections import defaultdict -from datetime import date - -import click - -from layer_helpers import compact_join, distinct_values, read_csv_rows, stable_id, write_csv_rows - - -OUTPUT_FIELDS = [ - "review_id", - "queue_type", - "retailer", - "observed_product_id", - "canonical_product_id", - "reason_code", - "priority", - "raw_item_names", - "normalized_names", - "upc", - "image_url", - "example_prices", - "seen_count", - "status", - "resolution_notes", - "created_at", - "updated_at", -] - - -def existing_review_state(path): - try: - rows = read_csv_rows(path) - except FileNotFoundError: - return {} - return {row["review_id"]: row for row in rows} - - -def review_reasons(observed_row): - reasons = [] - if ( - observed_row["is_fee"] == "true" - or observed_row.get("is_discount_line") == "true" - or observed_row.get("is_coupon_line") == "true" - ): - return reasons - if observed_row["distinct_upcs_count"] not in {"", "0", "1"}: - reasons.append(("multiple_upcs", "high")) - if observed_row["distinct_item_names_count"] not in {"", "0", "1"}: - reasons.append(("multiple_raw_names", "medium")) - if not observed_row["representative_image_url"]: - reasons.append(("missing_image", "medium")) - if not observed_row["representative_upc"]: - reasons.append(("missing_upc", "high")) - if not observed_row["representative_name_norm"]: - reasons.append(("missing_normalized_name", "high")) - return reasons - - -def build_review_queue(observed_rows, item_rows, existing_rows, today_text): - by_observed = defaultdict(list) - for row in item_rows: - observed_id = row.get("observed_product_id", "") - if observed_id: - by_observed[observed_id].append(row) - - queue_rows = [] - for observed_row in observed_rows: - reasons = review_reasons(observed_row) - if not reasons: - continue - - related_items = by_observed.get(observed_row["observed_product_id"], []) - raw_names = compact_join(distinct_values(related_items, "item_name"), limit=5) - norm_names = compact_join( - distinct_values(related_items, "item_name_norm"), limit=5 - ) - example_prices = compact_join( - distinct_values(related_items, "line_total"), limit=5 - ) - - for reason_code, priority in reasons: - review_id = stable_id( - "rvw", - f"{observed_row['observed_product_id']}|{reason_code}", - ) - prior = existing_rows.get(review_id, {}) - queue_rows.append( - { - "review_id": review_id, - "queue_type": "observed_product", - "retailer": observed_row["retailer"], - "observed_product_id": observed_row["observed_product_id"], - "canonical_product_id": prior.get("canonical_product_id", ""), - "reason_code": reason_code, - "priority": priority, - "raw_item_names": raw_names, - "normalized_names": norm_names, - "upc": observed_row["representative_upc"], - "image_url": observed_row["representative_image_url"], - "example_prices": example_prices, - "seen_count": observed_row["times_seen"], - "status": prior.get("status", "pending"), - "resolution_notes": prior.get("resolution_notes", ""), - "created_at": prior.get("created_at", today_text), - "updated_at": today_text, - } - ) - - queue_rows.sort(key=lambda row: (row["priority"], row["reason_code"], row["review_id"])) - return queue_rows - - -def attach_observed_ids(item_rows, observed_rows): - observed_by_key = {row["observed_key"]: row["observed_product_id"] for row in observed_rows} - attached = [] - for row in item_rows: - observed_key = "|".join( - [ - row["retailer"], - f"upc={row['upc']}", - f"name={row['item_name_norm']}", - ] - ) if row.get("upc") else "|".join( - [ - row["retailer"], - f"retailer_item_id={row.get('retailer_item_id', '')}", - f"name={row['item_name_norm']}", - f"size={row['size_value']}", - f"unit={row['size_unit']}", - f"pack={row['pack_qty']}", - f"measure={row['measure_type']}", - f"store_brand={row['is_store_brand']}", - f"fee={row['is_fee']}", - f"discount={row.get('is_discount_line', 'false')}", - f"coupon={row.get('is_coupon_line', 'false')}", - ] - ) - enriched = dict(row) - enriched["observed_product_id"] = observed_by_key.get(observed_key, "") - attached.append(enriched) - return attached - - -@click.command() -@click.option( - "--observed-csv", - default="giant_output/products_observed.csv", - show_default=True, - help="Path to observed product rows.", -) -@click.option( - "--items-enriched-csv", - default="giant_output/items_enriched.csv", - show_default=True, - help="Path to enriched Giant item rows.", -) -@click.option( - "--output-csv", - default="giant_output/review_queue.csv", - show_default=True, - help="Path to review queue output.", -) -def main(observed_csv, items_enriched_csv, output_csv): - observed_rows = read_csv_rows(observed_csv) - item_rows = read_csv_rows(items_enriched_csv) - item_rows = attach_observed_ids(item_rows, observed_rows) - existing_rows = existing_review_state(output_csv) - today_text = str(date.today()) - queue_rows = build_review_queue(observed_rows, item_rows, existing_rows, today_text) - write_csv_rows(output_csv, queue_rows, OUTPUT_FIELDS) - click.echo(f"wrote {len(queue_rows)} rows to {output_csv}") - - -if __name__ == "__main__": - main() diff --git a/pm/data-model.org b/pm/data-model.org index 321abc9..32e0e00 100644 --- a/pm/data-model.org +++ b/pm/data-model.org @@ -110,8 +110,15 @@ data/ review/ review_queue.csv # Human review queue for unresolved matching/parsing cases. product_links.csv # Links from normalized retailer items to catalog items. - catalog.csv # Cross-retailer product catalog entities used for comparison. - purchases.csv + catalog.csv # Cross-retailer product catalog entities used for comparison. + analysis/ + purchases.csv + comparison_examples.csv + item_price_over_time.csv + spend_by_visit.csv + items_per_visit.csv + category_spend_over_time.csv + retailer_store_breakdown.csv #+end_example Notes: @@ -223,7 +230,7 @@ Notes: - Valid `normalization_basis` values should be explicit, e.g. `exact_upc`, `exact_retailer_item_id`, `exact_name_size_pack`, or `approved_retailer_alias`. - Do not use fuzzy or semantic matching to assign `normalized_item_id`. - Discount/coupon rows may remain as standalone normalized rows for auditability even when their amounts are attached to a purchased row via `matched_discount_amount`. -- Cross-retailer identity is handled later in review/combine via `catalog.csv` and `product_links.csv`. +- Cross-retailer identity is handled later in review/combine via `data/review/catalog.csv` and `product_links.csv`. ** `data/review/product_links.csv` One row per review-approved link from a normalized retailer item to a catalog item. @@ -263,7 +270,7 @@ One row per issue needing human review. | `resolution_notes` | reviewer notes | | `created_at` | creation timestamp or date | | `updated_at` | last update timestamp or date | -** `data/catalog.csv` +** `data/review/catalog.csv` One row per cross-retailer catalog product. | key | definition | |----------------------------+----------------------------------------| @@ -288,7 +295,7 @@ Notes: - Do not encode packaging/count into `catalog_name` unless it is essential to product identity. - `catalog_name` should come from review-approved naming, not raw retailer strings. -** `data/purchases.csv` +** `data/analysis/purchases.csv` One row per purchased item (i.e., `is_item`==true from normalized layer), with catalog attributes denormalized in and discounts already applied. diff --git a/report_pipeline_status.py b/report_pipeline_status.py index a446b8c..55f7db7 100644 --- a/report_pipeline_status.py +++ b/report_pipeline_status.py @@ -85,10 +85,10 @@ def build_status_summary( @click.option("--costco-orders-csv", default="data/costco-web/collected_orders.csv", show_default=True) @click.option("--costco-items-csv", default="data/costco-web/collected_items.csv", show_default=True) @click.option("--costco-enriched-csv", default="data/costco-web/normalized_items.csv", show_default=True) -@click.option("--purchases-csv", default="data/review/purchases.csv", show_default=True) +@click.option("--purchases-csv", default="data/analysis/purchases.csv", show_default=True) @click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True) @click.option("--links-csv", default="data/review/product_links.csv", show_default=True) -@click.option("--catalog-csv", default="data/catalog.csv", show_default=True) +@click.option("--catalog-csv", default="data/review/catalog.csv", show_default=True) @click.option("--summary-csv", default="data/review/pipeline_status.csv", show_default=True) @click.option("--summary-json", default="data/review/pipeline_status.json", show_default=True) def main( diff --git a/review_products.py b/review_products.py index e353934..6fa39a4 100644 --- a/review_products.py +++ b/review_products.py @@ -562,10 +562,10 @@ def link_rows_from_state(link_lookup): @click.option("--costco-items-enriched-csv", default="data/costco-web/normalized_items.csv", show_default=True) @click.option("--giant-orders-csv", default="data/giant-web/collected_orders.csv", show_default=True) @click.option("--costco-orders-csv", default="data/costco-web/collected_orders.csv", show_default=True) -@click.option("--purchases-csv", default="data/review/purchases.csv", show_default=True) +@click.option("--purchases-csv", default="data/analysis/purchases.csv", show_default=True) @click.option("--queue-csv", default="data/review/review_queue.csv", show_default=True) @click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True) -@click.option("--catalog-csv", default="data/catalog.csv", show_default=True) +@click.option("--catalog-csv", default="data/review/catalog.csv", show_default=True) @click.option("--links-csv", default="data/review/product_links.csv", show_default=True) @click.option("--limit", default=0, show_default=True, type=int) @click.option("--refresh-only", is_flag=True, help="Only rebuild review_queue.csv without prompting.") diff --git a/scraper.py b/scraper.py deleted file mode 100644 index 5dcf5b0..0000000 --- a/scraper.py +++ /dev/null @@ -1,5 +0,0 @@ -from scrape_giant import * # noqa: F401,F403 - - -if __name__ == "__main__": - main() diff --git a/tests/test_canonical_layer.py b/tests/test_canonical_layer.py deleted file mode 100644 index 451c731..0000000 --- a/tests/test_canonical_layer.py +++ /dev/null @@ -1,119 +0,0 @@ -import unittest - -import build_canonical_layer - - -class CanonicalLayerTests(unittest.TestCase): - def test_build_canonical_layer_auto_links_exact_upc_and_name_size_only(self): - observed_rows = [ - { - "observed_product_id": "gobs_1", - "representative_upc": "111", - "representative_retailer_item_id": "11", - "representative_name_norm": "GALA APPLE", - "representative_brand": "SB", - "representative_variant": "", - "representative_size_value": "5", - "representative_size_unit": "lb", - "representative_pack_qty": "", - "representative_measure_type": "weight", - "is_fee": "false", - "is_discount_line": "false", - "is_coupon_line": "false", - }, - { - "observed_product_id": "gobs_2", - "representative_upc": "111", - "representative_retailer_item_id": "12", - "representative_name_norm": "LARGE WHITE EGGS", - "representative_brand": "SB", - "representative_variant": "", - "representative_size_value": "", - "representative_size_unit": "", - "representative_pack_qty": "18", - "representative_measure_type": "count", - "is_fee": "false", - "is_discount_line": "false", - "is_coupon_line": "false", - }, - { - "observed_product_id": "gobs_3", - "representative_upc": "", - "representative_retailer_item_id": "21", - "representative_name_norm": "ROTINI", - "representative_brand": "", - "representative_variant": "", - "representative_size_value": "16", - "representative_size_unit": "oz", - "representative_pack_qty": "", - "representative_measure_type": "weight", - "is_fee": "false", - "is_discount_line": "false", - "is_coupon_line": "false", - }, - { - "observed_product_id": "gobs_4", - "representative_upc": "", - "representative_retailer_item_id": "22", - "representative_name_norm": "ROTINI", - "representative_brand": "SB", - "representative_variant": "", - "representative_size_value": "16", - "representative_size_unit": "oz", - "representative_pack_qty": "", - "representative_measure_type": "weight", - "is_fee": "false", - "is_discount_line": "false", - "is_coupon_line": "false", - }, - { - "observed_product_id": "gobs_5", - "representative_upc": "", - "representative_retailer_item_id": "99", - "representative_name_norm": "GL BAG CHARGE", - "representative_brand": "", - "representative_variant": "", - "representative_size_value": "", - "representative_size_unit": "", - "representative_pack_qty": "", - "representative_measure_type": "each", - "is_fee": "true", - "is_discount_line": "false", - "is_coupon_line": "false", - }, - { - "observed_product_id": "gobs_6", - "representative_upc": "", - "representative_retailer_item_id": "", - "representative_name_norm": "LIME", - "representative_brand": "", - "representative_variant": "", - "representative_size_value": "", - "representative_size_unit": "", - "representative_pack_qty": "", - "representative_measure_type": "each", - "is_fee": "false", - "is_discount_line": "false", - "is_coupon_line": "false", - }, - ] - - canonicals, links = build_canonical_layer.build_canonical_layer(observed_rows) - - self.assertEqual(2, len(canonicals)) - self.assertEqual(4, len(links)) - methods = {row["observed_product_id"]: row["link_method"] for row in links} - self.assertEqual("exact_upc", methods["gobs_1"]) - self.assertEqual("exact_upc", methods["gobs_2"]) - self.assertEqual("exact_name_size", methods["gobs_3"]) - self.assertEqual("exact_name_size", methods["gobs_4"]) - self.assertNotIn("gobs_5", methods) - self.assertNotIn("gobs_6", methods) - - def test_clean_canonical_name_removes_packaging_noise(self): - self.assertEqual("LIME", build_canonical_layer.clean_canonical_name("LIME . / .")) - self.assertEqual("EGG", build_canonical_layer.clean_canonical_name("5DZ EGG / /")) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_costco_pipeline.py b/tests/test_costco_pipeline.py index a74a741..f035d11 100644 --- a/tests/test_costco_pipeline.py +++ b/tests/test_costco_pipeline.py @@ -7,7 +7,6 @@ from unittest import mock import enrich_costco import scrape_costco -import validate_cross_retailer_flow class CostcoPipelineTests(unittest.TestCase): @@ -423,76 +422,6 @@ class CostcoPipelineTests(unittest.TestCase): self.assertIn("matched_discount=4873222", purchase_row["parse_notes"]) self.assertIn("matched_to_item=4873222", discount_row["parse_notes"]) - def test_cross_retailer_validation_writes_proof_example(self): - with tempfile.TemporaryDirectory() as tmpdir: - giant_csv = Path(tmpdir) / "giant_items_enriched.csv" - costco_csv = Path(tmpdir) / "costco_items_enriched.csv" - outdir = Path(tmpdir) / "combined" - - fieldnames = enrich_costco.OUTPUT_FIELDS - giant_row = {field: "" for field in fieldnames} - giant_row.update( - { - "retailer": "giant", - "order_id": "g1", - "line_no": "1", - "order_date": "2026-03-01", - "retailer_item_id": "100", - "item_name": "FRESH BANANA", - "item_name_norm": "BANANA", - "upc": "4011", - "measure_type": "weight", - "is_store_brand": "false", - "is_fee": "false", - "is_discount_line": "false", - "is_coupon_line": "false", - "line_total": "1.29", - } - ) - costco_row = {field: "" for field in fieldnames} - costco_row.update( - { - "retailer": "costco", - "order_id": "c1", - "line_no": "1", - "order_date": "2026-03-12", - "retailer_item_id": "30669", - "item_name": "BANANAS 3 LB / 1.36 KG", - "item_name_norm": "BANANA", - "upc": "", - "size_value": "3", - "size_unit": "lb", - "measure_type": "weight", - "is_store_brand": "false", - "is_fee": "false", - "is_discount_line": "false", - "is_coupon_line": "false", - "line_total": "2.98", - } - ) - - with giant_csv.open("w", newline="", encoding="utf-8") as handle: - writer = csv.DictWriter(handle, fieldnames=fieldnames) - writer.writeheader() - writer.writerow(giant_row) - with costco_csv.open("w", newline="", encoding="utf-8") as handle: - writer = csv.DictWriter(handle, fieldnames=fieldnames) - writer.writeheader() - writer.writerow(costco_row) - - validate_cross_retailer_flow.main.callback( - giant_items_enriched_csv=str(giant_csv), - costco_items_enriched_csv=str(costco_csv), - outdir=str(outdir), - ) - - proof_path = outdir / "proof_examples.csv" - self.assertTrue(proof_path.exists()) - with proof_path.open(newline="", encoding="utf-8") as handle: - rows = list(csv.DictReader(handle)) - self.assertEqual(1, len(rows)) - self.assertEqual("banana", rows[0]["proof_name"]) - def test_main_writes_summary_request_metadata(self): with tempfile.TemporaryDirectory() as tmpdir: outdir = Path(tmpdir) / "costco_output" diff --git a/tests/test_observed_products.py b/tests/test_observed_products.py deleted file mode 100644 index 90a7a5e..0000000 --- a/tests/test_observed_products.py +++ /dev/null @@ -1,67 +0,0 @@ -import unittest - -import build_observed_products - - -class ObservedProductTests(unittest.TestCase): - def test_build_observed_products_aggregates_rows_with_same_key(self): - rows = [ - { - "retailer": "giant", - "order_id": "1", - "line_no": "1", - "order_date": "2026-01-01", - "item_name": "SB GALA APPLE 5LB", - "item_name_norm": "GALA APPLE", - "retailer_item_id": "11", - "upc": "111", - "brand_guess": "SB", - "variant": "", - "size_value": "5", - "size_unit": "lb", - "pack_qty": "", - "measure_type": "weight", - "image_url": "https://example.test/a.jpg", - "is_store_brand": "true", - "is_fee": "false", - "is_discount_line": "false", - "is_coupon_line": "false", - "line_total": "7.99", - }, - { - "retailer": "giant", - "order_id": "2", - "line_no": "1", - "order_date": "2026-01-10", - "item_name": "SB GALA APPLE 5 LB", - "item_name_norm": "GALA APPLE", - "retailer_item_id": "11", - "upc": "111", - "brand_guess": "SB", - "variant": "", - "size_value": "5", - "size_unit": "lb", - "pack_qty": "", - "measure_type": "weight", - "image_url": "", - "is_store_brand": "true", - "is_fee": "false", - "is_discount_line": "false", - "is_coupon_line": "false", - "line_total": "8.49", - }, - ] - - observed = build_observed_products.build_observed_products(rows) - - self.assertEqual(1, len(observed)) - self.assertEqual("2", observed[0]["times_seen"]) - self.assertEqual("2026-01-01", observed[0]["first_seen_date"]) - self.assertEqual("2026-01-10", observed[0]["last_seen_date"]) - self.assertEqual("11", observed[0]["representative_retailer_item_id"]) - self.assertEqual("111", observed[0]["representative_upc"]) - self.assertIn("SB GALA APPLE 5LB", observed[0]["raw_name_examples"]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_pipeline_status.py b/tests/test_pipeline_status.py index 7b3127e..ff24e65 100644 --- a/tests/test_pipeline_status.py +++ b/tests/test_pipeline_status.py @@ -65,6 +65,21 @@ class PipelineStatusTests(unittest.TestCase): }, ], resolutions=[], + links=[ + { + "normalized_item_id": "gnorm_banana", + "catalog_id": "cat_banana", + "review_status": "approved", + } + ], + catalog=[ + { + "catalog_id": "cat_banana", + "catalog_name": "BANANA", + "product_type": "banana", + "category": "produce", + } + ], ) counts = {row["stage"]: row["count"] for row in summary} diff --git a/tests/test_review_queue.py b/tests/test_review_queue.py deleted file mode 100644 index 3843700..0000000 --- a/tests/test_review_queue.py +++ /dev/null @@ -1,133 +0,0 @@ -import tempfile -import unittest -from pathlib import Path - -import build_observed_products -import build_review_queue -from layer_helpers import write_csv_rows - - -class ReviewQueueTests(unittest.TestCase): - def test_build_review_queue_preserves_existing_status(self): - observed_rows = [ - { - "observed_product_id": "gobs_1", - "retailer": "giant", - "representative_upc": "111", - "representative_image_url": "", - "representative_name_norm": "GALA APPLE", - "times_seen": "2", - "distinct_item_names_count": "2", - "distinct_upcs_count": "1", - "is_fee": "false", - "is_discount_line": "false", - "is_coupon_line": "false", - } - ] - item_rows = [ - { - "observed_product_id": "gobs_1", - "item_name": "SB GALA APPLE 5LB", - "item_name_norm": "GALA APPLE", - "line_total": "7.99", - }, - { - "observed_product_id": "gobs_1", - "item_name": "SB GALA APPLE 5 LB", - "item_name_norm": "GALA APPLE", - "line_total": "8.49", - }, - ] - existing = { - build_review_queue.stable_id("rvw", "gobs_1|missing_image"): { - "status": "approved", - "resolution_notes": "looked fine", - "created_at": "2026-03-15", - } - } - - queue = build_review_queue.build_review_queue( - observed_rows, item_rows, existing, "2026-03-16" - ) - - self.assertEqual(2, len(queue)) - missing_image = [row for row in queue if row["reason_code"] == "missing_image"][0] - self.assertEqual("approved", missing_image["status"]) - self.assertEqual("looked fine", missing_image["resolution_notes"]) - - def test_review_queue_main_writes_output(self): - with tempfile.TemporaryDirectory() as tmpdir: - observed_path = Path(tmpdir) / "products_observed.csv" - items_path = Path(tmpdir) / "items_enriched.csv" - output_path = Path(tmpdir) / "review_queue.csv" - - observed_rows = [ - { - "observed_product_id": "gobs_1", - "retailer": "giant", - "observed_key": "giant|upc=111|name=GALA APPLE", - "representative_retailer_item_id": "11", - "representative_upc": "111", - "representative_item_name": "SB GALA APPLE 5LB", - "representative_name_norm": "GALA APPLE", - "representative_brand": "SB", - "representative_variant": "", - "representative_size_value": "5", - "representative_size_unit": "lb", - "representative_pack_qty": "", - "representative_measure_type": "weight", - "representative_image_url": "", - "is_store_brand": "true", - "is_fee": "false", - "is_discount_line": "false", - "is_coupon_line": "false", - "first_seen_date": "2026-01-01", - "last_seen_date": "2026-01-10", - "times_seen": "2", - "example_order_id": "1", - "example_item_name": "SB GALA APPLE 5LB", - "raw_name_examples": "SB GALA APPLE 5LB | SB GALA APPLE 5 LB", - "normalized_name_examples": "GALA APPLE", - "example_prices": "7.99 | 8.49", - "distinct_item_names_count": "2", - "distinct_retailer_item_ids_count": "1", - "distinct_upcs_count": "1", - } - ] - item_rows = [ - { - "retailer": "giant", - "order_id": "1", - "line_no": "1", - "item_name": "SB GALA APPLE 5LB", - "item_name_norm": "GALA APPLE", - "retailer_item_id": "11", - "upc": "111", - "size_value": "5", - "size_unit": "lb", - "pack_qty": "", - "measure_type": "weight", - "is_store_brand": "true", - "is_fee": "false", - "is_discount_line": "false", - "is_coupon_line": "false", - "line_total": "7.99", - } - ] - - write_csv_rows( - observed_path, observed_rows, build_observed_products.OUTPUT_FIELDS - ) - write_csv_rows(items_path, item_rows, list(item_rows[0].keys())) - - build_review_queue.main.callback( - observed_csv=str(observed_path), - items_enriched_csv=str(items_path), - output_csv=str(output_path), - ) - - self.assertTrue(output_path.exists()) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_scraper.py b/tests/test_scraper.py index 9fc593f..f79ed99 100644 --- a/tests/test_scraper.py +++ b/tests/test_scraper.py @@ -3,7 +3,7 @@ import tempfile import unittest from pathlib import Path -import scraper +import scrape_giant as scraper class ScraperTests(unittest.TestCase): diff --git a/validate_cross_retailer_flow.py b/validate_cross_retailer_flow.py deleted file mode 100644 index 93d73bd..0000000 --- a/validate_cross_retailer_flow.py +++ /dev/null @@ -1,154 +0,0 @@ -import json -from pathlib import Path - -import click - -import build_canonical_layer -import build_observed_products -from layer_helpers import stable_id, write_csv_rows - - -PROOF_FIELDS = [ - "proof_name", - "canonical_product_id", - "giant_observed_product_id", - "costco_observed_product_id", - "giant_example_item", - "costco_example_item", - "notes", -] - - -def read_rows(path): - import csv - - with Path(path).open(newline="", encoding="utf-8") as handle: - return list(csv.DictReader(handle)) - - -def find_proof_pair(observed_rows): - giant = None - costco = None - for row in observed_rows: - if row["retailer"] == "giant" and row["representative_name_norm"] == "BANANA": - giant = row - if row["retailer"] == "costco" and row["representative_name_norm"] == "BANANA": - costco = row - return giant, costco - - -def merge_proof_pair(canonical_rows, link_rows, giant_row, costco_row): - if not giant_row or not costco_row: - return canonical_rows, link_rows, [] - - proof_canonical_id = stable_id("gcan", "proof|banana") - link_rows = [ - row - for row in link_rows - if row["observed_product_id"] - not in {giant_row["observed_product_id"], costco_row["observed_product_id"]} - ] - canonical_rows = [ - row - for row in canonical_rows - if row["canonical_product_id"] != proof_canonical_id - ] - canonical_rows.append( - { - "canonical_product_id": proof_canonical_id, - "canonical_name": "BANANA", - "product_type": "banana", - "brand": "", - "variant": "", - "size_value": "", - "size_unit": "", - "pack_qty": "", - "measure_type": "weight", - "normalized_quantity": "", - "normalized_quantity_unit": "", - "notes": "manual proof merge for cross-retailer validation", - "created_at": "", - "updated_at": "", - } - ) - for observed_row in [giant_row, costco_row]: - link_rows.append( - { - "observed_product_id": observed_row["observed_product_id"], - "canonical_product_id": proof_canonical_id, - "link_method": "manual_proof_merge", - "link_confidence": "medium", - "review_status": "", - "reviewed_by": "", - "reviewed_at": "", - "link_notes": "cross-retailer validation proof", - } - ) - - proof_rows = [ - { - "proof_name": "banana", - "canonical_product_id": proof_canonical_id, - "giant_observed_product_id": giant_row["observed_product_id"], - "costco_observed_product_id": costco_row["observed_product_id"], - "giant_example_item": giant_row["example_item_name"], - "costco_example_item": costco_row["example_item_name"], - "notes": "BANANA proof pair built from Giant and Costco enriched rows", - } - ] - return canonical_rows, link_rows, proof_rows - - -@click.command() -@click.option( - "--giant-items-enriched-csv", - default="giant_output/items_enriched.csv", - show_default=True, -) -@click.option( - "--costco-items-enriched-csv", - default="costco_output/items_enriched.csv", - show_default=True, -) -@click.option( - "--outdir", - default="combined_output", - show_default=True, -) -def main(giant_items_enriched_csv, costco_items_enriched_csv, outdir): - outdir = Path(outdir) - rows = read_rows(giant_items_enriched_csv) + read_rows(costco_items_enriched_csv) - observed_rows = build_observed_products.build_observed_products(rows) - canonical_rows, link_rows = build_canonical_layer.build_canonical_layer(observed_rows) - giant_row, costco_row = find_proof_pair(observed_rows) - if not giant_row or not costco_row: - raise click.ClickException( - "could not find BANANA proof pair across Giant and Costco observed products" - ) - canonical_rows, link_rows, proof_rows = merge_proof_pair( - canonical_rows, link_rows, giant_row, costco_row - ) - - write_csv_rows( - outdir / "products_observed.csv", - observed_rows, - build_observed_products.OUTPUT_FIELDS, - ) - write_csv_rows( - outdir / "products_canonical.csv", - canonical_rows, - build_canonical_layer.CANONICAL_FIELDS, - ) - write_csv_rows( - outdir / "product_links.csv", - link_rows, - build_canonical_layer.LINK_FIELDS, - ) - write_csv_rows(outdir / "proof_examples.csv", proof_rows, PROOF_FIELDS) - click.echo( - f"wrote combined outputs to {outdir} using {len(observed_rows)} observed rows" - ) - - -if __name__ == "__main__": - main()