Finalize post-refactor layout and remove old pipeline files

This commit is contained in:
ben
2026-03-24 17:09:57 -04:00
parent cdb7a15739
commit 09829b2b9d
17 changed files with 59 additions and 1154 deletions

View File

@@ -6,21 +6,15 @@ Run each script step-by-step from the terminal.
## What It Does
1. `scrape_giant.py`: download Giant orders and items
2. `enrich_giant.py`: normalize Giant line items
3. `scrape_costco.py`: download Costco orders and items
4. `enrich_costco.py`: normalize Costco line items
1. `collect_giant_web.py`: download Giant orders and items
2. `normalize_giant_web.py`: normalize Giant line items
3. `collect_costco_web.py`: download Costco orders and items
4. `normalize_costco_web.py`: normalize Costco line items
5. `build_purchases.py`: combine retailer outputs into one purchase table
6. `review_products.py`: review unresolved product matches in the terminal
7. `report_pipeline_status.py`: show how many rows survive each stage
8. `analyze_purchases.py`: write chart-ready analysis CSVs from the purchase table
Active refactor entrypoints:
- `collect_giant_web.py`
- `collect_costco_web.py`
- `normalize_giant_web.py`
- `normalize_costco_web.py`
## Requirements
- Python 3.10+
@@ -65,13 +59,20 @@ data/
collected_items.csv
normalized_items.csv
review/
catalog.csv
review_queue.csv
review_resolutions.csv
product_links.csv
purchases.csv
pipeline_status.csv
pipeline_status.json
catalog.csv
analysis/
purchases.csv
comparison_examples.csv
item_price_over_time.csv
spend_by_visit.csv
items_per_visit.csv
category_spend_over_time.csv
retailer_store_breakdown.csv
```
## Run Order
@@ -122,21 +123,21 @@ Costco:
- `data/costco-web/normalized_items.csv` preserves raw totals and matched net discount fields
Combined:
- `data/review/purchases.csv`
- `data/review/analysis/item_price_over_time.csv`
- `data/review/analysis/spend_by_visit.csv`
- `data/review/analysis/items_per_visit.csv`
- `data/review/analysis/category_spend_over_time.csv`
- `data/review/analysis/retailer_store_breakdown.csv`
- `data/analysis/purchases.csv`
- `data/analysis/comparison_examples.csv`
- `data/analysis/item_price_over_time.csv`
- `data/analysis/spend_by_visit.csv`
- `data/analysis/items_per_visit.csv`
- `data/analysis/category_spend_over_time.csv`
- `data/analysis/retailer_store_breakdown.csv`
- `data/review/review_queue.csv`
- `data/review/review_resolutions.csv`
- `data/review/product_links.csv`
- `data/review/comparison_examples.csv`
- `data/review/pipeline_status.csv`
- `data/review/pipeline_status.json`
- `data/catalog.csv`
- `data/review/catalog.csv`
`data/review/purchases.csv` is the main analysis artifact. It is designed to support both:
`data/analysis/purchases.csv` is the main analysis artifact. It is designed to support both:
- item-level price analysis
- visit-level analysis such as spend by visit, items per visit, category spend by visit, and retailer/store breakdown
@@ -164,9 +165,7 @@ The review step is intentionally conservative:
## Notes
- This project is designed around fragile retailer scraping flows, so the code favors explicit retailer-specific steps over heavy abstraction.
- `scrape_giant.py`, `scrape_costco.py`, `enrich_giant.py`, and `enrich_costco.py` are now legacy-compatible entrypoints; prefer the `collect_*` and `normalize_*` scripts for active work.
- Costco discount rows are preserved for auditability and also matched back to purchased items during enrichment.
- `validate_cross_retailer_flow.py` is a proof/check script, not a required production step.
## Test

View File

@@ -241,8 +241,8 @@ def build_retailer_store_rows(purchase_rows):
@click.command()
@click.option("--purchases-csv", default="data/review/purchases.csv", show_default=True)
@click.option("--output-dir", default="data/review/analysis", show_default=True)
@click.option("--purchases-csv", default="data/analysis/purchases.csv", show_default=True)
@click.option("--output-dir", default="data/analysis", show_default=True)
def main(purchases_csv, output_dir):
purchase_rows = read_csv_rows(purchases_csv)
output_path = Path(output_dir)

View File

@@ -1,220 +0,0 @@
import click
import re
from layer_helpers import read_csv_rows, representative_value, stable_id, write_csv_rows
# Column order for the canonical products CSV (products_canonical.csv).
CANONICAL_FIELDS = [
    "canonical_product_id",
    "canonical_name",
    "product_type",
    "brand",
    "variant",
    "size_value",
    "size_unit",
    "pack_qty",
    "measure_type",
    "normalized_quantity",
    "normalized_quantity_unit",
    "notes",
    "created_at",
    "updated_at",
]
# Packaging tokens dropped when deriving a display-friendly canonical name.
CANONICAL_DROP_TOKENS = {"CT", "COUNT", "COUNTS", "DOZ", "DOZEN", "DOZ.", "PACK"}
# Column order for the observed-to-canonical link CSV (product_links.csv).
LINK_FIELDS = [
    "observed_product_id",
    "canonical_product_id",
    "link_method",
    "link_confidence",
    "review_status",
    "reviewed_by",
    "reviewed_at",
    "link_notes",
]
def to_float(value):
    """Parse *value* as a float; return None for blanks and unparseable input."""
    try:
        parsed = float(value)
    except (TypeError, ValueError):
        return None
    return parsed
def normalized_quantity(row):
    """Derive a normalized (quantity, unit) pair from representative fields.

    Preference order: size * pack in the size unit, then pack count for
    count-measured items, then a literal "1 each", else blank strings.
    """
    size = to_float(row.get("representative_size_value"))
    pack = to_float(row.get("representative_pack_qty")) or 1.0
    unit = row.get("representative_size_unit", "")
    measure = row.get("representative_measure_type", "")
    if size is not None and unit:
        # "g" format drops trailing zeros (e.g. 5.0 -> "5").
        return format(size * pack, "g"), unit
    if row.get("representative_pack_qty") and measure == "count":
        return row["representative_pack_qty"], "count"
    if measure == "each":
        return "1", "each"
    return "", ""
def auto_link_rule(observed_row):
    """Decide how an observed product may be auto-linked to a canonical group.

    Returns (link_method, group_key, confidence); all three are empty strings
    when no rule applies (fees, discounts, coupons, or insufficient identity).
    """
    # Fee/discount/coupon lines never get a canonical product.
    skip_flags = ("is_fee", "is_discount_line", "is_coupon_line")
    if any(observed_row.get(flag) == "true" for flag in skip_flags):
        return "", "", ""
    upc = observed_row.get("representative_upc")
    if upc:
        # A UPC is the strongest identity signal.
        return "exact_upc", f"upc={upc}", "high"
    name = observed_row.get("representative_name_norm")
    size = observed_row.get("representative_size_value")
    unit = observed_row.get("representative_size_unit")
    if name and size and unit:
        group_key = "|".join(
            [
                f"name={name}",
                f"size={size}",
                f"unit={unit}",
                f"pack={observed_row['representative_pack_qty']}",
                f"measure={observed_row['representative_measure_type']}",
            ]
        )
        return "exact_name_size", group_key, "high"
    return "", "", ""
def clean_canonical_name(name):
    """Strip punctuation and packaging-noise tokens from a product name.

    Drops pure numbers, tokens in CANONICAL_DROP_TOKENS, and count-style
    tokens such as "12PK", "6PACK", or "5DZ".
    """
    upper = (name or "").upper()
    spaced = re.sub(r"[^A-Z0-9\s]", " ", upper)
    kept = []
    for token in spaced.split():
        noisy = (
            token.isdigit()
            or token in CANONICAL_DROP_TOKENS
            or re.fullmatch(r"\d+(?:PK|PACK)", token) is not None
            or re.fullmatch(r"\d+DZ", token) is not None
        )
        if not noisy:
            kept.append(token)
    return " ".join(kept).strip()
def canonical_row_for_group(canonical_product_id, group_rows, link_method):
    """Build one canonical product row from the observed rows in its group.

    Representative attribute values are chosen across *group_rows* via
    representative_value; the normalized quantity is derived from those
    representative size/pack/measure fields.
    """
    quantity_value, quantity_unit = normalized_quantity(
        {
            "representative_size_value": representative_value(
                group_rows, "representative_size_value"
            ),
            "representative_size_unit": representative_value(
                group_rows, "representative_size_unit"
            ),
            "representative_pack_qty": representative_value(
                group_rows, "representative_pack_qty"
            ),
            "representative_measure_type": representative_value(
                group_rows, "representative_measure_type"
            ),
        }
    )
    return {
        "canonical_product_id": canonical_product_id,
        # Fall back to the raw normalized name if cleaning strips everything.
        "canonical_name": clean_canonical_name(
            representative_value(group_rows, "representative_name_norm")
        )
        or representative_value(group_rows, "representative_name_norm"),
        "product_type": "",
        "brand": representative_value(group_rows, "representative_brand"),
        "variant": representative_value(group_rows, "representative_variant"),
        "size_value": representative_value(group_rows, "representative_size_value"),
        "size_unit": representative_value(group_rows, "representative_size_unit"),
        "pack_qty": representative_value(group_rows, "representative_pack_qty"),
        "measure_type": representative_value(group_rows, "representative_measure_type"),
        "normalized_quantity": quantity_value,
        "normalized_quantity_unit": quantity_unit,
        "notes": f"auto-linked via {link_method}",
        "created_at": "",
        "updated_at": "",
    }
def build_canonical_layer(observed_rows):
    """Auto-link observed products into canonical groups.

    Returns (canonical_rows, link_rows): one canonical row per auto-linked
    group and one link row per observed product that matched a rule.
    Observed rows matching no rule (fees, discounts, identity-poor rows)
    are skipped entirely.
    """
    canonical_rows = []
    link_rows = []
    groups = {}
    # Sort for deterministic link output regardless of input order.
    for observed_row in sorted(observed_rows, key=lambda row: row["observed_product_id"]):
        link_method, group_key, confidence = auto_link_rule(observed_row)
        if not group_key:
            # No auto-link rule applied; left for human review elsewhere.
            continue
        # Same method+key always hashes to the same canonical id.
        canonical_product_id = stable_id("gcan", f"{link_method}|{group_key}")
        groups.setdefault(canonical_product_id, {"method": link_method, "rows": []})
        groups[canonical_product_id]["rows"].append(observed_row)
        link_rows.append(
            {
                "observed_product_id": observed_row["observed_product_id"],
                "canonical_product_id": canonical_product_id,
                "link_method": link_method,
                "link_confidence": confidence,
                "review_status": "",
                "reviewed_by": "",
                "reviewed_at": "",
                "link_notes": "",
            }
        )
    # Emit canonical rows in stable id order.
    for canonical_product_id, group in sorted(groups.items()):
        canonical_rows.append(
            canonical_row_for_group(
                canonical_product_id, group["rows"], group["method"]
            )
        )
    return canonical_rows, link_rows
@click.command()
@click.option(
    "--observed-csv",
    default="giant_output/products_observed.csv",
    show_default=True,
    help="Path to observed product rows.",
)
@click.option(
    "--canonical-csv",
    default="giant_output/products_canonical.csv",
    show_default=True,
    help="Path to canonical product output.",
)
@click.option(
    "--links-csv",
    default="giant_output/product_links.csv",
    show_default=True,
    help="Path to observed-to-canonical link output.",
)
def main(observed_csv, canonical_csv, links_csv):
    """Read observed products, auto-link them, and write canonical + link CSVs."""
    observed_rows = read_csv_rows(observed_csv)
    canonical_rows, link_rows = build_canonical_layer(observed_rows)
    write_csv_rows(canonical_csv, canonical_rows, CANONICAL_FIELDS)
    write_csv_rows(links_csv, link_rows, LINK_FIELDS)
    click.echo(
        f"wrote {len(canonical_rows)} canonical rows to {canonical_csv} and "
        f"{len(link_rows)} links to {links_csv}"
    )


if __name__ == "__main__":
    main()

View File

@@ -1,172 +0,0 @@
from collections import defaultdict
import click
from layer_helpers import (
compact_join,
distinct_values,
first_nonblank,
read_csv_rows,
representative_value,
stable_id,
write_csv_rows,
)
# Column order for the observed-products CSV (products_observed.csv).
OUTPUT_FIELDS = [
    "observed_product_id",
    "retailer",
    "observed_key",
    "representative_retailer_item_id",
    "representative_upc",
    "representative_item_name",
    "representative_name_norm",
    "representative_brand",
    "representative_variant",
    "representative_size_value",
    "representative_size_unit",
    "representative_pack_qty",
    "representative_measure_type",
    "representative_image_url",
    "is_store_brand",
    "is_fee",
    "is_discount_line",
    "is_coupon_line",
    "first_seen_date",
    "last_seen_date",
    "times_seen",
    "example_order_id",
    "example_item_name",
    "raw_name_examples",
    "normalized_name_examples",
    "example_prices",
    "distinct_item_names_count",
    "distinct_retailer_item_ids_count",
    "distinct_upcs_count",
]
def build_observed_key(row):
    """Build the grouping key that defines one observed product.

    Identity preference: UPC, then retailer item id (with discount/coupon
    flags), then a full name/size/pack/flags fallback.
    """
    retailer = row["retailer"]
    name_part = f"name={row['item_name_norm']}"
    if row.get("upc"):
        return "|".join([retailer, f"upc={row['upc']}", name_part])
    if row.get("retailer_item_id"):
        id_parts = [
            retailer,
            f"retailer_item_id={row['retailer_item_id']}",
            name_part,
            # Discount/coupon flags keep adjustment lines distinct from items.
            f"discount={row.get('is_discount_line', 'false')}",
            f"coupon={row.get('is_coupon_line', 'false')}",
        ]
        return "|".join(id_parts)
    fallback_parts = [
        retailer,
        name_part,
        f"size={row['size_value']}",
        f"unit={row['size_unit']}",
        f"pack={row['pack_qty']}",
        f"measure={row['measure_type']}",
        f"store_brand={row['is_store_brand']}",
        f"fee={row['is_fee']}",
    ]
    return "|".join(fallback_parts)
def build_observed_products(rows):
    """Aggregate enriched item rows into one row per observed product.

    Rows are grouped by build_observed_key; each group becomes a single
    observed-product row carrying representative attributes, first/last
    seen dates, and example values for human review.
    """
    grouped = defaultdict(list)
    for row in rows:
        grouped[build_observed_key(row)].append(row)
    observed_rows = []
    for observed_key, group_rows in sorted(grouped.items()):
        # Chronological order so first/last seen and examples are stable.
        ordered = sorted(
            group_rows,
            key=lambda row: (row["order_date"], row["order_id"], int(row["line_no"])),
        )
        observed_rows.append(
            {
                "observed_product_id": stable_id("gobs", observed_key),
                "retailer": ordered[0]["retailer"],
                "observed_key": observed_key,
                "representative_retailer_item_id": representative_value(
                    ordered, "retailer_item_id"
                ),
                "representative_upc": representative_value(ordered, "upc"),
                "representative_item_name": representative_value(ordered, "item_name"),
                "representative_name_norm": representative_value(
                    ordered, "item_name_norm"
                ),
                "representative_brand": representative_value(ordered, "brand_guess"),
                "representative_variant": representative_value(ordered, "variant"),
                "representative_size_value": representative_value(ordered, "size_value"),
                "representative_size_unit": representative_value(ordered, "size_unit"),
                "representative_pack_qty": representative_value(ordered, "pack_qty"),
                "representative_measure_type": representative_value(
                    ordered, "measure_type"
                ),
                # First non-blank image, not the most common value.
                "representative_image_url": first_nonblank(ordered, "image_url"),
                "is_store_brand": representative_value(ordered, "is_store_brand"),
                "is_fee": representative_value(ordered, "is_fee"),
                "is_discount_line": representative_value(
                    ordered, "is_discount_line"
                ),
                "is_coupon_line": representative_value(ordered, "is_coupon_line"),
                "first_seen_date": ordered[0]["order_date"],
                "last_seen_date": ordered[-1]["order_date"],
                "times_seen": str(len(ordered)),
                "example_order_id": ordered[0]["order_id"],
                "example_item_name": ordered[0]["item_name"],
                "raw_name_examples": compact_join(
                    distinct_values(ordered, "item_name"), limit=4
                ),
                "normalized_name_examples": compact_join(
                    distinct_values(ordered, "item_name_norm"), limit=4
                ),
                "example_prices": compact_join(
                    distinct_values(ordered, "line_total"), limit=4
                ),
                "distinct_item_names_count": str(
                    len(distinct_values(ordered, "item_name"))
                ),
                "distinct_retailer_item_ids_count": str(
                    len(distinct_values(ordered, "retailer_item_id"))
                ),
                "distinct_upcs_count": str(len(distinct_values(ordered, "upc"))),
            }
        )
    observed_rows.sort(key=lambda row: row["observed_product_id"])
    return observed_rows
@click.command()
@click.option(
    "--items-enriched-csv",
    default="giant_output/items_enriched.csv",
    show_default=True,
    help="Path to enriched Giant item rows.",
)
@click.option(
    "--output-csv",
    default="giant_output/products_observed.csv",
    show_default=True,
    help="Path to observed product output.",
)
def main(items_enriched_csv, output_csv):
    """Group enriched item rows into observed products and write the CSV."""
    rows = read_csv_rows(items_enriched_csv)
    observed_rows = build_observed_products(rows)
    write_csv_rows(output_csv, observed_rows, OUTPUT_FIELDS)
    click.echo(f"wrote {len(observed_rows)} rows to {output_csv}")


if __name__ == "__main__":
    main()

View File

@@ -440,10 +440,10 @@ def build_comparison_examples(purchase_rows):
@click.option("--giant-orders-csv", default="data/giant-web/collected_orders.csv", show_default=True)
@click.option("--costco-orders-csv", default="data/costco-web/collected_orders.csv", show_default=True)
@click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True)
@click.option("--catalog-csv", default="data/catalog.csv", show_default=True)
@click.option("--catalog-csv", default="data/review/catalog.csv", show_default=True)
@click.option("--links-csv", default="data/review/product_links.csv", show_default=True)
@click.option("--output-csv", default="data/review/purchases.csv", show_default=True)
@click.option("--examples-csv", default="data/review/comparison_examples.csv", show_default=True)
@click.option("--output-csv", default="data/analysis/purchases.csv", show_default=True)
@click.option("--examples-csv", default="data/analysis/comparison_examples.csv", show_default=True)
def main(
giant_items_enriched_csv,
costco_items_enriched_csv,

View File

@@ -1,175 +0,0 @@
from collections import defaultdict
from datetime import date
import click
from layer_helpers import compact_join, distinct_values, read_csv_rows, stable_id, write_csv_rows
# Column order for the review queue CSV (review_queue.csv).
OUTPUT_FIELDS = [
    "review_id",
    "queue_type",
    "retailer",
    "observed_product_id",
    "canonical_product_id",
    "reason_code",
    "priority",
    "raw_item_names",
    "normalized_names",
    "upc",
    "image_url",
    "example_prices",
    "seen_count",
    "status",
    "resolution_notes",
    "created_at",
    "updated_at",
]
def existing_review_state(path):
    """Load prior review rows keyed by review_id; empty dict if file absent."""
    try:
        prior_rows = read_csv_rows(path)
    except FileNotFoundError:
        # First run: no prior state to preserve.
        return {}
    return {prior["review_id"]: prior for prior in prior_rows}
def review_reasons(observed_row):
    """List (reason_code, priority) pairs for why this observed product needs review.

    Fee/discount/coupon lines are never queued and return an empty list.
    """
    is_adjustment = (
        observed_row["is_fee"] == "true"
        or observed_row.get("is_discount_line") == "true"
        or observed_row.get("is_coupon_line") == "true"
    )
    if is_adjustment:
        return []
    reasons = []
    # "", "0", and "1" all mean "not ambiguous" for the distinct-count fields.
    unambiguous = {"", "0", "1"}
    if observed_row["distinct_upcs_count"] not in unambiguous:
        reasons.append(("multiple_upcs", "high"))
    if observed_row["distinct_item_names_count"] not in unambiguous:
        reasons.append(("multiple_raw_names", "medium"))
    if not observed_row["representative_image_url"]:
        reasons.append(("missing_image", "medium"))
    if not observed_row["representative_upc"]:
        reasons.append(("missing_upc", "high"))
    if not observed_row["representative_name_norm"]:
        reasons.append(("missing_normalized_name", "high"))
    return reasons
def build_review_queue(observed_rows, item_rows, existing_rows, today_text):
    """Build the review queue, one row per (observed product, reason).

    Carries forward status, notes, canonical link, and created_at from
    *existing_rows* so reruns do not clobber prior human decisions.
    """
    by_observed = defaultdict(list)
    for row in item_rows:
        observed_id = row.get("observed_product_id", "")
        if observed_id:
            by_observed[observed_id].append(row)
    queue_rows = []
    for observed_row in observed_rows:
        reasons = review_reasons(observed_row)
        if not reasons:
            continue
        related_items = by_observed.get(observed_row["observed_product_id"], [])
        raw_names = compact_join(distinct_values(related_items, "item_name"), limit=5)
        norm_names = compact_join(
            distinct_values(related_items, "item_name_norm"), limit=5
        )
        example_prices = compact_join(
            distinct_values(related_items, "line_total"), limit=5
        )
        for reason_code, priority in reasons:
            # Stable id per (product, reason) so prior state can be matched.
            review_id = stable_id(
                "rvw",
                f"{observed_row['observed_product_id']}|{reason_code}",
            )
            prior = existing_rows.get(review_id, {})
            queue_rows.append(
                {
                    "review_id": review_id,
                    "queue_type": "observed_product",
                    "retailer": observed_row["retailer"],
                    "observed_product_id": observed_row["observed_product_id"],
                    "canonical_product_id": prior.get("canonical_product_id", ""),
                    "reason_code": reason_code,
                    "priority": priority,
                    "raw_item_names": raw_names,
                    "normalized_names": norm_names,
                    "upc": observed_row["representative_upc"],
                    "image_url": observed_row["representative_image_url"],
                    "example_prices": example_prices,
                    "seen_count": observed_row["times_seen"],
                    "status": prior.get("status", "pending"),
                    "resolution_notes": prior.get("resolution_notes", ""),
                    "created_at": prior.get("created_at", today_text),
                    "updated_at": today_text,
                }
            )
    queue_rows.sort(key=lambda row: (row["priority"], row["reason_code"], row["review_id"]))
    return queue_rows
def attach_observed_ids(item_rows, observed_rows):
    """Return copies of *item_rows*, each stamped with its observed_product_id.

    The key built here must mirror the key used when the observed products
    were created; unmatched items get an empty id. Input rows are not mutated.
    """
    id_by_key = {
        observed["observed_key"]: observed["observed_product_id"]
        for observed in observed_rows
    }
    stamped_rows = []
    for row in item_rows:
        if row.get("upc"):
            key_parts = [
                row["retailer"],
                f"upc={row['upc']}",
                f"name={row['item_name_norm']}",
            ]
        else:
            key_parts = [
                row["retailer"],
                f"retailer_item_id={row.get('retailer_item_id', '')}",
                f"name={row['item_name_norm']}",
                f"size={row['size_value']}",
                f"unit={row['size_unit']}",
                f"pack={row['pack_qty']}",
                f"measure={row['measure_type']}",
                f"store_brand={row['is_store_brand']}",
                f"fee={row['is_fee']}",
                f"discount={row.get('is_discount_line', 'false')}",
                f"coupon={row.get('is_coupon_line', 'false')}",
            ]
        stamped = dict(row)
        stamped["observed_product_id"] = id_by_key.get("|".join(key_parts), "")
        stamped_rows.append(stamped)
    return stamped_rows
@click.command()
@click.option(
    "--observed-csv",
    default="giant_output/products_observed.csv",
    show_default=True,
    help="Path to observed product rows.",
)
@click.option(
    "--items-enriched-csv",
    default="giant_output/items_enriched.csv",
    show_default=True,
    help="Path to enriched Giant item rows.",
)
@click.option(
    "--output-csv",
    default="giant_output/review_queue.csv",
    show_default=True,
    help="Path to review queue output.",
)
def main(observed_csv, items_enriched_csv, output_csv):
    """Rebuild the review queue CSV, preserving prior human decisions."""
    observed_rows = read_csv_rows(observed_csv)
    item_rows = read_csv_rows(items_enriched_csv)
    # Stamp items with observed ids so queue rows can show example values.
    item_rows = attach_observed_ids(item_rows, observed_rows)
    # The existing output file doubles as persisted review state.
    existing_rows = existing_review_state(output_csv)
    today_text = str(date.today())
    queue_rows = build_review_queue(observed_rows, item_rows, existing_rows, today_text)
    write_csv_rows(output_csv, queue_rows, OUTPUT_FIELDS)
    click.echo(f"wrote {len(queue_rows)} rows to {output_csv}")


if __name__ == "__main__":
    main()

View File

@@ -110,8 +110,15 @@ data/
review/
review_queue.csv # Human review queue for unresolved matching/parsing cases.
product_links.csv # Links from normalized retailer items to catalog items.
catalog.csv # Cross-retailer product catalog entities used for comparison.
purchases.csv
catalog.csv # Cross-retailer product catalog entities used for comparison.
analysis/
purchases.csv
comparison_examples.csv
item_price_over_time.csv
spend_by_visit.csv
items_per_visit.csv
category_spend_over_time.csv
retailer_store_breakdown.csv
#+end_example
Notes:
@@ -223,7 +230,7 @@ Notes:
- Valid `normalization_basis` values should be explicit, e.g. `exact_upc`, `exact_retailer_item_id`, `exact_name_size_pack`, or `approved_retailer_alias`.
- Do not use fuzzy or semantic matching to assign `normalized_item_id`.
- Discount/coupon rows may remain as standalone normalized rows for auditability even when their amounts are attached to a purchased row via `matched_discount_amount`.
- Cross-retailer identity is handled later in review/combine via `catalog.csv` and `product_links.csv`.
- Cross-retailer identity is handled later in review/combine via `data/review/catalog.csv` and `product_links.csv`.
** `data/review/product_links.csv`
One row per review-approved link from a normalized retailer item to a catalog item.
@@ -263,7 +270,7 @@ One row per issue needing human review.
| `resolution_notes` | reviewer notes |
| `created_at` | creation timestamp or date |
| `updated_at` | last update timestamp or date |
** `data/catalog.csv`
** `data/review/catalog.csv`
One row per cross-retailer catalog product.
| key | definition |
|----------------------------+----------------------------------------|
@@ -288,7 +295,7 @@ Notes:
- Do not encode packaging/count into `catalog_name` unless it is essential to product identity.
- `catalog_name` should come from review-approved naming, not raw retailer strings.
** `data/purchases.csv`
** `data/analysis/purchases.csv`
One row per purchased item (i.e., `is_item`==true from normalized layer), with
catalog attributes denormalized in and discounts already applied.

View File

@@ -85,10 +85,10 @@ def build_status_summary(
@click.option("--costco-orders-csv", default="data/costco-web/collected_orders.csv", show_default=True)
@click.option("--costco-items-csv", default="data/costco-web/collected_items.csv", show_default=True)
@click.option("--costco-enriched-csv", default="data/costco-web/normalized_items.csv", show_default=True)
@click.option("--purchases-csv", default="data/review/purchases.csv", show_default=True)
@click.option("--purchases-csv", default="data/analysis/purchases.csv", show_default=True)
@click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True)
@click.option("--links-csv", default="data/review/product_links.csv", show_default=True)
@click.option("--catalog-csv", default="data/catalog.csv", show_default=True)
@click.option("--catalog-csv", default="data/review/catalog.csv", show_default=True)
@click.option("--summary-csv", default="data/review/pipeline_status.csv", show_default=True)
@click.option("--summary-json", default="data/review/pipeline_status.json", show_default=True)
def main(

View File

@@ -562,10 +562,10 @@ def link_rows_from_state(link_lookup):
@click.option("--costco-items-enriched-csv", default="data/costco-web/normalized_items.csv", show_default=True)
@click.option("--giant-orders-csv", default="data/giant-web/collected_orders.csv", show_default=True)
@click.option("--costco-orders-csv", default="data/costco-web/collected_orders.csv", show_default=True)
@click.option("--purchases-csv", default="data/review/purchases.csv", show_default=True)
@click.option("--purchases-csv", default="data/analysis/purchases.csv", show_default=True)
@click.option("--queue-csv", default="data/review/review_queue.csv", show_default=True)
@click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True)
@click.option("--catalog-csv", default="data/catalog.csv", show_default=True)
@click.option("--catalog-csv", default="data/review/catalog.csv", show_default=True)
@click.option("--links-csv", default="data/review/product_links.csv", show_default=True)
@click.option("--limit", default=0, show_default=True, type=int)
@click.option("--refresh-only", is_flag=True, help="Only rebuild review_queue.csv without prompting.")

View File

@@ -1,5 +0,0 @@
# Legacy-compatible shim: re-export everything from scrape_giant so old
# invocations of this script keep working after the refactor.
from scrape_giant import *  # noqa: F401,F403

if __name__ == "__main__":
    # main comes from the star import above.
    main()

View File

@@ -1,119 +0,0 @@
import unittest
import build_canonical_layer
class CanonicalLayerTests(unittest.TestCase):
    """Unit tests for build_canonical_layer's auto-linking and name cleanup."""

    def test_build_canonical_layer_auto_links_exact_upc_and_name_size_only(self):
        # Six observed rows: two sharing a UPC, two sharing name+size, one
        # fee row and one identity-poor row that must not be linked at all.
        observed_rows = [
            {
                "observed_product_id": "gobs_1",
                "representative_upc": "111",
                "representative_retailer_item_id": "11",
                "representative_name_norm": "GALA APPLE",
                "representative_brand": "SB",
                "representative_variant": "",
                "representative_size_value": "5",
                "representative_size_unit": "lb",
                "representative_pack_qty": "",
                "representative_measure_type": "weight",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
            },
            {
                "observed_product_id": "gobs_2",
                "representative_upc": "111",
                "representative_retailer_item_id": "12",
                "representative_name_norm": "LARGE WHITE EGGS",
                "representative_brand": "SB",
                "representative_variant": "",
                "representative_size_value": "",
                "representative_size_unit": "",
                "representative_pack_qty": "18",
                "representative_measure_type": "count",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
            },
            {
                "observed_product_id": "gobs_3",
                "representative_upc": "",
                "representative_retailer_item_id": "21",
                "representative_name_norm": "ROTINI",
                "representative_brand": "",
                "representative_variant": "",
                "representative_size_value": "16",
                "representative_size_unit": "oz",
                "representative_pack_qty": "",
                "representative_measure_type": "weight",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
            },
            {
                "observed_product_id": "gobs_4",
                "representative_upc": "",
                "representative_retailer_item_id": "22",
                "representative_name_norm": "ROTINI",
                "representative_brand": "SB",
                "representative_variant": "",
                "representative_size_value": "16",
                "representative_size_unit": "oz",
                "representative_pack_qty": "",
                "representative_measure_type": "weight",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
            },
            {
                "observed_product_id": "gobs_5",
                "representative_upc": "",
                "representative_retailer_item_id": "99",
                "representative_name_norm": "GL BAG CHARGE",
                "representative_brand": "",
                "representative_variant": "",
                "representative_size_value": "",
                "representative_size_unit": "",
                "representative_pack_qty": "",
                "representative_measure_type": "each",
                "is_fee": "true",
                "is_discount_line": "false",
                "is_coupon_line": "false",
            },
            {
                "observed_product_id": "gobs_6",
                "representative_upc": "",
                "representative_retailer_item_id": "",
                "representative_name_norm": "LIME",
                "representative_brand": "",
                "representative_variant": "",
                "representative_size_value": "",
                "representative_size_unit": "",
                "representative_pack_qty": "",
                "representative_measure_type": "each",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
            },
        ]
        canonicals, links = build_canonical_layer.build_canonical_layer(observed_rows)
        # Two canonical groups (one per shared identity), four linked rows.
        self.assertEqual(2, len(canonicals))
        self.assertEqual(4, len(links))
        methods = {row["observed_product_id"]: row["link_method"] for row in links}
        self.assertEqual("exact_upc", methods["gobs_1"])
        self.assertEqual("exact_upc", methods["gobs_2"])
        self.assertEqual("exact_name_size", methods["gobs_3"])
        self.assertEqual("exact_name_size", methods["gobs_4"])
        # Fee rows and rows without enough identity are never auto-linked.
        self.assertNotIn("gobs_5", methods)
        self.assertNotIn("gobs_6", methods)

    def test_clean_canonical_name_removes_packaging_noise(self):
        # Punctuation and count/dozen tokens are stripped from names.
        self.assertEqual("LIME", build_canonical_layer.clean_canonical_name("LIME . / ."))
        self.assertEqual("EGG", build_canonical_layer.clean_canonical_name("5DZ EGG / /"))


if __name__ == "__main__":
    unittest.main()

View File

@@ -7,7 +7,6 @@ from unittest import mock
import enrich_costco
import scrape_costco
import validate_cross_retailer_flow
class CostcoPipelineTests(unittest.TestCase):
@@ -423,76 +422,6 @@ class CostcoPipelineTests(unittest.TestCase):
self.assertIn("matched_discount=4873222", purchase_row["parse_notes"])
self.assertIn("matched_to_item=4873222", discount_row["parse_notes"])
def test_cross_retailer_validation_writes_proof_example(self):
with tempfile.TemporaryDirectory() as tmpdir:
giant_csv = Path(tmpdir) / "giant_items_enriched.csv"
costco_csv = Path(tmpdir) / "costco_items_enriched.csv"
outdir = Path(tmpdir) / "combined"
fieldnames = enrich_costco.OUTPUT_FIELDS
giant_row = {field: "" for field in fieldnames}
giant_row.update(
{
"retailer": "giant",
"order_id": "g1",
"line_no": "1",
"order_date": "2026-03-01",
"retailer_item_id": "100",
"item_name": "FRESH BANANA",
"item_name_norm": "BANANA",
"upc": "4011",
"measure_type": "weight",
"is_store_brand": "false",
"is_fee": "false",
"is_discount_line": "false",
"is_coupon_line": "false",
"line_total": "1.29",
}
)
costco_row = {field: "" for field in fieldnames}
costco_row.update(
{
"retailer": "costco",
"order_id": "c1",
"line_no": "1",
"order_date": "2026-03-12",
"retailer_item_id": "30669",
"item_name": "BANANAS 3 LB / 1.36 KG",
"item_name_norm": "BANANA",
"upc": "",
"size_value": "3",
"size_unit": "lb",
"measure_type": "weight",
"is_store_brand": "false",
"is_fee": "false",
"is_discount_line": "false",
"is_coupon_line": "false",
"line_total": "2.98",
}
)
with giant_csv.open("w", newline="", encoding="utf-8") as handle:
writer = csv.DictWriter(handle, fieldnames=fieldnames)
writer.writeheader()
writer.writerow(giant_row)
with costco_csv.open("w", newline="", encoding="utf-8") as handle:
writer = csv.DictWriter(handle, fieldnames=fieldnames)
writer.writeheader()
writer.writerow(costco_row)
validate_cross_retailer_flow.main.callback(
giant_items_enriched_csv=str(giant_csv),
costco_items_enriched_csv=str(costco_csv),
outdir=str(outdir),
)
proof_path = outdir / "proof_examples.csv"
self.assertTrue(proof_path.exists())
with proof_path.open(newline="", encoding="utf-8") as handle:
rows = list(csv.DictReader(handle))
self.assertEqual(1, len(rows))
self.assertEqual("banana", rows[0]["proof_name"])
def test_main_writes_summary_request_metadata(self):
with tempfile.TemporaryDirectory() as tmpdir:
outdir = Path(tmpdir) / "costco_output"

View File

@@ -1,67 +0,0 @@
import unittest
import build_observed_products
class ObservedProductTests(unittest.TestCase):
    """Unit tests for grouping item rows into observed products."""

    def test_build_observed_products_aggregates_rows_with_same_key(self):
        # Two orders of the same item (same retailer_item_id and UPC) that
        # differ only in raw name spacing, image, and price.
        rows = [
            {
                "retailer": "giant",
                "order_id": "1",
                "line_no": "1",
                "order_date": "2026-01-01",
                "item_name": "SB GALA APPLE 5LB",
                "item_name_norm": "GALA APPLE",
                "retailer_item_id": "11",
                "upc": "111",
                "brand_guess": "SB",
                "variant": "",
                "size_value": "5",
                "size_unit": "lb",
                "pack_qty": "",
                "measure_type": "weight",
                "image_url": "https://example.test/a.jpg",
                "is_store_brand": "true",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
                "line_total": "7.99",
            },
            {
                "retailer": "giant",
                "order_id": "2",
                "line_no": "1",
                "order_date": "2026-01-10",
                "item_name": "SB GALA APPLE 5 LB",
                "item_name_norm": "GALA APPLE",
                "retailer_item_id": "11",
                "upc": "111",
                "brand_guess": "SB",
                "variant": "",
                "size_value": "5",
                "size_unit": "lb",
                "pack_qty": "",
                "measure_type": "weight",
                "image_url": "",
                "is_store_brand": "true",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
                "line_total": "8.49",
            },
        ]
        observed = build_observed_products.build_observed_products(rows)
        # Both rows collapse into one observed product seen twice.
        self.assertEqual(1, len(observed))
        self.assertEqual("2", observed[0]["times_seen"])
        self.assertEqual("2026-01-01", observed[0]["first_seen_date"])
        self.assertEqual("2026-01-10", observed[0]["last_seen_date"])
        self.assertEqual("11", observed[0]["representative_retailer_item_id"])
        self.assertEqual("111", observed[0]["representative_upc"])
        self.assertIn("SB GALA APPLE 5LB", observed[0]["raw_name_examples"])


if __name__ == "__main__":
    unittest.main()

View File

@@ -65,6 +65,21 @@ class PipelineStatusTests(unittest.TestCase):
},
],
resolutions=[],
links=[
{
"normalized_item_id": "gnorm_banana",
"catalog_id": "cat_banana",
"review_status": "approved",
}
],
catalog=[
{
"catalog_id": "cat_banana",
"catalog_name": "BANANA",
"product_type": "banana",
"category": "produce",
}
],
)
counts = {row["stage"]: row["count"] for row in summary}

View File

@@ -1,133 +0,0 @@
import tempfile
import unittest
from pathlib import Path
import build_observed_products
import build_review_queue
from layer_helpers import write_csv_rows
class ReviewQueueTests(unittest.TestCase):
    """Behavioral tests for build_review_queue's queue builder and CLI entry."""

    def test_build_review_queue_preserves_existing_status(self):
        # An observed product with a missing image and multiple raw names
        # should be flagged; a previously resolved flag keeps its status.
        observed_rows = [
            {
                "observed_product_id": "gobs_1",
                "retailer": "giant",
                "representative_upc": "111",
                "representative_image_url": "",
                "representative_name_norm": "GALA APPLE",
                "times_seen": "2",
                "distinct_item_names_count": "2",
                "distinct_upcs_count": "1",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
            }
        ]
        shared_item_fields = {
            "observed_product_id": "gobs_1",
            "item_name_norm": "GALA APPLE",
        }
        item_rows = [
            dict(shared_item_fields, item_name="SB GALA APPLE 5LB", line_total="7.99"),
            dict(shared_item_fields, item_name="SB GALA APPLE 5 LB", line_total="8.49"),
        ]
        resolved_queue_id = build_review_queue.stable_id("rvw", "gobs_1|missing_image")
        existing = {
            resolved_queue_id: {
                "status": "approved",
                "resolution_notes": "looked fine",
                "created_at": "2026-03-15",
            }
        }
        queue = build_review_queue.build_review_queue(
            observed_rows, item_rows, existing, "2026-03-16"
        )
        self.assertEqual(2, len(queue))
        flagged = [row for row in queue if row["reason_code"] == "missing_image"]
        self.assertEqual("approved", flagged[0]["status"])
        self.assertEqual("looked fine", flagged[0]["resolution_notes"])

    def test_review_queue_main_writes_output(self):
        # End-to-end: write observed + enriched CSV fixtures, invoke the CLI
        # callback directly, and confirm the queue file lands on disk.
        with tempfile.TemporaryDirectory() as tmpdir:
            workdir = Path(tmpdir)
            observed_path = workdir / "products_observed.csv"
            items_path = workdir / "items_enriched.csv"
            output_path = workdir / "review_queue.csv"
            observed_rows = [
                {
                    "observed_product_id": "gobs_1",
                    "retailer": "giant",
                    "observed_key": "giant|upc=111|name=GALA APPLE",
                    "representative_retailer_item_id": "11",
                    "representative_upc": "111",
                    "representative_item_name": "SB GALA APPLE 5LB",
                    "representative_name_norm": "GALA APPLE",
                    "representative_brand": "SB",
                    "representative_variant": "",
                    "representative_size_value": "5",
                    "representative_size_unit": "lb",
                    "representative_pack_qty": "",
                    "representative_measure_type": "weight",
                    "representative_image_url": "",
                    "is_store_brand": "true",
                    "is_fee": "false",
                    "is_discount_line": "false",
                    "is_coupon_line": "false",
                    "first_seen_date": "2026-01-01",
                    "last_seen_date": "2026-01-10",
                    "times_seen": "2",
                    "example_order_id": "1",
                    "example_item_name": "SB GALA APPLE 5LB",
                    "raw_name_examples": "SB GALA APPLE 5LB | SB GALA APPLE 5 LB",
                    "normalized_name_examples": "GALA APPLE",
                    "example_prices": "7.99 | 8.49",
                    "distinct_item_names_count": "2",
                    "distinct_retailer_item_ids_count": "1",
                    "distinct_upcs_count": "1",
                }
            ]
            item_rows = [
                {
                    "retailer": "giant",
                    "order_id": "1",
                    "line_no": "1",
                    "item_name": "SB GALA APPLE 5LB",
                    "item_name_norm": "GALA APPLE",
                    "retailer_item_id": "11",
                    "upc": "111",
                    "size_value": "5",
                    "size_unit": "lb",
                    "pack_qty": "",
                    "measure_type": "weight",
                    "is_store_brand": "true",
                    "is_fee": "false",
                    "is_discount_line": "false",
                    "is_coupon_line": "false",
                    "line_total": "7.99",
                }
            ]
            write_csv_rows(
                observed_path, observed_rows, build_observed_products.OUTPUT_FIELDS
            )
            write_csv_rows(items_path, item_rows, list(item_rows[0].keys()))
            build_review_queue.main.callback(
                observed_csv=str(observed_path),
                items_enriched_csv=str(items_path),
                output_csv=str(output_path),
            )
            self.assertTrue(output_path.exists())
if __name__ == "__main__":
unittest.main()

View File

@@ -3,7 +3,7 @@ import tempfile
import unittest
from pathlib import Path
import scraper
import scrape_giant as scraper
class ScraperTests(unittest.TestCase):

View File

@@ -1,154 +0,0 @@
import csv
import json
from pathlib import Path

import click

import build_canonical_layer
import build_observed_products
from layer_helpers import stable_id, write_csv_rows
# Column order for the proof_examples.csv output written by main().
PROOF_FIELDS = [
    "proof_name",
    "canonical_product_id",
    "giant_observed_product_id",
    "costco_observed_product_id",
    "giant_example_item",
    "costco_example_item",
    "notes",
]
def read_rows(path):
    """Load a CSV file and return its data rows as dicts.

    Args:
        path: Path or string pointing at a CSV file with a header row.

    Returns:
        list[dict[str, str]]: one dict per data row, keyed by the header.
    """
    # csv is imported at module level (was a per-call function-scope import).
    with Path(path).open(newline="", encoding="utf-8") as handle:
        return list(csv.DictReader(handle))
def find_proof_pair(observed_rows):
    """Locate the BANANA observed-product row for each retailer.

    Returns a (giant_row, costco_row) tuple; either element is None when no
    matching row exists. When several rows match, the last one wins.
    """
    giant_match = None
    costco_match = None
    for candidate in observed_rows:
        if candidate["representative_name_norm"] != "BANANA":
            continue
        if candidate["retailer"] == "giant":
            giant_match = candidate
        if candidate["retailer"] == "costco":
            costco_match = candidate
    return giant_match, costco_match
def merge_proof_pair(canonical_rows, link_rows, giant_row, costco_row):
    """Replace both retailers' automatic links with one manual canonical merge.

    Builds a single BANANA canonical product, relinks the Giant and Costco
    observed rows to it, and emits a proof row describing the pair. When
    either retailer row is missing, the inputs pass through unchanged with
    an empty proof list.

    Returns:
        tuple: (canonical_rows, link_rows, proof_rows) after the merge.
    """
    if not giant_row or not costco_row:
        return canonical_rows, link_rows, []
    proof_canonical_id = stable_id("gcan", "proof|banana")
    merged_observed_ids = {
        giant_row["observed_product_id"],
        costco_row["observed_product_id"],
    }
    # Drop automatic links for the merged pair and any stale proof canonical
    # row so the function is safe to re-run over prior output.
    link_rows = [
        row
        for row in link_rows
        if row["observed_product_id"] not in merged_observed_ids
    ]
    canonical_rows = [
        row
        for row in canonical_rows
        if row["canonical_product_id"] != proof_canonical_id
    ]
    canonical_rows.append(
        {
            "canonical_product_id": proof_canonical_id,
            "canonical_name": "BANANA",
            "product_type": "banana",
            "brand": "",
            "variant": "",
            "size_value": "",
            "size_unit": "",
            "pack_qty": "",
            "measure_type": "weight",
            "normalized_quantity": "",
            "normalized_quantity_unit": "",
            "notes": "manual proof merge for cross-retailer validation",
            "created_at": "",
            "updated_at": "",
        }
    )
    link_rows.extend(
        {
            "observed_product_id": observed["observed_product_id"],
            "canonical_product_id": proof_canonical_id,
            "link_method": "manual_proof_merge",
            "link_confidence": "medium",
            "review_status": "",
            "reviewed_by": "",
            "reviewed_at": "",
            "link_notes": "cross-retailer validation proof",
        }
        for observed in (giant_row, costco_row)
    )
    proof_rows = [
        {
            "proof_name": "banana",
            "canonical_product_id": proof_canonical_id,
            "giant_observed_product_id": giant_row["observed_product_id"],
            "costco_observed_product_id": costco_row["observed_product_id"],
            "giant_example_item": giant_row["example_item_name"],
            "costco_example_item": costco_row["example_item_name"],
            "notes": "BANANA proof pair built from Giant and Costco enriched rows",
        }
    ]
    return canonical_rows, link_rows, proof_rows
@click.command()
@click.option(
    "--giant-items-enriched-csv",
    default="giant_output/items_enriched.csv",
    show_default=True,
)
@click.option(
    "--costco-items-enriched-csv",
    default="costco_output/items_enriched.csv",
    show_default=True,
)
@click.option(
    "--outdir",
    default="combined_output",
    show_default=True,
)
def main(giant_items_enriched_csv, costco_items_enriched_csv, outdir):
    """Build combined observed/canonical/link layers plus the BANANA proof.

    Reads both retailers' enriched item CSVs, aggregates observed products,
    derives the canonical layer, applies the manual BANANA proof merge, and
    writes four CSV outputs under --outdir. Fails with a ClickException when
    the cross-retailer BANANA pair cannot be found.
    """
    out_path = Path(outdir)
    enriched_rows = read_rows(giant_items_enriched_csv) + read_rows(
        costco_items_enriched_csv
    )
    observed_rows = build_observed_products.build_observed_products(enriched_rows)
    canonical_rows, link_rows = build_canonical_layer.build_canonical_layer(
        observed_rows
    )
    giant_row, costco_row = find_proof_pair(observed_rows)
    if not giant_row or not costco_row:
        raise click.ClickException(
            "could not find BANANA proof pair across Giant and Costco observed products"
        )
    canonical_rows, link_rows, proof_rows = merge_proof_pair(
        canonical_rows, link_rows, giant_row, costco_row
    )
    # (filename, rows, field order) for each output table.
    outputs = (
        ("products_observed.csv", observed_rows, build_observed_products.OUTPUT_FIELDS),
        ("products_canonical.csv", canonical_rows, build_canonical_layer.CANONICAL_FIELDS),
        ("product_links.csv", link_rows, build_canonical_layer.LINK_FIELDS),
        ("proof_examples.csv", proof_rows, PROOF_FIELDS),
    )
    for filename, rows, fields in outputs:
        write_csv_rows(out_path / filename, rows, fields)
    click.echo(
        f"wrote combined outputs to {out_path} using {len(observed_rows)} observed rows"
    )
if __name__ == "__main__":
main()