Finalize post-refactor layout and remove old pipeline files

This commit is contained in:
ben
2026-03-24 17:09:57 -04:00
parent cdb7a15739
commit 09829b2b9d
17 changed files with 59 additions and 1154 deletions

View File

@@ -6,21 +6,15 @@ Run each script step-by-step from the terminal.
## What It Does
1. `scrape_giant.py`: download Giant orders and items
2. `enrich_giant.py`: normalize Giant line items
3. `scrape_costco.py`: download Costco orders and items
4. `enrich_costco.py`: normalize Costco line items
1. `collect_giant_web.py`: download Giant orders and items
2. `normalize_giant_web.py`: normalize Giant line items
3. `collect_costco_web.py`: download Costco orders and items
4. `normalize_costco_web.py`: normalize Costco line items
5. `build_purchases.py`: combine retailer outputs into one purchase table
6. `review_products.py`: review unresolved product matches in the terminal
7. `report_pipeline_status.py`: show how many rows survive each stage
8. `analyze_purchases.py`: write chart-ready analysis CSVs from the purchase table
Active refactor entrypoints:
- `collect_giant_web.py`
- `collect_costco_web.py`
- `normalize_giant_web.py`
- `normalize_costco_web.py`
## Requirements
- Python 3.10+
@@ -65,13 +59,20 @@ data/
collected_items.csv
normalized_items.csv
review/
catalog.csv
review_queue.csv
review_resolutions.csv
product_links.csv
purchases.csv
pipeline_status.csv
pipeline_status.json
catalog.csv
analysis/
purchases.csv
comparison_examples.csv
item_price_over_time.csv
spend_by_visit.csv
items_per_visit.csv
category_spend_over_time.csv
retailer_store_breakdown.csv
```
## Run Order
@@ -122,21 +123,21 @@ Costco:
- `data/costco-web/normalized_items.csv` preserves raw totals and matched net discount fields
Combined:
- `data/review/purchases.csv`
- `data/review/analysis/item_price_over_time.csv`
- `data/review/analysis/spend_by_visit.csv`
- `data/review/analysis/items_per_visit.csv`
- `data/review/analysis/category_spend_over_time.csv`
- `data/review/analysis/retailer_store_breakdown.csv`
- `data/analysis/purchases.csv`
- `data/analysis/comparison_examples.csv`
- `data/analysis/item_price_over_time.csv`
- `data/analysis/spend_by_visit.csv`
- `data/analysis/items_per_visit.csv`
- `data/analysis/category_spend_over_time.csv`
- `data/analysis/retailer_store_breakdown.csv`
- `data/review/review_queue.csv`
- `data/review/review_resolutions.csv`
- `data/review/product_links.csv`
- `data/review/comparison_examples.csv`
- `data/review/pipeline_status.csv`
- `data/review/pipeline_status.json`
- `data/catalog.csv`
- `data/review/catalog.csv`
`data/review/purchases.csv` is the main analysis artifact. It is designed to support both:
`data/analysis/purchases.csv` is the main analysis artifact. It is designed to support both:
- item-level price analysis
- visit-level analysis such as spend by visit, items per visit, category spend by visit, and retailer/store breakdown
@@ -164,9 +165,7 @@ The review step is intentionally conservative:
## Notes
- This project is designed around fragile retailer scraping flows, so the code favors explicit retailer-specific steps over heavy abstraction.
- `scrape_giant.py`, `scrape_costco.py`, `enrich_giant.py`, and `enrich_costco.py` are now legacy-compatible entrypoints; prefer the `collect_*` and `normalize_*` scripts for active work.
- Costco discount rows are preserved for auditability and also matched back to purchased items during enrichment.
- `validate_cross_retailer_flow.py` is a proof/check script, not a required production step.
## Test

View File

@@ -241,8 +241,8 @@ def build_retailer_store_rows(purchase_rows):
@click.command()
@click.option("--purchases-csv", default="data/review/purchases.csv", show_default=True)
@click.option("--output-dir", default="data/review/analysis", show_default=True)
@click.option("--purchases-csv", default="data/analysis/purchases.csv", show_default=True)
@click.option("--output-dir", default="data/analysis", show_default=True)
def main(purchases_csv, output_dir):
purchase_rows = read_csv_rows(purchases_csv)
output_path = Path(output_dir)

View File

@@ -1,220 +0,0 @@
import click
import re
from layer_helpers import read_csv_rows, representative_value, stable_id, write_csv_rows
# Column order for the canonical products CSV (products_canonical.csv).
CANONICAL_FIELDS = [
    "canonical_product_id",
    "canonical_name",
    "product_type",
    "brand",
    "variant",
    "size_value",
    "size_unit",
    "pack_qty",
    "measure_type",
    "normalized_quantity",
    "normalized_quantity_unit",
    "notes",
    "created_at",
    "updated_at",
]
# Packaging tokens dropped when deriving a display-friendly canonical name.
CANONICAL_DROP_TOKENS = {"CT", "COUNT", "COUNTS", "DOZ", "DOZEN", "DOZ.", "PACK"}
# Column order for the observed-to-canonical link CSV (product_links.csv).
LINK_FIELDS = [
    "observed_product_id",
    "canonical_product_id",
    "link_method",
    "link_confidence",
    "review_status",
    "reviewed_by",
    "reviewed_at",
    "link_notes",
]
def to_float(value):
    """Parse *value* as a float; return None for blanks and unparseable input."""
    try:
        parsed = float(value)
    except (TypeError, ValueError):
        return None
    return parsed
def normalized_quantity(row):
    """Derive a normalized (quantity, unit) pair from representative fields.

    Preference order: size * pack in the size unit, then pack count for
    count-measured items, then a literal "1 each", else blank strings.
    """
    size = to_float(row.get("representative_size_value"))
    pack = to_float(row.get("representative_pack_qty")) or 1.0
    unit = row.get("representative_size_unit", "")
    measure = row.get("representative_measure_type", "")
    if size is not None and unit:
        # "g" format drops trailing zeros (e.g. 5.0 -> "5").
        return format(size * pack, "g"), unit
    if row.get("representative_pack_qty") and measure == "count":
        return row["representative_pack_qty"], "count"
    if measure == "each":
        return "1", "each"
    return "", ""
def auto_link_rule(observed_row):
    """Decide how an observed product may be auto-linked to a canonical group.

    Returns (link_method, group_key, confidence); all three are empty strings
    when no rule applies (fees, discounts, coupons, or insufficient identity).
    """
    # Fee/discount/coupon lines never get a canonical product.
    skip_flags = ("is_fee", "is_discount_line", "is_coupon_line")
    if any(observed_row.get(flag) == "true" for flag in skip_flags):
        return "", "", ""
    upc = observed_row.get("representative_upc")
    if upc:
        # A UPC is the strongest identity signal.
        return "exact_upc", f"upc={upc}", "high"
    name = observed_row.get("representative_name_norm")
    size = observed_row.get("representative_size_value")
    unit = observed_row.get("representative_size_unit")
    if name and size and unit:
        group_key = "|".join(
            [
                f"name={name}",
                f"size={size}",
                f"unit={unit}",
                f"pack={observed_row['representative_pack_qty']}",
                f"measure={observed_row['representative_measure_type']}",
            ]
        )
        return "exact_name_size", group_key, "high"
    return "", "", ""
def clean_canonical_name(name):
    """Strip punctuation and packaging-noise tokens from a product name.

    Drops pure numbers, tokens in CANONICAL_DROP_TOKENS, and count-style
    tokens such as "12PK", "6PACK", or "5DZ".
    """
    upper = (name or "").upper()
    spaced = re.sub(r"[^A-Z0-9\s]", " ", upper)
    kept = []
    for token in spaced.split():
        noisy = (
            token.isdigit()
            or token in CANONICAL_DROP_TOKENS
            or re.fullmatch(r"\d+(?:PK|PACK)", token) is not None
            or re.fullmatch(r"\d+DZ", token) is not None
        )
        if not noisy:
            kept.append(token)
    return " ".join(kept).strip()
def canonical_row_for_group(canonical_product_id, group_rows, link_method):
    """Build one canonical product row from the observed rows in its group.

    Representative attribute values are chosen across *group_rows* via
    representative_value; the normalized quantity is derived from those
    representative size/pack/measure fields.
    """
    quantity_value, quantity_unit = normalized_quantity(
        {
            "representative_size_value": representative_value(
                group_rows, "representative_size_value"
            ),
            "representative_size_unit": representative_value(
                group_rows, "representative_size_unit"
            ),
            "representative_pack_qty": representative_value(
                group_rows, "representative_pack_qty"
            ),
            "representative_measure_type": representative_value(
                group_rows, "representative_measure_type"
            ),
        }
    )
    return {
        "canonical_product_id": canonical_product_id,
        # Fall back to the raw normalized name if cleaning strips everything.
        "canonical_name": clean_canonical_name(
            representative_value(group_rows, "representative_name_norm")
        )
        or representative_value(group_rows, "representative_name_norm"),
        "product_type": "",
        "brand": representative_value(group_rows, "representative_brand"),
        "variant": representative_value(group_rows, "representative_variant"),
        "size_value": representative_value(group_rows, "representative_size_value"),
        "size_unit": representative_value(group_rows, "representative_size_unit"),
        "pack_qty": representative_value(group_rows, "representative_pack_qty"),
        "measure_type": representative_value(group_rows, "representative_measure_type"),
        "normalized_quantity": quantity_value,
        "normalized_quantity_unit": quantity_unit,
        "notes": f"auto-linked via {link_method}",
        "created_at": "",
        "updated_at": "",
    }
def build_canonical_layer(observed_rows):
    """Auto-link observed products into canonical groups.

    Returns (canonical_rows, link_rows): one canonical row per auto-linked
    group and one link row per observed product that matched a rule.
    Observed rows matching no rule (fees, discounts, identity-poor rows)
    are skipped entirely.
    """
    canonical_rows = []
    link_rows = []
    groups = {}
    # Sort for deterministic link output regardless of input order.
    for observed_row in sorted(observed_rows, key=lambda row: row["observed_product_id"]):
        link_method, group_key, confidence = auto_link_rule(observed_row)
        if not group_key:
            # No auto-link rule applied; left for human review elsewhere.
            continue
        # Same method+key always hashes to the same canonical id.
        canonical_product_id = stable_id("gcan", f"{link_method}|{group_key}")
        groups.setdefault(canonical_product_id, {"method": link_method, "rows": []})
        groups[canonical_product_id]["rows"].append(observed_row)
        link_rows.append(
            {
                "observed_product_id": observed_row["observed_product_id"],
                "canonical_product_id": canonical_product_id,
                "link_method": link_method,
                "link_confidence": confidence,
                "review_status": "",
                "reviewed_by": "",
                "reviewed_at": "",
                "link_notes": "",
            }
        )
    # Emit canonical rows in stable id order.
    for canonical_product_id, group in sorted(groups.items()):
        canonical_rows.append(
            canonical_row_for_group(
                canonical_product_id, group["rows"], group["method"]
            )
        )
    return canonical_rows, link_rows
@click.command()
@click.option(
    "--observed-csv",
    default="giant_output/products_observed.csv",
    show_default=True,
    help="Path to observed product rows.",
)
@click.option(
    "--canonical-csv",
    default="giant_output/products_canonical.csv",
    show_default=True,
    help="Path to canonical product output.",
)
@click.option(
    "--links-csv",
    default="giant_output/product_links.csv",
    show_default=True,
    help="Path to observed-to-canonical link output.",
)
def main(observed_csv, canonical_csv, links_csv):
    """Read observed products, auto-link them, and write canonical + link CSVs."""
    observed_rows = read_csv_rows(observed_csv)
    canonical_rows, link_rows = build_canonical_layer(observed_rows)
    write_csv_rows(canonical_csv, canonical_rows, CANONICAL_FIELDS)
    write_csv_rows(links_csv, link_rows, LINK_FIELDS)
    click.echo(
        f"wrote {len(canonical_rows)} canonical rows to {canonical_csv} and "
        f"{len(link_rows)} links to {links_csv}"
    )


if __name__ == "__main__":
    main()

View File

@@ -1,172 +0,0 @@
from collections import defaultdict
import click
from layer_helpers import (
compact_join,
distinct_values,
first_nonblank,
read_csv_rows,
representative_value,
stable_id,
write_csv_rows,
)
# Column order for the observed-products CSV (products_observed.csv).
OUTPUT_FIELDS = [
    "observed_product_id",
    "retailer",
    "observed_key",
    "representative_retailer_item_id",
    "representative_upc",
    "representative_item_name",
    "representative_name_norm",
    "representative_brand",
    "representative_variant",
    "representative_size_value",
    "representative_size_unit",
    "representative_pack_qty",
    "representative_measure_type",
    "representative_image_url",
    "is_store_brand",
    "is_fee",
    "is_discount_line",
    "is_coupon_line",
    "first_seen_date",
    "last_seen_date",
    "times_seen",
    "example_order_id",
    "example_item_name",
    "raw_name_examples",
    "normalized_name_examples",
    "example_prices",
    "distinct_item_names_count",
    "distinct_retailer_item_ids_count",
    "distinct_upcs_count",
]
def build_observed_key(row):
    """Build the grouping key that defines one observed product.

    Identity preference: UPC, then retailer item id (with discount/coupon
    flags), then a full name/size/pack/flags fallback.
    """
    retailer = row["retailer"]
    name_part = f"name={row['item_name_norm']}"
    if row.get("upc"):
        return "|".join([retailer, f"upc={row['upc']}", name_part])
    if row.get("retailer_item_id"):
        id_parts = [
            retailer,
            f"retailer_item_id={row['retailer_item_id']}",
            name_part,
            # Discount/coupon flags keep adjustment lines distinct from items.
            f"discount={row.get('is_discount_line', 'false')}",
            f"coupon={row.get('is_coupon_line', 'false')}",
        ]
        return "|".join(id_parts)
    fallback_parts = [
        retailer,
        name_part,
        f"size={row['size_value']}",
        f"unit={row['size_unit']}",
        f"pack={row['pack_qty']}",
        f"measure={row['measure_type']}",
        f"store_brand={row['is_store_brand']}",
        f"fee={row['is_fee']}",
    ]
    return "|".join(fallback_parts)
def build_observed_products(rows):
    """Aggregate enriched item rows into one row per observed product.

    Rows are grouped by build_observed_key; each group becomes a single
    observed-product row carrying representative attributes, first/last
    seen dates, and example values for human review.
    """
    grouped = defaultdict(list)
    for row in rows:
        grouped[build_observed_key(row)].append(row)
    observed_rows = []
    for observed_key, group_rows in sorted(grouped.items()):
        # Chronological order so first/last seen and examples are stable.
        ordered = sorted(
            group_rows,
            key=lambda row: (row["order_date"], row["order_id"], int(row["line_no"])),
        )
        observed_rows.append(
            {
                "observed_product_id": stable_id("gobs", observed_key),
                "retailer": ordered[0]["retailer"],
                "observed_key": observed_key,
                "representative_retailer_item_id": representative_value(
                    ordered, "retailer_item_id"
                ),
                "representative_upc": representative_value(ordered, "upc"),
                "representative_item_name": representative_value(ordered, "item_name"),
                "representative_name_norm": representative_value(
                    ordered, "item_name_norm"
                ),
                "representative_brand": representative_value(ordered, "brand_guess"),
                "representative_variant": representative_value(ordered, "variant"),
                "representative_size_value": representative_value(ordered, "size_value"),
                "representative_size_unit": representative_value(ordered, "size_unit"),
                "representative_pack_qty": representative_value(ordered, "pack_qty"),
                "representative_measure_type": representative_value(
                    ordered, "measure_type"
                ),
                # First non-blank image, not the most common value.
                "representative_image_url": first_nonblank(ordered, "image_url"),
                "is_store_brand": representative_value(ordered, "is_store_brand"),
                "is_fee": representative_value(ordered, "is_fee"),
                "is_discount_line": representative_value(
                    ordered, "is_discount_line"
                ),
                "is_coupon_line": representative_value(ordered, "is_coupon_line"),
                "first_seen_date": ordered[0]["order_date"],
                "last_seen_date": ordered[-1]["order_date"],
                "times_seen": str(len(ordered)),
                "example_order_id": ordered[0]["order_id"],
                "example_item_name": ordered[0]["item_name"],
                "raw_name_examples": compact_join(
                    distinct_values(ordered, "item_name"), limit=4
                ),
                "normalized_name_examples": compact_join(
                    distinct_values(ordered, "item_name_norm"), limit=4
                ),
                "example_prices": compact_join(
                    distinct_values(ordered, "line_total"), limit=4
                ),
                "distinct_item_names_count": str(
                    len(distinct_values(ordered, "item_name"))
                ),
                "distinct_retailer_item_ids_count": str(
                    len(distinct_values(ordered, "retailer_item_id"))
                ),
                "distinct_upcs_count": str(len(distinct_values(ordered, "upc"))),
            }
        )
    observed_rows.sort(key=lambda row: row["observed_product_id"])
    return observed_rows
@click.command()
@click.option(
    "--items-enriched-csv",
    default="giant_output/items_enriched.csv",
    show_default=True,
    help="Path to enriched Giant item rows.",
)
@click.option(
    "--output-csv",
    default="giant_output/products_observed.csv",
    show_default=True,
    help="Path to observed product output.",
)
def main(items_enriched_csv, output_csv):
    """Group enriched item rows into observed products and write the CSV."""
    rows = read_csv_rows(items_enriched_csv)
    observed_rows = build_observed_products(rows)
    write_csv_rows(output_csv, observed_rows, OUTPUT_FIELDS)
    click.echo(f"wrote {len(observed_rows)} rows to {output_csv}")


if __name__ == "__main__":
    main()

View File

@@ -440,10 +440,10 @@ def build_comparison_examples(purchase_rows):
@click.option("--giant-orders-csv", default="data/giant-web/collected_orders.csv", show_default=True)
@click.option("--costco-orders-csv", default="data/costco-web/collected_orders.csv", show_default=True)
@click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True)
@click.option("--catalog-csv", default="data/catalog.csv", show_default=True)
@click.option("--catalog-csv", default="data/review/catalog.csv", show_default=True)
@click.option("--links-csv", default="data/review/product_links.csv", show_default=True)
@click.option("--output-csv", default="data/review/purchases.csv", show_default=True)
@click.option("--examples-csv", default="data/review/comparison_examples.csv", show_default=True)
@click.option("--output-csv", default="data/analysis/purchases.csv", show_default=True)
@click.option("--examples-csv", default="data/analysis/comparison_examples.csv", show_default=True)
def main(
giant_items_enriched_csv,
costco_items_enriched_csv,

View File

@@ -1,175 +0,0 @@
from collections import defaultdict
from datetime import date
import click
from layer_helpers import compact_join, distinct_values, read_csv_rows, stable_id, write_csv_rows
# Column order for the review queue CSV (review_queue.csv).
OUTPUT_FIELDS = [
    "review_id",
    "queue_type",
    "retailer",
    "observed_product_id",
    "canonical_product_id",
    "reason_code",
    "priority",
    "raw_item_names",
    "normalized_names",
    "upc",
    "image_url",
    "example_prices",
    "seen_count",
    "status",
    "resolution_notes",
    "created_at",
    "updated_at",
]
def existing_review_state(path):
    """Load prior review rows keyed by review_id; empty dict if file absent."""
    try:
        prior_rows = read_csv_rows(path)
    except FileNotFoundError:
        # First run: no prior state to preserve.
        return {}
    return {prior["review_id"]: prior for prior in prior_rows}
def review_reasons(observed_row):
    """List (reason_code, priority) pairs for why this observed product needs review.

    Fee/discount/coupon lines are never queued and return an empty list.
    """
    is_adjustment = (
        observed_row["is_fee"] == "true"
        or observed_row.get("is_discount_line") == "true"
        or observed_row.get("is_coupon_line") == "true"
    )
    if is_adjustment:
        return []
    reasons = []
    # "", "0", and "1" all mean "not ambiguous" for the distinct-count fields.
    unambiguous = {"", "0", "1"}
    if observed_row["distinct_upcs_count"] not in unambiguous:
        reasons.append(("multiple_upcs", "high"))
    if observed_row["distinct_item_names_count"] not in unambiguous:
        reasons.append(("multiple_raw_names", "medium"))
    if not observed_row["representative_image_url"]:
        reasons.append(("missing_image", "medium"))
    if not observed_row["representative_upc"]:
        reasons.append(("missing_upc", "high"))
    if not observed_row["representative_name_norm"]:
        reasons.append(("missing_normalized_name", "high"))
    return reasons
def build_review_queue(observed_rows, item_rows, existing_rows, today_text):
    """Build the review queue, one row per (observed product, reason).

    Carries forward status, notes, canonical link, and created_at from
    *existing_rows* so reruns do not clobber prior human decisions.
    """
    by_observed = defaultdict(list)
    for row in item_rows:
        observed_id = row.get("observed_product_id", "")
        if observed_id:
            by_observed[observed_id].append(row)
    queue_rows = []
    for observed_row in observed_rows:
        reasons = review_reasons(observed_row)
        if not reasons:
            continue
        related_items = by_observed.get(observed_row["observed_product_id"], [])
        raw_names = compact_join(distinct_values(related_items, "item_name"), limit=5)
        norm_names = compact_join(
            distinct_values(related_items, "item_name_norm"), limit=5
        )
        example_prices = compact_join(
            distinct_values(related_items, "line_total"), limit=5
        )
        for reason_code, priority in reasons:
            # Stable id per (product, reason) so prior state can be matched.
            review_id = stable_id(
                "rvw",
                f"{observed_row['observed_product_id']}|{reason_code}",
            )
            prior = existing_rows.get(review_id, {})
            queue_rows.append(
                {
                    "review_id": review_id,
                    "queue_type": "observed_product",
                    "retailer": observed_row["retailer"],
                    "observed_product_id": observed_row["observed_product_id"],
                    "canonical_product_id": prior.get("canonical_product_id", ""),
                    "reason_code": reason_code,
                    "priority": priority,
                    "raw_item_names": raw_names,
                    "normalized_names": norm_names,
                    "upc": observed_row["representative_upc"],
                    "image_url": observed_row["representative_image_url"],
                    "example_prices": example_prices,
                    "seen_count": observed_row["times_seen"],
                    "status": prior.get("status", "pending"),
                    "resolution_notes": prior.get("resolution_notes", ""),
                    "created_at": prior.get("created_at", today_text),
                    "updated_at": today_text,
                }
            )
    queue_rows.sort(key=lambda row: (row["priority"], row["reason_code"], row["review_id"]))
    return queue_rows
def attach_observed_ids(item_rows, observed_rows):
    """Return copies of *item_rows*, each stamped with its observed_product_id.

    The key built here must mirror the key used when the observed products
    were created; unmatched items get an empty id. Input rows are not mutated.
    """
    id_by_key = {
        observed["observed_key"]: observed["observed_product_id"]
        for observed in observed_rows
    }
    stamped_rows = []
    for row in item_rows:
        if row.get("upc"):
            key_parts = [
                row["retailer"],
                f"upc={row['upc']}",
                f"name={row['item_name_norm']}",
            ]
        else:
            key_parts = [
                row["retailer"],
                f"retailer_item_id={row.get('retailer_item_id', '')}",
                f"name={row['item_name_norm']}",
                f"size={row['size_value']}",
                f"unit={row['size_unit']}",
                f"pack={row['pack_qty']}",
                f"measure={row['measure_type']}",
                f"store_brand={row['is_store_brand']}",
                f"fee={row['is_fee']}",
                f"discount={row.get('is_discount_line', 'false')}",
                f"coupon={row.get('is_coupon_line', 'false')}",
            ]
        stamped = dict(row)
        stamped["observed_product_id"] = id_by_key.get("|".join(key_parts), "")
        stamped_rows.append(stamped)
    return stamped_rows
@click.command()
@click.option(
    "--observed-csv",
    default="giant_output/products_observed.csv",
    show_default=True,
    help="Path to observed product rows.",
)
@click.option(
    "--items-enriched-csv",
    default="giant_output/items_enriched.csv",
    show_default=True,
    help="Path to enriched Giant item rows.",
)
@click.option(
    "--output-csv",
    default="giant_output/review_queue.csv",
    show_default=True,
    help="Path to review queue output.",
)
def main(observed_csv, items_enriched_csv, output_csv):
    """Rebuild the review queue CSV, preserving prior human decisions."""
    observed_rows = read_csv_rows(observed_csv)
    item_rows = read_csv_rows(items_enriched_csv)
    # Stamp items with observed ids so queue rows can show example values.
    item_rows = attach_observed_ids(item_rows, observed_rows)
    # The existing output file doubles as persisted review state.
    existing_rows = existing_review_state(output_csv)
    today_text = str(date.today())
    queue_rows = build_review_queue(observed_rows, item_rows, existing_rows, today_text)
    write_csv_rows(output_csv, queue_rows, OUTPUT_FIELDS)
    click.echo(f"wrote {len(queue_rows)} rows to {output_csv}")


if __name__ == "__main__":
    main()

View File

@@ -110,8 +110,15 @@ data/
review/
review_queue.csv # Human review queue for unresolved matching/parsing cases.
product_links.csv # Links from normalized retailer items to catalog items.
catalog.csv # Cross-retailer product catalog entities used for comparison.
purchases.csv
catalog.csv # Cross-retailer product catalog entities used for comparison.
analysis/
purchases.csv
comparison_examples.csv
item_price_over_time.csv
spend_by_visit.csv
items_per_visit.csv
category_spend_over_time.csv
retailer_store_breakdown.csv
#+end_example
Notes:
@@ -223,7 +230,7 @@ Notes:
- Valid `normalization_basis` values should be explicit, e.g. `exact_upc`, `exact_retailer_item_id`, `exact_name_size_pack`, or `approved_retailer_alias`.
- Do not use fuzzy or semantic matching to assign `normalized_item_id`.
- Discount/coupon rows may remain as standalone normalized rows for auditability even when their amounts are attached to a purchased row via `matched_discount_amount`.
- Cross-retailer identity is handled later in review/combine via `catalog.csv` and `product_links.csv`.
- Cross-retailer identity is handled later in review/combine via `data/review/catalog.csv` and `product_links.csv`.
** `data/review/product_links.csv`
One row per review-approved link from a normalized retailer item to a catalog item.
@@ -263,7 +270,7 @@ One row per issue needing human review.
| `resolution_notes` | reviewer notes |
| `created_at` | creation timestamp or date |
| `updated_at` | last update timestamp or date |
** `data/catalog.csv`
** `data/review/catalog.csv`
One row per cross-retailer catalog product.
| key | definition |
|----------------------------+----------------------------------------|
@@ -288,7 +295,7 @@ Notes:
- Do not encode packaging/count into `catalog_name` unless it is essential to product identity.
- `catalog_name` should come from review-approved naming, not raw retailer strings.
** `data/purchases.csv`
** `data/analysis/purchases.csv`
One row per purchased item (i.e., `is_item`==true from normalized layer), with
catalog attributes denormalized in and discounts already applied.

View File

@@ -85,10 +85,10 @@ def build_status_summary(
@click.option("--costco-orders-csv", default="data/costco-web/collected_orders.csv", show_default=True)
@click.option("--costco-items-csv", default="data/costco-web/collected_items.csv", show_default=True)
@click.option("--costco-enriched-csv", default="data/costco-web/normalized_items.csv", show_default=True)
@click.option("--purchases-csv", default="data/review/purchases.csv", show_default=True)
@click.option("--purchases-csv", default="data/analysis/purchases.csv", show_default=True)
@click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True)
@click.option("--links-csv", default="data/review/product_links.csv", show_default=True)
@click.option("--catalog-csv", default="data/catalog.csv", show_default=True)
@click.option("--catalog-csv", default="data/review/catalog.csv", show_default=True)
@click.option("--summary-csv", default="data/review/pipeline_status.csv", show_default=True)
@click.option("--summary-json", default="data/review/pipeline_status.json", show_default=True)
def main(

View File

@@ -562,10 +562,10 @@ def link_rows_from_state(link_lookup):
@click.option("--costco-items-enriched-csv", default="data/costco-web/normalized_items.csv", show_default=True)
@click.option("--giant-orders-csv", default="data/giant-web/collected_orders.csv", show_default=True)
@click.option("--costco-orders-csv", default="data/costco-web/collected_orders.csv", show_default=True)
@click.option("--purchases-csv", default="data/review/purchases.csv", show_default=True)
@click.option("--purchases-csv", default="data/analysis/purchases.csv", show_default=True)
@click.option("--queue-csv", default="data/review/review_queue.csv", show_default=True)
@click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True)
@click.option("--catalog-csv", default="data/catalog.csv", show_default=True)
@click.option("--catalog-csv", default="data/review/catalog.csv", show_default=True)
@click.option("--links-csv", default="data/review/product_links.csv", show_default=True)
@click.option("--limit", default=0, show_default=True, type=int)
@click.option("--refresh-only", is_flag=True, help="Only rebuild review_queue.csv without prompting.")

View File

@@ -1,5 +0,0 @@
# Legacy-compatible shim: re-export everything from scrape_giant so old
# invocations of this script keep working after the refactor.
from scrape_giant import *  # noqa: F401,F403

if __name__ == "__main__":
    # main comes from the star import above.
    main()

View File

@@ -1,119 +0,0 @@
import unittest
import build_canonical_layer
class CanonicalLayerTests(unittest.TestCase):
    """Unit tests for build_canonical_layer's auto-linking and name cleanup."""

    def test_build_canonical_layer_auto_links_exact_upc_and_name_size_only(self):
        # Six observed rows: two sharing a UPC, two sharing name+size, one
        # fee row and one identity-poor row that must not be linked at all.
        observed_rows = [
            {
                "observed_product_id": "gobs_1",
                "representative_upc": "111",
                "representative_retailer_item_id": "11",
                "representative_name_norm": "GALA APPLE",
                "representative_brand": "SB",
                "representative_variant": "",
                "representative_size_value": "5",
                "representative_size_unit": "lb",
                "representative_pack_qty": "",
                "representative_measure_type": "weight",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
            },
            {
                "observed_product_id": "gobs_2",
                "representative_upc": "111",
                "representative_retailer_item_id": "12",
                "representative_name_norm": "LARGE WHITE EGGS",
                "representative_brand": "SB",
                "representative_variant": "",
                "representative_size_value": "",
                "representative_size_unit": "",
                "representative_pack_qty": "18",
                "representative_measure_type": "count",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
            },
            {
                "observed_product_id": "gobs_3",
                "representative_upc": "",
                "representative_retailer_item_id": "21",
                "representative_name_norm": "ROTINI",
                "representative_brand": "",
                "representative_variant": "",
                "representative_size_value": "16",
                "representative_size_unit": "oz",
                "representative_pack_qty": "",
                "representative_measure_type": "weight",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
            },
            {
                "observed_product_id": "gobs_4",
                "representative_upc": "",
                "representative_retailer_item_id": "22",
                "representative_name_norm": "ROTINI",
                "representative_brand": "SB",
                "representative_variant": "",
                "representative_size_value": "16",
                "representative_size_unit": "oz",
                "representative_pack_qty": "",
                "representative_measure_type": "weight",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
            },
            {
                "observed_product_id": "gobs_5",
                "representative_upc": "",
                "representative_retailer_item_id": "99",
                "representative_name_norm": "GL BAG CHARGE",
                "representative_brand": "",
                "representative_variant": "",
                "representative_size_value": "",
                "representative_size_unit": "",
                "representative_pack_qty": "",
                "representative_measure_type": "each",
                "is_fee": "true",
                "is_discount_line": "false",
                "is_coupon_line": "false",
            },
            {
                "observed_product_id": "gobs_6",
                "representative_upc": "",
                "representative_retailer_item_id": "",
                "representative_name_norm": "LIME",
                "representative_brand": "",
                "representative_variant": "",
                "representative_size_value": "",
                "representative_size_unit": "",
                "representative_pack_qty": "",
                "representative_measure_type": "each",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
            },
        ]
        canonicals, links = build_canonical_layer.build_canonical_layer(observed_rows)
        # Two canonical groups (one per shared identity), four linked rows.
        self.assertEqual(2, len(canonicals))
        self.assertEqual(4, len(links))
        methods = {row["observed_product_id"]: row["link_method"] for row in links}
        self.assertEqual("exact_upc", methods["gobs_1"])
        self.assertEqual("exact_upc", methods["gobs_2"])
        self.assertEqual("exact_name_size", methods["gobs_3"])
        self.assertEqual("exact_name_size", methods["gobs_4"])
        # Fee rows and rows without enough identity are never auto-linked.
        self.assertNotIn("gobs_5", methods)
        self.assertNotIn("gobs_6", methods)

    def test_clean_canonical_name_removes_packaging_noise(self):
        # Punctuation and count/dozen tokens are stripped from names.
        self.assertEqual("LIME", build_canonical_layer.clean_canonical_name("LIME . / ."))
        self.assertEqual("EGG", build_canonical_layer.clean_canonical_name("5DZ EGG / /"))


if __name__ == "__main__":
    unittest.main()

View File

@@ -7,7 +7,6 @@ from unittest import mock
import enrich_costco
import scrape_costco
import validate_cross_retailer_flow
class CostcoPipelineTests(unittest.TestCase):
@@ -423,76 +422,6 @@ class CostcoPipelineTests(unittest.TestCase):
self.assertIn("matched_discount=4873222", purchase_row["parse_notes"])
self.assertIn("matched_to_item=4873222", discount_row["parse_notes"])
def test_cross_retailer_validation_writes_proof_example(self):
with tempfile.TemporaryDirectory() as tmpdir:
giant_csv = Path(tmpdir) / "giant_items_enriched.csv"
costco_csv = Path(tmpdir) / "costco_items_enriched.csv"
outdir = Path(tmpdir) / "combined"
fieldnames = enrich_costco.OUTPUT_FIELDS
giant_row = {field: "" for field in fieldnames}
giant_row.update(
{
"retailer": "giant",
"order_id": "g1",
"line_no": "1",
"order_date": "2026-03-01",
"retailer_item_id": "100",
"item_name": "FRESH BANANA",
"item_name_norm": "BANANA",
"upc": "4011",
"measure_type": "weight",
"is_store_brand": "false",
"is_fee": "false",
"is_discount_line": "false",
"is_coupon_line": "false",
"line_total": "1.29",
}
)
costco_row = {field: "" for field in fieldnames}
costco_row.update(
{
"retailer": "costco",
"order_id": "c1",
"line_no": "1",
"order_date": "2026-03-12",
"retailer_item_id": "30669",
"item_name": "BANANAS 3 LB / 1.36 KG",
"item_name_norm": "BANANA",
"upc": "",
"size_value": "3",
"size_unit": "lb",
"measure_type": "weight",
"is_store_brand": "false",
"is_fee": "false",
"is_discount_line": "false",
"is_coupon_line": "false",
"line_total": "2.98",
}
)
with giant_csv.open("w", newline="", encoding="utf-8") as handle:
writer = csv.DictWriter(handle, fieldnames=fieldnames)
writer.writeheader()
writer.writerow(giant_row)
with costco_csv.open("w", newline="", encoding="utf-8") as handle:
writer = csv.DictWriter(handle, fieldnames=fieldnames)
writer.writeheader()
writer.writerow(costco_row)
validate_cross_retailer_flow.main.callback(
giant_items_enriched_csv=str(giant_csv),
costco_items_enriched_csv=str(costco_csv),
outdir=str(outdir),
)
proof_path = outdir / "proof_examples.csv"
self.assertTrue(proof_path.exists())
with proof_path.open(newline="", encoding="utf-8") as handle:
rows = list(csv.DictReader(handle))
self.assertEqual(1, len(rows))
self.assertEqual("banana", rows[0]["proof_name"])
def test_main_writes_summary_request_metadata(self):
with tempfile.TemporaryDirectory() as tmpdir:
outdir = Path(tmpdir) / "costco_output"

View File

@@ -1,67 +0,0 @@
import unittest
import build_observed_products
class ObservedProductTests(unittest.TestCase):
    """Unit tests for grouping item rows into observed products."""

    def test_build_observed_products_aggregates_rows_with_same_key(self):
        # Two orders of the same item (same retailer_item_id and UPC) that
        # differ only in raw name spacing, image, and price.
        rows = [
            {
                "retailer": "giant",
                "order_id": "1",
                "line_no": "1",
                "order_date": "2026-01-01",
                "item_name": "SB GALA APPLE 5LB",
                "item_name_norm": "GALA APPLE",
                "retailer_item_id": "11",
                "upc": "111",
                "brand_guess": "SB",
                "variant": "",
                "size_value": "5",
                "size_unit": "lb",
                "pack_qty": "",
                "measure_type": "weight",
                "image_url": "https://example.test/a.jpg",
                "is_store_brand": "true",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
                "line_total": "7.99",
            },
            {
                "retailer": "giant",
                "order_id": "2",
                "line_no": "1",
                "order_date": "2026-01-10",
                "item_name": "SB GALA APPLE 5 LB",
                "item_name_norm": "GALA APPLE",
                "retailer_item_id": "11",
                "upc": "111",
                "brand_guess": "SB",
                "variant": "",
                "size_value": "5",
                "size_unit": "lb",
                "pack_qty": "",
                "measure_type": "weight",
                "image_url": "",
                "is_store_brand": "true",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
                "line_total": "8.49",
            },
        ]
        observed = build_observed_products.build_observed_products(rows)
        # Both rows collapse into one observed product seen twice.
        self.assertEqual(1, len(observed))
        self.assertEqual("2", observed[0]["times_seen"])
        self.assertEqual("2026-01-01", observed[0]["first_seen_date"])
        self.assertEqual("2026-01-10", observed[0]["last_seen_date"])
        self.assertEqual("11", observed[0]["representative_retailer_item_id"])
        self.assertEqual("111", observed[0]["representative_upc"])
        self.assertIn("SB GALA APPLE 5LB", observed[0]["raw_name_examples"])


if __name__ == "__main__":
    unittest.main()

View File

@@ -65,6 +65,21 @@ class PipelineStatusTests(unittest.TestCase):
},
],
resolutions=[],
links=[
{
"normalized_item_id": "gnorm_banana",
"catalog_id": "cat_banana",
"review_status": "approved",
}
],
catalog=[
{
"catalog_id": "cat_banana",
"catalog_name": "BANANA",
"product_type": "banana",
"category": "produce",
}
],
)
counts = {row["stage"]: row["count"] for row in summary}

View File

@@ -1,133 +0,0 @@
import tempfile
import unittest
from pathlib import Path
import build_observed_products
import build_review_queue
from layer_helpers import write_csv_rows
class ReviewQueueTests(unittest.TestCase):
    """Behavioral tests for build_review_queue's queue builder and CLI entry."""

    def test_build_review_queue_preserves_existing_status(self):
        # An observed product with a missing image and multiple raw names
        # should be flagged; a previously resolved flag keeps its status.
        observed_rows = [
            {
                "observed_product_id": "gobs_1",
                "retailer": "giant",
                "representative_upc": "111",
                "representative_image_url": "",
                "representative_name_norm": "GALA APPLE",
                "times_seen": "2",
                "distinct_item_names_count": "2",
                "distinct_upcs_count": "1",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
            }
        ]
        shared_item_fields = {
            "observed_product_id": "gobs_1",
            "item_name_norm": "GALA APPLE",
        }
        item_rows = [
            dict(shared_item_fields, item_name="SB GALA APPLE 5LB", line_total="7.99"),
            dict(shared_item_fields, item_name="SB GALA APPLE 5 LB", line_total="8.49"),
        ]
        resolved_queue_id = build_review_queue.stable_id("rvw", "gobs_1|missing_image")
        existing = {
            resolved_queue_id: {
                "status": "approved",
                "resolution_notes": "looked fine",
                "created_at": "2026-03-15",
            }
        }
        queue = build_review_queue.build_review_queue(
            observed_rows, item_rows, existing, "2026-03-16"
        )
        self.assertEqual(2, len(queue))
        flagged = [row for row in queue if row["reason_code"] == "missing_image"]
        self.assertEqual("approved", flagged[0]["status"])
        self.assertEqual("looked fine", flagged[0]["resolution_notes"])

    def test_review_queue_main_writes_output(self):
        # End-to-end: write observed + enriched CSV fixtures, invoke the CLI
        # callback directly, and confirm the queue file lands on disk.
        with tempfile.TemporaryDirectory() as tmpdir:
            workdir = Path(tmpdir)
            observed_path = workdir / "products_observed.csv"
            items_path = workdir / "items_enriched.csv"
            output_path = workdir / "review_queue.csv"
            observed_rows = [
                {
                    "observed_product_id": "gobs_1",
                    "retailer": "giant",
                    "observed_key": "giant|upc=111|name=GALA APPLE",
                    "representative_retailer_item_id": "11",
                    "representative_upc": "111",
                    "representative_item_name": "SB GALA APPLE 5LB",
                    "representative_name_norm": "GALA APPLE",
                    "representative_brand": "SB",
                    "representative_variant": "",
                    "representative_size_value": "5",
                    "representative_size_unit": "lb",
                    "representative_pack_qty": "",
                    "representative_measure_type": "weight",
                    "representative_image_url": "",
                    "is_store_brand": "true",
                    "is_fee": "false",
                    "is_discount_line": "false",
                    "is_coupon_line": "false",
                    "first_seen_date": "2026-01-01",
                    "last_seen_date": "2026-01-10",
                    "times_seen": "2",
                    "example_order_id": "1",
                    "example_item_name": "SB GALA APPLE 5LB",
                    "raw_name_examples": "SB GALA APPLE 5LB | SB GALA APPLE 5 LB",
                    "normalized_name_examples": "GALA APPLE",
                    "example_prices": "7.99 | 8.49",
                    "distinct_item_names_count": "2",
                    "distinct_retailer_item_ids_count": "1",
                    "distinct_upcs_count": "1",
                }
            ]
            item_rows = [
                {
                    "retailer": "giant",
                    "order_id": "1",
                    "line_no": "1",
                    "item_name": "SB GALA APPLE 5LB",
                    "item_name_norm": "GALA APPLE",
                    "retailer_item_id": "11",
                    "upc": "111",
                    "size_value": "5",
                    "size_unit": "lb",
                    "pack_qty": "",
                    "measure_type": "weight",
                    "is_store_brand": "true",
                    "is_fee": "false",
                    "is_discount_line": "false",
                    "is_coupon_line": "false",
                    "line_total": "7.99",
                }
            ]
            write_csv_rows(
                observed_path, observed_rows, build_observed_products.OUTPUT_FIELDS
            )
            write_csv_rows(items_path, item_rows, list(item_rows[0].keys()))
            build_review_queue.main.callback(
                observed_csv=str(observed_path),
                items_enriched_csv=str(items_path),
                output_csv=str(output_path),
            )
            self.assertTrue(output_path.exists())
if __name__ == "__main__":
unittest.main()

View File

@@ -3,7 +3,7 @@ import tempfile
import unittest
from pathlib import Path
import scraper
import scrape_giant as scraper
class ScraperTests(unittest.TestCase):

View File

@@ -1,154 +0,0 @@
import csv
import json
from pathlib import Path

import click

import build_canonical_layer
import build_observed_products
from layer_helpers import stable_id, write_csv_rows
# Column order for the proof_examples.csv output written by main().
PROOF_FIELDS = [
    "proof_name",
    "canonical_product_id",
    "giant_observed_product_id",
    "costco_observed_product_id",
    "giant_example_item",
    "costco_example_item",
    "notes",
]
def read_rows(path):
    """Load a CSV file and return its data rows as dicts.

    Args:
        path: Path or string pointing at a CSV file with a header row.

    Returns:
        list[dict[str, str]]: one dict per data row, keyed by the header.
    """
    # csv is imported at module level (was a per-call function-scope import).
    with Path(path).open(newline="", encoding="utf-8") as handle:
        return list(csv.DictReader(handle))
def find_proof_pair(observed_rows):
    """Locate the BANANA observed-product row for each retailer.

    Returns a (giant_row, costco_row) tuple; either element is None when no
    matching row exists. When several rows match, the last one wins.
    """
    giant_match = None
    costco_match = None
    for candidate in observed_rows:
        if candidate["representative_name_norm"] != "BANANA":
            continue
        if candidate["retailer"] == "giant":
            giant_match = candidate
        if candidate["retailer"] == "costco":
            costco_match = candidate
    return giant_match, costco_match
def merge_proof_pair(canonical_rows, link_rows, giant_row, costco_row):
    """Replace both retailers' automatic links with one manual canonical merge.

    Builds a single BANANA canonical product, relinks the Giant and Costco
    observed rows to it, and emits a proof row describing the pair. When
    either retailer row is missing, the inputs pass through unchanged with
    an empty proof list.

    Returns:
        tuple: (canonical_rows, link_rows, proof_rows) after the merge.
    """
    if not giant_row or not costco_row:
        return canonical_rows, link_rows, []
    proof_canonical_id = stable_id("gcan", "proof|banana")
    merged_observed_ids = {
        giant_row["observed_product_id"],
        costco_row["observed_product_id"],
    }
    # Drop automatic links for the merged pair and any stale proof canonical
    # row so the function is safe to re-run over prior output.
    link_rows = [
        row
        for row in link_rows
        if row["observed_product_id"] not in merged_observed_ids
    ]
    canonical_rows = [
        row
        for row in canonical_rows
        if row["canonical_product_id"] != proof_canonical_id
    ]
    canonical_rows.append(
        {
            "canonical_product_id": proof_canonical_id,
            "canonical_name": "BANANA",
            "product_type": "banana",
            "brand": "",
            "variant": "",
            "size_value": "",
            "size_unit": "",
            "pack_qty": "",
            "measure_type": "weight",
            "normalized_quantity": "",
            "normalized_quantity_unit": "",
            "notes": "manual proof merge for cross-retailer validation",
            "created_at": "",
            "updated_at": "",
        }
    )
    link_rows.extend(
        {
            "observed_product_id": observed["observed_product_id"],
            "canonical_product_id": proof_canonical_id,
            "link_method": "manual_proof_merge",
            "link_confidence": "medium",
            "review_status": "",
            "reviewed_by": "",
            "reviewed_at": "",
            "link_notes": "cross-retailer validation proof",
        }
        for observed in (giant_row, costco_row)
    )
    proof_rows = [
        {
            "proof_name": "banana",
            "canonical_product_id": proof_canonical_id,
            "giant_observed_product_id": giant_row["observed_product_id"],
            "costco_observed_product_id": costco_row["observed_product_id"],
            "giant_example_item": giant_row["example_item_name"],
            "costco_example_item": costco_row["example_item_name"],
            "notes": "BANANA proof pair built from Giant and Costco enriched rows",
        }
    ]
    return canonical_rows, link_rows, proof_rows
@click.command()
@click.option(
    "--giant-items-enriched-csv",
    default="giant_output/items_enriched.csv",
    show_default=True,
)
@click.option(
    "--costco-items-enriched-csv",
    default="costco_output/items_enriched.csv",
    show_default=True,
)
@click.option(
    "--outdir",
    default="combined_output",
    show_default=True,
)
def main(giant_items_enriched_csv, costco_items_enriched_csv, outdir):
    """Build combined observed/canonical/link layers plus the BANANA proof.

    Reads both retailers' enriched item CSVs, aggregates observed products,
    derives the canonical layer, applies the manual BANANA proof merge, and
    writes four CSV outputs under --outdir. Fails with a ClickException when
    the cross-retailer BANANA pair cannot be found.
    """
    out_path = Path(outdir)
    enriched_rows = read_rows(giant_items_enriched_csv) + read_rows(
        costco_items_enriched_csv
    )
    observed_rows = build_observed_products.build_observed_products(enriched_rows)
    canonical_rows, link_rows = build_canonical_layer.build_canonical_layer(
        observed_rows
    )
    giant_row, costco_row = find_proof_pair(observed_rows)
    if not giant_row or not costco_row:
        raise click.ClickException(
            "could not find BANANA proof pair across Giant and Costco observed products"
        )
    canonical_rows, link_rows, proof_rows = merge_proof_pair(
        canonical_rows, link_rows, giant_row, costco_row
    )
    # (filename, rows, field order) for each output table.
    outputs = (
        ("products_observed.csv", observed_rows, build_observed_products.OUTPUT_FIELDS),
        ("products_canonical.csv", canonical_rows, build_canonical_layer.CANONICAL_FIELDS),
        ("product_links.csv", link_rows, build_canonical_layer.LINK_FIELDS),
        ("proof_examples.csv", proof_rows, PROOF_FIELDS),
    )
    for filename, rows, fields in outputs:
        write_csv_rows(out_path / filename, rows, fields)
    click.echo(
        f"wrote combined outputs to {out_path} using {len(observed_rows)} observed rows"
    )
if __name__ == "__main__":
main()