Finalize post-refactor layout and remove old pipeline files
This commit is contained in:
45
README.md
45
README.md
@@ -6,21 +6,15 @@ Run each script step-by-step from the terminal.
|
||||
|
||||
## What It Does
|
||||
|
||||
1. `scrape_giant.py`: download Giant orders and items
|
||||
2. `enrich_giant.py`: normalize Giant line items
|
||||
3. `scrape_costco.py`: download Costco orders and items
|
||||
4. `enrich_costco.py`: normalize Costco line items
|
||||
1. `collect_giant_web.py`: download Giant orders and items
|
||||
2. `normalize_giant_web.py`: normalize Giant line items
|
||||
3. `collect_costco_web.py`: download Costco orders and items
|
||||
4. `normalize_costco_web.py`: normalize Costco line items
|
||||
5. `build_purchases.py`: combine retailer outputs into one purchase table
|
||||
6. `review_products.py`: review unresolved product matches in the terminal
|
||||
7. `report_pipeline_status.py`: show how many rows survive each stage
|
||||
8. `analyze_purchases.py`: write chart-ready analysis CSVs from the purchase table
|
||||
|
||||
Active refactor entrypoints:
|
||||
- `collect_giant_web.py`
|
||||
- `collect_costco_web.py`
|
||||
- `normalize_giant_web.py`
|
||||
- `normalize_costco_web.py`
|
||||
|
||||
## Requirements
|
||||
|
||||
- Python 3.10+
|
||||
@@ -65,13 +59,20 @@ data/
|
||||
collected_items.csv
|
||||
normalized_items.csv
|
||||
review/
|
||||
catalog.csv
|
||||
review_queue.csv
|
||||
review_resolutions.csv
|
||||
product_links.csv
|
||||
purchases.csv
|
||||
pipeline_status.csv
|
||||
pipeline_status.json
|
||||
catalog.csv
|
||||
analysis/
|
||||
purchases.csv
|
||||
comparison_examples.csv
|
||||
item_price_over_time.csv
|
||||
spend_by_visit.csv
|
||||
items_per_visit.csv
|
||||
category_spend_over_time.csv
|
||||
retailer_store_breakdown.csv
|
||||
```
|
||||
|
||||
## Run Order
|
||||
@@ -122,21 +123,21 @@ Costco:
|
||||
- `data/costco-web/normalized_items.csv` preserves raw totals and matched net discount fields
|
||||
|
||||
Combined:
|
||||
- `data/review/purchases.csv`
|
||||
- `data/review/analysis/item_price_over_time.csv`
|
||||
- `data/review/analysis/spend_by_visit.csv`
|
||||
- `data/review/analysis/items_per_visit.csv`
|
||||
- `data/review/analysis/category_spend_over_time.csv`
|
||||
- `data/review/analysis/retailer_store_breakdown.csv`
|
||||
- `data/analysis/purchases.csv`
|
||||
- `data/analysis/comparison_examples.csv`
|
||||
- `data/analysis/item_price_over_time.csv`
|
||||
- `data/analysis/spend_by_visit.csv`
|
||||
- `data/analysis/items_per_visit.csv`
|
||||
- `data/analysis/category_spend_over_time.csv`
|
||||
- `data/analysis/retailer_store_breakdown.csv`
|
||||
- `data/review/review_queue.csv`
|
||||
- `data/review/review_resolutions.csv`
|
||||
- `data/review/product_links.csv`
|
||||
- `data/review/comparison_examples.csv`
|
||||
- `data/review/pipeline_status.csv`
|
||||
- `data/review/pipeline_status.json`
|
||||
- `data/catalog.csv`
|
||||
- `data/review/catalog.csv`
|
||||
|
||||
`data/review/purchases.csv` is the main analysis artifact. It is designed to support both:
|
||||
`data/analysis/purchases.csv` is the main analysis artifact. It is designed to support both:
|
||||
- item-level price analysis
|
||||
- visit-level analysis such as spend by visit, items per visit, category spend by visit, and retailer/store breakdown
|
||||
|
||||
@@ -164,9 +165,7 @@ The review step is intentionally conservative:
|
||||
|
||||
## Notes
|
||||
- This project is designed around fragile retailer scraping flows, so the code favors explicit retailer-specific steps over heavy abstraction.
|
||||
- `scrape_giant.py`, `scrape_costco.py`, `enrich_giant.py`, and `enrich_costco.py` are now legacy-compatible entrypoints; prefer the `collect_*` and `normalize_*` scripts for active work.
|
||||
- Costco discount rows are preserved for auditability and also matched back to purchased items during enrichment.
|
||||
- `validate_cross_retailer_flow.py` is a proof/check script, not a required production step.
|
||||
|
||||
## Test
|
||||
|
||||
|
||||
@@ -241,8 +241,8 @@ def build_retailer_store_rows(purchase_rows):
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option("--purchases-csv", default="data/review/purchases.csv", show_default=True)
|
||||
@click.option("--output-dir", default="data/review/analysis", show_default=True)
|
||||
@click.option("--purchases-csv", default="data/analysis/purchases.csv", show_default=True)
|
||||
@click.option("--output-dir", default="data/analysis", show_default=True)
|
||||
def main(purchases_csv, output_dir):
|
||||
purchase_rows = read_csv_rows(purchases_csv)
|
||||
output_path = Path(output_dir)
|
||||
|
||||
@@ -1,220 +0,0 @@
|
||||
import click
|
||||
import re
|
||||
|
||||
from layer_helpers import read_csv_rows, representative_value, stable_id, write_csv_rows
|
||||
|
||||
|
||||
CANONICAL_FIELDS = [
|
||||
"canonical_product_id",
|
||||
"canonical_name",
|
||||
"product_type",
|
||||
"brand",
|
||||
"variant",
|
||||
"size_value",
|
||||
"size_unit",
|
||||
"pack_qty",
|
||||
"measure_type",
|
||||
"normalized_quantity",
|
||||
"normalized_quantity_unit",
|
||||
"notes",
|
||||
"created_at",
|
||||
"updated_at",
|
||||
]
|
||||
|
||||
CANONICAL_DROP_TOKENS = {"CT", "COUNT", "COUNTS", "DOZ", "DOZEN", "DOZ.", "PACK"}
|
||||
|
||||
LINK_FIELDS = [
|
||||
"observed_product_id",
|
||||
"canonical_product_id",
|
||||
"link_method",
|
||||
"link_confidence",
|
||||
"review_status",
|
||||
"reviewed_by",
|
||||
"reviewed_at",
|
||||
"link_notes",
|
||||
]
|
||||
|
||||
|
||||
def to_float(value):
|
||||
try:
|
||||
return float(value)
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
|
||||
|
||||
def normalized_quantity(row):
|
||||
size_value = to_float(row.get("representative_size_value"))
|
||||
pack_qty = to_float(row.get("representative_pack_qty")) or 1.0
|
||||
size_unit = row.get("representative_size_unit", "")
|
||||
measure_type = row.get("representative_measure_type", "")
|
||||
|
||||
if size_value is not None and size_unit:
|
||||
return format(size_value * pack_qty, "g"), size_unit
|
||||
|
||||
if row.get("representative_pack_qty") and measure_type == "count":
|
||||
return row["representative_pack_qty"], "count"
|
||||
|
||||
if measure_type == "each":
|
||||
return "1", "each"
|
||||
|
||||
return "", ""
|
||||
|
||||
|
||||
def auto_link_rule(observed_row):
|
||||
if (
|
||||
observed_row.get("is_fee") == "true"
|
||||
or observed_row.get("is_discount_line") == "true"
|
||||
or observed_row.get("is_coupon_line") == "true"
|
||||
):
|
||||
return "", "", ""
|
||||
|
||||
if observed_row.get("representative_upc"):
|
||||
return (
|
||||
"exact_upc",
|
||||
f"upc={observed_row['representative_upc']}",
|
||||
"high",
|
||||
)
|
||||
|
||||
if (
|
||||
observed_row.get("representative_name_norm")
|
||||
and observed_row.get("representative_size_value")
|
||||
and observed_row.get("representative_size_unit")
|
||||
):
|
||||
return (
|
||||
"exact_name_size",
|
||||
"|".join(
|
||||
[
|
||||
f"name={observed_row['representative_name_norm']}",
|
||||
f"size={observed_row['representative_size_value']}",
|
||||
f"unit={observed_row['representative_size_unit']}",
|
||||
f"pack={observed_row['representative_pack_qty']}",
|
||||
f"measure={observed_row['representative_measure_type']}",
|
||||
]
|
||||
),
|
||||
"high",
|
||||
)
|
||||
|
||||
return "", "", ""
|
||||
|
||||
|
||||
def clean_canonical_name(name):
|
||||
tokens = []
|
||||
for token in re.sub(r"[^A-Z0-9\s]", " ", (name or "").upper()).split():
|
||||
if token.isdigit():
|
||||
continue
|
||||
if token in CANONICAL_DROP_TOKENS:
|
||||
continue
|
||||
if re.fullmatch(r"\d+(?:PK|PACK)", token):
|
||||
continue
|
||||
if re.fullmatch(r"\d+DZ", token):
|
||||
continue
|
||||
tokens.append(token)
|
||||
return " ".join(tokens).strip()
|
||||
|
||||
|
||||
def canonical_row_for_group(canonical_product_id, group_rows, link_method):
|
||||
quantity_value, quantity_unit = normalized_quantity(
|
||||
{
|
||||
"representative_size_value": representative_value(
|
||||
group_rows, "representative_size_value"
|
||||
),
|
||||
"representative_size_unit": representative_value(
|
||||
group_rows, "representative_size_unit"
|
||||
),
|
||||
"representative_pack_qty": representative_value(
|
||||
group_rows, "representative_pack_qty"
|
||||
),
|
||||
"representative_measure_type": representative_value(
|
||||
group_rows, "representative_measure_type"
|
||||
),
|
||||
}
|
||||
)
|
||||
return {
|
||||
"canonical_product_id": canonical_product_id,
|
||||
"canonical_name": clean_canonical_name(
|
||||
representative_value(group_rows, "representative_name_norm")
|
||||
)
|
||||
or representative_value(group_rows, "representative_name_norm"),
|
||||
"product_type": "",
|
||||
"brand": representative_value(group_rows, "representative_brand"),
|
||||
"variant": representative_value(group_rows, "representative_variant"),
|
||||
"size_value": representative_value(group_rows, "representative_size_value"),
|
||||
"size_unit": representative_value(group_rows, "representative_size_unit"),
|
||||
"pack_qty": representative_value(group_rows, "representative_pack_qty"),
|
||||
"measure_type": representative_value(group_rows, "representative_measure_type"),
|
||||
"normalized_quantity": quantity_value,
|
||||
"normalized_quantity_unit": quantity_unit,
|
||||
"notes": f"auto-linked via {link_method}",
|
||||
"created_at": "",
|
||||
"updated_at": "",
|
||||
}
|
||||
|
||||
|
||||
def build_canonical_layer(observed_rows):
|
||||
canonical_rows = []
|
||||
link_rows = []
|
||||
groups = {}
|
||||
|
||||
for observed_row in sorted(observed_rows, key=lambda row: row["observed_product_id"]):
|
||||
link_method, group_key, confidence = auto_link_rule(observed_row)
|
||||
if not group_key:
|
||||
continue
|
||||
|
||||
canonical_product_id = stable_id("gcan", f"{link_method}|{group_key}")
|
||||
groups.setdefault(canonical_product_id, {"method": link_method, "rows": []})
|
||||
groups[canonical_product_id]["rows"].append(observed_row)
|
||||
link_rows.append(
|
||||
{
|
||||
"observed_product_id": observed_row["observed_product_id"],
|
||||
"canonical_product_id": canonical_product_id,
|
||||
"link_method": link_method,
|
||||
"link_confidence": confidence,
|
||||
"review_status": "",
|
||||
"reviewed_by": "",
|
||||
"reviewed_at": "",
|
||||
"link_notes": "",
|
||||
}
|
||||
)
|
||||
|
||||
for canonical_product_id, group in sorted(groups.items()):
|
||||
canonical_rows.append(
|
||||
canonical_row_for_group(
|
||||
canonical_product_id, group["rows"], group["method"]
|
||||
)
|
||||
)
|
||||
|
||||
return canonical_rows, link_rows
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option(
|
||||
"--observed-csv",
|
||||
default="giant_output/products_observed.csv",
|
||||
show_default=True,
|
||||
help="Path to observed product rows.",
|
||||
)
|
||||
@click.option(
|
||||
"--canonical-csv",
|
||||
default="giant_output/products_canonical.csv",
|
||||
show_default=True,
|
||||
help="Path to canonical product output.",
|
||||
)
|
||||
@click.option(
|
||||
"--links-csv",
|
||||
default="giant_output/product_links.csv",
|
||||
show_default=True,
|
||||
help="Path to observed-to-canonical link output.",
|
||||
)
|
||||
def main(observed_csv, canonical_csv, links_csv):
|
||||
observed_rows = read_csv_rows(observed_csv)
|
||||
canonical_rows, link_rows = build_canonical_layer(observed_rows)
|
||||
write_csv_rows(canonical_csv, canonical_rows, CANONICAL_FIELDS)
|
||||
write_csv_rows(links_csv, link_rows, LINK_FIELDS)
|
||||
click.echo(
|
||||
f"wrote {len(canonical_rows)} canonical rows to {canonical_csv} and "
|
||||
f"{len(link_rows)} links to {links_csv}"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,172 +0,0 @@
|
||||
from collections import defaultdict
|
||||
|
||||
import click
|
||||
|
||||
from layer_helpers import (
|
||||
compact_join,
|
||||
distinct_values,
|
||||
first_nonblank,
|
||||
read_csv_rows,
|
||||
representative_value,
|
||||
stable_id,
|
||||
write_csv_rows,
|
||||
)
|
||||
|
||||
|
||||
OUTPUT_FIELDS = [
|
||||
"observed_product_id",
|
||||
"retailer",
|
||||
"observed_key",
|
||||
"representative_retailer_item_id",
|
||||
"representative_upc",
|
||||
"representative_item_name",
|
||||
"representative_name_norm",
|
||||
"representative_brand",
|
||||
"representative_variant",
|
||||
"representative_size_value",
|
||||
"representative_size_unit",
|
||||
"representative_pack_qty",
|
||||
"representative_measure_type",
|
||||
"representative_image_url",
|
||||
"is_store_brand",
|
||||
"is_fee",
|
||||
"is_discount_line",
|
||||
"is_coupon_line",
|
||||
"first_seen_date",
|
||||
"last_seen_date",
|
||||
"times_seen",
|
||||
"example_order_id",
|
||||
"example_item_name",
|
||||
"raw_name_examples",
|
||||
"normalized_name_examples",
|
||||
"example_prices",
|
||||
"distinct_item_names_count",
|
||||
"distinct_retailer_item_ids_count",
|
||||
"distinct_upcs_count",
|
||||
]
|
||||
|
||||
|
||||
def build_observed_key(row):
|
||||
if row.get("upc"):
|
||||
return "|".join(
|
||||
[
|
||||
row["retailer"],
|
||||
f"upc={row['upc']}",
|
||||
f"name={row['item_name_norm']}",
|
||||
]
|
||||
)
|
||||
|
||||
if row.get("retailer_item_id"):
|
||||
return "|".join(
|
||||
[
|
||||
row["retailer"],
|
||||
f"retailer_item_id={row['retailer_item_id']}",
|
||||
f"name={row['item_name_norm']}",
|
||||
f"discount={row.get('is_discount_line', 'false')}",
|
||||
f"coupon={row.get('is_coupon_line', 'false')}",
|
||||
]
|
||||
)
|
||||
|
||||
return "|".join(
|
||||
[
|
||||
row["retailer"],
|
||||
f"name={row['item_name_norm']}",
|
||||
f"size={row['size_value']}",
|
||||
f"unit={row['size_unit']}",
|
||||
f"pack={row['pack_qty']}",
|
||||
f"measure={row['measure_type']}",
|
||||
f"store_brand={row['is_store_brand']}",
|
||||
f"fee={row['is_fee']}",
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def build_observed_products(rows):
|
||||
grouped = defaultdict(list)
|
||||
for row in rows:
|
||||
grouped[build_observed_key(row)].append(row)
|
||||
|
||||
observed_rows = []
|
||||
for observed_key, group_rows in sorted(grouped.items()):
|
||||
ordered = sorted(
|
||||
group_rows,
|
||||
key=lambda row: (row["order_date"], row["order_id"], int(row["line_no"])),
|
||||
)
|
||||
observed_rows.append(
|
||||
{
|
||||
"observed_product_id": stable_id("gobs", observed_key),
|
||||
"retailer": ordered[0]["retailer"],
|
||||
"observed_key": observed_key,
|
||||
"representative_retailer_item_id": representative_value(
|
||||
ordered, "retailer_item_id"
|
||||
),
|
||||
"representative_upc": representative_value(ordered, "upc"),
|
||||
"representative_item_name": representative_value(ordered, "item_name"),
|
||||
"representative_name_norm": representative_value(
|
||||
ordered, "item_name_norm"
|
||||
),
|
||||
"representative_brand": representative_value(ordered, "brand_guess"),
|
||||
"representative_variant": representative_value(ordered, "variant"),
|
||||
"representative_size_value": representative_value(ordered, "size_value"),
|
||||
"representative_size_unit": representative_value(ordered, "size_unit"),
|
||||
"representative_pack_qty": representative_value(ordered, "pack_qty"),
|
||||
"representative_measure_type": representative_value(
|
||||
ordered, "measure_type"
|
||||
),
|
||||
"representative_image_url": first_nonblank(ordered, "image_url"),
|
||||
"is_store_brand": representative_value(ordered, "is_store_brand"),
|
||||
"is_fee": representative_value(ordered, "is_fee"),
|
||||
"is_discount_line": representative_value(
|
||||
ordered, "is_discount_line"
|
||||
),
|
||||
"is_coupon_line": representative_value(ordered, "is_coupon_line"),
|
||||
"first_seen_date": ordered[0]["order_date"],
|
||||
"last_seen_date": ordered[-1]["order_date"],
|
||||
"times_seen": str(len(ordered)),
|
||||
"example_order_id": ordered[0]["order_id"],
|
||||
"example_item_name": ordered[0]["item_name"],
|
||||
"raw_name_examples": compact_join(
|
||||
distinct_values(ordered, "item_name"), limit=4
|
||||
),
|
||||
"normalized_name_examples": compact_join(
|
||||
distinct_values(ordered, "item_name_norm"), limit=4
|
||||
),
|
||||
"example_prices": compact_join(
|
||||
distinct_values(ordered, "line_total"), limit=4
|
||||
),
|
||||
"distinct_item_names_count": str(
|
||||
len(distinct_values(ordered, "item_name"))
|
||||
),
|
||||
"distinct_retailer_item_ids_count": str(
|
||||
len(distinct_values(ordered, "retailer_item_id"))
|
||||
),
|
||||
"distinct_upcs_count": str(len(distinct_values(ordered, "upc"))),
|
||||
}
|
||||
)
|
||||
|
||||
observed_rows.sort(key=lambda row: row["observed_product_id"])
|
||||
return observed_rows
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option(
|
||||
"--items-enriched-csv",
|
||||
default="giant_output/items_enriched.csv",
|
||||
show_default=True,
|
||||
help="Path to enriched Giant item rows.",
|
||||
)
|
||||
@click.option(
|
||||
"--output-csv",
|
||||
default="giant_output/products_observed.csv",
|
||||
show_default=True,
|
||||
help="Path to observed product output.",
|
||||
)
|
||||
def main(items_enriched_csv, output_csv):
|
||||
rows = read_csv_rows(items_enriched_csv)
|
||||
observed_rows = build_observed_products(rows)
|
||||
write_csv_rows(output_csv, observed_rows, OUTPUT_FIELDS)
|
||||
click.echo(f"wrote {len(observed_rows)} rows to {output_csv}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -440,10 +440,10 @@ def build_comparison_examples(purchase_rows):
|
||||
@click.option("--giant-orders-csv", default="data/giant-web/collected_orders.csv", show_default=True)
|
||||
@click.option("--costco-orders-csv", default="data/costco-web/collected_orders.csv", show_default=True)
|
||||
@click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True)
|
||||
@click.option("--catalog-csv", default="data/catalog.csv", show_default=True)
|
||||
@click.option("--catalog-csv", default="data/review/catalog.csv", show_default=True)
|
||||
@click.option("--links-csv", default="data/review/product_links.csv", show_default=True)
|
||||
@click.option("--output-csv", default="data/review/purchases.csv", show_default=True)
|
||||
@click.option("--examples-csv", default="data/review/comparison_examples.csv", show_default=True)
|
||||
@click.option("--output-csv", default="data/analysis/purchases.csv", show_default=True)
|
||||
@click.option("--examples-csv", default="data/analysis/comparison_examples.csv", show_default=True)
|
||||
def main(
|
||||
giant_items_enriched_csv,
|
||||
costco_items_enriched_csv,
|
||||
|
||||
@@ -1,175 +0,0 @@
|
||||
from collections import defaultdict
|
||||
from datetime import date
|
||||
|
||||
import click
|
||||
|
||||
from layer_helpers import compact_join, distinct_values, read_csv_rows, stable_id, write_csv_rows
|
||||
|
||||
|
||||
OUTPUT_FIELDS = [
|
||||
"review_id",
|
||||
"queue_type",
|
||||
"retailer",
|
||||
"observed_product_id",
|
||||
"canonical_product_id",
|
||||
"reason_code",
|
||||
"priority",
|
||||
"raw_item_names",
|
||||
"normalized_names",
|
||||
"upc",
|
||||
"image_url",
|
||||
"example_prices",
|
||||
"seen_count",
|
||||
"status",
|
||||
"resolution_notes",
|
||||
"created_at",
|
||||
"updated_at",
|
||||
]
|
||||
|
||||
|
||||
def existing_review_state(path):
|
||||
try:
|
||||
rows = read_csv_rows(path)
|
||||
except FileNotFoundError:
|
||||
return {}
|
||||
return {row["review_id"]: row for row in rows}
|
||||
|
||||
|
||||
def review_reasons(observed_row):
|
||||
reasons = []
|
||||
if (
|
||||
observed_row["is_fee"] == "true"
|
||||
or observed_row.get("is_discount_line") == "true"
|
||||
or observed_row.get("is_coupon_line") == "true"
|
||||
):
|
||||
return reasons
|
||||
if observed_row["distinct_upcs_count"] not in {"", "0", "1"}:
|
||||
reasons.append(("multiple_upcs", "high"))
|
||||
if observed_row["distinct_item_names_count"] not in {"", "0", "1"}:
|
||||
reasons.append(("multiple_raw_names", "medium"))
|
||||
if not observed_row["representative_image_url"]:
|
||||
reasons.append(("missing_image", "medium"))
|
||||
if not observed_row["representative_upc"]:
|
||||
reasons.append(("missing_upc", "high"))
|
||||
if not observed_row["representative_name_norm"]:
|
||||
reasons.append(("missing_normalized_name", "high"))
|
||||
return reasons
|
||||
|
||||
|
||||
def build_review_queue(observed_rows, item_rows, existing_rows, today_text):
|
||||
by_observed = defaultdict(list)
|
||||
for row in item_rows:
|
||||
observed_id = row.get("observed_product_id", "")
|
||||
if observed_id:
|
||||
by_observed[observed_id].append(row)
|
||||
|
||||
queue_rows = []
|
||||
for observed_row in observed_rows:
|
||||
reasons = review_reasons(observed_row)
|
||||
if not reasons:
|
||||
continue
|
||||
|
||||
related_items = by_observed.get(observed_row["observed_product_id"], [])
|
||||
raw_names = compact_join(distinct_values(related_items, "item_name"), limit=5)
|
||||
norm_names = compact_join(
|
||||
distinct_values(related_items, "item_name_norm"), limit=5
|
||||
)
|
||||
example_prices = compact_join(
|
||||
distinct_values(related_items, "line_total"), limit=5
|
||||
)
|
||||
|
||||
for reason_code, priority in reasons:
|
||||
review_id = stable_id(
|
||||
"rvw",
|
||||
f"{observed_row['observed_product_id']}|{reason_code}",
|
||||
)
|
||||
prior = existing_rows.get(review_id, {})
|
||||
queue_rows.append(
|
||||
{
|
||||
"review_id": review_id,
|
||||
"queue_type": "observed_product",
|
||||
"retailer": observed_row["retailer"],
|
||||
"observed_product_id": observed_row["observed_product_id"],
|
||||
"canonical_product_id": prior.get("canonical_product_id", ""),
|
||||
"reason_code": reason_code,
|
||||
"priority": priority,
|
||||
"raw_item_names": raw_names,
|
||||
"normalized_names": norm_names,
|
||||
"upc": observed_row["representative_upc"],
|
||||
"image_url": observed_row["representative_image_url"],
|
||||
"example_prices": example_prices,
|
||||
"seen_count": observed_row["times_seen"],
|
||||
"status": prior.get("status", "pending"),
|
||||
"resolution_notes": prior.get("resolution_notes", ""),
|
||||
"created_at": prior.get("created_at", today_text),
|
||||
"updated_at": today_text,
|
||||
}
|
||||
)
|
||||
|
||||
queue_rows.sort(key=lambda row: (row["priority"], row["reason_code"], row["review_id"]))
|
||||
return queue_rows
|
||||
|
||||
|
||||
def attach_observed_ids(item_rows, observed_rows):
|
||||
observed_by_key = {row["observed_key"]: row["observed_product_id"] for row in observed_rows}
|
||||
attached = []
|
||||
for row in item_rows:
|
||||
observed_key = "|".join(
|
||||
[
|
||||
row["retailer"],
|
||||
f"upc={row['upc']}",
|
||||
f"name={row['item_name_norm']}",
|
||||
]
|
||||
) if row.get("upc") else "|".join(
|
||||
[
|
||||
row["retailer"],
|
||||
f"retailer_item_id={row.get('retailer_item_id', '')}",
|
||||
f"name={row['item_name_norm']}",
|
||||
f"size={row['size_value']}",
|
||||
f"unit={row['size_unit']}",
|
||||
f"pack={row['pack_qty']}",
|
||||
f"measure={row['measure_type']}",
|
||||
f"store_brand={row['is_store_brand']}",
|
||||
f"fee={row['is_fee']}",
|
||||
f"discount={row.get('is_discount_line', 'false')}",
|
||||
f"coupon={row.get('is_coupon_line', 'false')}",
|
||||
]
|
||||
)
|
||||
enriched = dict(row)
|
||||
enriched["observed_product_id"] = observed_by_key.get(observed_key, "")
|
||||
attached.append(enriched)
|
||||
return attached
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option(
|
||||
"--observed-csv",
|
||||
default="giant_output/products_observed.csv",
|
||||
show_default=True,
|
||||
help="Path to observed product rows.",
|
||||
)
|
||||
@click.option(
|
||||
"--items-enriched-csv",
|
||||
default="giant_output/items_enriched.csv",
|
||||
show_default=True,
|
||||
help="Path to enriched Giant item rows.",
|
||||
)
|
||||
@click.option(
|
||||
"--output-csv",
|
||||
default="giant_output/review_queue.csv",
|
||||
show_default=True,
|
||||
help="Path to review queue output.",
|
||||
)
|
||||
def main(observed_csv, items_enriched_csv, output_csv):
|
||||
observed_rows = read_csv_rows(observed_csv)
|
||||
item_rows = read_csv_rows(items_enriched_csv)
|
||||
item_rows = attach_observed_ids(item_rows, observed_rows)
|
||||
existing_rows = existing_review_state(output_csv)
|
||||
today_text = str(date.today())
|
||||
queue_rows = build_review_queue(observed_rows, item_rows, existing_rows, today_text)
|
||||
write_csv_rows(output_csv, queue_rows, OUTPUT_FIELDS)
|
||||
click.echo(f"wrote {len(queue_rows)} rows to {output_csv}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -110,8 +110,15 @@ data/
|
||||
review/
|
||||
review_queue.csv # Human review queue for unresolved matching/parsing cases.
|
||||
product_links.csv # Links from normalized retailer items to catalog items.
|
||||
catalog.csv # Cross-retailer product catalog entities used for comparison.
|
||||
purchases.csv
|
||||
catalog.csv # Cross-retailer product catalog entities used for comparison.
|
||||
analysis/
|
||||
purchases.csv
|
||||
comparison_examples.csv
|
||||
item_price_over_time.csv
|
||||
spend_by_visit.csv
|
||||
items_per_visit.csv
|
||||
category_spend_over_time.csv
|
||||
retailer_store_breakdown.csv
|
||||
#+end_example
|
||||
|
||||
Notes:
|
||||
@@ -223,7 +230,7 @@ Notes:
|
||||
- Valid `normalization_basis` values should be explicit, e.g. `exact_upc`, `exact_retailer_item_id`, `exact_name_size_pack`, or `approved_retailer_alias`.
|
||||
- Do not use fuzzy or semantic matching to assign `normalized_item_id`.
|
||||
- Discount/coupon rows may remain as standalone normalized rows for auditability even when their amounts are attached to a purchased row via `matched_discount_amount`.
|
||||
- Cross-retailer identity is handled later in review/combine via `catalog.csv` and `product_links.csv`.
|
||||
- Cross-retailer identity is handled later in review/combine via `data/review/catalog.csv` and `product_links.csv`.
|
||||
|
||||
** `data/review/product_links.csv`
|
||||
One row per review-approved link from a normalized retailer item to a catalog item.
|
||||
@@ -263,7 +270,7 @@ One row per issue needing human review.
|
||||
| `resolution_notes` | reviewer notes |
|
||||
| `created_at` | creation timestamp or date |
|
||||
| `updated_at` | last update timestamp or date |
|
||||
** `data/catalog.csv`
|
||||
** `data/review/catalog.csv`
|
||||
One row per cross-retailer catalog product.
|
||||
| key | definition |
|
||||
|----------------------------+----------------------------------------|
|
||||
@@ -288,7 +295,7 @@ Notes:
|
||||
- Do not encode packaging/count into `catalog_name` unless it is essential to product identity.
|
||||
- `catalog_name` should come from review-approved naming, not raw retailer strings.
|
||||
|
||||
** `data/purchases.csv`
|
||||
** `data/analysis/purchases.csv`
|
||||
One row per purchased item (i.e., `is_item`==true from normalized layer), with
|
||||
catalog attributes denormalized in and discounts already applied.
|
||||
|
||||
|
||||
@@ -85,10 +85,10 @@ def build_status_summary(
|
||||
@click.option("--costco-orders-csv", default="data/costco-web/collected_orders.csv", show_default=True)
|
||||
@click.option("--costco-items-csv", default="data/costco-web/collected_items.csv", show_default=True)
|
||||
@click.option("--costco-enriched-csv", default="data/costco-web/normalized_items.csv", show_default=True)
|
||||
@click.option("--purchases-csv", default="data/review/purchases.csv", show_default=True)
|
||||
@click.option("--purchases-csv", default="data/analysis/purchases.csv", show_default=True)
|
||||
@click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True)
|
||||
@click.option("--links-csv", default="data/review/product_links.csv", show_default=True)
|
||||
@click.option("--catalog-csv", default="data/catalog.csv", show_default=True)
|
||||
@click.option("--catalog-csv", default="data/review/catalog.csv", show_default=True)
|
||||
@click.option("--summary-csv", default="data/review/pipeline_status.csv", show_default=True)
|
||||
@click.option("--summary-json", default="data/review/pipeline_status.json", show_default=True)
|
||||
def main(
|
||||
|
||||
@@ -562,10 +562,10 @@ def link_rows_from_state(link_lookup):
|
||||
@click.option("--costco-items-enriched-csv", default="data/costco-web/normalized_items.csv", show_default=True)
|
||||
@click.option("--giant-orders-csv", default="data/giant-web/collected_orders.csv", show_default=True)
|
||||
@click.option("--costco-orders-csv", default="data/costco-web/collected_orders.csv", show_default=True)
|
||||
@click.option("--purchases-csv", default="data/review/purchases.csv", show_default=True)
|
||||
@click.option("--purchases-csv", default="data/analysis/purchases.csv", show_default=True)
|
||||
@click.option("--queue-csv", default="data/review/review_queue.csv", show_default=True)
|
||||
@click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True)
|
||||
@click.option("--catalog-csv", default="data/catalog.csv", show_default=True)
|
||||
@click.option("--catalog-csv", default="data/review/catalog.csv", show_default=True)
|
||||
@click.option("--links-csv", default="data/review/product_links.csv", show_default=True)
|
||||
@click.option("--limit", default=0, show_default=True, type=int)
|
||||
@click.option("--refresh-only", is_flag=True, help="Only rebuild review_queue.csv without prompting.")
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
from scrape_giant import * # noqa: F401,F403
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,119 +0,0 @@
|
||||
import unittest
|
||||
|
||||
import build_canonical_layer
|
||||
|
||||
|
||||
class CanonicalLayerTests(unittest.TestCase):
|
||||
def test_build_canonical_layer_auto_links_exact_upc_and_name_size_only(self):
|
||||
observed_rows = [
|
||||
{
|
||||
"observed_product_id": "gobs_1",
|
||||
"representative_upc": "111",
|
||||
"representative_retailer_item_id": "11",
|
||||
"representative_name_norm": "GALA APPLE",
|
||||
"representative_brand": "SB",
|
||||
"representative_variant": "",
|
||||
"representative_size_value": "5",
|
||||
"representative_size_unit": "lb",
|
||||
"representative_pack_qty": "",
|
||||
"representative_measure_type": "weight",
|
||||
"is_fee": "false",
|
||||
"is_discount_line": "false",
|
||||
"is_coupon_line": "false",
|
||||
},
|
||||
{
|
||||
"observed_product_id": "gobs_2",
|
||||
"representative_upc": "111",
|
||||
"representative_retailer_item_id": "12",
|
||||
"representative_name_norm": "LARGE WHITE EGGS",
|
||||
"representative_brand": "SB",
|
||||
"representative_variant": "",
|
||||
"representative_size_value": "",
|
||||
"representative_size_unit": "",
|
||||
"representative_pack_qty": "18",
|
||||
"representative_measure_type": "count",
|
||||
"is_fee": "false",
|
||||
"is_discount_line": "false",
|
||||
"is_coupon_line": "false",
|
||||
},
|
||||
{
|
||||
"observed_product_id": "gobs_3",
|
||||
"representative_upc": "",
|
||||
"representative_retailer_item_id": "21",
|
||||
"representative_name_norm": "ROTINI",
|
||||
"representative_brand": "",
|
||||
"representative_variant": "",
|
||||
"representative_size_value": "16",
|
||||
"representative_size_unit": "oz",
|
||||
"representative_pack_qty": "",
|
||||
"representative_measure_type": "weight",
|
||||
"is_fee": "false",
|
||||
"is_discount_line": "false",
|
||||
"is_coupon_line": "false",
|
||||
},
|
||||
{
|
||||
"observed_product_id": "gobs_4",
|
||||
"representative_upc": "",
|
||||
"representative_retailer_item_id": "22",
|
||||
"representative_name_norm": "ROTINI",
|
||||
"representative_brand": "SB",
|
||||
"representative_variant": "",
|
||||
"representative_size_value": "16",
|
||||
"representative_size_unit": "oz",
|
||||
"representative_pack_qty": "",
|
||||
"representative_measure_type": "weight",
|
||||
"is_fee": "false",
|
||||
"is_discount_line": "false",
|
||||
"is_coupon_line": "false",
|
||||
},
|
||||
{
|
||||
"observed_product_id": "gobs_5",
|
||||
"representative_upc": "",
|
||||
"representative_retailer_item_id": "99",
|
||||
"representative_name_norm": "GL BAG CHARGE",
|
||||
"representative_brand": "",
|
||||
"representative_variant": "",
|
||||
"representative_size_value": "",
|
||||
"representative_size_unit": "",
|
||||
"representative_pack_qty": "",
|
||||
"representative_measure_type": "each",
|
||||
"is_fee": "true",
|
||||
"is_discount_line": "false",
|
||||
"is_coupon_line": "false",
|
||||
},
|
||||
{
|
||||
"observed_product_id": "gobs_6",
|
||||
"representative_upc": "",
|
||||
"representative_retailer_item_id": "",
|
||||
"representative_name_norm": "LIME",
|
||||
"representative_brand": "",
|
||||
"representative_variant": "",
|
||||
"representative_size_value": "",
|
||||
"representative_size_unit": "",
|
||||
"representative_pack_qty": "",
|
||||
"representative_measure_type": "each",
|
||||
"is_fee": "false",
|
||||
"is_discount_line": "false",
|
||||
"is_coupon_line": "false",
|
||||
},
|
||||
]
|
||||
|
||||
canonicals, links = build_canonical_layer.build_canonical_layer(observed_rows)
|
||||
|
||||
self.assertEqual(2, len(canonicals))
|
||||
self.assertEqual(4, len(links))
|
||||
methods = {row["observed_product_id"]: row["link_method"] for row in links}
|
||||
self.assertEqual("exact_upc", methods["gobs_1"])
|
||||
self.assertEqual("exact_upc", methods["gobs_2"])
|
||||
self.assertEqual("exact_name_size", methods["gobs_3"])
|
||||
self.assertEqual("exact_name_size", methods["gobs_4"])
|
||||
self.assertNotIn("gobs_5", methods)
|
||||
self.assertNotIn("gobs_6", methods)
|
||||
|
||||
def test_clean_canonical_name_removes_packaging_noise(self):
|
||||
self.assertEqual("LIME", build_canonical_layer.clean_canonical_name("LIME . / ."))
|
||||
self.assertEqual("EGG", build_canonical_layer.clean_canonical_name("5DZ EGG / /"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -7,7 +7,6 @@ from unittest import mock
|
||||
|
||||
import enrich_costco
|
||||
import scrape_costco
|
||||
import validate_cross_retailer_flow
|
||||
|
||||
|
||||
class CostcoPipelineTests(unittest.TestCase):
|
||||
@@ -423,76 +422,6 @@ class CostcoPipelineTests(unittest.TestCase):
|
||||
self.assertIn("matched_discount=4873222", purchase_row["parse_notes"])
|
||||
self.assertIn("matched_to_item=4873222", discount_row["parse_notes"])
|
||||
|
||||
def test_cross_retailer_validation_writes_proof_example(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
giant_csv = Path(tmpdir) / "giant_items_enriched.csv"
|
||||
costco_csv = Path(tmpdir) / "costco_items_enriched.csv"
|
||||
outdir = Path(tmpdir) / "combined"
|
||||
|
||||
fieldnames = enrich_costco.OUTPUT_FIELDS
|
||||
giant_row = {field: "" for field in fieldnames}
|
||||
giant_row.update(
|
||||
{
|
||||
"retailer": "giant",
|
||||
"order_id": "g1",
|
||||
"line_no": "1",
|
||||
"order_date": "2026-03-01",
|
||||
"retailer_item_id": "100",
|
||||
"item_name": "FRESH BANANA",
|
||||
"item_name_norm": "BANANA",
|
||||
"upc": "4011",
|
||||
"measure_type": "weight",
|
||||
"is_store_brand": "false",
|
||||
"is_fee": "false",
|
||||
"is_discount_line": "false",
|
||||
"is_coupon_line": "false",
|
||||
"line_total": "1.29",
|
||||
}
|
||||
)
|
||||
costco_row = {field: "" for field in fieldnames}
|
||||
costco_row.update(
|
||||
{
|
||||
"retailer": "costco",
|
||||
"order_id": "c1",
|
||||
"line_no": "1",
|
||||
"order_date": "2026-03-12",
|
||||
"retailer_item_id": "30669",
|
||||
"item_name": "BANANAS 3 LB / 1.36 KG",
|
||||
"item_name_norm": "BANANA",
|
||||
"upc": "",
|
||||
"size_value": "3",
|
||||
"size_unit": "lb",
|
||||
"measure_type": "weight",
|
||||
"is_store_brand": "false",
|
||||
"is_fee": "false",
|
||||
"is_discount_line": "false",
|
||||
"is_coupon_line": "false",
|
||||
"line_total": "2.98",
|
||||
}
|
||||
)
|
||||
|
||||
with giant_csv.open("w", newline="", encoding="utf-8") as handle:
|
||||
writer = csv.DictWriter(handle, fieldnames=fieldnames)
|
||||
writer.writeheader()
|
||||
writer.writerow(giant_row)
|
||||
with costco_csv.open("w", newline="", encoding="utf-8") as handle:
|
||||
writer = csv.DictWriter(handle, fieldnames=fieldnames)
|
||||
writer.writeheader()
|
||||
writer.writerow(costco_row)
|
||||
|
||||
validate_cross_retailer_flow.main.callback(
|
||||
giant_items_enriched_csv=str(giant_csv),
|
||||
costco_items_enriched_csv=str(costco_csv),
|
||||
outdir=str(outdir),
|
||||
)
|
||||
|
||||
proof_path = outdir / "proof_examples.csv"
|
||||
self.assertTrue(proof_path.exists())
|
||||
with proof_path.open(newline="", encoding="utf-8") as handle:
|
||||
rows = list(csv.DictReader(handle))
|
||||
self.assertEqual(1, len(rows))
|
||||
self.assertEqual("banana", rows[0]["proof_name"])
|
||||
|
||||
def test_main_writes_summary_request_metadata(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
outdir = Path(tmpdir) / "costco_output"
|
||||
|
||||
@@ -1,67 +0,0 @@
|
||||
import unittest
|
||||
|
||||
import build_observed_products
|
||||
|
||||
|
||||
class ObservedProductTests(unittest.TestCase):
|
||||
def test_build_observed_products_aggregates_rows_with_same_key(self):
|
||||
rows = [
|
||||
{
|
||||
"retailer": "giant",
|
||||
"order_id": "1",
|
||||
"line_no": "1",
|
||||
"order_date": "2026-01-01",
|
||||
"item_name": "SB GALA APPLE 5LB",
|
||||
"item_name_norm": "GALA APPLE",
|
||||
"retailer_item_id": "11",
|
||||
"upc": "111",
|
||||
"brand_guess": "SB",
|
||||
"variant": "",
|
||||
"size_value": "5",
|
||||
"size_unit": "lb",
|
||||
"pack_qty": "",
|
||||
"measure_type": "weight",
|
||||
"image_url": "https://example.test/a.jpg",
|
||||
"is_store_brand": "true",
|
||||
"is_fee": "false",
|
||||
"is_discount_line": "false",
|
||||
"is_coupon_line": "false",
|
||||
"line_total": "7.99",
|
||||
},
|
||||
{
|
||||
"retailer": "giant",
|
||||
"order_id": "2",
|
||||
"line_no": "1",
|
||||
"order_date": "2026-01-10",
|
||||
"item_name": "SB GALA APPLE 5 LB",
|
||||
"item_name_norm": "GALA APPLE",
|
||||
"retailer_item_id": "11",
|
||||
"upc": "111",
|
||||
"brand_guess": "SB",
|
||||
"variant": "",
|
||||
"size_value": "5",
|
||||
"size_unit": "lb",
|
||||
"pack_qty": "",
|
||||
"measure_type": "weight",
|
||||
"image_url": "",
|
||||
"is_store_brand": "true",
|
||||
"is_fee": "false",
|
||||
"is_discount_line": "false",
|
||||
"is_coupon_line": "false",
|
||||
"line_total": "8.49",
|
||||
},
|
||||
]
|
||||
|
||||
observed = build_observed_products.build_observed_products(rows)
|
||||
|
||||
self.assertEqual(1, len(observed))
|
||||
self.assertEqual("2", observed[0]["times_seen"])
|
||||
self.assertEqual("2026-01-01", observed[0]["first_seen_date"])
|
||||
self.assertEqual("2026-01-10", observed[0]["last_seen_date"])
|
||||
self.assertEqual("11", observed[0]["representative_retailer_item_id"])
|
||||
self.assertEqual("111", observed[0]["representative_upc"])
|
||||
self.assertIn("SB GALA APPLE 5LB", observed[0]["raw_name_examples"])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -65,6 +65,21 @@ class PipelineStatusTests(unittest.TestCase):
|
||||
},
|
||||
],
|
||||
resolutions=[],
|
||||
links=[
|
||||
{
|
||||
"normalized_item_id": "gnorm_banana",
|
||||
"catalog_id": "cat_banana",
|
||||
"review_status": "approved",
|
||||
}
|
||||
],
|
||||
catalog=[
|
||||
{
|
||||
"catalog_id": "cat_banana",
|
||||
"catalog_name": "BANANA",
|
||||
"product_type": "banana",
|
||||
"category": "produce",
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
counts = {row["stage"]: row["count"] for row in summary}
|
||||
|
||||
@@ -1,133 +0,0 @@
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
import build_observed_products
|
||||
import build_review_queue
|
||||
from layer_helpers import write_csv_rows
|
||||
|
||||
|
||||
class ReviewQueueTests(unittest.TestCase):
|
||||
def test_build_review_queue_preserves_existing_status(self):
|
||||
observed_rows = [
|
||||
{
|
||||
"observed_product_id": "gobs_1",
|
||||
"retailer": "giant",
|
||||
"representative_upc": "111",
|
||||
"representative_image_url": "",
|
||||
"representative_name_norm": "GALA APPLE",
|
||||
"times_seen": "2",
|
||||
"distinct_item_names_count": "2",
|
||||
"distinct_upcs_count": "1",
|
||||
"is_fee": "false",
|
||||
"is_discount_line": "false",
|
||||
"is_coupon_line": "false",
|
||||
}
|
||||
]
|
||||
item_rows = [
|
||||
{
|
||||
"observed_product_id": "gobs_1",
|
||||
"item_name": "SB GALA APPLE 5LB",
|
||||
"item_name_norm": "GALA APPLE",
|
||||
"line_total": "7.99",
|
||||
},
|
||||
{
|
||||
"observed_product_id": "gobs_1",
|
||||
"item_name": "SB GALA APPLE 5 LB",
|
||||
"item_name_norm": "GALA APPLE",
|
||||
"line_total": "8.49",
|
||||
},
|
||||
]
|
||||
existing = {
|
||||
build_review_queue.stable_id("rvw", "gobs_1|missing_image"): {
|
||||
"status": "approved",
|
||||
"resolution_notes": "looked fine",
|
||||
"created_at": "2026-03-15",
|
||||
}
|
||||
}
|
||||
|
||||
queue = build_review_queue.build_review_queue(
|
||||
observed_rows, item_rows, existing, "2026-03-16"
|
||||
)
|
||||
|
||||
self.assertEqual(2, len(queue))
|
||||
missing_image = [row for row in queue if row["reason_code"] == "missing_image"][0]
|
||||
self.assertEqual("approved", missing_image["status"])
|
||||
self.assertEqual("looked fine", missing_image["resolution_notes"])
|
||||
|
||||
def test_review_queue_main_writes_output(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
observed_path = Path(tmpdir) / "products_observed.csv"
|
||||
items_path = Path(tmpdir) / "items_enriched.csv"
|
||||
output_path = Path(tmpdir) / "review_queue.csv"
|
||||
|
||||
observed_rows = [
|
||||
{
|
||||
"observed_product_id": "gobs_1",
|
||||
"retailer": "giant",
|
||||
"observed_key": "giant|upc=111|name=GALA APPLE",
|
||||
"representative_retailer_item_id": "11",
|
||||
"representative_upc": "111",
|
||||
"representative_item_name": "SB GALA APPLE 5LB",
|
||||
"representative_name_norm": "GALA APPLE",
|
||||
"representative_brand": "SB",
|
||||
"representative_variant": "",
|
||||
"representative_size_value": "5",
|
||||
"representative_size_unit": "lb",
|
||||
"representative_pack_qty": "",
|
||||
"representative_measure_type": "weight",
|
||||
"representative_image_url": "",
|
||||
"is_store_brand": "true",
|
||||
"is_fee": "false",
|
||||
"is_discount_line": "false",
|
||||
"is_coupon_line": "false",
|
||||
"first_seen_date": "2026-01-01",
|
||||
"last_seen_date": "2026-01-10",
|
||||
"times_seen": "2",
|
||||
"example_order_id": "1",
|
||||
"example_item_name": "SB GALA APPLE 5LB",
|
||||
"raw_name_examples": "SB GALA APPLE 5LB | SB GALA APPLE 5 LB",
|
||||
"normalized_name_examples": "GALA APPLE",
|
||||
"example_prices": "7.99 | 8.49",
|
||||
"distinct_item_names_count": "2",
|
||||
"distinct_retailer_item_ids_count": "1",
|
||||
"distinct_upcs_count": "1",
|
||||
}
|
||||
]
|
||||
item_rows = [
|
||||
{
|
||||
"retailer": "giant",
|
||||
"order_id": "1",
|
||||
"line_no": "1",
|
||||
"item_name": "SB GALA APPLE 5LB",
|
||||
"item_name_norm": "GALA APPLE",
|
||||
"retailer_item_id": "11",
|
||||
"upc": "111",
|
||||
"size_value": "5",
|
||||
"size_unit": "lb",
|
||||
"pack_qty": "",
|
||||
"measure_type": "weight",
|
||||
"is_store_brand": "true",
|
||||
"is_fee": "false",
|
||||
"is_discount_line": "false",
|
||||
"is_coupon_line": "false",
|
||||
"line_total": "7.99",
|
||||
}
|
||||
]
|
||||
|
||||
write_csv_rows(
|
||||
observed_path, observed_rows, build_observed_products.OUTPUT_FIELDS
|
||||
)
|
||||
write_csv_rows(items_path, item_rows, list(item_rows[0].keys()))
|
||||
|
||||
build_review_queue.main.callback(
|
||||
observed_csv=str(observed_path),
|
||||
items_enriched_csv=str(items_path),
|
||||
output_csv=str(output_path),
|
||||
)
|
||||
|
||||
self.assertTrue(output_path.exists())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -3,7 +3,7 @@ import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
import scraper
|
||||
import scrape_giant as scraper
|
||||
|
||||
|
||||
class ScraperTests(unittest.TestCase):
|
||||
|
||||
@@ -1,154 +0,0 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
|
||||
import build_canonical_layer
|
||||
import build_observed_products
|
||||
from layer_helpers import stable_id, write_csv_rows
|
||||
|
||||
|
||||
PROOF_FIELDS = [
|
||||
"proof_name",
|
||||
"canonical_product_id",
|
||||
"giant_observed_product_id",
|
||||
"costco_observed_product_id",
|
||||
"giant_example_item",
|
||||
"costco_example_item",
|
||||
"notes",
|
||||
]
|
||||
|
||||
|
||||
def read_rows(path):
|
||||
import csv
|
||||
|
||||
with Path(path).open(newline="", encoding="utf-8") as handle:
|
||||
return list(csv.DictReader(handle))
|
||||
|
||||
|
||||
def find_proof_pair(observed_rows):
|
||||
giant = None
|
||||
costco = None
|
||||
for row in observed_rows:
|
||||
if row["retailer"] == "giant" and row["representative_name_norm"] == "BANANA":
|
||||
giant = row
|
||||
if row["retailer"] == "costco" and row["representative_name_norm"] == "BANANA":
|
||||
costco = row
|
||||
return giant, costco
|
||||
|
||||
|
||||
def merge_proof_pair(canonical_rows, link_rows, giant_row, costco_row):
|
||||
if not giant_row or not costco_row:
|
||||
return canonical_rows, link_rows, []
|
||||
|
||||
proof_canonical_id = stable_id("gcan", "proof|banana")
|
||||
link_rows = [
|
||||
row
|
||||
for row in link_rows
|
||||
if row["observed_product_id"]
|
||||
not in {giant_row["observed_product_id"], costco_row["observed_product_id"]}
|
||||
]
|
||||
canonical_rows = [
|
||||
row
|
||||
for row in canonical_rows
|
||||
if row["canonical_product_id"] != proof_canonical_id
|
||||
]
|
||||
canonical_rows.append(
|
||||
{
|
||||
"canonical_product_id": proof_canonical_id,
|
||||
"canonical_name": "BANANA",
|
||||
"product_type": "banana",
|
||||
"brand": "",
|
||||
"variant": "",
|
||||
"size_value": "",
|
||||
"size_unit": "",
|
||||
"pack_qty": "",
|
||||
"measure_type": "weight",
|
||||
"normalized_quantity": "",
|
||||
"normalized_quantity_unit": "",
|
||||
"notes": "manual proof merge for cross-retailer validation",
|
||||
"created_at": "",
|
||||
"updated_at": "",
|
||||
}
|
||||
)
|
||||
for observed_row in [giant_row, costco_row]:
|
||||
link_rows.append(
|
||||
{
|
||||
"observed_product_id": observed_row["observed_product_id"],
|
||||
"canonical_product_id": proof_canonical_id,
|
||||
"link_method": "manual_proof_merge",
|
||||
"link_confidence": "medium",
|
||||
"review_status": "",
|
||||
"reviewed_by": "",
|
||||
"reviewed_at": "",
|
||||
"link_notes": "cross-retailer validation proof",
|
||||
}
|
||||
)
|
||||
|
||||
proof_rows = [
|
||||
{
|
||||
"proof_name": "banana",
|
||||
"canonical_product_id": proof_canonical_id,
|
||||
"giant_observed_product_id": giant_row["observed_product_id"],
|
||||
"costco_observed_product_id": costco_row["observed_product_id"],
|
||||
"giant_example_item": giant_row["example_item_name"],
|
||||
"costco_example_item": costco_row["example_item_name"],
|
||||
"notes": "BANANA proof pair built from Giant and Costco enriched rows",
|
||||
}
|
||||
]
|
||||
return canonical_rows, link_rows, proof_rows
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option(
|
||||
"--giant-items-enriched-csv",
|
||||
default="giant_output/items_enriched.csv",
|
||||
show_default=True,
|
||||
)
|
||||
@click.option(
|
||||
"--costco-items-enriched-csv",
|
||||
default="costco_output/items_enriched.csv",
|
||||
show_default=True,
|
||||
)
|
||||
@click.option(
|
||||
"--outdir",
|
||||
default="combined_output",
|
||||
show_default=True,
|
||||
)
|
||||
def main(giant_items_enriched_csv, costco_items_enriched_csv, outdir):
|
||||
outdir = Path(outdir)
|
||||
rows = read_rows(giant_items_enriched_csv) + read_rows(costco_items_enriched_csv)
|
||||
observed_rows = build_observed_products.build_observed_products(rows)
|
||||
canonical_rows, link_rows = build_canonical_layer.build_canonical_layer(observed_rows)
|
||||
giant_row, costco_row = find_proof_pair(observed_rows)
|
||||
if not giant_row or not costco_row:
|
||||
raise click.ClickException(
|
||||
"could not find BANANA proof pair across Giant and Costco observed products"
|
||||
)
|
||||
canonical_rows, link_rows, proof_rows = merge_proof_pair(
|
||||
canonical_rows, link_rows, giant_row, costco_row
|
||||
)
|
||||
|
||||
write_csv_rows(
|
||||
outdir / "products_observed.csv",
|
||||
observed_rows,
|
||||
build_observed_products.OUTPUT_FIELDS,
|
||||
)
|
||||
write_csv_rows(
|
||||
outdir / "products_canonical.csv",
|
||||
canonical_rows,
|
||||
build_canonical_layer.CANONICAL_FIELDS,
|
||||
)
|
||||
write_csv_rows(
|
||||
outdir / "product_links.csv",
|
||||
link_rows,
|
||||
build_canonical_layer.LINK_FIELDS,
|
||||
)
|
||||
write_csv_rows(outdir / "proof_examples.csv", proof_rows, PROOF_FIELDS)
|
||||
click.echo(
|
||||
f"wrote combined outputs to {outdir} using {len(observed_rows)} observed rows"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user