diff --git a/README.md b/README.md new file mode 100644 index 0000000..f593c0d --- /dev/null +++ b/README.md @@ -0,0 +1,103 @@ +# scrape-giant + +Small grocery-history pipeline for Giant receipts. + +The project currently does four things: + +1. scrape Giant in-store order history from an active Firefox session +2. enrich raw line items into a deterministic `items_enriched.csv` +3. aggregate retailer-facing observed products and build a manual review queue +4. create a first-pass canonical product layer plus conservative auto-links + +The work so far is Giant-specific on the ingest side and intentionally simple on +the shared product-model side. + +## Current flow + +Run the commands from the repo root with the project venv active, or call them +directly through `./venv/bin/python`. + +```bash +./venv/bin/python scraper.py +./venv/bin/python enrich_giant.py +./venv/bin/python build_observed_products.py +./venv/bin/python build_review_queue.py +./venv/bin/python build_canonical_layer.py +``` + +## Inputs + +- Firefox cookies for `giantfood.com` +- `GIANT_USER_ID` and `GIANT_LOYALTY_NUMBER` in `.env`, shell env, or prompts +- Giant raw order payloads in `giant_output/raw/` + +## Outputs + +Current generated files live under `giant_output/`: + +- `orders.csv`: flattened visit/order rows from the Giant history API +- `items.csv`: flattened raw line items from fetched order detail payloads +- `items_enriched.csv`: deterministic parsed/enriched line items +- `products_observed.csv`: retailer-facing observed product groups +- `review_queue.csv`: products needing manual review +- `products_canonical.csv`: shared canonical product rows +- `product_links.csv`: observed-to-canonical links +Raw json remains the source of truth: + +- `giant_output/raw/history.json` +- `giant_output/raw/<order_id>.json` + +## Scripts + +- `scraper.py`: fetches Giant history/detail payloads and updates `orders.csv` and `items.csv` +- `enrich_giant.py`: reads raw Giant order json and writes 
`items_enriched.csv` +- `build_observed_products.py`: groups enriched rows into `products_observed.csv` +- `build_review_queue.py`: generates `review_queue.csv` and preserves review status on reruns +- `build_canonical_layer.py`: builds `products_canonical.csv` and `product_links.csv` + +## Notes on the current model + +- Observed products are retailer-specific: Giant, Costco. +- Canonical products are the first cross-retailer layer. +- Auto-linking is conservative: + exact UPC first, then exact normalized name plus exact size/unit context, then + exact normalized name when there is no size context to conflict. +- Fee rows are excluded from auto-linking. +- Unknown values are left blank instead of guessed. + +## Verification + +Run the test suite with: + +```bash +./venv/bin/python -m unittest discover -s tests +``` + +Useful one-off rebuilds: + +```bash +./venv/bin/python enrich_giant.py +./venv/bin/python build_observed_products.py +./venv/bin/python build_review_queue.py +./venv/bin/python build_canonical_layer.py +``` + +## Project docs + +- `pm/tasks.org`: task log and evidence +- `pm/data-model.org`: file layout and schema decisions + +## Status + +Completed through `t1.7`: + +- Giant receipt fetch CLI +- data model and file layout +- Giant parser/enricher +- observed products +- review queue +- canonical layer scaffold +- conservative auto-link rules + +Next planned task is `t1.8`: add a Costco raw ingest path. 
diff --git a/agents.md b/agents.md index 6f3ac2d..d3fb730 100644 --- a/agents.md +++ b/agents.md @@ -7,6 +7,7 @@ ## tech stack - python; pandas or polars - file storage: json and csv, no sqlite or databases +- assume local virtual env is available and accessible - do not add new dependencies unless explicitly approved; if unavoidable, document justification in the active task notes ## workflow diff --git a/build_canonical_layer.py b/build_canonical_layer.py new file mode 100644 index 0000000..3df2caa --- /dev/null +++ b/build_canonical_layer.py @@ -0,0 +1,216 @@ +import click + +from layer_helpers import read_csv_rows, representative_value, stable_id, write_csv_rows + + +CANONICAL_FIELDS = [ + "canonical_product_id", + "canonical_name", + "product_type", + "brand", + "variant", + "size_value", + "size_unit", + "pack_qty", + "measure_type", + "normalized_quantity", + "normalized_quantity_unit", + "notes", + "created_at", + "updated_at", +] + +LINK_FIELDS = [ + "observed_product_id", + "canonical_product_id", + "link_method", + "link_confidence", + "review_status", + "reviewed_by", + "reviewed_at", + "link_notes", +] + + +def to_float(value): + try: + return float(value) + except (TypeError, ValueError): + return None + + +def normalized_quantity(row): + size_value = to_float(row.get("representative_size_value")) + pack_qty = to_float(row.get("representative_pack_qty")) or 1.0 + size_unit = row.get("representative_size_unit", "") + measure_type = row.get("representative_measure_type", "") + + if size_value is not None and size_unit: + return format(size_value * pack_qty, "g"), size_unit + + if row.get("representative_pack_qty") and measure_type == "count": + return row["representative_pack_qty"], "count" + + if measure_type == "each": + return "1", "each" + + return "", "" + + +def auto_link_rule(observed_row): + if ( + observed_row.get("is_fee") == "true" + or observed_row.get("is_discount_line") == "true" + or observed_row.get("is_coupon_line") == "true" + ): + 
return "", "", "" + + if observed_row.get("representative_upc"): + return ( + "exact_upc", + f"upc={observed_row['representative_upc']}", + "high", + ) + + if ( + observed_row.get("representative_name_norm") + and observed_row.get("representative_size_value") + and observed_row.get("representative_size_unit") + ): + return ( + "exact_name_size", + "|".join( + [ + f"name={observed_row['representative_name_norm']}", + f"size={observed_row['representative_size_value']}", + f"unit={observed_row['representative_size_unit']}", + f"pack={observed_row['representative_pack_qty']}", + f"measure={observed_row['representative_measure_type']}", + ] + ), + "high", + ) + + if ( + observed_row.get("representative_name_norm") + and not observed_row.get("representative_size_value") + and not observed_row.get("representative_size_unit") + and not observed_row.get("representative_pack_qty") + ): + return ( + "exact_name", + "|".join( + [ + f"name={observed_row['representative_name_norm']}", + f"measure={observed_row['representative_measure_type']}", + ] + ), + "medium", + ) + + return "", "", "" + + +def canonical_row_for_group(canonical_product_id, group_rows, link_method): + quantity_value, quantity_unit = normalized_quantity( + { + "representative_size_value": representative_value( + group_rows, "representative_size_value" + ), + "representative_size_unit": representative_value( + group_rows, "representative_size_unit" + ), + "representative_pack_qty": representative_value( + group_rows, "representative_pack_qty" + ), + "representative_measure_type": representative_value( + group_rows, "representative_measure_type" + ), + } + ) + return { + "canonical_product_id": canonical_product_id, + "canonical_name": representative_value(group_rows, "representative_name_norm"), + "product_type": "", + "brand": representative_value(group_rows, "representative_brand"), + "variant": representative_value(group_rows, "representative_variant"), + "size_value": representative_value(group_rows, 
"representative_size_value"), + "size_unit": representative_value(group_rows, "representative_size_unit"), + "pack_qty": representative_value(group_rows, "representative_pack_qty"), + "measure_type": representative_value(group_rows, "representative_measure_type"), + "normalized_quantity": quantity_value, + "normalized_quantity_unit": quantity_unit, + "notes": f"auto-linked via {link_method}", + "created_at": "", + "updated_at": "", + } + + +def build_canonical_layer(observed_rows): + canonical_rows = [] + link_rows = [] + groups = {} + + for observed_row in sorted(observed_rows, key=lambda row: row["observed_product_id"]): + link_method, group_key, confidence = auto_link_rule(observed_row) + if not group_key: + continue + + canonical_product_id = stable_id("gcan", f"{link_method}|{group_key}") + groups.setdefault(canonical_product_id, {"method": link_method, "rows": []}) + groups[canonical_product_id]["rows"].append(observed_row) + link_rows.append( + { + "observed_product_id": observed_row["observed_product_id"], + "canonical_product_id": canonical_product_id, + "link_method": link_method, + "link_confidence": confidence, + "review_status": "", + "reviewed_by": "", + "reviewed_at": "", + "link_notes": "", + } + ) + + for canonical_product_id, group in sorted(groups.items()): + canonical_rows.append( + canonical_row_for_group( + canonical_product_id, group["rows"], group["method"] + ) + ) + + return canonical_rows, link_rows + + +@click.command() +@click.option( + "--observed-csv", + default="giant_output/products_observed.csv", + show_default=True, + help="Path to observed product rows.", +) +@click.option( + "--canonical-csv", + default="giant_output/products_canonical.csv", + show_default=True, + help="Path to canonical product output.", +) +@click.option( + "--links-csv", + default="giant_output/product_links.csv", + show_default=True, + help="Path to observed-to-canonical link output.", +) +def main(observed_csv, canonical_csv, links_csv): + observed_rows = 
read_csv_rows(observed_csv) + canonical_rows, link_rows = build_canonical_layer(observed_rows) + write_csv_rows(canonical_csv, canonical_rows, CANONICAL_FIELDS) + write_csv_rows(links_csv, link_rows, LINK_FIELDS) + click.echo( + f"wrote {len(canonical_rows)} canonical rows to {canonical_csv} and " + f"{len(link_rows)} links to {links_csv}" + ) + + +if __name__ == "__main__": + main() diff --git a/build_observed_products.py b/build_observed_products.py new file mode 100644 index 0000000..807a6a3 --- /dev/null +++ b/build_observed_products.py @@ -0,0 +1,172 @@ +from collections import defaultdict + +import click + +from layer_helpers import ( + compact_join, + distinct_values, + first_nonblank, + read_csv_rows, + representative_value, + stable_id, + write_csv_rows, +) + + +OUTPUT_FIELDS = [ + "observed_product_id", + "retailer", + "observed_key", + "representative_retailer_item_id", + "representative_upc", + "representative_item_name", + "representative_name_norm", + "representative_brand", + "representative_variant", + "representative_size_value", + "representative_size_unit", + "representative_pack_qty", + "representative_measure_type", + "representative_image_url", + "is_store_brand", + "is_fee", + "is_discount_line", + "is_coupon_line", + "first_seen_date", + "last_seen_date", + "times_seen", + "example_order_id", + "example_item_name", + "raw_name_examples", + "normalized_name_examples", + "example_prices", + "distinct_item_names_count", + "distinct_retailer_item_ids_count", + "distinct_upcs_count", +] + + +def build_observed_key(row): + if row.get("upc"): + return "|".join( + [ + row["retailer"], + f"upc={row['upc']}", + f"name={row['item_name_norm']}", + ] + ) + + if row.get("retailer_item_id"): + return "|".join( + [ + row["retailer"], + f"retailer_item_id={row['retailer_item_id']}", + f"name={row['item_name_norm']}", + f"discount={row.get('is_discount_line', 'false')}", + f"coupon={row.get('is_coupon_line', 'false')}", + ] + ) + + return "|".join( + [ + 
row["retailer"], + f"name={row['item_name_norm']}", + f"size={row['size_value']}", + f"unit={row['size_unit']}", + f"pack={row['pack_qty']}", + f"measure={row['measure_type']}", + f"store_brand={row['is_store_brand']}", + f"fee={row['is_fee']}", + ] + ) + + +def build_observed_products(rows): + grouped = defaultdict(list) + for row in rows: + grouped[build_observed_key(row)].append(row) + + observed_rows = [] + for observed_key, group_rows in sorted(grouped.items()): + ordered = sorted( + group_rows, + key=lambda row: (row["order_date"], row["order_id"], int(row["line_no"])), + ) + observed_rows.append( + { + "observed_product_id": stable_id("gobs", observed_key), + "retailer": ordered[0]["retailer"], + "observed_key": observed_key, + "representative_retailer_item_id": representative_value( + ordered, "retailer_item_id" + ), + "representative_upc": representative_value(ordered, "upc"), + "representative_item_name": representative_value(ordered, "item_name"), + "representative_name_norm": representative_value( + ordered, "item_name_norm" + ), + "representative_brand": representative_value(ordered, "brand_guess"), + "representative_variant": representative_value(ordered, "variant"), + "representative_size_value": representative_value(ordered, "size_value"), + "representative_size_unit": representative_value(ordered, "size_unit"), + "representative_pack_qty": representative_value(ordered, "pack_qty"), + "representative_measure_type": representative_value( + ordered, "measure_type" + ), + "representative_image_url": first_nonblank(ordered, "image_url"), + "is_store_brand": representative_value(ordered, "is_store_brand"), + "is_fee": representative_value(ordered, "is_fee"), + "is_discount_line": representative_value( + ordered, "is_discount_line" + ), + "is_coupon_line": representative_value(ordered, "is_coupon_line"), + "first_seen_date": ordered[0]["order_date"], + "last_seen_date": ordered[-1]["order_date"], + "times_seen": str(len(ordered)), + "example_order_id": 
ordered[0]["order_id"], + "example_item_name": ordered[0]["item_name"], + "raw_name_examples": compact_join( + distinct_values(ordered, "item_name"), limit=4 + ), + "normalized_name_examples": compact_join( + distinct_values(ordered, "item_name_norm"), limit=4 + ), + "example_prices": compact_join( + distinct_values(ordered, "line_total"), limit=4 + ), + "distinct_item_names_count": str( + len(distinct_values(ordered, "item_name")) + ), + "distinct_retailer_item_ids_count": str( + len(distinct_values(ordered, "retailer_item_id")) + ), + "distinct_upcs_count": str(len(distinct_values(ordered, "upc"))), + } + ) + + observed_rows.sort(key=lambda row: row["observed_product_id"]) + return observed_rows + + +@click.command() +@click.option( + "--items-enriched-csv", + default="giant_output/items_enriched.csv", + show_default=True, + help="Path to enriched Giant item rows.", +) +@click.option( + "--output-csv", + default="giant_output/products_observed.csv", + show_default=True, + help="Path to observed product output.", +) +def main(items_enriched_csv, output_csv): + rows = read_csv_rows(items_enriched_csv) + observed_rows = build_observed_products(rows) + write_csv_rows(output_csv, observed_rows, OUTPUT_FIELDS) + click.echo(f"wrote {len(observed_rows)} rows to {output_csv}") + + +if __name__ == "__main__": + main() diff --git a/build_review_queue.py b/build_review_queue.py new file mode 100644 index 0000000..b0432a2 --- /dev/null +++ b/build_review_queue.py @@ -0,0 +1,175 @@ +from collections import defaultdict +from datetime import date + +import click + +from layer_helpers import compact_join, distinct_values, read_csv_rows, stable_id, write_csv_rows + + +OUTPUT_FIELDS = [ + "review_id", + "queue_type", + "retailer", + "observed_product_id", + "canonical_product_id", + "reason_code", + "priority", + "raw_item_names", + "normalized_names", + "upc", + "image_url", + "example_prices", + "seen_count", + "status", + "resolution_notes", + "created_at", + "updated_at", +] + 
+ +def existing_review_state(path): + try: + rows = read_csv_rows(path) + except FileNotFoundError: + return {} + return {row["review_id"]: row for row in rows} + + +def review_reasons(observed_row): + reasons = [] + if ( + observed_row["is_fee"] == "true" + or observed_row.get("is_discount_line") == "true" + or observed_row.get("is_coupon_line") == "true" + ): + return reasons + if observed_row["distinct_upcs_count"] not in {"", "0", "1"}: + reasons.append(("multiple_upcs", "high")) + if observed_row["distinct_item_names_count"] not in {"", "0", "1"}: + reasons.append(("multiple_raw_names", "medium")) + if not observed_row["representative_image_url"]: + reasons.append(("missing_image", "medium")) + if not observed_row["representative_upc"]: + reasons.append(("missing_upc", "high")) + if not observed_row["representative_name_norm"]: + reasons.append(("missing_normalized_name", "high")) + return reasons + + +def build_review_queue(observed_rows, item_rows, existing_rows, today_text): + by_observed = defaultdict(list) + for row in item_rows: + observed_id = row.get("observed_product_id", "") + if observed_id: + by_observed[observed_id].append(row) + + queue_rows = [] + for observed_row in observed_rows: + reasons = review_reasons(observed_row) + if not reasons: + continue + + related_items = by_observed.get(observed_row["observed_product_id"], []) + raw_names = compact_join(distinct_values(related_items, "item_name"), limit=5) + norm_names = compact_join( + distinct_values(related_items, "item_name_norm"), limit=5 + ) + example_prices = compact_join( + distinct_values(related_items, "line_total"), limit=5 + ) + + for reason_code, priority in reasons: + review_id = stable_id( + "rvw", + f"{observed_row['observed_product_id']}|{reason_code}", + ) + prior = existing_rows.get(review_id, {}) + queue_rows.append( + { + "review_id": review_id, + "queue_type": "observed_product", + "retailer": observed_row["retailer"], + "observed_product_id": 
observed_row["observed_product_id"], + "canonical_product_id": prior.get("canonical_product_id", ""), + "reason_code": reason_code, + "priority": priority, + "raw_item_names": raw_names, + "normalized_names": norm_names, + "upc": observed_row["representative_upc"], + "image_url": observed_row["representative_image_url"], + "example_prices": example_prices, + "seen_count": observed_row["times_seen"], + "status": prior.get("status", "pending"), + "resolution_notes": prior.get("resolution_notes", ""), + "created_at": prior.get("created_at", today_text), + "updated_at": today_text, + } + ) + + queue_rows.sort(key=lambda row: (row["priority"], row["reason_code"], row["review_id"])) + return queue_rows + + +def attach_observed_ids(item_rows, observed_rows): + observed_by_key = {row["observed_key"]: row["observed_product_id"] for row in observed_rows} + attached = [] + for row in item_rows: + observed_key = "|".join( + [ + row["retailer"], + f"upc={row['upc']}", + f"name={row['item_name_norm']}", + ] + ) if row.get("upc") else "|".join( + [ + row["retailer"], + f"retailer_item_id={row.get('retailer_item_id', '')}", + f"name={row['item_name_norm']}", + f"size={row['size_value']}", + f"unit={row['size_unit']}", + f"pack={row['pack_qty']}", + f"measure={row['measure_type']}", + f"store_brand={row['is_store_brand']}", + f"fee={row['is_fee']}", + f"discount={row.get('is_discount_line', 'false')}", + f"coupon={row.get('is_coupon_line', 'false')}", + ] + ) + enriched = dict(row) + enriched["observed_product_id"] = observed_by_key.get(observed_key, "") + attached.append(enriched) + return attached + + +@click.command() +@click.option( + "--observed-csv", + default="giant_output/products_observed.csv", + show_default=True, + help="Path to observed product rows.", +) +@click.option( + "--items-enriched-csv", + default="giant_output/items_enriched.csv", + show_default=True, + help="Path to enriched Giant item rows.", +) +@click.option( + "--output-csv", + 
default="giant_output/review_queue.csv", + show_default=True, + help="Path to review queue output.", +) +def main(observed_csv, items_enriched_csv, output_csv): + observed_rows = read_csv_rows(observed_csv) + item_rows = read_csv_rows(items_enriched_csv) + item_rows = attach_observed_ids(item_rows, observed_rows) + existing_rows = existing_review_state(output_csv) + today_text = str(date.today()) + queue_rows = build_review_queue(observed_rows, item_rows, existing_rows, today_text) + write_csv_rows(output_csv, queue_rows, OUTPUT_FIELDS) + click.echo(f"wrote {len(queue_rows)} rows to {output_csv}") + + +if __name__ == "__main__": + main() diff --git a/enrich_costco.py b/enrich_costco.py new file mode 100644 index 0000000..8129c64 --- /dev/null +++ b/enrich_costco.py @@ -0,0 +1,271 @@ +import csv +import json +import re +from pathlib import Path + +import click + +from enrich_giant import ( + OUTPUT_FIELDS, + format_decimal, + normalize_number, + normalize_unit, + normalize_whitespace, + singularize_tokens, + to_decimal, +) + + +PARSER_VERSION = "costco-enrich-v1" +RETAILER = "costco" +DEFAULT_INPUT_DIR = Path("costco_output/raw") +DEFAULT_OUTPUT_CSV = Path("costco_output/items_enriched.csv") + +CODE_TOKEN_RE = re.compile( + r"\b(?:SL\d+|T\d+H\d+|P\d+(?:/\d+)?|W\d+T\d+H\d+|FY\d+|CSPC#|C\d+T\d+H\d+|EC\d+T\d+H\d+|\d+X\d+)\b" +) +PACK_FRACTION_RE = re.compile(r"(? 
0 and unit != "EA"): + return "weight" + if size_unit in {"lb", "oz"}: + return "weight" + if size_unit in {"ml", "l", "qt", "pt", "gal", "fl_oz"}: + return "volume" + if pack_qty: + return "count" + if unit == "EA" or (qty is not None and qty > 0): + return "each" + return "" + + +def is_fee_item(cleaned_name): + return any(pattern.search(cleaned_name) for pattern in FEE_PATTERNS) + + +def derive_prices(item, measure_type, size_value="", size_unit="", pack_qty=""): + qty = to_decimal(item.get("shipQy")) + line_total = to_decimal(item.get("groceryAmount")) + picked_weight = to_decimal(item.get("totalPickedWeight")) + parsed_size = to_decimal(size_value) + parsed_pack = to_decimal(pack_qty) or Decimal("1") + + price_per_each = "" + price_per_lb = "" + price_per_oz = "" + + if line_total is None: + return price_per_each, price_per_lb, price_per_oz + + if measure_type == "each" and qty not in (None, Decimal("0")): + price_per_each = format_decimal(line_total / qty) + + if measure_type == "count" and qty not in (None, Decimal("0")): + price_per_each = format_decimal(line_total / qty) + + if measure_type == "weight" and picked_weight not in (None, Decimal("0")): + per_lb = line_total / picked_weight + price_per_lb = format_decimal(per_lb) + price_per_oz = format_decimal(per_lb / Decimal("16")) + return price_per_each, price_per_lb, price_per_oz + + if measure_type == "weight" and parsed_size not in (None, Decimal("0")) and qty not in (None, Decimal("0")): + total_units = qty * parsed_pack * parsed_size + if size_unit == "lb": + per_lb = line_total / total_units + price_per_lb = format_decimal(per_lb) + price_per_oz = format_decimal(per_lb / Decimal("16")) + elif size_unit == "oz": + per_oz = line_total / total_units + price_per_oz = format_decimal(per_oz) + price_per_lb = format_decimal(per_oz * Decimal("16")) + + return price_per_each, price_per_lb, price_per_oz + + +def parse_item(order_id, order_date, raw_path, line_no, item): + cleaned_name = 
clean_item_name(item.get("itemName", "")) + size_value, size_unit, pack_qty = parse_size_and_pack(cleaned_name) + prefix, brand_guess = extract_store_brand_prefix(cleaned_name) + normalized_name = normalize_item_name(cleaned_name) + measure_type = guess_measure_type(item, size_unit, pack_qty) + price_per_each, price_per_lb, price_per_oz = derive_prices( + item, + measure_type, + size_value=size_value, + size_unit=size_unit, + pack_qty=pack_qty, + ) + is_fee = is_fee_item(cleaned_name) + parse_notes = [] + + if prefix: + parse_notes.append(f"store_brand_prefix={prefix}") + if is_fee: + parse_notes.append("fee_item") + if size_value and not size_unit: + parse_notes.append("size_without_unit") + + return { + "retailer": RETAILER, + "order_id": str(order_id), + "line_no": str(line_no), + "observed_item_key": f"{RETAILER}:{order_id}:{line_no}", + "order_date": normalize_whitespace(order_date), + "retailer_item_id": stringify(item.get("podId")), + "pod_id": stringify(item.get("podId")), + "item_name": stringify(item.get("itemName")), + "upc": stringify(item.get("primUpcCd")), + "category_id": stringify(item.get("categoryId")), + "category": stringify(item.get("categoryDesc")), + "qty": stringify(item.get("shipQy")), + "unit": stringify(item.get("lbEachCd")), + "unit_price": stringify(item.get("unitPrice")), + "line_total": stringify(item.get("groceryAmount")), + "picked_weight": stringify(item.get("totalPickedWeight")), + "mvp_savings": stringify(item.get("mvpSavings")), + "reward_savings": stringify(item.get("rewardSavings")), + "coupon_savings": stringify(item.get("couponSavings")), + "coupon_price": stringify(item.get("couponPrice")), + "image_url": extract_image_url(item), + "raw_order_path": raw_path.as_posix(), + "item_name_norm": normalized_name, + "brand_guess": brand_guess, + "variant": "", + "size_value": size_value, + "size_unit": size_unit, + "pack_qty": pack_qty, + "measure_type": measure_type, + "is_store_brand": "true" if bool(prefix) else "false", + 
"is_fee": "true" if is_fee else "false", + "is_discount_line": "false", + "is_coupon_line": "false", + "price_per_each": price_per_each, + "price_per_lb": price_per_lb, + "price_per_oz": price_per_oz, + "parse_version": PARSER_VERSION, + "parse_notes": ";".join(parse_notes), + } + + +def stringify(value): + if value is None: + return "" + return str(value) + + +def iter_order_rows(raw_dir): + for path in sorted(raw_dir.glob("*.json")): + if path.name == "history.json": + continue + + payload = json.loads(path.read_text(encoding="utf-8")) + order_id = payload.get("orderId", path.stem) + order_date = payload.get("orderDate", "") + + for line_no, item in enumerate(payload.get("items", []), start=1): + yield parse_item(order_id, order_date, path, line_no, item) + + +def build_items_enriched(raw_dir): + rows = list(iter_order_rows(raw_dir)) + rows.sort(key=lambda row: (row["order_date"], row["order_id"], int(row["line_no"]))) + return rows + + +def write_csv(path, rows): + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", newline="", encoding="utf-8") as handle: + writer = csv.DictWriter(handle, fieldnames=OUTPUT_FIELDS) + writer.writeheader() + writer.writerows(rows) + + +@click.command() +@click.option( + "--input-dir", + default=str(DEFAULT_INPUT_DIR), + show_default=True, + help="Directory containing Giant raw order json files.", +) +@click.option( + "--output-csv", + default=str(DEFAULT_OUTPUT_CSV), + show_default=True, + help="CSV path for enriched Giant item rows.", +) +def main(input_dir, output_csv): + raw_dir = Path(input_dir) + output_path = Path(output_csv) + + if not raw_dir.exists(): + raise click.ClickException(f"input dir does not exist: {raw_dir}") + + rows = build_items_enriched(raw_dir) + write_csv(output_path, rows) + + click.echo(f"wrote {len(rows)} rows to {output_path}") + + +if __name__ == "__main__": + main() diff --git a/layer_helpers.py b/layer_helpers.py new file mode 100644 index 0000000..fa3df63 --- /dev/null +++ 
b/layer_helpers.py @@ -0,0 +1,54 @@ +import csv +import hashlib +from collections import Counter +from pathlib import Path + + +def read_csv_rows(path): + path = Path(path) + with path.open(newline="", encoding="utf-8") as handle: + return list(csv.DictReader(handle)) + + +def write_csv_rows(path, rows, fieldnames): + path = Path(path) + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", newline="", encoding="utf-8") as handle: + writer = csv.DictWriter(handle, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + + +def stable_id(prefix, raw_key): + digest = hashlib.sha1(str(raw_key).encode("utf-8")).hexdigest()[:12] + return f"{prefix}_{digest}" + + +def first_nonblank(rows, field): + for row in rows: + value = row.get(field, "") + if value: + return value + return "" + + +def representative_value(rows, field): + values = [row.get(field, "") for row in rows if row.get(field, "")] + if not values: + return "" + counts = Counter(values) + return sorted(counts.items(), key=lambda item: (-item[1], item[0]))[0][0] + + +def distinct_values(rows, field): + return sorted({row.get(field, "") for row in rows if row.get(field, "")}) + + +def compact_join(values, limit=3): + unique = [] + seen = set() + for value in values: + if value and value not in seen: + seen.add(value) + unique.append(value) + return " | ".join(unique[:limit]) diff --git a/pm/data-model.org b/pm/data-model.org new file mode 100644 index 0000000..6a25468 --- /dev/null +++ b/pm/data-model.org @@ -0,0 +1,309 @@ +* grocery data model and file layout + +This document defines the shared file layout and stable CSV schemas for the +grocery pipeline. The goal is to keep retailer-specific ingest separate from +cross-retailer product modeling so Giant-specific quirks do not become the +system of record. + +** design rules + +- Raw retailer exports remain the source of truth. +- Retailer parsing is isolated to retailer-specific files and ids. 
+- Cross-retailer product layers begin only after retailer-specific enrichment. +- CSV schemas are stable and additive: new columns may be appended, but + existing columns should not be repurposed. +- Unknown values should be left blank rather than guessed. + +** directory layout + +Use one top-level data root: + +#+begin_example +data/ + giant/ + raw/ + history.json + orders/ + <order_id>.json + orders.csv + items_raw.csv + items_enriched.csv + products_observed.csv + costco/ + raw/ + ... + orders.csv + items_raw.csv + items_enriched.csv + products_observed.csv + shared/ + products_canonical.csv + product_links.csv + review_queue.csv +#+end_example + +** layer responsibilities + +- `data/<retailer>/raw/` + Stores unmodified retailer payloads exactly as fetched. +- `data/<retailer>/orders.csv` + One row per retailer order or visit, flattened from raw order data. +- `data/<retailer>/items_raw.csv` + One row per retailer line item, preserving retailer-native values needed for + reruns and debugging. +- `data/<retailer>/items_enriched.csv` + Parsed retailer line items with normalized fields and derived guesses, still + retailer-specific. +- `data/<retailer>/products_observed.csv` + Distinct retailer-facing observed products aggregated from enriched items. +- `data/shared/products_canonical.csv` + Cross-retailer canonical product entities used for comparison. +- `data/shared/product_links.csv` + Links from retailer observed products to canonical products. +- `data/shared/review_queue.csv` + Human review queue for unresolved or low-confidence matching/parsing cases. 
+ +** retailer-specific versus shared + +Retailer-specific: + +- raw json payloads +- retailer order ids +- retailer line numbers +- retailer category ids and names +- retailer item names +- retailer image urls +- parsed guesses derived from one retailer feed +- observed products scoped to one retailer + +Shared: + +- canonical products +- observed-to-canonical links +- human review state for unresolved cases +- comparison-ready normalized quantity basis fields + +Observed products are the boundary between retailer-specific parsing and +cross-retailer canonicalization. Nothing upstream of `products_observed.csv` +should require knowledge of another retailer. + +** schema: `data/<retailer>/orders.csv` + +One row per order or visit. + +| column | meaning | +|- +| `retailer` | retailer slug such as `giant` | +| `order_id` | retailer order or visit id | +| `order_date` | order date in `YYYY-MM-DD` when available | +| `delivery_date` | fulfillment date in `YYYY-MM-DD` when available | +| `service_type` | retailer service type such as `INSTORE` | +| `order_total` | order total as provided by retailer | +| `payment_method` | retailer payment label | +| `total_item_count` | total line count or item count from retailer | +| `total_savings` | total savings as provided by retailer | +| `your_savings_total` | savings field from retailer when present | +| `coupons_discounts_total` | coupon/discount total from retailer | +| `store_name` | retailer store name | +| `store_number` | retailer store number | +| `store_address1` | street address | +| `store_city` | city | +| `store_state` | state or province | +| `store_zipcode` | postal code | +| `refund_order` | retailer refund flag | +| `ebt_order` | retailer EBT flag | +| `raw_history_path` | relative path to source history payload | +| `raw_order_path` | relative path to source order payload | + +Primary key: + +- (`retailer`, `order_id`) + +** schema: `data/<retailer>/items_raw.csv` + +One row per retailer line item. 
+
+| column | meaning |
+|------------------+-----------------------------------------|
+| `retailer` | retailer slug |
+| `order_id` | retailer order id |
+| `line_no` | stable line number within order export |
+| `order_date` | copied from order when available |
+| `retailer_item_id` | retailer-native item id when available |
+| `pod_id` | retailer pod/item id |
+| `item_name` | raw retailer item name |
+| `upc` | retailer UPC or PLU value |
+| `category_id` | retailer category id |
+| `category` | retailer category description |
+| `qty` | retailer quantity field |
+| `unit` | retailer unit code such as `EA` or `LB` |
+| `unit_price` | retailer unit price field |
+| `line_total` | retailer extended price field |
+| `picked_weight` | retailer picked weight field |
+| `mvp_savings` | retailer savings field |
+| `reward_savings` | retailer rewards savings field |
+| `coupon_savings` | retailer coupon savings field |
+| `coupon_price` | retailer coupon price field |
+| `image_url` | raw retailer image url when present |
+| `raw_order_path` | relative path to source order payload |
+| `is_discount_line` | retailer adjustment or discount-line flag |
+| `is_coupon_line` | coupon-like line flag when distinguishable |
+
+Primary key:
+
+- (`retailer`, `order_id`, `line_no`)
+
+** schema: `data/<retailer>/items_enriched.csv`
+
+One row per retailer line item after deterministic parsing. Preserve the raw
+fields from `items_raw.csv` and add parsed fields.
+
+| column | meaning |
+|---------------------+-------------------------------------------------------------|
+| `retailer` | retailer slug |
+| `order_id` | retailer order id |
+| `line_no` | line number within order |
+| `observed_item_key` | stable row key, typically `<retailer>:<order_id>:<line_no>` |
+| `retailer_item_id` | retailer-native item id |
+| `item_name` | raw retailer item name |
+| `item_name_norm` | normalized item name |
+| `brand_guess` | parsed brand guess |
+| `variant` | parsed variant text |
+| `size_value` | parsed numeric size value |
+| `size_unit` | parsed size unit such as `oz`, `lb`, `fl_oz` |
+| `pack_qty` | parsed pack or count guess |
+| `measure_type` | `each`, `weight`, `volume`, `count`, or blank |
+| `is_store_brand` | store-brand guess |
+| `is_fee` | fee or non-product flag |
+| `is_discount_line` | discount or adjustment-line flag |
+| `is_coupon_line` | coupon-like line flag |
+| `price_per_each` | derived per-each price when supported |
+| `price_per_lb` | derived per-pound price when supported |
+| `price_per_oz` | derived per-ounce price when supported |
+| `image_url` | best available retailer image url |
+| `parse_version` | parser version string for reruns |
+| `parse_notes` | optional non-fatal parser notes |
+
+Primary key:
+
+- (`retailer`, `order_id`, `line_no`)
+
+** schema: `data/<retailer>/products_observed.csv`
+
+One row per distinct retailer-facing observed product.
+ +| column | meaning | +|-------------------------------+----------------------------------------------------------------| +| `observed_product_id` | stable observed product id | +| `retailer` | retailer slug | +| `observed_key` | deterministic grouping key used to create the observed product | +| `representative_retailer_item_id` | best representative retailer-native item id | +| `representative_upc` | best representative UPC/PLU | +| `representative_item_name` | representative raw retailer name | +| `representative_name_norm` | representative normalized name | +| `representative_brand` | representative brand guess | +| `representative_variant` | representative variant | +| `representative_size_value` | representative size value | +| `representative_size_unit` | representative size unit | +| `representative_pack_qty` | representative pack/count | +| `representative_measure_type` | representative measure type | +| `representative_image_url` | representative image url | +| `is_store_brand` | representative store-brand flag | +| `is_fee` | representative fee flag | +| `is_discount_line` | representative discount-line flag | +| `is_coupon_line` | representative coupon-line flag | +| `first_seen_date` | first order date seen | +| `last_seen_date` | last order date seen | +| `times_seen` | number of enriched item rows grouped here | +| `example_order_id` | one example retailer order id | +| `example_item_name` | one example raw item name | +| `distinct_retailer_item_ids_count` | count of distinct retailer-native item ids | + +Primary key: + +- (`observed_product_id`) + +** schema: `data/shared/products_canonical.csv` + +One row per cross-retailer canonical product. 
+ +| column | meaning | +|----------------------------+--------------------------------------------------| +| `canonical_product_id` | stable canonical product id | +| `canonical_name` | canonical human-readable name | +| `product_type` | broad class such as `apple`, `milk`, `trash_bag` | +| `brand` | canonical brand when applicable | +| `variant` | canonical variant | +| `size_value` | normalized size value | +| `size_unit` | normalized size unit | +| `pack_qty` | normalized pack/count | +| `measure_type` | normalized measure type | +| `normalized_quantity` | numeric comparison basis value | +| `normalized_quantity_unit` | basis unit such as `oz`, `lb`, `count` | +| `notes` | optional human notes | +| `created_at` | creation timestamp or date | +| `updated_at` | last update timestamp or date | + +Primary key: + +- (`canonical_product_id`) + +** schema: `data/shared/product_links.csv` + +One row per observed-to-canonical relationship. + +| column | meaning | +|- +| `observed_product_id` | retailer observed product id | +| `canonical_product_id` | linked canonical product id | +| `link_method` | `manual`, `exact_upc`, `exact_name`, etc. | +| `link_confidence` | optional confidence label | +| `review_status` | `pending`, `approved`, `rejected`, or blank | +| `reviewed_by` | reviewer id or initials | +| `reviewed_at` | review timestamp or date | +| `link_notes` | optional notes | + +Primary key: + +- (`observed_product_id`, `canonical_product_id`) + +** schema: `data/shared/review_queue.csv` + +One row per issue needing human review. 
+
+| column | meaning |
+|-
+| `review_id` | stable review row id |
+| `queue_type` | `observed_product`, `link_candidate`, `parse_issue` |
+| `retailer` | retailer slug when applicable |
+| `observed_product_id` | observed product id when applicable |
+| `canonical_product_id` | candidate canonical id when applicable |
+| `reason_code` | machine-readable review reason |
+| `priority` | optional priority label |
+| `raw_item_names` | compact list of example raw names |
+| `normalized_names` | compact list of example normalized names |
+| `upc` | example UPC/PLU |
+| `image_url` | example image url |
+| `example_prices` | compact list of example prices |
+| `seen_count` | count of related rows |
+| `status` | `pending`, `approved`, `rejected`, `deferred` |
+| `resolution_notes` | reviewer notes |
+| `created_at` | creation timestamp or date |
+| `updated_at` | last update timestamp or date |
+
+Primary key:
+
+- (`review_id`)
+
+** current giant mapping
+
+Current scraper outputs map to the new layout as follows:
+
+- `giant_output/raw/history.json` -> `data/giant/raw/history.json`
+- `giant_output/raw/<order_id>.json` -> `data/giant/raw/orders/<order_id>.json`
+- `giant_output/orders.csv` -> `data/giant/orders.csv`
+- `giant_output/items.csv` -> `data/giant/items_raw.csv`
+
+Current Giant raw order payloads already expose fields needed for future
+enrichment, including `image`, `itemName`, `primUpcCd`, `lbEachCd`,
+`unitPrice`, `groceryAmount`, and `totalPickedWeight`.
diff --git a/pm/scrape-giant.org b/pm/scrape-giant.org
index 67eebba..f7c8366 100644
--- a/pm/scrape-giant.org
+++ b/pm/scrape-giant.org
@@ -44,7 +44,8 @@ git remote set-url gitea git@gitea:ben/scrape-giant.git
 on local network: use ssh to 192.168.1.207:2020
 from elsewhere/public: use https to git.hgsky.me/...
unless you later expose ssh properly -* item: +* giant requests +** item: get: /api/v6.0/user/369513017/order/history/detail/69a2e44a16be1142e74ad3cc @@ -83,7 +84,7 @@ x-datadome: protected request-context: appId=cid-v1:75750625-0c81-4f08-9f5d-ce4f73198e54 X-Firefox-Spdy: h2 -* history: +** history: GET https://giantfood.com/api/v6.0/user/369513017/order/history?filter=instore&loyaltyNumber=440155630880 @@ -122,3 +123,75 @@ accept-ch: Sec-CH-UA,Sec-CH-UA-Mobile,Sec-CH-UA-Platform,Sec-CH-UA-Arch,Sec-CH-U x-datadome: protected request-context: appId=cid-v1:75750625-0c81-4f08-9f5d-ce4f73198e54 X-Firefox-Spdy: h2 + +* costco requests +** warehouse +*** POST +https://ecom-api.costco.com/ebusiness/order/v1/orders/graphql + +*** Headers + +POST /ebusiness/order/v1/orders/graphql HTTP/1.1 +Host: ecom-api.costco.com +User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:148.0) Gecko/20100101 Firefox/148.0 +Accept: */* +Accept-Language: en-US,en;q=0.9 +Accept-Encoding: gzip, deflate, br, zstd +costco.service: restOrders +costco.env: ecom +costco-x-authorization: Bearer 
eyJhbGciOiJSUzI1NiIsImtpZCI6IlhrZTFoNXg5TV9ZMk5ER0YxU1hDX2xNNnVSTU5tZTJ3STBLRDlHNzl1QmciLCJ0eXAiOiJKV1QifQ.eyJleHAiOjE3NzM2NjU2NjgsIm5iZiI6MTc3MzY2NDc2OCwidmVyIjoiMS4wIiwiaXNzIjoiaHR0cHM6Ly9zaWduaW4uY29zdGNvLmNvbS9lMDcxNGRkNC03ODRkLTQ2ZDYtYTI3OC0zZTI5NTUzNDgzZWIvdjIuMC8iLCJzdWIiOiIzMTIzZWQ2Yy1jNzM4LTRiOTktOTAwZC0xNDE1ZTUzNjA2Y2UiLCJhdWQiOiJhM2E1MTg2Yi03Yzg5LTRiNGMtOTNhOC1kZDYwNGU5MzA3NTciLCJhY3IiOiJCMkNfMUFfU1NPX1dDU19zaWdudXBfc2lnbmluXzIwMSIsIm5vbmNlIjoiNDA4NjU3YmItODg5MC00MTk0LTg2OTctZDYzOGU2MzdhMGRhIiwiaWF0IjoxNzczNjY0NzY4LCJhdXRoX3RpbWUiOjE3NzM2NjQ3NjgsImF1dGhlbnRpY2F0aW9uU291cmNlIjoibG9jYWxBY2NvdW50QXV0aGVudGljYXRpb24iLCJlbWFpbCI6ImpvaG5tb3Nlc2NhcnRlckBnbWFpbC5jb20iLCJuYW1lIjoiRW1wdHkgRGlzcGxheW5hbWUiLCJ1c2VySWRlbnRpdGllcyI6W3siaXNzdWVyIjoiYTNhNTE4NmItN2M4OS00YjRjLTkzYTgtZGQ2MDRlOTMwNzU3IiwiaXNzdWVyVXNlcklkIjoiQUFEOjMxMjNlZDZjLWM3MzgtNGI5OS05MDBkLTE0MTVlNTM2MDZjZSJ9LHsiaXNzdWVyIjoiNDkwMGViMWYtMGMxMC00YmQ5LTk5YzMtYzU5ZTZjMWVjZWJmIiwiaXNzdWVyVXNlcklkIjoiYTZmZmRkOTktNDM2OC00NTgwLTgxOWYtZTZjZjYxM2U1M2M1In0seyJpc3N1ZXIiOiIyZGQ0YjE0NS0zYmRhLTQ2NjktYWU2YS0zN2I4Y2I2ZGFmN2YiLCJpc3N1ZXJVc2VySWQiOiJhNmZmZGQ5OS00MzY4LTQ1ODAtODE5Zi1lNmNmNjEzZTUzYzUifV0sImlzc3VlclVzZXJJZCI6IkFBRDozMTIzZWQ2Yy1jNzM4LTRiOTktOTAwZC0xNDE1ZTUzNjA2Y2UiLCJjbGllbnRJZCI6ImEzYTUxODZiLTdjODktNGI0Yy05M2E4LWRkNjA0ZTkzMDc1NyIsInJlbWVtYmVyTWUiOiJGYWxzZSIsInNlbmRNZUVtYWlsIjoib2ZmIiwiaXBBZGRyZXNzIjoiOTYuMjQxLjIxMi4xMjUiLCJDb3JyZWxhdGlvbklkIjoiYWUyYTMxYjktMjBkNC00MTBkLWE1ZjAtNDJhMWIzM2VmZmQ1In0.gmhhNsgFUbd0QAR1Z_isFjglQxZrM0Kj8yv5-w-FrsWM3d9PB6kWsldBndy6cEhwZh588T1u4vgG9A-XR3HZ4t-JnPZhpr8_7-lI4W4Tp4IAA0tIgMt7cHZUN14qstx_K72QLOrKbO34PQJKBymw2qKvwvhUo372MNFtc2D8_wS_VbG8QdOPumgsBJPqyF7HExt-gpkAu_5kL-54pqLSIZIJZ_viymti9ajla_B8PlvHMO7ZDWSgoV177ArcQAeOhv9MT1e5k0a4V7R-cCI77NIhoBUjV8C4lMAd27nntWzJJ9N00hEEGQb3zPoWUgRFAOdGzjg4xZu1D87C3MJtdA +Content-Type: application/json-patch+json +costco-x-wcs-clientId: 4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf +client-identifier: 481b1aec-aa3b-454b-b81b-48187e28f205 +Content-Length: 808 
+Origin: https://www.costco.com +DNT: 1 +Sec-GPC: 1 +Connection: keep-alive +Referer: https://www.costco.com/ +Sec-Fetch-Dest: empty +Sec-Fetch-Mode: cors +Sec-Fetch-Site: same-site + +*** Request +Request +{"query":"query receiptsWithCounts($startDate: String!, $endDate: String!,$documentType:String!,$documentSubType:String!) {\n receiptsWithCounts(startDate: $startDate, endDate: $endDate,documentType:$documentType,documentSubType:$documentSubType) {\n inWarehouse\n gasStation\n carWash\n gasAndCarWash\n receipts{\n warehouseName receiptType documentType transactionDateTime transactionBarcode warehouseName transactionType total \n totalItemCount\n itemArray { \n itemNumber\n }\n tenderArray { \n tenderTypeCode\n tenderDescription\n amountTender\n }\n couponArray { \n upcnumberCoupon\n } \n }\n}\n }","variables":{"startDate":"1/01/2026","endDate":"3/31/2026","text":"Last 3 Months","documentType":"all","documentSubType":"all"}} + +*** Response +{"data":{"receiptsWithCounts":{"inWarehouse":2,"gasStation":0,"carWash":0,"gasAndCarWash":0,"receipts":[{"warehouseName":"MT 
VERNON","receiptType":"In-Warehouse","documentType":"WarehouseReceiptDetail","transactionDateTime":"2026-03-12T16:16:00","transactionBarcode":"21111500804012603121616","transactionType":"Sales","total":208.58,"totalItemCount":24,"itemArray":[{"itemNumber":"34779"},{"itemNumber":"7950"},{"itemNumber":"2005"},{"itemNumber":"1941976"},{"itemNumber":"4873222"},{"itemNumber":"374664"},{"itemNumber":"60357"},{"itemNumber":"30669"},{"itemNumber":"1025795"},{"itemNumber":"787876"},{"itemNumber":"22093"},{"itemNumber":"1956177"},{"itemNumber":"1136340"},{"itemNumber":"7609681"},{"itemNumber":"18001"},{"itemNumber":"27003"},{"itemNumber":"1886266"},{"itemNumber":"4102"},{"itemNumber":"87745"},{"itemNumber":"110784"},{"itemNumber":"47492"},{"itemNumber":"2287780"},{"itemNumber":"917546"},{"itemNumber":"1768123"},{"itemNumber":"374558"}],"tenderArray":[{"tenderTypeCode":"061","tenderDescription":"VISA","amountTender":208.58}],"couponArray":[{"upcnumberCoupon":"2100003746641"},{"upcnumberCoupon":"2100003745583"}]},{"warehouseName":"MT VERNON","receiptType":"In-Warehouse","documentType":"WarehouseReceiptDetail","transactionDateTime":"2026-02-14T16:25:00","transactionBarcode":"21111500503322602141625","transactionType":"Sales","total":188.12,"totalItemCount":23,"itemArray":[{"itemNumber":"7812"},{"itemNumber":"7950"},{"itemNumber":"3923"},{"itemNumber":"19813"},{"itemNumber":"87745"},{"itemNumber":"1116038"},{"itemNumber":"5938"},{"itemNumber":"1136340"},{"itemNumber":"30669"},{"itemNumber":"384962"},{"itemNumber":"1331732"},{"itemNumber":"787876"},{"itemNumber":"61576"},{"itemNumber":"110784"},{"itemNumber":"180973"},{"itemNumber":"3"},{"itemNumber":"744361"},{"itemNumber":"1886266"},{"itemNumber":"1025795"},{"itemNumber":"11545"},{"itemNumber":"47492"},{"itemNumber":"260509"}],"tenderArray":[{"tenderTypeCode":"061","tenderDescription":"VISA","amountTender":188.12}],"couponArray":[]}]}}} +** item +*** POST + https://ecom-api.costco.com/ebusiness/order/v1/orders/graphql + +*** 
headers + +POST /ebusiness/order/v1/orders/graphql HTTP/2 +Host: ecom-api.costco.com +User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:148.0) Gecko/20100101 Firefox/148.0 +Accept: */* +Accept-Language: en-US,en;q=0.9 +Accept-Encoding: gzip, deflate, br, zstd +costco.service: restOrders +costco.env: ecom +costco-x-authorization: Bearer eyJhbGciOiJSUzI1NiIsImtpZCI6IlhrZTFoNXg5TV9ZMk5ER0YxU1hDX2xNNnVSTU5tZTJ3STBLRDlHNzl1QmciLCJ0eXAiOiJKV1QifQ.eyJleHAiOjE3NzM2NjUzODUsIm5iZiI6MTc3MzY2NDQ4NSwidmVyIjoiMS4wIiwiaXNzIjoiaHR0cHM6Ly9zaWduaW4uY29zdGNvLmNvbS9lMDcxNGRkNC03ODRkLTQ2ZDYtYTI3OC0zZTI5NTUzNDgzZWIvdjIuMC8iLCJzdWIiOiIzMTIzZWQ2Yy1jNzM4LTRiOTktOTAwZC0xNDE1ZTUzNjA2Y2UiLCJhdWQiOiJhM2E1MTg2Yi03Yzg5LTRiNGMtOTNhOC1kZDYwNGU5MzA3NTciLCJhY3IiOiJCMkNfMUFfU1NPX1dDU19zaWdudXBfc2lnbmluXzIwMSIsIm5vbmNlIjoiNzg5MjIzOGUtOWU3NC00MzExLWI2NDItMzQ1NTY4ZDY3NTk4IiwiaWF0IjoxNzczNjY0NDg1LCJhdXRoX3RpbWUiOjE3NzM2NjQ0ODQsImF1dGhlbnRpY2F0aW9uU291cmNlIjoibG9jYWxBY2NvdW50QXV0aGVudGljYXRpb24iLCJlbWFpbCI6ImpvaG5tb3Nlc2NhcnRlckBnbWFpbC5jb20iLCJuYW1lIjoiRW1wdHkgRGlzcGxheW5hbWUiLCJ1c2VySWRlbnRpdGllcyI6W3siaXNzdWVyIjoiYTNhNTE4NmItN2M4OS00YjRjLTkzYTgtZGQ2MDRlOTMwNzU3IiwiaXNzdWVyVXNlcklkIjoiQUFEOjMxMjNlZDZjLWM3MzgtNGI5OS05MDBkLTE0MTVlNTM2MDZjZSJ9LHsiaXNzdWVyIjoiNDkwMGViMWYtMGMxMC00YmQ5LTk5YzMtYzU5ZTZjMWVjZWJmIiwiaXNzdWVyVXNlcklkIjoiYTZmZmRkOTktNDM2OC00NTgwLTgxOWYtZTZjZjYxM2U1M2M1In0seyJpc3N1ZXIiOiIyZGQ0YjE0NS0zYmRhLTQ2NjktYWU2YS0zN2I4Y2I2ZGFmN2YiLCJpc3N1ZXJVc2VySWQiOiJhNmZmZGQ5OS00MzY4LTQ1ODAtODE5Zi1lNmNmNjEzZTUzYzUifV0sImlzc3VlclVzZXJJZCI6IkFBRDozMTIzZWQ2Yy1jNzM4LTRiOTktOTAwZC0xNDE1ZTUzNjA2Y2UiLCJjbGllbnRJZCI6ImEzYTUxODZiLTdjODktNGI0Yy05M2E4LWRkNjA0ZTkzMDc1NyIsInJlbWVtYmVyTWUiOiJGYWxzZSIsInNlbmRNZUVtYWlsIjoib2ZmIiwiaXBBZGRyZXNzIjoiOTYuMjQxLjIxMi4xMjUiLCJDb3JyZWxhdGlvbklkIjoiMDk0YTE5NDYtZTMwNS00ZDkzLWEyMzQtM2ZiNGMwMjMyNDhhIn0.FdsVFHsewvpQABvkEz4uA0NUlYwvlBEg-frJbUDIJRTsP59Be0bOt8Zqv6cZhUqBn_lTQEyi9tnvpkpycmNy7Rg5zLfYroH6mNALRqkBm8VbcmrEVDM1HmdNTHgO9vQD4TdKm1ZYkA7Pj_6QY3sDxI4ioOzIz1_XOnoJVAXjEwGfr8hgvqtl
aC51M5DsfIGQj3zCaJrQnD6GBJlFmLNUpCulpT16WAaB1lT_pcycfBs-e1xnEd33dX0kHBOZ8pFS-IKjV_44ZK9R8jI9WHx5ThX3-DtyqjkJ0JypmhT9uEa0MeT55U7aeKPbMvQ0exiw3culKgiWDhvdp8e2EkExsg +Content-Type: application/json-patch+json +costco-x-wcs-clientId: 4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf +client-identifier: 481b1aec-aa3b-454b-b81b-48187e28f205 +Content-Length: 2916 +Origin: https://www.costco.com +DNT: 1 +Sec-GPC: 1 +Connection: keep-alive +Referer: https://www.costco.com/ +Sec-Fetch-Dest: empty +Sec-Fetch-Mode: cors +Sec-Fetch-Site: same-site +Priority: u=0 +TE: trailers + +*** request +{"query":"query receiptsWithCounts($barcode: String!,$documentType:String!) {\n receiptsWithCounts(barcode: $barcode,documentType:$documentType) {\nreceipts{\n warehouseName\n receiptType \n documentType \n transactionDateTime \n transactionDate \n companyNumber \n warehouseNumber \n operatorNumber \n warehouseName \n warehouseShortName \n registerNumber \n transactionNumber \n transactionType\n transactionBarcode \n total \n warehouseAddress1 \n warehouseAddress2 \n warehouseCity \n warehouseState \n warehouseCountry \n warehousePostalCode\n totalItemCount \n subTotal \n taxes\n total \n invoiceNumber\n sequenceNumber\n itemArray { \n itemNumber \n itemDescription01 \n frenchItemDescription1 \n itemDescription02 \n frenchItemDescription2 \n itemIdentifier \n itemDepartmentNumber\n unit \n amount \n taxFlag \n merchantID \n entryMethod\n transDepartmentNumber\n fuelUnitQuantity\n fuelGradeCode\n fuelUnitQuantity\n itemUnitPriceAmount\n fuelUomCode\n fuelUomDescription\n fuelUomDescriptionFr\n fuelGradeDescription\n fuelGradeDescriptionFr\n\n } \n tenderArray { \n tenderTypeCode\n tenderSubTypeCode\n tenderDescription \n amountTender \n displayAccountNumber \n sequenceNumber \n approvalNumber \n responseCode \n tenderTypeName \n transactionID \n merchantID \n entryMethod\n tenderAcctTxnNumber \n tenderAuthorizationCode \n tenderTypeName\n tenderTypeNameFr\n tenderEntryMethodDescription\n walletType\n 
walletId\n storedValueBucket\n } \n subTaxes { \n tax1 \n tax2 \n tax3 \n tax4 \n aTaxPercent \n aTaxLegend \n aTaxAmount\n aTaxPrintCode\n aTaxPrintCodeFR \n aTaxIdentifierCode \n bTaxPercent \n bTaxLegend \n bTaxAmount\n bTaxPrintCode\n bTaxPrintCodeFR \n bTaxIdentifierCode \n cTaxPercent \n cTaxLegend \n cTaxAmount\n cTaxIdentifierCode \n dTaxPercent \n dTaxLegend \n dTaxAmount\n dTaxPrintCode\n dTaxPrintCodeFR \n dTaxIdentifierCode\n uTaxLegend\n uTaxAmount\n uTaxableAmount\n } \n instantSavings \n membershipNumber \n }\n }\n }","variables":{"barcode":"21111500804012603121616","documentType":"warehouse"}} + +*** response +{"data":{"receiptsWithCounts":{"receipts":[{"warehouseName":"MT VERNON","receiptType":"In-Warehouse","documentType":"WarehouseReceiptDetail","transactionDateTime":"2026-03-12T16:16:00","transactionDate":"2026-03-12","companyNumber":1,"warehouseNumber":1115,"operatorNumber":43,"warehouseShortName":"MT VERNON","registerNumber":8,"transactionNumber":401,"transactionType":"Sales","transactionBarcode":"21111500804012603121616","total":208.58,"warehouseAddress1":"7940 RICHMOND HWY","warehouseAddress2":null,"warehouseCity":"ALEXANDRIA","warehouseState":"VA","warehouseCountry":"US","warehousePostalCode":"22306","totalItemCount":24,"subTotal":202.01,"taxes":6.57,"invoiceNumber":null,"sequenceNumber":null,"itemArray":[{"itemNumber":"34779","itemDescription01":"ROMANO","frenchItemDescription1":null,"itemDescription02":"CS=15 SL120 T9H6","frenchItemDescription2":null,"itemIdentifier":"E","itemDepartmentNumber":19,"unit":1,"amount":20.93,"taxFlag":"3","merchantID":null,"entryMethod":null,"transDepartmentNumber":19,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":11.69,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"7950","itemDescription01":"4LB 
COSMIC","frenchItemDescription1":null,"itemDescription02":null,"frenchItemDescription2":null,"itemIdentifier":"E","itemDepartmentNumber":65,"unit":1,"amount":5.99,"taxFlag":"3","merchantID":null,"entryMethod":null,"transDepartmentNumber":65,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":5.99,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"2005","itemDescription01":"25# FLOUR","frenchItemDescription1":null,"itemDescription02":"ALL-PURPOSE HARV P98/100","frenchItemDescription2":null,"itemIdentifier":"E","itemDepartmentNumber":13,"unit":1,"amount":9.49,"taxFlag":"3","merchantID":null,"entryMethod":null,"transDepartmentNumber":13,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":9.49,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"1941976","itemDescription01":"BREAD FLOUR","frenchItemDescription1":null,"itemDescription02":"12 LBS 180P 20X9","frenchItemDescription2":null,"itemIdentifier":"E","itemDepartmentNumber":13,"unit":1,"amount":9.99,"taxFlag":"3","merchantID":null,"entryMethod":null,"transDepartmentNumber":13,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":9.99,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"4873222","itemDescription01":"ALL F&C","frenchItemDescription1":null,"itemDescription02":"200OZ 160LOADS 
P104","frenchItemDescription2":null,"itemIdentifier":null,"itemDepartmentNumber":14,"unit":1,"amount":19.99,"taxFlag":"Y","merchantID":null,"entryMethod":null,"transDepartmentNumber":14,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":19.99,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"374664","itemDescription01":"/ 4873222","frenchItemDescription1":"/4873222","itemDescription02":null,"frenchItemDescription2":null,"itemIdentifier":null,"itemDepartmentNumber":14,"unit":-1,"amount":-5,"taxFlag":null,"merchantID":null,"entryMethod":null,"transDepartmentNumber":14,"fuelUnitQuantity":null,"fuelGradeCode":null,"itemUnitPriceAmount":0,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"60357","itemDescription01":"MIXED PEPPER","frenchItemDescription1":null,"itemDescription02":"6-PACK","frenchItemDescription2":null,"itemIdentifier":"E","itemDepartmentNumber":65,"unit":1,"amount":7.49,"taxFlag":"3","merchantID":null,"entryMethod":null,"transDepartmentNumber":65,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":7.49,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"30669","itemDescription01":"BANANAS","frenchItemDescription1":null,"itemDescription02":"3 LB / 1.36 KG","frenchItemDescription2":null,"itemIdentifier":"E","itemDepartmentNumber":65,"unit":2,"amount":2.98,"taxFlag":"3","merchantID":null,"entryMethod":null,"transDepartmentNumber":65,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":1.49,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"1025795","itemDescription01":"KS 5DZ EGGS","frenchItemDescription1":null,"itemDescription02":"SL21 
P120 / P132 / P144","frenchItemDescription2":null,"itemIdentifier":"E","itemDepartmentNumber":17,"unit":1,"amount":9.39,"taxFlag":"3","merchantID":null,"entryMethod":null,"transDepartmentNumber":17,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":9.39,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"787876","itemDescription01":"KS TWNY PORT","frenchItemDescription1":null,"itemDescription02":"PORTUGAL CSPC# 773506","frenchItemDescription2":null,"itemIdentifier":null,"itemDepartmentNumber":16,"unit":1,"amount":17.99,"taxFlag":"Y","merchantID":null,"entryMethod":null,"transDepartmentNumber":16,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":17.99,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"22093","itemDescription01":"KS SHRP CHDR","frenchItemDescription1":null,"itemDescription02":"EC20T9H5 W12T13H5 SL130","frenchItemDescription2":null,"itemIdentifier":"E","itemDepartmentNumber":17,"unit":1,"amount":5.49,"taxFlag":"3","merchantID":null,"entryMethod":null,"transDepartmentNumber":17,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":5.49,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"1956177","itemDescription01":"BRWNBTTRGRV","frenchItemDescription1":null,"itemDescription02":"MCCORMICK C12T19H7 L228","frenchItemDescription2":null,"itemIdentifier":"E","itemDepartmentNumber":13,"unit":1,"amount":2.97,"taxFlag":"3","merchantID":null,"entryMethod":null,"transDepartmentNumber":13,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":2.97,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"1136340","itemDescription01":"3LB 
ORG GALA","frenchItemDescription1":null,"itemDescription02":null,"frenchItemDescription2":null,"itemIdentifier":"E","itemDepartmentNumber":65,"unit":1,"amount":4.49,"taxFlag":"3","merchantID":null,"entryMethod":null,"transDepartmentNumber":65,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":4.49,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"7609681","itemDescription01":"CASCADE GEL","frenchItemDescription1":null,"itemDescription02":"125OZ T60H3P180","frenchItemDescription2":null,"itemIdentifier":null,"itemDepartmentNumber":14,"unit":1,"amount":12.49,"taxFlag":"Y","merchantID":null,"entryMethod":null,"transDepartmentNumber":14,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":12.49,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"18001","itemDescription01":"TBLE SALT 4#","frenchItemDescription1":null,"itemDescription02":"DIAMOND CRYSTAL P=600","frenchItemDescription2":null,"itemIdentifier":"E","itemDepartmentNumber":13,"unit":1,"amount":1.49,"taxFlag":"3","merchantID":null,"entryMethod":null,"transDepartmentNumber":13,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":1.49,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"27003","itemDescription01":"STRAWBERRIES","frenchItemDescription1":null,"itemDescription02":"908 G / 2 
LB","frenchItemDescription2":null,"itemIdentifier":"E","itemDepartmentNumber":65,"unit":1,"amount":5.29,"taxFlag":"3","merchantID":null,"entryMethod":null,"transDepartmentNumber":65,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":5.29,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"1886266","itemDescription01":"SKO 5%","frenchItemDescription1":null,"itemDescription02":"48 OZ T10H8 SL30","frenchItemDescription2":null,"itemIdentifier":"E","itemDepartmentNumber":17,"unit":1,"amount":5.79,"taxFlag":"3","merchantID":null,"entryMethod":null,"transDepartmentNumber":17,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":5.79,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"4102","itemDescription01":"8\" TORTILLAS","frenchItemDescription1":null,"itemDescription02":"SL10 70OZ","frenchItemDescription2":null,"itemIdentifier":"E","itemDepartmentNumber":13,"unit":1,"amount":5.99,"taxFlag":"3","merchantID":null,"entryMethod":null,"transDepartmentNumber":13,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":5.99,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"87745","itemDescription01":"ROTISSERIE","frenchItemDescription1":null,"itemDescription02":"USDA GRADE A","frenchItemDescription2":null,"itemIdentifier":null,"itemDepartmentNumber":63,"unit":1,"amount":4.99,"taxFlag":"D","merchantID":null,"entryMethod":null,"transDepartmentNumber":63,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":4.99,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"110784","itemDescription01":"15 GRAIN 
BRD","frenchItemDescription1":null,"itemDescription02":"PEPPERIDGE FARM 2/24 OZ","frenchItemDescription2":null,"itemIdentifier":"E","itemDepartmentNumber":13,"unit":1,"amount":5.69,"taxFlag":"3","merchantID":null,"entryMethod":null,"transDepartmentNumber":13,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":5.69,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"47492","itemDescription01":"CELERY SALAD","frenchItemDescription1":null,"itemDescription02":"APPLE CIDER VINAIGRETTE","frenchItemDescription2":null,"itemIdentifier":"E","itemDepartmentNumber":63,"unit":1,"amount":12.62,"taxFlag":"D","merchantID":null,"entryMethod":null,"transDepartmentNumber":63,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":4.99,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"2287780","itemDescription01":"BTB CHICKEN","frenchItemDescription1":null,"itemDescription02":"C12T10H9 P1080 SL630","frenchItemDescription2":null,"itemIdentifier":"E","itemDepartmentNumber":13,"unit":1,"amount":9.49,"taxFlag":"3","merchantID":null,"entryMethod":null,"transDepartmentNumber":13,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":9.49,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"917546","itemDescription01":"JIF CREAMY","frenchItemDescription1":null,"itemDescription02":"PEANUT BUTTER SL540 
P300","frenchItemDescription2":null,"itemIdentifier":"E","itemDepartmentNumber":13,"unit":1,"amount":11.99,"taxFlag":"3","merchantID":null,"entryMethod":null,"transDepartmentNumber":13,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":11.99,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"1768123","itemDescription01":"BBEE KIDS4PC","frenchItemDescription1":null,"itemDescription02":"FY26 P1600 T200 H8","frenchItemDescription2":null,"itemIdentifier":null,"itemDepartmentNumber":39,"unit":1,"amount":17.99,"taxFlag":"Y","merchantID":null,"entryMethod":null,"transDepartmentNumber":39,"fuelUnitQuantity":10.0,"fuelGradeCode":null,"itemUnitPriceAmount":17.99,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null},{"itemNumber":"374558","itemDescription01":"/ 1768123","frenchItemDescription1":"/1768123","itemDescription02":null,"frenchItemDescription2":null,"itemIdentifier":null,"itemDepartmentNumber":39,"unit":-1,"amount":-4,"taxFlag":null,"merchantID":null,"entryMethod":null,"transDepartmentNumber":39,"fuelUnitQuantity":null,"fuelGradeCode":null,"itemUnitPriceAmount":0,"fuelUomCode":null,"fuelUomDescription":null,"fuelUomDescriptionFr":null,"fuelGradeDescription":null,"fuelGradeDescriptionFr":null}],"tenderArray":[{"tenderTypeCode":"061","tenderSubTypeCode":null,"tenderDescription":"VISA","amountTender":208.58,"displayAccountNumber":"9070","sequenceNumber":null,"approvalNumber":null,"responseCode":null,"tenderTypeName":"VISA","transactionID":null,"merchantID":null,"entryMethod":null,"tenderAcctTxnNumber":null,"tenderAuthorizationCode":null,"tenderTypeNameFr":null,"tenderEntryMethodDescription":null,"walletType":null,"walletId":null,"storedValueBucket":null}],"subTaxes":{"tax1":null,"tax2":null,"tax3":null,"tax4":null,"aTaxPercent":null,"aTaxLegend":"A","aTaxAmount":4.62,"aTaxPrintCode":
null,"aTaxPrintCodeFR":null,"aTaxIdentifierCode":null,"bTaxPercent":null,"bTaxLegend":null,"bTaxAmount":null,"bTaxPrintCode":null,"bTaxPrintCodeFR":null,"bTaxIdentifierCode":null,"cTaxPercent":null,"cTaxLegend":"C","cTaxAmount":1.25,"cTaxIdentifierCode":null,"dTaxPercent":null,"dTaxLegend":"D","dTaxAmount":0.7,"dTaxPrintCode":null,"dTaxPrintCodeFR":null,"dTaxIdentifierCode":null,"uTaxLegend":null,"uTaxAmount":null,"uTaxableAmount":null},"instantSavings":9,"membershipNumber":"111894291684"}]}}} + diff --git a/pm/tasks.org b/pm/tasks.org index 0f7ad9b..cdb0c7c 100644 --- a/pm/tasks.org +++ b/pm/tasks.org @@ -16,7 +16,7 @@ - tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python scraper.py --help`; verified `.env` loading via `scraper.load_config()` - date: 2026-03-14 -* [ ] t1.2: define grocery data model and file layout (1-2 commits) +* [X] t1.2: define grocery data model and file layout (1-2 commits) ** acceptance criteria - decide and document the files/directories for: - retailer raw exports @@ -32,11 +32,11 @@ - keep schema minimal but extensible ** evidence -- commit: -- tests: -- date: +- commit: `42dbae1` on branch `cx` +- tests: reviewed `giant_output/raw/history.json`, one sample raw order json, `giant_output/orders.csv`, `giant_output/items.csv`; documented schemas in `pm/data-model.org` +- date: 2026-03-15 -* [ ] t1.3: build giant parser/enricher from raw json (2-4 commits) +* [X] t1.3: build giant parser/enricher from raw json (2-4 commits) ** acceptance criteria - parser reads giant raw order json files - outputs `items_enriched.csv` @@ -54,11 +54,11 @@ - parser should preserve ambiguity rather than hallucinating precision ** evidence -- commit: -- tests: -- date: +- commit: `14f2cc2` on branch `cx` +- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python enrich_giant.py`; verified `giant_output/items_enriched.csv` on real raw data +- date: 2026-03-16 -* [ ] t1.4: generate observed-product layer from enriched 
items (2-3 commits) +* [X] t1.4: generate observed-product layer from enriched items (2-3 commits) ** acceptance criteria - distinct observed products are generated from enriched giant items @@ -76,11 +76,11 @@ - likely key is some combo of retailer + upc + normalized name ** evidence -- commit: -- tests: -- date: +- commit: `dc39214` on branch `cx` +- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python build_observed_products.py`; verified `giant_output/products_observed.csv` +- date: 2026-03-16 -* [ ] t1.5: build review queue for unresolved or low-confidence products (1-3 commits) +* [X] t1.5: build review queue for unresolved or low-confidence products (1-3 commits) ** acceptance criteria - produce a review file containing observed products needing manual review @@ -98,11 +98,11 @@ - optimize for “approve once, remember forever” ** evidence -- commit: -- tests: -- date: +- commit: `9b13ec3` on branch `cx` +- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python build_review_queue.py`; verified `giant_output/review_queue.csv` +- date: 2026-03-16 -* [ ] t1.6: create canonical product layer and observed→canonical links (2-4 commits) +* [X] t1.6: create canonical product layer and observed→canonical links (2-4 commits) ** acceptance criteria - define and create `products_canonical.csv` @@ -120,11 +120,11 @@ - do not require llm assistance for v1 ** evidence -- commit: -- tests: -- date: +- commit: `347cd44` on branch `cx` +- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python build_canonical_layer.py`; verified seeded `giant_output/products_canonical.csv` and `giant_output/product_links.csv` +- date: 2026-03-16 -* [ ] t1.7: implement auto-link rules for easy matches (2-3 commits) +* [X] t1.7: implement auto-link rules for easy matches (2-3 commits) ** acceptance criteria - auto-link can match observed products to canonical products using deterministic rules @@ -139,43 +139,140 @@ - false 
positives are worse than unresolved items ** evidence -- commit: -- tests: -- date: +- commit: `385a31c` on branch `cx` +- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python build_canonical_layer.py`; verified auto-linked `giant_output/products_canonical.csv` and `giant_output/product_links.csv` +- date: 2026-03-16 -* [ ] t1.8: support costco raw ingest path (2-5 commits) +* [X] t1.8: support costco raw ingest path (2-5 commits) ** acceptance criteria - add a costco-specific raw ingest/export path -- output costco line items into the same shared raw/enriched schema family -- confirm at least one product class can exist as: - - giant observed product - - costco observed product - - one shared canonical product +- fetch costco receipt summary and receipt detail payloads from graphql endpoint +- persist raw json under `costco_output/raw/orders.csv` and `./items.csv`, same format as giant +- costco-native identifiers such as `transactionBarcode` as order id and `itemNumber` as retailer item id +- preserve discount/coupon rows rather than dropping ** notes -- this is the proof that the architecture generalizes -- don’t chase perfection before the second retailer lands +- focus on raw costco acquisistion and flattening +- do not force costco identifiers into `upc` +- bearer/auth values should come from local env, not source ** evidence -- commit: -- tests: -- date: +- commit: `da00288` on branch `cx` +- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python scrape_costco.py --help`; verified `costco_output/raw/*.json`, `costco_output/orders.csv`, and `costco_output/items.csv` from the local sample payload +- date: 2026-03-16 -* [ ] t1.9: compute normalized comparison metrics (2-3 commits) +* [X] t1.8.1: support costco parser/enricher path (2-4 commits) ** acceptance criteria -- derive normalized comparison fields where possible: - - price per lb - - price per oz - - price per each - - price per count -- metrics are attached 
at canonical or linked-observed level as appropriate -- emit obvious nulls when basis is unknown rather than inventing values +- add a costco-specific enrich step producing `costco_output/items_enriched.csv` +- output rows into the same shared enriched schema family as Giant +- support costco-specific parsing for: + - `itemDescription01` + `itemDescription02` + - `itemNumber` as `retailer_item_id` + - discount lines / negative rows + - common size patterns such as `25#`, `48 OZ`, `2/24 OZ`, `6-PACK` +- preserve obvious unknowns as blank rather than guessed values ** notes -- this is where “gala apples 5 lb bag vs other gala apples” becomes possible -- units discipline matters a lot here +- this is the real schema compatibility proof, not raw ingest alone +- expect weaker identifiers than Giant + +** evidence +- commit: `da00288` on branch `cx` +- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python enrich_costco.py`; verified `costco_output/items_enriched.csv` +- date: 2026-03-16 +* [X] t1.8.2: validate cross-retailer observed/canonical flow (1-3 commits) + +** acceptance criteria +- feed Giant and Costco enriched rows through the same observed/canonical pipeline +- confirm at least one product class can exist as: + - Giant observed product + - Costco observed product + - one shared canonical product +- document the exact example used for proof + +** notes +- keep this to one or two well-behaved product classes first +- apples, eggs, bananas, or flour are better than weird prepared foods + +** evidence +- commit: `da00288` on branch `cx` +- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python validate_cross_retailer_flow.py`; proof example: Giant `FRESH BANANA` and Costco `BANANAS 3 LB / 1.36 KG` share one canonical in `combined_output/proof_examples.csv` +- date: 2026-03-16 +* [X] t1.8.3: extend shared schema for retailer-native ids and adjustment lines (1-2 commits) + +** acceptance criteria +- add shared fields 
needed for non-upc retailers, including: + - `retailer_item_id` + - `is_discount_line` + - `is_coupon_line` or equivalent if needed +- keep `upc` nullable across the pipeline +- update downstream builders/tests to accept retailers with blank `upc` + +** notes +- this prevents costco from becoming a schema hack +- do this once instead of sprinkling exceptions everywhere + +** evidence +- commit: `9497565` on branch `cx` +- tests: `./venv/bin/python -m unittest discover -s tests`; verified shared enriched fields in `giant_output/items_enriched.csv` and `costco_output/items_enriched.csv` +- date: 2026-03-16 +* [X] t1.8.4: verify and correct costco receipt enumeration (1–2 commits) + +** acceptance criteria +- confirm graphql summary query returns all expected receipts +- compare `inWarehouse` count vs number of `receipts` returned +- widen or parameterize date window if necessary; website shows receipts in 3-month windows +- persist request metadata (`startDate`, `endDate`, `documentType`, `documentSubType`) +- emit warning when receipt counts mismatch + +** notes +- goal is to confirm we are enumerating all receipts before parsing +- do not expand schema or parser logic in this task +- keep changes limited to summary query handling and diagnostics + +** evidence +- commit: `ac82fa6` on branch `cx` +- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python scrape_costco.py --help`; reviewed the sample Costco summary request in `pm/scrape-giant.org` against `costco_output/raw/summary.json` and added 3-month window chunking plus mismatch diagnostics +- date: 2026-03-16 +* [X] t1.8.5: refactor costco scraper auth and UX with giant scraper + +** acceptance criteria +- remove manual auth env vars +- load costco cookies from firefox session +- require only logged-in browser +- replace start/end date flags with --months-back +- maintain same raw output structure +- ensure summary_lookup keys are collision-safe by using a composite key (transactionBarcode 
+ transactionDateTime) instead of transactionBarcode alone + +** notes +- align Costco acquisition ergonomics with the Giant scraper +- keep downstream Costco parsing and shared schemas unchanged + +** evidence +- commit: `c0054dc` on branch `cx` +- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python scrape_costco.py --help`; verified Costco summary/detail flattening now uses composite receipt keys in unit tests +- date: 2026-03-16 +* [ ] t1.9: compute normalized comparison metrics (2-4 commits) + +** acceptance criteria +- derive normalized comparison fields where possible on enriched or observed product rows: + - `price_per_lb` + - `price_per_oz` + - `price_per_each` + - `price_per_count` +- preserve the source basis used to derive each metric, e.g.: + - parsed size/unit + - receipt weight + - explicit count/pack +- emit nulls when basis is unknown, conflicting, or ambiguous +- document at least one Giant vs Costco comparison example using the normalized metrics + +** notes +- compute metrics as close to the raw observation as possible +- canonical layer can aggregate later, but should not invent missing unit economics +- unit discipline matters more than coverage ** evidence - commit: diff --git a/scrape_costco.py b/scrape_costco.py new file mode 100644 index 0000000..1f607b4 --- /dev/null +++ b/scrape_costco.py @@ -0,0 +1,619 @@ +import csv +import json +import time +from calendar import monthrange +from datetime import datetime, timedelta +from pathlib import Path + +import click +import browser_cookie3 +from curl_cffi import requests + +BASE_URL = "https://ecom-api.costco.com/ebusiness/order/v1/orders/graphql" +RETAILER = "costco" + +SUMMARY_QUERY = """ +query receiptsWithCounts($startDate: String!, $endDate: String!, $documentType: String!, $documentSubType: String!) 
{ + receiptsWithCounts(startDate: $startDate, endDate: $endDate, documentType: $documentType, documentSubType: $documentSubType) { + inWarehouse + gasStation + carWash + gasAndCarWash + receipts { + warehouseName + receiptType + documentType + transactionDateTime + transactionBarcode + warehouseName + transactionType + total + totalItemCount + itemArray { + itemNumber + } + tenderArray { + tenderTypeCode + tenderDescription + amountTender + } + couponArray { + upcnumberCoupon + } + } + } +} +""".strip() + +DETAIL_QUERY = """ +query receiptsWithCounts($barcode: String!, $documentType: String!) { + receiptsWithCounts(barcode: $barcode, documentType: $documentType) { + receipts { + warehouseName + receiptType + documentType + transactionDateTime + transactionDate + companyNumber + warehouseNumber + operatorNumber + warehouseShortName + registerNumber + transactionNumber + transactionType + transactionBarcode + total + warehouseAddress1 + warehouseAddress2 + warehouseCity + warehouseState + warehouseCountry + warehousePostalCode + totalItemCount + subTotal + taxes + total + invoiceNumber + sequenceNumber + itemArray { + itemNumber + itemDescription01 + frenchItemDescription1 + itemDescription02 + frenchItemDescription2 + itemIdentifier + itemDepartmentNumber + unit + amount + taxFlag + merchantID + entryMethod + transDepartmentNumber + fuelUnitQuantity + fuelGradeCode + itemUnitPriceAmount + fuelUomCode + fuelUomDescription + fuelUomDescriptionFr + fuelGradeDescription + fuelGradeDescriptionFr + } + tenderArray { + tenderTypeCode + tenderSubTypeCode + tenderDescription + amountTender + displayAccountNumber + sequenceNumber + approvalNumber + responseCode + tenderTypeName + transactionID + merchantID + entryMethod + tenderAcctTxnNumber + tenderAuthorizationCode + tenderTypeNameFr + tenderEntryMethodDescription + walletType + walletId + storedValueBucket + } + subTaxes { + tax1 + tax2 + tax3 + tax4 + aTaxPercent + aTaxLegend + aTaxAmount + aTaxPrintCode + aTaxPrintCodeFR 
+ aTaxIdentifierCode + bTaxPercent + bTaxLegend + bTaxAmount + bTaxPrintCode + bTaxPrintCodeFR + bTaxIdentifierCode + cTaxPercent + cTaxLegend + cTaxAmount + cTaxIdentifierCode + dTaxPercent + dTaxLegend + dTaxAmount + dTaxPrintCode + dTaxPrintCodeFR + dTaxIdentifierCode + uTaxLegend + uTaxAmount + uTaxableAmount + } + instantSavings + membershipNumber + } + } +} +""".strip() + +ORDER_FIELDS = [ + "retailer", + "order_id", + "order_date", + "delivery_date", + "service_type", + "order_total", + "payment_method", + "total_item_count", + "total_savings", + "your_savings_total", + "coupons_discounts_total", + "store_name", + "store_number", + "store_address1", + "store_city", + "store_state", + "store_zipcode", + "refund_order", + "ebt_order", + "raw_history_path", + "raw_order_path", +] + +ITEM_FIELDS = [ + "retailer", + "order_id", + "line_no", + "order_date", + "retailer_item_id", + "pod_id", + "item_name", + "upc", + "category_id", + "category", + "qty", + "unit", + "unit_price", + "line_total", + "picked_weight", + "mvp_savings", + "reward_savings", + "coupon_savings", + "coupon_price", + "image_url", + "raw_order_path", + "is_discount_line", + "is_coupon_line", +] + + +def build_headers(): + return { + "accept": "*/*", + "content-type": "application/json-patch+json", + "costco.service": "restOrders", + "costco.env": "ecom", + "origin": "https://www.costco.com", + "referer": "https://www.costco.com/", + "user-agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:148.0) " + "Gecko/20100101 Firefox/148.0" + ), + } + + +def build_session(): + session = requests.Session() + session.cookies.update(browser_cookie3.firefox(domain_name="costco.com")) + session.headers.update(build_headers()) + return session + + +def graphql_post(session, query, variables): + last_response = None + + for attempt in range(3): + try: + response = session.post( + BASE_URL, + json={"query": query, "variables": variables}, + impersonate="firefox", + timeout=30, + ) + last_response = 
response + if response.status_code == 200: + return response.json() + click.echo(f"retry {attempt + 1}/3 status={response.status_code}") + except Exception as exc: # pragma: no cover - network error path + click.echo(f"retry {attempt + 1}/3 error={exc}") + time.sleep(3) + + if last_response is not None: + last_response.raise_for_status() + + raise RuntimeError("failed to fetch Costco GraphQL payload") + + +def summary_receipts(payload): + return payload.get("data", {}).get("receiptsWithCounts", {}).get("receipts", []) + + +def detail_receipts(payload): + return payload.get("data", {}).get("receiptsWithCounts", {}).get("receipts", []) + + +def summary_counts(payload): + counts = payload.get("data", {}).get("receiptsWithCounts", {}) + return { + "inWarehouse": counts.get("inWarehouse", 0) or 0, + "gasStation": counts.get("gasStation", 0) or 0, + "carWash": counts.get("carWash", 0) or 0, + "gasAndCarWash": counts.get("gasAndCarWash", 0) or 0, + } + + +def parse_cli_date(value): + return datetime.strptime(value, "%m/%d/%Y").date() + + +def format_cli_date(value): + return f"{value.month}/{value.day:02d}/{value.year}" + + +def subtract_months(value, months): + year = value.year + month = value.month - months + while month <= 0: + month += 12 + year -= 1 + day = min(value.day, monthrange(year, month)[1]) + return value.replace(year=year, month=month, day=day) + + +def resolve_date_range(months_back, today=None): + if months_back < 1: + raise click.ClickException("months-back must be at least 1") + + end = today or datetime.now().date() + start = subtract_months(end, months_back) + return format_cli_date(start), format_cli_date(end) + + +def build_date_windows(start_date, end_date, window_days): + start = parse_cli_date(start_date) + end = parse_cli_date(end_date) + if end < start: + raise click.ClickException("end-date must be on or after start-date") + if window_days < 1: + raise click.ClickException("window-days must be at least 1") + + windows = [] + current = start + 
while current <= end: + window_end = min(current + timedelta(days=window_days - 1), end) + windows.append( + { + "startDate": format_cli_date(current), + "endDate": format_cli_date(window_end), + } + ) + current = window_end + timedelta(days=1) + return windows + + +def unique_receipts(receipts): + by_barcode = {} + for receipt in receipts: + key = receipt_key(receipt) + if key: + by_barcode[key] = receipt + return list(by_barcode.values()) + + +def receipt_key(receipt): + barcode = receipt.get("transactionBarcode", "") + transaction_date_time = receipt.get("transactionDateTime", "") + if not barcode: + return "" + return f"{barcode}::{transaction_date_time}" + + +def fetch_summary_windows( + session, + start_date, + end_date, + document_type, + document_sub_type, + window_days, +): + requests_metadata = [] + combined_receipts = [] + + for window in build_date_windows(start_date, end_date, window_days): + variables = { + "startDate": window["startDate"], + "endDate": window["endDate"], + "text": "custom", + "documentType": document_type, + "documentSubType": document_sub_type, + } + payload = graphql_post(session, SUMMARY_QUERY, variables) + receipts = summary_receipts(payload) + counts = summary_counts(payload) + warehouse_count = sum( + 1 for receipt in receipts if receipt.get("receiptType") == "In-Warehouse" + ) + mismatch = counts["inWarehouse"] != warehouse_count + requests_metadata.append( + { + **variables, + "returnedReceipts": len(receipts), + "returnedInWarehouseReceipts": warehouse_count, + "inWarehouse": counts["inWarehouse"], + "gasStation": counts["gasStation"], + "carWash": counts["carWash"], + "gasAndCarWash": counts["gasAndCarWash"], + "countMismatch": mismatch, + } + ) + if mismatch: + click.echo( + ( + "warning: summary count mismatch for " + f"{window['startDate']} to {window['endDate']}: " + f"inWarehouse={counts['inWarehouse']} " + f"returnedInWarehouseReceipts={warehouse_count}" + ), + err=True, + ) + combined_receipts.extend(receipts) + + 
unique = unique_receipts(combined_receipts) + aggregate_payload = { + "data": { + "receiptsWithCounts": { + "inWarehouse": sum(row["inWarehouse"] for row in requests_metadata), + "gasStation": sum(row["gasStation"] for row in requests_metadata), + "carWash": sum(row["carWash"] for row in requests_metadata), + "gasAndCarWash": sum(row["gasAndCarWash"] for row in requests_metadata), + "receipts": unique, + } + } + } + return aggregate_payload, requests_metadata + + +def flatten_costco_data(summary_payload, detail_payloads, raw_dir): + summary_lookup = { + receipt_key(receipt): receipt + for receipt in summary_receipts(summary_payload) + if receipt_key(receipt) + } + orders = [] + items = [] + + for detail_payload in detail_payloads: + for receipt in detail_receipts(detail_payload): + order_id = receipt["transactionBarcode"] + receipt_id = receipt_key(receipt) + summary_row = summary_lookup.get(receipt_id, {}) + coupon_numbers = { + row.get("upcnumberCoupon", "") + for row in summary_row.get("couponArray", []) or [] + if row.get("upcnumberCoupon") + } + raw_order_path = raw_dir / f"{receipt_id or order_id}.json" + + orders.append( + { + "retailer": RETAILER, + "order_id": order_id, + "order_date": receipt.get("transactionDate", ""), + "delivery_date": receipt.get("transactionDate", ""), + "service_type": receipt.get("receiptType", ""), + "order_total": stringify(receipt.get("total")), + "payment_method": compact_join( + summary_row.get("tenderArray", []) or [], "tenderDescription" + ), + "total_item_count": stringify(receipt.get("totalItemCount")), + "total_savings": stringify(receipt.get("instantSavings")), + "your_savings_total": stringify(receipt.get("instantSavings")), + "coupons_discounts_total": stringify(receipt.get("instantSavings")), + "store_name": receipt.get("warehouseName", ""), + "store_number": stringify(receipt.get("warehouseNumber")), + "store_address1": receipt.get("warehouseAddress1", ""), + "store_city": receipt.get("warehouseCity", ""), + 
"store_state": receipt.get("warehouseState", ""), + "store_zipcode": receipt.get("warehousePostalCode", ""), + "refund_order": "false", + "ebt_order": "false", + "raw_history_path": (raw_dir / "summary.json").as_posix(), + "raw_order_path": raw_order_path.as_posix(), + } + ) + + for line_no, item in enumerate(receipt.get("itemArray", []), start=1): + item_number = stringify(item.get("itemNumber")) + description = join_descriptions( + item.get("itemDescription01"), item.get("itemDescription02") + ) + is_discount = is_discount_line(item) + is_coupon = is_discount and ( + item_number in coupon_numbers + or description.startswith("/") + ) + + items.append( + { + "retailer": RETAILER, + "order_id": order_id, + "line_no": str(line_no), + "order_date": receipt.get("transactionDate", ""), + "retailer_item_id": item_number, + "pod_id": "", + "item_name": description, + "upc": "", + "category_id": stringify(item.get("itemDepartmentNumber")), + "category": stringify(item.get("transDepartmentNumber")), + "qty": stringify(item.get("unit")), + "unit": stringify(item.get("itemIdentifier")), + "unit_price": stringify(item.get("itemUnitPriceAmount")), + "line_total": stringify(item.get("amount")), + "picked_weight": "", + "mvp_savings": "", + "reward_savings": "", + "coupon_savings": stringify(item.get("amount") if is_coupon else ""), + "coupon_price": "", + "image_url": "", + "raw_order_path": raw_order_path.as_posix(), + "is_discount_line": "true" if is_discount else "false", + "is_coupon_line": "true" if is_coupon else "false", + } + ) + + return orders, items + + +def join_descriptions(*parts): + return " ".join(str(part).strip() for part in parts if part).strip() + + +def compact_join(rows, field): + values = [str(row.get(field, "")).strip() for row in rows if row.get(field)] + return " | ".join(values) + + +def is_discount_line(item): + amount = item.get("amount") + unit = item.get("unit") + description = join_descriptions( + item.get("itemDescription01"), 
item.get("itemDescription02") + ) + try: + amount_val = float(amount) + except (TypeError, ValueError): + amount_val = 0.0 + try: + unit_val = float(unit) + except (TypeError, ValueError): + unit_val = 0.0 + return amount_val < 0 or unit_val < 0 or description.startswith("/") + + +def stringify(value): + if value is None: + return "" + return str(value) + + +def write_json(path, payload): + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, indent=2), encoding="utf-8") + + +def write_csv(path, rows, fieldnames): + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", newline="", encoding="utf-8") as handle: + writer = csv.DictWriter(handle, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + + +@click.command() +@click.option( + "--outdir", + default="costco_output", + show_default=True, + help="Output directory for Costco raw and flattened files.", +) +@click.option( + "--document-type", + default="all", + show_default=True, + help="Summary document type.", +) +@click.option( + "--document-sub-type", + default="all", + show_default=True, + help="Summary document sub type.", +) +@click.option( + "--window-days", + default=92, + show_default=True, + type=int, + help="Maximum number of days to request per summary window.", +) +@click.option( + "--months-back", + default=3, + show_default=True, + type=int, + help="How many months of receipts to enumerate back from today.", +) +def main(outdir, document_type, document_sub_type, window_days, months_back): + outdir = Path(outdir) + raw_dir = outdir / "raw" + try: + session = build_session() + except Exception as exc: + raise click.ClickException( + f"failed to load Costco Firefox cookies: {exc}" + ) from exc + start_date, end_date = resolve_date_range(months_back) + + summary_payload, request_metadata = fetch_summary_windows( + session, + start_date, + end_date, + document_type, + document_sub_type, + window_days, + ) + write_json(raw_dir / 
"summary.json", summary_payload) + write_json(raw_dir / "summary_requests.json", request_metadata) + receipts = summary_receipts(summary_payload) + + detail_payloads = [] + for receipt in receipts: + barcode = receipt["transactionBarcode"] + receipt_id = receipt_key(receipt) or barcode + click.echo(f"fetching {barcode}") + detail_payload = graphql_post( + session, + DETAIL_QUERY, + {"barcode": barcode, "documentType": "warehouse"}, + ) + detail_payloads.append(detail_payload) + write_json(raw_dir / f"{receipt_id}.json", detail_payload) + + orders, items = flatten_costco_data(summary_payload, detail_payloads, raw_dir) + write_csv(outdir / "orders.csv", orders, ORDER_FIELDS) + write_csv(outdir / "items.csv", items, ITEM_FIELDS) + click.echo(f"wrote {len(orders)} orders and {len(items)} item rows to {outdir}") + + +if __name__ == "__main__": + main() diff --git a/tests/test_canonical_layer.py b/tests/test_canonical_layer.py new file mode 100644 index 0000000..5b45d44 --- /dev/null +++ b/tests/test_canonical_layer.py @@ -0,0 +1,99 @@ +import unittest + +import build_canonical_layer + + +class CanonicalLayerTests(unittest.TestCase): + def test_build_canonical_layer_auto_links_exact_upc_and_name_size(self): + observed_rows = [ + { + "observed_product_id": "gobs_1", + "representative_upc": "111", + "representative_retailer_item_id": "11", + "representative_name_norm": "GALA APPLE", + "representative_brand": "SB", + "representative_variant": "", + "representative_size_value": "5", + "representative_size_unit": "lb", + "representative_pack_qty": "", + "representative_measure_type": "weight", + "is_fee": "false", + "is_discount_line": "false", + "is_coupon_line": "false", + }, + { + "observed_product_id": "gobs_2", + "representative_upc": "111", + "representative_retailer_item_id": "12", + "representative_name_norm": "LARGE WHITE EGGS", + "representative_brand": "SB", + "representative_variant": "", + "representative_size_value": "", + "representative_size_unit": "", + 
"representative_pack_qty": "18", + "representative_measure_type": "count", + "is_fee": "false", + "is_discount_line": "false", + "is_coupon_line": "false", + }, + { + "observed_product_id": "gobs_3", + "representative_upc": "", + "representative_retailer_item_id": "21", + "representative_name_norm": "ROTINI", + "representative_brand": "", + "representative_variant": "", + "representative_size_value": "16", + "representative_size_unit": "oz", + "representative_pack_qty": "", + "representative_measure_type": "weight", + "is_fee": "false", + "is_discount_line": "false", + "is_coupon_line": "false", + }, + { + "observed_product_id": "gobs_4", + "representative_upc": "", + "representative_retailer_item_id": "22", + "representative_name_norm": "ROTINI", + "representative_brand": "SB", + "representative_variant": "", + "representative_size_value": "16", + "representative_size_unit": "oz", + "representative_pack_qty": "", + "representative_measure_type": "weight", + "is_fee": "false", + "is_discount_line": "false", + "is_coupon_line": "false", + }, + { + "observed_product_id": "gobs_5", + "representative_upc": "", + "representative_retailer_item_id": "99", + "representative_name_norm": "GL BAG CHARGE", + "representative_brand": "", + "representative_variant": "", + "representative_size_value": "", + "representative_size_unit": "", + "representative_pack_qty": "", + "representative_measure_type": "each", + "is_fee": "true", + "is_discount_line": "false", + "is_coupon_line": "false", + }, + ] + + canonicals, links = build_canonical_layer.build_canonical_layer(observed_rows) + + self.assertEqual(2, len(canonicals)) + self.assertEqual(4, len(links)) + methods = {row["observed_product_id"]: row["link_method"] for row in links} + self.assertEqual("exact_upc", methods["gobs_1"]) + self.assertEqual("exact_upc", methods["gobs_2"]) + self.assertEqual("exact_name_size", methods["gobs_3"]) + self.assertEqual("exact_name_size", methods["gobs_4"]) + self.assertNotIn("gobs_5", methods) + 
+ +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_costco_pipeline.py b/tests/test_costco_pipeline.py new file mode 100644 index 0000000..5137f2c --- /dev/null +++ b/tests/test_costco_pipeline.py @@ -0,0 +1,439 @@ +import csv +import json +import tempfile +import unittest +from pathlib import Path +from unittest import mock + +import enrich_costco +import scrape_costco +import validate_cross_retailer_flow + + +class CostcoPipelineTests(unittest.TestCase): + def test_resolve_date_range_uses_months_back(self): + start_date, end_date = scrape_costco.resolve_date_range( + 3, today=scrape_costco.parse_cli_date("3/16/2026") + ) + + self.assertEqual("12/16/2025", start_date) + self.assertEqual("3/16/2026", end_date) + + def test_build_date_windows_splits_long_ranges(self): + windows = scrape_costco.build_date_windows("1/01/2026", "6/30/2026", 92) + + self.assertEqual( + [ + {"startDate": "1/01/2026", "endDate": "4/02/2026"}, + {"startDate": "4/03/2026", "endDate": "6/30/2026"}, + ], + windows, + ) + + def test_fetch_summary_windows_records_metadata_and_warns_on_mismatch(self): + payloads = [ + { + "data": { + "receiptsWithCounts": { + "inWarehouse": 2, + "gasStation": 0, + "carWash": 0, + "gasAndCarWash": 0, + "receipts": [ + { + "transactionBarcode": "abc", + "receiptType": "In-Warehouse", + } + ], + } + } + }, + { + "data": { + "receiptsWithCounts": { + "inWarehouse": 1, + "gasStation": 0, + "carWash": 0, + "gasAndCarWash": 0, + "receipts": [ + { + "transactionBarcode": "def", + "receiptType": "In-Warehouse", + } + ], + } + } + }, + ] + + with mock.patch.object( + scrape_costco, "graphql_post", side_effect=payloads + ) as mocked_post, mock.patch.object(scrape_costco.click, "echo") as mocked_echo: + summary_payload, metadata = scrape_costco.fetch_summary_windows( + session=object(), + start_date="1/01/2026", + end_date="6/30/2026", + document_type="all", + document_sub_type="all", + window_days=92, + ) + + self.assertEqual(2, mocked_post.call_count) 
    def test_flatten_costco_data_preserves_discount_rows(self):
        """Discount/coupon rows in a receipt's itemArray must survive flattening.

        The fixture mirrors the shape of the Costco receipts GraphQL response:
        a summary payload (with tender and coupon arrays) plus one detail
        payload whose itemArray holds a normal item and a negative "/ <item>"
        discount line.  The coupon UPC in the summary's couponArray matches the
        discount line's itemNumber suffix, which is presumably how
        flatten_costco_data decides is_coupon_line — confirm against
        scrape_costco.
        """
        summary_payload = {
            "data": {
                "receiptsWithCounts": {
                    "receipts": [
                        {
                            "transactionBarcode": "abc",
                            "tenderArray": [{"tenderDescription": "VISA"}],
                            "couponArray": [{"upcnumberCoupon": "2100003746641"}],
                        }
                    ]
                }
            }
        }
        detail_payloads = [
            {
                "data": {
                    "receiptsWithCounts": {
                        "receipts": [
                            {
                                "transactionBarcode": "abc",
                                "transactionDate": "2026-03-12",
                                "receiptType": "In-Warehouse",
                                "total": 10.0,
                                "totalItemCount": 2,
                                "instantSavings": 5.0,
                                "warehouseName": "MT VERNON",
                                "warehouseNumber": 1115,
                                "warehouseAddress1": "7940 RICHMOND HWY",
                                "warehouseCity": "ALEXANDRIA",
                                "warehouseState": "VA",
                                "warehousePostalCode": "22306",
                                "itemArray": [
                                    # Regular purchased item (positive amount).
                                    {
                                        "itemNumber": "4873222",
                                        "itemDescription01": "ALL F&C",
                                        "itemDescription02": "200OZ 160LOADS P104",
                                        "itemDepartmentNumber": 14,
                                        "transDepartmentNumber": 14,
                                        "unit": 1,
                                        "itemIdentifier": "E",
                                        "amount": 19.99,
                                        "itemUnitPriceAmount": 19.99,
                                    },
                                    # Instant-savings line: negative unit/amount,
                                    # description references the discounted item.
                                    {
                                        "itemNumber": "374664",
                                        "itemDescription01": "/ 4873222",
                                        "itemDescription02": None,
                                        "itemDepartmentNumber": 14,
                                        "transDepartmentNumber": 14,
                                        "unit": -1,
                                        "itemIdentifier": None,
                                        "amount": -5,
                                        "itemUnitPriceAmount": 0,
                                    },
                                ],
                            }
                        ]
                    }
                }
            }
        ]

        orders, items = scrape_costco.flatten_costco_data(
            summary_payload, detail_payloads, Path("costco_output/raw")
        )

        # One visit, both item rows kept — the discount line must not be dropped.
        self.assertEqual(1, len(orders))
        self.assertEqual(2, len(items))
        # Flags are serialized as lowercase strings in the CSV-facing rows.
        self.assertEqual("false", items[0]["is_discount_line"])
        self.assertEqual("true", items[1]["is_discount_line"])
        self.assertEqual("true", items[1]["is_coupon_line"])
    def test_cross_retailer_validation_writes_proof_example(self):
        """End-to-end check of validate_cross_retailer_flow.main.

        Writes one enriched Giant row and one enriched Costco row (both with
        item_name_norm == "BANANA") to temp CSVs, invokes the click command's
        callback directly, and asserts that a single proof row named "banana"
        lands in <outdir>/proof_examples.csv.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            giant_csv = Path(tmpdir) / "giant_items_enriched.csv"
            costco_csv = Path(tmpdir) / "costco_items_enriched.csv"
            outdir = Path(tmpdir) / "combined"

            # Both inputs share the Costco enricher's output schema; unset
            # columns are blanked so DictWriter emits a full header row.
            fieldnames = enrich_costco.OUTPUT_FIELDS
            giant_row = {field: "" for field in fieldnames}
            giant_row.update(
                {
                    "retailer": "giant",
                    "order_id": "g1",
                    "line_no": "1",
                    "order_date": "2026-03-01",
                    "retailer_item_id": "100",
                    "item_name": "FRESH BANANA",
                    "item_name_norm": "BANANA",
                    "upc": "4011",
                    "measure_type": "weight",
                    "is_store_brand": "false",
                    "is_fee": "false",
                    "is_discount_line": "false",
                    "is_coupon_line": "false",
                    "line_total": "1.29",
                }
            )
            costco_row = {field: "" for field in fieldnames}
            costco_row.update(
                {
                    "retailer": "costco",
                    "order_id": "c1",
                    "line_no": "1",
                    "order_date": "2026-03-12",
                    "retailer_item_id": "30669",
                    "item_name": "BANANAS 3 LB / 1.36 KG",
                    "item_name_norm": "BANANA",
                    "upc": "",
                    "size_value": "3",
                    "size_unit": "lb",
                    "measure_type": "weight",
                    "is_store_brand": "false",
                    "is_fee": "false",
                    "is_discount_line": "false",
                    "is_coupon_line": "false",
                    "line_total": "2.98",
                }
            )

            with giant_csv.open("w", newline="", encoding="utf-8") as handle:
                writer = csv.DictWriter(handle, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerow(giant_row)
            with costco_csv.open("w", newline="", encoding="utf-8") as handle:
                writer = csv.DictWriter(handle, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerow(costco_row)

            # Call the click callback directly to bypass CLI argument parsing.
            validate_cross_retailer_flow.main.callback(
                giant_items_enriched_csv=str(giant_csv),
                costco_items_enriched_csv=str(costco_csv),
                outdir=str(outdir),
            )

            proof_path = outdir / "proof_examples.csv"
            self.assertTrue(proof_path.exists())
            with proof_path.open(newline="", encoding="utf-8") as handle:
                rows = list(csv.DictReader(handle))
            self.assertEqual(1, len(rows))
            self.assertEqual("banana", rows[0]["proof_name"])
class EnrichGiantTests(unittest.TestCase):
    """Unit tests for the deterministic Giant line-item enricher."""

    def test_parse_size_and_pack_handles_pack_and_weight_tokens(self):
        """"6PK" yields pack_qty 6 and the "Z" suffix is normalized to oz."""
        size_value, size_unit, pack_qty = enrich_giant.parse_size_and_pack(
            "COKE CHERRY 6PK 7.5Z"
        )

        self.assertEqual("7.5", size_value)
        self.assertEqual("oz", size_unit)
        self.assertEqual("6", pack_qty)

    def test_parse_item_marks_store_brand_fee_and_weight_prices(self):
        """Store-brand weighed items get per-lb/per-oz prices; bag charges are fees."""
        row = enrich_giant.parse_item(
            order_id="abc123",
            order_date="2026-03-01",
            raw_path=Path("raw/abc123.json"),
            line_no=1,
            item={
                "podId": 1,
                "shipQy": 1,
                # Sold by weight: 2 lb picked at $3.98 total (lbEachCd "LB").
                "totalPickedWeight": 2,
                "unitPrice": 3.98,
                # "+SB" prefix marks the store brand; presumably stripped by
                # parse_item into brand_guess — confirm against enrich_giant.
                "itemName": "+SB GALA APPLE 5 LB",
                "lbEachCd": "LB",
                "groceryAmount": 3.98,
                "primUpcCd": "111",
                "mvpSavings": 0,
                "rewardSavings": 0,
                "couponSavings": 0,
                "couponPrice": 0,
                "categoryId": "1",
                "categoryDesc": "Grocery",
                "image": {"large": "https://example.test/apple.jpg"},
            },
        )

        self.assertEqual("SB", row["brand_guess"])
        self.assertEqual("GALA APPLE", row["item_name_norm"])
        self.assertEqual("5", row["size_value"])
        self.assertEqual("lb", row["size_unit"])
        self.assertEqual("weight", row["measure_type"])
        self.assertEqual("true", row["is_store_brand"])
        # 3.98 / 2 lb = 1.99 per lb; 1.99 / 16 = 0.124375 -> rounded "0.1244".
        self.assertEqual("1.99", row["price_per_lb"])
        self.assertEqual("0.1244", row["price_per_oz"])
        self.assertEqual("https://example.test/apple.jpg", row["image_url"])

        fee_row = enrich_giant.parse_item(
            order_id="abc123",
            order_date="2026-03-01",
            raw_path=Path("raw/abc123.json"),
            line_no=2,
            item={
                "podId": 2,
                "shipQy": 1,
                "totalPickedWeight": 0,
                "unitPrice": 0.05,
                # Bag-charge line: flagged as a fee, name left un-normalized.
                "itemName": "GL BAG CHARGE",
                "lbEachCd": "EA",
                "groceryAmount": 0.05,
                "primUpcCd": "",
                "mvpSavings": 0,
                "rewardSavings": 0,
                "couponSavings": 0,
                "couponPrice": 0,
                "categoryId": "1",
                "categoryDesc": "Grocery",
            },
        )

        self.assertEqual("true", fee_row["is_fee"])
        self.assertEqual("GL BAG CHARGE", fee_row["item_name_norm"])

    def test_parse_item_derives_packaged_weight_prices_from_size_tokens(self):
        """Each-priced packaged goods still get weight prices from name tokens.

        "6PK 7.5Z" at $3.00/unit is 45 oz per unit: 3.00 / 45 = 0.0667 per oz,
        and 0.0667 * 16 = 1.0667 per lb.
        """
        row = enrich_giant.parse_item(
            order_id="abc123",
            order_date="2026-03-01",
            raw_path=Path("raw/abc123.json"),
            line_no=1,
            item={
                "podId": 1,
                "shipQy": 2,
                "totalPickedWeight": 0,
                "unitPrice": 3.0,
                "itemName": "PEPSI 6PK 7.5Z",
                "lbEachCd": "EA",
                "groceryAmount": 6.0,
                "primUpcCd": "111",
                "mvpSavings": 0,
                "rewardSavings": 0,
                "couponSavings": 0,
                "couponPrice": 0,
                "categoryId": "1",
                "categoryDesc": "Grocery",
            },
        )

        self.assertEqual("weight", row["measure_type"])
        self.assertEqual("6", row["pack_qty"])
        self.assertEqual("7.5", row["size_value"])
        self.assertEqual("0.0667", row["price_per_oz"])
        self.assertEqual("1.0667", row["price_per_lb"])

    def test_build_items_enriched_reads_raw_order_files_and_writes_csv(self):
        """build_items_enriched skips history.json and orders rows deterministically.

        order-2.json is written before order-1.json on purpose: the output is
        expected in order-id order ("order-1" first), not directory order.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            raw_dir = Path(tmpdir) / "raw"
            raw_dir.mkdir()
            # Non-order file in the raw dir; must be ignored by the enricher.
            (raw_dir / "history.json").write_text("{}", encoding="utf-8")
            (raw_dir / "order-2.json").write_text(
                json.dumps(
                    {
                        "orderId": "order-2",
                        "orderDate": "2026-03-02",
                        "items": [
                            {
                                "podId": 20,
                                "shipQy": 1,
                                "totalPickedWeight": 0,
                                "unitPrice": 2.99,
                                "itemName": "SB ROTINI 16Z",
                                "lbEachCd": "EA",
                                "groceryAmount": 2.99,
                                "primUpcCd": "222",
                                "mvpSavings": 0,
                                "rewardSavings": 0,
                                "couponSavings": 0,
                                "couponPrice": 0,
                                "categoryId": "1",
                                "categoryDesc": "Grocery",
                                "image": {"small": "https://example.test/rotini.jpg"},
                            }
                        ],
                    }
                ),
                encoding="utf-8",
            )
            (raw_dir / "order-1.json").write_text(
                json.dumps(
                    {
                        "orderId": "order-1",
                        "orderDate": "2026-03-01",
                        "items": [
                            {
                                "podId": 10,
                                "shipQy": 2,
                                "totalPickedWeight": 0,
                                "unitPrice": 1.5,
                                "itemName": "PEPSI 6PK 7.5Z",
                                "lbEachCd": "EA",
                                "groceryAmount": 3.0,
                                "primUpcCd": "111",
                                "mvpSavings": 0,
                                "rewardSavings": 0,
                                "couponSavings": 0,
                                "couponPrice": 0,
                                "categoryId": "1",
                                "categoryDesc": "Grocery",
                            }
                        ],
                    }
                ),
                encoding="utf-8",
            )

            rows = enrich_giant.build_items_enriched(raw_dir)
            output_csv = Path(tmpdir) / "items_enriched.csv"
            enrich_giant.write_csv(output_csv, rows)

            self.assertEqual(["order-1", "order-2"], [row["order_id"] for row in rows])
            self.assertEqual("PEPSI", rows[0]["item_name_norm"])
            self.assertEqual("6", rows[0]["pack_qty"])
            self.assertEqual("7.5", rows[0]["size_value"])
            # retailer_item_id comes from podId, serialized as a string.
            self.assertEqual("10", rows[0]["retailer_item_id"])
            self.assertEqual("true", rows[1]["is_store_brand"])

            with output_csv.open(newline="", encoding="utf-8") as handle:
                written_rows = list(csv.DictReader(handle))

            # The written CSV round-trips the rows with the canonical header.
            self.assertEqual(2, len(written_rows))
            self.assertEqual(enrich_giant.OUTPUT_FIELDS, list(written_rows[0].keys()))
+if __name__ == "__main__": + unittest.main() diff --git a/tests/test_observed_products.py b/tests/test_observed_products.py new file mode 100644 index 0000000..90a7a5e --- /dev/null +++ b/tests/test_observed_products.py @@ -0,0 +1,67 @@ +import unittest + +import build_observed_products + + +class ObservedProductTests(unittest.TestCase): + def test_build_observed_products_aggregates_rows_with_same_key(self): + rows = [ + { + "retailer": "giant", + "order_id": "1", + "line_no": "1", + "order_date": "2026-01-01", + "item_name": "SB GALA APPLE 5LB", + "item_name_norm": "GALA APPLE", + "retailer_item_id": "11", + "upc": "111", + "brand_guess": "SB", + "variant": "", + "size_value": "5", + "size_unit": "lb", + "pack_qty": "", + "measure_type": "weight", + "image_url": "https://example.test/a.jpg", + "is_store_brand": "true", + "is_fee": "false", + "is_discount_line": "false", + "is_coupon_line": "false", + "line_total": "7.99", + }, + { + "retailer": "giant", + "order_id": "2", + "line_no": "1", + "order_date": "2026-01-10", + "item_name": "SB GALA APPLE 5 LB", + "item_name_norm": "GALA APPLE", + "retailer_item_id": "11", + "upc": "111", + "brand_guess": "SB", + "variant": "", + "size_value": "5", + "size_unit": "lb", + "pack_qty": "", + "measure_type": "weight", + "image_url": "", + "is_store_brand": "true", + "is_fee": "false", + "is_discount_line": "false", + "is_coupon_line": "false", + "line_total": "8.49", + }, + ] + + observed = build_observed_products.build_observed_products(rows) + + self.assertEqual(1, len(observed)) + self.assertEqual("2", observed[0]["times_seen"]) + self.assertEqual("2026-01-01", observed[0]["first_seen_date"]) + self.assertEqual("2026-01-10", observed[0]["last_seen_date"]) + self.assertEqual("11", observed[0]["representative_retailer_item_id"]) + self.assertEqual("111", observed[0]["representative_upc"]) + self.assertIn("SB GALA APPLE 5LB", observed[0]["raw_name_examples"]) + + +if __name__ == "__main__": + unittest.main() diff --git 
class ReviewQueueTests(unittest.TestCase):
    """Tests for review-queue generation and status preservation on rerun."""

    def test_build_review_queue_preserves_existing_status(self):
        """Re-building the queue must keep a previously recorded resolution.

        The observed product triggers two review reasons (queue length 2);
        the pre-existing entry is keyed by the same stable id the builder
        derives ("rvw" + "gobs_1|missing_image"), so its approved status and
        notes must carry over instead of being reset.
        """
        observed_rows = [
            {
                "observed_product_id": "gobs_1",
                "retailer": "giant",
                "representative_upc": "111",
                # Missing image — one of the review reasons exercised below.
                "representative_image_url": "",
                "representative_name_norm": "GALA APPLE",
                "times_seen": "2",
                "distinct_item_names_count": "2",
                "distinct_upcs_count": "1",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
            }
        ]
        item_rows = [
            {
                "observed_product_id": "gobs_1",
                "item_name": "SB GALA APPLE 5LB",
                "item_name_norm": "GALA APPLE",
                "line_total": "7.99",
            },
            {
                "observed_product_id": "gobs_1",
                "item_name": "SB GALA APPLE 5 LB",
                "item_name_norm": "GALA APPLE",
                "line_total": "8.49",
            },
        ]
        existing = {
            build_review_queue.stable_id("rvw", "gobs_1|missing_image"): {
                "status": "approved",
                "resolution_notes": "looked fine",
                "created_at": "2026-03-15",
            }
        }

        queue = build_review_queue.build_review_queue(
            observed_rows, item_rows, existing, "2026-03-16"
        )

        self.assertEqual(2, len(queue))
        missing_image = [row for row in queue if row["reason_code"] == "missing_image"][0]
        self.assertEqual("approved", missing_image["status"])
        self.assertEqual("looked fine", missing_image["resolution_notes"])

    def test_review_queue_main_writes_output(self):
        """The CLI callback reads the observed/items CSVs and writes the queue CSV."""
        with tempfile.TemporaryDirectory() as tmpdir:
            observed_path = Path(tmpdir) / "products_observed.csv"
            items_path = Path(tmpdir) / "items_enriched.csv"
            output_path = Path(tmpdir) / "review_queue.csv"

            # Full observed-product schema as produced by build_observed_products.
            observed_rows = [
                {
                    "observed_product_id": "gobs_1",
                    "retailer": "giant",
                    "observed_key": "giant|upc=111|name=GALA APPLE",
                    "representative_retailer_item_id": "11",
                    "representative_upc": "111",
                    "representative_item_name": "SB GALA APPLE 5LB",
                    "representative_name_norm": "GALA APPLE",
                    "representative_brand": "SB",
                    "representative_variant": "",
                    "representative_size_value": "5",
                    "representative_size_unit": "lb",
                    "representative_pack_qty": "",
                    "representative_measure_type": "weight",
                    "representative_image_url": "",
                    "is_store_brand": "true",
                    "is_fee": "false",
                    "is_discount_line": "false",
                    "is_coupon_line": "false",
                    "first_seen_date": "2026-01-01",
                    "last_seen_date": "2026-01-10",
                    "times_seen": "2",
                    "example_order_id": "1",
                    "example_item_name": "SB GALA APPLE 5LB",
                    "raw_name_examples": "SB GALA APPLE 5LB | SB GALA APPLE 5 LB",
                    "normalized_name_examples": "GALA APPLE",
                    "example_prices": "7.99 | 8.49",
                    "distinct_item_names_count": "2",
                    "distinct_retailer_item_ids_count": "1",
                    "distinct_upcs_count": "1",
                }
            ]
            item_rows = [
                {
                    "retailer": "giant",
                    "order_id": "1",
                    "line_no": "1",
                    "item_name": "SB GALA APPLE 5LB",
                    "item_name_norm": "GALA APPLE",
                    "retailer_item_id": "11",
                    "upc": "111",
                    "size_value": "5",
                    "size_unit": "lb",
                    "pack_qty": "",
                    "measure_type": "weight",
                    "is_store_brand": "true",
                    "is_fee": "false",
                    "is_discount_line": "false",
                    "is_coupon_line": "false",
                    "line_total": "7.99",
                }
            ]

            write_csv_rows(
                observed_path, observed_rows, build_observed_products.OUTPUT_FIELDS
            )
            write_csv_rows(items_path, item_rows, list(item_rows[0].keys()))

            # Call the click callback directly to bypass CLI parsing.
            build_review_queue.main.callback(
                observed_csv=str(observed_path),
                items_enriched_csv=str(items_path),
                output_csv=str(output_path),
            )

            self.assertTrue(output_path.exists())
+import json +from pathlib import Path + +import click + +import build_canonical_layer +import build_observed_products +from layer_helpers import stable_id, write_csv_rows + + +PROOF_FIELDS = [ + "proof_name", + "canonical_product_id", + "giant_observed_product_id", + "costco_observed_product_id", + "giant_example_item", + "costco_example_item", + "notes", +] + + +def read_rows(path): + import csv + + with Path(path).open(newline="", encoding="utf-8") as handle: + return list(csv.DictReader(handle)) + + +def find_proof_pair(observed_rows): + giant = None + costco = None + for row in observed_rows: + if row["retailer"] == "giant" and row["representative_name_norm"] == "BANANA": + giant = row + if row["retailer"] == "costco" and row["representative_name_norm"] == "BANANA": + costco = row + return giant, costco + + +def merge_proof_pair(canonical_rows, link_rows, giant_row, costco_row): + if not giant_row or not costco_row: + return canonical_rows, link_rows, [] + + proof_canonical_id = stable_id("gcan", "proof|banana") + link_rows = [ + row + for row in link_rows + if row["observed_product_id"] + not in {giant_row["observed_product_id"], costco_row["observed_product_id"]} + ] + canonical_rows = [ + row + for row in canonical_rows + if row["canonical_product_id"] != proof_canonical_id + ] + canonical_rows.append( + { + "canonical_product_id": proof_canonical_id, + "canonical_name": "BANANA", + "product_type": "banana", + "brand": "", + "variant": "", + "size_value": "", + "size_unit": "", + "pack_qty": "", + "measure_type": "weight", + "normalized_quantity": "", + "normalized_quantity_unit": "", + "notes": "manual proof merge for cross-retailer validation", + "created_at": "", + "updated_at": "", + } + ) + for observed_row in [giant_row, costco_row]: + link_rows.append( + { + "observed_product_id": observed_row["observed_product_id"], + "canonical_product_id": proof_canonical_id, + "link_method": "manual_proof_merge", + "link_confidence": "medium", + "review_status": 
"", + "reviewed_by": "", + "reviewed_at": "", + "link_notes": "cross-retailer validation proof", + } + ) + + proof_rows = [ + { + "proof_name": "banana", + "canonical_product_id": proof_canonical_id, + "giant_observed_product_id": giant_row["observed_product_id"], + "costco_observed_product_id": costco_row["observed_product_id"], + "giant_example_item": giant_row["example_item_name"], + "costco_example_item": costco_row["example_item_name"], + "notes": "BANANA proof pair built from Giant and Costco enriched rows", + } + ] + return canonical_rows, link_rows, proof_rows + + +@click.command() +@click.option( + "--giant-items-enriched-csv", + default="giant_output/items_enriched.csv", + show_default=True, +) +@click.option( + "--costco-items-enriched-csv", + default="costco_output/items_enriched.csv", + show_default=True, +) +@click.option( + "--outdir", + default="combined_output", + show_default=True, +) +def main(giant_items_enriched_csv, costco_items_enriched_csv, outdir): + outdir = Path(outdir) + rows = read_rows(giant_items_enriched_csv) + read_rows(costco_items_enriched_csv) + observed_rows = build_observed_products.build_observed_products(rows) + canonical_rows, link_rows = build_canonical_layer.build_canonical_layer(observed_rows) + giant_row, costco_row = find_proof_pair(observed_rows) + if not giant_row or not costco_row: + raise click.ClickException( + "could not find BANANA proof pair across Giant and Costco observed products" + ) + canonical_rows, link_rows, proof_rows = merge_proof_pair( + canonical_rows, link_rows, giant_row, costco_row + ) + + write_csv_rows( + outdir / "products_observed.csv", + observed_rows, + build_observed_products.OUTPUT_FIELDS, + ) + write_csv_rows( + outdir / "products_canonical.csv", + canonical_rows, + build_canonical_layer.CANONICAL_FIELDS, + ) + write_csv_rows( + outdir / "product_links.csv", + link_rows, + build_canonical_layer.LINK_FIELDS, + ) + write_csv_rows(outdir / "proof_examples.csv", proof_rows, PROOF_FIELDS) + 
click.echo( + f"wrote combined outputs to {outdir} using {len(observed_rows)} observed rows" + ) + + +if __name__ == "__main__": + main()