From 7f8c3ed8eb5f2b0f36420dfaf71fbaa1b0382406 Mon Sep 17 00:00:00 2001 From: ben Date: Tue, 17 Mar 2026 09:14:14 -0400 Subject: [PATCH] updated readme with Review steps --- README.md | 273 ++++++++++++---------------------------- build_purchases.py | 52 ++++++++ tests/test_purchases.py | 13 ++ 3 files changed, 147 insertions(+), 191 deletions(-) diff --git a/README.md b/README.md index 0ff4e66..d7803a1 100644 --- a/README.md +++ b/README.md @@ -1,227 +1,118 @@ # scrape-giant -Small grocery-history pipeline for Giant and Costco receipt data. +Small CLI pipeline for pulling purchase history from Giant and Costco, enriching line items, and building a reviewable cross-retailer purchase dataset. -This repo is still a manual, stepwise pipeline. There is no single orchestrator -script yet. Each stage is run directly, and later stages depend on files -produced by earlier stages. +There is no one-shot runner yet. Today, you run the scripts step by step from the terminal. -## What The Project Does +## What It Does -The current flow is: +- `scrape_giant.py`: download Giant orders and items +- `enrich_giant.py`: normalize Giant line items +- `scrape_costco.py`: download Costco orders and items +- `enrich_costco.py`: normalize Costco line items +- `build_purchases.py`: combine retailer outputs into one purchase table +- `review_products.py`: review unresolved product matches in the terminal -1. acquire raw Giant receipt/history data -2. enrich Giant line items into a shared enriched-item schema -3. acquire raw Costco receipt data -4. enrich Costco line items into the same shared enriched-item schema -5. build observed-product, review, and canonical-product layers -6. validate that Giant and Costco can flow through the same downstream model +## Requirements -Raw retailer JSON remains the source of truth. 
+- Python 3.10+ +- Firefox installed with active Giant and Costco sessions -## Current Scripts - -- `scrape_giant.py` - Fetch Giant in-store history and order detail payloads from an active Firefox - session. -- `scrape_costco.py` - Fetch Costco receipt summary/detail payloads from an active Firefox session. - Costco currently prefers `.env` header values first, then falls back to exact - Firefox local-storage values for session auth. -- `enrich_giant.py` - Parse Giant raw order JSON into `giant_output/items_enriched.csv`. -- `enrich_costco.py` - Parse Costco raw receipt JSON into `costco_output/items_enriched.csv`. -- `build_observed_products.py` - Build retailer-facing observed products from enriched rows. -- `build_review_queue.py` - Build a manual review queue for low-confidence or unresolved observed - products. -- `build_canonical_layer.py` - Build shared canonical products and observed-to-canonical links. -- `validate_cross_retailer_flow.py` - Write a proof/check output showing that Giant and Costco can meet in the same - downstream model. - -## Manual Pipeline - -Run these from the repo root with the venv active, or call them through -`./venv/bin/python`. - -### 1. Acquire Giant raw data +## Install ```bash -./venv/bin/python scrape_giant.py +python -m venv venv +source venv/bin/activate  # on Windows: venv\Scripts\activate +pip install -r requirements.txt ``` -Inputs: -- active Firefox session for `giantfood.com` -- `GIANT_USER_ID` and `GIANT_LOYALTY_NUMBER` from `.env`, shell env, or prompt +## Optional `.env` -Outputs: -- `giant_output/raw/history.json` -- `giant_output/raw/.json` +The current version works best with `.env` in the project root. The scraper will prompt for these values if they are not found in the current browser session. +- `scrape_giant` prompts if `GIANT_USER_ID` or `GIANT_LOYALTY_NUMBER` is missing. +- `scrape_costco` tries `.env` first, then Firefox local storage for session-backed values; `COSTCO_CLIENT_IDENTIFIER` should still be set explicitly. 
+ +```env +GIANT_USER_ID=... +GIANT_LOYALTY_NUMBER=... + +# Costco can use these if present, but it can also pull session values from Firefox. +COSTCO_X_AUTHORIZATION=... +COSTCO_X_WCS_CLIENTID=... +COSTCO_CLIENT_IDENTIFIER=... +``` + +## Run Order + +Run the pipeline in this order: + +```bash +python scrape_giant.py +python enrich_giant.py +python scrape_costco.py +python enrich_costco.py +python build_purchases.py +python review_products.py +python build_purchases.py +``` + +Why run `build_purchases.py` twice: +- first pass builds the current combined dataset and review queue inputs +- `review_products.py` writes durable review decisions +- second pass reapplies those decisions into the purchase output + +If you only want to refresh the queue without reviewing interactively: + +```bash +python review_products.py --refresh-only +``` + +## Key Outputs + +Giant: - `giant_output/orders.csv` - `giant_output/items.csv` - -### 2. Enrich Giant data - -```bash -./venv/bin/python enrich_giant.py -``` - -Input: -- `giant_output/raw/*.json` - -Output: - `giant_output/items_enriched.csv` -### 3. Acquire Costco raw data - -```bash -./venv/bin/python scrape_costco.py -``` - -Optional useful flags: - -```bash -./venv/bin/python scrape_costco.py --months-back 36 -./venv/bin/python scrape_costco.py --firefox-profile-dir "C:\\Users\\you\\AppData\\Roaming\\Mozilla\\Firefox\\Profiles\\xxxx.default-release" -``` - -Inputs: -- active Firefox session for `costco.com` -- optional `.env` values: - - `COSTCO_X_AUTHORIZATION` - - `COSTCO_X_WCS_CLIENTID` - - `COSTCO_CLIENT_IDENTIFIER` -- if `COSTCO_X_AUTHORIZATION` is absent, the script falls back to exact Firefox - local-storage values: - - `idToken` -> sent as `Bearer ` - - `clientID` -> used as `costco-x-wcs-clientId` when env is blank - -Outputs: -- `costco_output/raw/summary.json` -- `costco_output/raw/summary_requests.json` -- `costco_output/raw/-.json` +Costco: - `costco_output/orders.csv` - `costco_output/items.csv` - -### 4. 
Enrich Costco data - -```bash -./venv/bin/python enrich_costco.py -``` - -Input: -- `costco_output/raw/*.json` - -Output: - `costco_output/items_enriched.csv` -### 5. Build shared downstream layers +Combined: +- `combined_output/purchases.csv` +- `combined_output/review_queue.csv` +- `combined_output/review_resolutions.csv` +- `combined_output/canonical_catalog.csv` +- `combined_output/product_links.csv` +- `combined_output/comparison_examples.csv` -```bash -./venv/bin/python build_observed_products.py -./venv/bin/python build_review_queue.py -./venv/bin/python build_canonical_layer.py -``` +## Review Workflow -These scripts consume the enriched item files and generate the downstream -product-model outputs. +`review_products.py` is the manual cleanup step for unresolved or weakly unified items. -Current outputs on disk: +In the terminal, you can: +- link an item to an existing canonical product +- create a new canonical product +- exclude an item +- skip it for later -- retailer-facing: - - `giant_output/products_observed.csv` - - `giant_output/review_queue.csv` - - `giant_output/products_canonical.csv` - - `giant_output/product_links.csv` -- cross-retailer proof/check output: - - `combined_output/products_observed.csv` - - `combined_output/products_canonical.csv` - - `combined_output/product_links.csv` - - `combined_output/proof_examples.csv` - -### 6. Validate cross-retailer flow - -```bash -./venv/bin/python validate_cross_retailer_flow.py -``` - -This is a proof/check step, not the main acquisition path. 
- -## Inputs And Outputs By Directory - -### `giant_output/` - -Inputs to this layer: -- Firefox session data for Giant -- Giant raw JSON payloads - -Generated files: -- `raw/history.json` -- `raw/.json` -- `orders.csv` -- `items.csv` -- `items_enriched.csv` -- `products_observed.csv` -- `review_queue.csv` -- `products_canonical.csv` -- `product_links.csv` - -### `costco_output/` - -Inputs to this layer: -- Firefox session data for Costco -- Costco raw GraphQL receipt payloads - -Generated files: -- `raw/summary.json` -- `raw/summary_requests.json` -- `raw/-.json` -- `orders.csv` -- `items.csv` -- `items_enriched.csv` - -### `combined_output/` - -Generated by cross-retailer proof/build scripts: -- `products_observed.csv` -- `products_canonical.csv` -- `product_links.csv` -- `proof_examples.csv` +Those decisions are saved and reused on later runs. ## Notes -- The pipeline is intentionally simple and currently manual. -- Scraping is retailer-specific and fragile; downstream modeling is shared only - after enrichment. -- `summary_requests.json` is diagnostic metadata from Costco summary enumeration - and is not a receipt payload. -- `enrich_costco.py` skips that file and only parses receipt payloads. -- The repo may contain archived or sample output files under `archive/`; they - are not part of the active scrape path. +- This project is designed around fragile retailer scraping flows, so the code favors explicit retailer-specific steps over heavy abstraction. +- `scrape_giant.py` and `scrape_costco.py` are meant to work as standalone acquisition scripts. +- `validate_cross_retailer_flow.py` is a proof/check script, not a required production step. 
-## Verification - -Run the full test suite with: +## Test ```bash ./venv/bin/python -m unittest discover -s tests ``` -Useful one-off checks: - -```bash -./venv/bin/python scrape_giant.py --help -./venv/bin/python scrape_costco.py --help -./venv/bin/python enrich_giant.py -./venv/bin/python enrich_costco.py -``` - ## Project Docs -- `pm/tasks.org` -- `pm/data-model.org` -- `pm/scrape-giant.org` +- `pm/tasks.org`: task tracking +- `pm/data-model.org`: current data model notes +- `pm/review-workflow.org`: review and resolution workflow diff --git a/build_purchases.py b/build_purchases.py index 4f4996f..416041a 100644 --- a/build_purchases.py +++ b/build_purchases.py @@ -7,7 +7,11 @@ import build_canonical_layer import build_observed_products import validate_cross_retailer_flow from enrich_giant import format_decimal, to_decimal +<<<<<<< HEAD from layer_helpers import read_csv_rows, stable_id, write_csv_rows +======= +from layer_helpers import read_csv_rows, write_csv_rows +>>>>>>> be1bf63 (Build pivot-ready purchase log) PURCHASE_FIELDS = [ @@ -18,8 +22,11 @@ PURCHASE_FIELDS = [ "observed_item_key", "observed_product_id", "canonical_product_id", +<<<<<<< HEAD "review_status", "resolution_action", +======= +>>>>>>> be1bf63 (Build pivot-ready purchase log) "raw_item_name", "normalized_item_name", "retailer_item_id", @@ -62,6 +69,7 @@ EXAMPLE_FIELDS = [ "notes", ] +<<<<<<< HEAD CATALOG_FIELDS = [ "canonical_product_id", "canonical_name", @@ -87,6 +95,8 @@ RESOLUTION_FIELDS = [ "reviewed_at", ] +======= +>>>>>>> be1bf63 (Build pivot-ready purchase log) def decimal_or_zero(value): return to_decimal(value) or Decimal("0") @@ -165,6 +175,7 @@ def order_lookup(rows, retailer): } +<<<<<<< HEAD def read_optional_csv_rows(path): path = Path(path) if not path.exists(): @@ -209,6 +220,9 @@ def catalog_row_from_canonical(row): def build_link_state(enriched_rows): +======= +def build_link_lookup(enriched_rows): +>>>>>>> be1bf63 (Build pivot-ready purchase log) observed_rows = 
build_observed_products.build_observed_products(enriched_rows) canonical_rows, link_rows = build_canonical_layer.build_canonical_layer(observed_rows) giant_row, costco_row = validate_cross_retailer_flow.find_proof_pair(observed_rows) @@ -225,6 +239,7 @@ def build_link_state(enriched_rows): canonical_id_by_observed = { row["observed_product_id"]: row["canonical_product_id"] for row in link_rows } +<<<<<<< HEAD return observed_rows, canonical_rows, link_rows, observed_id_by_key, canonical_id_by_observed @@ -253,6 +268,14 @@ def build_purchase_rows( canonical_id_by_observed[observed_product_id] = resolution["canonical_product_id"] elif action == "exclude": canonical_id_by_observed[observed_product_id] = "" +======= + return observed_id_by_key, canonical_id_by_observed + + +def build_purchase_rows(giant_enriched_rows, costco_enriched_rows, giant_orders, costco_orders): + all_enriched_rows = giant_enriched_rows + costco_enriched_rows + observed_id_by_key, canonical_id_by_observed = build_link_lookup(all_enriched_rows) +>>>>>>> be1bf63 (Build pivot-ready purchase log) orders_by_id = {} orders_by_id.update(order_lookup(giant_orders, "giant")) orders_by_id.update(order_lookup(costco_orders, "costco")) @@ -266,7 +289,10 @@ def build_purchase_rows( observed_product_id = observed_id_by_key.get(observed_key, "") order_row = orders_by_id.get((row["retailer"], row["order_id"]), {}) metrics = derive_metrics(row) +<<<<<<< HEAD resolution = resolution_lookup.get(observed_product_id, {}) +======= +>>>>>>> be1bf63 (Build pivot-ready purchase log) purchase_rows.append( { "purchase_date": row["order_date"], @@ -276,8 +302,11 @@ def build_purchase_rows( "observed_item_key": row["observed_item_key"], "observed_product_id": observed_product_id, "canonical_product_id": canonical_id_by_observed.get(observed_product_id, ""), +<<<<<<< HEAD "review_status": resolution.get("status", ""), "resolution_action": resolution.get("resolution_action", ""), +======= +>>>>>>> be1bf63 (Build pivot-ready 
purchase log) "raw_item_name": row["item_name"], "normalized_item_name": row["item_name_norm"], "retailer_item_id": row["retailer_item_id"], @@ -301,6 +330,7 @@ def build_purchase_rows( **metrics, } ) +<<<<<<< HEAD return purchase_rows, observed_rows, canonical_rows, link_rows @@ -328,6 +358,9 @@ def apply_manual_resolutions_to_links(link_rows, resolution_rows): "link_notes": resolution.get("resolution_notes", ""), } return sorted(link_by_observed.values(), key=lambda row: row["observed_product_id"]) +======= + return purchase_rows +>>>>>>> be1bf63 (Build pivot-ready purchase log) def build_comparison_examples(purchase_rows): @@ -366,9 +399,12 @@ def build_comparison_examples(purchase_rows): @click.option("--costco-items-enriched-csv", default="costco_output/items_enriched.csv", show_default=True) @click.option("--giant-orders-csv", default="giant_output/orders.csv", show_default=True) @click.option("--costco-orders-csv", default="costco_output/orders.csv", show_default=True) +<<<<<<< HEAD @click.option("--resolutions-csv", default="combined_output/review_resolutions.csv", show_default=True) @click.option("--catalog-csv", default="combined_output/canonical_catalog.csv", show_default=True) @click.option("--links-csv", default="combined_output/product_links.csv", show_default=True) +======= +>>>>>>> be1bf63 (Build pivot-ready purchase log) @click.option("--output-csv", default="combined_output/purchases.csv", show_default=True) @click.option("--examples-csv", default="combined_output/comparison_examples.csv", show_default=True) def main( @@ -376,6 +412,7 @@ def main( costco_items_enriched_csv, giant_orders_csv, costco_orders_csv, +<<<<<<< HEAD resolutions_csv, catalog_csv, links_csv, @@ -384,10 +421,17 @@ def main( ): resolution_rows = read_optional_csv_rows(resolutions_csv) purchase_rows, _observed_rows, canonical_rows, link_rows = build_purchase_rows( +======= + output_csv, + examples_csv, +): + purchase_rows = build_purchase_rows( +>>>>>>> be1bf63 (Build 
pivot-ready purchase log) read_csv_rows(giant_items_enriched_csv), read_csv_rows(costco_items_enriched_csv), read_csv_rows(giant_orders_csv), read_csv_rows(costco_orders_csv), +<<<<<<< HEAD resolution_rows, ) existing_catalog_rows = read_optional_csv_rows(catalog_csv) @@ -404,6 +448,14 @@ def main( click.echo( f"wrote {len(purchase_rows)} purchase rows to {output_csv}, " f"{len(merged_catalog_rows)} catalog rows to {catalog_csv}, " +======= + ) + example_rows = build_comparison_examples(purchase_rows) + write_csv_rows(output_csv, purchase_rows, PURCHASE_FIELDS) + write_csv_rows(examples_csv, example_rows, EXAMPLE_FIELDS) + click.echo( + f"wrote {len(purchase_rows)} purchase rows to {output_csv} " +>>>>>>> be1bf63 (Build pivot-ready purchase log) f"and {len(example_rows)} comparison examples to {examples_csv}" ) diff --git a/tests/test_purchases.py b/tests/test_purchases.py index 9cd9ccb..32c6fce 100644 --- a/tests/test_purchases.py +++ b/tests/test_purchases.py @@ -99,12 +99,19 @@ class PurchaseLogTests(unittest.TestCase): } ] +<<<<<<< HEAD rows, _observed, _canon, _links = build_purchases.build_purchase_rows( +======= + rows = build_purchases.build_purchase_rows( +>>>>>>> be1bf63 (Build pivot-ready purchase log) [giant_row], [costco_row], giant_orders, costco_orders, +<<<<<<< HEAD [], +======= +>>>>>>> be1bf63 (Build pivot-ready purchase log) ) self.assertEqual(2, len(rows)) @@ -196,9 +203,12 @@ class PurchaseLogTests(unittest.TestCase): costco_items_enriched_csv=str(costco_items), giant_orders_csv=str(giant_orders), costco_orders_csv=str(costco_orders), +<<<<<<< HEAD resolutions_csv=str(Path(tmpdir) / "review_resolutions.csv"), catalog_csv=str(Path(tmpdir) / "canonical_catalog.csv"), links_csv=str(Path(tmpdir) / "product_links.csv"), +======= +>>>>>>> be1bf63 (Build pivot-ready purchase log) output_csv=str(purchases_csv), examples_csv=str(examples_csv), ) @@ -212,6 +222,7 @@ class PurchaseLogTests(unittest.TestCase): self.assertEqual(2, len(purchase_rows)) 
self.assertEqual(1, len(example_rows)) +<<<<<<< HEAD def test_build_purchase_rows_applies_manual_resolution(self): fieldnames = enrich_costco.OUTPUT_FIELDS giant_row = {field: "" for field in fieldnames} @@ -262,6 +273,8 @@ class PurchaseLogTests(unittest.TestCase): self.assertEqual("approved", rows[0]["review_status"]) self.assertEqual("create", rows[0]["resolution_action"]) +======= +>>>>>>> be1bf63 (Build pivot-ready purchase log) if __name__ == "__main__": unittest.main()