From 7f8c3ed8eb5f2b0f36420dfaf71fbaa1b0382406 Mon Sep 17 00:00:00 2001
From: ben <johnmosescarter@gmail.com>
Date: Tue, 17 Mar 2026 09:14:14 -0400
Subject: [PATCH] updated readme with Review steps

---
 README.md               | 273 ++++++++++++----------------------------
 build_purchases.py      |  52 ++++++++
 tests/test_purchases.py |  13 ++
 3 files changed, 147 insertions(+), 191 deletions(-)

diff --git a/README.md b/README.md
index 0ff4e66..d7803a1 100644
--- a/README.md
+++ b/README.md
@@ -1,227 +1,118 @@
 # scrape-giant
 
-Small grocery-history pipeline for Giant and Costco receipt data.
+Small CLI pipeline for pulling purchase history from Giant and Costco, enriching line items, and building a reviewable cross-retailer purchase dataset.
 
-This repo is still a manual, stepwise pipeline. There is no single orchestrator
-script yet. Each stage is run directly, and later stages depend on files
-produced by earlier stages.
+There is no one-shot runner yet. Today, you run the scripts step by step from the terminal.
 
-## What The Project Does
+## What It Does
 
-The current flow is:
+- `scrape_giant.py`: download Giant orders and items
+- `enrich_giant.py`: normalize Giant line items
+- `scrape_costco.py`: download Costco orders and items
+- `enrich_costco.py`: normalize Costco line items
+- `build_purchases.py`: combine retailer outputs into one purchase table
+- `review_products.py`: review unresolved product matches in the terminal
 
-1. acquire raw Giant receipt/history data
-2. enrich Giant line items into a shared enriched-item schema
-3. acquire raw Costco receipt data
-4. enrich Costco line items into the same shared enriched-item schema
-5. build observed-product, review, and canonical-product layers
-6. validate that Giant and Costco can flow through the same downstream model
+## Requirements
 
-Raw retailer JSON remains the source of truth.
+- Python 3.10+
+- Firefox installed with active Giant and Costco sessions
 
-## Current Scripts
-
-- `scrape_giant.py`
-  Fetch Giant in-store history and order detail payloads from an active Firefox
-  session.
-- `scrape_costco.py`
-  Fetch Costco receipt summary/detail payloads from an active Firefox session.
-  Costco currently prefers `.env` header values first, then falls back to exact
-  Firefox local-storage values for session auth.
-- `enrich_giant.py`
-  Parse Giant raw order JSON into `giant_output/items_enriched.csv`.
-- `enrich_costco.py`
-  Parse Costco raw receipt JSON into `costco_output/items_enriched.csv`.
-- `build_observed_products.py`
-  Build retailer-facing observed products from enriched rows.
-- `build_review_queue.py`
-  Build a manual review queue for low-confidence or unresolved observed
-  products.
-- `build_canonical_layer.py`
-  Build shared canonical products and observed-to-canonical links.
-- `validate_cross_retailer_flow.py`
-  Write a proof/check output showing that Giant and Costco can meet in the same
-  downstream model.
-
-## Manual Pipeline
-
-Run these from the repo root with the venv active, or call them through
-`./venv/bin/python`.
-
-### 1. Acquire Giant raw data
+## Install
 
 ```bash
-./venv/bin/python scrape_giant.py
+python -m venv venv
+source venv/bin/activate  # on Windows: venv\Scripts\activate
+pip install -r requirements.txt
 ```
 
-Inputs:
-- active Firefox session for `giantfood.com`
-- `GIANT_USER_ID` and `GIANT_LOYALTY_NUMBER` from `.env`, shell env, or prompt
+## Optional `.env`
 
-Outputs:
-- `giant_output/raw/history.json`
-- `giant_output/raw/<order_id>.json`
+The current version works best with a `.env` file in the project root. The scrapers will prompt for these values if they are not found in the current browser session.
+- `scrape_giant.py` prompts if `GIANT_USER_ID` or `GIANT_LOYALTY_NUMBER` is missing.
+- `scrape_costco.py` tries `.env` first, then Firefox local storage for session-backed values; `COSTCO_CLIENT_IDENTIFIER` should still be set explicitly.
+
+```env
+GIANT_USER_ID=...
+GIANT_LOYALTY_NUMBER=...
+
+# Costco can use these if present, but it can also pull session values from Firefox.
+COSTCO_X_AUTHORIZATION=...
+COSTCO_X_WCS_CLIENTID=...
+COSTCO_CLIENT_IDENTIFIER=...
+```
+
+## Run Order
+
+Run the pipeline in this order:
+
+```bash
+python scrape_giant.py
+python enrich_giant.py
+python scrape_costco.py
+python enrich_costco.py
+python build_purchases.py
+python review_products.py
+python build_purchases.py
+```
+
+Why run `build_purchases.py` twice:
+- first pass builds the current combined dataset and review queue inputs
+- `review_products.py` writes durable review decisions
+- second pass reapplies those decisions into the purchase output
+
+If you only want to refresh the queue without reviewing interactively:
+
+```bash
+python review_products.py --refresh-only
+```
+
+## Key Outputs
+
+Giant:
 - `giant_output/orders.csv`
 - `giant_output/items.csv`
-
-### 2. Enrich Giant data
-
-```bash
-./venv/bin/python enrich_giant.py
-```
-
-Input:
-- `giant_output/raw/*.json`
-
-Output:
 - `giant_output/items_enriched.csv`
 
-### 3. Acquire Costco raw data
-
-```bash
-./venv/bin/python scrape_costco.py
-```
-
-Optional useful flags:
-
-```bash
-./venv/bin/python scrape_costco.py --months-back 36
-./venv/bin/python scrape_costco.py --firefox-profile-dir "C:\\Users\\you\\AppData\\Roaming\\Mozilla\\Firefox\\Profiles\\xxxx.default-release"
-```
-
-Inputs:
-- active Firefox session for `costco.com`
-- optional `.env` values:
-  - `COSTCO_X_AUTHORIZATION`
-  - `COSTCO_X_WCS_CLIENTID`
-  - `COSTCO_CLIENT_IDENTIFIER`
-- if `COSTCO_X_AUTHORIZATION` is absent, the script falls back to exact Firefox
-  local-storage values:
-  - `idToken` -> sent as `Bearer <idToken>`
-  - `clientID` -> used as `costco-x-wcs-clientId` when env is blank
-
-Outputs:
-- `costco_output/raw/summary.json`
-- `costco_output/raw/summary_requests.json`
-- `costco_output/raw/<receipt_id>-<timestamp>.json`
+Costco:
 - `costco_output/orders.csv`
 - `costco_output/items.csv`
-
-### 4. Enrich Costco data
-
-```bash
-./venv/bin/python enrich_costco.py
-```
-
-Input:
-- `costco_output/raw/*.json`
-
-Output:
 - `costco_output/items_enriched.csv`
 
-### 5. Build shared downstream layers
+Combined:
+- `combined_output/purchases.csv`
+- `combined_output/review_queue.csv`
+- `combined_output/review_resolutions.csv`
+- `combined_output/canonical_catalog.csv`
+- `combined_output/product_links.csv`
+- `combined_output/comparison_examples.csv`
 
-```bash
-./venv/bin/python build_observed_products.py
-./venv/bin/python build_review_queue.py
-./venv/bin/python build_canonical_layer.py
-```
+## Review Workflow
 
-These scripts consume the enriched item files and generate the downstream
-product-model outputs.
+`review_products.py` is the manual cleanup step for unresolved or weakly unified items.
 
-Current outputs on disk:
+In the terminal, you can:
+- link an item to an existing canonical product
+- create a new canonical product
+- exclude an item
+- skip it for later
 
-- retailer-facing:
-  - `giant_output/products_observed.csv`
-  - `giant_output/review_queue.csv`
-  - `giant_output/products_canonical.csv`
-  - `giant_output/product_links.csv`
-- cross-retailer proof/check output:
-  - `combined_output/products_observed.csv`
-  - `combined_output/products_canonical.csv`
-  - `combined_output/product_links.csv`
-  - `combined_output/proof_examples.csv`
-
-### 6. Validate cross-retailer flow
-
-```bash
-./venv/bin/python validate_cross_retailer_flow.py
-```
-
-This is a proof/check step, not the main acquisition path.
-
-## Inputs And Outputs By Directory
-
-### `giant_output/`
-
-Inputs to this layer:
-- Firefox session data for Giant
-- Giant raw JSON payloads
-
-Generated files:
-- `raw/history.json`
-- `raw/<order_id>.json`
-- `orders.csv`
-- `items.csv`
-- `items_enriched.csv`
-- `products_observed.csv`
-- `review_queue.csv`
-- `products_canonical.csv`
-- `product_links.csv`
-
-### `costco_output/`
-
-Inputs to this layer:
-- Firefox session data for Costco
-- Costco raw GraphQL receipt payloads
-
-Generated files:
-- `raw/summary.json`
-- `raw/summary_requests.json`
-- `raw/<receipt_id>-<timestamp>.json`
-- `orders.csv`
-- `items.csv`
-- `items_enriched.csv`
-
-### `combined_output/`
-
-Generated by cross-retailer proof/build scripts:
-- `products_observed.csv`
-- `products_canonical.csv`
-- `product_links.csv`
-- `proof_examples.csv`
+Those decisions are saved and reused on later runs.
 
 ## Notes
 
-- The pipeline is intentionally simple and currently manual.
-- Scraping is retailer-specific and fragile; downstream modeling is shared only
-  after enrichment.
-- `summary_requests.json` is diagnostic metadata from Costco summary enumeration
-  and is not a receipt payload.
-- `enrich_costco.py` skips that file and only parses receipt payloads.
-- The repo may contain archived or sample output files under `archive/`; they
-  are not part of the active scrape path.
+- This project is designed around fragile retailer scraping flows, so the code favors explicit retailer-specific steps over heavy abstraction.
+- `scrape_giant.py` and `scrape_costco.py` are meant to work as standalone acquisition scripts.
+- `validate_cross_retailer_flow.py` is a proof/check script, not a required production step.
 
-## Verification
-
-Run the full test suite with:
+## Test
 
 ```bash
 ./venv/bin/python -m unittest discover -s tests
 ```
 
-Useful one-off checks:
-
-```bash
-./venv/bin/python scrape_giant.py --help
-./venv/bin/python scrape_costco.py --help
-./venv/bin/python enrich_giant.py
-./venv/bin/python enrich_costco.py
-```
-
 ## Project Docs
 
-- `pm/tasks.org`
-- `pm/data-model.org`
-- `pm/scrape-giant.org`
+- `pm/tasks.org`: task tracking
+- `pm/data-model.org`: current data model notes
+- `pm/review-workflow.org`: review and resolution workflow
diff --git a/build_purchases.py b/build_purchases.py
index 4f4996f..416041a 100644
--- a/build_purchases.py
+++ b/build_purchases.py
@@ -7,7 +7,11 @@ import build_canonical_layer
 import build_observed_products
 import validate_cross_retailer_flow
 from enrich_giant import format_decimal, to_decimal
+<<<<<<< HEAD
 from layer_helpers import read_csv_rows, stable_id, write_csv_rows
+=======
+from layer_helpers import read_csv_rows, write_csv_rows
+>>>>>>> be1bf63 (Build pivot-ready purchase log)
 
 
 PURCHASE_FIELDS = [
@@ -18,8 +22,11 @@ PURCHASE_FIELDS = [
     "observed_item_key",
     "observed_product_id",
     "canonical_product_id",
+<<<<<<< HEAD
     "review_status",
     "resolution_action",
+=======
+>>>>>>> be1bf63 (Build pivot-ready purchase log)
     "raw_item_name",
     "normalized_item_name",
     "retailer_item_id",
@@ -62,6 +69,7 @@ EXAMPLE_FIELDS = [
     "notes",
 ]
 
+<<<<<<< HEAD
 CATALOG_FIELDS = [
     "canonical_product_id",
     "canonical_name",
@@ -87,6 +95,8 @@ RESOLUTION_FIELDS = [
     "reviewed_at",
 ]
 
+=======
+>>>>>>> be1bf63 (Build pivot-ready purchase log)
 
 def decimal_or_zero(value):
     return to_decimal(value) or Decimal("0")
@@ -165,6 +175,7 @@ def order_lookup(rows, retailer):
     }
 
 
+<<<<<<< HEAD
 def read_optional_csv_rows(path):
     path = Path(path)
     if not path.exists():
@@ -209,6 +220,9 @@ def catalog_row_from_canonical(row):
 
 
 def build_link_state(enriched_rows):
+=======
+def build_link_lookup(enriched_rows):
+>>>>>>> be1bf63 (Build pivot-ready purchase log)
     observed_rows = build_observed_products.build_observed_products(enriched_rows)
     canonical_rows, link_rows = build_canonical_layer.build_canonical_layer(observed_rows)
     giant_row, costco_row = validate_cross_retailer_flow.find_proof_pair(observed_rows)
@@ -225,6 +239,7 @@ def build_link_state(enriched_rows):
     canonical_id_by_observed = {
         row["observed_product_id"]: row["canonical_product_id"] for row in link_rows
     }
+<<<<<<< HEAD
     return observed_rows, canonical_rows, link_rows, observed_id_by_key, canonical_id_by_observed
 
 
@@ -253,6 +268,14 @@ def build_purchase_rows(
             canonical_id_by_observed[observed_product_id] = resolution["canonical_product_id"]
         elif action == "exclude":
             canonical_id_by_observed[observed_product_id] = ""
+=======
+    return observed_id_by_key, canonical_id_by_observed
+
+
+def build_purchase_rows(giant_enriched_rows, costco_enriched_rows, giant_orders, costco_orders):
+    all_enriched_rows = giant_enriched_rows + costco_enriched_rows
+    observed_id_by_key, canonical_id_by_observed = build_link_lookup(all_enriched_rows)
+>>>>>>> be1bf63 (Build pivot-ready purchase log)
     orders_by_id = {}
     orders_by_id.update(order_lookup(giant_orders, "giant"))
     orders_by_id.update(order_lookup(costco_orders, "costco"))
@@ -266,7 +289,10 @@ def build_purchase_rows(
         observed_product_id = observed_id_by_key.get(observed_key, "")
         order_row = orders_by_id.get((row["retailer"], row["order_id"]), {})
         metrics = derive_metrics(row)
+<<<<<<< HEAD
         resolution = resolution_lookup.get(observed_product_id, {})
+=======
+>>>>>>> be1bf63 (Build pivot-ready purchase log)
         purchase_rows.append(
             {
                 "purchase_date": row["order_date"],
@@ -276,8 +302,11 @@ def build_purchase_rows(
                 "observed_item_key": row["observed_item_key"],
                 "observed_product_id": observed_product_id,
                 "canonical_product_id": canonical_id_by_observed.get(observed_product_id, ""),
+<<<<<<< HEAD
                 "review_status": resolution.get("status", ""),
                 "resolution_action": resolution.get("resolution_action", ""),
+=======
+>>>>>>> be1bf63 (Build pivot-ready purchase log)
                 "raw_item_name": row["item_name"],
                 "normalized_item_name": row["item_name_norm"],
                 "retailer_item_id": row["retailer_item_id"],
@@ -301,6 +330,7 @@ def build_purchase_rows(
                 **metrics,
             }
         )
+<<<<<<< HEAD
     return purchase_rows, observed_rows, canonical_rows, link_rows
 
 
@@ -328,6 +358,9 @@ def apply_manual_resolutions_to_links(link_rows, resolution_rows):
                 "link_notes": resolution.get("resolution_notes", ""),
             }
     return sorted(link_by_observed.values(), key=lambda row: row["observed_product_id"])
+=======
+    return purchase_rows
+>>>>>>> be1bf63 (Build pivot-ready purchase log)
 
 
 def build_comparison_examples(purchase_rows):
@@ -366,9 +399,12 @@ def build_comparison_examples(purchase_rows):
 @click.option("--costco-items-enriched-csv", default="costco_output/items_enriched.csv", show_default=True)
 @click.option("--giant-orders-csv", default="giant_output/orders.csv", show_default=True)
 @click.option("--costco-orders-csv", default="costco_output/orders.csv", show_default=True)
+<<<<<<< HEAD
 @click.option("--resolutions-csv", default="combined_output/review_resolutions.csv", show_default=True)
 @click.option("--catalog-csv", default="combined_output/canonical_catalog.csv", show_default=True)
 @click.option("--links-csv", default="combined_output/product_links.csv", show_default=True)
+=======
+>>>>>>> be1bf63 (Build pivot-ready purchase log)
 @click.option("--output-csv", default="combined_output/purchases.csv", show_default=True)
 @click.option("--examples-csv", default="combined_output/comparison_examples.csv", show_default=True)
 def main(
@@ -376,6 +412,7 @@ def main(
     costco_items_enriched_csv,
     giant_orders_csv,
     costco_orders_csv,
+<<<<<<< HEAD
     resolutions_csv,
     catalog_csv,
     links_csv,
@@ -384,10 +421,17 @@ def main(
 ):
     resolution_rows = read_optional_csv_rows(resolutions_csv)
     purchase_rows, _observed_rows, canonical_rows, link_rows = build_purchase_rows(
+=======
+    output_csv,
+    examples_csv,
+):
+    purchase_rows = build_purchase_rows(
+>>>>>>> be1bf63 (Build pivot-ready purchase log)
         read_csv_rows(giant_items_enriched_csv),
         read_csv_rows(costco_items_enriched_csv),
         read_csv_rows(giant_orders_csv),
         read_csv_rows(costco_orders_csv),
+<<<<<<< HEAD
         resolution_rows,
     )
     existing_catalog_rows = read_optional_csv_rows(catalog_csv)
@@ -404,6 +448,14 @@ def main(
     click.echo(
         f"wrote {len(purchase_rows)} purchase rows to {output_csv}, "
         f"{len(merged_catalog_rows)} catalog rows to {catalog_csv}, "
+=======
+    )
+    example_rows = build_comparison_examples(purchase_rows)
+    write_csv_rows(output_csv, purchase_rows, PURCHASE_FIELDS)
+    write_csv_rows(examples_csv, example_rows, EXAMPLE_FIELDS)
+    click.echo(
+        f"wrote {len(purchase_rows)} purchase rows to {output_csv} "
+>>>>>>> be1bf63 (Build pivot-ready purchase log)
         f"and {len(example_rows)} comparison examples to {examples_csv}"
     )
 
diff --git a/tests/test_purchases.py b/tests/test_purchases.py
index 9cd9ccb..32c6fce 100644
--- a/tests/test_purchases.py
+++ b/tests/test_purchases.py
@@ -99,12 +99,19 @@ class PurchaseLogTests(unittest.TestCase):
             }
         ]
 
+<<<<<<< HEAD
         rows, _observed, _canon, _links = build_purchases.build_purchase_rows(
+=======
+        rows = build_purchases.build_purchase_rows(
+>>>>>>> be1bf63 (Build pivot-ready purchase log)
             [giant_row],
             [costco_row],
             giant_orders,
             costco_orders,
+<<<<<<< HEAD
             [],
+=======
+>>>>>>> be1bf63 (Build pivot-ready purchase log)
         )
 
         self.assertEqual(2, len(rows))
@@ -196,9 +203,12 @@ class PurchaseLogTests(unittest.TestCase):
                 costco_items_enriched_csv=str(costco_items),
                 giant_orders_csv=str(giant_orders),
                 costco_orders_csv=str(costco_orders),
+<<<<<<< HEAD
                 resolutions_csv=str(Path(tmpdir) / "review_resolutions.csv"),
                 catalog_csv=str(Path(tmpdir) / "canonical_catalog.csv"),
                 links_csv=str(Path(tmpdir) / "product_links.csv"),
+=======
+>>>>>>> be1bf63 (Build pivot-ready purchase log)
                 output_csv=str(purchases_csv),
                 examples_csv=str(examples_csv),
             )
@@ -212,6 +222,7 @@ class PurchaseLogTests(unittest.TestCase):
             self.assertEqual(2, len(purchase_rows))
             self.assertEqual(1, len(example_rows))
 
+<<<<<<< HEAD
     def test_build_purchase_rows_applies_manual_resolution(self):
         fieldnames = enrich_costco.OUTPUT_FIELDS
         giant_row = {field: "" for field in fieldnames}
@@ -262,6 +273,8 @@ class PurchaseLogTests(unittest.TestCase):
         self.assertEqual("approved", rows[0]["review_status"])
         self.assertEqual("create", rows[0]["resolution_action"])
 
+=======
+>>>>>>> be1bf63 (Build pivot-ready purchase log)
 
 if __name__ == "__main__":
     unittest.main()