diff --git a/README.md b/README.md index 3a45789..3c615a3 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,12 @@ Run each script step-by-step from the terminal. 6. `review_products.py`: review unresolved product matches in the terminal 7. `report_pipeline_status.py`: show how many rows survive each stage +Active refactor entrypoints: +- `collect_giant_web.py` +- `collect_costco_web.py` +- `normalize_giant_web.py` +- `normalize_costco_web.py` + ## Requirements - Python 3.10+ @@ -30,8 +36,8 @@ pip install -r requirements.txt ## Optional `.env` Current version works best with `.env` in the project root. The scraper will prompt for these values if they are not found in the current browser session. -- `scrape_giant` prompts if `GIANT_USER_ID` or `GIANT_LOYALTY_NUMBER` is missing. -- `scrape_costco` tries `.env` first, then Firefox local storage for session-backed values; `COSTCO_CLIENT_IDENTIFIER` should still be set explicitly. +- `collect_giant_web.py` prompts if `GIANT_USER_ID` or `GIANT_LOYALTY_NUMBER` is missing. +- `collect_costco_web.py` tries `.env` first, then Firefox local storage for session-backed values; `COSTCO_CLIENT_IDENTIFIER` should still be set explicitly. - Costco discount matching happens later in `enrich_costco.py`; you do not need to pre-clean discount lines by hand. ```env @@ -43,15 +49,39 @@ COSTCO_X_WCS_CLIENTID=... COSTCO_CLIENT_IDENTIFIER=... 
``` +Current active path layout: + +```text +data/ + giant-web/ + raw/ + collected_orders.csv + collected_items.csv + normalized_items.csv + costco-web/ + raw/ + collected_orders.csv + collected_items.csv + normalized_items.csv + review/ + review_queue.csv + review_resolutions.csv + product_links.csv + purchases.csv + pipeline_status.csv + pipeline_status.json + catalog.csv +``` + ## Run Order Run the pipeline in this order: ```bash -python scrape_giant.py -python enrich_giant.py -python scrape_costco.py -python enrich_costco.py +python collect_giant_web.py +python normalize_giant_web.py +python collect_costco_web.py +python normalize_costco_web.py python build_purchases.py python review_products.py python build_purchases.py @@ -79,25 +109,25 @@ python report_pipeline_status.py ## Key Outputs Giant: -- `giant_output/orders.csv` -- `giant_output/items.csv` -- `giant_output/items_enriched.csv` +- `data/giant-web/collected_orders.csv` +- `data/giant-web/collected_items.csv` +- `data/giant-web/normalized_items.csv` Costco: -- `costco_output/orders.csv` -- `costco_output/items.csv` -- `costco_output/items_enriched.csv` -- `costco_output/items_enriched.csv` now preserves raw totals and matched net discount fields +- `data/costco-web/collected_orders.csv` +- `data/costco-web/collected_items.csv` +- `data/costco-web/normalized_items.csv` +- `data/costco-web/normalized_items.csv` preserves raw totals and matched net discount fields Combined: -- `combined_output/purchases.csv` -- `combined_output/review_queue.csv` -- `combined_output/review_resolutions.csv` -- `combined_output/canonical_catalog.csv` -- `combined_output/product_links.csv` -- `combined_output/comparison_examples.csv` -- `combined_output/pipeline_status.csv` -- `combined_output/pipeline_status.json` +- `data/review/purchases.csv` +- `data/review/review_queue.csv` +- `data/review/review_resolutions.csv` +- `data/review/product_links.csv` +- `data/review/comparison_examples.csv` +- 
`data/review/pipeline_status.csv` +- `data/review/pipeline_status.json` +- `data/catalog.csv` ## Review Workflow @@ -114,7 +144,7 @@ The review step is intentionally conservative: ## Notes - This project is designed around fragile retailer scraping flows, so the code favors explicit retailer-specific steps over heavy abstraction. -- `scrape_giant.py` and `scrape_costco.py` are meant to work as standalone acquisition scripts. +- `scrape_giant.py`, `scrape_costco.py`, `enrich_giant.py`, and `enrich_costco.py` are now legacy-compatible entrypoints; prefer the `collect_*` and `normalize_*` scripts for active work. - Costco discount rows are preserved for auditability and also matched back to purchased items during enrichment. - `validate_cross_retailer_flow.py` is a proof/check script, not a required production step. diff --git a/build_purchases.py b/build_purchases.py index 7907748..e7f1a74 100644 --- a/build_purchases.py +++ b/build_purchases.py @@ -368,15 +368,15 @@ def build_comparison_examples(purchase_rows): @click.command() -@click.option("--giant-items-enriched-csv", default="giant_output/items_enriched.csv", show_default=True) -@click.option("--costco-items-enriched-csv", default="costco_output/items_enriched.csv", show_default=True) -@click.option("--giant-orders-csv", default="giant_output/orders.csv", show_default=True) -@click.option("--costco-orders-csv", default="costco_output/orders.csv", show_default=True) -@click.option("--resolutions-csv", default="combined_output/review_resolutions.csv", show_default=True) -@click.option("--catalog-csv", default="combined_output/canonical_catalog.csv", show_default=True) -@click.option("--links-csv", default="combined_output/product_links.csv", show_default=True) -@click.option("--output-csv", default="combined_output/purchases.csv", show_default=True) -@click.option("--examples-csv", default="combined_output/comparison_examples.csv", show_default=True) +@click.option("--giant-items-enriched-csv", 
default="data/giant-web/normalized_items.csv", show_default=True) +@click.option("--costco-items-enriched-csv", default="data/costco-web/normalized_items.csv", show_default=True) +@click.option("--giant-orders-csv", default="data/giant-web/collected_orders.csv", show_default=True) +@click.option("--costco-orders-csv", default="data/costco-web/collected_orders.csv", show_default=True) +@click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True) +@click.option("--catalog-csv", default="data/catalog.csv", show_default=True) +@click.option("--links-csv", default="data/review/product_links.csv", show_default=True) +@click.option("--output-csv", default="data/review/purchases.csv", show_default=True) +@click.option("--examples-csv", default="data/review/comparison_examples.csv", show_default=True) def main( giant_items_enriched_csv, costco_items_enriched_csv, diff --git a/pm/tasks.org b/pm/tasks.org index 78c5aa6..df2f7cf 100644 --- a/pm/tasks.org +++ b/pm/tasks.org @@ -546,7 +546,7 @@ make Giant and Costco emit the shared normalized line-item schema without introd - `normalized_item_id` is always present, but it only collapses repeated rows when the evidence is strong; otherwise it falls back to row-level identity via `normalized_row_id`. - Added `normalize_*` entry points for the new data-model layout while leaving the legacy `enrich_*` commands available during the transition. 
-* [ ] t1.14.2: finalize filesystem and schema alignment for the refactor (2-4 commits) +* [X] t1.14.2: finalize filesystem and schema alignment for the refactor (2-4 commits) bring on-disk outputs fully into the target `data/` structure without changing retailer behavior ** Acceptance Criteria @@ -579,10 +579,13 @@ bring on-disk outputs fully into the target `data/` structure without changing r ** evidence - commit: -- tests: -- datetime: +- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python build_purchases.py`; `./venv/bin/python review_products.py --refresh-only`; `./venv/bin/python report_pipeline_status.py`; `./venv/bin/python build_purchases.py --help`; `./venv/bin/python review_products.py --help`; `./venv/bin/python report_pipeline_status.py --help`; verified `data/giant-web/collected_orders.csv`, `data/giant-web/collected_items.csv`, `data/costco-web/collected_orders.csv`, `data/costco-web/collected_items.csv`, `data/catalog.csv`, and archived transitional review outputs under `data/review/archive/` +- datetime: 2026-03-20 10:04:15 EDT ** notes +- No re-collection was needed; existing raw and collected exports were adapted in place and moved into the target names. +- Updated the active script defaults to point at `data/...` so the code and on-disk layout now agree. +- Kept obviously obsolete review artifacts, but moved them under `data/review/archive/` instead of deleting them outright. 
* [ ] t1.14.3: retailer-specific Costco normalization cleanup (2-4 commits) tighten Costco-specific normalization so normalized item names are cleaner and deterministic retailer grouping is less noisy diff --git a/report_pipeline_status.py b/report_pipeline_status.py index 594861e..5dcf8f7 100644 --- a/report_pipeline_status.py +++ b/report_pipeline_status.py @@ -75,16 +75,16 @@ def build_status_summary( @click.command() -@click.option("--giant-orders-csv", default="giant_output/orders.csv", show_default=True) -@click.option("--giant-items-csv", default="giant_output/items.csv", show_default=True) -@click.option("--giant-enriched-csv", default="giant_output/items_enriched.csv", show_default=True) -@click.option("--costco-orders-csv", default="costco_output/orders.csv", show_default=True) -@click.option("--costco-items-csv", default="costco_output/items.csv", show_default=True) -@click.option("--costco-enriched-csv", default="costco_output/items_enriched.csv", show_default=True) -@click.option("--purchases-csv", default="combined_output/purchases.csv", show_default=True) -@click.option("--resolutions-csv", default="combined_output/review_resolutions.csv", show_default=True) -@click.option("--summary-csv", default="combined_output/pipeline_status.csv", show_default=True) -@click.option("--summary-json", default="combined_output/pipeline_status.json", show_default=True) +@click.option("--giant-orders-csv", default="data/giant-web/collected_orders.csv", show_default=True) +@click.option("--giant-items-csv", default="data/giant-web/collected_items.csv", show_default=True) +@click.option("--giant-enriched-csv", default="data/giant-web/normalized_items.csv", show_default=True) +@click.option("--costco-orders-csv", default="data/costco-web/collected_orders.csv", show_default=True) +@click.option("--costco-items-csv", default="data/costco-web/collected_items.csv", show_default=True) +@click.option("--costco-enriched-csv", default="data/costco-web/normalized_items.csv", 
show_default=True) +@click.option("--purchases-csv", default="data/review/purchases.csv", show_default=True) +@click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True) +@click.option("--summary-csv", default="data/review/pipeline_status.csv", show_default=True) +@click.option("--summary-json", default="data/review/pipeline_status.json", show_default=True) def main( giant_orders_csv, giant_items_csv, diff --git a/review_products.py b/review_products.py index a2c6956..b549418 100644 --- a/review_products.py +++ b/review_products.py @@ -375,10 +375,10 @@ def prompt_resolution(queue_row, related_rows, catalog_rows, queue_index, queue_ @click.command() -@click.option("--purchases-csv", default="combined_output/purchases.csv", show_default=True) -@click.option("--queue-csv", default="combined_output/review_queue.csv", show_default=True) -@click.option("--resolutions-csv", default="combined_output/review_resolutions.csv", show_default=True) -@click.option("--catalog-csv", default="combined_output/canonical_catalog.csv", show_default=True) +@click.option("--purchases-csv", default="data/review/purchases.csv", show_default=True) +@click.option("--queue-csv", default="data/review/review_queue.csv", show_default=True) +@click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True) +@click.option("--catalog-csv", default="data/catalog.csv", show_default=True) @click.option("--limit", default=0, show_default=True, type=int) @click.option("--refresh-only", is_flag=True, help="Only rebuild review_queue.csv without prompting.") def main(purchases_csv, queue_csv, resolutions_csv, catalog_csv, limit, refresh_only):