Compare commits

..

99 Commits

Author SHA1 Message Date
ben
74d17b0b0c minor edit 2026-03-24 17:28:16 -04:00
ben
fea5132100 minor edi 2026-03-24 17:27:34 -04:00
ben
eb3959ae0f Record t1.22.1 task evidence 2026-03-24 17:26:00 -04:00
ben
867275c67a Trim requirements to direct runtime deps 2026-03-24 17:25:52 -04:00
ben
6336c15da8 Record t1.22 task evidence 2026-03-24 17:10:09 -04:00
ben
09829b2b9d Finalize post-refactor layout and remove old pipeline files 2026-03-24 17:09:57 -04:00
ben
cdb7a15739 Record t1.21 task evidence 2026-03-24 16:49:01 -04:00
ben
46a3b2c639 Add purchase analysis summaries 2026-03-24 16:48:53 -04:00
ben
c35688c87f Record t1.20 task evidence 2026-03-24 08:29:31 -04:00
ben
6940f165fb Document visit-level purchase analysis 2026-03-24 08:29:26 -04:00
ben
de8ff535b8 1.18 cleanup and review 2026-03-24 08:27:41 -04:00
ben
02be6f52c0 Record t1.19 task evidence 2026-03-23 15:32:48 -04:00
ben
8ccf3ff43b Reconcile review queue against current catalog state 2026-03-23 15:32:41 -04:00
ben
a93229408b Record t1.18.4 task evidence 2026-03-23 15:28:05 -04:00
ben
a45522c110 Finalize purchase effective price fields 2026-03-23 15:27:58 -04:00
ben
d78230f1c6 Record t1.18.3 task evidence 2026-03-23 13:56:56 -04:00
ben
73176117fe Fix Costco hash-size weight parsing 2026-03-23 13:56:47 -04:00
ben
facebced9c Record t1.18.2 task evidence 2026-03-23 13:23:03 -04:00
ben
23dfc3de3e Use picked weight for Giant quantity basis 2026-03-23 13:22:56 -04:00
ben
3bc76ed243 Record t1.18 and t1.18.1 evidence 2026-03-23 12:54:09 -04:00
ben
dc0d0614bb Add effective price to purchases 2026-03-23 12:53:54 -04:00
ben
605c94498b Add effective price regression tests 2026-03-23 12:52:41 -04:00
ben
d4f479b0d8 added effective_price and testing to id upstream data 2026-03-23 12:35:27 -04:00
ben
38c2c2ea2e Record t1.17 task evidence 2026-03-21 21:50:16 -04:00
ben
d25448b690 Fix normalized quantity basis 2026-03-21 21:50:10 -04:00
db761adafc added notes from first review session 2026-03-21 20:53:22 -04:00
e8e11e15b3 added draft scope for review/search loop 2026-03-21 09:48:34 -04:00
ben
afadd0c0d0 Restore skip and move search to find 2026-03-20 13:35:07 -04:00
ben
2847d2d59f Record t1.16.1 task evidence 2026-03-20 13:32:27 -04:00
ben
f93b9aa464 Add catalog search to review flow 2026-03-20 13:32:20 -04:00
ben
17158fb9e9 Record t1.16 task evidence 2026-03-20 12:45:57 -04:00
ben
975d44bebb Tighten review prompt flow 2026-03-20 12:45:38 -04:00
ben
f478795b5d added t1.16 to cleanup review process 2026-03-20 12:42:23 -04:00
ben
59fb881c0a Record t1.15 task evidence 2026-03-20 11:27:56 -04:00
ben
9104781b93 Refactor review pipeline around normalized items 2026-03-20 11:27:46 -04:00
ben
607c51038a Record t1.14.3 task evidence 2026-03-20 11:09:50 -04:00
ben
bcec6b37d3 Clean Costco normalization artifacts 2026-03-20 11:09:44 -04:00
ben
848d229f2d Record t1.14.2 task evidence 2026-03-20 10:05:08 -04:00
ben
d2e6f2afd3 Align refactor paths with data layout 2026-03-20 10:04:58 -04:00
424a777dd0 added git note 2026-03-20 09:58:25 -04:00
2e5d69c75e added 14.2 and 14.3 for refactor prep 2026-03-20 09:55:46 -04:00
ben
3c2462845b added task-sample 2026-03-18 15:47:12 -04:00
ben
c0023e8f3a Record t1.14.1 task evidence 2026-03-18 15:46:31 -04:00
ben
9064de5f67 Refactor retailer normalization outputs 2026-03-18 15:46:20 -04:00
ben
ec1f36a140 Record t1.14 task evidence 2026-03-18 15:18:54 -04:00
ben
48c6eaf753 Refactor retailer collection entrypoints 2026-03-18 15:18:47 -04:00
ben
e74253f6fb data-model prep for refactor, removing observed layer 2026-03-18 15:15:29 -04:00
ben
c13d144418 cleanup 2026-03-18 14:02:36 -04:00
ben
10aad05808 data-model refactor and prep scope 2026-03-18 13:08:28 -04:00
ben
9122821db1 Fix t1.13 evidence hashes 2026-03-17 15:08:09 -04:00
ben
7743421918 Record t1.13 task evidence 2026-03-17 15:07:51 -04:00
ben
08e2a86cbd Make canonical auto-linking more conservative 2026-03-17 15:07:48 -04:00
ben
56a03bcb1d Attach Costco discounts to purchase rows 2026-03-17 15:07:45 -04:00
ben
967e19e561 Add pipeline status accounting 2026-03-17 15:07:42 -04:00
ben
eddef7de2b updated readme and prep for next phase 2026-03-17 13:59:57 -04:00
ben
83bc6c4a7c Update t1.12 task evidence 2026-03-17 13:25:21 -04:00
ben
d39497c298 Refine product review prompt flow 2026-03-17 13:25:12 -04:00
ben
7b8141cd42 Improve product review display workflow 2026-03-17 12:25:47 -04:00
ben
e494386e64 build_purchases rev1 2026-03-17 12:21:44 -04:00
ben
7527fe37eb added git notes 2026-03-17 12:21:24 -04:00
ben
a1fafa3885 added t1.12 scope to simplify review process 2026-03-17 12:20:48 -04:00
ben
37b2196023 added git notes 2026-03-17 09:23:00 -04:00
ben
7f8c3ed8eb updated readme with Review steps 2026-03-17 09:14:14 -04:00
ben
91bfd3597e Record t1.11 task evidence 2026-03-16 20:45:57 -04:00
ben
c7dad5489e Add terminal review resolution workflow 2026-03-16 20:45:37 -04:00
ben
34eedff9c5 Record t1.8.7 and t1.9 task evidence 2026-03-16 18:01:16 -04:00
ben
be1bf6328e Build pivot-ready purchase log 2026-03-16 18:01:09 -04:00
ben
6806c0e7ff updated readme 2026-03-16 17:40:23 -04:00
ben
861955557a added instructions 2026-03-16 17:34:22 -04:00
ben
6e1cde2c83 fix json data pull from /raw 2026-03-16 17:34:01 -04:00
ben
23d0c7e5cd fix bug w session.headers.update missing auth_headers 2026-03-16 17:19:07 -04:00
ben
9a985bf98d updated to use .env, then pull idToken and clientID 2026-03-16 17:17:20 -04:00
ben
b0d4044dac updated task 1.8.7 2026-03-16 17:09:13 -04:00
ben
d7a0329332 Simplify browser session bootstrap 2026-03-16 17:08:44 -04:00
e48dd6c4c2 troubleshooting costco header extraction 2026-03-16 16:59:31 -04:00
ben
1b4c7dde25 Simplify Costco browser header extraction 2026-03-16 16:23:38 -04:00
5a331c9af4 fixed sqlite copy permission error 2026-03-16 16:18:50 -04:00
ben
4fd309251d Record t1.8.6 task evidence 2026-03-16 13:54:11 -04:00
ben
7789c2e6ae Add shared browser session bootstrap 2026-03-16 13:54:00 -04:00
0f797d0a96 added scope for browser session pull task and cleanup 2026-03-16 13:46:52 -04:00
a48a3c8396 added token and dotenv so costco scrapes successfully 36 mo 2026-03-16 13:46:22 -04:00
de0c276a24 Merge remote-tracking branch 'gitea/cx' into cx 2026-03-16 12:40:44 -04:00
d080a35697 added git issues notes 2026-03-16 12:33:50 -04:00
ben
2e5109bd11 Record t1.8.5 task evidence 2026-03-16 12:28:27 -04:00
ben
c0054dc51e Align Costco scraper with browser session flow 2026-03-16 12:28:19 -04:00
ben
58d6efb7bb assume local venv available 2026-03-16 11:44:10 -04:00
ben
031955ba54 Record t1.8.4 task evidence 2026-03-16 11:39:51 -04:00
ben
ac82fa64fb Fix Costco receipt enumeration windows 2026-03-16 11:39:45 -04:00
ben
0d1591a602 Record Costco task evidence 2026-03-16 09:18:05 -04:00
ben
da00288f10 Add Costco acquisition and enrich flow 2026-03-16 09:17:46 -04:00
ben
9497565978 Extend shared schema for retailer-native ids 2026-03-16 09:17:36 -04:00
ben
d20a131e04 updated scope to prep for costco scraper 2026-03-16 09:04:52 -04:00
ben
4216daa37c Record t1.4 through t1.7 task evidence 2026-03-16 00:45:04 -04:00
ben
385a31c07f Auto-link canonical products conservatively 2026-03-16 00:44:45 -04:00
ben
347cd44d09 Create canonical product layer scaffold 2026-03-16 00:43:21 -04:00
ben
9b13ec3b31 Build observed product review queue 2026-03-16 00:43:17 -04:00
ben
dc392149b5 Generate Giant observed products 2026-03-16 00:43:11 -04:00
ben
8cdc4a1ad3 Record t1.3 task evidence 2026-03-16 00:28:37 -04:00
ben
14f2cc2bac Build Giant item enricher 2026-03-16 00:28:28 -04:00
32 changed files with 8598 additions and 473 deletions

180
README.md Normal file
View File

@@ -0,0 +1,180 @@
# scrape-giant
CLI to pull purchase history from Giant and Costco websites and refine into a single product catalog for external analysis.
Run each script step-by-step from the terminal.
## What It Does
1. `collect_giant_web.py`: download Giant orders and items
2. `normalize_giant_web.py`: normalize Giant line items
3. `collect_costco_web.py`: download Costco orders and items
4. `normalize_costco_web.py`: normalize Costco line items
5. `build_purchases.py`: combine retailer outputs into one purchase table
6. `review_products.py`: review unresolved product matches in the terminal
7. `report_pipeline_status.py`: show how many rows survive each stage
8. `analyze_purchases.py`: write chart-ready analysis CSVs from the purchase table
## Requirements
- Python 3.10+
- Firefox installed with active Giant and Costco sessions
## Install
```bash
python -m venv venv
./venv/scripts/activate  # Windows PowerShell; on Linux/macOS use: source venv/bin/activate
pip install -r requirements.txt
```
## Optional `.env`
Current version works best with `.env` in the project root. The scraper will prompt for these values if they are not found in the current browser session.
- `collect_giant_web.py` prompts if `GIANT_USER_ID` or `GIANT_LOYALTY_NUMBER` is missing.
- `collect_costco_web.py` tries `.env` first, then Firefox local storage for session-backed values; `COSTCO_CLIENT_IDENTIFIER` should still be set explicitly.
- Costco discount matching happens later in `enrich_costco.py`; you do not need to pre-clean discount lines by hand.
```env
GIANT_USER_ID=...
GIANT_LOYALTY_NUMBER=...
COSTCO_X_AUTHORIZATION=...
COSTCO_X_WCS_CLIENTID=...
COSTCO_CLIENT_IDENTIFIER=...
```
Current active path layout:
```text
data/
giant-web/
raw/
collected_orders.csv
collected_items.csv
normalized_items.csv
costco-web/
raw/
collected_orders.csv
collected_items.csv
normalized_items.csv
review/
catalog.csv
review_queue.csv
review_resolutions.csv
product_links.csv
pipeline_status.csv
pipeline_status.json
analysis/
purchases.csv
comparison_examples.csv
item_price_over_time.csv
spend_by_visit.csv
items_per_visit.csv
category_spend_over_time.csv
retailer_store_breakdown.csv
```
## Run Order
Run the pipeline in this order:
```bash
python collect_giant_web.py
python normalize_giant_web.py
python collect_costco_web.py
python normalize_costco_web.py
python build_purchases.py
python review_products.py
python build_purchases.py
python review_products.py --refresh-only
python report_pipeline_status.py
python analyze_purchases.py
```
Why run `build_purchases.py` twice:
- first pass builds the current combined dataset and review queue inputs
- `review_products.py` writes durable review decisions
- second pass reapplies those decisions into the purchase output
If you only want to refresh the queue without reviewing interactively:
```bash
python review_products.py --refresh-only
```
If you want a quick stage-by-stage accountability check:
```bash
python report_pipeline_status.py
```
## Key Outputs
Giant:
- `data/giant-web/collected_orders.csv`
- `data/giant-web/collected_items.csv`
- `data/giant-web/normalized_items.csv`
Costco:
- `data/costco-web/collected_orders.csv`
- `data/costco-web/collected_items.csv`
- `data/costco-web/normalized_items.csv`
- `data/costco-web/normalized_items.csv` preserves raw totals and matched net discount fields
Combined:
- `data/analysis/purchases.csv`
- `data/analysis/comparison_examples.csv`
- `data/analysis/item_price_over_time.csv`
- `data/analysis/spend_by_visit.csv`
- `data/analysis/items_per_visit.csv`
- `data/analysis/category_spend_over_time.csv`
- `data/analysis/retailer_store_breakdown.csv`
- `data/review/review_queue.csv`
- `data/review/review_resolutions.csv`
- `data/review/product_links.csv`
- `data/review/pipeline_status.csv`
- `data/review/pipeline_status.json`
- `data/review/catalog.csv`
`data/analysis/purchases.csv` is the main analysis artifact. It is designed to support both:
- item-level price analysis
- visit-level analysis such as spend by visit, items per visit, category spend by visit, and retailer/store breakdown
The visit fields are carried directly in `purchases.csv`, so you can pivot on them without extra joins:
- `order_id`
- `purchase_date`
- `retailer`
- `store_name`
- `store_number`
- `store_city`
- `store_state`
## Review Workflow
Run `review_products.py` to clean up unresolved or weakly unified items:
- link an item to an existing canonical product
- create a new canonical product
- exclude an item
- skip it for later
Decisions are saved and reused on later runs.
The review step is intentionally conservative:
- weak exact-name matches stay in the queue instead of auto-creating canonical products
- canonical names should describe stable product identity, not retailer packaging text
## Notes
- This project is designed around fragile retailer scraping flows, so the code favors explicit retailer-specific steps over heavy abstraction.
- Costco discount rows are preserved for auditability and also matched back to purchased items during enrichment.
## Test
```bash
./venv/bin/python -m unittest discover -s tests
```
## Project Docs
- `pm/tasks.org`: task tracking
- `pm/data-model.org`: current data model notes
- `pm/review-workflow.org`: review and resolution workflow

View File

@@ -7,6 +7,7 @@
## tech stack ## tech stack
- python; pandas or polars - python; pandas or polars
- file storage: json and csv, no sqlite or databases - file storage: json and csv, no sqlite or databases
- assume local virtual env is available and accessible
- do not add new dependencies unless explicitly approved; if unavoidable, document justification in the active task notes - do not add new dependencies unless explicitly approved; if unavoidable, document justification in the active task notes
## workflow ## workflow

271
analyze_purchases.py Normal file
View File

@@ -0,0 +1,271 @@
from collections import defaultdict
from pathlib import Path
import click
from enrich_giant import format_decimal, to_decimal
from layer_helpers import read_csv_rows, write_csv_rows
# Column order for item_price_over_time.csv.
ITEM_PRICE_FIELDS = [
    "purchase_date",
    "retailer",
    "store_name",
    "store_number",
    "store_city",
    "store_state",
    "order_id",
    "catalog_id",
    "catalog_name",
    "category",
    "product_type",
    "effective_price",
    "effective_price_unit",
    "net_line_total",
    "normalized_quantity",
]

# Columns identifying one visit; shared prefix of the two per-visit reports.
_VISIT_KEY_FIELDS = [
    "purchase_date",
    "retailer",
    "order_id",
    "store_name",
    "store_number",
    "store_city",
    "store_state",
]

# Column order for spend_by_visit.csv.
SPEND_BY_VISIT_FIELDS = [*_VISIT_KEY_FIELDS, "visit_spend_total"]

# Column order for items_per_visit.csv.
ITEMS_PER_VISIT_FIELDS = [
    *_VISIT_KEY_FIELDS,
    "item_row_count",
    "distinct_catalog_count",
]

# Column order for category_spend_over_time.csv.
CATEGORY_SPEND_FIELDS = [
    "purchase_date",
    "retailer",
    "category",
    "category_spend_total",
]

# Column order for retailer_store_breakdown.csv.
RETAILER_STORE_FIELDS = [
    "retailer",
    "store_name",
    "store_number",
    "store_city",
    "store_state",
    "visit_count",
    "item_row_count",
    "store_spend_total",
]
def effective_total(row):
    """Return the monetary total to use for *row*.

    ``net_line_total`` is preferred whenever ``to_decimal`` yields a value
    for it; otherwise the parsed ``line_total`` is returned. Callers in this
    module treat a ``None`` result as "no usable total".
    """
    net_total = to_decimal(row.get("net_line_total"))
    return to_decimal(row.get("line_total")) if net_total is None else net_total
def is_item_row(row):
    """Return True unless *row* is flagged as a fee, discount or coupon line.

    A flag counts only when its value is exactly the string ``"true"``.
    """
    marker_columns = ("is_fee", "is_discount_line", "is_coupon_line")
    return all(row.get(column) != "true" for column in marker_columns)
def build_item_price_rows(purchase_rows):
    """Project purchase rows onto the item-price report columns.

    Rows lacking a truthy ``catalog_name`` or ``effective_price`` are
    dropped. Every kept row gets all report columns, with ``""`` filling
    any column the source row does not have.
    """
    columns = (
        "purchase_date",
        "retailer",
        "store_name",
        "store_number",
        "store_city",
        "store_state",
        "order_id",
        "catalog_id",
        "catalog_name",
        "category",
        "product_type",
        "effective_price",
        "effective_price_unit",
        "net_line_total",
        "normalized_quantity",
    )
    return [
        {column: row.get(column, "") for column in columns}
        for row in purchase_rows
        if row.get("catalog_name") and row.get("effective_price")
    ]
def build_spend_by_visit_rows(purchase_rows):
    """Sum spend per visit and return one report row per visit.

    A visit is keyed by purchase date, retailer, order id and the four
    store columns. Rows whose ``effective_total`` is ``None`` are ignored;
    no fee/discount filtering is applied here. Output is sorted by the
    visit key and totals are rendered with ``format_decimal``.
    """
    visit_columns = (
        "purchase_date",
        "retailer",
        "order_id",
        "store_name",
        "store_number",
        "store_city",
        "store_state",
    )
    totals = {}
    for row in purchase_rows:
        amount = effective_total(row)
        if amount is None:
            continue
        visit = tuple(row.get(column, "") for column in visit_columns)
        if visit not in totals:
            totals[visit] = to_decimal("0")
        totals[visit] += amount

    report = []
    for visit in sorted(totals):
        entry = dict(zip(visit_columns, visit))
        entry["visit_spend_total"] = format_decimal(totals[visit])
        report.append(entry)
    return report
def build_items_per_visit_rows(purchase_rows):
    """Count item rows and distinct catalog ids for every visit.

    Only rows accepted by ``is_item_row`` are counted. Rows with an empty
    ``catalog_id`` still count as item rows but do not contribute to the
    distinct-catalog count. Output is sorted by the visit key and both
    counts are emitted as strings.
    """
    visit_columns = (
        "purchase_date",
        "retailer",
        "order_id",
        "store_name",
        "store_number",
        "store_city",
        "store_state",
    )
    row_counts = defaultdict(int)
    catalog_ids = defaultdict(set)
    for row in purchase_rows:
        if not is_item_row(row):
            continue
        visit = tuple(row.get(column, "") for column in visit_columns)
        row_counts[visit] += 1
        catalog_id = row.get("catalog_id")
        if catalog_id:
            catalog_ids[visit].add(catalog_id)

    report = []
    for visit in sorted(row_counts):
        entry = dict(zip(visit_columns, visit))
        entry["item_row_count"] = str(row_counts[visit])
        entry["distinct_catalog_count"] = str(len(catalog_ids[visit]))
        report.append(entry)
    return report
def build_category_spend_rows(purchase_rows):
    """Sum spend per (purchase date, retailer, category).

    Rows with an empty category or without a usable ``effective_total``
    are skipped. Output is sorted by the grouping key and totals are
    rendered with ``format_decimal``.
    """
    totals = {}
    for row in purchase_rows:
        category = row.get("category", "")
        amount = effective_total(row)
        if amount is None or not category:
            continue
        bucket = (row.get("purchase_date", ""), row.get("retailer", ""), category)
        if bucket not in totals:
            totals[bucket] = to_decimal("0")
        totals[bucket] += amount

    report = []
    for bucket in sorted(totals):
        purchase_date, retailer, category = bucket
        report.append(
            {
                "purchase_date": purchase_date,
                "retailer": retailer,
                "category": category,
                "category_spend_total": format_decimal(totals[bucket]),
            }
        )
    return report
def build_retailer_store_rows(purchase_rows):
    """Summarise visits, item rows and spend for every retailer/store.

    Every purchase row registers its (purchase date, order id) pair as a
    visit for its store. Item rows (per ``is_item_row``) and usable totals
    (per ``effective_total``) are accumulated independently of each other.
    Output is sorted by the store key; counts are emitted as strings.
    """
    store_columns = (
        "retailer",
        "store_name",
        "store_number",
        "store_city",
        "store_state",
    )
    stores = {}
    for row in purchase_rows:
        amount = effective_total(row)
        store = tuple(row.get(column, "") for column in store_columns)
        if store not in stores:
            stores[store] = {"visits": set(), "items": 0, "spend": to_decimal("0")}
        stats = stores[store]
        stats["visits"].add((row.get("purchase_date", ""), row.get("order_id", "")))
        if is_item_row(row):
            stats["items"] += 1
        if amount is not None:
            stats["spend"] += amount

    report = []
    for store in sorted(stores):
        stats = stores[store]
        entry = dict(zip(store_columns, store))
        entry["visit_count"] = str(len(stats["visits"]))
        entry["item_row_count"] = str(stats["items"])
        entry["store_spend_total"] = format_decimal(stats["spend"])
        report.append(entry)
    return report
# CLI entry point: read the purchase table, derive the five analysis tables
# and write each one as a CSV into the output directory.
# (Kept as comments on purpose: a docstring here would become --help text.)
@click.command()
@click.option("--purchases-csv", default="data/analysis/purchases.csv", show_default=True)
@click.option("--output-dir", default="data/analysis", show_default=True)
def main(purchases_csv, output_dir):
    purchase_rows = read_csv_rows(purchases_csv)
    destination = Path(output_dir)
    destination.mkdir(parents=True, exist_ok=True)

    report_specs = (
        ("item_price_over_time.csv", build_item_price_rows, ITEM_PRICE_FIELDS),
        ("spend_by_visit.csv", build_spend_by_visit_rows, SPEND_BY_VISIT_FIELDS),
        ("items_per_visit.csv", build_items_per_visit_rows, ITEMS_PER_VISIT_FIELDS),
        ("category_spend_over_time.csv", build_category_spend_rows, CATEGORY_SPEND_FIELDS),
        ("retailer_store_breakdown.csv", build_retailer_store_rows, RETAILER_STORE_FIELDS),
    )
    # Build every table before writing anything, so a failing builder
    # leaves no partial set of output files behind.
    built_reports = [
        (filename, build_rows(purchase_rows), fieldnames)
        for filename, build_rows, fieldnames in report_specs
    ]
    for filename, rows, fieldnames in built_reports:
        write_csv_rows(destination / filename, rows, fieldnames)
    click.echo(f"wrote analysis outputs to {destination}")


if __name__ == "__main__":
    main()

129
browser_session.py Normal file
View File

@@ -0,0 +1,129 @@
import configparser
import os
import shutil
import sqlite3
import tempfile
from pathlib import Path
import browser_cookie3
def find_firefox_profile_dir():
    """Return the path of the preferred Firefox profile from ``profiles.ini``.

    Only ``Profile*`` sections with a non-empty ``Path`` are considered.
    A profile flagged ``Default`` wins; ties are broken by the profile
    path's string form.

    Raises:
        FileNotFoundError: if ``profiles.ini`` is missing or lists no
            usable profile.
    """
    ini_path = firefox_profiles_root() / "profiles.ini"
    if not ini_path.exists():
        raise FileNotFoundError(f"Firefox profiles.ini not found at {ini_path}")
    config = configparser.RawConfigParser()
    config.read(ini_path, encoding="utf-8")

    candidates = []
    for section in config.sections():
        if not section.startswith("Profile"):
            continue
        raw_path = config.get(section, "Path", fallback="")
        if not raw_path:
            continue
        # Relative paths are resolved against the directory of profiles.ini.
        if config.getboolean(section, "IsRelative", fallback=True):
            location = ini_path.parent / raw_path
        else:
            location = Path(raw_path)
        is_default = config.getboolean(section, "Default", fallback=False)
        candidates.append((is_default, location))

    if not candidates:
        raise FileNotFoundError("No Firefox profiles found in profiles.ini")
    _, chosen = min(candidates, key=lambda entry: (not entry[0], str(entry[1])))
    return chosen
def firefox_profiles_root():
    """Return the directory expected to contain Firefox's ``profiles.ini``.

    * Windows: ``%APPDATA%/Mozilla/Firefox``
    * macOS: ``~/Library/Application Support/Firefox``
    * anything else: ``~/.mozilla/firefox``

    Raises:
        FileNotFoundError: on Windows when ``APPDATA`` is unset or blank.
    """
    # Function-scope import so this block does not depend on the file's
    # import header, which does not import ``sys``.
    import sys

    if os.name == "nt":
        appdata = os.getenv("APPDATA", "").strip()
        if not appdata:
            raise FileNotFoundError("APPDATA is not set")
        return Path(appdata) / "Mozilla" / "Firefox"
    if sys.platform == "darwin":
        # macOS also reports os.name == "posix", but Firefox does not keep
        # its profiles under ~/.mozilla there.
        return Path.home() / "Library" / "Application Support" / "Firefox"
    return Path.home() / ".mozilla" / "firefox"
def load_firefox_cookies(domain_name, profile_dir):
    """Load cookies for *domain_name* from the profile's ``cookies.sqlite``.

    *profile_dir* may be a string or a ``Path``. The result is whatever
    ``browser_cookie3.firefox`` returns for that cookie file and domain.
    """
    cookie_db = str(Path(profile_dir) / "cookies.sqlite")
    return browser_cookie3.firefox(cookie_file=cookie_db, domain_name=domain_name)
def read_firefox_local_storage(profile_dir, origin_filter):
    """Read local-storage key/value pairs for the first matching origin.

    Scans ``<profile>/storage/default/*/ls/data.sqlite`` and returns the
    ``data`` table of the first database whose decoded origin contains
    *origin_filter* (case-insensitive), as a ``{key: value}`` dict of
    strings.

    Args:
        profile_dir: Firefox profile directory, as ``str`` or ``Path``
            (coerced here, matching ``load_firefox_cookies``).
        origin_filter: substring to look for in the origin.

    Returns:
        The stored pairs, or ``{}`` when the storage directory is missing
        or no origin matches.
    """
    storage_root = Path(profile_dir) / "storage" / "default"
    if not storage_root.exists():
        return {}

    wanted = origin_filter.lower()
    for ls_path in storage_root.glob("*/ls/data.sqlite"):
        # parents[1] is the per-origin directory that contains "ls/".
        origin = decode_firefox_origin(ls_path.parents[1].name)
        if wanted not in origin.lower():
            continue
        return {
            stringify_sql_value(row[0]): stringify_sql_value(row[1])
            for row in query_sqlite(ls_path, "SELECT key, value FROM data")
        }
    return {}
def read_firefox_webapps_store(profile_dir, origin_filter):
    """Read key/value pairs from the profile's ``webappsstore.sqlite``.

    Every row of ``webappsstore2`` whose ``originKey`` contains
    *origin_filter* (case-insensitive) is included; when several matching
    rows share a key, the later row wins.

    Args:
        profile_dir: Firefox profile directory, as ``str`` or ``Path``
            (coerced here, matching ``load_firefox_cookies``).
        origin_filter: substring to look for in ``originKey``.

    Returns:
        ``{key: value}`` of strings, or ``{}`` when the database file does
        not exist or nothing matches.
    """
    webapps_path = Path(profile_dir) / "webappsstore.sqlite"
    if not webapps_path.exists():
        return {}

    wanted = origin_filter.lower()
    values = {}
    for row in query_sqlite(
        webapps_path,
        "SELECT originKey, key, value FROM webappsstore2",
    ):
        origin = stringify_sql_value(row[0])
        if wanted not in origin.lower():
            continue
        values[stringify_sql_value(row[1])] = stringify_sql_value(row[2])
    return values
def query_sqlite(path, query):
    """Run *query* against a temporary copy of the SQLite file at *path*.

    The database is copied with ``copy_sqlite_to_temp`` and the copy is
    always deleted afterwards. Returns all fetched rows, or ``[]`` when
    SQLite raises ``OperationalError``; other exceptions propagate.
    """
    # NOTE(review): querying a copy presumably avoids touching the file
    # while Firefox has it open - confirm.
    snapshot = copy_sqlite_to_temp(path)
    try:
        connection = sqlite3.connect(snapshot)
        try:
            cursor = connection.cursor()
            try:
                cursor.execute(query)
                return cursor.fetchall()
            finally:
                cursor.close()
        finally:
            connection.close()
    except sqlite3.OperationalError:
        return []
    finally:
        snapshot.unlink(missing_ok=True)
def copy_sqlite_to_temp(path):
    """Copy the file at *path* to a fresh ``.sqlite`` temp file.

    Returns:
        ``Path`` of the copy; the caller is responsible for deleting it.

    Raises:
        Whatever ``shutil.copyfile`` raises (for example
        ``FileNotFoundError``). The temp file is removed before the
        exception propagates, so a failed copy leaves nothing behind.
    """
    fd, tmp = tempfile.mkstemp(suffix=".sqlite")
    # mkstemp returns an open descriptor; only the reserved name is needed.
    os.close(fd)
    snapshot = Path(tmp)
    try:
        shutil.copyfile(path, snapshot)
    except Exception:
        snapshot.unlink(missing_ok=True)
        raise
    return snapshot
def decode_firefox_origin(raw_origin):
    """Turn a storage directory name into an origin string.

    Everything from the first ``^`` onward is dropped, then every ``+++``
    is replaced with ``://``.
    """
    origin_part, _, _ = raw_origin.partition("^")
    return origin_part.replace("+++", "://")
def stringify_sql_value(value):
    """Convert a SQLite cell value to ``str``.

    ``None`` becomes ``""``. ``bytes`` are decoded with the first of
    UTF-8, UTF-16-LE and UTF-16 that succeeds, falling back to UTF-8 with
    undecodable bytes dropped. Anything else goes through ``str()``.
    """
    if value is None:
        return ""
    if not isinstance(value, bytes):
        return str(value)
    for codec in ("utf-8", "utf-16-le", "utf-16"):
        try:
            return value.decode(codec)
        except UnicodeDecodeError:
            pass
    return value.decode("utf-8", errors="ignore")

487
build_purchases.py Normal file
View File

@@ -0,0 +1,487 @@
from decimal import Decimal
from pathlib import Path
import click
from enrich_giant import format_decimal, to_decimal
from layer_helpers import read_csv_rows, write_csv_rows
# Column order for the combined purchase table.
# "net_line_total" appears once, in the leading summary columns; the second
# occurrence that used to follow "matched_discount_amount" was dropped
# because a repeated field name produces a duplicated CSV column.
PURCHASE_FIELDS = [
    "purchase_date",
    "retailer",
    "catalog_name",
    "product_type",
    "category",
    "net_line_total",
    "normalized_quantity",
    "normalized_quantity_unit",
    "effective_price",
    "effective_price_unit",
    "order_id",
    "line_no",
    "normalized_row_id",
    "normalized_item_id",
    "catalog_id",
    "review_status",
    "resolution_action",
    "raw_item_name",
    "normalized_item_name",
    "brand",
    "variant",
    "image_url",
    "retailer_item_id",
    "upc",
    "qty",
    "unit",
    "pack_qty",
    "size_value",
    "size_unit",
    "measure_type",
    "line_total",
    "unit_price",
    "matched_discount_amount",
    "store_name",
    "store_number",
    "store_city",
    "store_state",
    "price_per_each",
    "price_per_each_basis",
    "price_per_count",
    "price_per_count_basis",
    "price_per_lb",
    "price_per_lb_basis",
    "price_per_oz",
    "price_per_oz_basis",
    "is_discount_line",
    "is_coupon_line",
    "is_fee",
    "raw_order_path",
]
# Columns of a Giant-vs-Costco comparison example row.
# NOTE(review): presumably the header of comparison_examples.csv - confirm
# against the writer, which is outside this view.
EXAMPLE_FIELDS = [
    "example_name",
    "catalog_id",
    "giant_purchase_date",
    "giant_raw_item_name",
    "giant_price_per_lb",
    "costco_purchase_date",
    "costco_raw_item_name",
    "costco_price_per_lb",
    "notes",
]

# Same keys, in the same order, as the dict built by normalize_catalog_row.
CATALOG_FIELDS = [
    "catalog_id",
    "catalog_name",
    "category",
    "product_type",
    "brand",
    "variant",
    "size_value",
    "size_unit",
    "pack_qty",
    "measure_type",
    "notes",
    "created_at",
    "updated_at",
]

# Same keys, in the same order, as the dict built by normalize_link_row.
PRODUCT_LINK_FIELDS = [
    "normalized_item_id",
    "catalog_id",
    "link_method",
    "link_confidence",
    "review_status",
    "reviewed_by",
    "reviewed_at",
    "link_notes",
]

# Same keys, in the same order, as the dict built by normalize_resolution_row.
RESOLUTION_FIELDS = [
    "normalized_item_id",
    "catalog_id",
    "resolution_action",
    "status",
    "resolution_notes",
    "reviewed_at",
]
def derive_metrics(row):
    """Derive per-each, per-count, per-lb and per-oz prices for one row.

    The money basis is ``net_line_total`` when truthy, else ``line_total``.
    Values of ``price_per_each``, ``price_per_lb`` and ``price_per_oz``
    already on the row are the starting point and are replaced only where a
    branch below recomputes them.

    Returns:
        dict with the four prices and a ``*_basis`` label for each. A label
        is ``""`` when the corresponding branch did not run.
    """
    one = Decimal("1")
    ounces_per_pound = Decimal("16")

    total = to_decimal(row.get("net_line_total") or row.get("line_total"))
    quantity = to_decimal(row.get("qty"))
    pack_count = to_decimal(row.get("pack_qty"))
    size = to_decimal(row.get("size_value"))
    weight = to_decimal(row.get("picked_weight"))
    has_total = total is not None

    metrics = {
        "price_per_each": row.get("price_per_each", ""),
        "price_per_each_basis": "",
        "price_per_count": "",
        "price_per_count_basis": "",
        "price_per_lb": row.get("price_per_lb", ""),
        "price_per_lb_basis": "",
        "price_per_oz": row.get("price_per_oz", ""),
        "price_per_oz_basis": "",
    }

    # Per each: keep a supplied price, otherwise total / qty.
    if metrics["price_per_each"]:
        metrics["price_per_each_basis"] = "line_total_over_qty"
    elif has_total and quantity not in (None, 0):
        metrics["price_per_each"] = format_decimal(total / quantity)
        metrics["price_per_each_basis"] = "line_total_over_qty"

    # Per count: total / (pack_qty * qty), with qty defaulting to 1.
    if has_total and pack_count not in (None, 0):
        unit_count = pack_count * (quantity or one)
        if unit_count not in (None, 0):
            metrics["price_per_count"] = format_decimal(total / unit_count)
            metrics["price_per_count_basis"] = "line_total_over_pack_qty"

    if weight not in (None, 0):
        # A picked weight takes precedence over the parsed size. Without a
        # total the weight prices are blanked, but the basis labels are
        # still set.
        if has_total:
            per_lb = total / weight
            metrics["price_per_lb"] = format_decimal(per_lb)
            metrics["price_per_oz"] = format_decimal(per_lb / ounces_per_pound)
        else:
            metrics["price_per_lb"] = ""
            metrics["price_per_oz"] = ""
        metrics["price_per_lb_basis"] = "picked_weight_lb"
        metrics["price_per_oz_basis"] = "picked_weight_lb_to_oz"
    elif has_total and size not in (None, 0):
        measured = size * (pack_count or one) * (quantity or one)
        size_unit = row.get("size_unit", "")
        if measured not in (None, 0):
            if size_unit == "lb":
                per_lb = total / measured
                metrics["price_per_lb"] = format_decimal(per_lb)
                metrics["price_per_oz"] = format_decimal(per_lb / ounces_per_pound)
                metrics["price_per_lb_basis"] = "parsed_size_lb"
                metrics["price_per_oz_basis"] = "parsed_size_lb_to_oz"
            elif size_unit == "oz":
                per_oz = total / measured
                metrics["price_per_oz"] = format_decimal(per_oz)
                metrics["price_per_lb"] = format_decimal(per_oz * ounces_per_pound)
                metrics["price_per_lb_basis"] = "parsed_size_oz_to_lb"
                metrics["price_per_oz_basis"] = "parsed_size_oz"

    return metrics
def derive_effective_price(row):
    """Return net line total divided by ``normalized_quantity``, formatted.

    Returns ``""`` when the quantity is missing or zero, or when the net
    line total from ``derive_net_line_total`` does not parse to a value.
    """
    quantity = to_decimal(row.get("normalized_quantity"))
    if quantity not in (None, Decimal("0")):
        net_total = to_decimal(derive_net_line_total(row))
        if net_total is not None:
            return format_decimal(net_total / quantity)
    return ""
def derive_effective_price_unit(row):
    """Return the row's ``normalized_quantity_unit`` as the price unit.

    Returns ``""`` when ``normalized_quantity`` is missing or zero, the
    same condition under which ``derive_effective_price`` yields no price.
    """
    quantity = to_decimal(row.get("normalized_quantity"))
    has_basis = quantity not in (None, Decimal("0"))
    return row.get("normalized_quantity_unit", "") if has_basis else ""
def derive_net_line_total(row):
    """Return the row's net line total as a string.

    A non-blank ``net_line_total`` already on the row is returned via
    ``str()`` without reformatting. Otherwise the value is derived from
    ``line_total`` plus ``matched_discount_amount`` (when that parses).

    A ``net_line_total`` of ``None`` is treated as blank, so it falls back
    to the derived value instead of being returned as the literal
    ``"None"``.

    Returns:
        The net total, or ``""`` when ``line_total`` does not parse.
    """
    existing_net = row.get("net_line_total")
    if existing_net is not None and str(existing_net).strip() != "":
        return str(existing_net)

    line_total = to_decimal(row.get("line_total"))
    if line_total is None:
        return ""

    matched_discount_amount = to_decimal(row.get("matched_discount_amount"))
    if matched_discount_amount is None:
        return format_decimal(line_total)
    # The discount is added, not subtracted.
    # NOTE(review): presumably discounts are stored as negative amounts - confirm.
    return format_decimal(line_total + matched_discount_amount)
def order_lookup(rows, retailer):
    """Index order rows by ``(retailer, order_id)``.

    When several rows share an ``order_id`` the last one wins. A row
    without an ``order_id`` key raises ``KeyError``.
    """
    lookup = {}
    for order in rows:
        lookup[(retailer, order["order_id"])] = order
    return lookup
def read_optional_csv_rows(path):
    """Read CSV rows from *path*, or return ``[]`` if the file is absent."""
    candidate = Path(path)
    return read_csv_rows(candidate) if candidate.exists() else []
def normalize_catalog_row(row):
    """Return *row* reshaped to the catalog column set.

    ``catalog_id`` and ``catalog_name`` fall back to the
    ``canonical_product_id`` and ``canonical_name`` keys when empty. Every
    other column defaults to ``""``; keys outside the set are dropped.
    """
    normalized = {
        "catalog_id": row.get("catalog_id") or row.get("canonical_product_id", ""),
        "catalog_name": row.get("catalog_name") or row.get("canonical_name", ""),
    }
    passthrough_columns = (
        "category",
        "product_type",
        "brand",
        "variant",
        "size_value",
        "size_unit",
        "pack_qty",
        "measure_type",
        "notes",
        "created_at",
        "updated_at",
    )
    for column in passthrough_columns:
        normalized[column] = row.get(column, "")
    return normalized
def is_review_first_catalog_row(row):
    """Return False only when the row's notes start with "auto-linked via".

    The check ignores surrounding whitespace and letter case.
    """
    notes = row.get("notes", "").strip().lower()
    return not notes.startswith("auto-linked via")
def normalize_link_row(row):
    """Return *row* reshaped to the product-link column set.

    ``catalog_id`` falls back to the ``canonical_product_id`` key when
    empty. Every other column defaults to ``""``; extra keys are dropped.
    """
    normalized = {
        "normalized_item_id": row.get("normalized_item_id", ""),
        "catalog_id": row.get("catalog_id") or row.get("canonical_product_id", ""),
    }
    for column in (
        "link_method",
        "link_confidence",
        "review_status",
        "reviewed_by",
        "reviewed_at",
        "link_notes",
    ):
        normalized[column] = row.get(column, "")
    return normalized
def normalize_resolution_row(row):
    """Return *row* reshaped to the review-resolution column set.

    ``catalog_id`` falls back to the ``canonical_product_id`` key when
    empty. Every other column defaults to ``""``; extra keys are dropped.
    """
    normalized = {
        "normalized_item_id": row.get("normalized_item_id", ""),
        "catalog_id": row.get("catalog_id") or row.get("canonical_product_id", ""),
    }
    for column in ("resolution_action", "status", "resolution_notes", "reviewed_at"):
        normalized[column] = row.get(column, "")
    return normalized
def load_resolution_lookup(resolution_rows):
    """Index normalized resolution rows by ``normalized_item_id``.

    Rows with an empty id are skipped; for a repeated id the last row wins.
    """
    normalized_rows = (normalize_resolution_row(row) for row in resolution_rows)
    return {
        entry["normalized_item_id"]: entry
        for entry in normalized_rows
        if entry["normalized_item_id"]
    }
def merge_catalog_rows(existing_rows, new_rows):
    """Merge two lists of catalog rows into one list sorted by ``catalog_id``.

    Each row is normalized first. Rows with an empty id are dropped, and a
    later row replaces an earlier one with the same id, so *new_rows*
    override *existing_rows*.
    """
    by_id = {}
    for source_row in existing_rows + new_rows:
        candidate = normalize_catalog_row(source_row)
        if candidate["catalog_id"]:
            by_id[candidate["catalog_id"]] = candidate
    merged = list(by_id.values())
    merged.sort(key=lambda entry: entry["catalog_id"])
    return merged
def load_link_lookup(link_rows):
    """Index normalized product-link rows by ``normalized_item_id``.

    Rows with an empty id are skipped; for a repeated id the last row wins.
    """
    normalized_rows = (normalize_link_row(row) for row in link_rows)
    return {
        entry["normalized_item_id"]: entry
        for entry in normalized_rows
        if entry["normalized_item_id"]
    }
def build_purchase_rows(
    giant_enriched_rows,
    costco_enriched_rows,
    giant_orders,
    costco_orders,
    resolution_rows,
    link_rows=None,
    catalog_rows=None,
):
    """Join enriched item rows with orders, links and catalog data.

    Args:
        giant_enriched_rows: List of normalized Giant item rows.
        costco_enriched_rows: List of normalized Costco item rows.
        giant_orders: Giant order rows, indexed through ``order_lookup``.
        costco_orders: Costco order rows, indexed through ``order_lookup``.
        resolution_rows: Review resolution rows; approved ``link``/``create``
            resolutions with a catalog id override the link for their item,
            approved ``exclude`` resolutions remove it.
        link_rows: Optional existing product-link rows.
        catalog_rows: Optional catalog rows used to fill catalog columns.

    Returns:
        tuple: ``(purchase_rows, link_rows)`` where ``purchase_rows`` is one
        dict per enriched row, ordered by order date, retailer, order id and
        numeric line number, and ``link_rows`` is the effective link table
        sorted by ``normalized_item_id``.
    """
    all_enriched_rows = giant_enriched_rows + costco_enriched_rows
    resolution_lookup = load_resolution_lookup(resolution_rows)
    link_lookup = load_link_lookup(link_rows or [])

    # Key the catalog by the *normalized* id (as merge_catalog_rows does) and
    # normalize each row only once. Indexing the raw row by "catalog_id" would
    # raise KeyError for rows that lack that column.
    catalog_lookup = {}
    for raw_catalog_row in catalog_rows or []:
        normalized_catalog_row = normalize_catalog_row(raw_catalog_row)
        catalog_id = normalized_catalog_row.get("catalog_id", "")
        if catalog_id:
            catalog_lookup[catalog_id] = normalized_catalog_row

    # Approved manual resolutions take precedence over existing links.
    for normalized_item_id, resolution in resolution_lookup.items():
        action = resolution.get("resolution_action", "")
        status = resolution.get("status", "")
        if status != "approved":
            continue
        if action in {"link", "create"} and resolution.get("catalog_id"):
            link_lookup[normalized_item_id] = {
                "normalized_item_id": normalized_item_id,
                "catalog_id": resolution["catalog_id"],
                "link_method": f"manual_{action}",
                "link_confidence": "high",
                "review_status": status,
                "reviewed_by": "",
                "reviewed_at": resolution.get("reviewed_at", ""),
                "link_notes": resolution.get("resolution_notes", ""),
            }
        elif action == "exclude":
            link_lookup.pop(normalized_item_id, None)

    orders_by_id = {}
    orders_by_id.update(order_lookup(giant_orders, "giant"))
    orders_by_id.update(order_lookup(costco_orders, "costco"))

    purchase_rows = []
    for row in sorted(
        all_enriched_rows,
        key=lambda item: (item["order_date"], item["retailer"], item["order_id"], int(item["line_no"])),
    ):
        normalized_item_id = row.get("normalized_item_id", "")
        resolution = resolution_lookup.get(normalized_item_id, {})
        link_row = link_lookup.get(normalized_item_id, {})
        catalog_row = catalog_lookup.get(link_row.get("catalog_id", ""), {})
        order_row = orders_by_id.get((row["retailer"], row["order_id"]), {})
        metrics = derive_metrics(row)
        # Computed once and reused for both the output column and the
        # effective-price derivation.
        net_line_total = derive_net_line_total(row)
        purchase_rows.append(
            {
                "purchase_date": row["order_date"],
                "retailer": row["retailer"],
                "catalog_name": catalog_row.get("catalog_name", ""),
                "product_type": catalog_row.get("product_type", ""),
                "category": catalog_row.get("category", ""),
                "net_line_total": net_line_total,
                "normalized_quantity": row.get("normalized_quantity", ""),
                "normalized_quantity_unit": row.get("normalized_quantity_unit", ""),
                "effective_price": derive_effective_price({**row, "net_line_total": net_line_total}),
                "effective_price_unit": derive_effective_price_unit(row),
                "order_id": row["order_id"],
                "line_no": row["line_no"],
                "normalized_row_id": row.get("normalized_row_id", ""),
                "normalized_item_id": normalized_item_id,
                "catalog_id": link_row.get("catalog_id", ""),
                "review_status": resolution.get("status", ""),
                "resolution_action": resolution.get("resolution_action", ""),
                "raw_item_name": row["item_name"],
                "normalized_item_name": row["item_name_norm"],
                "brand": catalog_row.get("brand", ""),
                "variant": catalog_row.get("variant", ""),
                "image_url": row.get("image_url", ""),
                "retailer_item_id": row["retailer_item_id"],
                "upc": row["upc"],
                "qty": row["qty"],
                "unit": row["unit"],
                "pack_qty": row["pack_qty"],
                "size_value": row["size_value"],
                "size_unit": row["size_unit"],
                "measure_type": row["measure_type"],
                "line_total": row["line_total"],
                "unit_price": row["unit_price"],
                "matched_discount_amount": row.get("matched_discount_amount", ""),
                "store_name": order_row.get("store_name", ""),
                "store_number": order_row.get("store_number", ""),
                "store_city": order_row.get("store_city", ""),
                "store_state": order_row.get("store_state", ""),
                "is_discount_line": row["is_discount_line"],
                "is_coupon_line": row["is_coupon_line"],
                "is_fee": row["is_fee"],
                "raw_order_path": row["raw_order_path"],
                # Spread last, so any metric key that collides with a column
                # above replaces it.
                **metrics,
            }
        )
    return purchase_rows, sorted(link_lookup.values(), key=lambda row: row["normalized_item_id"])
def build_comparison_examples(purchase_rows):
    """Build the banana price-per-lb comparison example, if data allows.

    Only rows named "BANANA" that carry a catalog id and a ``price_per_lb``
    are considered. The last such row per retailer (in input order) is used.

    Returns:
        list: A single-element list with the comparison dict, or an empty
        list when either retailer has no qualifying row.
    """
    giant_pick = None
    costco_pick = None
    for purchase in purchase_rows:
        is_linked_banana = purchase.get("normalized_item_name") == "BANANA" and purchase.get("catalog_id")
        if not is_linked_banana:
            continue
        retailer = purchase["retailer"]
        if retailer == "giant" and purchase.get("price_per_lb"):
            giant_pick = purchase
        if retailer == "costco" and purchase.get("price_per_lb"):
            costco_pick = purchase

    if not (giant_pick and costco_pick):
        return []

    example = {
        "example_name": "banana_price_per_lb",
        "catalog_id": giant_pick["catalog_id"],
    }
    for prefix, picked in (("giant", giant_pick), ("costco", costco_pick)):
        example[f"{prefix}_purchase_date"] = picked["purchase_date"]
        example[f"{prefix}_raw_item_name"] = picked["raw_item_name"]
        example[f"{prefix}_price_per_lb"] = picked["price_per_lb"]
    example["notes"] = "Example comparison using normalized price_per_lb across Giant and Costco"
    return [example]
@click.command()
@click.option("--giant-items-enriched-csv", default="data/giant-web/normalized_items.csv", show_default=True)
@click.option("--costco-items-enriched-csv", default="data/costco-web/normalized_items.csv", show_default=True)
@click.option("--giant-orders-csv", default="data/giant-web/collected_orders.csv", show_default=True)
@click.option("--costco-orders-csv", default="data/costco-web/collected_orders.csv", show_default=True)
@click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True)
@click.option("--catalog-csv", default="data/review/catalog.csv", show_default=True)
@click.option("--links-csv", default="data/review/product_links.csv", show_default=True)
@click.option("--output-csv", default="data/analysis/purchases.csv", show_default=True)
@click.option("--examples-csv", default="data/analysis/comparison_examples.csv", show_default=True)
def main(
    giant_items_enriched_csv,
    costco_items_enriched_csv,
    giant_orders_csv,
    costco_orders_csv,
    resolutions_csv,
    catalog_csv,
    links_csv,
    output_csv,
    examples_csv,
):
    # CLI entry point: read the review tables and normalized retailer data,
    # build purchase rows, then rewrite the catalog, links, purchases and
    # comparison-example CSVs.
    # (Kept as comments: click would surface a docstring as the --help text.)

    # Review-side inputs are read with read_optional_csv_rows; auto-linked
    # catalog rows are filtered out before the catalog is merged.
    resolution_rows = read_optional_csv_rows(resolutions_csv)
    reviewed_catalog_rows = [
        row for row in read_optional_csv_rows(catalog_csv) if is_review_first_catalog_row(row)
    ]
    catalog_rows = merge_catalog_rows(reviewed_catalog_rows, [])
    existing_links = [normalize_link_row(row) for row in read_optional_csv_rows(links_csv)]

    # Retailer inputs are read with read_csv_rows.
    giant_items = read_csv_rows(giant_items_enriched_csv)
    costco_items = read_csv_rows(costco_items_enriched_csv)
    giant_orders = read_csv_rows(giant_orders_csv)
    costco_orders = read_csv_rows(costco_orders_csv)

    purchase_rows, link_rows = build_purchase_rows(
        giant_items,
        costco_items,
        giant_orders,
        costco_orders,
        resolution_rows,
        existing_links,
        catalog_rows,
    )
    example_rows = build_comparison_examples(purchase_rows)

    outputs = (
        (catalog_csv, catalog_rows, CATALOG_FIELDS),
        (links_csv, link_rows, PRODUCT_LINK_FIELDS),
        (output_csv, purchase_rows, PURCHASE_FIELDS),
        (examples_csv, example_rows, EXAMPLE_FIELDS),
    )
    for destination, rows, fieldnames in outputs:
        write_csv_rows(destination, rows, fieldnames)

    click.echo(
        f"wrote {len(purchase_rows)} purchase rows to {output_csv}, "
        f"{len(catalog_rows)} catalog rows to {catalog_csv}, "
        f"{len(link_rows)} product links to {links_csv}, "
        f"and {len(example_rows)} comparison examples to {examples_csv}"
    )


if __name__ == "__main__":
    main()

65
collect_costco_web.py Normal file
View File

@@ -0,0 +1,65 @@
import click
import scrape_costco
@click.command()
@click.option(
    "--outdir",
    default="data/costco-web",
    show_default=True,
    help="Directory for Costco raw and collected outputs.",
)
@click.option(
    "--document-type",
    default="all",
    show_default=True,
    help="Summary document type.",
)
@click.option(
    "--document-sub-type",
    default="all",
    show_default=True,
    help="Summary document sub type.",
)
@click.option(
    "--window-days",
    default=92,
    show_default=True,
    type=int,
    help="Maximum number of days to request per summary window.",
)
@click.option(
    "--months-back",
    default=36,
    show_default=True,
    type=int,
    help="How many months of receipts to enumerate back from today.",
)
@click.option(
    "--firefox-profile-dir",
    default=None,
    help="Firefox profile directory to use for cookies and session storage.",
)
def main(
    outdir,
    document_type,
    document_sub_type,
    window_days,
    months_back,
    firefox_profile_dir,
):
    # Thin CLI wrapper: forwards every option to scrape_costco.run_collection
    # and pins the collected_* output file names.
    # (Kept as comments: click would surface a docstring as the --help text.)
    collection_options = {
        "outdir": outdir,
        "document_type": document_type,
        "document_sub_type": document_sub_type,
        "window_days": window_days,
        "months_back": months_back,
        "firefox_profile_dir": firefox_profile_dir,
        "orders_filename": "collected_orders.csv",
        "items_filename": "collected_items.csv",
    }
    scrape_costco.run_collection(**collection_options)


if __name__ == "__main__":
    main()

34
collect_giant_web.py Normal file
View File

@@ -0,0 +1,34 @@
import click
import scrape_giant
@click.command()
@click.option("--user-id", default=None, help="Giant user id.")
@click.option("--loyalty", default=None, help="Giant loyalty number.")
@click.option(
    "--outdir",
    default="data/giant-web",
    show_default=True,
    help="Directory for raw json and collected csv outputs.",
)
@click.option(
    "--sleep-seconds",
    default=1.5,
    show_default=True,
    type=float,
    help="Delay between order detail requests.",
)
def main(user_id, loyalty, outdir, sleep_seconds):
    # Thin CLI wrapper: forwards the options positionally to
    # scrape_giant.run_collection and pins the collected_* output file names.
    # (Kept as comments: click would surface a docstring as the --help text.)
    output_names = {
        "orders_filename": "collected_orders.csv",
        "items_filename": "collected_items.csv",
    }
    scrape_giant.run_collection(user_id, loyalty, outdir, sleep_seconds, **output_names)


if __name__ == "__main__":
    main()

379
enrich_costco.py Normal file
View File

@@ -0,0 +1,379 @@
import csv
import json
import re
from collections import defaultdict
from pathlib import Path
import click
from enrich_giant import (
OUTPUT_FIELDS,
derive_normalized_quantity,
derive_price_fields,
format_decimal,
normalization_identity,
normalize_number,
normalize_unit,
normalize_whitespace,
singularize_tokens,
to_decimal,
)
# Written to every output row as "parse_version".
PARSER_VERSION = "costco-enrich-v1"
# Retailer tag used in row ids and the normalization identity.
RETAILER = "costco"
# Defaults for the legacy --input-dir / --output-csv options of main().
DEFAULT_INPUT_DIR = Path("costco_output/raw")
DEFAULT_OUTPUT_CSV = Path("costco_output/items_enriched.csv")
# Code-like tokens (SL<n>, T<n>H<n>, P<n>[/<n>], W<n>T<n>H<n>, FY<n>, CSPC#,
# C<n>T<n>H<n>, EC<n>T<n>H<n>, <n>X<n>) that clean_costco_name blanks out.
# NOTE(review): presumably pallet/logistics codes - confirm against raw receipts.
CODE_TOKEN_RE = re.compile(
r"\b(?:SL\d+|T\d+H\d+|P\d+(?:/\d+)?|W\d+T\d+H\d+|FY\d+|CSPC#|C\d+T\d+H\d+|EC\d+T\d+H\d+|\d+X\d+)\b"
)
# "<pack>/<size> <unit>" such as "2/24 OZ": group 1 = pack count, group 2 = size,
# group 3 = unit.
PACK_FRACTION_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*/\s*(\d+(?:\.\d+)?)\s*(OZ|LB|LBS|CT)\b")
# "<number>#" followed by whitespace or end of string; parse_costco_size_and_pack
# reads the number as pounds.
HASH_SIZE_RE = re.compile(r"(?<![A-Z0-9])(\d+(?:\.\d+)?)#(?=\s|$)")
# "#" followed by word characters; stripped from the normalized name.
ITEM_CODE_RE = re.compile(r"#\w+\b")
# Two weights separated by a slash, e.g. "1.36 KG / 3 LB".
DUAL_WEIGHT_RE = re.compile(
r"\b\d+(?:\.\d+)?\s*(?:KG|G|LB|LBS|OZ)\s*/\s*\d+(?:\.\d+)?\s*(?:KG|G|LB|LBS|OZ)\b"
)
# Slash-separated T<n>/H<n>[/P<n>] or H<n>/P<n> groups; stripped from the name.
LOGISTICS_SLASH_RE = re.compile(r"\b(?:T\d+/H\d+(?:/P\d+)?/?|H\d+/P\d+/?|T\d+/H\d+/?)\b")
# "<n>-PACK" and "<n> PACK" / "<n>PACK"; group 1 is the pack count.
PACK_DASH_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*-\s*PACK\b")
PACK_WORD_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*PACK\b")
# "<number> <unit>" sizes; group 1 = value, group 2 = unit as written.
SIZE_RE = re.compile(
r"(?<![A-Z0-9])(\d+(?:\.\d+)?)\s*(OZ|LB|LBS|CT|KG|G|QT|QTS|PT|PTS|GAL|GALS|FL OZ|FLOZ)\b"
)
# Leading "/<digits>" on a discount line; group 1 is used by discount_target_id
# as the item number the discount applies to.
DISCOUNT_TARGET_RE = re.compile(r"^/\s*(\d+)\b")
def clean_costco_name(name):
    """Upper-case a raw Costco description and strip noise from it.

    Removes double quotes, tokens matched by ``CODE_TOKEN_RE`` and any
    "/ <number> KG|G" metric suffix, then collapses whitespace.
    """
    upper = normalize_whitespace(name).upper()
    without_quotes = upper.replace('"', "")
    without_codes = CODE_TOKEN_RE.sub(" ", without_quotes)
    without_metric = re.sub(r"\s*/\s*\d+(?:\.\d+)?\s*(KG|G)\b", " ", without_codes)
    return normalize_whitespace(without_metric)
def combine_description(item):
    """Join the item's two description fields into one whitespace-normalized string.

    Falsy fields (missing, None, empty) are skipped.
    """
    fragments = []
    for field in ("itemDescription01", "itemDescription02"):
        value = item.get(field)
        if value:
            fragments.append(str(value).strip())
    return normalize_whitespace(" ".join(fragments))
def _costco_size_unit(raw_unit):
    """Map a matched unit token to its normalized name ("CT" becomes "count")."""
    return "count" if raw_unit == "CT" else normalize_unit(raw_unit)


def parse_costco_size_and_pack(cleaned_name):
    """Extract size value, size unit and pack quantity from a cleaned name.

    Precedence:
      1. A pack fraction ("2/24 OZ") supplies all three values at once.
      2. Otherwise a "<n>#" token sets the size in pounds, a "<n>-PACK" /
         "<n> PACK" token sets the pack quantity, and the last "<n> <unit>"
         match, when present, overrides the size.

    Args:
        cleaned_name: Upper-cased, whitespace-normalized item description.

    Returns:
        tuple: ``(size_value, size_unit, pack_qty)`` as strings; values that
        could not be parsed are "".
    """
    fraction = PACK_FRACTION_RE.search(cleaned_name)
    if fraction:
        # Use the same unit mapping as the generic size branch below so that
        # "CT" is reported as "count" here too (it used to come out as "ct").
        return (
            normalize_number(fraction.group(2)),
            _costco_size_unit(fraction.group(3)),
            normalize_number(fraction.group(1)),
        )

    size_value = ""
    size_unit = ""
    pack_qty = ""

    hash_size = HASH_SIZE_RE.search(cleaned_name)
    if hash_size:
        size_value = normalize_number(hash_size.group(1))
        size_unit = "lb"

    pack = PACK_DASH_RE.search(cleaned_name) or PACK_WORD_RE.search(cleaned_name)
    if pack:
        pack_qty = normalize_number(pack.group(1))

    sizes = list(SIZE_RE.finditer(cleaned_name))
    if sizes:
        # The last explicit size wins, including over a "#"-derived weight.
        last = sizes[-1]
        size_value = normalize_number(last.group(1))
        size_unit = _costco_size_unit(last.group(2))

    return size_value, size_unit, pack_qty
def normalize_costco_name(cleaned_name):
    """Derive the normalized item name and parsed attributes for a Costco item.

    Args:
        cleaned_name: Output of ``clean_costco_name``.

    Returns:
        tuple: ``(item_name_norm, brand, size_value, size_unit, pack_qty)``.
        ``brand`` is "KS" when the name starts with "KS ", otherwise "".
    """
    brand = ""
    base = cleaned_name
    if base.startswith("KS "):
        brand = "KS"
        base = normalize_whitespace(base[3:])

    size_value, size_unit, pack_qty = parse_costco_size_and_pack(base)
    if size_value and size_unit:
        # Strip both notations. Previously the plain size token was removed
        # only when no pack quantity was found, so a name such as
        # "6-PACK 12 OZ" kept "12 OZ" in the normalized name.
        base = PACK_FRACTION_RE.sub(" ", base)
        base = SIZE_RE.sub(" ", base)

    for pattern in (
        DUAL_WEIGHT_RE,
        HASH_SIZE_RE,
        ITEM_CODE_RE,
        LOGISTICS_SLASH_RE,
        PACK_DASH_RE,
        PACK_WORD_RE,
    ):
        base = pattern.sub(" ", base)
    base = normalize_whitespace(base)

    # Substring check on the whole name, evaluated once before filtering.
    mentions_jif = "JIF" in base
    tokens = []
    for token in base.split():
        if token in {"/", "-", "ORG"}:
            continue
        # For JIF items the words PEANUT and BUTTER are dropped from the name.
        if mentions_jif and token in {"PEANUT", "BUTTER"}:
            continue
        tokens.append(token)

    base = singularize_tokens(" ".join(tokens))
    return normalize_whitespace(base), brand, size_value, size_unit, pack_qty
def guess_measure_type(size_unit, pack_qty, is_discount_line):
    """Classify a Costco line as "weight", "volume", "count" or "each".

    Discount lines are always "each". Otherwise the size unit decides first,
    then a "count" unit or any pack quantity yields "count".
    """
    unit_families = (
        ("weight", {"lb", "oz", "g", "kg"}),
        ("volume", {"ml", "l", "qt", "pt", "gal", "fl_oz"}),
    )
    if not is_discount_line:
        for family, units in unit_families:
            if size_unit in units:
                return family
        if size_unit == "count" or pack_qty:
            return "count"
    return "each"
def derive_costco_prices(item, measure_type, size_value, size_unit, pack_qty):
    """Compute per-each, per-lb and per-oz prices for a Costco receipt item.

    Returns:
        tuple: ``(price_per_each, price_per_lb, price_per_oz)`` as formatted
        strings; a price that cannot be derived is "".
    """
    amount = to_decimal(item.get("amount"))
    units_bought = to_decimal(item.get("unit"))
    size = to_decimal(size_value)
    pack = to_decimal(pack_qty) or 1

    each_price = ""
    lb_price = ""
    oz_price = ""
    if amount is None:
        return each_price, lb_price, oz_price

    if measure_type in {"each", "count"} and units_bought not in (None, 0):
        each_price = format_decimal(amount / units_bought)

    if size in (None, 0):
        return each_price, lb_price, oz_price

    # A missing or zero purchased quantity is treated as a single unit.
    total_units = size * pack * (units_bought or 1)
    if size_unit == "lb":
        per_lb = amount / total_units
        lb_price = format_decimal(per_lb)
        oz_price = format_decimal(per_lb / 16)
    elif size_unit == "oz":
        per_oz = amount / total_units
        oz_price = format_decimal(per_oz)
        lb_price = format_decimal(per_oz * 16)
    return each_price, lb_price, oz_price
def is_discount_item(item):
    """Return True when the receipt item looks like a discount line.

    That is: a negative amount, a negative unit count, or a description
    starting with "/".
    """
    if (to_decimal(item.get("amount")) or 0) < 0:
        return True
    if (to_decimal(item.get("unit")) or 0) < 0:
        return True
    return combine_description(item).startswith("/")
def discount_target_id(raw_name):
    """Return the digits following a leading "/" in a discount name, or ""."""
    found = DISCOUNT_TARGET_RE.match(normalize_whitespace(raw_name))
    return found.group(1) if found else ""
def _costco_text(value):
    """Render a receipt JSON value as CSV text, mapping None to ""."""
    return "" if value is None else str(value)


def parse_costco_item(order_id, order_date, raw_path, line_no, item):
    """Turn one Costco receipt item into an enriched output row.

    Args:
        order_id: Receipt identifier; stored as text and used in the row id.
        order_date: Receipt date value; whitespace-normalized on output.
        raw_path: ``Path`` of the raw JSON file the item came from.
        line_no: 1-based position of the item within the receipt.
        item: Mapping holding the raw receipt item fields.

    Returns:
        dict: One row whose values are strings (columns follow
        ``OUTPUT_FIELDS``).
    """
    raw_name = combine_description(item)
    cleaned_name = clean_costco_name(raw_name)
    item_name_norm, brand_guess, size_value, size_unit, pack_qty = normalize_costco_name(
        cleaned_name
    )
    is_discount_line = is_discount_item(item)
    is_coupon_line = "true" if raw_name.startswith("/") else "false"
    measure_type = guess_measure_type(size_unit, pack_qty, is_discount_line)
    price_per_each, price_per_lb, price_per_oz = derive_costco_prices(
        item, measure_type, size_value, size_unit, pack_qty
    )

    # JSON nulls must become "" rather than the literal text "None": a
    # retailer_item_id of "None" is truthy and would give every such item the
    # same normalization identity.
    item_number = _costco_text(item.get("itemNumber"))
    amount_text = _costco_text(item.get("amount"))
    qty_text = _costco_text(item.get("unit"))

    normalized_row_id = f"{RETAILER}:{order_id}:{line_no}"
    normalized_quantity, normalized_quantity_unit = derive_normalized_quantity(
        item.get("unit"),
        size_value,
        size_unit,
        pack_qty,
        measure_type,
        "",
    )
    identity_key, normalization_basis = normalization_identity(
        {
            "retailer": RETAILER,
            "normalized_row_id": normalized_row_id,
            "upc": "",
            "retailer_item_id": item_number,
            "item_name_norm": item_name_norm,
            "size_value": size_value,
            "size_unit": size_unit,
            "pack_qty": pack_qty,
        }
    )
    price_fields = derive_price_fields(
        price_per_each,
        price_per_lb,
        price_per_oz,
        amount_text,
        qty_text,
        pack_qty,
    )
    return {
        "retailer": RETAILER,
        "order_id": str(order_id),
        "line_no": str(line_no),
        "normalized_row_id": normalized_row_id,
        "normalized_item_id": f"cnorm:{identity_key}",
        "normalization_basis": normalization_basis,
        "observed_item_key": normalized_row_id,
        "order_date": normalize_whitespace(order_date),
        "retailer_item_id": item_number,
        "pod_id": "",
        "item_name": raw_name,
        "upc": "",
        "category_id": _costco_text(item.get("itemDepartmentNumber")),
        "category": _costco_text(item.get("transDepartmentNumber")),
        "qty": qty_text,
        "unit": _costco_text(item.get("itemIdentifier")),
        "unit_price": _costco_text(item.get("itemUnitPriceAmount")),
        "line_total": amount_text,
        "picked_weight": "",
        "mvp_savings": "",
        "reward_savings": "",
        # The amount is recorded as coupon savings on discount lines and as
        # the net total on ordinary lines, never both.
        "coupon_savings": amount_text if is_discount_line else "",
        "coupon_price": "",
        "matched_discount_amount": "",
        "net_line_total": amount_text if not is_discount_line else "",
        "image_url": "",
        "raw_order_path": raw_path.as_posix(),
        "item_name_norm": item_name_norm,
        "brand_guess": brand_guess,
        "variant": "",
        "size_value": size_value,
        "size_unit": size_unit,
        "pack_qty": pack_qty,
        "measure_type": measure_type,
        "normalized_quantity": normalized_quantity,
        "normalized_quantity_unit": normalized_quantity_unit,
        "is_store_brand": "true" if brand_guess else "false",
        "is_item": "false" if is_discount_line else "true",
        "is_fee": "false",
        "is_discount_line": "true" if is_discount_line else "false",
        "is_coupon_line": is_coupon_line,
        **price_fields,
        "parse_version": PARSER_VERSION,
        "parse_notes": "",
    }
def match_costco_discounts(rows):
    """Attach discount lines to the purchase line they target, in place.

    Within each order, a discount whose name starts with "/<item number>" is
    applied only when exactly one non-discount row has that retailer item id.
    The purchase row then gets an accumulated ``matched_discount_amount`` and
    a recomputed ``net_line_total``; both rows receive a parse note. Discounts
    without exactly one candidate are annotated as unmatched.
    """

    def add_note(target_row, note):
        # Append to the ";"-separated parse_notes without leaving stray ";".
        combined = f"{target_row.get('parse_notes', '')};{note}"
        target_row["parse_notes"] = normalize_whitespace(combined).strip(";")

    by_order = defaultdict(list)
    for row in rows:
        by_order[row["order_id"]].append(row)

    for order_rows in by_order.values():
        purchases_by_item = defaultdict(list)
        discount_rows = []
        for row in order_rows:
            if row.get("is_discount_line") == "true":
                discount_rows.append(row)
                continue
            item_id = row.get("retailer_item_id", "")
            if item_id:
                purchases_by_item[item_id].append(row)

        for discount in discount_rows:
            target_id = discount_target_id(discount.get("item_name", ""))
            if not target_id:
                continue
            candidates = purchases_by_item.get(target_id, [])
            if len(candidates) != 1:
                add_note(discount, f"discount_target_unmatched={target_id}")
                continue

            purchase = candidates[0]
            discount_amount = to_decimal(discount.get("line_total"))
            gross_total = to_decimal(purchase.get("line_total"))
            already_matched = to_decimal(purchase.get("matched_discount_amount")) or 0
            if discount_amount is None or gross_total is None:
                continue

            # The discount amount is added to the gross total as-is.
            running_discount = already_matched + discount_amount
            purchase["matched_discount_amount"] = format_decimal(running_discount)
            purchase["net_line_total"] = format_decimal(gross_total + running_discount)
            add_note(purchase, f"matched_discount={target_id}")
            add_note(discount, f"matched_to_item={target_id}")
def iter_costco_rows(raw_dir):
    """Yield one enriched row per item of every receipt found under ``raw_dir``.

    Summary files and payloads that are not JSON objects are skipped. Line
    numbers restart at 1 for each receipt.

    Args:
        raw_dir: Directory handed to ``discover_json_files``.

    Yields:
        dict: Rows produced by ``parse_costco_item``.
    """
    for path in discover_json_files(raw_dir):
        if path.name in {"summary.json", "summary_requests.json"}:
            continue
        payload = json.loads(path.read_text(encoding="utf-8"))
        if not isinstance(payload, dict):
            continue
        # ``.get(key, {})`` still returns None when the key is present with a
        # JSON null (e.g. {"data": null}), so fall back explicitly at every
        # level instead of raising AttributeError/TypeError.
        data = payload.get("data") or {}
        receipts_with_counts = data.get("receiptsWithCounts") or {}
        receipts = receipts_with_counts.get("receipts") or []
        for receipt in receipts:
            order_id = receipt["transactionBarcode"]
            order_date = receipt.get("transactionDate", "")
            items = receipt.get("itemArray") or []
            for line_no, item in enumerate(items, start=1):
                yield parse_costco_item(order_id, order_date, path, line_no, item)
def discover_json_files(raw_dir):
    """Return the sorted ``*.json`` files in ``raw_dir``.

    When ``raw_dir`` has none, is itself named "raw" and its parent exists,
    the parent's ``*.json`` files are returned instead.
    """
    base = Path(raw_dir)
    direct = sorted(base.glob("*.json"))
    if not direct and base.name == "raw" and base.parent.exists():
        return sorted(base.parent.glob("*.json"))
    return direct
def build_items_enriched(raw_dir):
    """Parse all Costco receipts, match discounts, and return sorted rows.

    Rows are ordered by order date, order id and numeric line number.
    """

    def order_key(row):
        return (row["order_date"], row["order_id"], int(row["line_no"]))

    enriched = []
    enriched.extend(iter_costco_rows(raw_dir))
    match_costco_discounts(enriched)
    return sorted(enriched, key=order_key)
def write_csv(path, rows):
    """Write ``rows`` to ``path`` as CSV with the ``OUTPUT_FIELDS`` header.

    Missing parent directories are created; an existing file is overwritten.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=OUTPUT_FIELDS)
        writer.writeheader()
        for row in rows:
            writer.writerow(row)
@click.command()
@click.option(
    "--input-dir",
    default=str(DEFAULT_INPUT_DIR),
    show_default=True,
    help="Directory containing Costco raw order json files.",
)
@click.option(
    "--output-csv",
    default=str(DEFAULT_OUTPUT_CSV),
    show_default=True,
    help="CSV path for enriched Costco item rows.",
)
def main(input_dir, output_csv):
    # Legacy CLI: enrich every raw Costco receipt and write one CSV.
    # (Kept as comments: click would surface a docstring as the --help text.)
    click.echo("legacy entrypoint: prefer normalize_costco_web.py for data-model outputs")
    enriched = build_items_enriched(Path(input_dir))
    write_csv(Path(output_csv), enriched)
    row_count = len(enriched)
    click.echo(f"wrote {row_count} rows to {output_csv}")


if __name__ == "__main__":
    main()

579
enrich_giant.py Normal file
View File

@@ -0,0 +1,579 @@
import csv
import json
import re
from decimal import Decimal, InvalidOperation, ROUND_HALF_UP
from pathlib import Path
import click
# Written to every output row as "parse_version".
PARSER_VERSION = "giant-enrich-v1"
# Retailer tag used in row ids and the normalization identity.
RETAILER = "giant"
# Defaults for the legacy --input-dir / --output-csv options of main().
DEFAULT_INPUT_DIR = Path("giant_output/raw")
DEFAULT_OUTPUT_CSV = Path("giant_output/items_enriched.csv")
# Column order of the enriched-items CSV; passed to csv.DictWriter as
# fieldnames by write_csv. enrich_costco imports this list as well.
OUTPUT_FIELDS = [
"retailer",
"order_id",
"line_no",
"normalized_row_id",
"normalized_item_id",
"normalization_basis",
"observed_item_key",
"order_date",
"retailer_item_id",
"pod_id",
"item_name",
"upc",
"category_id",
"category",
"qty",
"unit",
"unit_price",
"line_total",
"picked_weight",
"mvp_savings",
"reward_savings",
"coupon_savings",
"coupon_price",
"matched_discount_amount",
"net_line_total",
"image_url",
"raw_order_path",
"item_name_norm",
"brand_guess",
"variant",
"size_value",
"size_unit",
"pack_qty",
"measure_type",
"normalized_quantity",
"normalized_quantity_unit",
"is_store_brand",
"is_item",
"is_fee",
"is_discount_line",
"is_coupon_line",
"price_per_each",
"price_per_each_basis",
"price_per_count",
"price_per_count_basis",
"price_per_lb",
"price_per_lb_basis",
"price_per_oz",
"price_per_oz_basis",
"parse_version",
"parse_notes",
]
# Leading name token -> brand value returned by extract_store_brand_prefix.
STORE_BRAND_PREFIXES = {
"SB": "SB",
"NP": "NP",
}
# Expanded tokens that normalize_item_name leaves out of the normalized name.
DROP_TOKENS = {"FRESH"}
# Receipt-name token -> expanded token, looked up by expand_token (tokens not
# listed pass through unchanged). Several entries map a word to itself.
# Note that "FRSH" expands to "FRESH", which DROP_TOKENS then removes.
ABBREVIATIONS = {
"APPLE": "APPLE",
"APPLES": "APPLES",
"APLE": "APPLE",
"BASIL": "BASIL",
"BLK": "BLACK",
"BNLS": "BONELESS",
"BRWN": "BROWN",
"CARROTS": "CARROTS",
"CHDR": "CHEDDAR",
"CHICKEN": "CHICKEN",
"CHOC": "CHOCOLATE",
"CHS": "CHEESE",
"CHSE": "CHEESE",
"CHZ": "CHEESE",
"CILANTRO": "CILANTRO",
"CKI": "COOKIE",
"CRSHD": "CRUSHED",
"FLR": "FLOUR",
"FRSH": "FRESH",
"GALA": "GALA",
"GRAHM": "GRAHAM",
"HOT": "HOT",
"HRSRDSH": "HORSERADISH",
"IMP": "IMPORTED",
"IQF": "IQF",
"LENTILS": "LENTILS",
"LG": "LARGE",
"MLK": "MILK",
"MSTRD": "MUSTARD",
"ONION": "ONION",
"ORG": "ORGANIC",
"PEPPER": "PEPPER",
"PEPPERS": "PEPPERS",
"POT": "POTATO",
"POTATO": "POTATO",
"PPR": "PEPPER",
"RICOTTA": "RICOTTA",
"ROASTER": "ROASTER",
"ROTINI": "ROTINI",
"SCE": "SAUCE",
"SLC": "SLICED",
"SPINCH": "SPINACH",
"SPNC": "SPINACH",
"SPINACH": "SPINACH",
"SQZ": "SQUEEZE",
"SWT": "SWEET",
"THYME": "THYME",
"TOM": "TOMATO",
"TOMS": "TOMATOES",
"TRTL": "TORTILLA",
"VEG": "VEGETABLE",
"VINEGAR": "VINEGAR",
"WHT": "WHITE",
"WHOLE": "WHOLE",
"YLW": "YELLOW",
"YLWGLD": "YELLOW_GOLD",
}
# A cleaned name matching any of these is flagged as a fee by is_fee_item.
FEE_PATTERNS = [
re.compile(r"\bBAG CHARGE\b"),
re.compile(r"\bDISC AT TOTAL\b"),
]
# "<number><unit>" sizes with optional whitespace between; group 1 = value,
# group 2 = unit as written (mapped by normalize_unit).
SIZE_RE = re.compile(r"(?<![A-Z0-9])(\d+(?:\.\d+)?)(?:\s*)(OZ|Z|LB|LBS|ML|L|FZ|FL OZ|QT|PT|GAL|GA)\b")
# "<number><CT|PK|PKG|PACK>" pack counts; group 1 = count.
PACK_RE = re.compile(r"(?<![A-Z0-9])(\d+(?:\.\d+)?)(?:\s*)(CT|PK|PKG|PACK)\b")
def to_decimal(value):
    """Parse ``value`` into a finite ``Decimal``.

    Args:
        value: Anything whose ``str()`` form may be a decimal number.

    Returns:
        Decimal | None: The parsed number, or None when ``value`` is empty,
        None, not a number, or not finite.
    """
    if value in ("", None):
        return None
    try:
        parsed = Decimal(str(value))
    except (InvalidOperation, ValueError):
        return None
    # "NaN" and "Infinity" are accepted by Decimal() but raise
    # InvalidOperation later in ordering comparisons such as ``amount < 0``;
    # treat them as unparseable instead.
    if not parsed.is_finite():
        return None
    return parsed
def format_decimal(value, places=4):
    """Format a Decimal rounded half-up to ``places`` decimals.

    Trailing zeros are dropped and the result is plain fixed-point text
    (never scientific notation). None is rendered as "".
    """
    if value is None:
        return ""
    step = Decimal("1").scaleb(-places)
    rounded = value.quantize(step, rounding=ROUND_HALF_UP)
    return f"{rounded.normalize():f}"
def normalize_whitespace(value):
    """Return ``value`` as text with whitespace runs collapsed to single spaces.

    Falsy values (None, "", 0) become "".
    """
    text = str(value or "")
    return " ".join(text.split())
def clean_item_name(name):
    """Upper-case a raw Giant item name and strip markers from it.

    Removes a leading "+", then a leading "PLU#<digits>" code, and replaces
    every remaining "#" with a space before collapsing whitespace.
    """
    text = normalize_whitespace(name).upper()
    for leading_pattern in (r"^\+", r"^PLU#\d+\s*"):
        text = re.sub(leading_pattern, "", text)
    return normalize_whitespace(text.replace("#", " "))
def extract_store_brand_prefix(cleaned_name):
    """Return ``(prefix, brand)`` for a name that leads with a store-brand token.

    The name must equal the prefix or start with it followed by a space.
    Returns ``("", "")`` when no prefix applies.
    """
    hits = (
        (prefix, brand)
        for prefix, brand in STORE_BRAND_PREFIXES.items()
        if cleaned_name == prefix or cleaned_name.startswith(f"{prefix} ")
    )
    return next(hits, ("", ""))
def extract_image_url(item):
    """Return the item's image URL, or "" when none is available.

    A string ``image`` is returned as-is. For a dict, the first truthy value
    among "xlarge", "large", "medium", "small" is used.
    """
    image = item.get("image")
    if isinstance(image, str):
        return image
    if isinstance(image, dict):
        candidates = (image.get(size) for size in ("xlarge", "large", "medium", "small"))
        return next((url for url in candidates if url), "")
    return ""
def parse_size_and_pack(cleaned_name):
    """Extract ``(size_value, size_unit, pack_qty)`` from a cleaned item name.

    The last size match and the last pack match in the name are used; values
    that are not found are "".
    """
    size_value = size_unit = pack_qty = ""

    # Both patterns have exactly two capturing groups, so findall yields
    # (number, unit) tuples.
    sizes = SIZE_RE.findall(cleaned_name)
    if sizes:
        raw_value, raw_unit = sizes[-1]
        size_value = normalize_number(raw_value)
        size_unit = normalize_unit(raw_unit)

    packs = PACK_RE.findall(cleaned_name)
    if packs:
        pack_qty = normalize_number(packs[-1][0])

    return size_value, size_unit, pack_qty
def normalize_number(value):
    """Return ``value`` as fixed-point text without trailing zeros, or "" if unparseable."""
    parsed = to_decimal(value)
    return "" if parsed is None else f"{parsed.normalize():f}"
def normalize_unit(unit):
    """Map a unit token to its canonical lower-case name.

    Known aliases collapse to one spelling (for example "Z" and "OZ" both
    become "oz"); anything else is returned lower-cased.
    """
    token = normalize_whitespace(unit).upper()
    canonical_aliases = (
        ("oz", ("Z", "OZ")),
        ("fl_oz", ("FZ", "FL OZ", "FLOZ")),
        ("lb", ("LB", "LBS")),
        ("ml", ("ML",)),
        ("l", ("L",)),
        ("qt", ("QT", "QTS")),
        ("pt", ("PT", "PTS")),
        ("gal", ("GAL", "GALS", "GA")),
    )
    for canonical, aliases in canonical_aliases:
        if token in aliases:
            return canonical
    return token.lower()
def strip_measure_tokens(cleaned_name):
    """Remove size and pack tokens from the name and collapse whitespace."""
    remaining = cleaned_name
    for pattern in (SIZE_RE, PACK_RE):
        remaining = pattern.sub(" ", remaining)
    return normalize_whitespace(remaining)
def expand_token(token):
    """Return the expansion registered for ``token``, or the token itself."""
    return ABBREVIATIONS[token] if token in ABBREVIATIONS else token
def normalize_item_name(cleaned_name):
    """Build the normalized item name from a cleaned Giant name.

    Drops a store-brand prefix, removes size/pack tokens, expands
    abbreviations, discards tokens listed in ``DROP_TOKENS`` and finally
    singularizes known plurals.
    """
    prefix = extract_store_brand_prefix(cleaned_name)[0]
    remainder = normalize_whitespace(cleaned_name[len(prefix):]) if prefix else cleaned_name
    words = (expand_token(word) for word in strip_measure_tokens(remainder).split())
    kept = [word for word in words if word not in DROP_TOKENS and word]
    return singularize_tokens(normalize_whitespace(" ".join(kept)))
def singularize_tokens(text):
    """Replace known plural tokens in ``text`` with their singular form."""
    singular_map = {
        "APPLES": "APPLE",
        "BANANAS": "BANANA",
        "BERRIES": "BERRY",
        "EGGS": "EGG",
        "LEMONS": "LEMON",
        "LIMES": "LIME",
        "MANDARINS": "MANDARIN",
        "PEPPERS": "PEPPER",
        "STRAWBERRIES": "STRAWBERRY",
    }
    singular_words = []
    for word in text.split():
        singular_words.append(singular_map.get(word, word))
    return normalize_whitespace(" ".join(singular_words))
def guess_measure_type(item, size_unit, pack_qty):
    """Classify a Giant item as "weight", "volume", "count", "each" or "".

    Checked in order: the item's lb/each code or a positive picked weight,
    a weight size unit, a volume size unit, a pack quantity, and finally an
    "EA" code or a positive shipped quantity.
    """
    sold_by = normalize_whitespace(item.get("lbEachCd")).upper()
    weight_picked = to_decimal(item.get("totalPickedWeight"))
    shipped = to_decimal(item.get("shipQy"))

    if (
        sold_by == "LB"
        or (weight_picked is not None and weight_picked > 0 and sold_by != "EA")
        or size_unit in {"lb", "oz"}
    ):
        return "weight"
    if size_unit in {"ml", "l", "qt", "pt", "gal", "fl_oz"}:
        return "volume"
    if pack_qty:
        return "count"
    if sold_by == "EA" or (shipped is not None and shipped > 0):
        return "each"
    return ""
def is_fee_item(cleaned_name):
    """Return True when the cleaned name matches any fee pattern."""
    for pattern in FEE_PATTERNS:
        if pattern.search(cleaned_name):
            return True
    return False
def derive_prices(item, measure_type, size_value="", size_unit="", pack_qty=""):
    """Compute per-each, per-lb and per-oz prices for a Giant item.

    For weight items the picked weight is preferred; the parsed size times
    quantity and pack is the fallback.

    Returns:
        tuple: ``(price_per_each, price_per_lb, price_per_oz)`` as formatted
        strings; a price that cannot be derived is "".
    """
    zero = Decimal("0")
    ounces_per_pound = Decimal("16")

    shipped = to_decimal(item.get("shipQy"))
    amount = to_decimal(item.get("groceryAmount"))
    weight_picked = to_decimal(item.get("totalPickedWeight"))
    size = to_decimal(size_value)
    pack = to_decimal(pack_qty) or Decimal("1")

    each_price = ""
    lb_price = ""
    oz_price = ""
    if amount is None:
        return each_price, lb_price, oz_price

    if measure_type in ("each", "count") and shipped not in (None, zero):
        each_price = format_decimal(amount / shipped)

    if measure_type != "weight":
        return each_price, lb_price, oz_price

    if weight_picked not in (None, zero):
        per_lb = amount / weight_picked
        lb_price = format_decimal(per_lb)
        oz_price = format_decimal(per_lb / ounces_per_pound)
    elif size not in (None, zero) and shipped not in (None, zero):
        total_units = shipped * pack * size
        if size_unit == "lb":
            per_lb = amount / total_units
            lb_price = format_decimal(per_lb)
            oz_price = format_decimal(per_lb / ounces_per_pound)
        elif size_unit == "oz":
            per_oz = amount / total_units
            oz_price = format_decimal(per_oz)
            lb_price = format_decimal(per_oz * ounces_per_pound)
    return each_price, lb_price, oz_price
def derive_normalized_quantity(qty, size_value, size_unit, pack_qty, measure_type, picked_weight=""):
    """Return ``(quantity, unit)`` describing how much was bought on a line.

    Preference order: parsed size times quantity times pack (in the size
    unit), then picked weight in "lb" for weight items, then quantity times
    pack as "count", then the plain quantity as "each". Returns ``("", "")``
    when nothing applies.
    """
    zero = Decimal("0")
    bought = to_decimal(qty)
    size = to_decimal(size_value)
    pack = to_decimal(pack_qty)
    weight_picked = to_decimal(picked_weight)

    has_qty = bought not in (None, zero)
    multiplier = bought * (pack or Decimal("1")) if has_qty else None
    has_multiplier = multiplier not in (None, zero)

    if size not in (None, zero) and size_unit and has_multiplier:
        return format_decimal(size * multiplier), size_unit
    if measure_type == "weight" and weight_picked not in (None, zero):
        return format_decimal(weight_picked), "lb"
    if measure_type == "count" and has_multiplier:
        return format_decimal(multiplier), "count"
    if measure_type == "each" and has_qty:
        return format_decimal(bought), "each"
    return "", ""
def derive_price_fields(price_per_each, price_per_lb, price_per_oz, line_total, qty, pack_qty):
    """Assemble the price columns and their "basis" labels for an output row.

    ``price_per_count`` is the line total divided by quantity times pack
    quantity and is filled only when all three parse to usable numbers. Each
    basis column is "" whenever its price is empty.
    """
    zero = Decimal("0")
    total = to_decimal(line_total)
    units = to_decimal(qty)
    pack = to_decimal(pack_qty)

    per_count = ""
    per_count_basis = ""
    if total is not None and units not in (None, zero) and pack not in (None, zero):
        per_count = format_decimal(total / (units * pack))
        per_count_basis = "line_total_over_pack_qty"

    fields = {}
    fields["price_per_each"] = price_per_each
    fields["price_per_each_basis"] = "line_total_over_qty" if price_per_each else ""
    fields["price_per_count"] = per_count
    fields["price_per_count_basis"] = per_count_basis
    fields["price_per_lb"] = price_per_lb
    fields["price_per_lb_basis"] = "parsed_or_picked_weight" if price_per_lb else ""
    fields["price_per_oz"] = price_per_oz
    fields["price_per_oz_basis"] = "parsed_or_picked_weight" if price_per_oz else ""
    return fields
def normalization_identity(row):
    """Return ``(identity_key, basis)`` for grouping equivalent item rows.

    The first available of UPC, retailer item id, or normalized name with
    size/unit/pack is used; otherwise the row falls back to its own
    ``normalized_row_id``.
    """
    retailer_keyed = (
        ("upc", "upc", "exact_upc"),
        ("retailer_item_id", "retailer_item_id", "exact_retailer_item_id"),
    )
    for field, label, basis in retailer_keyed:
        if row.get(field):
            return f"{row['retailer']}|{label}={row[field]}", basis

    if not row.get("item_name_norm"):
        return row["normalized_row_id"], "row_identity"

    parts = [
        row["retailer"],
        f"name={row['item_name_norm']}",
        f"size={row.get('size_value', '')}",
        f"unit={row.get('size_unit', '')}",
        f"pack={row.get('pack_qty', '')}",
    ]
    return "|".join(parts), "exact_name_size_pack"
def parse_item(order_id, order_date, raw_path, line_no, item):
    """Turn one Giant order item into an enriched output row.

    Args:
        order_id: Order identifier; stored as text and used in the row id.
        order_date: Order date value; whitespace-normalized on output.
        raw_path: ``Path`` of the raw JSON file the item came from.
        line_no: 1-based position of the item within the order.
        item: Mapping holding the raw order item fields.

    Returns:
        dict: One row whose values are strings (columns follow
        ``OUTPUT_FIELDS``).
    """
    cleaned_name = clean_item_name(item.get("itemName", ""))
    size_value, size_unit, pack_qty = parse_size_and_pack(cleaned_name)
    prefix, brand_guess = extract_store_brand_prefix(cleaned_name)
    normalized_name = normalize_item_name(cleaned_name)
    measure_type = guess_measure_type(item, size_unit, pack_qty)
    price_per_each, price_per_lb, price_per_oz = derive_prices(
        item,
        measure_type,
        size_value=size_value,
        size_unit=size_unit,
        pack_qty=pack_qty,
    )
    is_fee = is_fee_item(cleaned_name)

    # Notes are collected as flags and joined with ";" in the output row.
    note_flags = (
        (prefix, f"store_brand_prefix={prefix}"),
        (is_fee, "fee_item"),
        (size_value and not size_unit, "size_without_unit"),
    )
    parse_notes = [note for active, note in note_flags if active]

    # Text forms of the raw fields that are used more than once below.
    upc = stringify(item.get("primUpcCd"))
    pod_id = stringify(item.get("podId"))
    shipped_qty = stringify(item.get("shipQy"))
    grocery_amount = stringify(item.get("groceryAmount"))

    normalized_row_id = f"{RETAILER}:{order_id}:{line_no}"
    normalized_quantity, normalized_quantity_unit = derive_normalized_quantity(
        item.get("shipQy"),
        size_value,
        size_unit,
        pack_qty,
        measure_type,
        item.get("totalPickedWeight"),
    )
    identity_key, normalization_basis = normalization_identity(
        {
            "retailer": RETAILER,
            "normalized_row_id": normalized_row_id,
            "upc": upc,
            "retailer_item_id": pod_id,
            "item_name_norm": normalized_name,
            "size_value": size_value,
            "size_unit": size_unit,
            "pack_qty": pack_qty,
        }
    )
    price_fields = derive_price_fields(
        price_per_each,
        price_per_lb,
        price_per_oz,
        grocery_amount,
        shipped_qty,
        pack_qty,
    )
    return {
        "retailer": RETAILER,
        "order_id": str(order_id),
        "line_no": str(line_no),
        "normalized_row_id": normalized_row_id,
        "normalized_item_id": f"gnorm:{identity_key}",
        "normalization_basis": normalization_basis,
        "observed_item_key": normalized_row_id,
        "order_date": normalize_whitespace(order_date),
        "retailer_item_id": pod_id,
        "pod_id": pod_id,
        "item_name": stringify(item.get("itemName")),
        "upc": upc,
        "category_id": stringify(item.get("categoryId")),
        "category": stringify(item.get("categoryDesc")),
        "qty": shipped_qty,
        "unit": stringify(item.get("lbEachCd")),
        "unit_price": stringify(item.get("unitPrice")),
        "line_total": grocery_amount,
        "picked_weight": stringify(item.get("totalPickedWeight")),
        "mvp_savings": stringify(item.get("mvpSavings")),
        "reward_savings": stringify(item.get("rewardSavings")),
        "coupon_savings": stringify(item.get("couponSavings")),
        "coupon_price": stringify(item.get("couponPrice")),
        "matched_discount_amount": "",
        "net_line_total": stringify(item.get("totalPrice")),
        "image_url": extract_image_url(item),
        "raw_order_path": raw_path.as_posix(),
        "item_name_norm": normalized_name,
        "brand_guess": brand_guess,
        "variant": "",
        "size_value": size_value,
        "size_unit": size_unit,
        "pack_qty": pack_qty,
        "measure_type": measure_type,
        "normalized_quantity": normalized_quantity,
        "normalized_quantity_unit": normalized_quantity_unit,
        "is_store_brand": "true" if bool(prefix) else "false",
        "is_item": "false" if is_fee else "true",
        "is_fee": "true" if is_fee else "false",
        "is_discount_line": "false",
        "is_coupon_line": "false",
        **price_fields,
        "parse_version": PARSER_VERSION,
        "parse_notes": ";".join(parse_notes),
    }
def stringify(value):
    """Return ``value`` rendered as text, with ``None`` mapped to ``""``."""
    return "" if value is None else str(value)
def iter_order_rows(raw_dir):
    """Yield one parsed row per line item found in the order files of ``raw_dir``.

    Every ``*.json`` file directly inside ``raw_dir`` is visited in sorted path
    order, except ``history.json``, which is skipped. Line numbers start at 1
    within each order file.
    """
    order_paths = (
        candidate
        for candidate in sorted(raw_dir.glob("*.json"))
        if candidate.name != "history.json"
    )
    for order_path in order_paths:
        order = json.loads(order_path.read_text(encoding="utf-8"))
        # Fall back to the file stem when the payload carries no order id.
        order_id = order.get("orderId", order_path.stem)
        order_date = order.get("orderDate", "")
        entries = order.get("items", [])
        for position, entry in enumerate(entries, start=1):
            yield parse_item(order_id, order_date, order_path, position, entry)
def build_items_enriched(raw_dir):
    """Return every parsed item row from ``raw_dir`` in a deterministic order.

    Rows are ordered by order date, then order id, then numeric line number.
    """

    def _row_order(row):
        # ``line_no`` is stored as text, so compare it numerically.
        return row["order_date"], row["order_id"], int(row["line_no"])

    return sorted(iter_order_rows(raw_dir), key=_row_order)
def write_csv(path, rows):
    """Write ``rows`` to ``path`` as UTF-8 CSV using the ``OUTPUT_FIELDS`` header.

    Missing parent directories of ``path`` are created first.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=OUTPUT_FIELDS)
        csv_writer.writeheader()
        for record in rows:
            csv_writer.writerow(record)
@click.command()
@click.option(
    "--input-dir",
    default=str(DEFAULT_INPUT_DIR),
    show_default=True,
    help="Directory containing Giant raw order json files.",
)
@click.option(
    "--output-csv",
    default=str(DEFAULT_OUTPUT_CSV),
    show_default=True,
    help="CSV path for enriched Giant item rows.",
)
def main(input_dir, output_csv):
    # Legacy CLI: parse the raw Giant order files under ``input_dir`` and write
    # the enriched rows to ``output_csv``.
    # Documented with comments on purpose: click would surface a docstring as
    # the command's --help text.
    click.echo("legacy entrypoint: prefer normalize_giant_web.py for data-model outputs")
    raw_dir = Path(input_dir)
    # Fail early with a CLI-friendly error instead of writing an empty CSV.
    if not raw_dir.exists():
        raise click.ClickException(f"input dir does not exist: {raw_dir}")
    output_path = Path(output_csv)
    rows = build_items_enriched(raw_dir)
    write_csv(output_path, rows)
    click.echo(f"wrote {len(rows)} rows to {output_path}")


if __name__ == "__main__":
    main()

54
layer_helpers.py Normal file
View File

@@ -0,0 +1,54 @@
import csv
import hashlib
from collections import Counter
from pathlib import Path
def read_csv_rows(path):
    """Load the UTF-8 CSV file at ``path`` and return its rows as a list of dicts."""
    with Path(path).open(newline="", encoding="utf-8") as source:
        reader = csv.DictReader(source)
        return [record for record in reader]
def write_csv_rows(path, rows, fieldnames):
    """Write ``rows`` to ``path`` as UTF-8 CSV with ``fieldnames`` as the header.

    ``path`` may be a string or a ``Path``; missing parent directories are
    created before the file is opened.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", newline="", encoding="utf-8") as sink:
        csv_writer = csv.DictWriter(sink, fieldnames=fieldnames)
        csv_writer.writeheader()
        for record in rows:
            csv_writer.writerow(record)
def stable_id(prefix, raw_key):
    """Build a deterministic id of the form ``<prefix>_<12 hex chars>``.

    The hex part is the first 12 characters of the SHA-1 hex digest of
    ``str(raw_key)`` encoded as UTF-8.
    """
    encoded_key = str(raw_key).encode("utf-8")
    digest = hashlib.sha1(encoded_key).hexdigest()[:12]
    return f"{prefix}_{digest}"
def first_nonblank(rows, field):
    """Return the first truthy ``field`` value among ``rows``, or ``""`` if none."""
    candidates = (row.get(field, "") for row in rows)
    return next((candidate for candidate in candidates if candidate), "")
def representative_value(rows, field):
    """Return the most frequent truthy ``field`` value among ``rows``.

    Ties are broken by taking the smallest value in sort order. Returns ``""``
    when no row has a truthy value for ``field``.
    """
    tally = Counter()
    for row in rows:
        candidate = row.get(field, "")
        if candidate:
            tally[candidate] += 1
    if not tally:
        return ""
    # Highest count first; equal counts fall back to ascending value order.
    ranked = sorted(tally, key=lambda candidate: (-tally[candidate], candidate))
    return ranked[0]
def distinct_values(rows, field):
    """Return the sorted list of unique truthy ``field`` values among ``rows``."""
    found = set()
    for row in rows:
        candidate = row.get(field, "")
        if candidate:
            found.add(candidate)
    return sorted(found)
def compact_join(values, limit=3):
    """Join the first ``limit`` distinct truthy ``values`` with ``" | "``.

    Duplicates are dropped while the order of first appearance is kept.
    """
    # dict.fromkeys keeps insertion order, giving an ordered de-duplication.
    ordered_unique = list(dict.fromkeys(value for value in values if value))
    return " | ".join(ordered_unique[:limit])

28
normalize_costco_web.py Normal file
View File

@@ -0,0 +1,28 @@
from pathlib import Path
import click
import enrich_costco
@click.command()
@click.option(
    "--input-dir",
    default="data/costco-web/raw",
    show_default=True,
    help="Directory containing Costco raw order json files.",
)
@click.option(
    "--output-csv",
    default="data/costco-web/normalized_items.csv",
    show_default=True,
    help="CSV path for normalized Costco item rows.",
)
def main(input_dir, output_csv):
    # CLI wrapper: build the Costco item rows via ``enrich_costco`` and write
    # them to ``output_csv``.
    # Documented with comments on purpose: click would surface a docstring as
    # the command's --help text.
    source_dir = Path(input_dir)
    rows = enrich_costco.build_items_enriched(source_dir)
    destination = Path(output_csv)
    enrich_costco.write_csv(destination, rows)
    click.echo(f"wrote {len(rows)} rows to {output_csv}")


if __name__ == "__main__":
    main()

28
normalize_giant_web.py Normal file
View File

@@ -0,0 +1,28 @@
from pathlib import Path
import click
import enrich_giant
@click.command()
@click.option(
    "--input-dir",
    default="data/giant-web/raw",
    show_default=True,
    help="Directory containing Giant raw order json files.",
)
@click.option(
    "--output-csv",
    default="data/giant-web/normalized_items.csv",
    show_default=True,
    help="CSV path for normalized Giant item rows.",
)
def main(input_dir, output_csv):
    # CLI wrapper: build the Giant item rows via ``enrich_giant`` and write
    # them to ``output_csv``.
    # Documented with comments on purpose: click would surface a docstring as
    # the command's --help text.
    source_dir = Path(input_dir)
    rows = enrich_giant.build_items_enriched(source_dir)
    destination = Path(output_csv)
    enrich_giant.write_csv(destination, rows)
    click.echo(f"wrote {len(rows)} rows to {output_csv}")


if __name__ == "__main__":
    main()

View File

@@ -1,100 +1,165 @@
* grocery data model and file layout * Grocery data model and file layout
This document defines the shared file layout and stable CSV schemas for the This document defines the shared file layout and stable CSV schemas for the
grocery pipeline. The goal is to keep retailer-specific ingest separate from grocery pipeline.
cross-retailer product modeling so Giant-specific quirks do not become the Goals:
system of record. - Ensure data gathering is separate from analysis
- Enable multiple data gathering methods
** design rules - One layer for review and analysis
** Design Rules
- Raw retailer exports remain the source of truth. - Raw retailer exports remain the source of truth.
- Retailer parsing is isolated to retailer-specific files and ids. - Retailer parsing is isolated to retailer-specific files and ids.
- Cross-retailer product layers begin only after retailer-specific enrichment. - Cross-retailer product layers begin only after retailer-specific normalization.
- CSV schemas are stable and additive: new columns may be appended, but - CSV schemas are stable and additive: new columns may be appended, but
existing columns should not be repurposed. existing columns should not be repurposed.
- Unknown values should be left blank rather than guessed. - Unknown values should be left blank rather than guessed.
** directory layout *** Retailer-specific data:
Use one top-level data root:
#+begin_example
data/
giant/
raw/
history.json
orders/
<order_id>.json
orders.csv
items_raw.csv
items_enriched.csv
products_observed.csv
costco/
raw/
...
orders.csv
items_raw.csv
items_enriched.csv
products_observed.csv
shared/
products_canonical.csv
product_links.csv
review_queue.csv
#+end_example
** layer responsibilities
- `data/<retailer>/raw/`
Stores unmodified retailer payloads exactly as fetched.
- `data/<retailer>/orders.csv`
One row per retailer order or visit, flattened from raw order data.
- `data/<retailer>/items_raw.csv`
One row per retailer line item, preserving retailer-native values needed for
reruns and debugging.
- `data/<retailer>/items_enriched.csv`
Parsed retailer line items with normalized fields and derived guesses, still
retailer-specific.
- `data/<retailer>/products_observed.csv`
Distinct retailer-facing observed products aggregated from enriched items.
- `data/shared/products_canonical.csv`
Cross-retailer canonical product entities used for comparison.
- `data/shared/product_links.csv`
Links from retailer observed products to canonical products.
- `data/shared/review_queue.csv`
Human review queue for unresolved or low-confidence matching/parsing cases.
** retailer-specific versus shared
Retailer-specific:
- raw json payloads - raw json payloads
- retailer order ids - retailer order ids
- retailer line numbers - retailer line numbers
- retailer category ids and names - retailer category ids and names
- retailer item names - retailer item names
- retailer image urls - retailer image urls
- parsed guesses derived from one retailer feed
- observed products scoped to one retailer
Shared:
- canonical products
- observed-to-canonical links
- human review state for unresolved cases
- comparison-ready normalized quantity basis fields - comparison-ready normalized quantity basis fields
Observed products are the boundary between retailer-specific parsing and *** Review/Combined data:
cross-retailer canonicalization. Nothing upstream of `products_observed.csv` - catalog of reviewed products
should require knowledge of another retailer. - links from normalized retailer items to catalog
- human review state for unresolved cases
** schema: `data/<retailer>/orders.csv`
One row per order or visit. * Pipeline
Each step can be run alone if its dependencies exist.
Each retail provider script must produce deterministic line-item outputs, and
normalization may assign within-retailer product identity only when the
retailer itself provides strong evidence.
| column | meaning | Key:
|- - (1) input
| `retailer` | retailer slug such as `giant` | - [1] output
| `order_id` | retailer order or visit id |
** 1. Collect
Get raw receipt/visit and item data from a retailer.
Scraping is unique to a Retailer and method (e.g., Giant-Web and Giant-Scan).
Preserve complete raw data and preserve fidelity.
Avoid interpretation beyond basic data flattening.
- (1) Source access (Varies, eg header data, auth for API access)
- [1] collected visits from each retailer
- [2] collected items from each retailer
- [3] any other raw data that supports [1] and [2]; explicit source (eventual receipt scan?)
** 2. Normalize
Parse and extract structured facts from retailer-specific raw data
to create a standardized item format for that retailer.
Strictly dependent on Collect method and output.
- Extract quantity, size, pack, pricing, variant
- Attach discount line items to product line items using upc/`retailer_item_id` and co-occurrence within the same order
- Cleanup naming to facilitate later matching
- Assign retailer-level `normalized_item_id` only when evidence is deterministic
- Never use fuzzy or semantic matching here
- (1) collected items from each retailer
- (2) collected visits from each retailer
- [1] normalized items from each retailer
** 3. Review/Combine (Canonicalization)
Decide whether two normalized retailer items are "the same product";
match items across retailers using algo/logic and human review.
Create catalog linked to normalized retailer items.
- Review operates on distinct `normalized_item_id` values, not individual purchase rows
- Cross-retailer identity decisions happen only here
- Asking human to create a canonical/catalog item with:
- friendly/catalog_name: "bell pepper"; "milk"
- category: "produce"; "dairy"
- product_type: "pepper"; "milk"
- ? variant? "whole"; "skim"; "2pct"
- Then link the group of items to that catalog item.
- (1) normalized items from each retailer
- [1] review queue of items to be reviewed
- [2] catalog (lookup table) of confirmed normalized retailer items and catalog_id
- [3] purchase list of normalized items, pivot-ready
** Unresolved Issues
1. need central script to orchestrate; metadata belongs there and nowhere else
2. `LIME` and `LIME . / .` appearing in the catalog: names must come from review-approved names, not raw strings
* Directory Layout
Use one top-level data root:
#+begin_example
main.py
collect_<retailer>_<method>.py
normalize_<retailer>_<method>.py
review.py
data/
<retailer-method>/
raw/ # unmodified retailer payloads exactly as fetched
<order_id.json>
collected_items.csv # one row per retailer line item w/ retailer-native values
collected_orders.csv # one row per receipt/visit, flattened from raw order data
normalized_items.csv # parsed retailer-specific line items with normalized fields
costco-web/ # sample
raw/
orders/
history.json
<order_id>.json
collected_items.csv
collected_orders.csv
normalized_items.csv
review/
review_queue.csv # Human review queue for unresolved matching/parsing cases.
product_links.csv # Links from normalized retailer items to catalog items.
catalog.csv # Cross-retailer product catalog entities used for comparison.
analysis/
purchases.csv
comparison_examples.csv
item_price_over_time.csv
spend_by_visit.csv
items_per_visit.csv
category_spend_over_time.csv
retailer_store_breakdown.csv
#+end_example
Notes:
- The current repo still uses transitional root-level scripts and output folders.
- This layout is the target structure for the refactor, not a claim that migration is already complete.
* Schemas
** `data/<retailer-method>/collected_items.csv`
One row per retailer line item.
| key | definition |
|--------------------+--------------------------------------------|
| `retailer` PK | retailer slug |
| `order_id` PK | retailer order id |
| `line_no` PK | stable line number within order export |
| `order_date` | copied from order when available |
| `retailer_item_id` | retailer-native item id when available |
| `pod_id` | retailer pod/item id |
| `item_name` | raw retailer item name |
| `upc` | retailer UPC or PLU value |
| `category_id` | retailer category id |
| `category` | retailer category description |
| `qty` | retailer quantity field |
| `unit` | retailer unit code such as `EA` or `LB` |
| `unit_price` | retailer unit price field |
| `line_total` | retailer extended price field |
| `picked_weight` | retailer picked weight field |
| `mvp_savings` | retailer savings field |
| `reward_savings` | retailer rewards savings field |
| `coupon_savings` | retailer coupon savings field |
| `coupon_price` | retailer coupon price field |
| `image_url` | raw retailer image url when present |
| `raw_order_path` | relative path to source order payload |
| `is_discount_line` | retailer adjustment or discount-line flag |
| `is_coupon_line` | coupon-like line flag when distinguishable |
** `data/<retailer-method>/collected_orders.csv`
One row per order/visit/receipt.
| key | definition |
|---------------------------+-------------------------------------------------|
| `retailer` PK | retailer slug such as `giant` |
| `order_id` PK | retailer order or visit id |
| `order_date` | order date in `YYYY-MM-DD` when available | | `order_date` | order date in `YYYY-MM-DD` when available |
| `delivery_date` | fulfillment date in `YYYY-MM-DD` when available | | `delivery_date` | fulfillment date in `YYYY-MM-DD` when available |
| `service_type` | retailer service type such as `INSTORE` | | `service_type` | retailer service type such as `INSTORE` |
@@ -115,159 +180,84 @@ One row per order or visit.
| `raw_history_path` | relative path to source history payload | | `raw_history_path` | relative path to source history payload |
| `raw_order_path` | relative path to source order payload | | `raw_order_path` | relative path to source order payload |
Primary key: ** `data/<retailer-method>/normalized_items.csv`
One row per retailer line item after deterministic parsing. Preserve raw
fields from `collected_items.csv` and add parsed fields that make later review
and grouping easier. Normalization may assign retailer-level identity when the
evidence is deterministic and retailer-scoped.
- (`retailer`, `order_id`) | key | definition |
|----------------------------+------------------------------------------------------------------|
** schema: `data/<retailer>/items_raw.csv` | `retailer` PK | retailer slug |
| `order_id` PK | retailer order id |
One row per retailer line item. | `line_no` PK | line number within order |
| `normalized_row_id` | stable row key, typically `<retailer>:<order_id>:<line_no>` |
| column | meaning | | `normalized_item_id` | stable retailer-level item identity when deterministic grouping is supported |
|------------------+-----------------------------------------| | `normalization_basis` | basis used to assign `normalized_item_id` |
| `retailer` | retailer slug | | `retailer_item_id` | retailer-native item id |
| `order_id` | retailer order id |
| `line_no` | stable line number within order export |
| `order_date` | copied from order when available |
| `pod_id` | retailer pod/item id |
| `item_name` | raw retailer item name | | `item_name` | raw retailer item name |
| `upc` | retailer UPC or PLU value | | `item_name_norm` | normalized retailer item name |
| `category_id` | retailer category id |
| `category` | retailer category description |
| `qty` | retailer quantity field |
| `unit` | retailer unit code such as `EA` or `LB` |
| `unit_price` | retailer unit price field |
| `line_total` | retailer extended price field |
| `picked_weight` | retailer picked weight field |
| `mvp_savings` | retailer savings field |
| `reward_savings` | retailer rewards savings field |
| `coupon_savings` | retailer coupon savings field |
| `coupon_price` | retailer coupon price field |
| `image_url` | raw retailer image url when present |
| `raw_order_path` | relative path to source order payload |
Primary key:
- (`retailer`, `order_id`, `line_no`)
** schema: `data/<retailer>/items_enriched.csv`
One row per retailer line item after deterministic parsing. Preserve the raw
fields from `items_raw.csv` and add parsed fields.
| column | meaning |
|---------------------+-------------------------------------------------------------|
| `retailer` | retailer slug |
| `order_id` | retailer order id |
| `line_no` | line number within order |
| `observed_item_key` | stable row key, typically `<retailer>:<order_id>:<line_no>` |
| `item_name` | raw retailer item name |
| `item_name_norm` | normalized item name |
| `brand_guess` | parsed brand guess | | `brand_guess` | parsed brand guess |
| `variant` | parsed variant text | | `variant` | parsed variant text |
| `size_value` | parsed numeric size value | | `size_value` | parsed numeric size value |
| `size_unit` | parsed size unit such as `oz`, `lb`, `fl_oz` | | `size_unit` | parsed size unit such as `oz`, `lb`, `fl_oz` |
| `pack_qty` | parsed pack or count guess | | `pack_qty` | parsed pack or count guess |
| `measure_type` | `each`, `weight`, `volume`, `count`, or blank | | `measure_type` | `each`, `weight`, `volume`, `count`, or blank |
| `normalized_quantity` | numeric comparison basis derived during normalization |
| `normalized_quantity_unit` | basis unit such as `oz`, `lb`, `count`, or blank |
| `is_item` | item flag |
| `is_store_brand` | store-brand guess | | `is_store_brand` | store-brand guess |
| `is_fee` | fee or non-product flag | | `is_fee` | fee or non-product flag |
| `is_discount_line` | discount or adjustment-line flag |
| `is_coupon_line` | coupon-like line flag |
| `matched_discount_amount` | matched discount value carried onto purchased row when supported |
| `net_line_total` | line total after matched discount when supported |
| `price_per_each` | derived per-each price when supported | | `price_per_each` | derived per-each price when supported |
| `price_per_each_basis` | source basis for `price_per_each` |
| `price_per_count` | derived per-count price when supported |
| `price_per_count_basis` | source basis for `price_per_count` |
| `price_per_lb` | derived per-pound price when supported | | `price_per_lb` | derived per-pound price when supported |
| `price_per_lb_basis` | source basis for `price_per_lb` |
| `price_per_oz` | derived per-ounce price when supported | | `price_per_oz` | derived per-ounce price when supported |
| `price_per_oz_basis` | source basis for `price_per_oz` |
| `image_url` | best available retailer image url | | `image_url` | best available retailer image url |
| `raw_order_path` | relative path to source order payload |
| `parse_version` | parser version string for reruns | | `parse_version` | parser version string for reruns |
| `parse_notes` | optional non-fatal parser notes | | `parse_notes` | optional non-fatal parser notes |
Primary key: Notes:
- `normalized_row_id` identifies the purchase row; `normalized_item_id` identifies a repeated retailer item when strong retailer evidence supports grouping.
- Valid `normalization_basis` values should be explicit, e.g. `exact_upc`, `exact_retailer_item_id`, `exact_name_size_pack`, or `approved_retailer_alias`.
- Do not use fuzzy or semantic matching to assign `normalized_item_id`.
- Discount/coupon rows may remain as standalone normalized rows for auditability even when their amounts are attached to a purchased row via `matched_discount_amount`.
- Cross-retailer identity is handled later in review/combine via `data/review/catalog.csv` and `product_links.csv`.
- (`retailer`, `order_id`, `line_no`) ** `data/review/product_links.csv`
One row per review-approved link from a normalized retailer item to a catalog item.
Many normalized retailer items may link to the same catalog item.
** schema: `data/<retailer>/products_observed.csv` | key | definition |
|-------------------------+---------------------------------------------|
One row per distinct retailer-facing observed product. | `normalized_item_id` PK | normalized retailer item id |
| `catalog_id` PK | linked catalog product id |
| column | meaning | | `link_method` | `manual`, `exact_upc`, `exact_name_size`, etc. |
|-------------------------------+----------------------------------------------------------------|
| `observed_product_id` | stable observed product id |
| `retailer` | retailer slug |
| `observed_key` | deterministic grouping key used to create the observed product |
| `representative_upc` | best representative UPC/PLU |
| `representative_item_name` | representative raw retailer name |
| `representative_name_norm` | representative normalized name |
| `representative_brand` | representative brand guess |
| `representative_variant` | representative variant |
| `representative_size_value` | representative size value |
| `representative_size_unit` | representative size unit |
| `representative_pack_qty` | representative pack/count |
| `representative_measure_type` | representative measure type |
| `representative_image_url` | representative image url |
| `is_store_brand` | representative store-brand flag |
| `is_fee` | representative fee flag |
| `first_seen_date` | first order date seen |
| `last_seen_date` | last order date seen |
| `times_seen` | number of enriched item rows grouped here |
| `example_order_id` | one example retailer order id |
| `example_item_name` | one example raw item name |
Primary key:
- (`observed_product_id`)
** schema: `data/shared/products_canonical.csv`
One row per cross-retailer canonical product.
| column | meaning |
|----------------------------+--------------------------------------------------|
| `canonical_product_id` | stable canonical product id |
| `canonical_name` | canonical human-readable name |
| `product_type` | broad class such as `apple`, `milk`, `trash_bag` |
| `brand` | canonical brand when applicable |
| `variant` | canonical variant |
| `size_value` | normalized size value |
| `size_unit` | normalized size unit |
| `pack_qty` | normalized pack/count |
| `measure_type` | normalized measure type |
| `normalized_quantity` | numeric comparison basis value |
| `normalized_quantity_unit` | basis unit such as `oz`, `lb`, `count` |
| `notes` | optional human notes |
| `created_at` | creation timestamp or date |
| `updated_at` | last update timestamp or date |
Primary key:
- (`canonical_product_id`)
** schema: `data/shared/product_links.csv`
One row per observed-to-canonical relationship.
| column | meaning |
|-
| `observed_product_id` | retailer observed product id |
| `canonical_product_id` | linked canonical product id |
| `link_method` | `manual`, `exact_upc`, `exact_name`, etc. |
| `link_confidence` | optional confidence label | | `link_confidence` | optional confidence label |
| `review_status` | `pending`, `approved`, `rejected`, or blank | | `review_status` | `pending`, `approved`, `rejected`, or blank |
| `reviewed_by` | reviewer id or initials | | `reviewed_by` | reviewer id or initials |
| `reviewed_at` | review timestamp or date | | `reviewed_at` | review timestamp or date |
| `link_notes` | optional notes | | `link_notes` | optional notes |
Primary key: ** `data/review/review_queue.csv`
- (`observed_product_id`, `canonical_product_id`)
** schema: `data/shared/review_queue.csv`
One row per issue needing human review. One row per issue needing human review.
| column | meaning | | key | definition |
|- |----------------------+-----------------------------------------------------|
| `review_id` | stable review row id | | `review_id` PK | stable review row id |
| `queue_type` | `observed_product`, `link_candidate`, `parse_issue` | | `queue_type` | `link_candidate`, `parse_issue`, `catalog_cleanup` |
| `retailer` | retailer slug when applicable | | `retailer` | retailer slug when applicable |
| `observed_product_id` | observed product id when applicable | | `normalized_item_id` | normalized retailer item id when review is item-level |
| `canonical_product_id` | candidate canonical id when applicable | | `normalized_row_id` | normalized row id when review is row-specific |
| `catalog_id` | candidate canonical id |
| `reason_code` | machine-readable review reason | | `reason_code` | machine-readable review reason |
| `priority` | optional priority label | | `priority` | optional priority label |
| `raw_item_names` | compact list of example raw names | | `raw_item_names` | compact list of example raw names |
@@ -280,21 +270,90 @@ One row per issue needing human review.
| `resolution_notes` | reviewer notes | | `resolution_notes` | reviewer notes |
| `created_at` | creation timestamp or date | | `created_at` | creation timestamp or date |
| `updated_at` | last update timestamp or date | | `updated_at` | last update timestamp or date |
** `data/review/catalog.csv`
One row per cross-retailer catalog product.
| key | definition |
|----------------------------+----------------------------------------|
| `catalog_id` PK | stable catalog product id |
| `catalog_name` | human-reviewed product name |
| `product_type` | generic product eg `apple`, `milk` |
| `category` | broad section eg `produce`, `dairy` |
| `brand` | canonical brand when applicable |
| `variant` | canonical variant |
| `size_value` | normalized size value |
| `size_unit` | normalized size unit |
| `pack_qty` | normalized pack/count |
| `measure_type` | normalized measure type |
| `normalized_quantity` | numeric comparison basis value |
| `normalized_quantity_unit` | basis unit such as `oz`, `lb`, `count` |
| `notes` | optional human notes |
| `created_at` | creation timestamp or date |
| `updated_at` | last update timestamp or date |
Primary key: Notes:
- Do not auto-create new catalog rows from weak normalized names alone.
- Do not encode packaging/count into `catalog_name` unless it is essential to product identity.
- `catalog_name` should come from review-approved naming, not raw retailer strings.
- (`review_id`) ** `data/analysis/purchases.csv`
One row per purchased item (i.e., `is_item`==true from normalized layer), with
catalog attributes denormalized in and discounts already applied.
** current giant mapping | key | definition |
|----------------------------+----------------------------------------------------------------|
| `purchase_date` | date of purchase (from order) |
| `retailer` | retailer slug |
| `order_id` | retailer order id |
| `line_no` | line number within order |
| `normalized_row_id` | `<retailer>:<order_id>:<line_no>` |
| `normalized_item_id` | retailer-level normalized item identity |
| `catalog_id` | linked catalog product id |
| `catalog_name` | catalog product name for analysis |
| `catalog_product_type` | broader product family (e.g., `egg`, `milk`) |
| `catalog_category` | category such as `produce`, `dairy` |
| `catalog_brand` | canonical brand when applicable |
| `catalog_variant` | canonical variant when applicable |
| `raw_item_name` | original retailer item name |
| `normalized_item_name` | cleaned/normalized retailer item name |
| `retailer_item_id` | retailer-native item id |
| `upc` | UPC/PLU when available |
| `qty` | retailer quantity field |
| `unit` | retailer unit (e.g., `EA`, `LB`) |
| `pack_qty` | parsed pack/count |
| `size_value` | parsed size value |
| `size_unit` | parsed size unit |
| `measure_type` | `each`, `weight`, `volume`, `count` |
| `normalized_quantity` | normalized comparison quantity |
| `normalized_quantity_unit` | unit for normalized quantity |
| `unit_price` | retailer unit price |
| `line_total` | original retailer extended price (pre-discount) |
| `matched_discount_amount` | discount amount matched from discount lines |
| `net_line_total` | effective price after discount (`line_total` + discounts) |
| `store_name` | retailer store name |
| `store_city` | store city |
| `store_state` | store state |
| `price_per_each` | derived per-each price |
| `price_per_each_basis` | source basis for per-each calc |
| `price_per_count` | derived per-count price |
| `price_per_count_basis` | source basis for per-count calc |
| `price_per_lb` | derived per-pound price |
| `price_per_lb_basis` | source basis for per-pound calc |
| `price_per_oz` | derived per-ounce price |
| `price_per_oz_basis` | source basis for per-ounce calc |
| `is_fee` | true if row represents non-product fee |
| `raw_order_path` | relative path to original order payload |
Current scraper outputs map to the new layout as follows: Notes:
- Only rows that represent purchased items should appear here.
- `giant_output/raw/history.json` -> `data/giant/raw/history.json` - `line_total` preserves retailer truth; `net_line_total` is what you actually paid.
- `giant_output/raw/<order_id>.json` -> `data/giant/raw/orders/<order_id>.json` - catalog fields are denormalized in to make pivoting trivial.
- `giant_output/orders.csv` -> `data/giant/orders.csv` - no discount/coupon rows exist here; their effects are carried via `matched_discount_amount`.
- `giant_output/items.csv` -> `data/giant/items_raw.csv` - review/link decisions should apply at the `normalized_item_id` level, then fan out to all purchase rows sharing that id.
Current Giant raw order payloads already expose fields needed for future
enrichment, including `image`, `itemName`, `primUpcCd`, `lbEachCd`,
`unitPrice`, `groceryAmount`, and `totalPickedWeight`.
* /
Normalized quantity is deterministic and conservative:
- if `qty * pack_qty * size_value` is available, use that total with `size_unit`
- else if count basis is explicit, use `qty * pack_qty` with unit `count`
- else if `measure_type` is `each`, use `qty each`
- else leave both fields blank
- no hidden unit conversion is applied inside normalization; values stay in their parsed units such as `oz`, `lb`, `qt`, or `count`

654
pm/notes.org Normal file

File diff suppressed because one or more lines are too long

73
pm/review-workflow.org Normal file
View File

@@ -0,0 +1,73 @@
* review and item-resolution workflow
This document defines the durable review workflow for unresolved observed
products.
** persistent files
- `combined_output/purchases.csv`
Flat normalized purchase log. This is the review input because it retains:
- raw item name
- normalized item name
- observed product id
- canonical product id when resolved
- retailer/order/date/price context
- `combined_output/review_queue.csv`
Current unresolved observed products grouped for review.
- `combined_output/review_resolutions.csv`
Durable mapping decisions from observed products to canonical products.
- `combined_output/canonical_catalog.csv`
Durable canonical item catalog used by manual review and later purchase-log
rebuilds.
There is no separate alias file in v1. `review_resolutions.csv` is the mapping
layer from observed products to canonical product ids.
** workflow
1. Run `build_purchases.py`
This refreshes the purchase log and seeds/updates the canonical catalog from
current auto-linked canonical rows.
2. Run `review_products.py`
This rebuilds `review_queue.csv` from unresolved purchase rows and prompts in
the terminal for one observed product at a time.
3. Choose one of:
- link to existing canonical
- create new canonical
- exclude
- skip
4. `review_products.py` writes decisions immediately to:
- `review_resolutions.csv`
- `canonical_catalog.csv` when a new canonical item is created
5. Rerun `build_purchases.py`
This reapplies approved resolutions so the final normalized purchase log now
carries the reviewed `canonical_product_id`.
** what the human edits
The primary interface is terminal prompts in `review_products.py`.
The human provides:
- existing canonical id when linking
- canonical name/category/product type when creating a new canonical item
- optional resolution notes
The generated CSVs remain editable by hand if needed, but the intended workflow
is terminal-first.
** durability
- Resolutions are keyed by `observed_product_id`, not by one-off text
substitution.
- Canonical products are keyed by stable `canonical_product_id`.
- Future runs reuse approved mappings through `review_resolutions.csv`.
** retention of audit fields
The final `purchases.csv` retains:
- `raw_item_name`
- `normalized_item_name`
- `canonical_product_id`
This preserves the raw receipt description, the deterministic parser output, and
the human-approved canonical identity in one flat purchase log.

View File

@@ -1,107 +0,0 @@
* python setup
venv install playwright, pandas
playwright install
1. scrape - raw giant json
2. enrich -
cols:
item_name_norm
brand_guess
size_value
size_unit
pack_qty
variant
is_store_brand
is_fee
measure_type
price_per_lb
price_per_oz
price_per_each
image_url
normalize abbreviations
extract size like 12z, 10ct, 5lb
detect fees like bag charges
infer whether something is sold by each vs weight
carry forward image url
3. build observed-product table from enriched items
* item:
get:
/api/v6.0/user/369513017/order/history/detail/69a2e44a16be1142e74ad3cc
headers:
request:
GET /api/v6.0/user/369513017/order/history/detail/69a2e44a16be1142e74ad3cc?isInStore=true HTTP/2
Host: giantfood.com
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:148.0) Gecko/20100101 Firefox/148.0
Accept: application/json, text/plain, */*
Accept-Language: en-US,en;q=0.9
Accept-Encoding: gzip, deflate, br, zstd
DNT: 1
Sec-GPC: 1
Connection: keep-alive
Referer: https://giantfood.com/account/history/invoice/in-store
Cookie: datadome=rDtvd3J2hO5AeghJMSFRRxGc6ifKCQYgMLcqPNr9rWiz2rdcXb032AY6GIZn8tUmYB96BKKbzh3_jSjEzYWLj8hDjl3oGYYAiu4jwdaxpf3vh2v4f7KH7kbqgsMWpkjt; cf_clearance=WEPyQokx9f0qoyS4Svsw4EkZ1TYOxjOwcUHspT3.rXw-1773348940-1.2.1.1-fPvERGxBlFUaBW83sUppbUWpwvFG7mZivag5vBvZb3kxUQv2WSVIV1tON0HV2n8bkVY0U8_BBl62a00Np.oJylYQcGME540gZlYEoL.gMs4WynLqApFe5BOXAEwOm01_6h6b62H90bl4ypRehVb_TXEi4qHaPLVSZhjZK_h.fv6RBqjgYch2j_8XnHe5HXvLziVjl1k2aJskozqy04KOyeHyc3OyIPTZd5On_KAzFIM; dvrctk=MnjKJVShVraEtbrBkkxWxLaZrXnIGNQlwB7QtZVPFeA=; __cflb=0H28vXMLFyydRmDMNgcPHijM6auXkCspCkuh58tVuJ3; __cf_bm=C6QbqiEvbbwdrYBpoJOkcWcedf60vcOfPfTPPbZzKbM-1773348202-1.0.1.1-cSHoYwi8ZjIHTdBItXQP_iXJdRJS6FYjFsGdl1eGHvS5pgfbcT4Lg19P6UStX.bZz1u0OXiS5ykdipPBtwP6OvZr68k4XSmjYpir05jNLhw; _dd_s=rum=0&expire=1773349846445; ppdtk=Uog72CR22mD85C7U4iZHlgOQeRmvHEYp0OdQc+0lEes1c5/LeqGT+ZUlXpSC6FpW; cartId=3820547
Sec-Fetch-Dest: empty
Sec-Fetch-Mode: cors
Sec-Fetch-Site: same-origin
Priority: u=0
TE: trailers
response:
HTTP/2 200
date: Thu, 12 Mar 2026 20:55:47 GMT
content-type: application/json
server: cloudflare
cf-ray: 9db5b3a5d84aff28-IAD
cf-cache-status: DYNAMIC
content-encoding: gzip
set-cookie: datadome=MXMri0hss6PlQ0_oS7gG2iMdOKnNkbDmGvOxelgN~nCcupgkJQOqjcjcgdprIaI7hSlt_w8E9Ri_RAzPFrGqtUfqAJ_szB_aNZ2FdC26qmI3870Nn4~T0vtx8Gj3dEZR; Max-Age=31536000; Domain=.giantfood.com; Path=/; Secure; SameSite=Lax
strict-transport-security: max-age=31536000; includeSubDomains
vary: Origin, Access-Control-Request-Method, Access-Control-Request-Headers, accept-encoding
accept-ch: Sec-CH-UA,Sec-CH-UA-Mobile,Sec-CH-UA-Platform,Sec-CH-UA-Arch,Sec-CH-UA-Full-Version-List,Sec-CH-UA-Model,Sec-CH-Device-Memory
x-datadome: protected
request-context: appId=cid-v1:75750625-0c81-4f08-9f5d-ce4f73198e54
X-Firefox-Spdy: h2
* history:
GET
https://giantfood.com/api/v6.0/user/369513017/order/history?filter=instore&loyaltyNumber=440155630880
headers:
request:
GET /api/v6.0/user/369513017/order/history?filter=instore&loyaltyNumber=440155630880 HTTP/2
Host: giantfood.com
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:148.0) Gecko/20100101 Firefox/148.0
Accept: application/json, text/plain, */*
Accept-Language: en-US,en;q=0.9
Accept-Encoding: gzip, deflate, br, zstd
DNT: 1
Sec-GPC: 1
Connection: keep-alive
Referer: https://giantfood.com/account/history/invoice/in-store
Cookie: datadome=OH2XjtCoI6XjE3Qsz_b0F1YULKLatAC0Ea~VMeDGBP0N9Z~CeI3RqEbvkGmNW_VCOU~vRb6p0kqibvF2tLbWnzyAGIdO7jsC41KiYbp7USpJDnefZhIg0e1ypAugvDSw; cf_clearance=WEPyQokx9f0qoyS4Svsw4EkZ1TYOxjOwcUHspT3.rXw-1773348940-1.2.1.1-fPvERGxBlFUaBW83sUppbUWpwvFG7mZivag5vBvZb3kxUQv2WSVIV1tON0HV2n8bkVY0U8_BBl62a00Np.oJylYQcGME540gZlYEoL.gMs4WynLqApFe5BOXAEwOm01_6h6b62H90bl4ypRehVb_TXEi4qHaPLVSZhjZK_h.fv6RBqjgYch2j_8XnHe5HXvLziVjl1k2aJskozqy04KOyeHyc3OyIPTZd5On_KAzFIM; dvrctk=MnjKJVShVraEtbrBkkxWxLaZrXnIGNQlwB7QtZVPFeA=; __cflb=0H28vXMLFyydRmDMNgcPHijM6auXkCspCkuh58tVuJ3; __cf_bm=C6QbqiEvbbwdrYBpoJOkcWcedf60vcOfPfTPPbZzKbM-1773348202-1.0.1.1-cSHoYwi8ZjIHTdBItXQP_iXJdRJS6FYjFsGdl1eGHvS5pgfbcT4Lg19P6UStX.bZz1u0OXiS5ykdipPBtwP6OvZr68k4XSmjYpir05jNLhw; _dd_s=rum=0&expire=1773349842848; ppdtk=Uog72CR22mD85C7U4iZHlgOQeRmvHEYp0OdQc+0lEes1c5/LeqGT+ZUlXpSC6FpW; cartId=3820547
Sec-Fetch-Dest: empty
Sec-Fetch-Mode: cors
Sec-Fetch-Site: same-origin
Priority: u=0
TE: trailers
response:
HTTP/2 200
date: Thu, 12 Mar 2026 20:55:43 GMT
content-type: application/json
server: cloudflare
cf-ray: 9db5b38f7eebff28-IAD
cf-cache-status: DYNAMIC
content-encoding: gzip
set-cookie: datadome=rDtvd3J2hO5AeghJMSFRRxGc6ifKCQYgMLcqPNr9rWiz2rdcXb032AY6GIZn8tUmYB96BKKbzh3_jSjEzYWLj8hDjl3oGYYAiu4jwdaxpf3vh2v4f7KH7kbqgsMWpkjt; Max-Age=31536000; Domain=.giantfood.com; Path=/; Secure; SameSite=Lax
strict-transport-security: max-age=31536000; includeSubDomains
vary: Origin, Access-Control-Request-Method, Access-Control-Request-Headers, accept-encoding
accept-ch: Sec-CH-UA,Sec-CH-UA-Mobile,Sec-CH-UA-Platform,Sec-CH-UA-Arch,Sec-CH-UA-Full-Version-List,Sec-CH-UA-Model,Sec-CH-Device-Memory
x-datadome: protected
request-context: appId=cid-v1:75750625-0c81-4f08-9f5d-ce4f73198e54
X-Firefox-Spdy: h2

22
pm/task-sample.org Normal file
View File

@@ -0,0 +1,22 @@
#+title: Task Log
#+updated: [2026-03-18 Wed 14:19]
Use the template below, which should be a top-level org-mode header.
* [ ] M.m.m: Task Title (estimate # commits)
replace the old observed/canonical workflow with a review-first pipeline that groups normalized rows only during review/combine and links them to catalog items
** Acceptance Criteria
1. Criterion
- expanded data
2. Criterion
- pm note: amplifying information
** evidence
- commit: abc123, bcd234
- tests:
- datetime: [2026-03-18 Wed 14:15]
** notes
- explanation of work done, decisions made, reasoning

File diff suppressed because it is too large Load Diff

129
report_pipeline_status.py Normal file
View File

@@ -0,0 +1,129 @@
import json
from pathlib import Path
import click
import build_purchases
import review_products
from layer_helpers import read_csv_rows, write_csv_rows
# Column order used when main() writes the pipeline status summary CSV.
SUMMARY_FIELDS = ["stage", "count"]
def read_rows_if_exists(path):
    """Return the CSV rows at *path*, or an empty list when no such file exists."""
    csv_path = Path(path)
    return read_csv_rows(csv_path) if csv_path.exists() else []
def build_status_summary(
    giant_orders,
    giant_items,
    giant_enriched,
    costco_orders,
    costco_items,
    costco_enriched,
    purchases,
    resolutions,
    links,
    catalog,
):
    """Count rows at each pipeline stage.

    Every argument is a list of row dicts. Returns a list of
    ``{"stage": <name>, "count": <int>}`` dicts in a fixed reporting order.
    """
    non_item_flags = ("is_fee", "is_discount_line", "is_coupon_line")

    normalized_rows = giant_enriched + costco_enriched
    # Rebuild the review queue with no previous queue rows so the counts
    # reflect only the purchases, resolutions, links and catalog passed in.
    review_queue = review_products.build_review_queue(purchases, resolutions, links, catalog, [])
    queued_ids = {entry["normalized_item_id"] for entry in review_queue}

    candidate_ids = (row.get("normalized_item_id") for row in normalized_rows)
    distinct_item_ids = {item_id for item_id in candidate_ids if item_id}

    linked_item_ids = set()
    linked_row_count = 0
    excluded_row_count = 0
    unresolved_row_count = 0
    unresolved_outside_queue = 0
    for purchase in purchases:
        item_id = purchase.get("normalized_item_id")
        action = purchase.get("resolution_action")
        if action == "exclude":
            excluded_row_count += 1
        if purchase.get("catalog_id"):
            linked_row_count += 1
            if item_id:
                linked_item_ids.add(item_id)
            continue
        # From here on the row has no catalog link; decide whether it still
        # counts as an unresolved purchase.
        if not item_id or action == "exclude":
            continue
        if any(purchase.get(flag) == "true" for flag in non_item_flags):
            continue
        unresolved_row_count += 1
        if item_id not in queued_ids:
            unresolved_outside_queue += 1

    stage_counts = (
        ("raw_orders", len(giant_orders) + len(costco_orders)),
        ("raw_items", len(giant_items) + len(costco_items)),
        ("normalized_items", len(normalized_rows)),
        ("distinct_normalized_items", len(distinct_item_ids)),
        ("review_queue_normalized_items", len(review_queue)),
        ("linked_normalized_items", len(linked_item_ids)),
        ("linked_purchase_rows", linked_row_count),
        ("final_purchase_rows", len(purchases)),
        ("unresolved_purchase_rows", unresolved_row_count),
        ("excluded_purchase_rows", excluded_row_count),
        ("unresolved_not_in_review_rows", unresolved_outside_queue),
    )
    return [{"stage": stage, "count": count} for stage, count in stage_counts]
@click.command()
@click.option("--giant-orders-csv", default="data/giant-web/collected_orders.csv", show_default=True)
@click.option("--giant-items-csv", default="data/giant-web/collected_items.csv", show_default=True)
@click.option("--giant-enriched-csv", default="data/giant-web/normalized_items.csv", show_default=True)
@click.option("--costco-orders-csv", default="data/costco-web/collected_orders.csv", show_default=True)
@click.option("--costco-items-csv", default="data/costco-web/collected_items.csv", show_default=True)
@click.option("--costco-enriched-csv", default="data/costco-web/normalized_items.csv", show_default=True)
@click.option("--purchases-csv", default="data/analysis/purchases.csv", show_default=True)
@click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True)
@click.option("--links-csv", default="data/review/product_links.csv", show_default=True)
@click.option("--catalog-csv", default="data/review/catalog.csv", show_default=True)
@click.option("--summary-csv", default="data/review/pipeline_status.csv", show_default=True)
@click.option("--summary-json", default="data/review/pipeline_status.json", show_default=True)
def main(
    giant_orders_csv,
    giant_items_csv,
    giant_enriched_csv,
    costco_orders_csv,
    costco_items_csv,
    costco_enriched_csv,
    purchases_csv,
    resolutions_csv,
    links_csv,
    catalog_csv,
    summary_csv,
    summary_json,
):
    """Build the pipeline status summary and write it as CSV, JSON and console lines.

    Input files that do not exist are treated as empty.
    """
    giant_orders = read_rows_if_exists(giant_orders_csv)
    giant_items = read_rows_if_exists(giant_items_csv)
    giant_enriched = read_rows_if_exists(giant_enriched_csv)
    costco_orders = read_rows_if_exists(costco_orders_csv)
    costco_items = read_rows_if_exists(costco_items_csv)
    costco_enriched = read_rows_if_exists(costco_enriched_csv)
    purchases = read_rows_if_exists(purchases_csv)
    # Review-state rows go through build_purchases' normalizers before counting.
    resolutions = [
        build_purchases.normalize_resolution_row(row) for row in read_rows_if_exists(resolutions_csv)
    ]
    links = [build_purchases.normalize_link_row(row) for row in read_rows_if_exists(links_csv)]
    catalog = [build_purchases.normalize_catalog_row(row) for row in read_rows_if_exists(catalog_csv)]

    summary_rows = build_status_summary(
        giant_orders,
        giant_items,
        giant_enriched,
        costco_orders,
        costco_items,
        costco_enriched,
        purchases,
        resolutions,
        links,
        catalog,
    )

    write_csv_rows(summary_csv, summary_rows, SUMMARY_FIELDS)

    json_target = Path(summary_json)
    json_target.parent.mkdir(parents=True, exist_ok=True)
    json_payload = json.dumps(summary_rows, indent=2)
    json_target.write_text(json_payload, encoding="utf-8")

    for row in summary_rows:
        click.echo(f"{row['stage']}: {row['count']}")


if __name__ == "__main__":
    main()

View File

@@ -1,10 +1,4 @@
browser-cookie3==0.20.1 browser-cookie3==0.20.1
certifi==2026.2.25
cffi==2.0.0
click==8.3.1 click==8.3.1
curl_cffi==0.14.0 curl_cffi==0.14.0
jeepney==0.9.0
lz4==4.4.5
pycparser==3.0
pycryptodomex==3.23.0
python-dotenv==1.1.1 python-dotenv==1.1.1

670
review_products.py Normal file
View File

@@ -0,0 +1,670 @@
from collections import defaultdict
from datetime import date
import re
import click
import build_purchases
from layer_helpers import compact_join, stable_id, write_csv_rows
# Column order used when the review queue is written to CSV.
QUEUE_FIELDS = [
    "review_id",
    "retailer",
    "normalized_item_id",
    "catalog_id",
    "reason_code",
    "priority",
    "raw_item_names",
    "normalized_names",
    "upc_values",
    "example_prices",
    "seen_count",
    "status",
    "resolution_action",
    "resolution_notes",
    "created_at",
    "updated_at",
]
# Foreground colors passed to click.secho / click.style in the terminal prompts.
INFO_COLOR = "cyan"
PROMPT_COLOR = "bright_yellow"
WARNING_COLOR = "magenta"
# Matches runs of uppercase letters and digits; tokenize_match_text() applies
# it to upper-cased text.
TOKEN_RE = re.compile(r"[A-Z0-9]+")
# Catalog fields that must be non-blank for has_complete_catalog_row() to pass.
REQUIRED_CATALOG_FIELDS = ("catalog_name", "product_type")
def print_intro_text():
    """Print the short guide explaining the catalog fields asked for during review."""
    click.secho("Review guide:", fg=INFO_COLOR)
    guide_lines = (
        " catalog name: unique product identity including variant, but not packaging",
        " product type: general product you want to compare across purchases",
        " category: broad analysis bucket such as dairy, produce, or frozen",
    )
    for guide_line in guide_lines:
        click.echo(guide_line)
def has_complete_catalog_row(catalog_row):
    """Return True when *catalog_row* is non-empty and every required field is non-blank."""
    if not catalog_row:
        return False
    for field in REQUIRED_CATALOG_FIELDS:
        if not catalog_row.get(field, "").strip():
            return False
    return True
def load_queue_lookup(queue_rows):
    """Index queue rows by ``normalized_item_id``.

    Rows without an id are skipped; when an id repeats, the last row wins.
    """
    return {
        row["normalized_item_id"]: row
        for row in queue_rows
        if row.get("normalized_item_id", "")
    }
def build_review_queue(
    purchase_rows,
    resolution_rows,
    link_rows=None,
    catalog_rows=None,
    existing_queue_rows=None,
):
    """Build one review-queue row per normalized item that still needs a catalog link.

    An item is left out of the queue when its resolution is an approved
    exclude, when it has no rows that count as real items (fees, discount
    lines and coupon lines do not count), or when it already links to a
    catalog row whose required fields are filled in. Status, action, notes and
    ``created_at`` come from the existing queue row first and the current
    resolution second.
    """

    def joined_distinct(rows, field):
        # Distinct non-empty values of one column, sorted, joined by compact_join.
        return compact_join(sorted({row[field] for row in rows if row[field]}), limit=8)

    def counts_as_item(row):
        if row.get("is_item", "true") == "false":
            return False
        return not any(
            row.get(flag) == "true" for flag in ("is_fee", "is_discount_line", "is_coupon_line")
        )

    resolution_lookup = build_purchases.load_resolution_lookup(resolution_rows)
    link_lookup = build_purchases.load_link_lookup(link_rows or [])
    catalog_lookup = {}
    for catalog_entry in catalog_rows or []:
        entry_id = catalog_entry.get("catalog_id", "")
        if entry_id:
            catalog_lookup[entry_id] = build_purchases.normalize_catalog_row(catalog_entry)
    queue_lookup = load_queue_lookup(existing_queue_rows or [])

    grouped = defaultdict(list)
    for purchase in purchase_rows:
        item_id = purchase.get("normalized_item_id", "")
        if item_id:
            grouped[item_id].append(purchase)

    today_text = str(date.today())
    queue_rows = []
    for normalized_item_id in sorted(grouped):
        rows = grouped[normalized_item_id]
        resolution = resolution_lookup.get(normalized_item_id, {})
        approved_exclude = (
            resolution.get("status") == "approved"
            and resolution.get("resolution_action") == "exclude"
        )
        if approved_exclude:
            continue

        # A catalog id on the resolution wins over the product-links table.
        linked_catalog_id = resolution.get("catalog_id") or link_lookup.get(
            normalized_item_id, {}
        ).get("catalog_id", "")
        linked_catalog_row = catalog_lookup.get(linked_catalog_id, {})
        if linked_catalog_id and has_complete_catalog_row(linked_catalog_row):
            continue
        if not any(counts_as_item(row) for row in rows):
            continue

        if not linked_catalog_id:
            reason_code = "missing_catalog_link"
        elif linked_catalog_id not in catalog_lookup:
            reason_code = "orphaned_catalog_link"
        elif not has_complete_catalog_row(linked_catalog_row):
            reason_code = "incomplete_catalog_link"
        else:
            reason_code = "missing_catalog_link"

        previous = queue_lookup.get(normalized_item_id, {})
        retailers = sorted({row["retailer"] for row in rows})
        queue_rows.append(
            {
                "review_id": stable_id("rvw", normalized_item_id),
                "retailer": " | ".join(retailers),
                "normalized_item_id": normalized_item_id,
                "catalog_id": linked_catalog_id,
                "reason_code": reason_code,
                "priority": "high",
                "raw_item_names": joined_distinct(rows, "raw_item_name"),
                "normalized_names": joined_distinct(rows, "normalized_item_name"),
                "upc_values": joined_distinct(rows, "upc"),
                "example_prices": joined_distinct(rows, "line_total"),
                "seen_count": str(len(rows)),
                "status": previous.get("status") or resolution.get("status", "pending"),
                "resolution_action": previous.get("resolution_action")
                or resolution.get("resolution_action", ""),
                "resolution_notes": previous.get("resolution_notes")
                or resolution.get("resolution_notes", ""),
                "created_at": previous.get("created_at")
                or resolution.get("reviewed_at", today_text),
                "updated_at": today_text,
            }
        )
    return queue_rows
def save_resolution_rows(path, rows):
    """Write *rows* to *path* using the resolution column list from build_purchases."""
    write_csv_rows(path, rows, build_purchases.RESOLUTION_FIELDS)
def save_catalog_rows(path, rows):
    """Write *rows* to *path* using the catalog column list from build_purchases."""
    write_csv_rows(path, rows, build_purchases.CATALOG_FIELDS)
def save_link_rows(path, rows):
    """Write *rows* to *path* using the product-link column list from build_purchases."""
    write_csv_rows(path, rows, build_purchases.PRODUCT_LINK_FIELDS)
def sort_related_items(rows):
    """Return a new list of *rows* ordered newest first.

    Rows are ordered by purchase date, then order id, then numeric line
    number, all descending. A missing or blank ``line_no`` counts as 0.
    """

    def recency_key(row):
        line_number = int(row.get("line_no", "0") or "0")
        return row.get("purchase_date", ""), row.get("order_id", ""), line_number

    ordered = list(rows)
    ordered.sort(key=recency_key, reverse=True)
    return ordered
def tokenize_match_text(*values):
    """Return the set of TOKEN_RE matches found in the upper-cased *values*.

    Falsy values (``None``, ``""``) contribute no tokens.
    """
    return {
        token
        for value in values
        for token in TOKEN_RE.findall((value or "").upper())
    }
def build_catalog_suggestions(related_rows, purchase_rows, catalog_rows, limit=3):
    """Suggest catalog entries for the item described by *related_rows*.

    Candidates are tried in three passes: already-linked purchases sharing a
    UPC, already-linked purchases sharing the normalized name, then catalog
    rows whose name contains (or is contained in) one of the normalized names.
    Each catalog id is suggested once, only if it exists in *catalog_rows*,
    and collection stops as soon as *limit* suggestions have been gathered.
    """

    def cleaned(row, field):
        return row.get(field, "").strip()

    wanted_names = {cleaned(row, "normalized_item_name").upper() for row in related_rows}
    wanted_names.discard("")
    wanted_upcs = {cleaned(row, "upc") for row in related_rows}
    wanted_upcs.discard("")

    catalog_by_id = {}
    for catalog_row in catalog_rows:
        entry_id = catalog_row.get("catalog_id", "")
        if entry_id:
            catalog_by_id[entry_id] = catalog_row

    def candidates():
        # Yields (catalog_id, reason) pairs in priority order.
        linked_purchases = [
            row
            for row in purchase_rows
            if row.get("catalog_id") and row.get("normalized_item_id")
        ]
        for row in linked_purchases:
            upc = cleaned(row, "upc")
            if upc and upc in wanted_upcs:
                yield row.get("catalog_id", ""), "exact upc"
        for row in linked_purchases:
            if cleaned(row, "normalized_item_name").upper() in wanted_names:
                yield row.get("catalog_id", ""), "exact normalized name"
        for catalog_row in catalog_rows:
            catalog_name = cleaned(catalog_row, "catalog_name").upper()
            if not catalog_name:
                continue
            if any(name in catalog_name or catalog_name in name for name in wanted_names):
                yield catalog_row.get("catalog_id", ""), "catalog name contains match"

    suggestions = []
    seen_ids = set()
    for catalog_id, reason in candidates():
        if not catalog_id or catalog_id in seen_ids or catalog_id not in catalog_by_id:
            continue
        seen_ids.add(catalog_id)
        suggestions.append(
            {
                "catalog_id": catalog_id,
                "catalog_name": catalog_by_id[catalog_id].get("catalog_name", ""),
                "reason": reason,
            }
        )
        if len(suggestions) >= limit:
            break
    return suggestions
def search_catalog_rows(query, catalog_rows, purchase_rows, current_normalized_item_id, limit=10):
    """Rank catalog rows by how many query tokens they share.

    Tokens are compared against each row's catalog name, product type and
    variant. The catalog entry already linked to the item under review is left
    out. Results are ordered by overlap size (descending), then catalog name,
    then catalog id, and cut to *limit* entries. An empty query yields ``[]``.
    """
    wanted_tokens = tokenize_match_text(query)
    if not wanted_tokens:
        return []

    purchase_totals = defaultdict(int)
    item_ids_by_catalog = defaultdict(set)
    current_catalog_id = ""
    for purchase in purchase_rows:
        catalog_id = purchase.get("catalog_id", "")
        item_id = purchase.get("normalized_item_id", "")
        if not catalog_id:
            continue
        if item_id:
            purchase_totals[catalog_id] += 1
            item_ids_by_catalog[catalog_id].add(item_id)
        if item_id == current_normalized_item_id:
            current_catalog_id = catalog_id

    matches = []
    for entry in catalog_rows:
        catalog_id = entry.get("catalog_id", "")
        if not catalog_id or catalog_id == current_catalog_id:
            continue
        entry_tokens = tokenize_match_text(
            entry.get("catalog_name", ""),
            entry.get("product_type", ""),
            entry.get("variant", ""),
        )
        shared_tokens = wanted_tokens & entry_tokens
        if shared_tokens:
            matches.append(
                {
                    "catalog_id": catalog_id,
                    "catalog_name": entry.get("catalog_name", ""),
                    "product_type": entry.get("product_type", ""),
                    "category": entry.get("category", ""),
                    "variant": entry.get("variant", ""),
                    "linked_normalized_items": len(item_ids_by_catalog.get(catalog_id, set())),
                    "linked_purchase_rows": purchase_totals.get(catalog_id, 0),
                    "score": len(shared_tokens),
                }
            )

    ranked = sorted(
        matches,
        key=lambda match: (-match["score"], match["catalog_name"], match["catalog_id"]),
    )
    return ranked[:limit]
def suggestion_display_rows(suggestions, purchase_rows, catalog_rows):
    """Extend each suggestion with product type, category and usage counts.

    Product type and category come from the catalog row when one exists for
    the id, otherwise from the first purchase row carrying that catalog id.
    Usage counts only include purchase rows that have both a catalog id and a
    normalized item id.
    """

    def type_and_category(row):
        return {
            "product_type": row.get("product_type", ""),
            "category": row.get("category", ""),
        }

    purchase_totals = defaultdict(int)
    item_ids_by_catalog = defaultdict(set)
    for purchase in purchase_rows:
        catalog_id = purchase.get("catalog_id", "")
        item_id = purchase.get("normalized_item_id", "")
        if catalog_id and item_id:
            purchase_totals[catalog_id] += 1
            item_ids_by_catalog[catalog_id].add(item_id)

    details_by_catalog = {
        entry["catalog_id"]: type_and_category(entry)
        for entry in catalog_rows
        if entry.get("catalog_id")
    }
    # Purchases only fill in ids the catalog itself does not describe.
    for purchase in purchase_rows:
        catalog_id = purchase.get("catalog_id")
        if catalog_id and catalog_id not in details_by_catalog:
            details_by_catalog[catalog_id] = type_and_category(purchase)

    display_rows = []
    for suggestion in suggestions:
        catalog_id = suggestion["catalog_id"]
        details = details_by_catalog.get(catalog_id, {})
        enriched = dict(suggestion)
        enriched["product_type"] = details.get("product_type", "")
        enriched["category"] = details.get("category", "")
        enriched["linked_purchase_rows"] = purchase_totals.get(catalog_id, 0)
        enriched["linked_normalized_items"] = len(item_ids_by_catalog.get(catalog_id, set()))
        display_rows.append(enriched)
    return display_rows
def print_catalog_rows(rows):
    """Echo one numbered line per row: name, product type, category and link counts."""
    index = 0
    for row in rows:
        index += 1
        line = (
            f" [{index}] {row['catalog_name']}, {row.get('product_type', '')}, "
            f"{row.get('category', '')} ({row['linked_normalized_items']} items, "
            f"{row['linked_purchase_rows']} rows)"
        )
        click.echo(line)
def build_display_lines(related_rows):
    """Format the purchase rows behind a queue item as numbered display lines.

    Rows are listed in the order produced by ``sort_related_items``. When
    there are no rows, a single placeholder line is returned instead.
    """
    template = " [{index}] {raw_item_name} | {retailer} | {purchase_date} | {line_total} | {image_url}"
    shown_fields = ("raw_item_name", "retailer", "purchase_date", "line_total", "image_url")
    lines = [
        template.format(index=index, **{field: row.get(field, "") for field in shown_fields})
        for index, row in enumerate(sort_related_items(related_rows), start=1)
    ]
    return lines or [" [1] no matched item rows found"]
def normalized_label(queue_row, related_rows):
    """Pick a display name for the item under review.

    Preference order: the first ``" | "``-separated entry of the queue row's
    ``normalized_names``, then the first non-empty ``normalized_item_name``
    among *related_rows*, then the queue row's ``normalized_item_id``.
    """
    joined_names = queue_row.get("normalized_names")
    if joined_names:
        return joined_names.partition(" | ")[0]
    first_named = next(
        (row["normalized_item_name"] for row in related_rows if row.get("normalized_item_name")),
        None,
    )
    if first_named is not None:
        return first_named
    return queue_row.get("normalized_item_id", "")
def choose_existing_catalog(display_rows, normalized_name, matched_count):
    """Ask the reviewer to pick one of *display_rows* and confirm the association.

    Returns ``(catalog_id, "")`` on confirmation. Otherwise returns
    ``("", outcome)`` where outcome is ``"skip"``, ``"quit"`` or ``"back"``
    (both "no" and "back" answers map to ``"back"``).
    """
    click.secho(
        f"Select the catalog_name to associate {matched_count} items with:",
        fg=INFO_COLOR,
    )
    print_catalog_rows(display_rows)
    selection = click.prompt(
        click.style("selection", fg=PROMPT_COLOR),
        type=click.IntRange(1, len(display_rows)),
    )
    chosen_row = display_rows[selection - 1]
    click.echo(
        f'{matched_count} "{normalized_name}" items and future matches will be associated '
        f'with "{chosen_row["catalog_name"]}".'
    )
    click.secho("actions: [y]es [n]o [b]ack [s]kip [q]uit", fg=PROMPT_COLOR)
    answer = click.prompt(
        click.style("confirm", fg=PROMPT_COLOR),
        type=click.Choice(["y", "n", "b", "s", "q"]),
    )
    if answer == "y":
        return chosen_row["catalog_id"], ""
    outcome_by_answer = {"s": "skip", "q": "quit"}
    return "", outcome_by_answer.get(answer, "back")
def prompt_resolution(queue_row, related_rows, purchase_rows, catalog_rows, queue_index, queue_total):
    """Interactively collect a review decision for one queue row.

    Shows the matched purchase rows and any catalog suggestions, then reads an
    action: a suggestion number, [f]ind, [n]ew, [s]kip, e[x]clude or [q]uit.

    Returns ``(resolution_row, catalog_row)``. ``catalog_row`` is a new catalog
    record for the "new" action and ``None`` for link, skip and exclude.
    Quitting returns ``(None, None)``. An invalid action, an invalid suggestion
    number, an empty search query, or answering "q" to "search again" re-runs
    the whole prompt from the top.
    """

    def today():
        return str(date.today())

    def ask_optional(label):
        # Free-text prompt where pressing enter yields an empty string.
        return click.prompt(click.style(label, fg=PROMPT_COLOR), default="", show_default=False)

    def resolution(catalog_id, action_name, status, notes):
        return {
            "normalized_item_id": queue_row["normalized_item_id"],
            "catalog_id": catalog_id,
            "resolution_action": action_name,
            "status": status,
            "resolution_notes": notes,
            "reviewed_at": today(),
        }

    def link_to(chosen_row):
        notes = ask_optional("link notes")
        return resolution(chosen_row["catalog_id"], "link", "approved", notes), None

    def start_over():
        return prompt_resolution(
            queue_row, related_rows, purchase_rows, catalog_rows, queue_index, queue_total
        )

    suggestions = suggestion_display_rows(
        build_catalog_suggestions(related_rows, purchase_rows, catalog_rows),
        purchase_rows,
        catalog_rows,
    )
    normalized_name = normalized_label(queue_row, related_rows)
    matched_count = len(related_rows)

    click.echo("")
    click.secho(
        f"Review {queue_index}/{queue_total}: {normalized_name}",
        fg=INFO_COLOR,
    )
    click.echo(f"{matched_count} matched items:")
    for line in build_display_lines(related_rows):
        click.echo(line)
    if suggestions:
        click.echo(f"{len(suggestions)} catalog_name suggestions found:")
        print_catalog_rows(suggestions)
    else:
        click.echo("no catalog_name suggestions found")

    prompt_bits = ["[#] link to suggestion"] if suggestions else []
    prompt_bits += ["[f]ind", "[n]ew", "[s]kip", "e[x]clude", "[q]uit"]
    click.secho(" ".join(prompt_bits) + " >", fg=PROMPT_COLOR)
    action = click.prompt("", type=str, prompt_suffix=" ").strip().lower()

    if action.isdigit() and suggestions:
        number = int(action)
        if 1 <= number <= len(suggestions):
            return link_to(suggestions[number - 1])
        click.secho("invalid suggestion number", fg=WARNING_COLOR)
        return start_over()

    if action == "q":
        return None, None

    if action == "s":
        # A skip keeps the item pending and carries the queue row's notes forward.
        return resolution("", "skip", "pending", queue_row.get("resolution_notes", "")), None

    if action == "f":
        while True:
            query = ask_optional("search").strip()
            if not query:
                return start_over()
            search_rows = search_catalog_rows(
                query,
                catalog_rows,
                purchase_rows,
                queue_row["normalized_item_id"],
            )
            if search_rows:
                click.echo(f"{len(search_rows)} search results found:")
                print_catalog_rows(search_rows)
                number = click.prompt(
                    click.style("selection", fg=PROMPT_COLOR),
                    type=click.IntRange(1, len(search_rows)),
                )
                return link_to(search_rows[number - 1])
            click.echo("no matches found")
            retry = ask_optional("search again? [enter=yes, q=no]").strip().lower()
            if retry == "q":
                return start_over()

    if action == "x":
        notes = ask_optional("exclude notes")
        return resolution("", "exclude", "approved", notes), None

    if action != "n":
        click.secho("invalid action", fg=WARNING_COLOR)
        return start_over()

    # Remaining case: create a new catalog entry and link the item to it.
    catalog_name = click.prompt(click.style("catalog name", fg=PROMPT_COLOR), type=str)
    product_type = ask_optional("product type")
    category = ask_optional("category")
    notes = ask_optional("notes")
    catalog_id = stable_id("cat", f"manual|{catalog_name}|{category}|{product_type}")
    catalog_row = {
        "catalog_id": catalog_id,
        "catalog_name": catalog_name,
        "category": category,
        "product_type": product_type,
        "brand": "",
        "variant": "",
        "size_value": "",
        "size_unit": "",
        "pack_qty": "",
        "measure_type": "",
        "notes": notes,
        "created_at": today(),
        "updated_at": today(),
    }
    return resolution(catalog_id, "create", "approved", notes), catalog_row
def apply_resolution_to_queue(queue_rows, resolution_lookup):
    """Return copies of *queue_rows* updated from their matching resolutions.

    Rows without a resolution are copied unchanged. For the others the catalog
    id, status, action, notes and ``updated_at`` are taken from the resolution;
    an approved resolution also fills in a missing ``created_at``.
    """
    today_text = str(date.today())

    def merged(row):
        updated = dict(row)
        resolution = resolution_lookup.get(row["normalized_item_id"], {})
        if not resolution:
            return updated
        reviewed_at = resolution.get("reviewed_at", today_text)
        updated.update(
            {
                "catalog_id": resolution.get("catalog_id", ""),
                "status": resolution.get("status", row.get("status", "pending")),
                "resolution_action": resolution.get("resolution_action", ""),
                "resolution_notes": resolution.get("resolution_notes", ""),
                "updated_at": reviewed_at,
            }
        )
        if resolution.get("status") == "approved":
            updated["created_at"] = updated.get("created_at") or reviewed_at
        return updated

    return [merged(row) for row in queue_rows]
def link_rows_from_state(link_lookup):
    """Return the link rows held in *link_lookup*, ordered by ``normalized_item_id``."""
    ordered_links = list(link_lookup.values())
    ordered_links.sort(key=lambda link: link["normalized_item_id"])
    return ordered_links
@click.command()
@click.option("--giant-items-enriched-csv", default="data/giant-web/normalized_items.csv", show_default=True)
@click.option("--costco-items-enriched-csv", default="data/costco-web/normalized_items.csv", show_default=True)
@click.option("--giant-orders-csv", default="data/giant-web/collected_orders.csv", show_default=True)
@click.option("--costco-orders-csv", default="data/costco-web/collected_orders.csv", show_default=True)
@click.option("--purchases-csv", default="data/analysis/purchases.csv", show_default=True)
@click.option("--queue-csv", default="data/review/review_queue.csv", show_default=True)
@click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True)
@click.option("--catalog-csv", default="data/review/catalog.csv", show_default=True)
@click.option("--links-csv", default="data/review/product_links.csv", show_default=True)
@click.option("--limit", default=0, show_default=True, type=int)
@click.option("--refresh-only", is_flag=True, help="Only rebuild review_queue.csv without prompting.")
def main(
    giant_items_enriched_csv,
    costco_items_enriched_csv,
    giant_orders_csv,
    costco_orders_csv,
    purchases_csv,
    queue_csv,
    resolutions_csv,
    catalog_csv,
    links_csv,
    limit,
    refresh_only,
):
    """Rebuild purchases and the review queue, then prompt through the queue.

    The purchases CSV and queue CSV are always rewritten. With
    ``--refresh-only`` the command stops there. Otherwise each queue row is
    presented in turn, and after every decision the queue, resolutions,
    catalog and product links are written back to disk. ``--limit`` caps the
    number of decisions taken in one run (0 means no cap).
    """
    read_rows = build_purchases.read_optional_csv_rows

    resolution_rows = read_rows(resolutions_csv)
    catalog_rows = build_purchases.merge_catalog_rows(read_rows(catalog_csv), [])
    link_rows = read_rows(links_csv)
    giant_items = read_rows(giant_items_enriched_csv)
    costco_items = read_rows(costco_items_enriched_csv)
    giant_orders = read_rows(giant_orders_csv)
    costco_orders = read_rows(costco_orders_csv)
    purchase_rows, refreshed_link_rows = build_purchases.build_purchase_rows(
        giant_items,
        costco_items,
        giant_orders,
        costco_orders,
        resolution_rows,
        link_rows,
        catalog_rows,
    )
    build_purchases.write_csv_rows(purchases_csv, purchase_rows, build_purchases.PURCHASE_FIELDS)

    link_lookup = build_purchases.load_link_lookup(refreshed_link_rows)
    previous_queue_rows = read_rows(queue_csv)
    queue_rows = build_review_queue(
        purchase_rows,
        resolution_rows,
        refreshed_link_rows,
        catalog_rows,
        previous_queue_rows,
    )
    write_csv_rows(queue_csv, queue_rows, QUEUE_FIELDS)
    click.echo(f"wrote {len(queue_rows)} rows to {queue_csv}")
    if refresh_only:
        return

    print_intro_text()
    resolution_lookup = build_purchases.load_resolution_lookup(resolution_rows)
    catalog_by_id = {entry["catalog_id"]: entry for entry in catalog_rows if entry.get("catalog_id")}
    rows_by_normalized = defaultdict(list)
    for purchase in purchase_rows:
        purchase_item_id = purchase.get("normalized_item_id", "")
        if purchase_item_id:
            rows_by_normalized[purchase_item_id].append(purchase)

    def persist_decisions():
        # Write the three durable review files from the in-memory state.
        save_resolution_rows(
            resolutions_csv,
            sorted(resolution_lookup.values(), key=lambda row: row["normalized_item_id"]),
        )
        save_catalog_rows(catalog_csv, sorted(catalog_by_id.values(), key=lambda row: row["catalog_id"]))
        save_link_rows(links_csv, link_rows_from_state(link_lookup))

    # Iterate over the queue as built above; queue_rows itself is rebound to
    # updated copies after every decision.
    pending_rows = queue_rows
    queue_total = len(pending_rows)
    reviewed = 0
    for index, queue_row in enumerate(pending_rows, start=1):
        if limit and reviewed >= limit:
            break
        related_rows = rows_by_normalized.get(queue_row["normalized_item_id"], [])
        result = prompt_resolution(queue_row, related_rows, purchase_rows, catalog_rows, index, queue_total)
        if result == (None, None):
            break
        resolution_row, catalog_row = result
        normalized_item_id = resolution_row["normalized_item_id"]
        resolution_lookup[normalized_item_id] = resolution_row

        if catalog_row and catalog_row["catalog_id"] not in catalog_by_id:
            catalog_by_id[catalog_row["catalog_id"]] = catalog_row
            catalog_rows.append(catalog_row)

        if resolution_row["status"] == "approved":
            action = resolution_row["resolution_action"]
            if action in {"link", "create"} and resolution_row.get("catalog_id"):
                link_lookup[normalized_item_id] = {
                    "normalized_item_id": normalized_item_id,
                    "catalog_id": resolution_row["catalog_id"],
                    "link_method": f"manual_{resolution_row['resolution_action']}",
                    "link_confidence": "high",
                    "review_status": "approved",
                    "reviewed_by": "",
                    "reviewed_at": resolution_row.get("reviewed_at", ""),
                    "link_notes": resolution_row.get("resolution_notes", ""),
                }
            elif action == "exclude":
                # An excluded item must not keep a product link.
                link_lookup.pop(normalized_item_id, None)

        queue_rows = apply_resolution_to_queue(queue_rows, resolution_lookup)
        write_csv_rows(queue_csv, queue_rows, QUEUE_FIELDS)
        persist_decisions()
        reviewed += 1

    # Final write so the files are refreshed even when no decision was taken.
    persist_decisions()
    click.echo(
        f"saved {len(resolution_lookup)} resolution rows to {resolutions_csv}, "
        f"{len(catalog_by_id)} catalog rows to {catalog_csv}, "
        f"and {len(link_lookup)} product links to {links_csv}"
    )


if __name__ == "__main__":
    main()

View File

@@ -1,5 +0,0 @@
# Thin launcher: pulls `main` from the scraper module and runs it when this
# file is executed directly.
from scraper import main

if __name__ == "__main__":
    main()

738
scrape_costco.py Normal file
View File

@@ -0,0 +1,738 @@
import os
import csv
import json
import time
import re
from pathlib import Path
from calendar import monthrange
from datetime import datetime, timedelta
from dotenv import load_dotenv
import click
from curl_cffi import requests
from browser_session import (
find_firefox_profile_dir,
load_firefox_cookies,
read_firefox_local_storage,
read_firefox_webapps_store,
)
# GraphQL endpoint that graphql_post() sends every summary and detail query to.
BASE_URL = "https://ecom-api.costco.com/ebusiness/order/v1/orders/graphql"
# Value written to the "retailer" column of every flattened order and item row.
RETAILER = "costco"
# GraphQL document used by fetch_summary_windows() to list receipts in a date
# range. It returns the per-type counts (inWarehouse, gasStation, carWash,
# gasAndCarWash) plus a lightweight receipt list: header fields, item numbers,
# tenders, and coupon UPC numbers.
SUMMARY_QUERY = """
query receiptsWithCounts($startDate: String!, $endDate: String!, $documentType: String!, $documentSubType: String!) {
receiptsWithCounts(startDate: $startDate, endDate: $endDate, documentType: $documentType, documentSubType: $documentSubType) {
inWarehouse
gasStation
carWash
gasAndCarWash
receipts {
warehouseName
receiptType
documentType
transactionDateTime
transactionBarcode
warehouseName
transactionType
total
totalItemCount
itemArray {
itemNumber
}
tenderArray {
tenderTypeCode
tenderDescription
amountTender
}
couponArray {
upcnumberCoupon
}
}
}
}
""".strip()
# GraphQL document used by run_collection() to fetch one receipt by barcode.
# It requests the full receipt header, the line items (itemArray), the tenders
# (tenderArray), the tax breakdown (subTaxes), instantSavings and the
# membership number.
DETAIL_QUERY = """
query receiptsWithCounts($barcode: String!, $documentType: String!) {
receiptsWithCounts(barcode: $barcode, documentType: $documentType) {
receipts {
warehouseName
receiptType
documentType
transactionDateTime
transactionDate
companyNumber
warehouseNumber
operatorNumber
warehouseShortName
registerNumber
transactionNumber
transactionType
transactionBarcode
total
warehouseAddress1
warehouseAddress2
warehouseCity
warehouseState
warehouseCountry
warehousePostalCode
totalItemCount
subTotal
taxes
total
invoiceNumber
sequenceNumber
itemArray {
itemNumber
itemDescription01
frenchItemDescription1
itemDescription02
frenchItemDescription2
itemIdentifier
itemDepartmentNumber
unit
amount
taxFlag
merchantID
entryMethod
transDepartmentNumber
fuelUnitQuantity
fuelGradeCode
itemUnitPriceAmount
fuelUomCode
fuelUomDescription
fuelUomDescriptionFr
fuelGradeDescription
fuelGradeDescriptionFr
}
tenderArray {
tenderTypeCode
tenderSubTypeCode
tenderDescription
amountTender
displayAccountNumber
sequenceNumber
approvalNumber
responseCode
tenderTypeName
transactionID
merchantID
entryMethod
tenderAcctTxnNumber
tenderAuthorizationCode
tenderTypeNameFr
tenderEntryMethodDescription
walletType
walletId
storedValueBucket
}
subTaxes {
tax1
tax2
tax3
tax4
aTaxPercent
aTaxLegend
aTaxAmount
aTaxPrintCode
aTaxPrintCodeFR
aTaxIdentifierCode
bTaxPercent
bTaxLegend
bTaxAmount
bTaxPrintCode
bTaxPrintCodeFR
bTaxIdentifierCode
cTaxPercent
cTaxLegend
cTaxAmount
cTaxIdentifierCode
dTaxPercent
dTaxLegend
dTaxAmount
dTaxPrintCode
dTaxPrintCodeFR
dTaxIdentifierCode
uTaxLegend
uTaxAmount
uTaxableAmount
}
instantSavings
membershipNumber
}
}
}
""".strip()
# Column order of the flattened orders CSV. flatten_costco_data() emits one
# dict per receipt with exactly these keys, and run_collection() passes this
# list to write_csv() as the header.
ORDER_FIELDS = [
    "retailer",
    "order_id",
    "order_date",
    "delivery_date",
    "service_type",
    "order_total",
    "payment_method",
    "total_item_count",
    "total_savings",
    "your_savings_total",
    "coupons_discounts_total",
    "store_name",
    "store_number",
    "store_address1",
    "store_city",
    "store_state",
    "store_zipcode",
    "refund_order",
    "ebt_order",
    "raw_history_path",
    "raw_order_path",
]
# Column order of the flattened items CSV. flatten_costco_data() emits one
# dict per receipt line with exactly these keys; several of them (pod_id, upc,
# picked_weight, mvp_savings, reward_savings, coupon_price, image_url) are
# always written as "" for Costco rows.
ITEM_FIELDS = [
    "retailer",
    "order_id",
    "line_no",
    "order_date",
    "retailer_item_id",
    "pod_id",
    "item_name",
    "upc",
    "category_id",
    "category",
    "qty",
    "unit",
    "unit_price",
    "line_total",
    "picked_weight",
    "mvp_savings",
    "reward_savings",
    "coupon_savings",
    "coupon_price",
    "image_url",
    "raw_order_path",
    "is_discount_line",
    "is_coupon_line",
]
# Origin filter handed to read_firefox_local_storage() / read_firefox_webapps_store().
COSTCO_STORAGE_ORIGIN = "costco.com"
# Browser-storage key whose value is turned into the "Bearer <token>" auth header.
COSTCO_ID_TOKEN_STORAGE_KEY = "idToken"
# Browser-storage key used as the fallback for the costco-x-wcs-clientId header.
COSTCO_CLIENT_ID_STORAGE_KEY = "clientID"
def load_config():
    """Read the Costco credential settings from the environment.

    ``load_dotenv()`` is called first, then each setting is looked up with
    ``os.getenv``.

    Returns:
        dict with keys ``"authorization"``, ``"client_id"`` and
        ``"client_identifier"``; each value is the stripped environment
        variable, or ``""`` when the variable is unset.
    """
    load_dotenv()
    env_names = {
        "authorization": "COSTCO_X_AUTHORIZATION",
        "client_id": "COSTCO_X_WCS_CLIENTID",
        "client_identifier": "COSTCO_CLIENT_IDENTIFIER",
    }
    return {key: os.getenv(name, "").strip() for key, name in env_names.items()}
def build_headers(auth_headers):
    """Return the HTTP headers for Costco order API requests.

    A fixed set of browser-like headers is created first and then overlaid
    with ``auth_headers``, so caller-supplied values win on key collisions.

    Args:
        auth_headers: Extra headers to merge in (anything ``dict.update`` accepts).

    Returns:
        A new dict; ``auth_headers`` itself is not modified.
    """
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:148.0) "
        "Gecko/20100101 Firefox/148.0"
    )
    merged = {
        "accept": "*/*",
        "content-type": "application/json-patch+json",
        "costco.service": "restOrders",
        "costco.env": "ecom",
        "origin": "https://www.costco.com",
        "referer": "https://www.costco.com/",
        "user-agent": user_agent,
    }
    merged.update(auth_headers)
    return merged
def load_costco_browser_headers(profile_dir, authorization, client_id, client_identifier):
    """Assemble the three Costco auth headers from explicit values and browser storage.

    Explicit arguments take precedence. A missing ``authorization`` is rebuilt
    as ``"Bearer <idToken>"`` from Firefox storage, and a missing ``client_id``
    falls back to the stored ``clientID``. ``client_identifier`` has no
    storage fallback.

    Args:
        profile_dir: Firefox profile directory passed to the storage readers.
        authorization: Full authorization header value, or empty/None.
        client_id: Value for ``costco-x-wcs-clientId``, or empty/None.
        client_identifier: Value for ``client-identifier``, or empty/None.

    Returns:
        dict with keys ``costco-x-authorization``, ``costco-x-wcs-clientId``
        and ``client-identifier``.

    Raises:
        click.ClickException: if no auth token can be found, or if either
            client id is still missing after the fallbacks.
    """
    local_storage = read_firefox_local_storage(profile_dir, COSTCO_STORAGE_ORIGIN)
    webapps_store = read_firefox_webapps_store(profile_dir, COSTCO_STORAGE_ORIGIN)

    def stored(key):
        # Local storage wins; the webapps store is consulted only when the
        # local-storage value is empty.
        return local_storage.get(key, "").strip() or webapps_store.get(key, "").strip()

    auth_header = authorization.strip() if authorization else ""
    client_id = client_id.strip() if client_id else client_id
    client_identifier = client_identifier.strip() if client_identifier else client_identifier

    if not auth_header:
        id_token = stored(COSTCO_ID_TOKEN_STORAGE_KEY)
        if id_token:
            auth_header = f"Bearer {id_token}"
    if not client_id:
        client_id = stored(COSTCO_CLIENT_ID_STORAGE_KEY)

    if not auth_header:
        raise click.ClickException(
            "could not find Costco auth token; set COSTCO_X_AUTHORIZATION or load Firefox idToken"
        )
    if not (client_id and client_identifier):
        raise click.ClickException(
            "missing Costco client ids; set COSTCO_X_WCS_CLIENTID and COSTCO_CLIENT_IDENTIFIER"
        )
    return {
        "costco-x-authorization": auth_header,
        "costco-x-wcs-clientId": client_id,
        "client-identifier": client_identifier,
    }
def build_session(profile_dir, auth_headers):
    """Create an HTTP session primed with Firefox cookies and Costco headers.

    Args:
        profile_dir: Firefox profile directory the ".costco.com" cookies are
            loaded from.
        auth_headers: Auth headers (as returned by
            ``load_costco_browser_headers``) to send with every request.

    Returns:
        The configured ``requests.Session``.
    """
    session = requests.Session()
    session.cookies.update(load_firefox_cookies(".costco.com", profile_dir))
    # build_headers() already overlays auth_headers on the browser defaults,
    # so one update installs everything; no second pass is needed.
    session.headers.update(build_headers(auth_headers))
    return session
def graphql_post(session, query, variables, attempts=3, retry_delay=3):
    """POST a GraphQL query to ``BASE_URL``, retrying on failure.

    Args:
        session: Session object exposing ``post(url, json=..., impersonate=...,
            timeout=...)``.
        query: GraphQL document text.
        variables: Variables dict sent alongside the query.
        attempts: Maximum number of tries (default 3).
        retry_delay: Seconds to wait between tries (default 3).

    Returns:
        The decoded JSON body of the first response with status 200.

    Raises:
        Whatever ``raise_for_status()`` raises for the last response received,
        if it was an HTTP error; otherwise ``RuntimeError`` (chained to the
        last exception seen, if any).
    """
    last_response = None
    last_error = None
    for attempt in range(1, attempts + 1):
        try:
            response = session.post(
                BASE_URL,
                json={"query": query, "variables": variables},
                impersonate="firefox",
                timeout=30,
            )
            last_response = response
            if response.status_code == 200:
                return response.json()
            click.echo(
                f"retry {attempt}/{attempts} status={response.status_code} body={response.text[:500]}"
            )
        # Deliberately broad: any transport or decode failure is retried.
        except Exception as exc:  # pragma: no cover - network error path
            last_error = exc
            click.echo(f"retry {attempt}/{attempts} error={exc}")
        # Only pause when another attempt follows; sleeping after the final
        # failure would just delay the error.
        if attempt < attempts:
            time.sleep(retry_delay)
    if last_response is not None:
        last_response.raise_for_status()
    raise RuntimeError("failed to fetch Costco GraphQL payload") from last_error
def safe_filename(value):
    """Return ``str(value)`` with every run of filename-unsafe characters replaced by "-".

    The unsafe characters are ``< > : " / \\ | ? *``; consecutive ones
    collapse into a single dash.
    """
    unsafe_run = re.compile(r'[<>:"/\\|?*]+')
    return unsafe_run.sub("-", str(value))
def summary_receipts(payload):
    """Return the receipt list from a summary GraphQL response.

    Missing *or null* levels (``data``, ``receiptsWithCounts``, ``receipts``)
    all yield an empty list, so an error response carrying ``"data": null``
    does not crash callers that iterate the result.
    """
    data = payload.get("data") or {}
    with_counts = data.get("receiptsWithCounts") or {}
    return with_counts.get("receipts") or []
def detail_receipts(payload):
    """Return the receipt list from a detail GraphQL response.

    Missing *or null* levels (``data``, ``receiptsWithCounts``, ``receipts``)
    all yield an empty list, so an error response carrying ``"data": null``
    does not crash callers that iterate the result.
    """
    data = payload.get("data") or {}
    with_counts = data.get("receiptsWithCounts") or {}
    return with_counts.get("receipts") or []
def summary_counts(payload):
    """Return the per-type receipt counts from a summary GraphQL response.

    Returns:
        dict with keys ``inWarehouse``, ``gasStation``, ``carWash`` and
        ``gasAndCarWash``. A count that is missing or null is reported as 0,
        as are all counts when ``data`` or ``receiptsWithCounts`` is missing
        or null.
    """
    counts = (payload.get("data") or {}).get("receiptsWithCounts") or {}
    names = ("inWarehouse", "gasStation", "carWash", "gasAndCarWash")
    return {name: counts.get(name, 0) or 0 for name in names}
def parse_cli_date(value):
    """Parse a ``month/day/year`` string (``%m/%d/%Y``) into a ``date``.

    Raises:
        ValueError: if ``value`` does not match the format.
    """
    parsed = datetime.strptime(value, "%m/%d/%Y")
    return parsed.date()
def format_cli_date(value):
    """Format a date as ``M/DD/YYYY``: month unpadded, day zero-padded to two digits."""
    month, day, year = value.month, value.day, value.year
    return f"{month}/{day:02d}/{year}"
def subtract_months(value, months):
    """Return ``value`` shifted back by ``months`` calendar months.

    The day of month is clamped to the last day of the target month
    (e.g. March 31 minus one month is the last day of February).

    Args:
        value: A ``date`` (or ``datetime``) to shift.
        months: Number of months to go back. Zero returns the same date;
            a negative number moves forward.

    Returns:
        A new object of the same type as ``value``.
    """
    # Work with a zero-based absolute month index so divmod handles year
    # rollover in both directions.
    absolute_month = value.year * 12 + (value.month - 1) - months
    year, month_index = divmod(absolute_month, 12)
    month = month_index + 1
    day = min(value.day, monthrange(year, month)[1])
    return value.replace(year=year, month=month, day=day)
def resolve_date_range(months_back, today=None):
    """Return the ``(start, end)`` date strings covering the last ``months_back`` months.

    Args:
        months_back: Number of months to reach back; must be at least 1.
        today: End date to use; defaults to the current local date.

    Returns:
        Tuple of two strings produced by ``format_cli_date``.

    Raises:
        click.ClickException: if ``months_back`` is less than 1.
    """
    if months_back < 1:
        raise click.ClickException("months-back must be at least 1")
    range_end = today if today else datetime.now().date()
    range_start = subtract_months(range_end, months_back)
    return tuple(format_cli_date(day) for day in (range_start, range_end))
def build_date_windows(start_date, end_date, window_days):
    """Split an inclusive date range into consecutive windows of at most ``window_days`` days.

    Args:
        start_date: First day, as a string accepted by ``parse_cli_date``.
        end_date: Last day (inclusive), same format.
        window_days: Maximum number of days per window; must be at least 1.

    Returns:
        List of ``{"startDate": ..., "endDate": ...}`` dicts in chronological
        order, with values formatted by ``format_cli_date``. Windows do not
        overlap and the last one ends exactly on ``end_date``.

    Raises:
        click.ClickException: if the range is reversed or ``window_days`` < 1.
    """
    first_day = parse_cli_date(start_date)
    last_day = parse_cli_date(end_date)
    if last_day < first_day:
        raise click.ClickException("end-date must be on or after start-date")
    if window_days < 1:
        raise click.ClickException("window-days must be at least 1")

    # A window of N days spans N - 1 days beyond its first day.
    span = timedelta(days=window_days - 1)
    one_day = timedelta(days=1)
    windows = []
    cursor = first_day
    while cursor <= last_day:
        window_end = min(cursor + span, last_day)
        window = {
            "startDate": format_cli_date(cursor),
            "endDate": format_cli_date(window_end),
        }
        windows.append(window)
        cursor = window_end + one_day
    return windows
def unique_receipts(receipts):
    """Drop duplicate receipts, identified by ``receipt_key``.

    Receipts whose key is empty are discarded. When a key repeats, the last
    receipt seen is kept, in the position where the key first appeared.
    """
    keyed = ((receipt_key(receipt), receipt) for receipt in receipts)
    deduped = {key: receipt for key, receipt in keyed if key}
    return list(deduped.values())
def receipt_key(receipt):
    """Return the identity key ``"<barcode>::<transactionDateTime>"`` for a receipt.

    Returns ``""`` when the receipt has no (or an empty) ``transactionBarcode``.
    A missing ``transactionDateTime`` contributes an empty string.
    """
    barcode = receipt.get("transactionBarcode", "")
    if not barcode:
        return ""
    timestamp = receipt.get("transactionDateTime", "")
    return f"{barcode}::{timestamp}"
def fetch_summary_windows(
    session,
    start_date,
    end_date,
    document_type,
    document_sub_type,
    window_days,
):
    """Fetch receipt summaries window by window and merge them into one payload.

    The date range is split with ``build_date_windows``; each window is
    queried with ``SUMMARY_QUERY``. A warning is echoed to stderr whenever the
    reported ``inWarehouse`` count differs from the number of returned
    receipts whose ``receiptType`` is ``"In-Warehouse"``.

    Returns:
        ``(aggregate_payload, requests_metadata)`` where ``aggregate_payload``
        mimics a single summary response (counts summed over all windows,
        receipts de-duplicated with ``unique_receipts``) and
        ``requests_metadata`` holds one dict per window with the request
        variables, returned/reported counts and a ``countMismatch`` flag.
    """
    count_names = ("inWarehouse", "gasStation", "carWash", "gasAndCarWash")
    requests_metadata = []
    combined_receipts = []

    for window in build_date_windows(start_date, end_date, window_days):
        variables = {
            "startDate": window["startDate"],
            "endDate": window["endDate"],
            "text": "custom",
            "documentType": document_type,
            "documentSubType": document_sub_type,
        }
        payload = graphql_post(session, SUMMARY_QUERY, variables)
        receipts = summary_receipts(payload)
        counts = summary_counts(payload)
        warehouse_count = len(
            [receipt for receipt in receipts if receipt.get("receiptType") == "In-Warehouse"]
        )
        mismatch = counts["inWarehouse"] != warehouse_count

        # Key order matters: this dict is later dumped to JSON as-is.
        entry = dict(variables)
        entry["returnedReceipts"] = len(receipts)
        entry["returnedInWarehouseReceipts"] = warehouse_count
        for name in count_names:
            entry[name] = counts[name]
        entry["countMismatch"] = mismatch
        requests_metadata.append(entry)

        if mismatch:
            click.echo(
                (
                    "warning: summary count mismatch for "
                    f"{window['startDate']} to {window['endDate']}: "
                    f"inWarehouse={counts['inWarehouse']} "
                    f"returnedInWarehouseReceipts={warehouse_count}"
                ),
                err=True,
            )
        combined_receipts.extend(receipts)

    merged = {
        name: sum(row[name] for row in requests_metadata) for name in count_names
    }
    merged["receipts"] = unique_receipts(combined_receipts)
    aggregate_payload = {"data": {"receiptsWithCounts": merged}}
    return aggregate_payload, requests_metadata
def flatten_costco_data(summary_payload, detail_payloads, raw_dir):
    """Flatten Costco receipt payloads into order rows and item rows.

    Args:
        summary_payload: Summary response; supplies the tender descriptions
            and coupon UPC numbers for each receipt (matched by
            ``receipt_key``).
        detail_payloads: Iterable of detail responses, each holding one or
            more receipts.
        raw_dir: ``pathlib.Path`` of the raw JSON directory. Only used to
            compute the ``raw_history_path`` / ``raw_order_path`` values;
            nothing is written here.

    Returns:
        ``(orders, items)``: two lists of dicts keyed like ``ORDER_FIELDS``
        and ``ITEM_FIELDS``.

    Raises:
        KeyError: if a detail receipt lacks ``transactionBarcode``.
    """
    summary_lookup = {}
    for summary_receipt in summary_receipts(summary_payload):
        key = receipt_key(summary_receipt)
        if key:
            summary_lookup[key] = summary_receipt

    raw_history_path = (raw_dir / "summary.json").as_posix()
    orders = []
    items = []
    for detail_payload in detail_payloads:
        for receipt in detail_receipts(detail_payload):
            order_id = receipt["transactionBarcode"]
            receipt_id = receipt_key(receipt)
            summary_row = summary_lookup.get(receipt_id, {})
            coupon_numbers = {
                row.get("upcnumberCoupon", "")
                for row in summary_row.get("couponArray", []) or []
                if row.get("upcnumberCoupon")
            }
            raw_order_path = (
                raw_dir / f"{safe_filename(receipt_id or order_id)}.json"
            ).as_posix()
            order_date = receipt.get("transactionDate", "")
            # The receipt exposes a single savings figure; it is copied into
            # all three savings columns.
            instant_savings = stringify(receipt.get("instantSavings"))
            orders.append(
                {
                    "retailer": RETAILER,
                    "order_id": order_id,
                    "order_date": order_date,
                    "delivery_date": order_date,
                    "service_type": receipt.get("receiptType", ""),
                    "order_total": stringify(receipt.get("total")),
                    "payment_method": compact_join(
                        summary_row.get("tenderArray", []) or [], "tenderDescription"
                    ),
                    "total_item_count": stringify(receipt.get("totalItemCount")),
                    "total_savings": instant_savings,
                    "your_savings_total": instant_savings,
                    "coupons_discounts_total": instant_savings,
                    "store_name": receipt.get("warehouseName", ""),
                    "store_number": stringify(receipt.get("warehouseNumber")),
                    "store_address1": receipt.get("warehouseAddress1", ""),
                    "store_city": receipt.get("warehouseCity", ""),
                    "store_state": receipt.get("warehouseState", ""),
                    "store_zipcode": receipt.get("warehousePostalCode", ""),
                    "refund_order": "false",
                    "ebt_order": "false",
                    "raw_history_path": raw_history_path,
                    "raw_order_path": raw_order_path,
                }
            )
            # `or []` (not a .get default) so a receipt whose itemArray is
            # present but null yields no rows instead of raising TypeError,
            # matching the couponArray / tenderArray handling above.
            for line_no, item in enumerate(receipt.get("itemArray") or [], start=1):
                item_number = stringify(item.get("itemNumber"))
                description = join_descriptions(
                    item.get("itemDescription01"), item.get("itemDescription02")
                )
                is_discount = is_discount_line(item)
                # A discount line counts as a coupon when its item number is
                # one of the receipt's coupon UPCs or its text starts with "/".
                is_coupon = is_discount and (
                    item_number in coupon_numbers
                    or description.startswith("/")
                )
                items.append(
                    {
                        "retailer": RETAILER,
                        "order_id": order_id,
                        "line_no": str(line_no),
                        "order_date": order_date,
                        "retailer_item_id": item_number,
                        "pod_id": "",
                        "item_name": description,
                        "upc": "",
                        "category_id": stringify(item.get("itemDepartmentNumber")),
                        "category": stringify(item.get("transDepartmentNumber")),
                        "qty": stringify(item.get("unit")),
                        "unit": stringify(item.get("itemIdentifier")),
                        "unit_price": stringify(item.get("itemUnitPriceAmount")),
                        "line_total": stringify(item.get("amount")),
                        "picked_weight": "",
                        "mvp_savings": "",
                        "reward_savings": "",
                        "coupon_savings": stringify(item.get("amount") if is_coupon else ""),
                        "coupon_price": "",
                        "image_url": "",
                        "raw_order_path": raw_order_path,
                        "is_discount_line": "true" if is_discount else "false",
                        "is_coupon_line": "true" if is_coupon else "false",
                    }
                )
    return orders, items
def join_descriptions(*parts):
    """Join the non-empty description parts with single spaces.

    Falsy parts (``None``, ``""``) are skipped, each remaining part is
    converted with ``str`` and stripped, and parts that are empty after
    stripping are dropped so the result never contains doubled spaces.
    """
    stripped = (str(part).strip() for part in parts if part)
    return " ".join(text for text in stripped if text)
def compact_join(rows, field):
    """Join the truthy ``field`` values of ``rows`` with ``" | "``.

    Rows where ``field`` is missing or falsy are skipped; kept values are
    converted with ``str`` and stripped.
    """
    present = (row.get(field, "") for row in rows if row.get(field))
    return " | ".join(str(value).strip() for value in present)
def is_discount_line(item):
    """Return True when a receipt line looks like a discount.

    A line qualifies when its ``amount`` is negative, its ``unit`` is
    negative, or its joined description starts with ``"/"``. Values that
    cannot be converted to ``float`` are treated as 0.
    """

    def as_number(raw):
        # Non-numeric or missing values count as zero.
        try:
            return float(raw)
        except (TypeError, ValueError):
            return 0.0

    description = join_descriptions(
        item.get("itemDescription01"), item.get("itemDescription02")
    )
    if as_number(item.get("amount")) < 0:
        return True
    if as_number(item.get("unit")) < 0:
        return True
    return description.startswith("/")
def stringify(value):
    """Return ``str(value)``, mapping ``None`` to the empty string."""
    return "" if value is None else str(value)
def write_json(path, payload):
    """Write ``payload`` to ``path`` as UTF-8 JSON indented by two spaces.

    Missing parent directories are created first.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2)
    path.write_text(serialized, encoding="utf-8")
def write_csv(path, rows, fieldnames):
    """Write ``rows`` (dicts) to ``path`` as a UTF-8 CSV with a header row.

    Missing parent directories are created first; an existing file is
    overwritten.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        for row in rows:
            writer.writerow(row)
# Legacy CLI wrapper: prints a notice pointing at collect_costco_web.py and
# then forwards every option unchanged to run_collection(), which therefore
# writes the default orders.csv / items.csv file names.
# NOTE: documented with comments rather than a docstring on purpose; click
# would surface a docstring as the command's --help text.
@click.command()
@click.option(
    "--outdir",
    default="costco_output",
    show_default=True,
    help="Output directory for Costco raw and flattened files.",
)
@click.option(
    "--document-type",
    default="all",
    show_default=True,
    help="Summary document type.",
)
@click.option(
    "--document-sub-type",
    default="all",
    show_default=True,
    help="Summary document sub type.",
)
@click.option(
    "--window-days",
    default=92,
    show_default=True,
    type=int,
    help="Maximum number of days to request per summary window.",
)
@click.option(
    "--months-back",
    default=36,
    show_default=True,
    type=int,
    help="How many months of receipts to enumerate back from today.",
)
@click.option(
    "--firefox-profile-dir",
    default=None,
    help="Firefox profile directory to use for cookies and session storage.",
)
def main(
    outdir,
    document_type,
    document_sub_type,
    window_days,
    months_back,
    firefox_profile_dir,
):
    click.echo("legacy entrypoint: prefer collect_costco_web.py for data-model outputs")
    # Delegate the actual work; the output file names fall back to
    # run_collection's defaults.
    run_collection(
        outdir=outdir,
        document_type=document_type,
        document_sub_type=document_sub_type,
        window_days=window_days,
        months_back=months_back,
        firefox_profile_dir=firefox_profile_dir,
    )
def run_collection(
    outdir,
    document_type,
    document_sub_type,
    window_days,
    months_back,
    firefox_profile_dir,
    orders_filename="orders.csv",
    items_filename="items.csv",
):
    """Download Costco receipts and write raw JSON plus flattened CSVs.

    Steps: resolve the Firefox profile (prompting if auto-discovery fails),
    build an authenticated session, fetch the windowed receipt summary, fetch
    each receipt's detail payload, then flatten everything into the orders
    and items CSVs.

    Args:
        outdir: Output directory; raw payloads go to ``<outdir>/raw``.
        document_type: Summary ``documentType`` filter.
        document_sub_type: Summary ``documentSubType`` filter.
        window_days: Maximum days per summary request window.
        months_back: How many months back from today to enumerate.
        firefox_profile_dir: Explicit Firefox profile path, or a falsy value
            to auto-discover it.
        orders_filename: File name of the orders CSV inside ``outdir``.
        items_filename: File name of the items CSV inside ``outdir``.
    """
    outdir = Path(outdir)
    raw_dir = outdir / "raw"
    config = load_config()

    if firefox_profile_dir:
        profile_dir = Path(firefox_profile_dir)
    else:
        try:
            profile_dir = find_firefox_profile_dir()
        except Exception:
            # Best effort: any discovery failure falls back to asking the user.
            profile_dir = click.prompt(
                "Firefox profile dir",
                type=click.Path(exists=True, file_okay=False, path_type=Path),
            )

    auth_headers = load_costco_browser_headers(
        profile_dir,
        authorization=config["authorization"],
        client_id=config["client_id"],
        client_identifier=config["client_identifier"],
    )
    session = build_session(profile_dir, auth_headers)
    click.echo(
        "session bootstrap: "
        f"cookies={True} "
        f"authorization={bool(auth_headers.get('costco-x-authorization'))} "
        f"client_id={bool(auth_headers.get('costco-x-wcs-clientId'))} "
        f"client_identifier={bool(auth_headers.get('client-identifier'))}"
    )

    range_start, range_end = resolve_date_range(months_back)
    summary_payload, request_metadata = fetch_summary_windows(
        session,
        range_start,
        range_end,
        document_type,
        document_sub_type,
        window_days,
    )
    write_json(raw_dir / "summary.json", summary_payload)
    write_json(raw_dir / "summary_requests.json", request_metadata)

    detail_payloads = []
    for receipt in summary_receipts(summary_payload):
        barcode = receipt["transactionBarcode"]
        click.echo(f"fetching {barcode}")
        detail_payload = graphql_post(
            session,
            DETAIL_QUERY,
            {"barcode": barcode, "documentType": "warehouse"},
        )
        detail_payloads.append(detail_payload)
        # Raw files are named after the receipt key, falling back to the
        # bare barcode when no key can be built.
        raw_name = safe_filename(receipt_key(receipt) or barcode)
        write_json(raw_dir / f"{raw_name}.json", detail_payload)

    orders, items = flatten_costco_data(summary_payload, detail_payloads, raw_dir)
    write_csv(outdir / orders_filename, orders, ORDER_FIELDS)
    write_csv(outdir / items_filename, items, ITEM_FIELDS)
    click.echo(f"wrote {len(orders)} orders and {len(items)} item rows to {outdir}")
# Run the legacy click command only when this file is executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -3,16 +3,20 @@ import json
import os import os
import time import time
from pathlib import Path from pathlib import Path
from dotenv import load_dotenv
import browser_cookie3
from curl_cffi import requests
import click import click
from dotenv import load_dotenv
from curl_cffi import requests
from browser_session import find_firefox_profile_dir, load_firefox_cookies
BASE = "https://giantfood.com" BASE = "https://giantfood.com"
ACCOUNT_PAGE = f"{BASE}/account/history/invoice/in-store" ACCOUNT_PAGE = f"{BASE}/account/history/invoice/in-store"
RETAILER = "giant"
ORDER_FIELDS = [ ORDER_FIELDS = [
"retailer",
"order_id", "order_id",
"order_date", "order_date",
"delivery_date", "delivery_date",
@@ -31,12 +35,16 @@ ORDER_FIELDS = [
"store_zipcode", "store_zipcode",
"refund_order", "refund_order",
"ebt_order", "ebt_order",
"raw_history_path",
"raw_order_path",
] ]
ITEM_FIELDS = [ ITEM_FIELDS = [
"retailer",
"order_id", "order_id",
"order_date", "order_date",
"line_no", "line_no",
"retailer_item_id",
"pod_id", "pod_id",
"item_name", "item_name",
"upc", "upc",
@@ -51,6 +59,10 @@ ITEM_FIELDS = [
"reward_savings", "reward_savings",
"coupon_savings", "coupon_savings",
"coupon_price", "coupon_price",
"image_url",
"raw_order_path",
"is_discount_line",
"is_coupon_line",
] ]
@@ -65,8 +77,9 @@ def load_config():
def build_session(): def build_session():
profile_dir = find_firefox_profile_dir()
session = requests.Session() session = requests.Session()
session.cookies.update(browser_cookie3.firefox(domain_name="giantfood.com")) session.cookies.update(load_firefox_cookies("giantfood.com", profile_dir))
session.headers.update( session.headers.update(
{ {
"user-agent": ( "user-agent": (
@@ -127,18 +140,21 @@ def get_order_detail(session, user_id, order_id):
return response.json() return response.json()
def flatten_orders(history, details): def flatten_orders(history, details, history_path=None, raw_dir=None):
orders = [] orders = []
items = [] items = []
history_lookup = {record["orderId"]: record for record in history.get("records", [])} history_lookup = {record["orderId"]: record for record in history.get("records", [])}
history_path_value = history_path.as_posix() if history_path else ""
for detail in details: for detail in details:
order_id = str(detail["orderId"]) order_id = str(detail["orderId"])
history_row = history_lookup.get(detail["orderId"], {}) history_row = history_lookup.get(detail["orderId"], {})
pickup = detail.get("pup", {}) pickup = detail.get("pup", {})
raw_order_path = (raw_dir / f"{order_id}.json").as_posix() if raw_dir else ""
orders.append( orders.append(
{ {
"retailer": RETAILER,
"order_id": order_id, "order_id": order_id,
"order_date": detail.get("orderDate"), "order_date": detail.get("orderDate"),
"delivery_date": detail.get("deliveryDate"), "delivery_date": detail.get("deliveryDate"),
@@ -157,15 +173,19 @@ def flatten_orders(history, details):
"store_zipcode": pickup.get("storeZipcode"), "store_zipcode": pickup.get("storeZipcode"),
"refund_order": detail.get("refundOrder"), "refund_order": detail.get("refundOrder"),
"ebt_order": detail.get("ebtOrder"), "ebt_order": detail.get("ebtOrder"),
"raw_history_path": history_path_value,
"raw_order_path": raw_order_path,
} }
) )
for line_no, item in enumerate(detail.get("items", []), start=1): for line_no, item in enumerate(detail.get("items", []), start=1):
items.append( items.append(
{ {
"retailer": RETAILER,
"order_id": order_id, "order_id": order_id,
"order_date": detail.get("orderDate"), "order_date": detail.get("orderDate"),
"line_no": str(line_no), "line_no": str(line_no),
"retailer_item_id": "",
"pod_id": item.get("podId"), "pod_id": item.get("podId"),
"item_name": item.get("itemName"), "item_name": item.get("itemName"),
"upc": item.get("primUpcCd"), "upc": item.get("primUpcCd"),
@@ -180,6 +200,10 @@ def flatten_orders(history, details):
"reward_savings": item.get("rewardSavings"), "reward_savings": item.get("rewardSavings"),
"coupon_savings": item.get("couponSavings"), "coupon_savings": item.get("couponSavings"),
"coupon_price": item.get("couponPrice"), "coupon_price": item.get("couponPrice"),
"image_url": "",
"raw_order_path": raw_order_path,
"is_discount_line": "false",
"is_coupon_line": "false",
} }
) )
@@ -266,6 +290,18 @@ def write_json(path, payload):
help="Delay between order detail requests.", help="Delay between order detail requests.",
) )
def main(user_id, loyalty, outdir, sleep_seconds): def main(user_id, loyalty, outdir, sleep_seconds):
click.echo("legacy entrypoint: prefer collect_giant_web.py for data-model outputs")
run_collection(user_id, loyalty, outdir, sleep_seconds)
def run_collection(
user_id,
loyalty,
outdir,
sleep_seconds,
orders_filename="orders.csv",
items_filename="items.csv",
):
config = load_config() config = load_config()
user_id = user_id or config["user_id"] or click.prompt("Giant user id", type=str) user_id = user_id or config["user_id"] or click.prompt("Giant user id", type=str)
loyalty = loyalty or config["loyalty"] or click.prompt( loyalty = loyalty or config["loyalty"] or click.prompt(
@@ -276,66 +312,55 @@ def main(user_id, loyalty, outdir, sleep_seconds):
rawdir = outdir / "raw" rawdir = outdir / "raw"
rawdir.mkdir(parents=True, exist_ok=True) rawdir.mkdir(parents=True, exist_ok=True)
orders_csv = outdir / "orders.csv" orders_csv = outdir / orders_filename
items_csv = outdir / "items.csv" items_csv = outdir / items_filename
existing_order_ids = read_existing_order_ids(orders_csv)
click.echo("Using cookies from your current Firefox profile.")
click.echo(f"Open Giant here, confirm you're logged in, then return: {ACCOUNT_PAGE}")
click.pause(info="Press any key once Giant is open and logged in")
session = build_session() session = build_session()
click.echo("Fetching order history...")
history = get_history(session, user_id, loyalty) history = get_history(session, user_id, loyalty)
write_json(rawdir / "history.json", history) history_path = rawdir / "history.json"
write_json(history_path, history)
records = history.get("records", []) records = history.get("records", [])
click.echo(f"History returned {len(records)} visits.") click.echo(f"history returned {len(records)} visits; Giant exposes only the most recent 50")
unseen_records = [
record
for record in records
if stringify(record.get("orderId")) not in existing_order_ids
]
click.echo( click.echo(
"Note: Giant appears to expose only the most recent 50 visits, " f"found {len(unseen_records)} unseen visits "
"so run this periodically if you want full continuity." f"({len(existing_order_ids)} already stored)"
) )
history_order_ids = [str(record["orderId"]) for record in records]
existing_order_ids = read_existing_order_ids(orders_csv)
new_order_ids = [order_id for order_id in history_order_ids if order_id not in existing_order_ids]
click.echo(f"Existing orders in csv: {len(existing_order_ids)}")
click.echo(f"New orders to fetch: {len(new_order_ids)}")
if not new_order_ids:
click.echo("No new orders found. Done.")
return
details = [] details = []
for order_id in new_order_ids: for index, record in enumerate(unseen_records, start=1):
click.echo(f"Fetching {order_id}") order_id = stringify(record.get("orderId"))
click.echo(f"[{index}/{len(unseen_records)}] fetching {order_id}")
detail = get_order_detail(session, user_id, order_id) detail = get_order_detail(session, user_id, order_id)
details.append(detail)
write_json(rawdir / f"{order_id}.json", detail) write_json(rawdir / f"{order_id}.json", detail)
details.append(detail)
if index < len(unseen_records):
time.sleep(sleep_seconds) time.sleep(sleep_seconds)
click.echo("Flattening new data...") orders, items = flatten_orders(history, details, history_path=history_path, raw_dir=rawdir)
orders, items = flatten_orders(history, details) merged_orders = append_dedup(
all_orders = append_dedup(
orders_csv, orders_csv,
orders, orders,
subset=["order_id"], subset=["order_id"],
fieldnames=ORDER_FIELDS, fieldnames=ORDER_FIELDS,
) )
all_items = append_dedup( merged_items = append_dedup(
items_csv, items_csv,
items, items,
subset=["order_id", "line_no", "item_name", "upc", "line_total"], subset=["order_id", "line_no"],
fieldnames=ITEM_FIELDS, fieldnames=ITEM_FIELDS,
) )
click.echo(
click.echo("Done.") f"wrote {len(orders)} new orders / {len(items)} new items "
click.echo(f"Orders csv: {orders_csv}") f"({len(merged_orders)} total orders, {len(merged_items)} total items)"
click.echo(f"Items csv: {items_csv}") )
click.echo(f"Total orders stored: {len(all_orders)}")
click.echo(f"Total item rows stored: {len(all_items)}")
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -0,0 +1,149 @@
import csv
import tempfile
import unittest
from pathlib import Path
import analyze_purchases
class AnalyzePurchasesTests(unittest.TestCase):
    """End-to-end check of the analyze_purchases command against a tiny fixture."""

    # Column order of the purchases.csv fixture.
    _FIELDNAMES = [
        "purchase_date",
        "retailer",
        "order_id",
        "catalog_id",
        "catalog_name",
        "category",
        "product_type",
        "net_line_total",
        "line_total",
        "normalized_quantity",
        "normalized_quantity_unit",
        "effective_price",
        "effective_price_unit",
        "store_name",
        "store_number",
        "store_city",
        "store_state",
        "is_fee",
        "is_discount_line",
        "is_coupon_line",
    ]

    # One tuple per fixture row, values listed in _FIELDNAMES order:
    # two lines of one Giant visit and a single-line Costco visit.
    _FIXTURE_ROWS = [
        (
            "2026-03-01", "giant", "g1", "cat_banana", "BANANA", "produce", "banana",
            "1.29", "1.29", "2.19", "lb", "0.589", "lb",
            "Giant", "42", "Springfield", "VA", "false", "false", "false",
        ),
        (
            "2026-03-01", "giant", "g1", "cat_ice", "ICE", "frozen", "ice",
            "3.50", "3.50", "20", "lb", "0.175", "lb",
            "Giant", "42", "Springfield", "VA", "false", "false", "false",
        ),
        (
            "2026-03-02", "costco", "c1", "cat_banana", "BANANA", "produce", "banana",
            "1.49", "2.98", "3", "lb", "0.4967", "lb",
            "MT VERNON", "1115", "ALEXANDRIA", "VA", "false", "false", "false",
        ),
    ]

    @staticmethod
    def _read_rows(path):
        # Load a CSV file into a list of dicts.
        with path.open(newline="", encoding="utf-8") as handle:
            return list(csv.DictReader(handle))

    def test_analysis_outputs_cover_required_views(self):
        # Runs the command on the fixture, then checks that every expected
        # output file exists and spot-checks one aggregate per view.
        with tempfile.TemporaryDirectory() as tmpdir:
            workdir = Path(tmpdir)
            purchases_csv = workdir / "purchases.csv"
            output_dir = workdir / "analysis"

            with purchases_csv.open("w", newline="", encoding="utf-8") as handle:
                writer = csv.DictWriter(handle, fieldnames=self._FIELDNAMES)
                writer.writeheader()
                writer.writerows(
                    dict(zip(self._FIELDNAMES, values)) for values in self._FIXTURE_ROWS
                )

            analyze_purchases.main.callback(
                purchases_csv=str(purchases_csv),
                output_dir=str(output_dir),
            )

            for name in (
                "item_price_over_time.csv",
                "spend_by_visit.csv",
                "items_per_visit.csv",
                "category_spend_over_time.csv",
                "retailer_store_breakdown.csv",
            ):
                self.assertTrue((output_dir / name).exists(), name)

            spend_rows = self._read_rows(output_dir / "spend_by_visit.csv")
            self.assertEqual("4.79", spend_rows[0]["visit_spend_total"])

            item_rows = self._read_rows(output_dir / "items_per_visit.csv")
            self.assertEqual("2", item_rows[0]["item_row_count"])
            self.assertEqual("2", item_rows[0]["distinct_catalog_count"])

            category_rows = self._read_rows(output_dir / "category_spend_over_time.csv")
            produce_row = next(
                row
                for row in category_rows
                if row["purchase_date"] == "2026-03-01" and row["category"] == "produce"
            )
            self.assertEqual("1.29", produce_row["category_spend_total"])

            store_rows = self._read_rows(output_dir / "retailer_store_breakdown.csv")
            giant_row = next(row for row in store_rows if row["retailer"] == "giant")
            self.assertEqual("1", giant_row["visit_count"])
            self.assertEqual("2", giant_row["item_row_count"])
            self.assertEqual("4.79", giant_row["store_spend_total"])
# Allow running this test module directly with `python <file>`.
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,155 @@
import sqlite3
import tempfile
import unittest
from pathlib import Path
from unittest import mock
import browser_session
import scrape_costco
class BrowserSessionTests(unittest.TestCase):
def test_read_firefox_local_storage_reads_copied_sqlite(self):
    # Builds a minimal profile containing one Costco local-storage database
    # and checks that read_firefox_local_storage() returns the stored value.
    with tempfile.TemporaryDirectory() as tmpdir:
        profile_dir = Path(tmpdir) / "abcd.default-release"
        ls_dir = profile_dir / "storage" / "default" / "https+++www.costco.com" / "ls"
        ls_dir.mkdir(parents=True)
        db_path = ls_dir / "data.sqlite"
        # `with connection:` only commits; close explicitly so the database
        # file is released before it is read and the temp dir is removed.
        connection = sqlite3.connect(db_path)
        try:
            with connection:
                connection.execute("CREATE TABLE data (key TEXT, value TEXT)")
                connection.execute(
                    "INSERT INTO data (key, value) VALUES (?, ?)",
                    ("costco-x-wcs-clientId", "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf"),
                )
        finally:
            connection.close()
        values = browser_session.read_firefox_local_storage(
            profile_dir,
            origin_filter="costco.com",
        )
        self.assertEqual(
            "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
            values["costco-x-wcs-clientId"],
        )
def test_load_costco_browser_headers_reads_id_token_and_client_id(self):
    # With no authorization or client id supplied, both must come from the
    # idToken / clientID entries in the profile's local storage.
    with tempfile.TemporaryDirectory() as tmpdir:
        profile_dir = Path(tmpdir)
        storage_dir = profile_dir / "storage" / "default" / "https+++www.costco.com" / "ls"
        storage_dir.mkdir(parents=True)
        db_path = storage_dir / "data.sqlite"
        # `with connection:` only commits; close explicitly so the database
        # file is released before it is read and the temp dir is removed.
        connection = sqlite3.connect(db_path)
        try:
            with connection:
                connection.execute("CREATE TABLE data (key TEXT, value TEXT)")
                connection.execute(
                    "INSERT INTO data (key, value) VALUES (?, ?)",
                    ("idToken", "header.payload.signature"),
                )
                connection.execute(
                    "INSERT INTO data (key, value) VALUES (?, ?)",
                    ("clientID", "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf"),
                )
        finally:
            connection.close()
        headers = scrape_costco.load_costco_browser_headers(
            profile_dir,
            authorization="",
            client_id="",
            client_identifier="481b1aec-aa3b-454b-b81b-48187e28f205",
        )
        self.assertEqual("Bearer header.payload.signature", headers["costco-x-authorization"])
        self.assertEqual(
            "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
            headers["costco-x-wcs-clientId"],
        )
        self.assertEqual(
            "481b1aec-aa3b-454b-b81b-48187e28f205",
            headers["client-identifier"],
        )
def test_load_costco_browser_headers_prefers_env_values(self):
with tempfile.TemporaryDirectory() as tmpdir:
profile_dir = Path(tmpdir)
storage_dir = profile_dir / "storage" / "default" / "https+++www.costco.com" / "ls"
storage_dir.mkdir(parents=True)
db_path = storage_dir / "data.sqlite"
with sqlite3.connect(db_path) as connection:
connection.execute("CREATE TABLE data (key TEXT, value TEXT)")
connection.execute(
"INSERT INTO data (key, value) VALUES (?, ?)",
("idToken", "storage.payload.signature"),
)
connection.execute(
"INSERT INTO data (key, value) VALUES (?, ?)",
("clientID", "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf"),
)
headers = scrape_costco.load_costco_browser_headers(
profile_dir,
authorization="Bearer env.payload.signature",
client_id="env-client-id",
client_identifier="481b1aec-aa3b-454b-b81b-48187e28f205",
)
self.assertEqual("Bearer env.payload.signature", headers["costco-x-authorization"])
self.assertEqual("env-client-id", headers["costco-x-wcs-clientId"])
def test_scrape_costco_prompts_for_profile_dir_when_autodiscovery_fails(self):
with mock.patch.object(
scrape_costco,
"find_firefox_profile_dir",
side_effect=FileNotFoundError("no default profile"),
), mock.patch.object(
scrape_costco.click,
"prompt",
return_value=Path("/tmp/profile"),
) as mocked_prompt, mock.patch.object(
scrape_costco,
"load_config",
return_value={
"authorization": "",
"client_id": "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
"client_identifier": "481b1aec-aa3b-454b-b81b-48187e28f205",
},
), mock.patch.object(
scrape_costco,
"load_costco_browser_headers",
return_value={
"costco-x-authorization": "Bearer header.payload.signature",
"costco-x-wcs-clientId": "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
"client-identifier": "481b1aec-aa3b-454b-b81b-48187e28f205",
},
), mock.patch.object(
scrape_costco,
"build_session",
return_value=object(),
), mock.patch.object(
scrape_costco,
"fetch_summary_windows",
return_value=(
{"data": {"receiptsWithCounts": {"receipts": []}}},
[],
),
), mock.patch.object(
scrape_costco,
"write_json",
), mock.patch.object(
scrape_costco,
"write_csv",
):
scrape_costco.main.callback(
outdir="/tmp/costco_output",
document_type="all",
document_sub_type="all",
window_days=92,
months_back=3,
firefox_profile_dir=None,
)
mocked_prompt.assert_called_once()
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,533 @@
import csv
import json
import tempfile
import unittest
from pathlib import Path
from unittest import mock
import enrich_costco
import scrape_costco
class CostcoPipelineTests(unittest.TestCase):
    """Tests for Costco date windows, summary fetching, flattening and enrichment."""

    RAW_ITEM_PATH = Path("costco_output/raw/abc.json")
    RAW_DIR = Path("costco_output/raw")

    # ------------------------------------------------------------------
    # Fixture builders
    # ------------------------------------------------------------------
    @staticmethod
    def _envelope(receipts, **counts):
        """Wrap *receipts* (preceded by any count fields) in the GraphQL envelope."""
        return {"data": {"receiptsWithCounts": {**counts, "receipts": receipts}}}

    @staticmethod
    def _line(number, first, second, department, unit, identifier, amount, unit_price):
        """Build one raw receipt line; both department fields share *department*."""
        return {
            "itemNumber": number,
            "itemDescription01": first,
            "itemDescription02": second,
            "itemDepartmentNumber": department,
            "transDepartmentNumber": department,
            "unit": unit,
            "itemIdentifier": identifier,
            "amount": amount,
            "itemUnitPriceAmount": unit_price,
        }

    @classmethod
    def _detergent_line(cls):
        """Return the regular purchase line used by the discount tests."""
        return cls._line(
            "4873222", "ALL F&C", "200OZ 160LOADS P104", 14, 1, "E", 19.99, 19.99
        )

    @classmethod
    def _coupon_line(cls, number="374664", description="/ 4873222"):
        """Return a negative-amount line whose description starts with ``/``."""
        return cls._line(number, description, None, 14, -1, None, -5, 0)

    @staticmethod
    def _detail_receipt(barcode, items, item_count, savings, date_time=None):
        """Build a detail receipt for the MT VERNON warehouse dated 2026-03-12."""
        receipt = {"transactionBarcode": barcode}
        if date_time is not None:
            receipt["transactionDateTime"] = date_time
        receipt.update(
            {
                "transactionDate": "2026-03-12",
                "receiptType": "In-Warehouse",
                "total": 10.0,
                "totalItemCount": item_count,
                "instantSavings": savings,
                "warehouseName": "MT VERNON",
                "warehouseNumber": 1115,
                "warehouseAddress1": "7940 RICHMOND HWY",
                "warehouseCity": "ALEXANDRIA",
                "warehouseState": "VA",
                "warehousePostalCode": "22306",
                "itemArray": items,
            }
        )
        return receipt

    @classmethod
    def _enrich(cls, line_no, item, order_date="2026-03-12"):
        """Run ``parse_costco_item`` for order ``abc`` on a single raw line."""
        return enrich_costco.parse_costco_item(
            order_id="abc",
            order_date=order_date,
            raw_path=cls.RAW_ITEM_PATH,
            line_no=line_no,
            item=item,
        )

    # ------------------------------------------------------------------
    # Date handling
    # ------------------------------------------------------------------
    def test_resolve_date_range_uses_months_back(self):
        """Three months back from 3/16/2026 starts on 12/16/2025."""
        today = scrape_costco.parse_cli_date("3/16/2026")

        start_date, end_date = scrape_costco.resolve_date_range(3, today=today)

        self.assertEqual("12/16/2025", start_date)
        self.assertEqual("3/16/2026", end_date)

    def test_build_date_windows_splits_long_ranges(self):
        """A half-year range with 92-day windows yields two consecutive windows."""
        expected = [
            {"startDate": "1/01/2026", "endDate": "4/02/2026"},
            {"startDate": "4/03/2026", "endDate": "6/30/2026"},
        ]

        windows = scrape_costco.build_date_windows("1/01/2026", "6/30/2026", 92)

        self.assertEqual(expected, windows)

    # ------------------------------------------------------------------
    # Summary fetching
    # ------------------------------------------------------------------
    def test_fetch_summary_windows_records_metadata_and_warns_on_mismatch(self):
        """One window reports two receipts but returns one, so a warning is echoed."""
        zero_counts = {"gasStation": 0, "carWash": 0, "gasAndCarWash": 0}
        payloads = [
            self._envelope(
                [{"transactionBarcode": "abc", "receiptType": "In-Warehouse"}],
                inWarehouse=2,
                **zero_counts,
            ),
            self._envelope(
                [{"transactionBarcode": "def", "receiptType": "In-Warehouse"}],
                inWarehouse=1,
                **zero_counts,
            ),
        ]
        patch_post = mock.patch.object(
            scrape_costco, "graphql_post", side_effect=payloads
        )
        patch_echo = mock.patch.object(scrape_costco.click, "echo")

        with patch_post as mocked_post, patch_echo as mocked_echo:
            summary_payload, metadata = scrape_costco.fetch_summary_windows(
                session=object(),
                start_date="1/01/2026",
                end_date="6/30/2026",
                document_type="all",
                document_sub_type="all",
                window_days=92,
            )

        self.assertEqual(2, mocked_post.call_count)
        self.assertEqual(2, len(metadata))
        first_window, second_window = metadata[0], metadata[1]
        self.assertTrue(first_window["countMismatch"])
        self.assertFalse(second_window["countMismatch"])
        self.assertEqual("1/01/2026", first_window["startDate"])
        self.assertEqual("4/03/2026", second_window["startDate"])
        barcodes = [
            receipt["transactionBarcode"]
            for receipt in scrape_costco.summary_receipts(summary_payload)
        ]
        self.assertEqual(["abc", "def"], barcodes)
        mocked_echo.assert_called_once()
        warning_text = mocked_echo.call_args.args[0]
        self.assertIn("warning: summary count mismatch", warning_text)

    # ------------------------------------------------------------------
    # Flattening
    # ------------------------------------------------------------------
    def test_flatten_costco_data_preserves_discount_rows(self):
        """The coupon line is kept as its own item row and flagged accordingly."""
        summary_payload = self._envelope(
            [
                {
                    "transactionBarcode": "abc",
                    "tenderArray": [{"tenderDescription": "VISA"}],
                    "couponArray": [{"upcnumberCoupon": "2100003746641"}],
                }
            ]
        )
        detail_payloads = [
            self._envelope(
                [
                    self._detail_receipt(
                        "abc",
                        [self._detergent_line(), self._coupon_line()],
                        item_count=2,
                        savings=5.0,
                    )
                ]
            )
        ]

        orders, items = scrape_costco.flatten_costco_data(
            summary_payload, detail_payloads, self.RAW_DIR
        )

        self.assertEqual(1, len(orders))
        self.assertEqual(2, len(items))
        purchase, coupon = items[0], items[1]
        self.assertEqual("false", purchase["is_discount_line"])
        self.assertEqual("true", coupon["is_discount_line"])
        self.assertEqual("true", coupon["is_coupon_line"])

    def test_flatten_costco_data_uses_composite_summary_lookup_key(self):
        """Two summaries share a barcode; the one with the matching timestamp is used."""
        summary_payload = self._envelope(
            [
                {
                    "transactionBarcode": "dup",
                    "transactionDateTime": "2026-03-12T16:16:00",
                    "tenderArray": [{"tenderDescription": "VISA"}],
                    "couponArray": [{"upcnumberCoupon": "111"}],
                },
                {
                    "transactionBarcode": "dup",
                    "transactionDateTime": "2026-02-14T16:25:00",
                    "tenderArray": [{"tenderDescription": "MASTERCARD"}],
                    "couponArray": [],
                },
            ]
        )
        detail_payloads = [
            self._envelope(
                [
                    self._detail_receipt(
                        "dup",
                        [self._coupon_line("111", "/ 111")],
                        item_count=1,
                        savings=5.0,
                        date_time="2026-03-12T16:16:00",
                    )
                ]
            )
        ]

        orders, items = scrape_costco.flatten_costco_data(
            summary_payload, detail_payloads, self.RAW_DIR
        )

        self.assertEqual("VISA", orders[0]["payment_method"])
        self.assertEqual("true", items[0]["is_coupon_line"])
        self.assertIn("dup-2026-03-12T16-16-00.json", items[0]["raw_order_path"])

    # ------------------------------------------------------------------
    # Enrichment of single lines
    # ------------------------------------------------------------------
    def test_costco_enricher_parses_size_pack_and_discount(self):
        """Pack counts, volume sizes and coupon lines are each recognised."""
        row = self._enrich(
            1,
            self._line("60357", "MIXED PEPPER", "6-PACK", 65, 1, "E", 7.49, 7.49),
        )
        self.assertEqual("60357", row["retailer_item_id"])
        self.assertEqual("MIXED PEPPER", row["item_name_norm"])
        self.assertEqual("6", row["pack_qty"])
        self.assertEqual("count", row["measure_type"])
        self.assertEqual("costco:abc:1", row["normalized_row_id"])
        self.assertEqual("exact_retailer_item_id", row["normalization_basis"])
        self.assertTrue(row["normalized_item_id"])
        self.assertEqual("6", row["normalized_quantity"])
        self.assertEqual("count", row["normalized_quantity_unit"])

        volume_row = self._enrich(
            3,
            self._line(
                "1185912", "KS ALMND BAR US 1.74QTS CN", None, 18, 2, "E", 21.98, 10.99
            ),
        )
        self.assertEqual("3.48", volume_row["normalized_quantity"])
        self.assertEqual("qt", volume_row["normalized_quantity_unit"])

        discount = self._enrich(2, self._coupon_line())
        self.assertEqual("true", discount["is_discount_line"])
        self.assertEqual("true", discount["is_coupon_line"])
        self.assertEqual("false", discount["is_item"])

    def test_costco_name_cleanup_removes_dual_weight_and_logistics_artifacts(self):
        """Dual KG/LBS sizes and trailing model/logistics codes leave the name."""
        mixed_units = self._enrich(
            1,
            self._line(
                "18600", "MANDARINS 2.27 KG / 5 LBS", None, 65, 1, "E", 7.49, 7.49
            ),
        )
        self.assertEqual("MANDARIN", mixed_units["item_name_norm"])
        self.assertEqual("5", mixed_units["size_value"])
        self.assertEqual("lb", mixed_units["size_unit"])

        logistics = self._enrich(
            2,
            self._line(
                "1375005",
                "LIFE 6'TABLE MDL #80873U - T12/H3/P36",
                None,
                18,
                1,
                "E",
                119.98,
                119.98,
            ),
        )
        self.assertEqual("LIFE 6'TABLE MDL", logistics["item_name_norm"])

    def test_costco_hash_weight_parses_into_weight_basis(self):
        """A leading ``25#`` is read as 25 lb and drives the per-pound price."""
        row = self._enrich(
            4,
            self._line(
                "999", "25# FLOUR ALL-PURPOSE HARV P98/100", None, 14, 1, "E", 8.79, 8.79
            ),
            order_date="2024-11-29",
        )

        expectations = [
            ("item_name_norm", "FLOUR ALL-PURPOSE HARV"),
            ("size_value", "25"),
            ("size_unit", "lb"),
            ("measure_type", "weight"),
            ("normalized_quantity", "25"),
            ("normalized_quantity_unit", "lb"),
            ("price_per_lb", "0.3516"),
        ]
        for field, expected in expectations:
            self.assertEqual(expected, row[field])

    # ------------------------------------------------------------------
    # Enrichment from raw files
    # ------------------------------------------------------------------
    def test_build_items_enriched_matches_discount_to_item(self):
        """The coupon line is matched to its item and reduces the net total."""
        with tempfile.TemporaryDirectory() as tmpdir:
            raw_dir = Path(tmpdir) / "raw"
            raw_dir.mkdir()
            payload = self._envelope(
                [
                    {
                        "transactionBarcode": "abc",
                        "transactionDate": "2026-03-12",
                        "itemArray": [self._detergent_line(), self._coupon_line()],
                    }
                ]
            )
            (raw_dir / "abc.json").write_text(json.dumps(payload), encoding="utf-8")

            rows = enrich_costco.build_items_enriched(raw_dir)

            purchase_row = next(
                row for row in rows if row["is_discount_line"] == "false"
            )
            discount_row = next(
                row for row in rows if row["is_discount_line"] == "true"
            )
            self.assertEqual("-5", purchase_row["matched_discount_amount"])
            self.assertEqual("14.99", purchase_row["net_line_total"])
            self.assertIn("matched_discount=4873222", purchase_row["parse_notes"])
            self.assertIn("matched_to_item=4873222", discount_row["parse_notes"])

    # ------------------------------------------------------------------
    # CLI entry point
    # ------------------------------------------------------------------
    def test_main_writes_summary_request_metadata(self):
        """``main`` stores the summary window metadata in raw/summary_requests.json."""
        client_id = "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf"
        client_identifier = "481b1aec-aa3b-454b-b81b-48187e28f205"
        with tempfile.TemporaryDirectory() as tmpdir:
            outdir = Path(tmpdir) / "costco_output"
            summary_payload = self._envelope(
                [
                    {
                        "transactionBarcode": "abc",
                        "receiptType": "In-Warehouse",
                        "tenderArray": [],
                        "couponArray": [],
                    }
                ],
                inWarehouse=1,
                gasStation=0,
                carWash=0,
                gasAndCarWash=0,
            )
            detail_payload = self._envelope(
                [self._detail_receipt("abc", [], item_count=1, savings=0)]
            )
            metadata = [
                {
                    "startDate": "1/01/2026",
                    "endDate": "3/31/2026",
                    "text": "custom",
                    "documentType": "all",
                    "documentSubType": "all",
                    "returnedReceipts": 1,
                    "returnedInWarehouseReceipts": 1,
                    "inWarehouse": 1,
                    "gasStation": 0,
                    "carWash": 0,
                    "gasAndCarWash": 0,
                    "countMismatch": False,
                }
            ]

            with mock.patch.object(
                scrape_costco,
                "load_config",
                return_value={
                    "authorization": "",
                    "client_id": client_id,
                    "client_identifier": client_identifier,
                },
            ), mock.patch.object(
                scrape_costco,
                "find_firefox_profile_dir",
                return_value=Path("/tmp/profile"),
            ), mock.patch.object(
                scrape_costco,
                "load_costco_browser_headers",
                return_value={
                    "costco-x-authorization": "Bearer header.payload.signature",
                    "costco-x-wcs-clientId": client_id,
                    "client-identifier": client_identifier,
                },
            ), mock.patch.object(
                scrape_costco, "build_session", return_value=object()
            ), mock.patch.object(
                scrape_costco,
                "fetch_summary_windows",
                return_value=(summary_payload, metadata),
            ), mock.patch.object(
                scrape_costco,
                "graphql_post",
                return_value=detail_payload,
            ):
                scrape_costco.main.callback(
                    outdir=str(outdir),
                    document_type="all",
                    document_sub_type="all",
                    window_days=92,
                    months_back=3,
                    firefox_profile_dir=None,
                )

            metadata_path = outdir / "raw" / "summary_requests.json"
            self.assertTrue(metadata_path.exists())
            saved_metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
            self.assertEqual(metadata, saved_metadata)
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

272
tests/test_enrich_giant.py Normal file
View File

@@ -0,0 +1,272 @@
import csv
import json
import tempfile
import unittest
from pathlib import Path
import enrich_giant
class EnrichGiantTests(unittest.TestCase):
    """Tests for Giant item parsing, quantity normalisation and CSV output."""

    @staticmethod
    def _raw_item(
        pod_id, ship_qty, picked_weight, unit_price, name, lb_each, amount, upc,
        image=None,
    ):
        """Build a raw Giant item dict with zero savings in the Grocery category."""
        item = {
            "podId": pod_id,
            "shipQy": ship_qty,
            "totalPickedWeight": picked_weight,
            "unitPrice": unit_price,
            "itemName": name,
            "lbEachCd": lb_each,
            "groceryAmount": amount,
            "primUpcCd": upc,
            "mvpSavings": 0,
            "rewardSavings": 0,
            "couponSavings": 0,
            "couponPrice": 0,
            "categoryId": "1",
            "categoryDesc": "Grocery",
        }
        if image is not None:
            item["image"] = image
        return item

    @staticmethod
    def _parse(line_no, item):
        """Run ``parse_item`` for order ``abc123`` dated 2026-03-01."""
        return enrich_giant.parse_item(
            order_id="abc123",
            order_date="2026-03-01",
            raw_path=Path("raw/abc123.json"),
            line_no=line_no,
            item=item,
        )

    def test_parse_size_and_pack_handles_pack_and_weight_tokens(self):
        """``6PK 7.5Z`` yields a 7.5 oz size and a pack quantity of six."""
        parsed = enrich_giant.parse_size_and_pack("COKE CHERRY 6PK 7.5Z")
        size_value, size_unit, pack_qty = parsed

        self.assertEqual("7.5", size_value)
        self.assertEqual("oz", size_unit)
        self.assertEqual("6", pack_qty)

    def test_parse_item_marks_store_brand_fee_and_weight_prices(self):
        """A ``+SB`` weighted item is store brand; a bag charge is a fee, not an item."""
        row = self._parse(
            1,
            self._raw_item(
                1, 1, 2, 3.98, "+SB GALA APPLE 5 LB", "LB", 3.98, "111",
                image={"large": "https://example.test/apple.jpg"},
            ),
        )
        apple_expectations = [
            ("brand_guess", "SB"),
            ("item_name_norm", "GALA APPLE"),
            ("size_value", "5"),
            ("size_unit", "lb"),
            ("measure_type", "weight"),
            ("is_store_brand", "true"),
            ("price_per_lb", "1.99"),
            ("price_per_oz", "0.1244"),
            ("image_url", "https://example.test/apple.jpg"),
            ("normalized_row_id", "giant:abc123:1"),
            ("normalization_basis", "exact_upc"),
            ("normalized_quantity", "5"),
            ("normalized_quantity_unit", "lb"),
            ("is_item", "true"),
        ]
        for field, expected in apple_expectations:
            self.assertEqual(expected, row[field])

        fee_row = self._parse(
            2,
            self._raw_item(2, 1, 0, 0.05, "GL BAG CHARGE", "EA", 0.05, ""),
        )
        self.assertEqual("true", fee_row["is_fee"])
        self.assertEqual("GL BAG CHARGE", fee_row["item_name_norm"])
        self.assertEqual("false", fee_row["is_item"])

    def test_parse_item_derives_packaged_weight_prices_from_size_tokens(self):
        """Two six-packs of 7.5 oz normalise to 90 oz with per-oz and per-lb prices."""
        row = self._parse(
            1,
            self._raw_item(1, 2, 0, 3.0, "PEPSI 6PK 7.5Z", "EA", 6.0, "111"),
        )

        expectations = [
            ("measure_type", "weight"),
            ("pack_qty", "6"),
            ("size_value", "7.5"),
            ("normalized_quantity", "90"),
            ("normalized_quantity_unit", "oz"),
            ("price_per_oz", "0.0667"),
            ("price_per_lb", "1.0667"),
        ]
        for field, expected in expectations:
            self.assertEqual(expected, row[field])

    def test_derive_normalized_quantity_handles_count_volume_and_each(self):
        """Each measure type maps its inputs to the expected (quantity, unit) pair."""
        cases = [
            (("18", "count"), ("1", "", "", "18", "count")),
            (("3.48", "qt"), ("2", "1.74", "qt", "", "volume")),
            (("2", "each"), ("2", "", "", "", "each")),
            (("1.68", "lb"), ("1", "", "", "", "weight", "1.68")),
        ]
        for expected, arguments in cases:
            self.assertEqual(
                expected,
                enrich_giant.derive_normalized_quantity(*arguments),
            )

    def test_parse_item_uses_picked_weight_for_loose_weight_items(self):
        """``LB`` items take their normalised quantity from ``totalPickedWeight``."""
        banana = self._parse(
            1,
            self._raw_item(1, 1, 1.68, 0.99, "FRESH BANANA", "LB", 0.99, "111"),
        )
        self.assertEqual("weight", banana["measure_type"])
        self.assertEqual("1.68", banana["normalized_quantity"])
        self.assertEqual("lb", banana["normalized_quantity_unit"])

        patty = self._parse(
            2,
            self._raw_item(2, 1, 1.29, 10.05, "80% PATTIES PK12", "LB", 10.05, "222"),
        )
        self.assertEqual("1.29", patty["normalized_quantity"])
        self.assertEqual("lb", patty["normalized_quantity_unit"])

    def test_build_items_enriched_reads_raw_order_files_and_writes_csv(self):
        """Raw order files are enriched in order-id order and written with all fields."""
        with tempfile.TemporaryDirectory() as tmpdir:
            raw_dir = Path(tmpdir) / "raw"
            raw_dir.mkdir()
            second_order = {
                "orderId": "order-2",
                "orderDate": "2026-03-02",
                "items": [
                    self._raw_item(
                        20, 1, 0, 2.99, "SB ROTINI 16Z", "EA", 2.99, "222",
                        image={"small": "https://example.test/rotini.jpg"},
                    )
                ],
            }
            first_order = {
                "orderId": "order-1",
                "orderDate": "2026-03-01",
                "items": [
                    self._raw_item(10, 2, 0, 1.5, "PEPSI 6PK 7.5Z", "EA", 3.0, "111")
                ],
            }
            # history.json sits next to the order files in the raw directory.
            (raw_dir / "history.json").write_text("{}", encoding="utf-8")
            (raw_dir / "order-2.json").write_text(
                json.dumps(second_order), encoding="utf-8"
            )
            (raw_dir / "order-1.json").write_text(
                json.dumps(first_order), encoding="utf-8"
            )

            rows = enrich_giant.build_items_enriched(raw_dir)
            output_csv = Path(tmpdir) / "items_enriched.csv"
            enrich_giant.write_csv(output_csv, rows)

            self.assertEqual(["order-1", "order-2"], [row["order_id"] for row in rows])
            pepsi, rotini = rows[0], rows[1]
            self.assertEqual("PEPSI", pepsi["item_name_norm"])
            self.assertEqual("6", pepsi["pack_qty"])
            self.assertEqual("7.5", pepsi["size_value"])
            self.assertEqual("10", pepsi["retailer_item_id"])
            self.assertEqual("true", rotini["is_store_brand"])
            self.assertTrue(pepsi["normalized_item_id"])
            self.assertEqual("exact_upc", pepsi["normalization_basis"])

            with output_csv.open(newline="", encoding="utf-8") as handle:
                written_rows = list(csv.DictReader(handle))
            self.assertEqual(2, len(written_rows))
            self.assertEqual(enrich_giant.OUTPUT_FIELDS, list(written_rows[0].keys()))
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,96 @@
import unittest
import report_pipeline_status
class PipelineStatusTests(unittest.TestCase):
    """Tests for the per-stage counts reported by ``report_pipeline_status``."""

    def test_build_status_summary_reports_unresolved_and_reviewed_counts(self):
        """One linked Giant row and one unlinked Costco row give 1/1 linked/unresolved."""

        def purchase(item_id, catalog_id, retailer, raw_name, norm_name, upc, total):
            # Purchase rows here differ only in identity, naming and price.
            return {
                "normalized_item_id": item_id,
                "catalog_id": catalog_id,
                "resolution_action": "",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
                "retailer": retailer,
                "raw_item_name": raw_name,
                "normalized_item_name": norm_name,
                "upc": upc,
                "line_total": total,
            }

        enriched_banana = {
            "retailer": "giant",
            "order_id": "g1",
            "line_no": "1",
            "normalized_item_id": "gnorm_banana",
            "item_name_norm": "BANANA",
            "item_name": "FRESH BANANA",
            "retailer_item_id": "1",
            "upc": "4011",
            "brand_guess": "",
            "variant": "",
            "size_value": "",
            "size_unit": "",
            "pack_qty": "",
            "measure_type": "weight",
            "image_url": "",
            "is_store_brand": "false",
            "is_fee": "false",
            "is_discount_line": "false",
            "is_coupon_line": "false",
            "order_date": "2026-03-01",
            "line_total": "1.29",
        }
        purchase_rows = [
            purchase(
                "gnorm_banana", "cat_banana", "giant",
                "FRESH BANANA", "BANANA", "4011", "1.29",
            ),
            purchase("cnorm_lime", "", "costco", "LIME 5LB", "LIME", "", "4.99"),
        ]
        link_rows = [
            {
                "normalized_item_id": "gnorm_banana",
                "catalog_id": "cat_banana",
                "review_status": "approved",
            }
        ]
        catalog_rows = [
            {
                "catalog_id": "cat_banana",
                "catalog_name": "BANANA",
                "product_type": "banana",
                "category": "produce",
            }
        ]

        summary = report_pipeline_status.build_status_summary(
            giant_orders=[{"order_id": "g1"}],
            giant_items=[{"order_id": "g1", "line_no": "1"}],
            giant_enriched=[enriched_banana],
            costco_orders=[],
            costco_items=[],
            costco_enriched=[],
            purchases=purchase_rows,
            resolutions=[],
            links=link_rows,
            catalog=catalog_rows,
        )

        counts = {entry["stage"]: entry["count"] for entry in summary}
        expected_counts = [
            ("raw_orders", 1),
            ("raw_items", 1),
            ("normalized_items", 1),
            ("linked_purchase_rows", 1),
            ("unresolved_purchase_rows", 1),
            ("review_queue_normalized_items", 1),
            ("unresolved_not_in_review_rows", 0),
        ]
        for stage, expected in expected_counts:
            self.assertEqual(expected, counts[stage])
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

722
tests/test_purchases.py Normal file
View File

@@ -0,0 +1,722 @@
import csv
import tempfile
import unittest
from pathlib import Path
import build_purchases
import enrich_costco
class PurchaseLogTests(unittest.TestCase):
def test_derive_net_line_total_preserves_existing_then_derives(self):
    """An existing net total is kept; otherwise it is derived from the line total."""
    cases = [
        ("1.49", {"net_line_total": "1.49", "line_total": "2.98"}),
        ("5.99", {"line_total": "6.99", "matched_discount_amount": "-1.00"}),
        ("3.5", {"line_total": "3.50"}),
    ]
    for expected, row in cases:
        self.assertEqual(expected, build_purchases.derive_net_line_total(row))
def test_derive_metrics_prefers_picked_weight_and_pack_count(self):
    """With picked weight 2 and pack quantity 4, prices derive from those bases."""
    source_row = {
        "line_total": "4.00",
        "qty": "1",
        "pack_qty": "4",
        "size_value": "",
        "size_unit": "",
        "picked_weight": "2",
        "price_per_each": "",
        "price_per_lb": "",
        "price_per_oz": "",
    }

    metrics = build_purchases.derive_metrics(source_row)

    expectations = [
        ("price_per_each", "4"),
        ("price_per_count", "1"),
        ("price_per_lb", "2"),
        ("price_per_oz", "0.125"),
        ("price_per_lb_basis", "picked_weight_lb"),
    ]
    for field, expected in expectations:
        self.assertEqual(expected, metrics[field])
def test_build_purchase_rows_maps_catalog_ids(self):
    """Approved links attach the shared catalog id and order store details to rows."""
    fieldnames = enrich_costco.OUTPUT_FIELDS

    giant_row = dict.fromkeys(fieldnames, "")
    giant_row.update(
        retailer="giant",
        order_id="g1",
        line_no="1",
        normalized_row_id="giant:g1:1",
        normalized_item_id="gnorm:banana",
        order_date="2026-03-01",
        item_name="FRESH BANANA",
        item_name_norm="BANANA",
        image_url="https://example.test/banana.jpg",
        retailer_item_id="100",
        upc="4011",
        qty="1",
        unit="LB",
        normalized_quantity="1",
        normalized_quantity_unit="lb",
        line_total="1.29",
        unit_price="1.29",
        measure_type="weight",
        price_per_lb="1.29",
        raw_order_path="data/giant-web/raw/g1.json",
        is_discount_line="false",
        is_coupon_line="false",
        is_fee="false",
    )

    costco_row = dict.fromkeys(fieldnames, "")
    costco_row.update(
        retailer="costco",
        order_id="c1",
        line_no="1",
        normalized_row_id="costco:c1:1",
        normalized_item_id="cnorm:banana",
        order_date="2026-03-12",
        item_name="BANANAS 3 LB / 1.36 KG",
        item_name_norm="BANANA",
        retailer_item_id="30669",
        qty="1",
        unit="E",
        normalized_quantity="3",
        normalized_quantity_unit="lb",
        line_total="2.98",
        unit_price="2.98",
        size_value="3",
        size_unit="lb",
        measure_type="weight",
        price_per_lb="0.9933",
        raw_order_path="data/costco-web/raw/c1.json",
        is_discount_line="false",
        is_coupon_line="false",
        is_fee="false",
    )

    giant_orders = [
        {
            "order_id": "g1",
            "store_name": "Giant",
            "store_number": "42",
            "store_city": "Springfield",
            "store_state": "VA",
        }
    ]
    costco_orders = [
        {
            "order_id": "c1",
            "store_name": "MT VERNON",
            "store_number": "1115",
            "store_city": "ALEXANDRIA",
            "store_state": "VA",
        }
    ]
    blank_catalog_fields = (
        "brand",
        "variant",
        "size_value",
        "size_unit",
        "pack_qty",
        "measure_type",
        "notes",
        "created_at",
        "updated_at",
    )
    catalog_rows = [
        {
            "catalog_id": "cat_banana",
            "catalog_name": "BANANA",
            "category": "produce",
            "product_type": "banana",
            **dict.fromkeys(blank_catalog_fields, ""),
        }
    ]
    # Both retailers' normalized items link to the same catalog entry.
    link_rows = [
        {
            "normalized_item_id": item_id,
            "catalog_id": "cat_banana",
            "link_method": "manual_link",
            "link_confidence": "high",
            "review_status": "approved",
            "reviewed_by": "",
            "reviewed_at": "",
            "link_notes": "",
        }
        for item_id in ("gnorm:banana", "cnorm:banana")
    ]

    rows, _links = build_purchases.build_purchase_rows(
        [giant_row],
        [costco_row],
        giant_orders,
        costco_orders,
        [],
        link_rows,
        catalog_rows,
    )

    self.assertEqual(2, len(rows))
    self.assertTrue(all(row["catalog_id"] == "cat_banana" for row in rows))
    self.assertEqual({"giant", "costco"}, {row["retailer"] for row in rows})
    first = rows[0]
    expectations = [
        ("image_url", "https://example.test/banana.jpg"),
        ("normalized_quantity", "1"),
        ("normalized_quantity_unit", "lb"),
        ("effective_price_unit", "lb"),
        ("order_id", "g1"),
        ("store_name", "Giant"),
        ("store_number", "42"),
        ("store_city", "Springfield"),
        ("store_state", "VA"),
    ]
    for field, expected in expectations:
        self.assertEqual(expected, first[field])
def test_main_writes_purchase_and_example_csvs(self):
    """``main`` writes two purchase rows and one comparison example to CSV."""

    def dump_csv(path, fields, records):
        # Write *records* to *path* with a header row.
        with path.open("w", newline="", encoding="utf-8") as handle:
            writer = csv.DictWriter(handle, fieldnames=fields)
            writer.writeheader()
            writer.writerows(records)

    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        giant_items = workdir / "giant_items.csv"
        costco_items = workdir / "costco_items.csv"
        giant_orders = workdir / "giant_orders.csv"
        costco_orders = workdir / "costco_orders.csv"
        resolutions_csv = workdir / "review_resolutions.csv"
        catalog_csv = workdir / "catalog.csv"
        links_csv = workdir / "product_links.csv"
        purchases_csv = workdir / "review" / "purchases.csv"
        examples_csv = workdir / "review" / "comparison_examples.csv"

        fieldnames = enrich_costco.OUTPUT_FIELDS
        giant_row = dict.fromkeys(fieldnames, "")
        giant_row.update(
            retailer="giant",
            order_id="g1",
            line_no="1",
            normalized_row_id="giant:g1:1",
            normalized_item_id="gnorm:banana",
            order_date="2026-03-01",
            item_name="FRESH BANANA",
            item_name_norm="BANANA",
            retailer_item_id="100",
            upc="4011",
            qty="1",
            unit="LB",
            normalized_quantity="1",
            normalized_quantity_unit="lb",
            line_total="1.29",
            unit_price="1.29",
            measure_type="weight",
            price_per_lb="1.29",
            raw_order_path="data/giant-web/raw/g1.json",
            is_discount_line="false",
            is_coupon_line="false",
            is_fee="false",
        )
        costco_row = dict.fromkeys(fieldnames, "")
        costco_row.update(
            retailer="costco",
            order_id="c1",
            line_no="1",
            normalized_row_id="costco:c1:1",
            normalized_item_id="cnorm:banana",
            order_date="2026-03-12",
            item_name="BANANAS 3 LB / 1.36 KG",
            item_name_norm="BANANA",
            retailer_item_id="30669",
            qty="1",
            unit="E",
            normalized_quantity="3",
            normalized_quantity_unit="lb",
            line_total="2.98",
            unit_price="2.98",
            size_value="3",
            size_unit="lb",
            measure_type="weight",
            price_per_lb="0.9933",
            raw_order_path="data/costco-web/raw/c1.json",
            is_discount_line="false",
            is_coupon_line="false",
            is_fee="false",
        )
        dump_csv(giant_items, fieldnames, [giant_row])
        dump_csv(costco_items, fieldnames, [costco_row])

        order_fields = [
            "order_id",
            "store_name",
            "store_number",
            "store_city",
            "store_state",
        ]
        giant_order = {
            "order_id": "g1",
            "store_name": "Giant",
            "store_number": "42",
            "store_city": "Springfield",
            "store_state": "VA",
        }
        costco_order = {
            "order_id": "c1",
            "store_name": "MT VERNON",
            "store_number": "1115",
            "store_city": "ALEXANDRIA",
            "store_state": "VA",
        }
        dump_csv(giant_orders, order_fields, [giant_order])
        dump_csv(costco_orders, order_fields, [costco_order])

        blank_catalog_fields = (
            "brand",
            "variant",
            "size_value",
            "size_unit",
            "pack_qty",
            "measure_type",
            "notes",
            "created_at",
            "updated_at",
        )
        catalog_entry = {
            "catalog_id": "cat_banana",
            "catalog_name": "BANANA",
            "category": "produce",
            "product_type": "banana",
            **dict.fromkeys(blank_catalog_fields, ""),
        }
        dump_csv(catalog_csv, build_purchases.CATALOG_FIELDS, [catalog_entry])

        link_rows = [
            {
                "normalized_item_id": item_id,
                "catalog_id": "cat_banana",
                "link_method": "manual_link",
                "link_confidence": "high",
                "review_status": "approved",
                "reviewed_by": "",
                "reviewed_at": "",
                "link_notes": "",
            }
            for item_id in ("gnorm:banana", "cnorm:banana")
        ]
        dump_csv(links_csv, build_purchases.PRODUCT_LINK_FIELDS, link_rows)

        # review_resolutions.csv is deliberately never created by this test.
        build_purchases.main.callback(
            giant_items_enriched_csv=str(giant_items),
            costco_items_enriched_csv=str(costco_items),
            giant_orders_csv=str(giant_orders),
            costco_orders_csv=str(costco_orders),
            resolutions_csv=str(resolutions_csv),
            catalog_csv=str(catalog_csv),
            links_csv=str(links_csv),
            output_csv=str(purchases_csv),
            examples_csv=str(examples_csv),
        )

        self.assertTrue(purchases_csv.exists())
        self.assertTrue(examples_csv.exists())
        with purchases_csv.open(newline="", encoding="utf-8") as handle:
            purchase_rows = list(csv.DictReader(handle))
        with examples_csv.open(newline="", encoding="utf-8") as handle:
            example_rows = list(csv.DictReader(handle))
        self.assertEqual(2, len(purchase_rows))
        self.assertEqual(1, len(example_rows))
def test_build_purchase_rows_applies_manual_resolution(self):
    """Check that an approved ``create`` resolution is applied to a purchase row.

    A single Giant ice line is combined with a resolution row that points
    at ``cat_ice``. The resulting purchase row and the first returned link
    row must both carry that catalog id, and the normalized quantity fields
    must come back unchanged.
    """
    giant_row = dict.fromkeys(enrich_costco.OUTPUT_FIELDS, "")
    giant_row.update(
        retailer="giant",
        order_id="g1",
        line_no="1",
        normalized_row_id="giant:g1:1",
        normalized_item_id="gnorm:ice",
        order_date="2026-03-01",
        item_name="SB BAGGED ICE 20LB",
        item_name_norm="BAGGED ICE",
        retailer_item_id="100",
        upc="",
        qty="1",
        unit="EA",
        normalized_quantity="1",
        normalized_quantity_unit="each",
        line_total="3.50",
        unit_price="3.50",
        measure_type="each",
        raw_order_path="data/giant-web/raw/g1.json",
        is_discount_line="false",
        is_coupon_line="false",
        is_fee="false",
    )
    giant_orders = [
        {
            "order_id": "g1",
            "store_name": "Giant",
            "store_number": "42",
            "store_city": "Springfield",
            "store_state": "VA",
        }
    ]
    resolutions = [
        {
            "normalized_item_id": "gnorm:ice",
            "catalog_id": "cat_ice",
            "resolution_action": "create",
            "status": "approved",
            "resolution_notes": "manual ice merge",
            "reviewed_at": "2026-03-16",
        }
    ]
    catalog = [
        {
            "catalog_id": "cat_ice",
            "catalog_name": "ICE",
            "category": "frozen",
            "product_type": "ice",
            "brand": "",
            "variant": "",
            "size_value": "",
            "size_unit": "",
            "pack_qty": "",
            "measure_type": "",
            "notes": "",
            "created_at": "",
            "updated_at": "",
        }
    ]

    # Positional order: giant items, costco items, giant orders, costco
    # orders, resolutions, (presumably existing links - confirm against
    # build_purchase_rows), catalog.
    purchase_rows, link_rows = build_purchases.build_purchase_rows(
        [giant_row],
        [],
        giant_orders,
        [],
        resolutions,
        [],
        catalog,
    )

    purchase = purchase_rows[0]
    self.assertEqual("cat_ice", purchase["catalog_id"])
    self.assertEqual("approved", purchase["review_status"])
    self.assertEqual("create", purchase["resolution_action"])
    self.assertEqual("cat_ice", link_rows[0]["catalog_id"])
    self.assertEqual("1", purchase["normalized_quantity"])
    self.assertEqual("each", purchase["normalized_quantity_unit"])
def test_build_purchase_rows_derives_effective_price_for_known_cases(self):
    """Pin ``effective_price`` / ``effective_price_unit`` for five known lines.

    Covers weight-priced bananas from both retailers, a sized bag of ice,
    an each-priced Costco patty bag, and a Giant weight line whose
    normalized quantity is blank (expected to yield blank price fields).
    """
    # Every fixture line is an ordinary item, not a discount/coupon/fee.
    plain_line = {
        "is_discount_line": "false",
        "is_coupon_line": "false",
        "is_fee": "false",
    }

    def make_row(**values):
        # Start from an all-blank enriched row and overlay the given columns.
        row = dict.fromkeys(enrich_costco.OUTPUT_FIELDS, "")
        row.update(values)
        return row

    giant_banana = make_row(
        retailer="giant",
        order_id="g1",
        line_no="1",
        normalized_row_id="giant:g1:1",
        normalized_item_id="gnorm:banana",
        order_date="2026-03-01",
        item_name="FRESH BANANA",
        item_name_norm="BANANA",
        retailer_item_id="100",
        qty="1",
        unit="LB",
        normalized_quantity="1.68",
        normalized_quantity_unit="lb",
        line_total="0.99",
        unit_price="0.99",
        measure_type="weight",
        price_per_lb="0.5893",
        raw_order_path="data/giant-web/raw/g1.json",
        **plain_line,
    )
    costco_banana = make_row(
        retailer="costco",
        order_id="c1",
        line_no="1",
        normalized_row_id="costco:c1:1",
        normalized_item_id="cnorm:banana",
        order_date="2026-03-12",
        item_name="BANANAS 3 LB / 1.36 KG",
        item_name_norm="BANANA",
        retailer_item_id="30669",
        qty="1",
        unit="E",
        normalized_quantity="3",
        normalized_quantity_unit="lb",
        line_total="2.98",
        net_line_total="1.49",
        unit_price="2.98",
        size_value="3",
        size_unit="lb",
        measure_type="weight",
        price_per_lb="0.4967",
        raw_order_path="data/costco-web/raw/c1.json",
        **plain_line,
    )
    giant_ice = make_row(
        retailer="giant",
        order_id="g2",
        line_no="1",
        normalized_row_id="giant:g2:1",
        normalized_item_id="gnorm:ice",
        order_date="2026-03-02",
        item_name="SB BAGGED ICE 20LB",
        item_name_norm="BAGGED ICE",
        retailer_item_id="101",
        qty="2",
        unit="EA",
        normalized_quantity="40",
        normalized_quantity_unit="lb",
        line_total="9.98",
        unit_price="4.99",
        size_value="20",
        size_unit="lb",
        measure_type="weight",
        price_per_lb="0.2495",
        raw_order_path="data/giant-web/raw/g2.json",
        **plain_line,
    )
    costco_patty = make_row(
        retailer="costco",
        order_id="c2",
        line_no="1",
        normalized_row_id="costco:c2:1",
        normalized_item_id="cnorm:patty",
        order_date="2026-03-03",
        item_name="BEEF PATTIES 6# BAG",
        item_name_norm="BEEF PATTIES 6# BAG",
        retailer_item_id="777",
        qty="1",
        unit="E",
        normalized_quantity="1",
        normalized_quantity_unit="each",
        line_total="26.99",
        net_line_total="26.99",
        unit_price="26.99",
        measure_type="each",
        raw_order_path="data/costco-web/raw/c2.json",
        **plain_line,
    )
    giant_patty = make_row(
        retailer="giant",
        order_id="g3",
        line_no="1",
        normalized_row_id="giant:g3:1",
        normalized_item_id="gnorm:patty",
        order_date="2026-03-04",
        item_name="80% PATTIES PK12",
        item_name_norm="80% PATTIES PK12",
        retailer_item_id="102",
        qty="1",
        unit="LB",
        normalized_quantity="",
        normalized_quantity_unit="",
        line_total="10.05",
        unit_price="10.05",
        measure_type="weight",
        price_per_lb="7.7907",
        raw_order_path="data/giant-web/raw/g3.json",
        **plain_line,
    )

    rows, _links = build_purchases.build_purchase_rows(
        [giant_banana, giant_ice, giant_patty],
        [costco_banana, costco_patty],
        [],
        [],
        [],
        [],
        [],
    )
    rows_by_item = {row["normalized_item_id"]: row for row in rows}

    # normalized_item_id -> (effective_price, effective_price_unit)
    expected = {
        "gnorm:banana": ("0.5893", "lb"),
        "cnorm:banana": ("0.4967", "lb"),
        "gnorm:ice": ("0.2495", "lb"),
        "cnorm:patty": ("26.99", "each"),
        "gnorm:patty": ("", ""),
    }
    for item_id, (price, unit) in expected.items():
        self.assertEqual(price, rows_by_item[item_id]["effective_price"])
        self.assertEqual(unit, rows_by_item[item_id]["effective_price_unit"])
def test_build_purchase_rows_leaves_effective_price_blank_without_valid_denominator(self):
    """A line whose normalized quantity is ``"0"`` must get blank price fields."""
    zero_quantity_row = dict.fromkeys(enrich_costco.OUTPUT_FIELDS, "")
    zero_quantity_row.update(
        retailer="giant",
        order_id="g1",
        line_no="1",
        normalized_row_id="giant:g1:1",
        normalized_item_id="gnorm:blank",
        order_date="2026-03-01",
        item_name="MYSTERY ITEM",
        item_name_norm="MYSTERY ITEM",
        retailer_item_id="100",
        qty="1",
        unit="EA",
        # The zero quantity is the case under test.
        normalized_quantity="0",
        normalized_quantity_unit="each",
        line_total="3.50",
        unit_price="3.50",
        measure_type="each",
        raw_order_path="data/giant-web/raw/g1.json",
        is_discount_line="false",
        is_coupon_line="false",
        is_fee="false",
    )

    purchase_rows, _links = build_purchases.build_purchase_rows(
        [zero_quantity_row], [], [], [], [], [], []
    )

    only_row = purchase_rows[0]
    self.assertEqual("", only_row["effective_price"])
    self.assertEqual("", only_row["effective_price_unit"])
def test_purchase_rows_support_visit_level_grouping_without_extra_joins(self):
    """Purchase rows alone must be enough to group and total a store visit.

    Two lines of the same Giant order are built with one order record. The
    test then derives a visit key purely from columns on the purchase rows
    (retailer, order, date and store fields) and checks that both lines
    collapse to one visit whose ``net_line_total`` values sum to 5.50.
    """
    fieldnames = enrich_costco.OUTPUT_FIELDS

    def base_row():
        return {field: "" for field in fieldnames}

    row_one = base_row()
    row_one.update(
        {
            "retailer": "giant",
            "order_id": "g1",
            "line_no": "1",
            "normalized_row_id": "giant:g1:1",
            "normalized_item_id": "gnorm:first",
            "order_date": "2026-03-01",
            "item_name": "FIRST ITEM",
            "item_name_norm": "FIRST ITEM",
            "qty": "1",
            "unit": "EA",
            "normalized_quantity": "1",
            "normalized_quantity_unit": "each",
            "line_total": "3.50",
            "measure_type": "each",
            "raw_order_path": "data/giant-web/raw/g1.json",
            "is_discount_line": "false",
            "is_coupon_line": "false",
            "is_fee": "false",
        }
    )
    row_two = base_row()
    row_two.update(
        {
            "retailer": "giant",
            "order_id": "g1",
            "line_no": "2",
            "normalized_row_id": "giant:g1:2",
            "normalized_item_id": "gnorm:second",
            "order_date": "2026-03-01",
            "item_name": "SECOND ITEM",
            "item_name_norm": "SECOND ITEM",
            "qty": "1",
            "unit": "EA",
            "normalized_quantity": "1",
            "normalized_quantity_unit": "each",
            "line_total": "2.00",
            "measure_type": "each",
            "raw_order_path": "data/giant-web/raw/g1.json",
            "is_discount_line": "false",
            "is_coupon_line": "false",
            "is_fee": "false",
        }
    )
    rows, _links = build_purchases.build_purchase_rows(
        [row_one, row_two],
        [],
        [
            {
                "order_id": "g1",
                "store_name": "Giant",
                "store_number": "42",
                "store_city": "Springfield",
                "store_state": "VA",
            }
        ],
        [],
        [],
        [],
        [],
    )
    # A set of key tuples: lines of the same visit must collapse to one entry.
    visit_key = {
        (
            row["retailer"],
            row["order_id"],
            row["purchase_date"],
            row["store_name"],
            row["store_number"],
            row["store_city"],
            row["store_state"],
        )
        for row in rows
    }
    visit_total = sum(float(row["net_line_total"]) for row in rows)
    self.assertEqual(1, len(visit_key))
    # Currency strings are parsed to binary floats, so compare to the cent
    # rather than bit-for-bit; exact equality would break as soon as the
    # fixture amounts stop being exactly representable.
    self.assertAlmostEqual(5.5, visit_total, places=2)
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,760 @@
import csv
import tempfile
import unittest
from pathlib import Path
from unittest import mock
from click.testing import CliRunner
import enrich_costco
import review_products
def write_review_source_files(tmpdir, rows):
    """Write enriched-item and order CSV fixtures under ``tmpdir``.

    Each entry of ``rows`` is a purchase-style dict (keys such as
    ``purchase_date``, ``raw_item_name``, ``normalized_item_name``). It is
    translated to the ``enrich_costco.OUTPUT_FIELDS`` column names and filed
    under its ``retailer`` (default ``"giant"``). A missing ``order_id`` or
    ``line_no`` is derived from the row's 1-based position. One order record
    is kept per order id, built from the first row that mentions it.

    Returns the four written paths as
    ``(giant_items, costco_items, giant_orders, costco_orders)``.
    """
    out_dir = Path(tmpdir)
    giant_items_csv = out_dir / "giant_items.csv"
    costco_items_csv = out_dir / "costco_items.csv"
    giant_orders_csv = out_dir / "giant_orders.csv"
    costco_orders_csv = out_dir / "costco_orders.csv"

    item_columns = enrich_costco.OUTPUT_FIELDS
    order_columns = ["order_id", "store_name", "store_number", "store_city", "store_state"]

    # Output column -> differently named key on the input row.
    renamed = {
        "order_date": "purchase_date",
        "item_name": "raw_item_name",
        "item_name_norm": "normalized_item_name",
    }
    # Columns copied under the same name, blank when the input omits them.
    blank_when_missing = (
        "normalized_item_id",
        "image_url",
        "upc",
        "line_total",
        "net_line_total",
        "matched_discount_amount",
        "normalized_quantity",
        "normalized_quantity_unit",
        "size_value",
        "size_unit",
        "pack_qty",
        "retailer_item_id",
        "price_per_each",
        "price_per_lb",
        "price_per_oz",
        "raw_order_path",
    )
    # Columns copied under the same name with a non-blank fallback.
    fallback_when_missing = {
        "qty": "1",
        "unit": "EA",
        "measure_type": "each",
        "is_discount_line": "false",
        "is_coupon_line": "false",
        "is_fee": "false",
    }

    items_by_retailer = {"giant": [], "costco": []}
    orders_by_retailer = {"giant": {}, "costco": {}}

    for position, source in enumerate(rows, start=1):
        retailer = source.get("retailer", "giant")
        order_id = source.get("order_id", f"{retailer[0]}{position}")
        line_no = source.get("line_no", str(position))

        record = dict.fromkeys(item_columns, "")
        record["retailer"] = retailer
        record["order_id"] = order_id
        record["line_no"] = line_no
        record["normalized_row_id"] = source.get(
            "normalized_row_id", f"{retailer}:{order_id}:{line_no}"
        )
        for column, source_key in renamed.items():
            record[column] = source.get(source_key, "")
        for column in blank_when_missing:
            record[column] = source.get(column, "")
        for column, fallback in fallback_when_missing.items():
            record[column] = source.get(column, fallback)
        items_by_retailer[retailer].append(record)

        # The first row seen for an order supplies its store details.
        known_orders = orders_by_retailer[retailer]
        if order_id not in known_orders:
            known_orders[order_id] = {
                "order_id": order_id,
                "store_name": source.get("store_name", ""),
                "store_number": source.get("store_number", ""),
                "store_city": source.get("store_city", ""),
                "store_state": source.get("store_state", ""),
            }

    outputs = (
        (giant_items_csv, item_columns, items_by_retailer["giant"]),
        (costco_items_csv, item_columns, items_by_retailer["costco"]),
        (giant_orders_csv, order_columns, orders_by_retailer["giant"].values()),
        (costco_orders_csv, order_columns, orders_by_retailer["costco"].values()),
    )
    for path, columns, records in outputs:
        with path.open("w", newline="", encoding="utf-8") as handle:
            writer = csv.DictWriter(handle, fieldnames=columns)
            writer.writeheader()
            writer.writerows(records)

    return giant_items_csv, costco_items_csv, giant_orders_csv, costco_orders_csv
class ReviewWorkflowTests(unittest.TestCase):
    """Tests for the review-queue builders and the interactive review CLI.

    The CLI tests share their fixture plumbing (temp-dir paths, catalog and
    link CSVs, ``CliRunner`` invocation) through the private helpers below,
    so each test states only the data and keystrokes that matter to it.
    """

    # ------------------------------------------------------------------
    # Fixture helpers (names do not start with "test", so unittest does
    # not collect them).
    # ------------------------------------------------------------------

    @staticmethod
    def _workspace(tmpdir, source_rows):
        """Write the source CSVs and return every path the CLI takes.

        Keys are the ``review_products.main`` parameter names, in the order
        the options are passed on the command line. Only the four source
        files exist afterwards; the remaining paths are merely reserved.
        """
        giant_items, costco_items, giant_orders, costco_orders = write_review_source_files(
            tmpdir, source_rows
        )
        root = Path(tmpdir)
        return {
            "giant_items_enriched_csv": giant_items,
            "costco_items_enriched_csv": costco_items,
            "giant_orders_csv": giant_orders,
            "costco_orders_csv": costco_orders,
            "purchases_csv": root / "purchases.csv",
            "queue_csv": root / "review_queue.csv",
            "resolutions_csv": root / "review_resolutions.csv",
            "catalog_csv": root / "catalog.csv",
            "links_csv": root / "product_links.csv",
        }

    @staticmethod
    def _write_table(path, fieldnames, rows=()):
        """Write a header plus ``rows``; columns a row omits are left blank."""
        with path.open("w", newline="", encoding="utf-8") as handle:
            writer = csv.DictWriter(handle, fieldnames=fieldnames)
            writer.writeheader()
            for row in rows:
                writer.writerow({**dict.fromkeys(fieldnames, ""), **row})

    @classmethod
    def _write_catalog(cls, path, rows=()):
        """Write a catalog CSV (header only when ``rows`` is empty)."""
        cls._write_table(path, review_products.build_purchases.CATALOG_FIELDS, rows)

    @classmethod
    def _write_approved_link(cls, path, normalized_item_id, catalog_id):
        """Write a links CSV holding one approved, high-confidence manual link."""
        cls._write_table(
            path,
            review_products.build_purchases.PRODUCT_LINK_FIELDS,
            [
                {
                    "normalized_item_id": normalized_item_id,
                    "catalog_id": catalog_id,
                    "link_method": "manual_link",
                    "link_confidence": "high",
                    "review_status": "approved",
                    "reviewed_by": "",
                    "reviewed_at": "",
                    "link_notes": "",
                }
            ],
        )

    @staticmethod
    def _read_table(path):
        """Return every data row of a CSV file as a list of dicts."""
        with path.open(newline="", encoding="utf-8") as handle:
            return list(csv.DictReader(handle))

    @staticmethod
    def _invoke(paths, user_input, limit=None):
        """Run the review CLI against ``paths`` feeding ``user_input`` on stdin."""
        args = []
        for parameter, path in paths.items():
            # The option spelling is the parameter name with dashes,
            # e.g. queue_csv -> --queue-csv.
            args += [f"--{parameter.replace('_', '-')}", str(path)]
        if limit is not None:
            args += ["--limit", str(limit)]
        return CliRunner().invoke(review_products.main, args, input=user_input, color=True)

    @staticmethod
    def _giant_row(normalized_item_id, raw_item_name, normalized_item_name, line_total,
                   purchase_date="2026-03-14"):
        """Return a one-element source-row list for Giant order ``g1`` line 1."""
        return [
            {
                "purchase_date": purchase_date,
                "retailer": "giant",
                "order_id": "g1",
                "line_no": "1",
                "normalized_item_id": normalized_item_id,
                "raw_item_name": raw_item_name,
                "normalized_item_name": normalized_item_name,
                "image_url": "",
                "upc": "",
                "line_total": line_total,
            }
        ]

    @staticmethod
    def _mixed_pepper_rows(second_image_url=""):
        """Return two Costco pepper lines (one item id) plus one Giant line.

        ``second_image_url`` is the image URL of the older Costco line.
        """
        return [
            {
                "purchase_date": "2026-03-14",
                "retailer": "costco",
                "order_id": "c2",
                "line_no": "2",
                "normalized_item_id": "cnorm_mix",
                "raw_item_name": "MIXED PEPPER 6-PACK",
                "normalized_item_name": "MIXED PEPPER",
                "image_url": "",
                "upc": "",
                "line_total": "7.49",
            },
            {
                "purchase_date": "2026-03-12",
                "retailer": "costco",
                "order_id": "c1",
                "line_no": "1",
                "normalized_item_id": "cnorm_mix",
                "raw_item_name": "MIXED PEPPER 6-PACK",
                "normalized_item_name": "MIXED PEPPER",
                "image_url": second_image_url,
                "upc": "",
                "line_total": "6.99",
            },
            {
                "purchase_date": "2026-03-10",
                "retailer": "giant",
                "order_id": "g1",
                "line_no": "1",
                "normalized_item_id": "gnorm_mix",
                "raw_item_name": "MIXED PEPPER",
                "normalized_item_name": "MIXED PEPPER",
                "image_url": "",
                "upc": "",
                "line_total": "5.99",
            },
        ]

    # ------------------------------------------------------------------
    # Pure-function tests
    # ------------------------------------------------------------------

    def test_build_review_queue_groups_unresolved_purchases(self):
        """Two unresolved lines sharing an item id yield a single queue entry."""
        queue_rows = review_products.build_review_queue(
            [
                {
                    "normalized_item_id": "gnorm_1",
                    "catalog_id": "",
                    "retailer": "giant",
                    "raw_item_name": "SB BAGGED ICE 20LB",
                    "normalized_item_name": "BAGGED ICE",
                    "upc": "",
                    "line_total": "3.50",
                    "is_fee": "false",
                    "is_discount_line": "false",
                    "is_coupon_line": "false",
                },
                {
                    "normalized_item_id": "gnorm_1",
                    "catalog_id": "",
                    "retailer": "giant",
                    "raw_item_name": "SB BAG ICE CUBED 10LB",
                    "normalized_item_name": "BAG ICE",
                    "upc": "",
                    "line_total": "2.50",
                    "is_fee": "false",
                    "is_discount_line": "false",
                    "is_coupon_line": "false",
                },
            ],
            [],
        )
        self.assertEqual(1, len(queue_rows))
        self.assertEqual("gnorm_1", queue_rows[0]["normalized_item_id"])
        self.assertIn("SB BAGGED ICE 20LB", queue_rows[0]["raw_item_names"])

    def test_build_catalog_suggestions_prefers_upc_then_name(self):
        """A UPC match outranks a catalog entry that only matches by name."""
        suggestions = review_products.build_catalog_suggestions(
            [
                {
                    "normalized_item_name": "MIXED PEPPER",
                    "upc": "12345",
                }
            ],
            [
                {
                    "normalized_item_id": "prior_1",
                    "normalized_item_name": "MIXED PEPPER 6 PACK",
                    "upc": "12345",
                    "catalog_id": "cat_2",
                }
            ],
            [
                {
                    "catalog_id": "cat_1",
                    "catalog_name": "MIXED PEPPER",
                },
                {
                    "catalog_id": "cat_2",
                    "catalog_name": "MIXED PEPPER 6 PACK",
                },
            ],
        )
        self.assertEqual("cat_2", suggestions[0]["catalog_id"])
        self.assertEqual("exact upc", suggestions[0]["reason"])

    def test_search_catalog_rows_ranks_token_overlap(self):
        """The catalog row sharing more query tokens is ranked first."""
        results = review_products.search_catalog_rows(
            "mixed pepper",
            [
                {
                    "catalog_id": "cat_1",
                    "catalog_name": "MIXED PEPPER",
                    "product_type": "pepper",
                    "category": "produce",
                    "variant": "",
                },
                {
                    "catalog_id": "cat_2",
                    "catalog_name": "GROUND PEPPER",
                    "product_type": "spice",
                    "category": "baking",
                    "variant": "",
                },
            ],
            [
                {
                    "normalized_item_id": "gnorm_mix",
                    "catalog_id": "cat_1",
                }
            ],
            "cnorm_mix",
        )
        self.assertEqual("cat_1", results[0]["catalog_id"])
        self.assertGreater(results[0]["score"], results[1]["score"])

    # ------------------------------------------------------------------
    # Interactive CLI tests
    # ------------------------------------------------------------------

    def test_review_products_displays_position_items_and_suggestions(self):
        """The review screen lists matched items, suggestions and the prompt."""
        with tempfile.TemporaryDirectory() as tmpdir:
            paths = self._workspace(
                tmpdir,
                self._mixed_pepper_rows(
                    second_image_url="https://example.test/mixed-pepper.jpg"
                ),
            )
            self._write_catalog(
                paths["catalog_csv"],
                [
                    {
                        "catalog_id": "cat_mix",
                        "catalog_name": "MIXED PEPPER",
                        "category": "produce",
                        "product_type": "pepper",
                    }
                ],
            )
            self._write_approved_link(paths["links_csv"], "gnorm_mix", "cat_mix")

            result = self._invoke(paths, "q\n")

            self.assertEqual(0, result.exit_code)
            self.assertIn("Review guide:", result.output)
            self.assertIn("Review 1/1: MIXED PEPPER", result.output)
            self.assertIn("2 matched items:", result.output)
            self.assertIn("[#] link to suggestion [f]ind [n]ew [s]kip e[x]clude [q]uit >", result.output)
            first_item = result.output.index("[1] MIXED PEPPER 6-PACK | costco | 2026-03-14 | 7.49 | ")
            second_item = result.output.index("[2] MIXED PEPPER 6-PACK | costco | 2026-03-12 | 6.99 | https://example.test/mixed-pepper.jpg")
            # The more recent purchase must be printed before the older one.
            self.assertLess(first_item, second_item)
            self.assertIn("1 catalog_name suggestions found:", result.output)
            self.assertIn("[1] MIXED PEPPER, pepper, produce (1 items, 1 rows)", result.output)
            # color=True must leave ANSI escape sequences in the output.
            self.assertIn("\x1b[", result.output)

    def test_review_products_no_suggestions_is_informational(self):
        """An empty catalog produces a notice rather than an error."""
        with tempfile.TemporaryDirectory() as tmpdir:
            paths = self._workspace(
                tmpdir,
                self._giant_row("gnorm_ice", "SB BAGGED ICE 20LB", "BAGGED ICE", "3.50"),
            )
            self._write_catalog(paths["catalog_csv"])

            result = self._invoke(paths, "q\n")

            self.assertEqual(0, result.exit_code)
            self.assertIn("no catalog_name suggestions found", result.output)

    def test_search_links_catalog_and_writes_link_row(self):
        """Find -> pick result 1 -> note records a ``link`` resolution and link row."""
        with tempfile.TemporaryDirectory() as tmpdir:
            paths = self._workspace(tmpdir, self._mixed_pepper_rows())
            self._write_catalog(
                paths["catalog_csv"],
                [{"catalog_id": "cat_mix", "catalog_name": "MIXED PEPPER"}],
            )
            self._write_approved_link(paths["links_csv"], "gnorm_mix", "cat_mix")

            result = self._invoke(
                paths, "f\nmixed pepper\n1\nlinked by test\n", limit=1
            )

            self.assertEqual(0, result.exit_code)
            self.assertIn("1 search results found:", result.output)
            rows = self._read_table(paths["resolutions_csv"])
            link_rows = self._read_table(paths["links_csv"])
            self.assertEqual("cat_mix", rows[0]["catalog_id"])
            self.assertEqual("link", rows[0]["resolution_action"])
            self.assertEqual("cat_mix", link_rows[0]["catalog_id"])

    def test_search_no_matches_allows_retry_or_return(self):
        """A search with no hits reports it and lets the user back out."""
        with tempfile.TemporaryDirectory() as tmpdir:
            paths = self._workspace(
                tmpdir,
                self._giant_row("gnorm_ice", "SB BAGGED ICE 20LB", "BAGGED ICE", "3.50"),
            )
            self._write_catalog(
                paths["catalog_csv"],
                [
                    {
                        "catalog_id": "cat_ice",
                        "catalog_name": "ICE",
                        "category": "frozen",
                        "product_type": "ice",
                    }
                ],
            )

            result = self._invoke(paths, "f\nzzz\nq\nq\n")

            self.assertEqual(0, result.exit_code)
            self.assertIn("no matches found", result.output)

    def test_skip_remains_available_from_main_prompt(self):
        """Answering ``s`` records a pending ``skip`` resolution."""
        with tempfile.TemporaryDirectory() as tmpdir:
            paths = self._workspace(
                tmpdir,
                self._giant_row("gnorm_skip", "TEST ITEM", "TEST ITEM", "1.00"),
            )
            self._write_catalog(paths["catalog_csv"])

            result = self._invoke(paths, "s\n", limit=1)

            self.assertEqual(0, result.exit_code)
            rows = self._read_table(paths["resolutions_csv"])
            self.assertEqual("skip", rows[0]["resolution_action"])
            self.assertEqual("pending", rows[0]["status"])

    def test_review_products_creates_catalog_and_resolution(self):
        """The ``n`` flow creates a catalog entry, a resolution and a link.

        No catalog or links file is written beforehand, so this also covers
        starting from missing files. ``click.prompt`` is patched directly and
        the command callback is called without going through ``CliRunner``.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            paths = self._workspace(
                tmpdir,
                self._giant_row(
                    "gnorm_ice",
                    "SB BAGGED ICE 20LB",
                    "BAGGED ICE",
                    "3.50",
                    purchase_date="2026-03-15",
                ),
            )
            with mock.patch.object(
                review_products.click,
                "prompt",
                side_effect=["n", "ICE", "frozen", "ice", "manual merge", "q"],
            ):
                review_products.main.callback(
                    **{parameter: str(path) for parameter, path in paths.items()},
                    limit=1,
                    refresh_only=False,
                )

            for produced in ("queue_csv", "resolutions_csv", "catalog_csv", "links_csv"):
                self.assertTrue(paths[produced].exists())
            queue_rows = self._read_table(paths["queue_csv"])
            resolution_rows = self._read_table(paths["resolutions_csv"])
            catalog_rows = self._read_table(paths["catalog_csv"])
            link_rows = self._read_table(paths["links_csv"])
            self.assertEqual("approved", queue_rows[0]["status"])
            self.assertEqual("create", queue_rows[0]["resolution_action"])
            self.assertEqual("create", resolution_rows[0]["resolution_action"])
            self.assertEqual("approved", resolution_rows[0]["status"])
            self.assertEqual("ICE", catalog_rows[0]["catalog_name"])
            self.assertEqual(catalog_rows[0]["catalog_id"], link_rows[0]["catalog_id"])

    def test_build_review_queue_readds_orphaned_and_incomplete_links(self):
        """Links to a missing or incomplete catalog entry re-enter the queue.

        ``cat_missing`` is absent from the catalog rows, and
        ``cat_incomplete`` is present but has a blank ``product_type``.
        """
        purchase_rows = [
            {
                "normalized_item_id": "gnorm_orphan",
                "catalog_id": "cat_missing",
                "retailer": "giant",
                "raw_item_name": "ORPHAN ITEM",
                "normalized_item_name": "ORPHAN ITEM",
                "upc": "",
                "line_total": "3.50",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
            },
            {
                "normalized_item_id": "gnorm_incomplete",
                "catalog_id": "cat_incomplete",
                "retailer": "giant",
                "raw_item_name": "INCOMPLETE ITEM",
                "normalized_item_name": "INCOMPLETE ITEM",
                "upc": "",
                "line_total": "4.50",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
            },
        ]
        link_rows = [
            {
                "normalized_item_id": "gnorm_orphan",
                "catalog_id": "cat_missing",
            },
            {
                "normalized_item_id": "gnorm_incomplete",
                "catalog_id": "cat_incomplete",
            },
        ]
        catalog_rows = [
            {
                "catalog_id": "cat_incomplete",
                "catalog_name": "INCOMPLETE ITEM",
                "product_type": "",
            }
        ]
        queue_rows = review_products.build_review_queue(
            purchase_rows,
            [],
            link_rows,
            catalog_rows,
            [],
        )
        reasons = {row["normalized_item_id"]: row["reason_code"] for row in queue_rows}
        self.assertEqual("orphaned_catalog_link", reasons["gnorm_orphan"])
        self.assertEqual("incomplete_catalog_link", reasons["gnorm_incomplete"])
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@@ -3,7 +3,7 @@ import tempfile
import unittest import unittest
from pathlib import Path from pathlib import Path
import scraper import scrape_giant as scraper
class ScraperTests(unittest.TestCase): class ScraperTests(unittest.TestCase):
@@ -58,14 +58,25 @@ class ScraperTests(unittest.TestCase):
} }
] ]
orders, items = scraper.flatten_orders(history, details) orders, items = scraper.flatten_orders(
history,
details,
history_path=Path("data/giant-web/raw/history.json"),
raw_dir=Path("data/giant-web/raw"),
)
self.assertEqual(1, len(orders)) self.assertEqual(1, len(orders))
self.assertEqual("abc123", orders[0]["order_id"]) self.assertEqual("abc123", orders[0]["order_id"])
self.assertEqual("giant", orders[0]["retailer"])
self.assertEqual("PICKUP", orders[0]["service_type"]) self.assertEqual("PICKUP", orders[0]["service_type"])
self.assertEqual("data/giant-web/raw/history.json", orders[0]["raw_history_path"])
self.assertEqual("data/giant-web/raw/abc123.json", orders[0]["raw_order_path"])
self.assertEqual(1, len(items)) self.assertEqual(1, len(items))
self.assertEqual("1", items[0]["line_no"]) self.assertEqual("1", items[0]["line_no"])
self.assertEqual("Bananas", items[0]["item_name"]) self.assertEqual("Bananas", items[0]["item_name"])
self.assertEqual("giant", items[0]["retailer"])
self.assertEqual("data/giant-web/raw/abc123.json", items[0]["raw_order_path"])
self.assertEqual("false", items[0]["is_discount_line"])
def test_append_dedup_replaces_duplicate_rows_and_preserves_new_values(self): def test_append_dedup_replaces_duplicate_rows_and_preserves_new_values(self):
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir: