Compare commits

...

12 Commits

Author SHA1 Message Date
ben
de8ff535b8 1.18 cleanup and review 2026-03-24 08:27:41 -04:00
ben
02be6f52c0 Record t1.19 task evidence 2026-03-23 15:32:48 -04:00
ben
8ccf3ff43b Reconcile review queue against current catalog state 2026-03-23 15:32:41 -04:00
ben
a93229408b Record t1.18.4 task evidence 2026-03-23 15:28:05 -04:00
ben
a45522c110 Finalize purchase effective price fields 2026-03-23 15:27:58 -04:00
ben
d78230f1c6 Record t1.18.3 task evidence 2026-03-23 13:56:56 -04:00
ben
73176117fe Fix Costco hash-size weight parsing 2026-03-23 13:56:47 -04:00
ben
facebced9c Record t1.18.2 task evidence 2026-03-23 13:23:03 -04:00
ben
23dfc3de3e Use picked weight for Giant quantity basis 2026-03-23 13:22:56 -04:00
ben
3bc76ed243 Record t1.18 and t1.18.1 evidence 2026-03-23 12:54:09 -04:00
ben
dc0d0614bb Add effective price to purchases 2026-03-23 12:53:54 -04:00
ben
605c94498b Add effective price regression tests 2026-03-23 12:52:41 -04:00
11 changed files with 1012 additions and 267 deletions

View File

@@ -10,6 +10,14 @@ from layer_helpers import read_csv_rows, write_csv_rows
PURCHASE_FIELDS = [ PURCHASE_FIELDS = [
"purchase_date", "purchase_date",
"retailer", "retailer",
"catalog_name",
"product_type",
"category",
"net_line_total",
"normalized_quantity",
"normalized_quantity_unit",
"effective_price",
"effective_price_unit",
"order_id", "order_id",
"line_no", "line_no",
"normalized_row_id", "normalized_row_id",
@@ -19,9 +27,6 @@ PURCHASE_FIELDS = [
"resolution_action", "resolution_action",
"raw_item_name", "raw_item_name",
"normalized_item_name", "normalized_item_name",
"catalog_name",
"category",
"product_type",
"brand", "brand",
"variant", "variant",
"image_url", "image_url",
@@ -29,8 +34,6 @@ PURCHASE_FIELDS = [
"upc", "upc",
"qty", "qty",
"unit", "unit",
"normalized_quantity",
"normalized_quantity_unit",
"pack_qty", "pack_qty",
"size_value", "size_value",
"size_unit", "size_unit",
@@ -172,6 +175,41 @@ def derive_metrics(row):
} }
def derive_effective_price(row):
normalized_quantity = to_decimal(row.get("normalized_quantity"))
if normalized_quantity in (None, Decimal("0")):
return ""
numerator = to_decimal(derive_net_line_total(row))
if numerator is None:
return ""
return format_decimal(numerator / normalized_quantity)
def derive_effective_price_unit(row):
normalized_quantity = to_decimal(row.get("normalized_quantity"))
if normalized_quantity in (None, Decimal("0")):
return ""
return row.get("normalized_quantity_unit", "")
def derive_net_line_total(row):
existing_net = row.get("net_line_total", "")
if str(existing_net).strip() != "":
return str(existing_net)
line_total = to_decimal(row.get("line_total"))
if line_total is None:
return ""
matched_discount_amount = to_decimal(row.get("matched_discount_amount"))
if matched_discount_amount is not None:
return format_decimal(line_total + matched_discount_amount)
return format_decimal(line_total)
def order_lookup(rows, retailer): def order_lookup(rows, retailer):
return {(retailer, row["order_id"]): row for row in rows} return {(retailer, row["order_id"]): row for row in rows}
@@ -320,6 +358,14 @@ def build_purchase_rows(
{ {
"purchase_date": row["order_date"], "purchase_date": row["order_date"],
"retailer": row["retailer"], "retailer": row["retailer"],
"catalog_name": catalog_row.get("catalog_name", ""),
"product_type": catalog_row.get("product_type", ""),
"category": catalog_row.get("category", ""),
"net_line_total": derive_net_line_total(row),
"normalized_quantity": row.get("normalized_quantity", ""),
"normalized_quantity_unit": row.get("normalized_quantity_unit", ""),
"effective_price": derive_effective_price({**row, "net_line_total": derive_net_line_total(row)}),
"effective_price_unit": derive_effective_price_unit(row),
"order_id": row["order_id"], "order_id": row["order_id"],
"line_no": row["line_no"], "line_no": row["line_no"],
"normalized_row_id": row.get("normalized_row_id", ""), "normalized_row_id": row.get("normalized_row_id", ""),
@@ -329,9 +375,6 @@ def build_purchase_rows(
"resolution_action": resolution.get("resolution_action", ""), "resolution_action": resolution.get("resolution_action", ""),
"raw_item_name": row["item_name"], "raw_item_name": row["item_name"],
"normalized_item_name": row["item_name_norm"], "normalized_item_name": row["item_name_norm"],
"catalog_name": catalog_row.get("catalog_name", ""),
"category": catalog_row.get("category", ""),
"product_type": catalog_row.get("product_type", ""),
"brand": catalog_row.get("brand", ""), "brand": catalog_row.get("brand", ""),
"variant": catalog_row.get("variant", ""), "variant": catalog_row.get("variant", ""),
"image_url": row.get("image_url", ""), "image_url": row.get("image_url", ""),
@@ -339,8 +382,6 @@ def build_purchase_rows(
"upc": row["upc"], "upc": row["upc"],
"qty": row["qty"], "qty": row["qty"],
"unit": row["unit"], "unit": row["unit"],
"normalized_quantity": row.get("normalized_quantity", ""),
"normalized_quantity_unit": row.get("normalized_quantity_unit", ""),
"pack_qty": row["pack_qty"], "pack_qty": row["pack_qty"],
"size_value": row["size_value"], "size_value": row["size_value"],
"size_unit": row["size_unit"], "size_unit": row["size_unit"],
@@ -348,7 +389,6 @@ def build_purchase_rows(
"line_total": row["line_total"], "line_total": row["line_total"],
"unit_price": row["unit_price"], "unit_price": row["unit_price"],
"matched_discount_amount": row.get("matched_discount_amount", ""), "matched_discount_amount": row.get("matched_discount_amount", ""),
"net_line_total": row.get("net_line_total", ""),
"store_name": order_row.get("store_name", ""), "store_name": order_row.get("store_name", ""),
"store_number": order_row.get("store_number", ""), "store_number": order_row.get("store_number", ""),
"store_city": order_row.get("store_city", ""), "store_city": order_row.get("store_city", ""),

View File

@@ -29,7 +29,7 @@ CODE_TOKEN_RE = re.compile(
r"\b(?:SL\d+|T\d+H\d+|P\d+(?:/\d+)?|W\d+T\d+H\d+|FY\d+|CSPC#|C\d+T\d+H\d+|EC\d+T\d+H\d+|\d+X\d+)\b" r"\b(?:SL\d+|T\d+H\d+|P\d+(?:/\d+)?|W\d+T\d+H\d+|FY\d+|CSPC#|C\d+T\d+H\d+|EC\d+T\d+H\d+|\d+X\d+)\b"
) )
PACK_FRACTION_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*/\s*(\d+(?:\.\d+)?)\s*(OZ|LB|LBS|CT)\b") PACK_FRACTION_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*/\s*(\d+(?:\.\d+)?)\s*(OZ|LB|LBS|CT)\b")
HASH_SIZE_RE = re.compile(r"(?<![A-Z0-9])(\d+(?:\.\d+)?)#\b") HASH_SIZE_RE = re.compile(r"(?<![A-Z0-9])(\d+(?:\.\d+)?)#(?=\s|$)")
ITEM_CODE_RE = re.compile(r"#\w+\b") ITEM_CODE_RE = re.compile(r"#\w+\b")
DUAL_WEIGHT_RE = re.compile( DUAL_WEIGHT_RE = re.compile(
r"\b\d+(?:\.\d+)?\s*(?:KG|G|LB|LBS|OZ)\s*/\s*\d+(?:\.\d+)?\s*(?:KG|G|LB|LBS|OZ)\b" r"\b\d+(?:\.\d+)?\s*(?:KG|G|LB|LBS|OZ)\s*/\s*\d+(?:\.\d+)?\s*(?:KG|G|LB|LBS|OZ)\b"
@@ -199,6 +199,7 @@ def parse_costco_item(order_id, order_date, raw_path, line_no, item):
size_unit, size_unit,
pack_qty, pack_qty,
measure_type, measure_type,
"",
) )
identity_key, normalization_basis = normalization_identity( identity_key, normalization_basis = normalization_identity(
{ {

View File

@@ -344,10 +344,11 @@ def derive_prices(item, measure_type, size_value="", size_unit="", pack_qty=""):
return price_per_each, price_per_lb, price_per_oz return price_per_each, price_per_lb, price_per_oz
def derive_normalized_quantity(qty, size_value, size_unit, pack_qty, measure_type): def derive_normalized_quantity(qty, size_value, size_unit, pack_qty, measure_type, picked_weight=""):
parsed_qty = to_decimal(qty) parsed_qty = to_decimal(qty)
parsed_size = to_decimal(size_value) parsed_size = to_decimal(size_value)
parsed_pack = to_decimal(pack_qty) parsed_pack = to_decimal(pack_qty)
parsed_picked_weight = to_decimal(picked_weight)
total_multiplier = None total_multiplier = None
if parsed_qty not in (None, Decimal("0")): if parsed_qty not in (None, Decimal("0")):
total_multiplier = parsed_qty * (parsed_pack or Decimal("1")) total_multiplier = parsed_qty * (parsed_pack or Decimal("1"))
@@ -358,6 +359,8 @@ def derive_normalized_quantity(qty, size_value, size_unit, pack_qty, measure_typ
and total_multiplier not in (None, Decimal("0")) and total_multiplier not in (None, Decimal("0"))
): ):
return format_decimal(parsed_size * total_multiplier), size_unit return format_decimal(parsed_size * total_multiplier), size_unit
if measure_type == "weight" and parsed_picked_weight not in (None, Decimal("0")):
return format_decimal(parsed_picked_weight), "lb"
if measure_type == "count" and total_multiplier not in (None, Decimal("0")): if measure_type == "count" and total_multiplier not in (None, Decimal("0")):
return format_decimal(total_multiplier), "count" return format_decimal(total_multiplier), "count"
if measure_type == "each" and parsed_qty not in (None, Decimal("0")): if measure_type == "each" and parsed_qty not in (None, Decimal("0")):
@@ -441,6 +444,7 @@ def parse_item(order_id, order_date, raw_path, line_no, item):
size_unit, size_unit,
pack_qty, pack_qty,
measure_type, measure_type,
item.get("totalPickedWeight"),
) )
identity_key, normalization_basis = normalization_identity( identity_key, normalization_basis = normalization_identity(
{ {

View File

@@ -587,4 +587,68 @@ instead of
[5] yellow onion, onion, produce (0 items, 0 rows) [5] yellow onion, onion, produce (0 items, 0 rows)
selection: selection:
* * data cleanup [2026-03-23 Mon]
ok we're getting closer. still see some issues
1. reorder purchases columns for display: catalog_name, product_type, category (makes data/troubleshooting way easier)
2. shouldn't net_line_total never be empty? to allow cumulative cost comparison/analysis (we can see normalized price per X via effective_price, but shouldn't this be weighted against how much we bought? e.g. if we bought 5lb flour at $0.970/lb this is weighted 1-to-1 with a 25lb purchase at $0.670/lb)
3. some items missing entire categorizations? probably a result of me trying to do data cleanup. i found the orphaned values in the product_links table and removed them, but re-running review_products.py did not catch this...
shouldn't review_products compare each vendor's normalized_items against the existing review_queue?
RSET POTATO US 1
GREEK YOGURT DOM55
FDLY CHY VAN IC CRM
DUNKIN DONUT CANISTER ORIG BLND P=260
ICE CUBES
BLACK BEANS
KETCHUP SQUEEZE BTL
YELLOW_GOLD POTATO US 1
YELLOW_GOLD POTATO US 1
PINTO BEANS
4. cleanup deprecated .py files
5. Goals:
1. When have I purchased this item, what did I pay, and how has the price changed over time?
- we're close, but missing units - eg AP flour shows a value that looks like price/lb but you just see $0.765
- doesn't seem like we've captured everything but that's just a gut feeling
2. Visit breakdown as well as catalog/product/category? this certainly belongs in purchases.csv.
3. Consider dash/plotly for better-than-excel tracking, since we're really only looking at a couple of graphs and filtering within certain values? (obv keep purchases as a user-friendly output)
** 1. Cleanup purchases column order
purchase_date
retailer
catalog_name
product_type
category
net_line_total
normalized_quantity
effective_price
effective_price_unit (new)
order_id
line_no
raw_item_name
normalized_item_name
catalog_id
normalized_item_id
** 2. Populate and use purchases.net_line_total
net_line_total = line_total + matched_discount_amount
effective_price = net_line_total / normalized_quantity
weighted cost analysis uses net_line_total, not just avg effective_price
** 3. Improve review robustness, enable norm_item re review
1. should regenerate candidates from:
- normalized items with no valid catalog_id
- normalized items whose linked catalog_id no longer exists
- normalized items whose linked catalog row exists but missing required fields if you want completeness review
2. review_products.py should compare:
- current normalized universe
- current product_links
- current catalog
- current review_queue
** 4. Remove deprecated .py files
** 5. Improve Charts
1. Histogram: add effective_price_unit to purchases.py
1. Visits: plot by order_id enable display of:
1. spend by visit
2. items per visit
3. category spend by visit
4. retailer/store breakdown
* /

View File

@@ -803,26 +803,19 @@ correct and document deterministic normalized quantity fields so unit-cost analy
- The missing purchases fields were a carry-through bug: normalization had `normalized_quantity` and `normalized_quantity_unit`, but `build_purchases.py` never wrote them into `data/review/purchases.csv`. - The missing purchases fields were a carry-through bug: normalization had `normalized_quantity` and `normalized_quantity_unit`, but `build_purchases.py` never wrote them into `data/review/purchases.csv`.
- Normalized quantity now prefers explicit package basis over `each`, so rows like `PEPSI 6PK 7.5Z` resolve to `90 oz` and `KS ALMND BAR US 1.74QTS` purchased twice resolves to `3.48 qt`. - Normalized quantity now prefers explicit package basis over `each`, so rows like `PEPSI 6PK 7.5Z` resolve to `90 oz` and `KS ALMND BAR US 1.74QTS` purchased twice resolves to `3.48 qt`.
- The derivation stays conservative and does not convert units during normalization; parsed units such as `oz`, `lb`, `qt`, and `count` are preserved as-is. - The derivation stays conservative and does not convert units during normalization; parsed units such as `oz`, `lb`, `qt`, and `count` are preserved as-is.
* [ ] t1.18: add regression tests for known quantity/price failures (1-2 commits) * [X] t1.18: add regression tests for known quantity/price failures (1-2 commits)
capture the currently broken comparison cases before changing normalization or purchases logic capture the currently broken comparison cases before changing normalization or purchases logic
** acceptance criteria ** acceptance criteria
1. when generating `data/purchases.csv`, add `effective_price` = `effective_total` / `normalized_quantity` 1. ensure the new tests assert the intended `effective_price` behavior for the known banana, ice, and beef patty examples
2. define `effective_price` behavior explicitly from the covered cases: 2. add tests covering known broken cases:
- use `net_line_total` when present and non-zero, else use `line_total`
- divide by `normalized_quantity` when `normalized_quantity > 0`
- leave blank when no valid denominator exists
- never emit `0` or divide-by-zero for missing-basis cases
- `effective_price` only comparable within same `normalized_quantity_unit` unless later analysis converts the units
3. ensure the new tests assert the intended `effective_price` behavior for the known banana, ice, and beef patty examples
4. add tests covering known broken cases:
- giant bananas produce non-blank effective price - giant bananas produce non-blank effective price
- giant bagged ice produces non-zero effective price - giant bagged ice produces non-zero effective price
- costco bananas retain correct effective price - costco bananas retain correct effective price
- beef patty comparison rows preserve expected quantity basis behavior - beef patty comparison rows preserve expected quantity basis behavior
5. tests fail against current broken behavior and document the expected outcome 3. tests fail against current broken behavior and document the expected outcome
6. include at least one assertion that effective_price is blank rather than `0` or divide-by-zero when no denominator exists 4. include at least one assertion that effective_price is blank rather than `0` or divide-by-zero when no denominator exists
7. pm note: this task should only add tests/fixtures and not change business logic - pm note: this task should only add tests/fixtures and not change business logic
** pm identified problems ** pm identified problems
we have a few problems to scope. looks like: we have a few problems to scope. looks like:
1. normalize_giant_web not always propagating weight data to price_per 1. normalize_giant_web not always propagating weight data to price_per
@@ -862,34 +855,40 @@ purchase_date retailer normalized_item_name catalog_name category product_type q
10/10/2025 giant BAGGED ICE bagged ice cubes frozen ice 1 EA 20 lb 20 lb weight 4.99 4.99 4.99 line_total_over_qty 0.2495 parsed_size_lb 0.0156 parsed_size_lb_to_oz 0 10/10/2025 giant BAGGED ICE bagged ice cubes frozen ice 1 EA 20 lb 20 lb weight 4.99 4.99 4.99 line_total_over_qty 0.2495 parsed_size_lb 0.0156 parsed_size_lb_to_oz 0
``` ```
** evidence ** evidence
- commit: - commit: `605c944`
- tests: - tests: `./venv/bin/python -m unittest tests.test_purchases` (fails as expected before implementation: missing `effective_price` in purchases rows)
- datetime: - datetime: 2026-03-23 12:52:32 EDT
** notes ** notes
- Added purchases-level regression coverage for the known comparison cases before implementation: Giant banana, Costco banana, Giant bagged ice, Costco beef patties, and a blank-denominator case.
- The current failure mode is the intended one for this task: `build_purchase_rows()` does not yet emit `effective_price`, so the tests document the missing behavior before `t1.18.1`.
* [ ] t1.18.1: fix effective price calculation precedence and blank handling (1-3 commits) * [X] t1.18.1: fix effective price calculation precedence and blank handling (1-3 commits)
correct purchases/effective price logic for the known broken cases using existing normalized fields correct purchases/effective price logic for the known broken cases using existing normalized fields
** acceptance criteria ** acceptance criteria
1. effective_price uses explicit numerator precedence: 1. when generating `data/purchases.csv`, add `effective_price` = `effective_total` / `normalized_quantity`
2. effective_price uses explicit numerator precedence:
- prefer `net_line_total` - prefer `net_line_total`
- fallback to `line_total` - fallback to `line_total`
2. effective_price uses `normalized_quantity` when present and > 0 3. effective_price uses `normalized_quantity` if not blank
3. effective_price is blank when no valid denominator exists 4. effective_price is blank when no valid denominator exists
4. effective_price is never written as `0` or divide-by-zero for missing-basis cases 5. effective_price is never written as `0` or divide-by-zero for missing-basis cases
5. existing regression tests for bananas and ice pass 6. effective_price is only comparable within same `normalized_quantity_unit` unless later analysis converts the units
7. existing regression tests for bananas and ice pass
- pm note: keep this limited to calculation logic; do not broaden into catalog or review changes - pm note: keep this limited to calculation logic; do not broaden into catalog or review changes
** evidence ** evidence
- commit: - commit: `dc0d061`
- tests: - tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python build_purchases.py`
- datetime: - datetime: 2026-03-23 12:53:34 EDT
** notes ** notes
- `effective_price` is now a downstream purchases field only. It does not replace `price_per_lb` / `price_per_each`; it gives one deterministic comparison value based on the existing normalized quantity basis.
- The implemented precedence is: use non-zero `net_line_total` when present, otherwise `line_total`; divide by `normalized_quantity` when that denominator is > 0; otherwise leave blank.
- This keeps the calculation conservative for mixed-quality data: Costco bananas and ice now compute correctly, while rows like Giant patties with no quantity basis stay blank instead of producing `0` or a divide-by-zero artifact.
* [X] t1.18.2: fix giant normalization quantity carry-through for weight-based items (1-3 commits)
* [ ] t1.18.2: fix giant normalization quantity carry-through for weight-based items (1-3 commits)
ensure giant normalization emits usable normalized quantity for known weight-based cases ensure giant normalization emits usable normalized quantity for known weight-based cases
** acceptance criteria ** acceptance criteria
@@ -898,6 +897,149 @@ ensure giant normalization emits usable normalized quantity for known weight-bas
3. existing regression tests pass without changing normalized_item_id behavior 3. existing regression tests pass without changing normalized_item_id behavior
4. blanks are preserved only when no deterministic quantity basis exists 4. blanks are preserved only when no deterministic quantity basis exists
- pm note: this task is about normalization carry-through, not fuzzy matching or catalog cleanup - pm note: this task is about normalization carry-through, not fuzzy matching or catalog cleanup
** pm notes
*** banana
giant bananas have picked weight and price_per_oz but normalized missing
| purchase_date | retailer | normalized_item_name | catalog_name | qty | unit | normalized_quantity | normalized_quantity_unit | pack_qty | size_value | size_unit | measure_type | line_total | unit_price | net_line_total | price_per_each | price_per_each_basis | price_per_count | price_per_count_basis | price_per_lb | price_per_lb_basis | price_per_oz | price_per_oz_basis | effective_price |
| 8/6/2024 | costco | BANANAS 3 LB / 1.36 KG | BANANA | 1 | E | 3 | lb | | 3 | lb | weight | 1.49 | 1.49 | 1.49 | 1.49 | line_total_over_qty | | | 0.4967 | parsed_size_lb | 0.031 | parsed_size_lb_to_oz | $0.50 |
| 12/6/2024 | giant | FRESH BANANA | BANANA | 1 | LB | | | | | | weight | 0.99 | 0.99 | | 0.99 | line_total_over_qty | | | 0.5893 | picked_weight_lb | 0.0368 | picked_weight_lb_to_oz | |
| 12/12/2024 | giant | FRESH BANANA | BANANA | 1 | LB | | | | | | weight | 1.37 | 1.37 | | 1.37 | line_total_over_qty | | | 0.5905 | picked_weight_lb | 0.0369 | picked_weight_lb_to_oz | |
| 1/7/2025 | giant | FRESH BANANA | BANANA | 1 | LB | | | | | | weight | 1.44 | 1.44 | | 1.44 | line_total_over_qty | | | 0.5902 | picked_weight_lb | 0.0369 | picked_weight_lb_to_oz | |
| 1/24/2025 | costco | BANANAS 3 LB / 1.36 KG | BANANA | 1 | E | 3 | lb | | 3 | lb | weight | 1.49 | 1.49 | 1.49 | 1.49 | line_total_over_qty | | | 0.4967 | parsed_size_lb | 0.031 | parsed_size_lb_to_oz | 0.4967 |
| 2/16/2025 | giant | FRESH BANANA | BANANA | 2 | LB | | | | | | weight | 2.54 | 1.27 | | 1.27 | line_total_over_qty | | | 0.588 | picked_weight_lb | 0.0367 | picked_weight_lb_to_oz | |
| 2/20/2025 | giant | FRESH BANANA | BANANA | 1 | LB | | | | | | weight | 1.4 | 1.4 | | 1.4 | line_total_over_qty | | | 0.5907 | picked_weight_lb | 0.0369 | picked_weight_lb_to_oz | |
| 6/25/2025 | giant | FRESH BANANA | BANANA | 1 | LB | | | | | | weight | 1.29 | 1.29 | | 1.29 | line_total_over_qty | | | 0.589 | picked_weight_lb | 0.0368 | picked_weight_lb_to_oz | |
| 2/14/2026 | costco | BANANAS 3 LB / 1.36 KG | BANANA | 1 | E | 3 | lb | | 3 | lb | weight | 1.49 | 1.49 | 1.49 | 1.49 | line_total_over_qty | | | 0.4967 | parsed_size_lb | 0.031 | parsed_size_lb_to_oz | 0.4967 |
| 3/12/2026 | costco | BANANAS 3 LB / 1.36 KG | BANANA | 2 | E | 6 | lb | | 3 | lb | weight | 2.98 | 1.49 | 2.98 | 1.49 | line_total_over_qty | | | 0.4967 | parsed_size_lb | 0.031 | parsed_size_lb_to_oz | 0.4967 |
*** beef patty
beef patty by weight not made into effective price
| purchase_date | retailer | normalized_item_name | product_type | qty | unit | normalized_quantity | normalized_quantity_unit | pack_qty | size_value | size_unit | measure_type | line_total | unit_price | matched_discount_amount | net_line_total | store_name | price_per_each | price_per_each_basis | price_per_count | price_per_count_basis | price_per_lb | price_per_lb_basis | price_per_oz | price_per_oz_basis | effective_price |
| 9/9/2023 | costco | BEEF PATTIES 6# BAG | hamburger | 1 | E | 1 | each | | | | each | 26.99 | 26.99 | | 26.99 | MT VERNON | 26.99 | line_total_over_qty | | | | | | | $26.99 |
| 11/26/2025 | giant | PATTIES PK12 | hamburger | 1 | LB | | | | | | weight | 10.05 | 10.05 | | | Giant Food | 10.05 | line_total_over_qty | | | 7.7907 | picked_weight_lb | 0.4869 | picked_weight_lb_to_oz | |
** evidence
- commit: `23dfc3d` `Use picked weight for Giant quantity basis`
- tests: `./venv/bin/python -m unittest tests.test_enrich_giant tests.test_purchases`; `./venv/bin/python normalize_giant_web.py`; `./venv/bin/python build_purchases.py`
- datetime: 2026-03-23 13:22:47 EDT
** notes
- Giant loose-weight rows already had deterministic `picked_weight` and `price_per_lb`; this task reuses that basis when parsed size/pack is absent.
- Parsed package size still wins when present, so fixed-size products keep their original comparison basis and `normalized_item_id` behavior does not change.
* [X] t1.18.3: fix costco normalization quantity carry-through for weight-based items (1-3 commits)
** acceptance criteria
1. add regression tests covering known broken Costco quantity-basis cases before changing parser logic
2. Costco normalization correctly parses explicit weight-bearing package text into normalized quantity fields for known cases such as:
- `25# FLOUR ALL-PURPOSE HARV ...` -> `normalized_quantity=25`, `normalized_quantity_unit=lb`, `measure_type=weight`
3. corrected Costco normalized rows carry through to `data/purchases.csv` without changing `normalized_item_id` behavior
4. `effective_price` for corrected Costco rows uses the same rule already established for Giant:
- use `net_line_total` when present, otherwise `line_total`
- divide by `normalized_quantity` when `normalized_quantity > 0`
- leave blank when no valid denominator exists
5. rerun output verifies the broken Costco flour examples no longer behave like `each` items and now produce non-blank weight-based effective prices
6. keep this task limited to the identified Costco parsing failures; do not broaden into catalog cleanup or fuzzy matching
*** All Purpose Flour
Costco 25# FLOUR not parsed into normalized weight - measure_type says each
| purchase_date | retailer | normalized_item_name | catalog_name | qty | unit | normalized_quantity | normalized_quantity_unit | pack_qty | size_value | size_unit | measure_type | line_total | unit_price | matched_discount_amount | net_line_total | store_name | price_per_each | price_per_each_basis | price_per_count | price_per_count_basis | price_per_lb | price_per_lb_basis | price_per_oz | price_per_oz_basis | effective_price | is_discount_line | is_coupon_line | is_fee | raw_order_path | |
| 9/9/2023 | costco | 10LB BAKERS 4.5KG / 10 LB | all purpose flour | 1 | E | 10 | lb | | 10 | lb | weight | 5.99 | 5.99 | | 5.99 | VA | 5.99 | line_total_over_qty | | | 0.599 | parsed_size_lb | 0.0374 | parsed_size_lb_to_oz | $0.60 | FALSE | FALSE | FALSE | data/costco-web/raw/21111500603752309091647-2023-09-09T16-47-00.json | |
| 8/6/2024 | costco | 10LB BAKERS 4.5KG / 10 LB | all purpose flour | 1 | E | 10 | lb | | 10 | lb | weight | 5.29 | 5.29 | | 5.29 | VA | 5.29 | line_total_over_qty | | | 0.529 | parsed_size_lb | 0.0331 | parsed_size_lb_to_oz | $0.53 | FALSE | FALSE | FALSE | data/costco-web/raw/21111520101732408061704-2024-08-06T17-04-00.json | |
| 11/29/2024 | costco | 25# FLOUR ALL-PURPOSE HARV P98/100 | all purpose flour | 1 | E | 1 | each | | | | each | 8.79 | 8.79 | | 8.79 | VA | 8.79 | line_total_over_qty | | | | | | | $8.79 | FALSE | FALSE | FALSE | data/costco-web/raw/21111500803392411291626-2024-11-29T16-26-00.json | |
| 12/14/2024 | costco | KS ORG FLOUR 2/10 LB P112 | all purpose flour | 1 | E | 20 | lb | 2 | 10 | lb | weight | 17.99 | 17.99 | | 17.99 | VA | 17.99 | line_total_over_qty | 8.995 | line_total_over_pack_qty | 0.8995 | parsed_size_lb | 0.0562 | parsed_size_lb_to_oz | 0.8995 | FALSE | FALSE | FALSE | data/costco-web/raw/21111500301442412141209-2024-12-14T12-09-00.json | |
| 12/14/2024 | costco | 10LB BAKERS 4.5KG / 10 LB | all purpose flour | 1 | E | 10 | lb | | 10 | lb | weight | 5.49 | 5.49 | | 5.49 | VA | 5.49 | line_total_over_qty | | | 0.549 | parsed_size_lb | 0.0343 | parsed_size_lb_to_oz | 0.549 | FALSE | FALSE | FALSE | data/costco-web/raw/21111500301442412141209-2024-12-14T12-09-00.json | |
| 1/10/2025 | costco | 10LB BAKERS 4.5KG / 10 LB | all purpose flour | 1 | E | 10 | lb | | 10 | lb | weight | 5.49 | 5.49 | | 5.49 | VA | 5.49 | line_total_over_qty | | | 0.549 | parsed_size_lb | 0.0343 | parsed_size_lb_to_oz | 0.549 | FALSE | FALSE | FALSE | data/costco-web/raw/21111500702462501101630-2025-01-10T16-30-00.json | |
| 1/10/2025 | costco | KS ORG FLOUR 2/10 LB P112 | all purpose flour | 1 | E | 20 | lb | 2 | 10 | lb | weight | 17.99 | 17.99 | | 17.99 | VA | 17.99 | line_total_over_qty | 8.995 | line_total_over_pack_qty | 0.8995 | parsed_size_lb | 0.0562 | parsed_size_lb_to_oz | 0.8995 | FALSE | FALSE | FALSE | data/costco-web/raw/21111500702462501101630-2025-01-10T16-30-00.json | |
| 1/31/2026 | giant | SB FLOUR ALL PRPSE 5LB | all purpose flour | 1 | EA | 5 | lb | | 5 | lb | weight | 3.39 | 3.39 | | | VA | 3.39 | line_total_over_qty | | | 0.678 | parsed_size_lb | 0.0424 | parsed_size_lb_to_oz | 0.678 | FALSE | FALSE | FALSE | data/giant-web/raw/697f42031c28e23df08d95f9.json | |
| 3/12/2026 | costco | 25# FLOUR ALL-PURPOSE HARV P98/100 | all purpose flour | 1 | E | 1 | each | | | | each | 9.49 | 9.49 | | 9.49 | VA | 9.49 | line_total_over_qty | | | | | | | 9.49 | FALSE | FALSE | FALSE | data/costco-web/raw/21111500804012603121616-2026-03-12T16-16-00.json
| |
** evidence
- commit: `7317611` `Fix Costco hash-size weight parsing`
- tests: `./venv/bin/python -m unittest tests.test_costco_pipeline tests.test_purchases`; `./venv/bin/python normalize_costco_web.py`; `./venv/bin/python build_purchases.py`
- datetime: 2026-03-23 13:56:38 EDT
** notes
- Costco `25#` weight text was falling through to `each` because the hash-size parser missed sizes followed by whitespace.
- This fix is intentionally narrow: explicit `#`-weight parsing now feeds the existing quantity and effective-price flow without changing `normalized_item_id` behavior.
* [X] t1.18.4: clean purchases output and finalize effective price fields (2-4 commits)
make `purchases.csv` easier to inspect and ensure price fields support weighted cost analysis
** acceptance criteria
1. reorder `data/purchases.csv` columns for human inspection, with analysis fields first:
- `purchase_date`
- `retailer`
- `catalog_name`
- `product_type`
- `category`
- `net_line_total`
- `normalized_quantity`
- `effective_price`
- `effective_price_unit`
- followed by order/item/provenance fields
3. populate `net_line_total` for all purchase rows:
- preserve existing net_line_total when already populated;
- otherwise, derive `net_line_total = line_total + matched_discount_amount` when discount exists;
- else `net_line_total = line_total`
4. compute `effective_price` from `net_line_total / normalized_quantity` when `normalized_quantity > 0`
5. add `effective_price_unit` and populate it consistently from the normalized quantity basis
6. preserve blanks rather than writing `0` or divide-by-zero when no valid denominator exists
- pm note: this task is about final purchase output correctness and usability, not review/catalog logic
** evidence
- commit: `a45522c` `Finalize purchase effective price fields`
- tests: `./venv/bin/python -m unittest tests.test_purchases`; `./venv/bin/python build_purchases.py`
- datetime: 2026-03-23 15:27:42 EDT
** notes
- `purchases.csv` now carries a filled `net_line_total` for every row, preserving existing values from normalization and deriving the rest from `line_total` plus matched discounts.
- `effective_price_unit` now mirrors the normalized quantity basis, so downstream analysis can tell whether an `effective_price` is per `lb`, `oz`, `count`, or `each`.
* [X] t1.19: make review_products.py robust to orphaned and incomplete catalog links (2-4 commits)
refresh review state from the current normalized universe so missing or broken links re-enter review instead of silently disappearing
** acceptance criteria
1. `review_products.py` regenerates review candidates from the current normalized item universe, not just previously queued items (/data/<provider>/normalized_items.csv)
2. items are added or re-added to review when:
- they have no valid `catalog_id`
- their linked `catalog_id` no longer exists
- their linked catalog row does not have both "catalog_name" AND "product_type"
3. `review_products.py` compares and reconciles:
- current normalized items
- current product_links
- current catalog
- current review_queue
4. rerunning review after manual cleanup of `product_links.csv` or `catalog.csv` surfaces newly orphaned normalized items
5. unresolved items remain visible and are not silently dropped from review or purchases accounting
- pm note: keep the logic explicit and auditable; this is a refresh/reconciliation task, not a new matching system
** evidence
- commit: `8ccf3ff` `Reconcile review queue against current catalog state`
- tests: `./venv/bin/python -m unittest tests.test_review_workflow tests.test_purchases`; `./venv/bin/python review_products.py --refresh-only`; `./venv/bin/python report_pipeline_status.py`
- datetime: 2026-03-23 15:32:29 EDT
** notes
- `review_products.py` now rebuilds its queue from the current normalized files and order files instead of trusting stale `purchases.csv` state.
- Missing catalog rows and incomplete catalog rows now re-enter review explicitly as `orphaned_catalog_link` or `incomplete_catalog_link`, and excluded rows no longer inflate unresolved-not-in-review accounting.
* [ ] t1.20: add visit-level fields and outputs for spend analysis (2-4 commits)
ensure purchases retains enough visit/order context to support spend-by-visit and store-level analysis
** acceptance criteria
1. `data/purchases.csv` retains or adds the visit/order fields needed for visit analysis:
- `order_id`
- `purchase_date`
- `store_name`
- `store_number`
- `store_city`
- `store_state`
- `retailer`
2. purchases output supports these analyses without additional joins:
- spend by visit
- items per visit
- category spend by visit
- retailer/store breakdown
3. documentation or task notes make clear that `purchases.csv` is the primary analysis artifact for both item-level and visit-level reporting
- pm note: do not build dash/plotly here; this task is only about carrying the right data through - pm note: do not build dash/plotly here; this task is only about carrying the right data through
** evidence ** evidence
@@ -905,6 +1047,20 @@ ensure giant normalization emits usable normalized quantity for known weight-bas
- tests: - tests:
- datetime: - datetime:
** notes
* [ ] t1.21: add lightweight charting/analysis surface on top of purchases.csv (2-4 commits)
build a minimal analysis layer for common price and visit charts without changing the csv pipeline
** acceptance criteria
1. support charting of:
- item price over time
- spend by visit
- items per visit
- category spend over time
- retailer/store comparison
2. use `data/purchases.csv` as the source of truth
3. keep excel/pivot compatibility intact
- pm note: thin reader layer only; do not move business logic out of the pipeline - pm note: thin reader layer only; do not move business logic out of the pipeline
* [ ] t1.10: add optional llm-assisted suggestion workflow for unresolved normalized retailer items (2-4 commits) * [ ] t1.10: add optional llm-assisted suggestion workflow for unresolved normalized retailer items (2-4 commits)

View File

@@ -27,9 +27,11 @@ def build_status_summary(
costco_enriched, costco_enriched,
purchases, purchases,
resolutions, resolutions,
links,
catalog,
): ):
normalized_rows = giant_enriched + costco_enriched normalized_rows = giant_enriched + costco_enriched
queue_rows = review_products.build_review_queue(purchases, resolutions) queue_rows = review_products.build_review_queue(purchases, resolutions, links, catalog, [])
queue_ids = {row["normalized_item_id"] for row in queue_rows} queue_ids = {row["normalized_item_id"] for row in queue_rows}
unresolved_purchase_rows = [ unresolved_purchase_rows = [
@@ -37,6 +39,7 @@ def build_status_summary(
for row in purchases for row in purchases
if row.get("normalized_item_id") if row.get("normalized_item_id")
and not row.get("catalog_id") and not row.get("catalog_id")
and row.get("resolution_action") != "exclude"
and row.get("is_fee") != "true" and row.get("is_fee") != "true"
and row.get("is_discount_line") != "true" and row.get("is_discount_line") != "true"
and row.get("is_coupon_line") != "true" and row.get("is_coupon_line") != "true"
@@ -84,6 +87,8 @@ def build_status_summary(
@click.option("--costco-enriched-csv", default="data/costco-web/normalized_items.csv", show_default=True) @click.option("--costco-enriched-csv", default="data/costco-web/normalized_items.csv", show_default=True)
@click.option("--purchases-csv", default="data/review/purchases.csv", show_default=True) @click.option("--purchases-csv", default="data/review/purchases.csv", show_default=True)
@click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True) @click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True)
@click.option("--links-csv", default="data/review/product_links.csv", show_default=True)
@click.option("--catalog-csv", default="data/catalog.csv", show_default=True)
@click.option("--summary-csv", default="data/review/pipeline_status.csv", show_default=True) @click.option("--summary-csv", default="data/review/pipeline_status.csv", show_default=True)
@click.option("--summary-json", default="data/review/pipeline_status.json", show_default=True) @click.option("--summary-json", default="data/review/pipeline_status.json", show_default=True)
def main( def main(
@@ -95,6 +100,8 @@ def main(
costco_enriched_csv, costco_enriched_csv,
purchases_csv, purchases_csv,
resolutions_csv, resolutions_csv,
links_csv,
catalog_csv,
summary_csv, summary_csv,
summary_json, summary_json,
): ):
@@ -107,6 +114,8 @@ def main(
read_rows_if_exists(costco_enriched_csv), read_rows_if_exists(costco_enriched_csv),
read_rows_if_exists(purchases_csv), read_rows_if_exists(purchases_csv),
[build_purchases.normalize_resolution_row(row) for row in read_rows_if_exists(resolutions_csv)], [build_purchases.normalize_resolution_row(row) for row in read_rows_if_exists(resolutions_csv)],
[build_purchases.normalize_link_row(row) for row in read_rows_if_exists(links_csv)],
[build_purchases.normalize_catalog_row(row) for row in read_rows_if_exists(catalog_csv)],
) )
write_csv_rows(summary_csv, summary_rows, SUMMARY_FIELDS) write_csv_rows(summary_csv, summary_rows, SUMMARY_FIELDS)
summary_json_path = Path(summary_json) summary_json_path = Path(summary_json)

View File

@@ -31,6 +31,7 @@ INFO_COLOR = "cyan"
PROMPT_COLOR = "bright_yellow" PROMPT_COLOR = "bright_yellow"
WARNING_COLOR = "magenta" WARNING_COLOR = "magenta"
TOKEN_RE = re.compile(r"[A-Z0-9]+") TOKEN_RE = re.compile(r"[A-Z0-9]+")
REQUIRED_CATALOG_FIELDS = ("catalog_name", "product_type")
def print_intro_text(): def print_intro_text():
@@ -40,9 +41,37 @@ def print_intro_text():
click.echo(" category: broad analysis bucket such as dairy, produce, or frozen") click.echo(" category: broad analysis bucket such as dairy, produce, or frozen")
def build_review_queue(purchase_rows, resolution_rows): def has_complete_catalog_row(catalog_row):
if not catalog_row:
return False
return all(catalog_row.get(field, "").strip() for field in REQUIRED_CATALOG_FIELDS)
def load_queue_lookup(queue_rows):
lookup = {}
for row in queue_rows:
normalized_item_id = row.get("normalized_item_id", "")
if normalized_item_id:
lookup[normalized_item_id] = row
return lookup
def build_review_queue(
purchase_rows,
resolution_rows,
link_rows=None,
catalog_rows=None,
existing_queue_rows=None,
):
by_normalized = defaultdict(list) by_normalized = defaultdict(list)
resolution_lookup = build_purchases.load_resolution_lookup(resolution_rows) resolution_lookup = build_purchases.load_resolution_lookup(resolution_rows)
link_lookup = build_purchases.load_link_lookup(link_rows or [])
catalog_lookup = {
row.get("catalog_id", ""): build_purchases.normalize_catalog_row(row)
for row in (catalog_rows or [])
if row.get("catalog_id", "")
}
queue_lookup = load_queue_lookup(existing_queue_rows or [])
for row in purchase_rows: for row in purchase_rows:
normalized_item_id = row.get("normalized_item_id", "") normalized_item_id = row.get("normalized_item_id", "")
@@ -54,30 +83,40 @@ def build_review_queue(purchase_rows, resolution_rows):
queue_rows = [] queue_rows = []
for normalized_item_id, rows in sorted(by_normalized.items()): for normalized_item_id, rows in sorted(by_normalized.items()):
current_resolution = resolution_lookup.get(normalized_item_id, {}) current_resolution = resolution_lookup.get(normalized_item_id, {})
if current_resolution.get("status") == "approved": if current_resolution.get("status") == "approved" and current_resolution.get("resolution_action") == "exclude":
continue continue
existing_queue_row = queue_lookup.get(normalized_item_id, {})
linked_catalog_id = current_resolution.get("catalog_id") or link_lookup.get(normalized_item_id, {}).get("catalog_id", "")
linked_catalog_row = catalog_lookup.get(linked_catalog_id, {})
has_valid_catalog_link = bool(linked_catalog_id and has_complete_catalog_row(linked_catalog_row))
unresolved_rows = [ unresolved_rows = [
row row
for row in rows for row in rows
if not row.get("catalog_id") if row.get("is_item", "true") != "false"
and row.get("is_item", "true") != "false"
and row.get("is_fee") != "true" and row.get("is_fee") != "true"
and row.get("is_discount_line") != "true" and row.get("is_discount_line") != "true"
and row.get("is_coupon_line") != "true" and row.get("is_coupon_line") != "true"
] ]
if not unresolved_rows: if not unresolved_rows or has_valid_catalog_link:
continue continue
retailers = sorted({row["retailer"] for row in rows}) retailers = sorted({row["retailer"] for row in rows})
review_id = stable_id("rvw", normalized_item_id) review_id = stable_id("rvw", normalized_item_id)
reason_code = "missing_catalog_link"
if linked_catalog_id and linked_catalog_id not in catalog_lookup:
reason_code = "orphaned_catalog_link"
elif linked_catalog_id and not has_complete_catalog_row(linked_catalog_row):
reason_code = "incomplete_catalog_link"
queue_rows.append( queue_rows.append(
{ {
"review_id": review_id, "review_id": review_id,
"retailer": " | ".join(retailers), "retailer": " | ".join(retailers),
"normalized_item_id": normalized_item_id, "normalized_item_id": normalized_item_id,
"catalog_id": current_resolution.get("catalog_id", ""), "catalog_id": linked_catalog_id,
"reason_code": "missing_catalog_link", "reason_code": reason_code,
"priority": "high", "priority": "high",
"raw_item_names": compact_join( "raw_item_names": compact_join(
sorted({row["raw_item_name"] for row in rows if row["raw_item_name"]}), sorted({row["raw_item_name"] for row in rows if row["raw_item_name"]}),
@@ -102,10 +141,13 @@ def build_review_queue(purchase_rows, resolution_rows):
limit=8, limit=8,
), ),
"seen_count": str(len(rows)), "seen_count": str(len(rows)),
"status": current_resolution.get("status", "pending"), "status": existing_queue_row.get("status") or current_resolution.get("status", "pending"),
"resolution_action": current_resolution.get("resolution_action", ""), "resolution_action": existing_queue_row.get("resolution_action")
"resolution_notes": current_resolution.get("resolution_notes", ""), or current_resolution.get("resolution_action", ""),
"created_at": current_resolution.get("reviewed_at", today_text), "resolution_notes": existing_queue_row.get("resolution_notes")
or current_resolution.get("resolution_notes", ""),
"created_at": existing_queue_row.get("created_at")
or current_resolution.get("reviewed_at", today_text),
"updated_at": today_text, "updated_at": today_text,
} }
) )
@@ -516,6 +558,10 @@ def link_rows_from_state(link_lookup):
@click.command() @click.command()
@click.option("--giant-items-enriched-csv", default="data/giant-web/normalized_items.csv", show_default=True)
@click.option("--costco-items-enriched-csv", default="data/costco-web/normalized_items.csv", show_default=True)
@click.option("--giant-orders-csv", default="data/giant-web/collected_orders.csv", show_default=True)
@click.option("--costco-orders-csv", default="data/costco-web/collected_orders.csv", show_default=True)
@click.option("--purchases-csv", default="data/review/purchases.csv", show_default=True) @click.option("--purchases-csv", default="data/review/purchases.csv", show_default=True)
@click.option("--queue-csv", default="data/review/review_queue.csv", show_default=True) @click.option("--queue-csv", default="data/review/review_queue.csv", show_default=True)
@click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True) @click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True)
@@ -523,12 +569,40 @@ def link_rows_from_state(link_lookup):
@click.option("--links-csv", default="data/review/product_links.csv", show_default=True) @click.option("--links-csv", default="data/review/product_links.csv", show_default=True)
@click.option("--limit", default=0, show_default=True, type=int) @click.option("--limit", default=0, show_default=True, type=int)
@click.option("--refresh-only", is_flag=True, help="Only rebuild review_queue.csv without prompting.") @click.option("--refresh-only", is_flag=True, help="Only rebuild review_queue.csv without prompting.")
def main(purchases_csv, queue_csv, resolutions_csv, catalog_csv, links_csv, limit, refresh_only): def main(
purchase_rows = build_purchases.read_optional_csv_rows(purchases_csv) giant_items_enriched_csv,
costco_items_enriched_csv,
giant_orders_csv,
costco_orders_csv,
purchases_csv,
queue_csv,
resolutions_csv,
catalog_csv,
links_csv,
limit,
refresh_only,
):
resolution_rows = build_purchases.read_optional_csv_rows(resolutions_csv) resolution_rows = build_purchases.read_optional_csv_rows(resolutions_csv)
catalog_rows = build_purchases.merge_catalog_rows(build_purchases.read_optional_csv_rows(catalog_csv), []) catalog_rows = build_purchases.merge_catalog_rows(build_purchases.read_optional_csv_rows(catalog_csv), [])
link_lookup = build_purchases.load_link_lookup(build_purchases.read_optional_csv_rows(links_csv)) link_rows = build_purchases.read_optional_csv_rows(links_csv)
queue_rows = build_review_queue(purchase_rows, resolution_rows) purchase_rows, refreshed_link_rows = build_purchases.build_purchase_rows(
build_purchases.read_optional_csv_rows(giant_items_enriched_csv),
build_purchases.read_optional_csv_rows(costco_items_enriched_csv),
build_purchases.read_optional_csv_rows(giant_orders_csv),
build_purchases.read_optional_csv_rows(costco_orders_csv),
resolution_rows,
link_rows,
catalog_rows,
)
build_purchases.write_csv_rows(purchases_csv, purchase_rows, build_purchases.PURCHASE_FIELDS)
link_lookup = build_purchases.load_link_lookup(refreshed_link_rows)
queue_rows = build_review_queue(
purchase_rows,
resolution_rows,
refreshed_link_rows,
catalog_rows,
build_purchases.read_optional_csv_rows(queue_csv),
)
write_csv_rows(queue_csv, queue_rows, QUEUE_FIELDS) write_csv_rows(queue_csv, queue_rows, QUEUE_FIELDS)
click.echo(f"wrote {len(queue_rows)} rows to {queue_csv}") click.echo(f"wrote {len(queue_rows)} rows to {queue_csv}")

View File

@@ -346,6 +346,32 @@ class CostcoPipelineTests(unittest.TestCase):
) )
self.assertEqual("LIFE 6'TABLE MDL", logistics["item_name_norm"]) self.assertEqual("LIFE 6'TABLE MDL", logistics["item_name_norm"])
def test_costco_hash_weight_parses_into_weight_basis(self):
row = enrich_costco.parse_costco_item(
order_id="abc",
order_date="2024-11-29",
raw_path=Path("costco_output/raw/abc.json"),
line_no=4,
item={
"itemNumber": "999",
"itemDescription01": "25# FLOUR ALL-PURPOSE HARV P98/100",
"itemDescription02": None,
"itemDepartmentNumber": 14,
"transDepartmentNumber": 14,
"unit": 1,
"itemIdentifier": "E",
"amount": 8.79,
"itemUnitPriceAmount": 8.79,
},
)
self.assertEqual("FLOUR ALL-PURPOSE HARV", row["item_name_norm"])
self.assertEqual("25", row["size_value"])
self.assertEqual("lb", row["size_unit"])
self.assertEqual("weight", row["measure_type"])
self.assertEqual("25", row["normalized_quantity"])
self.assertEqual("lb", row["normalized_quantity_unit"])
self.assertEqual("0.3516", row["price_per_lb"])
def test_build_items_enriched_matches_discount_to_item(self): def test_build_items_enriched_matches_discount_to_item(self):
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
raw_dir = Path(tmpdir) / "raw" raw_dir = Path(tmpdir) / "raw"

View File

@@ -129,6 +129,63 @@ class EnrichGiantTests(unittest.TestCase):
("2", "each"), ("2", "each"),
enrich_giant.derive_normalized_quantity("2", "", "", "", "each"), enrich_giant.derive_normalized_quantity("2", "", "", "", "each"),
) )
self.assertEqual(
("1.68", "lb"),
enrich_giant.derive_normalized_quantity("1", "", "", "", "weight", "1.68"),
)
def test_parse_item_uses_picked_weight_for_loose_weight_items(self):
banana = enrich_giant.parse_item(
order_id="abc123",
order_date="2026-03-01",
raw_path=Path("raw/abc123.json"),
line_no=1,
item={
"podId": 1,
"shipQy": 1,
"totalPickedWeight": 1.68,
"unitPrice": 0.99,
"itemName": "FRESH BANANA",
"lbEachCd": "LB",
"groceryAmount": 0.99,
"primUpcCd": "111",
"mvpSavings": 0,
"rewardSavings": 0,
"couponSavings": 0,
"couponPrice": 0,
"categoryId": "1",
"categoryDesc": "Grocery",
},
)
self.assertEqual("weight", banana["measure_type"])
self.assertEqual("1.68", banana["normalized_quantity"])
self.assertEqual("lb", banana["normalized_quantity_unit"])
patty = enrich_giant.parse_item(
order_id="abc123",
order_date="2026-03-01",
raw_path=Path("raw/abc123.json"),
line_no=2,
item={
"podId": 2,
"shipQy": 1,
"totalPickedWeight": 1.29,
"unitPrice": 10.05,
"itemName": "80% PATTIES PK12",
"lbEachCd": "LB",
"groceryAmount": 10.05,
"primUpcCd": "222",
"mvpSavings": 0,
"rewardSavings": 0,
"couponSavings": 0,
"couponPrice": 0,
"categoryId": "1",
"categoryDesc": "Grocery",
},
)
self.assertEqual("1.29", patty["normalized_quantity"])
self.assertEqual("lb", patty["normalized_quantity_unit"])
def test_build_items_enriched_reads_raw_order_files_and_writes_csv(self): def test_build_items_enriched_reads_raw_order_files_and_writes_csv(self):
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:

View File

@@ -8,6 +8,11 @@ import enrich_costco
class PurchaseLogTests(unittest.TestCase): class PurchaseLogTests(unittest.TestCase):
def test_derive_net_line_total_preserves_existing_then_derives(self):
self.assertEqual("1.49", build_purchases.derive_net_line_total({"net_line_total": "1.49", "line_total": "2.98"}))
self.assertEqual("5.99", build_purchases.derive_net_line_total({"line_total": "6.99", "matched_discount_amount": "-1.00"}))
self.assertEqual("3.5", build_purchases.derive_net_line_total({"line_total": "3.50"}))
def test_derive_metrics_prefers_picked_weight_and_pack_count(self): def test_derive_metrics_prefers_picked_weight_and_pack_count(self):
metrics = build_purchases.derive_metrics( metrics = build_purchases.derive_metrics(
{ {
@@ -161,6 +166,7 @@ class PurchaseLogTests(unittest.TestCase):
self.assertEqual("https://example.test/banana.jpg", rows[0]["image_url"]) self.assertEqual("https://example.test/banana.jpg", rows[0]["image_url"])
self.assertEqual("1", rows[0]["normalized_quantity"]) self.assertEqual("1", rows[0]["normalized_quantity"])
self.assertEqual("lb", rows[0]["normalized_quantity_unit"]) self.assertEqual("lb", rows[0]["normalized_quantity_unit"])
self.assertEqual("lb", rows[0]["effective_price_unit"])
def test_main_writes_purchase_and_example_csvs(self): def test_main_writes_purchase_and_example_csvs(self):
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
@@ -418,6 +424,206 @@ class PurchaseLogTests(unittest.TestCase):
self.assertEqual("1", rows[0]["normalized_quantity"]) self.assertEqual("1", rows[0]["normalized_quantity"])
self.assertEqual("each", rows[0]["normalized_quantity_unit"]) self.assertEqual("each", rows[0]["normalized_quantity_unit"])
def test_build_purchase_rows_derives_effective_price_for_known_cases(self):
fieldnames = enrich_costco.OUTPUT_FIELDS
def base_row():
return {field: "" for field in fieldnames}
giant_banana = base_row()
giant_banana.update(
{
"retailer": "giant",
"order_id": "g1",
"line_no": "1",
"normalized_row_id": "giant:g1:1",
"normalized_item_id": "gnorm:banana",
"order_date": "2026-03-01",
"item_name": "FRESH BANANA",
"item_name_norm": "BANANA",
"retailer_item_id": "100",
"qty": "1",
"unit": "LB",
"normalized_quantity": "1.68",
"normalized_quantity_unit": "lb",
"line_total": "0.99",
"unit_price": "0.99",
"measure_type": "weight",
"price_per_lb": "0.5893",
"raw_order_path": "data/giant-web/raw/g1.json",
"is_discount_line": "false",
"is_coupon_line": "false",
"is_fee": "false",
}
)
costco_banana = base_row()
costco_banana.update(
{
"retailer": "costco",
"order_id": "c1",
"line_no": "1",
"normalized_row_id": "costco:c1:1",
"normalized_item_id": "cnorm:banana",
"order_date": "2026-03-12",
"item_name": "BANANAS 3 LB / 1.36 KG",
"item_name_norm": "BANANA",
"retailer_item_id": "30669",
"qty": "1",
"unit": "E",
"normalized_quantity": "3",
"normalized_quantity_unit": "lb",
"line_total": "2.98",
"net_line_total": "1.49",
"unit_price": "2.98",
"size_value": "3",
"size_unit": "lb",
"measure_type": "weight",
"price_per_lb": "0.4967",
"raw_order_path": "data/costco-web/raw/c1.json",
"is_discount_line": "false",
"is_coupon_line": "false",
"is_fee": "false",
}
)
giant_ice = base_row()
giant_ice.update(
{
"retailer": "giant",
"order_id": "g2",
"line_no": "1",
"normalized_row_id": "giant:g2:1",
"normalized_item_id": "gnorm:ice",
"order_date": "2026-03-02",
"item_name": "SB BAGGED ICE 20LB",
"item_name_norm": "BAGGED ICE",
"retailer_item_id": "101",
"qty": "2",
"unit": "EA",
"normalized_quantity": "40",
"normalized_quantity_unit": "lb",
"line_total": "9.98",
"unit_price": "4.99",
"size_value": "20",
"size_unit": "lb",
"measure_type": "weight",
"price_per_lb": "0.2495",
"raw_order_path": "data/giant-web/raw/g2.json",
"is_discount_line": "false",
"is_coupon_line": "false",
"is_fee": "false",
}
)
costco_patty = base_row()
costco_patty.update(
{
"retailer": "costco",
"order_id": "c2",
"line_no": "1",
"normalized_row_id": "costco:c2:1",
"normalized_item_id": "cnorm:patty",
"order_date": "2026-03-03",
"item_name": "BEEF PATTIES 6# BAG",
"item_name_norm": "BEEF PATTIES 6# BAG",
"retailer_item_id": "777",
"qty": "1",
"unit": "E",
"normalized_quantity": "1",
"normalized_quantity_unit": "each",
"line_total": "26.99",
"net_line_total": "26.99",
"unit_price": "26.99",
"measure_type": "each",
"raw_order_path": "data/costco-web/raw/c2.json",
"is_discount_line": "false",
"is_coupon_line": "false",
"is_fee": "false",
}
)
giant_patty = base_row()
giant_patty.update(
{
"retailer": "giant",
"order_id": "g3",
"line_no": "1",
"normalized_row_id": "giant:g3:1",
"normalized_item_id": "gnorm:patty",
"order_date": "2026-03-04",
"item_name": "80% PATTIES PK12",
"item_name_norm": "80% PATTIES PK12",
"retailer_item_id": "102",
"qty": "1",
"unit": "LB",
"normalized_quantity": "",
"normalized_quantity_unit": "",
"line_total": "10.05",
"unit_price": "10.05",
"measure_type": "weight",
"price_per_lb": "7.7907",
"raw_order_path": "data/giant-web/raw/g3.json",
"is_discount_line": "false",
"is_coupon_line": "false",
"is_fee": "false",
}
)
rows, _links = build_purchases.build_purchase_rows(
[giant_banana, giant_ice, giant_patty],
[costco_banana, costco_patty],
[],
[],
[],
[],
[],
)
rows_by_item = {row["normalized_item_id"]: row for row in rows}
self.assertEqual("0.5893", rows_by_item["gnorm:banana"]["effective_price"])
self.assertEqual("lb", rows_by_item["gnorm:banana"]["effective_price_unit"])
self.assertEqual("0.4967", rows_by_item["cnorm:banana"]["effective_price"])
self.assertEqual("lb", rows_by_item["cnorm:banana"]["effective_price_unit"])
self.assertEqual("0.2495", rows_by_item["gnorm:ice"]["effective_price"])
self.assertEqual("lb", rows_by_item["gnorm:ice"]["effective_price_unit"])
self.assertEqual("26.99", rows_by_item["cnorm:patty"]["effective_price"])
self.assertEqual("each", rows_by_item["cnorm:patty"]["effective_price_unit"])
self.assertEqual("", rows_by_item["gnorm:patty"]["effective_price"])
self.assertEqual("", rows_by_item["gnorm:patty"]["effective_price_unit"])
def test_build_purchase_rows_leaves_effective_price_blank_without_valid_denominator(self):
fieldnames = enrich_costco.OUTPUT_FIELDS
row = {field: "" for field in fieldnames}
row.update(
{
"retailer": "giant",
"order_id": "g1",
"line_no": "1",
"normalized_row_id": "giant:g1:1",
"normalized_item_id": "gnorm:blank",
"order_date": "2026-03-01",
"item_name": "MYSTERY ITEM",
"item_name_norm": "MYSTERY ITEM",
"retailer_item_id": "100",
"qty": "1",
"unit": "EA",
"normalized_quantity": "0",
"normalized_quantity_unit": "each",
"line_total": "3.50",
"unit_price": "3.50",
"measure_type": "each",
"raw_order_path": "data/giant-web/raw/g1.json",
"is_discount_line": "false",
"is_coupon_line": "false",
"is_fee": "false",
}
)
rows, _links = build_purchases.build_purchase_rows([row], [], [], [], [], [], [])
self.assertEqual("", rows[0]["effective_price"])
self.assertEqual("", rows[0]["effective_price_unit"])
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()

View File

@@ -6,9 +6,94 @@ from unittest import mock
from click.testing import CliRunner from click.testing import CliRunner
import enrich_costco
import review_products import review_products
def write_review_source_files(tmpdir, rows):
giant_items_csv = Path(tmpdir) / "giant_items.csv"
costco_items_csv = Path(tmpdir) / "costco_items.csv"
giant_orders_csv = Path(tmpdir) / "giant_orders.csv"
costco_orders_csv = Path(tmpdir) / "costco_orders.csv"
fieldnames = enrich_costco.OUTPUT_FIELDS
grouped_rows = {"giant": [], "costco": []}
grouped_orders = {"giant": {}, "costco": {}}
for index, row in enumerate(rows, start=1):
retailer = row.get("retailer", "giant")
normalized_row = {field: "" for field in fieldnames}
normalized_row.update(
{
"retailer": retailer,
"order_id": row.get("order_id", f"{retailer[0]}{index}"),
"line_no": row.get("line_no", str(index)),
"normalized_row_id": row.get(
"normalized_row_id",
f"{retailer}:{row.get('order_id', f'{retailer[0]}{index}')}:{row.get('line_no', str(index))}",
),
"normalized_item_id": row.get("normalized_item_id", ""),
"order_date": row.get("purchase_date", ""),
"item_name": row.get("raw_item_name", ""),
"item_name_norm": row.get("normalized_item_name", ""),
"image_url": row.get("image_url", ""),
"upc": row.get("upc", ""),
"line_total": row.get("line_total", ""),
"net_line_total": row.get("net_line_total", ""),
"matched_discount_amount": row.get("matched_discount_amount", ""),
"qty": row.get("qty", "1"),
"unit": row.get("unit", "EA"),
"normalized_quantity": row.get("normalized_quantity", ""),
"normalized_quantity_unit": row.get("normalized_quantity_unit", ""),
"size_value": row.get("size_value", ""),
"size_unit": row.get("size_unit", ""),
"pack_qty": row.get("pack_qty", ""),
"measure_type": row.get("measure_type", "each"),
"retailer_item_id": row.get("retailer_item_id", ""),
"price_per_each": row.get("price_per_each", ""),
"price_per_lb": row.get("price_per_lb", ""),
"price_per_oz": row.get("price_per_oz", ""),
"is_discount_line": row.get("is_discount_line", "false"),
"is_coupon_line": row.get("is_coupon_line", "false"),
"is_fee": row.get("is_fee", "false"),
"raw_order_path": row.get("raw_order_path", ""),
}
)
grouped_rows[retailer].append(normalized_row)
order_id = normalized_row["order_id"]
grouped_orders[retailer].setdefault(
order_id,
{
"order_id": order_id,
"store_name": row.get("store_name", ""),
"store_number": row.get("store_number", ""),
"store_city": row.get("store_city", ""),
"store_state": row.get("store_state", ""),
},
)
for path, source_rows in [
(giant_items_csv, grouped_rows["giant"]),
(costco_items_csv, grouped_rows["costco"]),
]:
with path.open("w", newline="", encoding="utf-8") as handle:
writer = csv.DictWriter(handle, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(source_rows)
order_fields = ["order_id", "store_name", "store_number", "store_city", "store_state"]
for path, source_rows in [
(giant_orders_csv, grouped_orders["giant"].values()),
(costco_orders_csv, grouped_orders["costco"].values()),
]:
with path.open("w", newline="", encoding="utf-8") as handle:
writer = csv.DictWriter(handle, fieldnames=order_fields)
writer.writeheader()
writer.writerows(source_rows)
return giant_items_csv, costco_items_csv, giant_orders_csv, costco_orders_csv
class ReviewWorkflowTests(unittest.TestCase): class ReviewWorkflowTests(unittest.TestCase):
def test_build_review_queue_groups_unresolved_purchases(self): def test_build_review_queue_groups_unresolved_purchases(self):
queue_rows = review_products.build_review_queue( queue_rows = review_products.build_review_queue(
@@ -114,66 +199,47 @@ class ReviewWorkflowTests(unittest.TestCase):
resolutions_csv = Path(tmpdir) / "review_resolutions.csv" resolutions_csv = Path(tmpdir) / "review_resolutions.csv"
catalog_csv = Path(tmpdir) / "catalog.csv" catalog_csv = Path(tmpdir) / "catalog.csv"
links_csv = Path(tmpdir) / "product_links.csv" links_csv = Path(tmpdir) / "product_links.csv"
giant_items_csv, costco_items_csv, giant_orders_csv, costco_orders_csv = write_review_source_files(
purchase_fields = [ tmpdir,
"purchase_date", [
"retailer", {
"order_id", "purchase_date": "2026-03-14",
"line_no", "retailer": "costco",
"normalized_item_id", "order_id": "c2",
"catalog_id", "line_no": "2",
"raw_item_name", "normalized_item_id": "cnorm_mix",
"normalized_item_name", "raw_item_name": "MIXED PEPPER 6-PACK",
"image_url", "normalized_item_name": "MIXED PEPPER",
"upc", "image_url": "",
"line_total", "upc": "",
] "line_total": "7.49",
with purchases_csv.open("w", newline="", encoding="utf-8") as handle: },
writer = csv.DictWriter(handle, fieldnames=purchase_fields) {
writer.writeheader() "purchase_date": "2026-03-12",
writer.writerows( "retailer": "costco",
[ "order_id": "c1",
{ "line_no": "1",
"purchase_date": "2026-03-14", "normalized_item_id": "cnorm_mix",
"retailer": "costco", "raw_item_name": "MIXED PEPPER 6-PACK",
"order_id": "c2", "normalized_item_name": "MIXED PEPPER",
"line_no": "2", "image_url": "https://example.test/mixed-pepper.jpg",
"normalized_item_id": "cnorm_mix", "upc": "",
"catalog_id": "", "line_total": "6.99",
"raw_item_name": "MIXED PEPPER 6-PACK", },
"normalized_item_name": "MIXED PEPPER", {
"image_url": "", "purchase_date": "2026-03-10",
"upc": "", "retailer": "giant",
"line_total": "7.49", "order_id": "g1",
}, "line_no": "1",
{ "normalized_item_id": "gnorm_mix",
"purchase_date": "2026-03-12", "raw_item_name": "MIXED PEPPER",
"retailer": "costco", "normalized_item_name": "MIXED PEPPER",
"order_id": "c1", "image_url": "",
"line_no": "1", "upc": "",
"normalized_item_id": "cnorm_mix", "line_total": "5.99",
"catalog_id": "", },
"raw_item_name": "MIXED PEPPER 6-PACK", ],
"normalized_item_name": "MIXED PEPPER", )
"image_url": "https://example.test/mixed-pepper.jpg",
"upc": "",
"line_total": "6.99",
},
{
"purchase_date": "2026-03-10",
"retailer": "giant",
"order_id": "g1",
"line_no": "1",
"normalized_item_id": "gnorm_mix",
"catalog_id": "cat_mix",
"raw_item_name": "MIXED PEPPER",
"normalized_item_name": "MIXED PEPPER",
"image_url": "",
"upc": "",
"line_total": "5.99",
},
]
)
with catalog_csv.open("w", newline="", encoding="utf-8") as handle: with catalog_csv.open("w", newline="", encoding="utf-8") as handle:
writer = csv.DictWriter(handle, fieldnames=review_products.build_purchases.CATALOG_FIELDS) writer = csv.DictWriter(handle, fieldnames=review_products.build_purchases.CATALOG_FIELDS)
@@ -195,11 +261,34 @@ class ReviewWorkflowTests(unittest.TestCase):
"updated_at": "", "updated_at": "",
} }
) )
with links_csv.open("w", newline="", encoding="utf-8") as handle:
writer = csv.DictWriter(handle, fieldnames=review_products.build_purchases.PRODUCT_LINK_FIELDS)
writer.writeheader()
writer.writerow(
{
"normalized_item_id": "gnorm_mix",
"catalog_id": "cat_mix",
"link_method": "manual_link",
"link_confidence": "high",
"review_status": "approved",
"reviewed_by": "",
"reviewed_at": "",
"link_notes": "",
}
)
runner = CliRunner() runner = CliRunner()
result = runner.invoke( result = runner.invoke(
review_products.main, review_products.main,
[ [
"--giant-items-enriched-csv",
str(giant_items_csv),
"--costco-items-enriched-csv",
str(costco_items_csv),
"--giant-orders-csv",
str(giant_orders_csv),
"--costco-orders-csv",
str(costco_orders_csv),
"--purchases-csv", "--purchases-csv",
str(purchases_csv), str(purchases_csv),
"--queue-csv", "--queue-csv",
@@ -234,40 +323,23 @@ class ReviewWorkflowTests(unittest.TestCase):
resolutions_csv = Path(tmpdir) / "review_resolutions.csv" resolutions_csv = Path(tmpdir) / "review_resolutions.csv"
catalog_csv = Path(tmpdir) / "catalog.csv" catalog_csv = Path(tmpdir) / "catalog.csv"
links_csv = Path(tmpdir) / "product_links.csv" links_csv = Path(tmpdir) / "product_links.csv"
giant_items_csv, costco_items_csv, giant_orders_csv, costco_orders_csv = write_review_source_files(
with purchases_csv.open("w", newline="", encoding="utf-8") as handle: tmpdir,
writer = csv.DictWriter( [
handle,
fieldnames=[
"purchase_date",
"retailer",
"order_id",
"line_no",
"normalized_item_id",
"catalog_id",
"raw_item_name",
"normalized_item_name",
"image_url",
"upc",
"line_total",
],
)
writer.writeheader()
writer.writerow(
{ {
"purchase_date": "2026-03-14", "purchase_date": "2026-03-14",
"retailer": "giant", "retailer": "giant",
"order_id": "g1", "order_id": "g1",
"line_no": "1", "line_no": "1",
"normalized_item_id": "gnorm_ice", "normalized_item_id": "gnorm_ice",
"catalog_id": "",
"raw_item_name": "SB BAGGED ICE 20LB", "raw_item_name": "SB BAGGED ICE 20LB",
"normalized_item_name": "BAGGED ICE", "normalized_item_name": "BAGGED ICE",
"image_url": "", "image_url": "",
"upc": "", "upc": "",
"line_total": "3.50", "line_total": "3.50",
} }
) ],
)
with catalog_csv.open("w", newline="", encoding="utf-8") as handle: with catalog_csv.open("w", newline="", encoding="utf-8") as handle:
writer = csv.DictWriter(handle, fieldnames=review_products.build_purchases.CATALOG_FIELDS) writer = csv.DictWriter(handle, fieldnames=review_products.build_purchases.CATALOG_FIELDS)
@@ -276,6 +348,14 @@ class ReviewWorkflowTests(unittest.TestCase):
result = CliRunner().invoke( result = CliRunner().invoke(
review_products.main, review_products.main,
[ [
"--giant-items-enriched-csv",
str(giant_items_csv),
"--costco-items-enriched-csv",
str(costco_items_csv),
"--giant-orders-csv",
str(giant_orders_csv),
"--costco-orders-csv",
str(costco_orders_csv),
"--purchases-csv", "--purchases-csv",
str(purchases_csv), str(purchases_csv),
"--queue-csv", "--queue-csv",
@@ -301,68 +381,47 @@ class ReviewWorkflowTests(unittest.TestCase):
resolutions_csv = Path(tmpdir) / "review_resolutions.csv" resolutions_csv = Path(tmpdir) / "review_resolutions.csv"
catalog_csv = Path(tmpdir) / "catalog.csv" catalog_csv = Path(tmpdir) / "catalog.csv"
links_csv = Path(tmpdir) / "product_links.csv" links_csv = Path(tmpdir) / "product_links.csv"
giant_items_csv, costco_items_csv, giant_orders_csv, costco_orders_csv = write_review_source_files(
with purchases_csv.open("w", newline="", encoding="utf-8") as handle: tmpdir,
writer = csv.DictWriter( [
handle, {
fieldnames=[ "purchase_date": "2026-03-14",
"purchase_date", "retailer": "costco",
"retailer", "order_id": "c2",
"order_id", "line_no": "2",
"line_no", "normalized_item_id": "cnorm_mix",
"normalized_item_id", "raw_item_name": "MIXED PEPPER 6-PACK",
"catalog_id", "normalized_item_name": "MIXED PEPPER",
"raw_item_name", "image_url": "",
"normalized_item_name", "upc": "",
"image_url", "line_total": "7.49",
"upc", },
"line_total", {
], "purchase_date": "2026-03-12",
) "retailer": "costco",
writer.writeheader() "order_id": "c1",
writer.writerows( "line_no": "1",
[ "normalized_item_id": "cnorm_mix",
{ "raw_item_name": "MIXED PEPPER 6-PACK",
"purchase_date": "2026-03-14", "normalized_item_name": "MIXED PEPPER",
"retailer": "costco", "image_url": "",
"order_id": "c2", "upc": "",
"line_no": "2", "line_total": "6.99",
"normalized_item_id": "cnorm_mix", },
"catalog_id": "", {
"raw_item_name": "MIXED PEPPER 6-PACK", "purchase_date": "2026-03-10",
"normalized_item_name": "MIXED PEPPER", "retailer": "giant",
"image_url": "", "order_id": "g1",
"upc": "", "line_no": "1",
"line_total": "7.49", "normalized_item_id": "gnorm_mix",
}, "raw_item_name": "MIXED PEPPER",
{ "normalized_item_name": "MIXED PEPPER",
"purchase_date": "2026-03-12", "image_url": "",
"retailer": "costco", "upc": "",
"order_id": "c1", "line_total": "5.99",
"line_no": "1", },
"normalized_item_id": "cnorm_mix", ],
"catalog_id": "", )
"raw_item_name": "MIXED PEPPER 6-PACK",
"normalized_item_name": "MIXED PEPPER",
"image_url": "",
"upc": "",
"line_total": "6.99",
},
{
"purchase_date": "2026-03-10",
"retailer": "giant",
"order_id": "g1",
"line_no": "1",
"normalized_item_id": "gnorm_mix",
"catalog_id": "cat_mix",
"raw_item_name": "MIXED PEPPER",
"normalized_item_name": "MIXED PEPPER",
"image_url": "",
"upc": "",
"line_total": "5.99",
},
]
)
with catalog_csv.open("w", newline="", encoding="utf-8") as handle: with catalog_csv.open("w", newline="", encoding="utf-8") as handle:
writer = csv.DictWriter(handle, fieldnames=review_products.build_purchases.CATALOG_FIELDS) writer = csv.DictWriter(handle, fieldnames=review_products.build_purchases.CATALOG_FIELDS)
@@ -384,10 +443,33 @@ class ReviewWorkflowTests(unittest.TestCase):
"updated_at": "", "updated_at": "",
} }
) )
with links_csv.open("w", newline="", encoding="utf-8") as handle:
writer = csv.DictWriter(handle, fieldnames=review_products.build_purchases.PRODUCT_LINK_FIELDS)
writer.writeheader()
writer.writerow(
{
"normalized_item_id": "gnorm_mix",
"catalog_id": "cat_mix",
"link_method": "manual_link",
"link_confidence": "high",
"review_status": "approved",
"reviewed_by": "",
"reviewed_at": "",
"link_notes": "",
}
)
result = CliRunner().invoke( result = CliRunner().invoke(
review_products.main, review_products.main,
[ [
"--giant-items-enriched-csv",
str(giant_items_csv),
"--costco-items-enriched-csv",
str(costco_items_csv),
"--giant-orders-csv",
str(giant_orders_csv),
"--costco-orders-csv",
str(costco_orders_csv),
"--purchases-csv", "--purchases-csv",
str(purchases_csv), str(purchases_csv),
"--queue-csv", "--queue-csv",
@@ -422,40 +504,23 @@ class ReviewWorkflowTests(unittest.TestCase):
resolutions_csv = Path(tmpdir) / "review_resolutions.csv" resolutions_csv = Path(tmpdir) / "review_resolutions.csv"
catalog_csv = Path(tmpdir) / "catalog.csv" catalog_csv = Path(tmpdir) / "catalog.csv"
links_csv = Path(tmpdir) / "product_links.csv" links_csv = Path(tmpdir) / "product_links.csv"
giant_items_csv, costco_items_csv, giant_orders_csv, costco_orders_csv = write_review_source_files(
with purchases_csv.open("w", newline="", encoding="utf-8") as handle: tmpdir,
writer = csv.DictWriter( [
handle,
fieldnames=[
"purchase_date",
"retailer",
"order_id",
"line_no",
"normalized_item_id",
"catalog_id",
"raw_item_name",
"normalized_item_name",
"image_url",
"upc",
"line_total",
],
)
writer.writeheader()
writer.writerow(
{ {
"purchase_date": "2026-03-14", "purchase_date": "2026-03-14",
"retailer": "giant", "retailer": "giant",
"order_id": "g1", "order_id": "g1",
"line_no": "1", "line_no": "1",
"normalized_item_id": "gnorm_ice", "normalized_item_id": "gnorm_ice",
"catalog_id": "",
"raw_item_name": "SB BAGGED ICE 20LB", "raw_item_name": "SB BAGGED ICE 20LB",
"normalized_item_name": "BAGGED ICE", "normalized_item_name": "BAGGED ICE",
"image_url": "", "image_url": "",
"upc": "", "upc": "",
"line_total": "3.50", "line_total": "3.50",
} }
) ],
)
with catalog_csv.open("w", newline="", encoding="utf-8") as handle: with catalog_csv.open("w", newline="", encoding="utf-8") as handle:
writer = csv.DictWriter(handle, fieldnames=review_products.build_purchases.CATALOG_FIELDS) writer = csv.DictWriter(handle, fieldnames=review_products.build_purchases.CATALOG_FIELDS)
@@ -481,6 +546,14 @@ class ReviewWorkflowTests(unittest.TestCase):
result = CliRunner().invoke( result = CliRunner().invoke(
review_products.main, review_products.main,
[ [
"--giant-items-enriched-csv",
str(giant_items_csv),
"--costco-items-enriched-csv",
str(costco_items_csv),
"--giant-orders-csv",
str(giant_orders_csv),
"--costco-orders-csv",
str(costco_orders_csv),
"--purchases-csv", "--purchases-csv",
str(purchases_csv), str(purchases_csv),
"--queue-csv", "--queue-csv",
@@ -506,40 +579,23 @@ class ReviewWorkflowTests(unittest.TestCase):
resolutions_csv = Path(tmpdir) / "review_resolutions.csv" resolutions_csv = Path(tmpdir) / "review_resolutions.csv"
catalog_csv = Path(tmpdir) / "catalog.csv" catalog_csv = Path(tmpdir) / "catalog.csv"
links_csv = Path(tmpdir) / "product_links.csv" links_csv = Path(tmpdir) / "product_links.csv"
giant_items_csv, costco_items_csv, giant_orders_csv, costco_orders_csv = write_review_source_files(
with purchases_csv.open("w", newline="", encoding="utf-8") as handle: tmpdir,
writer = csv.DictWriter( [
handle,
fieldnames=[
"purchase_date",
"retailer",
"order_id",
"line_no",
"normalized_item_id",
"catalog_id",
"raw_item_name",
"normalized_item_name",
"image_url",
"upc",
"line_total",
],
)
writer.writeheader()
writer.writerow(
{ {
"purchase_date": "2026-03-14", "purchase_date": "2026-03-14",
"retailer": "giant", "retailer": "giant",
"order_id": "g1", "order_id": "g1",
"line_no": "1", "line_no": "1",
"normalized_item_id": "gnorm_skip", "normalized_item_id": "gnorm_skip",
"catalog_id": "",
"raw_item_name": "TEST ITEM", "raw_item_name": "TEST ITEM",
"normalized_item_name": "TEST ITEM", "normalized_item_name": "TEST ITEM",
"image_url": "", "image_url": "",
"upc": "", "upc": "",
"line_total": "1.00", "line_total": "1.00",
} }
) ],
)
with catalog_csv.open("w", newline="", encoding="utf-8") as handle: with catalog_csv.open("w", newline="", encoding="utf-8") as handle:
writer = csv.DictWriter(handle, fieldnames=review_products.build_purchases.CATALOG_FIELDS) writer = csv.DictWriter(handle, fieldnames=review_products.build_purchases.CATALOG_FIELDS)
@@ -548,6 +604,14 @@ class ReviewWorkflowTests(unittest.TestCase):
result = CliRunner().invoke( result = CliRunner().invoke(
review_products.main, review_products.main,
[ [
"--giant-items-enriched-csv",
str(giant_items_csv),
"--costco-items-enriched-csv",
str(costco_items_csv),
"--giant-orders-csv",
str(giant_orders_csv),
"--costco-orders-csv",
str(costco_orders_csv),
"--purchases-csv", "--purchases-csv",
str(purchases_csv), str(purchases_csv),
"--queue-csv", "--queue-csv",
@@ -578,30 +642,12 @@ class ReviewWorkflowTests(unittest.TestCase):
resolutions_csv = Path(tmpdir) / "review_resolutions.csv" resolutions_csv = Path(tmpdir) / "review_resolutions.csv"
catalog_csv = Path(tmpdir) / "catalog.csv" catalog_csv = Path(tmpdir) / "catalog.csv"
links_csv = Path(tmpdir) / "product_links.csv" links_csv = Path(tmpdir) / "product_links.csv"
giant_items_csv, costco_items_csv, giant_orders_csv, costco_orders_csv = write_review_source_files(
with purchases_csv.open("w", newline="", encoding="utf-8") as handle: tmpdir,
writer = csv.DictWriter( [
handle,
fieldnames=[
"purchase_date",
"normalized_item_id",
"catalog_id",
"retailer",
"raw_item_name",
"normalized_item_name",
"image_url",
"upc",
"line_total",
"order_id",
"line_no",
],
)
writer.writeheader()
writer.writerow(
{ {
"purchase_date": "2026-03-15", "purchase_date": "2026-03-15",
"normalized_item_id": "gnorm_ice", "normalized_item_id": "gnorm_ice",
"catalog_id": "",
"retailer": "giant", "retailer": "giant",
"raw_item_name": "SB BAGGED ICE 20LB", "raw_item_name": "SB BAGGED ICE 20LB",
"normalized_item_name": "BAGGED ICE", "normalized_item_name": "BAGGED ICE",
@@ -611,7 +657,8 @@ class ReviewWorkflowTests(unittest.TestCase):
"order_id": "g1", "order_id": "g1",
"line_no": "1", "line_no": "1",
} }
) ],
)
with mock.patch.object( with mock.patch.object(
review_products.click, review_products.click,
@@ -619,6 +666,10 @@ class ReviewWorkflowTests(unittest.TestCase):
side_effect=["n", "ICE", "frozen", "ice", "manual merge", "q"], side_effect=["n", "ICE", "frozen", "ice", "manual merge", "q"],
): ):
review_products.main.callback( review_products.main.callback(
giant_items_enriched_csv=str(giant_items_csv),
costco_items_enriched_csv=str(costco_items_csv),
giant_orders_csv=str(giant_orders_csv),
costco_orders_csv=str(costco_orders_csv),
purchases_csv=str(purchases_csv), purchases_csv=str(purchases_csv),
queue_csv=str(queue_csv), queue_csv=str(queue_csv),
resolutions_csv=str(resolutions_csv), resolutions_csv=str(resolutions_csv),
@@ -647,6 +698,63 @@ class ReviewWorkflowTests(unittest.TestCase):
self.assertEqual("ICE", catalog_rows[0]["catalog_name"]) self.assertEqual("ICE", catalog_rows[0]["catalog_name"])
self.assertEqual(catalog_rows[0]["catalog_id"], link_rows[0]["catalog_id"]) self.assertEqual(catalog_rows[0]["catalog_id"], link_rows[0]["catalog_id"])
def test_build_review_queue_readds_orphaned_and_incomplete_links(self):
purchase_rows = [
{
"normalized_item_id": "gnorm_orphan",
"catalog_id": "cat_missing",
"retailer": "giant",
"raw_item_name": "ORPHAN ITEM",
"normalized_item_name": "ORPHAN ITEM",
"upc": "",
"line_total": "3.50",
"is_fee": "false",
"is_discount_line": "false",
"is_coupon_line": "false",
},
{
"normalized_item_id": "gnorm_incomplete",
"catalog_id": "cat_incomplete",
"retailer": "giant",
"raw_item_name": "INCOMPLETE ITEM",
"normalized_item_name": "INCOMPLETE ITEM",
"upc": "",
"line_total": "4.50",
"is_fee": "false",
"is_discount_line": "false",
"is_coupon_line": "false",
},
]
link_rows = [
{
"normalized_item_id": "gnorm_orphan",
"catalog_id": "cat_missing",
},
{
"normalized_item_id": "gnorm_incomplete",
"catalog_id": "cat_incomplete",
},
]
catalog_rows = [
{
"catalog_id": "cat_incomplete",
"catalog_name": "INCOMPLETE ITEM",
"product_type": "",
}
]
queue_rows = review_products.build_review_queue(
purchase_rows,
[],
link_rows,
catalog_rows,
[],
)
reasons = {row["normalized_item_id"]: row["reason_code"] for row in queue_rows}
self.assertEqual("orphaned_catalog_link", reasons["gnorm_orphan"])
self.assertEqual("incomplete_catalog_link", reasons["gnorm_incomplete"])
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()