Compare commits

...

2 Commits

Author SHA1 Message Date
ben
38c2c2ea2e Record t1.17 task evidence 2026-03-21 21:50:16 -04:00
ben
d25448b690 Fix normalized quantity basis 2026-03-21 21:50:10 -04:00
8 changed files with 124 additions and 10 deletions

View File

@@ -29,6 +29,8 @@ PURCHASE_FIELDS = [
"upc",
"qty",
"unit",
"normalized_quantity",
"normalized_quantity_unit",
"pack_qty",
"size_value",
"size_unit",
@@ -337,6 +339,8 @@ def build_purchase_rows(
"upc": row["upc"],
"qty": row["qty"],
"unit": row["unit"],
"normalized_quantity": row.get("normalized_quantity", ""),
"normalized_quantity_unit": row.get("normalized_quantity_unit", ""),
"pack_qty": row["pack_qty"],
"size_value": row["size_value"],
"size_unit": row["size_unit"],

View File

@@ -37,7 +37,9 @@ DUAL_WEIGHT_RE = re.compile(
LOGISTICS_SLASH_RE = re.compile(r"\b(?:T\d+/H\d+(?:/P\d+)?/?|H\d+/P\d+/?|T\d+/H\d+/?)\b")
PACK_DASH_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*-\s*PACK\b")
PACK_WORD_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*PACK\b")
SIZE_RE = re.compile(r"(?<![A-Z0-9])(\d+(?:\.\d+)?)\s*(OZ|LB|LBS|CT|KG|G)\b")
SIZE_RE = re.compile(
r"(?<![A-Z0-9])(\d+(?:\.\d+)?)\s*(OZ|LB|LBS|CT|KG|G|QT|QTS|PT|PTS|GAL|GALS|FL OZ|FLOZ)\b"
)
DISCOUNT_TARGET_RE = re.compile(r"^/\s*(\d+)\b")
@@ -192,6 +194,7 @@ def parse_costco_item(order_id, order_date, raw_path, line_no, item):
)
normalized_row_id = f"{RETAILER}:{order_id}:{line_no}"
normalized_quantity, normalized_quantity_unit = derive_normalized_quantity(
item.get("unit"),
size_value,
size_unit,
pack_qty,

View File

@@ -224,13 +224,17 @@ def normalize_unit(unit):
"OZ": "oz",
"FZ": "fl_oz",
"FL OZ": "fl_oz",
"FLOZ": "fl_oz",
"LB": "lb",
"LBS": "lb",
"ML": "ml",
"L": "l",
"QT": "qt",
"QTS": "qt",
"PT": "pt",
"PTS": "pt",
"GAL": "gal",
"GALS": "gal",
"GA": "gal",
}.get(collapsed, collapsed.lower())
@@ -340,16 +344,24 @@ def derive_prices(item, measure_type, size_value="", size_unit="", pack_qty=""):
return price_per_each, price_per_lb, price_per_oz
def derive_normalized_quantity(size_value, size_unit, pack_qty, measure_type):
def derive_normalized_quantity(qty, size_value, size_unit, pack_qty, measure_type):
parsed_qty = to_decimal(qty)
parsed_size = to_decimal(size_value)
parsed_pack = to_decimal(pack_qty) or Decimal("1")
parsed_pack = to_decimal(pack_qty)
total_multiplier = None
if parsed_qty not in (None, Decimal("0")):
total_multiplier = parsed_qty * (parsed_pack or Decimal("1"))
if parsed_size not in (None, Decimal("0")) and size_unit:
return format_decimal(parsed_size * parsed_pack), size_unit
if parsed_pack not in (None, Decimal("0")) and measure_type == "count":
return format_decimal(parsed_pack), "count"
if measure_type == "each":
return "1", "each"
if (
parsed_size not in (None, Decimal("0"))
and size_unit
and total_multiplier not in (None, Decimal("0"))
):
return format_decimal(parsed_size * total_multiplier), size_unit
if measure_type == "count" and total_multiplier not in (None, Decimal("0")):
return format_decimal(total_multiplier), "count"
if measure_type == "each" and parsed_qty not in (None, Decimal("0")):
return format_decimal(parsed_qty), "each"
return "", ""
@@ -424,6 +436,7 @@ def parse_item(order_id, order_date, raw_path, line_no, item):
normalized_row_id = f"{RETAILER}:{order_id}:{line_no}"
normalized_quantity, normalized_quantity_unit = derive_normalized_quantity(
item.get("shipQy"),
size_value,
size_unit,
pack_qty,

View File

@@ -344,3 +344,9 @@ Notes:
- review/link decisions should apply at the `normalized_item_id` level, then fan out to all purchase rows sharing that id.
* /
Normalized quantity is deterministic and conservative:
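- if `qty * pack_qty * size_value` is available, use that total with `size_unit` (a blank `pack_qty` is treated as 1)
- else if count basis is explicit (`measure_type` is `count`), use `qty * pack_qty` with unit `count` (again treating a blank `pack_qty` as 1)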
- else if `measure_type` is `each`, use `qty each`
- else leave both fields blank
- no hidden unit conversion is applied inside normalization; values stay in their parsed units such as `oz`, `lb`, `qt`, or `count`

View File

@@ -763,8 +763,46 @@ enable fast lookup of catalog items during review via tokenized search and repla
- Search intentionally optimizes for manual speed rather than smart ranking: simple token overlap, max 10 rows, and immediate persistence on selection.
- Follow-up fix: search moved to `[f]ind` so `[s]kip` remains available at the main prompt.
* [ ] t1.16.2: catalog search refinement
* [x] t1.17: fix normalized quantity derivation and carry it through purchases (2-4 commits)
correct and document deterministic normalized quantity fields so unit-cost analysis works across package sizes
** Acceptance Criteria
1. populate and validate `normalized_quantity` and `normalized_quantity_unit` in `data/<retailer-method>/normalized_items.csv`
- these columns already exist and must be corrected rather than reintroduced
2. carry `normalized_quantity` and `normalized_quantity_unit` through to `data/review/purchases.csv`
3. derive normalized quantity deterministically from existing parsed fields only:
- `qty`
- `pack_qty`
- `size_value`
- `size_unit`
- `measure_type`
4. prefer the best deterministic basis rather than falling back to `each` too early:
- count items when count is explicit
- weight items when parsed weight is explicit
- volume items when parsed volume is explicit
- `each` only when no better basis is available
5. handle common cases explicitly, including totals derived from deterministic patterns such as:
- `18 count`
- `5 lb`
- `64 oz`
- `2 each`
6. preserve blanks when no reliable normalized quantity basis can be derived
7. existing `normalized_item_id` values remain stable; this task must not change retailer-level grouping identity
8. document the derivation rules and any intentional conversions or non-conversions in `pm/data-model.org` or task notes
- if unit conversions are allowed, they must be explicit and minimal
- pm note: keep this deterministic and conservative; do not introduce fuzzy inference
- pm note: if `lb <-> oz` or volume conversions are used, document them directly rather than hiding them in code
- pm note: this task enables cost analysis and charting, not catalog/review changes
** evidence
- commit: `d25448b`
- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python normalize_giant_web.py`; `./venv/bin/python normalize_costco_web.py`; `./venv/bin/python build_purchases.py`
- datetime: 2026-03-21 21:02:21 EDT
** notes
- The missing purchases fields were a carry-through bug: normalization had `normalized_quantity` and `normalized_quantity_unit`, but `build_purchases.py` never wrote them into `data/review/purchases.csv`.
- Normalized quantity now prefers explicit package basis over `each`, so rows like `PEPSI 6PK 7.5Z` purchased twice resolve to `90 oz` (2 x 6 x 7.5) and `KS ALMND BAR US 1.74QTS` purchased twice resolves to `3.48 qt` (2 x 1.74).
- The derivation stays conservative and does not convert units during normalization; parsed units such as `oz`, `lb`, `qt`, and `count` are preserved as-is.
* [ ] 1t.10: add optional llm-assisted suggestion workflow for unresolved normalized retailer items (2-4 commits)
** acceptance criteria

View File

@@ -264,6 +264,26 @@ class CostcoPipelineTests(unittest.TestCase):
self.assertEqual("6", row["normalized_quantity"])
self.assertEqual("count", row["normalized_quantity_unit"])
volume_row = enrich_costco.parse_costco_item(
order_id="abc",
order_date="2026-03-12",
raw_path=Path("costco_output/raw/abc.json"),
line_no=3,
item={
"itemNumber": "1185912",
"itemDescription01": "KS ALMND BAR US 1.74QTS CN",
"itemDescription02": None,
"itemDepartmentNumber": 18,
"transDepartmentNumber": 18,
"unit": 2,
"itemIdentifier": "E",
"amount": 21.98,
"itemUnitPriceAmount": 10.99,
},
)
self.assertEqual("3.48", volume_row["normalized_quantity"])
self.assertEqual("qt", volume_row["normalized_quantity_unit"])
discount = enrich_costco.parse_costco_item(
order_id="abc",
order_date="2026-03-12",

View File

@@ -111,9 +111,25 @@ class EnrichGiantTests(unittest.TestCase):
self.assertEqual("weight", row["measure_type"])
self.assertEqual("6", row["pack_qty"])
self.assertEqual("7.5", row["size_value"])
self.assertEqual("90", row["normalized_quantity"])
self.assertEqual("oz", row["normalized_quantity_unit"])
self.assertEqual("0.0667", row["price_per_oz"])
self.assertEqual("1.0667", row["price_per_lb"])
def test_derive_normalized_quantity_handles_count_volume_and_each(self):
self.assertEqual(
("18", "count"),
enrich_giant.derive_normalized_quantity("1", "", "", "18", "count"),
)
self.assertEqual(
("3.48", "qt"),
enrich_giant.derive_normalized_quantity("2", "1.74", "qt", "", "volume"),
)
self.assertEqual(
("2", "each"),
enrich_giant.derive_normalized_quantity("2", "", "", "", "each"),
)
def test_build_items_enriched_reads_raw_order_files_and_writes_csv(self):
with tempfile.TemporaryDirectory() as tmpdir:
raw_dir = Path(tmpdir) / "raw"

View File

@@ -47,6 +47,8 @@ class PurchaseLogTests(unittest.TestCase):
"upc": "4011",
"qty": "1",
"unit": "LB",
"normalized_quantity": "1",
"normalized_quantity_unit": "lb",
"line_total": "1.29",
"unit_price": "1.29",
"measure_type": "weight",
@@ -71,6 +73,8 @@ class PurchaseLogTests(unittest.TestCase):
"retailer_item_id": "30669",
"qty": "1",
"unit": "E",
"normalized_quantity": "3",
"normalized_quantity_unit": "lb",
"line_total": "2.98",
"unit_price": "2.98",
"size_value": "3",
@@ -155,6 +159,8 @@ class PurchaseLogTests(unittest.TestCase):
self.assertTrue(all(row["catalog_id"] == "cat_banana" for row in rows))
self.assertEqual({"giant", "costco"}, {row["retailer"] for row in rows})
self.assertEqual("https://example.test/banana.jpg", rows[0]["image_url"])
self.assertEqual("1", rows[0]["normalized_quantity"])
self.assertEqual("lb", rows[0]["normalized_quantity_unit"])
def test_main_writes_purchase_and_example_csvs(self):
with tempfile.TemporaryDirectory() as tmpdir:
@@ -184,6 +190,8 @@ class PurchaseLogTests(unittest.TestCase):
"upc": "4011",
"qty": "1",
"unit": "LB",
"normalized_quantity": "1",
"normalized_quantity_unit": "lb",
"line_total": "1.29",
"unit_price": "1.29",
"measure_type": "weight",
@@ -208,6 +216,8 @@ class PurchaseLogTests(unittest.TestCase):
"retailer_item_id": "30669",
"qty": "1",
"unit": "E",
"normalized_quantity": "3",
"normalized_quantity_unit": "lb",
"line_total": "2.98",
"unit_price": "2.98",
"size_value": "3",
@@ -346,6 +356,8 @@ class PurchaseLogTests(unittest.TestCase):
"upc": "",
"qty": "1",
"unit": "EA",
"normalized_quantity": "1",
"normalized_quantity_unit": "each",
"line_total": "3.50",
"unit_price": "3.50",
"measure_type": "each",
@@ -403,6 +415,8 @@ class PurchaseLogTests(unittest.TestCase):
self.assertEqual("approved", rows[0]["review_status"])
self.assertEqual("create", rows[0]["resolution_action"])
self.assertEqual("cat_ice", links[0]["catalog_id"])
self.assertEqual("1", rows[0]["normalized_quantity"])
self.assertEqual("each", rows[0]["normalized_quantity_unit"])
if __name__ == "__main__":