Record t1.17 task evidence

Fix normalized quantity basis
2026-03-21 21:50:16 -04:00 · 2026-03-21 21:50:10 -04:00
8 changed files with 124 additions and 10 deletions
--- a/build_purchases.py
+++ b/build_purchases.py
@@ -29,6 +29,8 @@ PURCHASE_FIELDS = [
    "upc",
    "qty",
    "unit",
+    "normalized_quantity",
+    "normalized_quantity_unit",
    "pack_qty",
    "size_value",
    "size_unit",
@@ -337,6 +339,8 @@ def build_purchase_rows(
                "upc": row["upc"],
                "qty": row["qty"],
                "unit": row["unit"],
+                "normalized_quantity": row.get("normalized_quantity", ""),
+                "normalized_quantity_unit": row.get("normalized_quantity_unit", ""),
                "pack_qty": row["pack_qty"],
                "size_value": row["size_value"],
                "size_unit": row["size_unit"],
--- a/enrich_costco.py
+++ b/enrich_costco.py
@@ -37,7 +37,9 @@ DUAL_WEIGHT_RE = re.compile(
 LOGISTICS_SLASH_RE = re.compile(r"\b(?:T\d+/H\d+(?:/P\d+)?/?|H\d+/P\d+/?|T\d+/H\d+/?)\b")
 PACK_DASH_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*-\s*PACK\b")
 PACK_WORD_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*PACK\b")
-SIZE_RE = re.compile(r"(?<![A-Z0-9])(\d+(?:\.\d+)?)\s*(OZ|LB|LBS|CT|KG|G)\b")
+SIZE_RE = re.compile(
+    r"(?<![A-Z0-9])(\d+(?:\.\d+)?)\s*(OZ|LB|LBS|CT|KG|G|QT|QTS|PT|PTS|GAL|GALS|FL OZ|FLOZ)\b"
+)
 DISCOUNT_TARGET_RE = re.compile(r"^/\s*(\d+)\b")


@@ -192,6 +194,7 @@ def parse_costco_item(order_id, order_date, raw_path, line_no, item):
    )
    normalized_row_id = f"{RETAILER}:{order_id}:{line_no}"
    normalized_quantity, normalized_quantity_unit = derive_normalized_quantity(
+        item.get("unit"),
        size_value,
        size_unit,
        pack_qty,
--- a/enrich_giant.py
+++ b/enrich_giant.py
@@ -224,13 +224,17 @@ def normalize_unit(unit):
        "OZ": "oz",
        "FZ": "fl_oz",
        "FL OZ": "fl_oz",
+        "FLOZ": "fl_oz",
        "LB": "lb",
        "LBS": "lb",
        "ML": "ml",
        "L": "l",
        "QT": "qt",
+        "QTS": "qt",
        "PT": "pt",
+        "PTS": "pt",
        "GAL": "gal",
+        "GALS": "gal",
        "GA": "gal",
    }.get(collapsed, collapsed.lower())

@@ -340,16 +344,24 @@ def derive_prices(item, measure_type, size_value="", size_unit="", pack_qty=""):
    return price_per_each, price_per_lb, price_per_oz


-def derive_normalized_quantity(size_value, size_unit, pack_qty, measure_type):
+def derive_normalized_quantity(qty, size_value, size_unit, pack_qty, measure_type):
+    parsed_qty = to_decimal(qty)
    parsed_size = to_decimal(size_value)
-    parsed_pack = to_decimal(pack_qty) or Decimal("1")
+    parsed_pack = to_decimal(pack_qty)
+    total_multiplier = None
+    if parsed_qty not in (None, Decimal("0")):
+        total_multiplier = parsed_qty * (parsed_pack or Decimal("1"))

-    if parsed_size not in (None, Decimal("0")) and size_unit:
-        return format_decimal(parsed_size * parsed_pack), size_unit
-    if parsed_pack not in (None, Decimal("0")) and measure_type == "count":
-        return format_decimal(parsed_pack), "count"
-    if measure_type == "each":
-        return "1", "each"
+    if (
+        parsed_size not in (None, Decimal("0"))
+        and size_unit
+        and total_multiplier not in (None, Decimal("0"))
+    ):
+        return format_decimal(parsed_size * total_multiplier), size_unit
+    if measure_type == "count" and total_multiplier not in (None, Decimal("0")):
+        return format_decimal(total_multiplier), "count"
+    if measure_type == "each" and parsed_qty not in (None, Decimal("0")):
+        return format_decimal(parsed_qty), "each"
    return "", ""


@@ -424,6 +436,7 @@ def parse_item(order_id, order_date, raw_path, line_no, item):

    normalized_row_id = f"{RETAILER}:{order_id}:{line_no}"
    normalized_quantity, normalized_quantity_unit = derive_normalized_quantity(
+        item.get("shipQy"),
        size_value,
        size_unit,
        pack_qty,
--- a/pm/data-model.org
+++ b/pm/data-model.org
@@ -344,3 +344,9 @@ Notes:
 - review/link decisions should apply at the `normalized_item_id` level, then fan out to all purchase rows sharing that id.

 * /
+Normalized quantity is deterministic and conservative:
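+- if `qty`, `size_value`, and `size_unit` are all present and non-zero, use `qty * pack_qty * size_value` with `size_unit` (a missing or zero `pack_qty` counts as 1)
+- else if `measure_type` is `count` and `qty` is non-zero, use `qty * pack_qty` with unit `count`
+- else if `measure_type` is `each` and `qty` is non-zero, use `qty` with unit `each`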
+- else leave both fields blank
+- no hidden unit conversion is applied inside normalization; values stay in their parsed units such as `oz`, `lb`, `qt`, or `count`
--- a/pm/tasks.org
+++ b/pm/tasks.org
@@ -763,8 +763,46 @@ enable fast lookup of catalog items during review via tokenized search and repla
 - Search intentionally optimizes for manual speed rather than smart ranking: simple token overlap, max 10 rows, and immediate persistence on selection.
 - Follow-up fix: search moved to `[f]ind` so `[s]kip` remains available at the main prompt.

-* [ ] t1.16.2: catalog search refinement
+* [x] t1.17: fix normalized quantity derivation and carry it through purchases (2-4 commits)
+correct and document deterministic normalized quantity fields so unit-cost analysis works across package sizes

+** acceptance criteria
+1. populate and validate `normalized_quantity` and `normalized_quantity_unit` in `data/<retailer-method>/normalized_items.csv`
+   - these columns already exist and must be corrected rather than reintroduced
+2. carry `normalized_quantity` and `normalized_quantity_unit` through to `data/review/purchases.csv`
+3. derive normalized quantity deterministically from existing parsed fields only:
+   - `qty`
+   - `pack_qty`
+   - `size_value`
+   - `size_unit`
+   - `measure_type`
+4. prefer the best deterministic basis rather than falling back to `each` too early:
+   - count items when count is explicit
+   - weight items when parsed weight is explicit
+   - volume items when parsed volume is explicit
+   - `each` only when no better basis is available
+5. handle common cases explicitly, including totals derived from deterministic patterns such as:
+   - `18 count`
+   - `5 lb`
+   - `64 oz`
+   - `2 each`
+6. preserve blanks when no reliable normalized quantity basis can be derived
+7. existing `normalized_item_id` values remain stable; this task must not change retailer-level grouping identity
+8. document the derivation rules and any intentional conversions or non-conversions in `pm/data-model.org` or task notes
+   - if unit conversions are allowed, they must be explicit and minimal
+- pm note: keep this deterministic and conservative; do not introduce fuzzy inference
+- pm note: if `lb <-> oz` or volume conversions are used, document them directly rather than hiding them in code
+- pm note: this task enables cost analysis and charting, not catalog/review changes
+
+** evidence
+- commit: `d25448b`
+- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python normalize_giant_web.py`; `./venv/bin/python normalize_costco_web.py`; `./venv/bin/python build_purchases.py`
+- datetime: 2026-03-21 21:02:21 EDT
+
+** notes
+- The missing purchases fields were a carry-through bug: normalization had `normalized_quantity` and `normalized_quantity_unit`, but `build_purchases.py` never wrote them into `data/review/purchases.csv`.
+- Normalized quantity now prefers explicit package basis over `each`, so rows like `PEPSI 6PK 7.5Z` with `qty` 2 resolve to `90 oz` (2 × 6 × 7.5) and `KS ALMND BAR US 1.74QTS` purchased twice resolves to `3.48 qt` (2 × 1.74).
+- The derivation stays conservative and does not convert units during normalization; parsed units such as `oz`, `lb`, `qt`, and `count` are preserved as-is.
 * [ ] 1t.10: add optional llm-assisted suggestion workflow for unresolved normalized retailer items (2-4 commits)

 ** acceptance criteria
--- a/tests/test_costco_pipeline.py
+++ b/tests/test_costco_pipeline.py
@@ -264,6 +264,26 @@ class CostcoPipelineTests(unittest.TestCase):
        self.assertEqual("6", row["normalized_quantity"])
        self.assertEqual("count", row["normalized_quantity_unit"])

+        volume_row = enrich_costco.parse_costco_item(
+            order_id="abc",
+            order_date="2026-03-12",
+            raw_path=Path("costco_output/raw/abc.json"),
+            line_no=3,
+            item={
+                "itemNumber": "1185912",
+                "itemDescription01": "KS ALMND BAR US 1.74QTS CN",
+                "itemDescription02": None,
+                "itemDepartmentNumber": 18,
+                "transDepartmentNumber": 18,
+                "unit": 2,
+                "itemIdentifier": "E",
+                "amount": 21.98,
+                "itemUnitPriceAmount": 10.99,
+            },
+        )
+        self.assertEqual("3.48", volume_row["normalized_quantity"])
+        self.assertEqual("qt", volume_row["normalized_quantity_unit"])
+
        discount = enrich_costco.parse_costco_item(
            order_id="abc",
            order_date="2026-03-12",
--- a/tests/test_enrich_giant.py
+++ b/tests/test_enrich_giant.py
@@ -111,9 +111,25 @@ class EnrichGiantTests(unittest.TestCase):
        self.assertEqual("weight", row["measure_type"])
        self.assertEqual("6", row["pack_qty"])
        self.assertEqual("7.5", row["size_value"])
+        self.assertEqual("90", row["normalized_quantity"])
+        self.assertEqual("oz", row["normalized_quantity_unit"])
        self.assertEqual("0.0667", row["price_per_oz"])
        self.assertEqual("1.0667", row["price_per_lb"])

+    def test_derive_normalized_quantity_handles_count_volume_and_each(self):
+        self.assertEqual(
+            ("18", "count"),
+            enrich_giant.derive_normalized_quantity("1", "", "", "18", "count"),
+        )
+        self.assertEqual(
+            ("3.48", "qt"),
+            enrich_giant.derive_normalized_quantity("2", "1.74", "qt", "", "volume"),
+        )
+        self.assertEqual(
+            ("2", "each"),
+            enrich_giant.derive_normalized_quantity("2", "", "", "", "each"),
+        )
+
    def test_build_items_enriched_reads_raw_order_files_and_writes_csv(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            raw_dir = Path(tmpdir) / "raw"
--- a/tests/test_purchases.py
+++ b/tests/test_purchases.py
@@ -47,6 +47,8 @@ class PurchaseLogTests(unittest.TestCase):
                "upc": "4011",
                "qty": "1",
                "unit": "LB",
+                "normalized_quantity": "1",
+                "normalized_quantity_unit": "lb",
                "line_total": "1.29",
                "unit_price": "1.29",
                "measure_type": "weight",
@@ -71,6 +73,8 @@ class PurchaseLogTests(unittest.TestCase):
                "retailer_item_id": "30669",
                "qty": "1",
                "unit": "E",
+                "normalized_quantity": "3",
+                "normalized_quantity_unit": "lb",
                "line_total": "2.98",
                "unit_price": "2.98",
                "size_value": "3",
@@ -155,6 +159,8 @@ class PurchaseLogTests(unittest.TestCase):
        self.assertTrue(all(row["catalog_id"] == "cat_banana" for row in rows))
        self.assertEqual({"giant", "costco"}, {row["retailer"] for row in rows})
        self.assertEqual("https://example.test/banana.jpg", rows[0]["image_url"])
+        self.assertEqual("1", rows[0]["normalized_quantity"])
+        self.assertEqual("lb", rows[0]["normalized_quantity_unit"])

    def test_main_writes_purchase_and_example_csvs(self):
        with tempfile.TemporaryDirectory() as tmpdir:
@@ -184,6 +190,8 @@ class PurchaseLogTests(unittest.TestCase):
                    "upc": "4011",
                    "qty": "1",
                    "unit": "LB",
+                    "normalized_quantity": "1",
+                    "normalized_quantity_unit": "lb",
                    "line_total": "1.29",
                    "unit_price": "1.29",
                    "measure_type": "weight",
@@ -208,6 +216,8 @@ class PurchaseLogTests(unittest.TestCase):
                    "retailer_item_id": "30669",
                    "qty": "1",
                    "unit": "E",
+                    "normalized_quantity": "3",
+                    "normalized_quantity_unit": "lb",
                    "line_total": "2.98",
                    "unit_price": "2.98",
                    "size_value": "3",
@@ -346,6 +356,8 @@ class PurchaseLogTests(unittest.TestCase):
                "upc": "",
                "qty": "1",
                "unit": "EA",
+                "normalized_quantity": "1",
+                "normalized_quantity_unit": "each",
                "line_total": "3.50",
                "unit_price": "3.50",
                "measure_type": "each",
@@ -403,6 +415,8 @@ class PurchaseLogTests(unittest.TestCase):
        self.assertEqual("approved", rows[0]["review_status"])
        self.assertEqual("create", rows[0]["resolution_action"])
        self.assertEqual("cat_ice", links[0]["catalog_id"])
+        self.assertEqual("1", rows[0]["normalized_quantity"])
+        self.assertEqual("each", rows[0]["normalized_quantity_unit"])


 if __name__ == "__main__":
Author	SHA1	Message	Date
ben	38c2c2ea2e	Record t1.17 task evidence	2026-03-21 21:50:16 -04:00
ben	d25448b690	Fix normalized quantity basis	2026-03-21 21:50:10 -04:00