Clean Costco normalization artifacts

2026-03-20 11:09:44 -04:00
parent 848d229f2d
commit bcec6b37d3
3 changed files with 59 additions and 5 deletions
--- a/enrich_costco.py
+++ b/enrich_costco.py
@@ -30,6 +30,11 @@ CODE_TOKEN_RE = re.compile(
 )
 PACK_FRACTION_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*/\s*(\d+(?:\.\d+)?)\s*(OZ|LB|LBS|CT)\b")
 HASH_SIZE_RE = re.compile(r"(?<![A-Z0-9])(\d+(?:\.\d+)?)#\b")
+ITEM_CODE_RE = re.compile(r"#\w+\b")
+DUAL_WEIGHT_RE = re.compile(
+    r"\b\d+(?:\.\d+)?\s*(?:KG|G|LB|LBS|OZ)\s*/\s*\d+(?:\.\d+)?\s*(?:KG|G|LB|LBS|OZ)\b"
+)
+LOGISTICS_SLASH_RE = re.compile(r"\b(?:T\d+/H\d+(?:/P\d+)?/?|H\d+/P\d+/?|T\d+/H\d+/?)\b")
 PACK_DASH_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*-\s*PACK\b")
 PACK_WORD_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*PACK\b")
 SIZE_RE = re.compile(r"(?<![A-Z0-9])(\d+(?:\.\d+)?)\s*(OZ|LB|LBS|CT|KG|G)\b")
@@ -98,12 +103,17 @@ def normalize_costco_name(cleaned_name):
            base = PACK_FRACTION_RE.sub(" ", base)
        else:
            base = SIZE_RE.sub(" ", base)
+    base = DUAL_WEIGHT_RE.sub(" ", base)
    base = HASH_SIZE_RE.sub(" ", base)
+    base = ITEM_CODE_RE.sub(" ", base)
+    base = LOGISTICS_SLASH_RE.sub(" ", base)
    base = PACK_DASH_RE.sub(" ", base)
    base = PACK_WORD_RE.sub(" ", base)
    base = normalize_whitespace(base)
    tokens = []
    for token in base.split():
+        if token in {"/", "-"}:
+            continue
        if token in {"ORG"}:
            continue
        if token in {"PEANUT", "BUTTER"} and "JIF" in base:
--- a/pm/tasks.org
+++ b/pm/tasks.org
@@ -1,5 +1,5 @@
 #+title: Scrape-Giant Task Log
-
+#+STARTUP: overview
 * [X] t1.1: harden giant receipt fetch cli (2-4 commits)
 ** acceptance criteria
 - giant scraper runs from cli with prompts or env-backed defaults for `user_id` and `loyalty`
@@ -580,14 +580,14 @@ bring on-disk outputs fully into the target `data/` structure without changing r
 ** evidence
 - commit: `d2e6f2a`
 - tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python build_purchases.py`; `./venv/bin/python review_products.py --refresh-only`; `./venv/bin/python report_pipeline_status.py`; `./venv/bin/python build_purchases.py --help`; `./venv/bin/python review_products.py --help`; `./venv/bin/python report_pipeline_status.py --help`; verified `data/giant-web/collected_orders.csv`, `data/giant-web/collected_items.csv`, `data/costco-web/collected_orders.csv`, `data/costco-web/collected_items.csv`, `data/catalog.csv`, and archived transitional review outputs under `data/review/archive/`
- datetime: 2026-03-20 10:04:15 EDT
+- datetime: [2026-03-20 10:04:15 EDT]

 ** notes
 - No recollection was needed; existing raw and collected exports were adapted in place and moved into the target names.
 - Updated the active script defaults to point at `data/...` so the code and on-disk layout now agree.
 - Kept obviously obsolete review artifacts, but moved them under `data/review/archive/` instead of deleting them outright.

-* [ ] t1.14.3: retailer-specific Costco normalization cleanup (2-4 commits)
+* [X] t1.14.3: retailer-specific Costco normalization cleanup (2-4 commits)
 tighten Costco-specific normalization so normalized item names are cleaner and deterministic retailer grouping is less noisy

 ** Acceptance Criteria
@@ -616,10 +616,13 @@ tighten Costco-specific normalization so normalized item names are cleaner and d

 ** evidence
 - commit:
- tests:
- datetime:
+- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python -m unittest tests.test_costco_pipeline`; `./venv/bin/python normalize_costco_web.py`; verified live cleaned examples in `data/costco-web/normalized_items.csv`, including `MANDARINS 2.27 KG / 5 LBS -> MANDARIN` and `LIFE 6'TABLE MDL #80873U - T12/H3/P36 -> LIFE 6'TABLE MDL`
+- datetime: [2026-03-20 11:09:32 EDT]

 ** notes
+- Kept this explicitly Costco-specific and narrow: the cleanup removes known logistics/code artifacts and orphan slash tokens without introducing fuzzy naming logic.
+- The structured parsing still owns size/pack extraction, so name cleanup can safely strip dual-unit and logistics fragments after those fields are parsed.
+- Discount-line behavior remains unchanged; this task only cleaned normalized names and preserved the existing audit trail.

 * [ ] t1.15: refactor review/combine pipeline around normalized_item_id and catalog links (4-8 commits)
 replace the old observed/canonical workflow with a review-first pipeline that uses normalized_item_id as the retailer-level review unit and links it to catalog items
--- a/tests/test_costco_pipeline.py
+++ b/tests/test_costco_pipeline.py
@@ -285,6 +285,47 @@ class CostcoPipelineTests(unittest.TestCase):
        self.assertEqual("true", discount["is_coupon_line"])
        self.assertEqual("false", discount["is_item"])

+    def test_costco_name_cleanup_removes_dual_weight_and_logistics_artifacts(self):
+        mixed_units = enrich_costco.parse_costco_item(
+            order_id="abc",
+            order_date="2026-03-12",
+            raw_path=Path("costco_output/raw/abc.json"),
+            line_no=1,
+            item={
+                "itemNumber": "18600",
+                "itemDescription01": "MANDARINS 2.27 KG / 5 LBS",
+                "itemDescription02": None,
+                "itemDepartmentNumber": 65,
+                "transDepartmentNumber": 65,
+                "unit": 1,
+                "itemIdentifier": "E",
+                "amount": 7.49,
+                "itemUnitPriceAmount": 7.49,
+            },
+        )
+        self.assertEqual("MANDARIN", mixed_units["item_name_norm"])
+        self.assertEqual("5", mixed_units["size_value"])
+        self.assertEqual("lb", mixed_units["size_unit"])
+
+        logistics = enrich_costco.parse_costco_item(
+            order_id="abc",
+            order_date="2026-03-12",
+            raw_path=Path("costco_output/raw/abc.json"),
+            line_no=2,
+            item={
+                "itemNumber": "1375005",
+                "itemDescription01": "LIFE 6'TABLE MDL #80873U - T12/H3/P36",
+                "itemDescription02": None,
+                "itemDepartmentNumber": 18,
+                "transDepartmentNumber": 18,
+                "unit": 1,
+                "itemIdentifier": "E",
+                "amount": 119.98,
+                "itemUnitPriceAmount": 119.98,
+            },
+        )
+        self.assertEqual("LIFE 6'TABLE MDL", logistics["item_name_norm"])
+
    def test_build_items_enriched_matches_discount_to_item(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            raw_dir = Path(tmpdir) / "raw"