Clean Costco normalization artifacts

This commit is contained in:
ben
2026-03-20 11:09:44 -04:00
parent 848d229f2d
commit bcec6b37d3
3 changed files with 59 additions and 5 deletions

View File

@@ -30,6 +30,11 @@ CODE_TOKEN_RE = re.compile(
)
# "<int> / <number> <unit>" notation, e.g. "24/16.9 OZ". Groups: (1) leading
# integer, (2) trailing number, (3) unit. Presumably pack count / per-unit
# size -- confirm against the size/pack parser that consumes these groups.
PACK_FRACTION_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*/\s*(\d+(?:\.\d+)?)\s*(OZ|LB|LBS|CT)\b")
# A number immediately followed by "#", e.g. "10#". Group 1 is the number.
# No trailing \b here: "#" is a non-word character, so "#\b" only succeeds
# when a word character follows directly ("10#BAG") and silently misses the
# usual "10# BAG" and end-of-string "10#" spellings.
HASH_SIZE_RE = re.compile(r"(?<![A-Z0-9])(\d+(?:\.\d+)?)#")
# "#" followed by a run of word characters, e.g. the "#80873U" model code.
ITEM_CODE_RE = re.compile(r"#\w+\b")
# The same weight written in two units separated by a slash,
# e.g. "2.27 KG / 5 LBS".
DUAL_WEIGHT_RE = re.compile(
    r"\b\d+(?:\.\d+)?\s*(?:KG|G|LB|LBS|OZ)\s*/\s*\d+(?:\.\d+)?\s*(?:KG|G|LB|LBS|OZ)\b"
)
# Slash-separated T<n>/H<n>/P<n> fragments, e.g. "T12/H3/P36".
# NOTE(review): the third alternative (T<n>/H<n>/?) is already covered by the
# first one, whose /P<n> part is optional; kept as-is to leave matching
# behavior untouched.
LOGISTICS_SLASH_RE = re.compile(r"\b(?:T\d+/H\d+(?:/P\d+)?/?|H\d+/P\d+/?|T\d+/H\d+/?)\b")
# "<int>-PACK" (dash form); group 1 is the integer.
PACK_DASH_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*-\s*PACK\b")
# "<int> PACK" / "<int>PACK" (word form); group 1 is the integer.
PACK_WORD_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*PACK\b")
# A single "<number> <unit>" size; groups: (1) number, (2) unit.
SIZE_RE = re.compile(r"(?<![A-Z0-9])(\d+(?:\.\d+)?)\s*(OZ|LB|LBS|CT|KG|G)\b")
@@ -98,12 +103,17 @@ def normalize_costco_name(cleaned_name):
base = PACK_FRACTION_RE.sub(" ", base)
else:
base = SIZE_RE.sub(" ", base)
base = DUAL_WEIGHT_RE.sub(" ", base)
base = HASH_SIZE_RE.sub(" ", base)
base = ITEM_CODE_RE.sub(" ", base)
base = LOGISTICS_SLASH_RE.sub(" ", base)
base = PACK_DASH_RE.sub(" ", base)
base = PACK_WORD_RE.sub(" ", base)
base = normalize_whitespace(base)
tokens = []
for token in base.split():
if token in {"/", "-"}:
continue
if token in {"ORG"}:
continue
if token in {"PEANUT", "BUTTER"} and "JIF" in base:

View File

@@ -1,5 +1,5 @@
#+title: Scrape-Giant Task Log
#+STARTUP: overview
* [X] t1.1: harden giant receipt fetch cli (2-4 commits)
** acceptance criteria
- giant scraper runs from cli with prompts or env-backed defaults for `user_id` and `loyalty`
@@ -580,14 +580,14 @@ bring on-disk outputs fully into the target `data/` structure without changing r
** evidence
- commit: `d2e6f2a`
- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python build_purchases.py`; `./venv/bin/python review_products.py --refresh-only`; `./venv/bin/python report_pipeline_status.py`; `./venv/bin/python build_purchases.py --help`; `./venv/bin/python review_products.py --help`; `./venv/bin/python report_pipeline_status.py --help`; verified `data/giant-web/collected_orders.csv`, `data/giant-web/collected_items.csv`, `data/costco-web/collected_orders.csv`, `data/costco-web/collected_items.csv`, `data/catalog.csv`, and archived transitional review outputs under `data/review/archive/`
- datetime: 2026-03-20 10:04:15 EDT
- datetime: [2026-03-20 10:04:15 EDT]
** notes
- No re-collection was needed; existing raw and collected exports were adapted in place and moved into the target names.
- Updated the active script defaults to point at `data/...` so the code and on-disk layout now agree.
- Kept obviously obsolete review artifacts, but moved them under `data/review/archive/` instead of deleting them outright.
* [ ] t1.14.3: retailer-specific Costco normalization cleanup (2-4 commits)
* [X] t1.14.3: retailer-specific Costco normalization cleanup (2-4 commits)
tighten Costco-specific normalization so normalized item names are cleaner and deterministic retailer grouping is less noisy
** Acceptance Criteria
@@ -616,10 +616,13 @@ tighten Costco-specific normalization so normalized item names are cleaner and d
** evidence
- commit:
- tests:
- datetime:
- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python -m unittest tests.test_costco_pipeline`; `./venv/bin/python normalize_costco_web.py`; verified live cleaned examples in `data/costco-web/normalized_items.csv`, including `MANDARINS 2.27 KG / 5 LBS -> MANDARIN` and `LIFE 6'TABLE MDL #80873U - T12/H3/P36 -> LIFE 6'TABLE MDL`
- datetime: 2026-03-20 11:09:32 EDT
** notes
- Kept this explicitly Costco-specific and narrow: the cleanup removes known logistics/code artifacts and orphan slash tokens without introducing fuzzy naming logic.
- The structured parsing still owns size/pack extraction, so name cleanup can safely strip dual-unit and logistics fragments after those fields are parsed.
- Discount-line behavior remains unchanged; this task only cleaned normalized names and preserved the existing audit trail.
* [ ] t1.15: refactor review/combine pipeline around normalized_item_id and catalog links (4-8 commits)
replace the old observed/canonical workflow with a review-first pipeline that uses normalized_item_id as the retailer-level review unit and links it to catalog items

View File

@@ -285,6 +285,47 @@ class CostcoPipelineTests(unittest.TestCase):
self.assertEqual("true", discount["is_coupon_line"])
self.assertEqual("false", discount["is_item"])
# Regression test for Costco name cleanup: parses two raw item payloads with
# enrich_costco.parse_costco_item and checks the resulting "item_name_norm"
# (and, for the first item, the parsed size fields).
def test_costco_name_cleanup_removes_dual_weight_and_logistics_artifacts(self):
# Case 1: description carries the same weight in two units ("2.27 KG / 5 LBS").
mixed_units = enrich_costco.parse_costco_item(
order_id="abc",
order_date="2026-03-12",
raw_path=Path("costco_output/raw/abc.json"),
line_no=1,
item={
"itemNumber": "18600",
"itemDescription01": "MANDARINS 2.27 KG / 5 LBS",
"itemDescription02": None,
"itemDepartmentNumber": 65,
"transDepartmentNumber": 65,
"unit": 1,
"itemIdentifier": "E",
"amount": 7.49,
"itemUnitPriceAmount": 7.49,
},
)
# The weight fragment must be gone from the name, while the structured size
# fields still report the 5 lb value.
self.assertEqual("MANDARIN", mixed_units["item_name_norm"])
self.assertEqual("5", mixed_units["size_value"])
self.assertEqual("lb", mixed_units["size_unit"])
# Case 2: description ends with a "#..." code and a T/H/P slash fragment.
logistics = enrich_costco.parse_costco_item(
order_id="abc",
order_date="2026-03-12",
raw_path=Path("costco_output/raw/abc.json"),
line_no=2,
item={
"itemNumber": "1375005",
"itemDescription01": "LIFE 6'TABLE MDL #80873U - T12/H3/P36",
"itemDescription02": None,
"itemDepartmentNumber": 18,
"transDepartmentNumber": 18,
"unit": 1,
"itemIdentifier": "E",
"amount": 119.98,
"itemUnitPriceAmount": 119.98,
},
)
# Both the "#80873U" code and the "- T12/H3/P36" tail must be stripped.
self.assertEqual("LIFE 6'TABLE MDL", logistics["item_name_norm"])
def test_build_items_enriched_matches_discount_to_item(self):
with tempfile.TemporaryDirectory() as tmpdir:
raw_dir = Path(tmpdir) / "raw"