Clean Costco normalization artifacts
This commit is contained in:
@@ -30,6 +30,11 @@ CODE_TOKEN_RE = re.compile(
|
||||
)
|
||||
# Integer, slash, number, then a unit (OZ/LB/LBS/CT), e.g. "6/8 OZ".
# Groups: (1) leading integer, (2) number after the slash, (3) unit.
PACK_FRACTION_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*/\s*(\d+(?:\.\d+)?)\s*(OZ|LB|LBS|CT)\b")
# Number immediately followed by "#", e.g. "25#"; group 1 is the number.
# No trailing \b here: "#" is a non-word character, so "#\b" would only match
# when a word character follows and would miss "25#" at end of string or
# before whitespace.
HASH_SIZE_RE = re.compile(r"(?<![A-Z0-9])(\d+(?:\.\d+)?)#")
# "#" followed by a run of word characters, e.g. "#80873U".
ITEM_CODE_RE = re.compile(r"#\w+\b")
# Two number+unit weights joined by a slash, e.g. "2.27 KG / 5 LBS".
DUAL_WEIGHT_RE = re.compile(
    r"\b\d+(?:\.\d+)?\s*(?:KG|G|LB|LBS|OZ)\s*/\s*\d+(?:\.\d+)?\s*(?:KG|G|LB|LBS|OZ)\b"
)
# Slash-separated T<n>/H<n>/P<n> style fragments, e.g. "T12/H3/P36".
LOGISTICS_SLASH_RE = re.compile(r"\b(?:T\d+/H\d+(?:/P\d+)?/?|H\d+/P\d+/?|T\d+/H\d+/?)\b")
# "<n>-PACK" with optional spaces around the dash; group 1 is the count.
PACK_DASH_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*-\s*PACK\b")
# "<n> PACK" / "<n>PACK"; group 1 is the count.
PACK_WORD_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*PACK\b")
# Single number followed by a unit (OZ/LB/LBS/CT/KG/G); groups: (1) number, (2) unit.
SIZE_RE = re.compile(r"(?<![A-Z0-9])(\d+(?:\.\d+)?)\s*(OZ|LB|LBS|CT|KG|G)\b")
|
||||
@@ -98,12 +103,17 @@ def normalize_costco_name(cleaned_name):
|
||||
base = PACK_FRACTION_RE.sub(" ", base)
|
||||
else:
|
||||
base = SIZE_RE.sub(" ", base)
|
||||
base = DUAL_WEIGHT_RE.sub(" ", base)
|
||||
base = HASH_SIZE_RE.sub(" ", base)
|
||||
base = ITEM_CODE_RE.sub(" ", base)
|
||||
base = LOGISTICS_SLASH_RE.sub(" ", base)
|
||||
base = PACK_DASH_RE.sub(" ", base)
|
||||
base = PACK_WORD_RE.sub(" ", base)
|
||||
base = normalize_whitespace(base)
|
||||
tokens = []
|
||||
for token in base.split():
|
||||
if token in {"/", "-"}:
|
||||
continue
|
||||
if token in {"ORG"}:
|
||||
continue
|
||||
if token in {"PEANUT", "BUTTER"} and "JIF" in base:
|
||||
|
||||
13
pm/tasks.org
13
pm/tasks.org
@@ -1,5 +1,5 @@
|
||||
#+title: Scrape-Giant Task Log
|
||||
|
||||
#+STARTUP: overview
|
||||
* [X] t1.1: harden giant receipt fetch cli (2-4 commits)
|
||||
** acceptance criteria
|
||||
- giant scraper runs from cli with prompts or env-backed defaults for `user_id` and `loyalty`
|
||||
@@ -580,14 +580,14 @@ bring on-disk outputs fully into the target `data/` structure without changing r
|
||||
** evidence
|
||||
- commit: `d2e6f2a`
|
||||
- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python build_purchases.py`; `./venv/bin/python review_products.py --refresh-only`; `./venv/bin/python report_pipeline_status.py`; `./venv/bin/python build_purchases.py --help`; `./venv/bin/python review_products.py --help`; `./venv/bin/python report_pipeline_status.py --help`; verified `data/giant-web/collected_orders.csv`, `data/giant-web/collected_items.csv`, `data/costco-web/collected_orders.csv`, `data/costco-web/collected_items.csv`, `data/catalog.csv`, and archived transitional review outputs under `data/review/archive/`
|
||||
- datetime: 2026-03-20 10:04:15 EDT
|
||||
- datetime: [2026-03-20 10:04:15 EDT]
|
||||
|
||||
** notes
|
||||
- No re-collection was needed; existing raw and collected exports were adapted in place and moved into the target names.
|
||||
- Updated the active script defaults to point at `data/...` so the code and on-disk layout now agree.
|
||||
- Kept obviously obsolete review artifacts, but moved them under `data/review/archive/` instead of deleting them outright.
|
||||
|
||||
* [ ] t1.14.3: retailer-specific Costco normalization cleanup (2-4 commits)
|
||||
* [X] t1.14.3: retailer-specific Costco normalization cleanup (2-4 commits)
|
||||
tighten Costco-specific normalization so normalized item names are cleaner and deterministic retailer grouping is less noisy
|
||||
|
||||
** Acceptance Criteria
|
||||
@@ -616,10 +616,13 @@ tighten Costco-specific normalization so normalized item names are cleaner and d
|
||||
|
||||
** evidence
|
||||
- commit:
|
||||
- tests:
|
||||
- datetime:
|
||||
- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python -m unittest tests.test_costco_pipeline`; `./venv/bin/python normalize_costco_web.py`; verified live cleaned examples in `data/costco-web/normalized_items.csv`, including `MANDARINS 2.27 KG / 5 LBS -> MANDARIN` and `LIFE 6'TABLE MDL #80873U - T12/H3/P36 -> LIFE 6'TABLE MDL`
|
||||
- datetime: 2026-03-20 11:09:32 EDT
|
||||
|
||||
** notes
|
||||
- Kept this explicitly Costco-specific and narrow: the cleanup removes known logistics/code artifacts and orphan slash tokens without introducing fuzzy naming logic.
|
||||
- The structured parsing still owns size/pack extraction, so name cleanup can safely strip dual-unit and logistics fragments after those fields are parsed.
|
||||
- Discount-line behavior remains unchanged; this task only cleaned normalized names and preserved the existing audit trail.
|
||||
|
||||
* [ ] t1.15: refactor review/combine pipeline around normalized_item_id and catalog links (4-8 commits)
|
||||
replace the old observed/canonical workflow with a review-first pipeline that uses normalized_item_id as the retailer-level review unit and links it to catalog items
|
||||
|
||||
@@ -285,6 +285,47 @@ class CostcoPipelineTests(unittest.TestCase):
|
||||
self.assertEqual("true", discount["is_coupon_line"])
|
||||
self.assertEqual("false", discount["is_item"])
|
||||
|
||||
def test_costco_name_cleanup_removes_dual_weight_and_logistics_artifacts(self):
    """Check normalized names for a dual-weight and a logistics-code description.

    Parses two Costco line items and asserts that the dual-unit weight
    ("2.27 KG / 5 LBS") and the item-code/logistics suffix
    ("#80873U - T12/H3/P36") are absent from ``item_name_norm``, and that
    the size fields of the first item are "5" / "lb".
    """

    def parse_line(line_no, item_number, description, department, price):
        # Both fixtures share the same order metadata and payload layout;
        # only the arguments of this helper vary between them.
        payload = {
            "itemNumber": item_number,
            "itemDescription01": description,
            "itemDescription02": None,
            "itemDepartmentNumber": department,
            "transDepartmentNumber": department,
            "unit": 1,
            "itemIdentifier": "E",
            "amount": price,
            "itemUnitPriceAmount": price,
        }
        return enrich_costco.parse_costco_item(
            order_id="abc",
            order_date="2026-03-12",
            raw_path=Path("costco_output/raw/abc.json"),
            line_no=line_no,
            item=payload,
        )

    mixed_units = parse_line(1, "18600", "MANDARINS 2.27 KG / 5 LBS", 65, 7.49)
    expected_fields = (
        ("item_name_norm", "MANDARIN"),
        ("size_value", "5"),
        ("size_unit", "lb"),
    )
    for field, expected in expected_fields:
        self.assertEqual(expected, mixed_units[field])

    logistics = parse_line(
        2, "1375005", "LIFE 6'TABLE MDL #80873U - T12/H3/P36", 18, 119.98
    )
    self.assertEqual("LIFE 6'TABLE MDL", logistics["item_name_norm"])
|
||||
|
||||
def test_build_items_enriched_matches_discount_to_item(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
raw_dir = Path(tmpdir) / "raw"
|
||||
|
||||
Reference in New Issue
Block a user