From da00288f1055043b11889d4aa5cad573ad9ecf15 Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 16 Mar 2026 09:17:46 -0400 Subject: [PATCH] Add Costco acquisition and enrich flow --- enrich_costco.py | 271 +++++++++++++++++++ scrape_costco.py | 464 ++++++++++++++++++++++++++++++++ tests/test_costco_pipeline.py | 201 ++++++++++++++ validate_cross_retailer_flow.py | 154 +++++++++++ 4 files changed, 1090 insertions(+) create mode 100644 enrich_costco.py create mode 100644 scrape_costco.py create mode 100644 tests/test_costco_pipeline.py create mode 100644 validate_cross_retailer_flow.py diff --git a/enrich_costco.py b/enrich_costco.py new file mode 100644 index 0000000..8129c64 --- /dev/null +++ b/enrich_costco.py @@ -0,0 +1,271 @@ +import csv +import json +import re +from pathlib import Path + +import click + +from enrich_giant import ( + OUTPUT_FIELDS, + format_decimal, + normalize_number, + normalize_unit, + normalize_whitespace, + singularize_tokens, + to_decimal, +) + + +PARSER_VERSION = "costco-enrich-v1" +RETAILER = "costco" +DEFAULT_INPUT_DIR = Path("costco_output/raw") +DEFAULT_OUTPUT_CSV = Path("costco_output/items_enriched.csv") + +CODE_TOKEN_RE = re.compile( + r"\b(?:SL\d+|T\d+H\d+|P\d+(?:/\d+)?|W\d+T\d+H\d+|FY\d+|CSPC#|C\d+T\d+H\d+|EC\d+T\d+H\d+|\d+X\d+)\b" +) +PACK_FRACTION_RE = re.compile(r"(?