Clean Costco normalization artifacts

This commit is contained in:
ben
2026-03-20 11:09:44 -04:00
parent 848d229f2d
commit bcec6b37d3
3 changed files with 59 additions and 5 deletions

View File

@@ -30,6 +30,11 @@ CODE_TOKEN_RE = re.compile(
)
# NOTE: none of these patterns set re.IGNORECASE, so unit words only match in
# upper case -- presumably the name is upper-cased before it gets here
# (TODO confirm against the caller).

# "<count>/<size> <unit>" pairs such as "6/32 OZ"; groups: count, size, unit.
PACK_FRACTION_RE = re.compile(
    r"(?<![A-Z0-9])(\d+)\s*/\s*(\d+(?:\.\d+)?)\s*(OZ|LB|LBS|CT)\b"
)

# A number written directly against "#", e.g. "25#" (presumably pounds --
# TODO confirm); group 1 is the number.
# There is deliberately no trailing "\b": "#" is a non-word character, so
# "#\b" could only match when a letter/digit/underscore came right after the
# hash, and the usual "25#" at end of string or before a space was skipped.
HASH_SIZE_RE = re.compile(r"(?<![A-Z0-9])(\d+(?:\.\d+)?)#")

# "#" followed by a run of word characters, e.g. "#12345".  The greedy "\w+"
# already stops at a word boundary, so no explicit "\b" is required.
ITEM_CODE_RE = re.compile(r"#\w+")

# Two weights separated by a slash, e.g. "1.36 KG / 3 LB".
DUAL_WEIGHT_RE = re.compile(
    r"\b\d+(?:\.\d+)?\s*(?:KG|G|LB|LBS|OZ)\s*/\s*\d+(?:\.\d+)?\s*(?:KG|G|LB|LBS|OZ)\b"
)

# Slash-separated codes of the form "T<n>/H<n>", "T<n>/H<n>/P<n>" or
# "H<n>/P<n>".  The optional "/P<n>" group already covers the bare "T/H" form,
# so it needs no alternative of its own.  Because of the closing "\b", a
# trailing "/" is only consumed when a word character follows it.
LOGISTICS_SLASH_RE = re.compile(r"\b(?:T\d+/H\d+(?:/P\d+)?/?|H\d+/P\d+/?)\b")

# "<n>-PACK" and "<n> PACK" / "<n>PACK"; group 1 is the count.
PACK_DASH_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*-\s*PACK\b")
PACK_WORD_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*PACK\b")

# A single "<number> <unit>" size such as "24CT" or "1.5 LB"; groups: number, unit.
SIZE_RE = re.compile(r"(?<![A-Z0-9])(\d+(?:\.\d+)?)\s*(OZ|LB|LBS|CT|KG|G)\b")
@@ -98,12 +103,17 @@ def normalize_costco_name(cleaned_name):
base = PACK_FRACTION_RE.sub(" ", base)
else:
base = SIZE_RE.sub(" ", base)
base = DUAL_WEIGHT_RE.sub(" ", base)
base = HASH_SIZE_RE.sub(" ", base)
base = ITEM_CODE_RE.sub(" ", base)
base = LOGISTICS_SLASH_RE.sub(" ", base)
base = PACK_DASH_RE.sub(" ", base)
base = PACK_WORD_RE.sub(" ", base)
base = normalize_whitespace(base)
tokens = []
for token in base.split():
if token in {"/", "-"}:
continue
if token in {"ORG"}:
continue
if token in {"PEANUT", "BUTTER"} and "JIF" in base: