Extend shared schema for retailer-native ids

This commit is contained in:
ben
2026-03-16 09:17:36 -04:00
parent d20a131e04
commit 9497565978
8 changed files with 101 additions and 4 deletions

View File

@@ -18,6 +18,7 @@ OUTPUT_FIELDS = [
"line_no",
"observed_item_key",
"order_date",
"retailer_item_id",
"pod_id",
"item_name",
"upc",
@@ -43,6 +44,8 @@ OUTPUT_FIELDS = [
"measure_type",
"is_store_brand",
"is_fee",
"is_discount_line",
"is_coupon_line",
"price_per_each",
"price_per_lb",
"price_per_oz",
@@ -55,6 +58,8 @@ STORE_BRAND_PREFIXES = {
"NP": "NP",
}
DROP_TOKENS = {"FRESH"}
ABBREVIATIONS = {
"APPLE": "APPLE",
"APPLES": "APPLES",
@@ -234,9 +239,30 @@ def normalize_item_name(cleaned_name):
base = normalize_whitespace(base[len(prefix):])
base = strip_measure_tokens(base)
expanded_tokens = [expand_token(token) for token in base.split()]
expanded_tokens = []
for token in base.split():
expanded = expand_token(token)
if expanded in DROP_TOKENS:
continue
expanded_tokens.append(expanded)
expanded = " ".join(token for token in expanded_tokens if token)
return normalize_whitespace(expanded)
return singularize_tokens(normalize_whitespace(expanded))
def singularize_tokens(text):
    """Replace known plural tokens in *text* with their singular form.

    The input is split on whitespace and each token is looked up, by exact
    match, in a fixed plural-to-singular table. Tokens that are not in the
    table are kept unchanged. The tokens are re-joined with single spaces
    and the result is passed through ``normalize_whitespace`` before being
    returned.
    """
    # Exact-match table: only these spellings are rewritten.
    plural_to_singular = dict(
        APPLES="APPLE",
        BANANAS="BANANA",
        BERRIES="BERRY",
        EGGS="EGG",
        LEMONS="LEMON",
        LIMES="LIME",
        MANDARINS="MANDARIN",
        PEPPERS="PEPPER",
        STRAWBERRIES="STRAWBERRY",
    )
    rejoined = " ".join(
        plural_to_singular.get(word, word) for word in text.split()
    )
    return normalize_whitespace(rejoined)
def guess_measure_type(item, size_unit, pack_qty):
@@ -330,6 +356,7 @@ def parse_item(order_id, order_date, raw_path, line_no, item):
"line_no": str(line_no),
"observed_item_key": f"{RETAILER}:{order_id}:{line_no}",
"order_date": normalize_whitespace(order_date),
"retailer_item_id": stringify(item.get("podId")),
"pod_id": stringify(item.get("podId")),
"item_name": stringify(item.get("itemName")),
"upc": stringify(item.get("primUpcCd")),
@@ -355,6 +382,8 @@ def parse_item(order_id, order_date, raw_path, line_no, item):
"measure_type": measure_type,
"is_store_brand": "true" if bool(prefix) else "false",
"is_fee": "true" if is_fee else "false",
"is_discount_line": "false",
"is_coupon_line": "false",
"price_per_each": price_per_each,
"price_per_lb": price_per_lb,
"price_per_oz": price_per_oz,