Extend shared schema for retailer-native ids
This commit is contained in:
@@ -18,6 +18,7 @@ OUTPUT_FIELDS = [
|
||||
"line_no",
|
||||
"observed_item_key",
|
||||
"order_date",
|
||||
"retailer_item_id",
|
||||
"pod_id",
|
||||
"item_name",
|
||||
"upc",
|
||||
@@ -43,6 +44,8 @@ OUTPUT_FIELDS = [
|
||||
"measure_type",
|
||||
"is_store_brand",
|
||||
"is_fee",
|
||||
"is_discount_line",
|
||||
"is_coupon_line",
|
||||
"price_per_each",
|
||||
"price_per_lb",
|
||||
"price_per_oz",
|
||||
@@ -55,6 +58,8 @@ STORE_BRAND_PREFIXES = {
|
||||
"NP": "NP",
|
||||
}
|
||||
|
||||
# Expanded tokens that normalize_item_name skips when rebuilding an item name.
DROP_TOKENS = {"FRESH"}
|
||||
|
||||
ABBREVIATIONS = {
|
||||
"APPLE": "APPLE",
|
||||
"APPLES": "APPLES",
|
||||
@@ -234,9 +239,30 @@ def normalize_item_name(cleaned_name):
|
||||
base = normalize_whitespace(base[len(prefix):])
|
||||
|
||||
base = strip_measure_tokens(base)
|
||||
expanded_tokens = [expand_token(token) for token in base.split()]
|
||||
expanded_tokens = []
|
||||
for token in base.split():
|
||||
expanded = expand_token(token)
|
||||
if expanded in DROP_TOKENS:
|
||||
continue
|
||||
expanded_tokens.append(expanded)
|
||||
expanded = " ".join(token for token in expanded_tokens if token)
|
||||
return normalize_whitespace(expanded)
|
||||
return singularize_tokens(normalize_whitespace(expanded))
|
||||
|
||||
|
||||
def singularize_tokens(text):
    """Replace known plural tokens in ``text`` with their singular form.

    ``text`` is split on whitespace. A token that exactly matches a key of
    the lookup table (case-sensitive) is swapped for its singular; any other
    token passes through unchanged. The tokens are re-joined with single
    spaces and the result is passed through ``normalize_whitespace``.
    """
    plural_to_singular = {
        "APPLES": "APPLE",
        "BANANAS": "BANANA",
        "BERRIES": "BERRY",
        "EGGS": "EGG",
        "LEMONS": "LEMON",
        "LIMES": "LIME",
        "MANDARINS": "MANDARIN",
        "PEPPERS": "PEPPER",
        "STRAWBERRIES": "STRAWBERRY",
    }
    words = text.split()
    rejoined = " ".join(
        plural_to_singular[word] if word in plural_to_singular else word
        for word in words
    )
    return normalize_whitespace(rejoined)
|
||||
|
||||
|
||||
def guess_measure_type(item, size_unit, pack_qty):
|
||||
@@ -330,6 +356,7 @@ def parse_item(order_id, order_date, raw_path, line_no, item):
|
||||
"line_no": str(line_no),
|
||||
"observed_item_key": f"{RETAILER}:{order_id}:{line_no}",
|
||||
"order_date": normalize_whitespace(order_date),
|
||||
"retailer_item_id": stringify(item.get("podId")),
|
||||
"pod_id": stringify(item.get("podId")),
|
||||
"item_name": stringify(item.get("itemName")),
|
||||
"upc": stringify(item.get("primUpcCd")),
|
||||
@@ -355,6 +382,8 @@ def parse_item(order_id, order_date, raw_path, line_no, item):
|
||||
"measure_type": measure_type,
|
||||
"is_store_brand": "true" if bool(prefix) else "false",
|
||||
"is_fee": "true" if is_fee else "false",
|
||||
"is_discount_line": "false",
|
||||
"is_coupon_line": "false",
|
||||
"price_per_each": price_per_each,
|
||||
"price_per_lb": price_per_lb,
|
||||
"price_per_oz": price_per_oz,
|
||||
|
||||
Reference in New Issue
Block a user