From 9497565978d2186a7f9875384d87dd4a3b17ed7e Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 16 Mar 2026 09:17:36 -0400 Subject: [PATCH] Extend shared schema for retailer-native ids --- build_canonical_layer.py | 6 +++++- build_observed_products.py | 25 +++++++++++++++++++++++++ build_review_queue.py | 9 ++++++++- enrich_giant.py | 33 +++++++++++++++++++++++++++++++-- tests/test_canonical_layer.py | 15 +++++++++++++++ tests/test_enrich_giant.py | 1 + tests/test_observed_products.py | 7 +++++++ tests/test_review_queue.py | 9 +++++++++ 8 files changed, 101 insertions(+), 4 deletions(-) diff --git a/build_canonical_layer.py b/build_canonical_layer.py index c0410a0..3df2caa 100644 --- a/build_canonical_layer.py +++ b/build_canonical_layer.py @@ -58,7 +58,11 @@ def normalized_quantity(row): def auto_link_rule(observed_row): - if observed_row.get("is_fee") == "true": + if ( + observed_row.get("is_fee") == "true" + or observed_row.get("is_discount_line") == "true" + or observed_row.get("is_coupon_line") == "true" + ): return "", "", "" if observed_row.get("representative_upc"): diff --git a/build_observed_products.py b/build_observed_products.py index 3874d7b..807a6a3 100644 --- a/build_observed_products.py +++ b/build_observed_products.py @@ -17,6 +17,7 @@ OUTPUT_FIELDS = [ "observed_product_id", "retailer", "observed_key", + "representative_retailer_item_id", "representative_upc", "representative_item_name", "representative_name_norm", @@ -29,6 +30,8 @@ OUTPUT_FIELDS = [ "representative_image_url", "is_store_brand", "is_fee", + "is_discount_line", + "is_coupon_line", "first_seen_date", "last_seen_date", "times_seen", @@ -38,6 +41,7 @@ OUTPUT_FIELDS = [ "normalized_name_examples", "example_prices", "distinct_item_names_count", + "distinct_retailer_item_ids_count", "distinct_upcs_count", ] @@ -52,6 +56,17 @@ def build_observed_key(row): ] ) + if row.get("retailer_item_id"): + return "|".join( + [ + row["retailer"], + f"retailer_item_id={row['retailer_item_id']}", + f"name={row['item_name_norm']}", + f"discount={row.get('is_discount_line', 'false')}", + f"coupon={row.get('is_coupon_line', 'false')}", + ] + ) + return "|".join( [ row["retailer"], @@ -82,6 +97,9 @@ def build_observed_products(rows): "observed_product_id": stable_id("gobs", observed_key), "retailer": ordered[0]["retailer"], "observed_key": observed_key, + "representative_retailer_item_id": representative_value( + ordered, "retailer_item_id" + ), "representative_upc": representative_value(ordered, "upc"), "representative_item_name": representative_value(ordered, "item_name"), "representative_name_norm": representative_value( @@ -98,6 +116,10 @@ def build_observed_products(rows): "representative_image_url": first_nonblank(ordered, "image_url"), "is_store_brand": representative_value(ordered, "is_store_brand"), "is_fee": representative_value(ordered, "is_fee"), + "is_discount_line": representative_value( + ordered, "is_discount_line" + ), + "is_coupon_line": representative_value(ordered, "is_coupon_line"), "first_seen_date": ordered[0]["order_date"], "last_seen_date": ordered[-1]["order_date"], "times_seen": str(len(ordered)), @@ -115,6 +137,9 @@ def build_observed_products(rows): "distinct_item_names_count": str( len(distinct_values(ordered, "item_name")) ), + "distinct_retailer_item_ids_count": str( + len(distinct_values(ordered, "retailer_item_id")) + ), "distinct_upcs_count": str(len(distinct_values(ordered, "upc"))), } ) diff --git a/build_review_queue.py b/build_review_queue.py index 2ff6f35..b0432a2 100644 --- a/build_review_queue.py +++ b/build_review_queue.py @@ -37,7 +37,11 @@ def existing_review_state(path): def review_reasons(observed_row): reasons = [] - if observed_row["is_fee"] == "true": + if ( + observed_row["is_fee"] == "true" + or observed_row.get("is_discount_line") == "true" + or observed_row.get("is_coupon_line") == "true" + ): return reasons if observed_row["distinct_upcs_count"] not in {"", "0", "1"}: reasons.append(("multiple_upcs", "high")) @@ -119,6 +123,7 @@ def attach_observed_ids(item_rows, observed_rows): ) if row.get("upc") else "|".join( [ row["retailer"], + f"retailer_item_id={row.get('retailer_item_id', '')}", f"name={row['item_name_norm']}", f"size={row['size_value']}", f"unit={row['size_unit']}", @@ -126,6 +131,8 @@ def attach_observed_ids(item_rows, observed_rows): f"measure={row['measure_type']}", f"store_brand={row['is_store_brand']}", f"fee={row['is_fee']}", + f"discount={row.get('is_discount_line', 'false')}", + f"coupon={row.get('is_coupon_line', 'false')}", ] ) enriched = dict(row) diff --git a/enrich_giant.py b/enrich_giant.py index 7eb93e8..60039e8 100644 --- a/enrich_giant.py +++ b/enrich_giant.py @@ -18,6 +18,7 @@ OUTPUT_FIELDS = [ "line_no", "observed_item_key", "order_date", + "retailer_item_id", "pod_id", "item_name", "upc", @@ -43,6 +44,8 @@ OUTPUT_FIELDS = [ "measure_type", "is_store_brand", "is_fee", + "is_discount_line", + "is_coupon_line", "price_per_each", "price_per_lb", "price_per_oz", @@ -55,6 +58,8 @@ STORE_BRAND_PREFIXES = { "NP": "NP", } +DROP_TOKENS = {"FRESH"} + ABBREVIATIONS = { "APPLE": "APPLE", "APPLES": "APPLES", @@ -234,9 +239,30 @@ def normalize_item_name(cleaned_name): base = normalize_whitespace(base[len(prefix):]) base = strip_measure_tokens(base) - expanded_tokens = [expand_token(token) for token in base.split()] + expanded_tokens = [] + for token in base.split(): + expanded = expand_token(token) + if expanded in DROP_TOKENS: + continue + expanded_tokens.append(expanded) expanded = " ".join(token for token in expanded_tokens if token) - return normalize_whitespace(expanded) + return singularize_tokens(normalize_whitespace(expanded)) + + +def singularize_tokens(text): + singular_map = { + "APPLES": "APPLE", + "BANANAS": "BANANA", + "BERRIES": "BERRY", + "EGGS": "EGG", + "LEMONS": "LEMON", + "LIMES": "LIME", + "MANDARINS": "MANDARIN", + "PEPPERS": "PEPPER", + "STRAWBERRIES": "STRAWBERRY", + } + tokens = [singular_map.get(token, token) for token in text.split()] + return normalize_whitespace(" ".join(tokens)) def guess_measure_type(item, size_unit, pack_qty): @@ -330,6 +356,7 @@ def parse_item(order_id, order_date, raw_path, line_no, item): "line_no": str(line_no), "observed_item_key": f"{RETAILER}:{order_id}:{line_no}", "order_date": normalize_whitespace(order_date), + "retailer_item_id": stringify(item.get("podId")), "pod_id": stringify(item.get("podId")), "item_name": stringify(item.get("itemName")), "upc": stringify(item.get("primUpcCd")), @@ -355,6 +382,8 @@ def parse_item(order_id, order_date, raw_path, line_no, item): "measure_type": measure_type, "is_store_brand": "true" if bool(prefix) else "false", "is_fee": "true" if is_fee else "false", + "is_discount_line": "false", + "is_coupon_line": "false", "price_per_each": price_per_each, "price_per_lb": price_per_lb, "price_per_oz": price_per_oz, diff --git a/tests/test_canonical_layer.py b/tests/test_canonical_layer.py index 0bb67c6..5b45d44 100644 --- a/tests/test_canonical_layer.py +++ b/tests/test_canonical_layer.py @@ -9,6 +9,7 @@ class CanonicalLayerTests(unittest.TestCase): { "observed_product_id": "gobs_1", "representative_upc": "111", + "representative_retailer_item_id": "11", "representative_name_norm": "GALA APPLE", "representative_brand": "SB", "representative_variant": "", @@ -17,10 +18,13 @@ class CanonicalLayerTests(unittest.TestCase): "representative_pack_qty": "", "representative_measure_type": "weight", "is_fee": "false", + "is_discount_line": "false", + "is_coupon_line": "false", }, { "observed_product_id": "gobs_2", "representative_upc": "111", + "representative_retailer_item_id": "12", "representative_name_norm": "LARGE WHITE EGGS", "representative_brand": "SB", "representative_variant": "", @@ -29,10 +33,13 @@ class CanonicalLayerTests(unittest.TestCase): "representative_pack_qty": "18", "representative_measure_type": "count", "is_fee": "false", + "is_discount_line": "false", + "is_coupon_line": "false", }, { "observed_product_id": "gobs_3", "representative_upc": "", + "representative_retailer_item_id": "21", "representative_name_norm": "ROTINI", "representative_brand": "", "representative_variant": "", @@ -41,10 +48,13 @@ class CanonicalLayerTests(unittest.TestCase): "representative_pack_qty": "", "representative_measure_type": "weight", "is_fee": "false", + "is_discount_line": "false", + "is_coupon_line": "false", }, { "observed_product_id": "gobs_4", "representative_upc": "", + "representative_retailer_item_id": "22", "representative_name_norm": "ROTINI", "representative_brand": "SB", "representative_variant": "", @@ -53,10 +63,13 @@ class CanonicalLayerTests(unittest.TestCase): "representative_pack_qty": "", "representative_measure_type": "weight", "is_fee": "false", + "is_discount_line": "false", + "is_coupon_line": "false", }, { "observed_product_id": "gobs_5", "representative_upc": "", + "representative_retailer_item_id": "99", "representative_name_norm": "GL BAG CHARGE", "representative_brand": "", "representative_variant": "", @@ -65,6 +78,8 @@ class CanonicalLayerTests(unittest.TestCase): "representative_pack_qty": "", "representative_measure_type": "each", "is_fee": "true", + "is_discount_line": "false", + "is_coupon_line": "false", }, ] diff --git a/tests/test_enrich_giant.py b/tests/test_enrich_giant.py index 7e7e282..39a34ff 100644 --- a/tests/test_enrich_giant.py +++ b/tests/test_enrich_giant.py @@ -177,6 +177,7 @@ class EnrichGiantTests(unittest.TestCase): self.assertEqual("PEPSI", rows[0]["item_name_norm"]) self.assertEqual("6", rows[0]["pack_qty"]) self.assertEqual("7.5", rows[0]["size_value"]) + self.assertEqual("10", rows[0]["retailer_item_id"]) self.assertEqual("true", rows[1]["is_store_brand"]) with output_csv.open(newline="", encoding="utf-8") as handle: diff --git a/tests/test_observed_products.py b/tests/test_observed_products.py index 753babd..90a7a5e 100644 --- a/tests/test_observed_products.py +++ b/tests/test_observed_products.py @@ -13,6 +13,7 @@ class ObservedProductTests(unittest.TestCase): "order_date": "2026-01-01", "item_name": "SB GALA APPLE 5LB", "item_name_norm": "GALA APPLE", + "retailer_item_id": "11", "upc": "111", "brand_guess": "SB", "variant": "", @@ -23,6 +24,8 @@ class ObservedProductTests(unittest.TestCase): "image_url": "https://example.test/a.jpg", "is_store_brand": "true", "is_fee": "false", + "is_discount_line": "false", + "is_coupon_line": "false", "line_total": "7.99", }, { @@ -32,6 +35,7 @@ class ObservedProductTests(unittest.TestCase): "order_date": "2026-01-10", "item_name": "SB GALA APPLE 5 LB", "item_name_norm": "GALA APPLE", + "retailer_item_id": "11", "upc": "111", "brand_guess": "SB", "variant": "", @@ -42,6 +46,8 @@ class ObservedProductTests(unittest.TestCase): "image_url": "", "is_store_brand": "true", "is_fee": "false", + "is_discount_line": "false", + "is_coupon_line": "false", "line_total": "8.49", }, ] @@ -52,6 +58,7 @@ class ObservedProductTests(unittest.TestCase): self.assertEqual("2", observed[0]["times_seen"]) self.assertEqual("2026-01-01", observed[0]["first_seen_date"]) self.assertEqual("2026-01-10", observed[0]["last_seen_date"]) + self.assertEqual("11", observed[0]["representative_retailer_item_id"]) self.assertEqual("111", observed[0]["representative_upc"]) self.assertIn("SB GALA APPLE 5LB", observed[0]["raw_name_examples"]) diff --git a/tests/test_review_queue.py b/tests/test_review_queue.py index c644227..3843700 100644 --- a/tests/test_review_queue.py +++ b/tests/test_review_queue.py @@ -20,6 +20,8 @@ class ReviewQueueTests(unittest.TestCase): "distinct_item_names_count": "2", "distinct_upcs_count": "1", "is_fee": "false", + "is_discount_line": "false", + "is_coupon_line": "false", } ] item_rows = [ @@ -64,6 +66,7 @@ class ReviewQueueTests(unittest.TestCase): "observed_product_id": "gobs_1", "retailer": "giant", "observed_key": "giant|upc=111|name=GALA APPLE", + "representative_retailer_item_id": "11", "representative_upc": "111", "representative_item_name": "SB GALA APPLE 5LB", "representative_name_norm": "GALA APPLE", @@ -76,6 +79,8 @@ class ReviewQueueTests(unittest.TestCase): "representative_image_url": "", "is_store_brand": "true", "is_fee": "false", + "is_discount_line": "false", + "is_coupon_line": "false", "first_seen_date": "2026-01-01", "last_seen_date": "2026-01-10", "times_seen": "2", @@ -85,6 +90,7 @@ class ReviewQueueTests(unittest.TestCase): "normalized_name_examples": "GALA APPLE", "example_prices": "7.99 | 8.49", "distinct_item_names_count": "2", + "distinct_retailer_item_ids_count": "1", "distinct_upcs_count": "1", } ] @@ -95,6 +101,7 @@ class ReviewQueueTests(unittest.TestCase): "line_no": "1", "item_name": "SB GALA APPLE 5LB", "item_name_norm": "GALA APPLE", + "retailer_item_id": "11", "upc": "111", "size_value": "5", "size_unit": "lb", @@ -102,6 +109,8 @@ class ReviewQueueTests(unittest.TestCase): "measure_type": "weight", "is_store_brand": "true", "is_fee": "false", + "is_discount_line": "false", + "is_coupon_line": "false", "line_total": "7.99", } ]