diff --git a/build_canonical_layer.py b/build_canonical_layer.py
index 3df2caa..d16addf 100644
--- a/build_canonical_layer.py
+++ b/build_canonical_layer.py
@@ -1,4 +1,5 @@
 import click
+import re
 
 from layer_helpers import read_csv_rows, representative_value, stable_id, write_csv_rows
 
@@ -20,6 +21,8 @@ CANONICAL_FIELDS = [
     "updated_at",
 ]
 
+CANONICAL_DROP_TOKENS = {"CT", "COUNT", "COUNTS", "DOZ", "DOZEN", "DOZ.", "PACK"}
+
 LINK_FIELDS = [
     "observed_product_id",
     "canonical_product_id",
@@ -91,26 +94,24 @@ def auto_link_rule(observed_row):
             "high",
         )
 
-    if (
-        observed_row.get("representative_name_norm")
-        and not observed_row.get("representative_size_value")
-        and not observed_row.get("representative_size_unit")
-        and not observed_row.get("representative_pack_qty")
-    ):
-        return (
-            "exact_name",
-            "|".join(
-                [
-                    f"name={observed_row['representative_name_norm']}",
-                    f"measure={observed_row['representative_measure_type']}",
-                ]
-            ),
-            "medium",
-        )
-
     return "", "", ""
 
 
+def clean_canonical_name(name):
+    tokens = []
+    for token in re.sub(r"[^A-Z0-9\s]", " ", (name or "").upper()).split():
+        if token.isdigit():
+            continue
+        if token in CANONICAL_DROP_TOKENS:
+            continue
+        if re.fullmatch(r"\d+(?:PK|PACK)", token):
+            continue
+        if re.fullmatch(r"\d+DZ", token):
+            continue
+        tokens.append(token)
+    return " ".join(tokens).strip()
+
+
 def canonical_row_for_group(canonical_product_id, group_rows, link_method):
     quantity_value, quantity_unit = normalized_quantity(
         {
@@ -130,7 +131,10 @@ def canonical_row_for_group(canonical_product_id, group_rows, link_method):
     )
     return {
         "canonical_product_id": canonical_product_id,
-        "canonical_name": representative_value(group_rows, "representative_name_norm"),
+        "canonical_name": clean_canonical_name(
+            representative_value(group_rows, "representative_name_norm")
+        )
+        or representative_value(group_rows, "representative_name_norm"),
         "product_type": "",
         "brand": representative_value(group_rows, "representative_brand"),
         "variant": representative_value(group_rows, "representative_variant"),
diff --git a/tests/test_canonical_layer.py b/tests/test_canonical_layer.py
index 5b45d44..451c731 100644
--- a/tests/test_canonical_layer.py
+++ b/tests/test_canonical_layer.py
@@ -4,7 +4,7 @@ import build_canonical_layer
 
 
 class CanonicalLayerTests(unittest.TestCase):
-    def test_build_canonical_layer_auto_links_exact_upc_and_name_size(self):
+    def test_build_canonical_layer_auto_links_exact_upc_and_name_size_only(self):
         observed_rows = [
             {
                 "observed_product_id": "gobs_1",
@@ -81,6 +81,21 @@ class CanonicalLayerTests(unittest.TestCase):
                 "is_discount_line": "false",
                 "is_coupon_line": "false",
             },
+            {
+                "observed_product_id": "gobs_6",
+                "representative_upc": "",
+                "representative_retailer_item_id": "",
+                "representative_name_norm": "LIME",
+                "representative_brand": "",
+                "representative_variant": "",
+                "representative_size_value": "",
+                "representative_size_unit": "",
+                "representative_pack_qty": "",
+                "representative_measure_type": "each",
+                "is_fee": "false",
+                "is_discount_line": "false",
+                "is_coupon_line": "false",
+            },
         ]
 
         canonicals, links = build_canonical_layer.build_canonical_layer(observed_rows)
@@ -93,6 +108,11 @@ class CanonicalLayerTests(unittest.TestCase):
         self.assertEqual("exact_name_size", methods["gobs_3"])
         self.assertEqual("exact_name_size", methods["gobs_4"])
         self.assertNotIn("gobs_5", methods)
+        self.assertNotIn("gobs_6", methods)
+
+    def test_clean_canonical_name_removes_packaging_noise(self):
+        self.assertEqual("LIME", build_canonical_layer.clean_canonical_name("LIME . / ."))
+        self.assertEqual("EGG", build_canonical_layer.clean_canonical_name("5DZ EGG / /"))
 
 
 if __name__ == "__main__":