Make canonical auto-linking more conservative

This commit is contained in:
ben
2026-03-17 15:07:48 -04:00
parent 56a03bcb1d
commit 08e2a86cbd
2 changed files with 43 additions and 19 deletions

View File

@@ -1,4 +1,5 @@
import click
import re
from layer_helpers import read_csv_rows, representative_value, stable_id, write_csv_rows
@@ -20,6 +21,8 @@ CANONICAL_FIELDS = [
"updated_at",
]
CANONICAL_DROP_TOKENS = {"CT", "COUNT", "COUNTS", "DOZ", "DOZEN", "DOZ.", "PACK"}
LINK_FIELDS = [
"observed_product_id",
"canonical_product_id",
@@ -91,26 +94,24 @@ def auto_link_rule(observed_row):
"high",
)
if (
observed_row.get("representative_name_norm")
and not observed_row.get("representative_size_value")
and not observed_row.get("representative_size_unit")
and not observed_row.get("representative_pack_qty")
):
return (
"exact_name",
"|".join(
[
f"name={observed_row['representative_name_norm']}",
f"measure={observed_row['representative_measure_type']}",
]
),
"medium",
)
return "", "", ""
def clean_canonical_name(name):
    """Return ``name`` upper-cased with packaging noise tokens removed.

    Every character other than ``A-Z``, ``0-9`` and whitespace is replaced by
    a space before the text is split into tokens, so punctuation never
    survives into the result. A token is then discarded when it is:

    * made up only of digits,
    * a member of ``CANONICAL_DROP_TOKENS``, or
    * a number fused to a pack/dozen suffix (``6PK``, ``12PACK``, ``5DZ``).

    The remaining tokens are joined with single spaces. A falsy ``name``
    (``None`` or ``""``) or a name consisting only of noise yields ``""``.
    """
    # Number-plus-suffix forms that describe packaging rather than the product.
    fused_count_patterns = (r"\d+(?:PK|PACK)", r"\d+DZ")

    def is_noise(word):
        if word.isdigit() or word in CANONICAL_DROP_TOKENS:
            return True
        return any(re.fullmatch(pattern, word) for pattern in fused_count_patterns)

    normalized = re.sub(r"[^A-Z0-9\s]", " ", (name or "").upper())
    # split() never yields empty or padded tokens, so the join needs no strip.
    return " ".join(word for word in normalized.split() if not is_noise(word))
def canonical_row_for_group(canonical_product_id, group_rows, link_method):
quantity_value, quantity_unit = normalized_quantity(
{
@@ -130,7 +131,10 @@ def canonical_row_for_group(canonical_product_id, group_rows, link_method):
)
return {
"canonical_product_id": canonical_product_id,
"canonical_name": representative_value(group_rows, "representative_name_norm"),
"canonical_name": clean_canonical_name(
representative_value(group_rows, "representative_name_norm")
)
or representative_value(group_rows, "representative_name_norm"),
"product_type": "",
"brand": representative_value(group_rows, "representative_brand"),
"variant": representative_value(group_rows, "representative_variant"),

View File

@@ -4,7 +4,7 @@ import build_canonical_layer
class CanonicalLayerTests(unittest.TestCase):
def test_build_canonical_layer_auto_links_exact_upc_and_name_size(self):
def test_build_canonical_layer_auto_links_exact_upc_and_name_size_only(self):
observed_rows = [
{
"observed_product_id": "gobs_1",
@@ -81,6 +81,21 @@ class CanonicalLayerTests(unittest.TestCase):
"is_discount_line": "false",
"is_coupon_line": "false",
},
{
"observed_product_id": "gobs_6",
"representative_upc": "",
"representative_retailer_item_id": "",
"representative_name_norm": "LIME",
"representative_brand": "",
"representative_variant": "",
"representative_size_value": "",
"representative_size_unit": "",
"representative_pack_qty": "",
"representative_measure_type": "each",
"is_fee": "false",
"is_discount_line": "false",
"is_coupon_line": "false",
},
]
canonicals, links = build_canonical_layer.build_canonical_layer(observed_rows)
@@ -93,6 +108,11 @@ class CanonicalLayerTests(unittest.TestCase):
self.assertEqual("exact_name_size", methods["gobs_3"])
self.assertEqual("exact_name_size", methods["gobs_4"])
self.assertNotIn("gobs_5", methods)
self.assertNotIn("gobs_6", methods)
def test_clean_canonical_name_removes_packaging_noise(self):
    """clean_canonical_name strips punctuation and fused dozen-count tokens."""
    clean = build_canonical_layer.clean_canonical_name
    # (raw input, expected cleaned name), checked in order.
    expectations = (
        ("LIME . / .", "LIME"),
        ("5DZ EGG / /", "EGG"),
    )
    for raw_name, expected in expectations:
        self.assertEqual(expected, clean(raw_name))
if __name__ == "__main__":