Make canonical auto-linking more conservative
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
import click
|
||||
import re
|
||||
|
||||
from layer_helpers import read_csv_rows, representative_value, stable_id, write_csv_rows
|
||||
|
||||
@@ -20,6 +21,8 @@ CANONICAL_FIELDS = [
|
||||
"updated_at",
|
||||
]
|
||||
|
||||
CANONICAL_DROP_TOKENS = {"CT", "COUNT", "COUNTS", "DOZ", "DOZEN", "DOZ.", "PACK"}
|
||||
|
||||
LINK_FIELDS = [
|
||||
"observed_product_id",
|
||||
"canonical_product_id",
|
||||
@@ -91,26 +94,24 @@ def auto_link_rule(observed_row):
|
||||
"high",
|
||||
)
|
||||
|
||||
if (
|
||||
observed_row.get("representative_name_norm")
|
||||
and not observed_row.get("representative_size_value")
|
||||
and not observed_row.get("representative_size_unit")
|
||||
and not observed_row.get("representative_pack_qty")
|
||||
):
|
||||
return (
|
||||
"exact_name",
|
||||
"|".join(
|
||||
[
|
||||
f"name={observed_row['representative_name_norm']}",
|
||||
f"measure={observed_row['representative_measure_type']}",
|
||||
]
|
||||
),
|
||||
"medium",
|
||||
)
|
||||
|
||||
return "", "", ""
|
||||
|
||||
|
||||
def clean_canonical_name(name):
    """Return *name* upper-cased with punctuation and packaging noise removed.

    Every character other than A-Z, 0-9 and whitespace is replaced by a
    space and the result is split on whitespace.  A token is discarded when
    it is all digits, is listed in ``CANONICAL_DROP_TOKENS``, or is a number
    immediately followed by ``PK``, ``PACK`` or ``DZ``.  The surviving
    tokens are joined with single spaces; ``None`` or an empty name yields
    an empty string.
    """
    upper = (name or "").upper()
    words = re.sub(r"[^A-Z0-9\s]", " ", upper).split()

    def is_noise(word):
        # Checks are evaluated in the order listed in the docstring.
        return (
            word.isdigit()
            or word in CANONICAL_DROP_TOKENS
            or re.fullmatch(r"\d+(?:PK|PACK)", word) is not None
            or re.fullmatch(r"\d+DZ", word) is not None
        )

    return " ".join(word for word in words if not is_noise(word)).strip()
|
||||
|
||||
|
||||
def canonical_row_for_group(canonical_product_id, group_rows, link_method):
|
||||
quantity_value, quantity_unit = normalized_quantity(
|
||||
{
|
||||
@@ -130,7 +131,10 @@ def canonical_row_for_group(canonical_product_id, group_rows, link_method):
|
||||
)
|
||||
return {
|
||||
"canonical_product_id": canonical_product_id,
|
||||
"canonical_name": representative_value(group_rows, "representative_name_norm"),
|
||||
"canonical_name": clean_canonical_name(
|
||||
representative_value(group_rows, "representative_name_norm")
|
||||
)
|
||||
or representative_value(group_rows, "representative_name_norm"),
|
||||
"product_type": "",
|
||||
"brand": representative_value(group_rows, "representative_brand"),
|
||||
"variant": representative_value(group_rows, "representative_variant"),
|
||||
|
||||
@@ -4,7 +4,7 @@ import build_canonical_layer
|
||||
|
||||
|
||||
class CanonicalLayerTests(unittest.TestCase):
|
||||
def test_build_canonical_layer_auto_links_exact_upc_and_name_size(self):
|
||||
def test_build_canonical_layer_auto_links_exact_upc_and_name_size_only(self):
|
||||
observed_rows = [
|
||||
{
|
||||
"observed_product_id": "gobs_1",
|
||||
@@ -81,6 +81,21 @@ class CanonicalLayerTests(unittest.TestCase):
|
||||
"is_discount_line": "false",
|
||||
"is_coupon_line": "false",
|
||||
},
|
||||
{
|
||||
"observed_product_id": "gobs_6",
|
||||
"representative_upc": "",
|
||||
"representative_retailer_item_id": "",
|
||||
"representative_name_norm": "LIME",
|
||||
"representative_brand": "",
|
||||
"representative_variant": "",
|
||||
"representative_size_value": "",
|
||||
"representative_size_unit": "",
|
||||
"representative_pack_qty": "",
|
||||
"representative_measure_type": "each",
|
||||
"is_fee": "false",
|
||||
"is_discount_line": "false",
|
||||
"is_coupon_line": "false",
|
||||
},
|
||||
]
|
||||
|
||||
canonicals, links = build_canonical_layer.build_canonical_layer(observed_rows)
|
||||
@@ -93,6 +108,11 @@ class CanonicalLayerTests(unittest.TestCase):
|
||||
self.assertEqual("exact_name_size", methods["gobs_3"])
|
||||
self.assertEqual("exact_name_size", methods["gobs_4"])
|
||||
self.assertNotIn("gobs_5", methods)
|
||||
self.assertNotIn("gobs_6", methods)
|
||||
|
||||
def test_clean_canonical_name_removes_packaging_noise(self):
    """Check that punctuation-only tokens and a ``5DZ`` prefix are dropped."""
    # (expected canonical name, raw input name), checked in order.
    expectations = (
        ("LIME", "LIME . / ."),
        ("EGG", "5DZ EGG / /"),
    )
    for expected, raw_name in expectations:
        self.assertEqual(
            expected, build_canonical_layer.clean_canonical_name(raw_name)
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user