Make canonical auto-linking more conservative

This commit is contained in:
ben
2026-03-17 15:07:48 -04:00
parent 56a03bcb1d
commit 08e2a86cbd
2 changed files with 43 additions and 19 deletions

View File

@@ -1,4 +1,5 @@
import click
import re
from layer_helpers import read_csv_rows, representative_value, stable_id, write_csv_rows
@@ -20,6 +21,8 @@ CANONICAL_FIELDS = [
"updated_at",
]
# Upper-case unit/count words that clean_canonical_name() removes from names.
# NOTE(review): clean_canonical_name() replaces punctuation with spaces before
# tokenizing, so the "DOZ." entry looks unreachable from it -- confirm whether
# another caller relies on it before removing.
CANONICAL_DROP_TOKENS = {
    "COUNT",
    "COUNTS",
    "CT",
    "DOZ",
    "DOZ.",
    "DOZEN",
    "PACK",
}
LINK_FIELDS = [
"observed_product_id",
"canonical_product_id",
@@ -91,26 +94,24 @@ def auto_link_rule(observed_row):
"high",
)
if (
observed_row.get("representative_name_norm")
and not observed_row.get("representative_size_value")
and not observed_row.get("representative_size_unit")
and not observed_row.get("representative_pack_qty")
):
return (
"exact_name",
"|".join(
[
f"name={observed_row['representative_name_norm']}",
f"measure={observed_row['representative_measure_type']}",
]
),
"medium",
)
return "", "", ""
# Any character other than A-Z, 0-9, or whitespace acts as a token separator.
_CANONICAL_SEPARATOR_RE = re.compile(r"[^A-Z0-9\s]")
# A quantity glued to a pack/dozen suffix, e.g. "12PK", "6PACK", "2DZ".
_CANONICAL_GLUED_QTY_RE = re.compile(r"\d+(?:PK|PACK|DZ)")


def clean_canonical_name(name):
    """Return *name* upper-cased with pack/count noise tokens removed.

    The name is upper-cased, every character outside ``A-Z``, ``0-9`` and
    whitespace is replaced by a space, and the result is split on whitespace.
    A token is dropped when it is:

    * made up only of digits,
    * digits immediately followed by ``PK``, ``PACK`` or ``DZ``, or
    * a member of ``CANONICAL_DROP_TOKENS``.

    Args:
        name: Raw product name; ``None`` or any other falsy value is treated
            as an empty string.

    Returns:
        The surviving tokens joined by single spaces. This is an empty string
        when the input is empty or every token was dropped, so callers that
        need a non-empty name must supply their own fallback.
    """
    normalized = _CANONICAL_SEPARATOR_RE.sub(" ", (name or "").upper())
    kept = [
        token
        for token in normalized.split()
        # The filters are independent, so the cheap string/regex checks run
        # before the set lookup without changing which tokens survive.
        if not (
            token.isdigit()
            or _CANONICAL_GLUED_QTY_RE.fullmatch(token)
            or token in CANONICAL_DROP_TOKENS
        )
    ]
    # Tokens come from split() and contain no whitespace, so the joined
    # string never has leading or trailing spaces to strip.
    return " ".join(kept)
def canonical_row_for_group(canonical_product_id, group_rows, link_method):
quantity_value, quantity_unit = normalized_quantity(
{
@@ -130,7 +131,10 @@ def canonical_row_for_group(canonical_product_id, group_rows, link_method):
)
return {
"canonical_product_id": canonical_product_id,
"canonical_name": representative_value(group_rows, "representative_name_norm"),
"canonical_name": clean_canonical_name(
representative_value(group_rows, "representative_name_norm")
)
or representative_value(group_rows, "representative_name_norm"),
"product_type": "",
"brand": representative_value(group_rows, "representative_brand"),
"variant": representative_value(group_rows, "representative_variant"),