Make canonical auto-linking more conservative
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
import click
|
||||
import re
|
||||
|
||||
from layer_helpers import read_csv_rows, representative_value, stable_id, write_csv_rows
|
||||
|
||||
@@ -20,6 +21,8 @@ CANONICAL_FIELDS = [
|
||||
"updated_at",
|
||||
]
|
||||
|
||||
# Standalone quantity/pack words that clean_canonical_name() removes from names.
# NOTE(review): clean_canonical_name() replaces punctuation with spaces before
# tokenizing, so "DOZ." can never match there - confirm whether any other
# caller relies on that entry.
CANONICAL_DROP_TOKENS = {"CT", "COUNT", "COUNTS", "DOZ", "DOZEN", "DOZ.", "PACK"}
|
||||
|
||||
LINK_FIELDS = [
|
||||
"observed_product_id",
|
||||
"canonical_product_id",
|
||||
@@ -91,26 +94,24 @@ def auto_link_rule(observed_row):
|
||||
"high",
|
||||
)
|
||||
|
||||
if (
|
||||
observed_row.get("representative_name_norm")
|
||||
and not observed_row.get("representative_size_value")
|
||||
and not observed_row.get("representative_size_unit")
|
||||
and not observed_row.get("representative_pack_qty")
|
||||
):
|
||||
return (
|
||||
"exact_name",
|
||||
"|".join(
|
||||
[
|
||||
f"name={observed_row['representative_name_norm']}",
|
||||
f"measure={observed_row['representative_measure_type']}",
|
||||
]
|
||||
),
|
||||
"medium",
|
||||
)
|
||||
|
||||
return "", "", ""
|
||||
|
||||
|
||||
def clean_canonical_name(name):
    """Return ``name`` upper-cased with quantity/pack tokens removed.

    Every character other than A-Z, 0-9 and whitespace is replaced by a space
    before the text is split into tokens. A token is then discarded when it is
    purely numeric, is listed in ``CANONICAL_DROP_TOKENS``, or is a number
    fused with a ``PK``/``PACK``/``DZ`` suffix (for example ``12PK``).

    Args:
        name: Raw product name; ``None`` or an empty string is accepted.

    Returns:
        The surviving tokens joined by single spaces, or ``""`` when nothing
        survives.
    """

    def is_noise(word):
        # Rules are evaluated in order; the first match discards the token.
        return bool(
            word.isdigit()
            or word in CANONICAL_DROP_TOKENS
            or re.fullmatch(r"\d+(?:PK|PACK)", word)
            or re.fullmatch(r"\d+DZ", word)
        )

    upper_name = (name or "").upper()
    words = re.sub(r"[^A-Z0-9\s]", " ", upper_name).split()
    # split() never yields empty or padded tokens, so the joined result
    # needs no further trimming.
    return " ".join(word for word in words if not is_noise(word))
|
||||
|
||||
|
||||
def canonical_row_for_group(canonical_product_id, group_rows, link_method):
|
||||
quantity_value, quantity_unit = normalized_quantity(
|
||||
{
|
||||
@@ -130,7 +131,10 @@ def canonical_row_for_group(canonical_product_id, group_rows, link_method):
|
||||
)
|
||||
return {
|
||||
"canonical_product_id": canonical_product_id,
|
||||
"canonical_name": representative_value(group_rows, "representative_name_norm"),
|
||||
"canonical_name": clean_canonical_name(
|
||||
representative_value(group_rows, "representative_name_norm")
|
||||
)
|
||||
or representative_value(group_rows, "representative_name_norm"),
|
||||
"product_type": "",
|
||||
"brand": representative_value(group_rows, "representative_brand"),
|
||||
"variant": representative_value(group_rows, "representative_variant"),
|
||||
|
||||
Reference in New Issue
Block a user