diff --git a/build_canonical_layer.py b/build_canonical_layer.py index f818624..c0410a0 100644 --- a/build_canonical_layer.py +++ b/build_canonical_layer.py @@ -1,8 +1,6 @@ -import csv - import click -from layer_helpers import read_csv_rows, stable_id, write_csv_rows +from layer_helpers import read_csv_rows, representative_value, stable_id, write_csv_rows CANONICAL_FIELDS = [ @@ -59,40 +57,110 @@ def normalized_quantity(row): return "", "" +def auto_link_rule(observed_row): + if observed_row.get("is_fee") == "true": + return "", "", "" + + if observed_row.get("representative_upc"): + return ( + "exact_upc", + f"upc={observed_row['representative_upc']}", + "high", + ) + + if ( + observed_row.get("representative_name_norm") + and observed_row.get("representative_size_value") + and observed_row.get("representative_size_unit") + ): + return ( + "exact_name_size", + "|".join( + [ + f"name={observed_row['representative_name_norm']}", + f"size={observed_row['representative_size_value']}", + f"unit={observed_row['representative_size_unit']}", + f"pack={observed_row['representative_pack_qty']}", + f"measure={observed_row['representative_measure_type']}", + ] + ), + "high", + ) + + if ( + observed_row.get("representative_name_norm") + and not observed_row.get("representative_size_value") + and not observed_row.get("representative_size_unit") + and not observed_row.get("representative_pack_qty") + ): + return ( + "exact_name", + "|".join( + [ + f"name={observed_row['representative_name_norm']}", + f"measure={observed_row['representative_measure_type']}", + ] + ), + "medium", + ) + + return "", "", "" + + +def canonical_row_for_group(canonical_product_id, group_rows, link_method): + quantity_value, quantity_unit = normalized_quantity( + { + "representative_size_value": representative_value( + group_rows, "representative_size_value" + ), + "representative_size_unit": representative_value( + group_rows, "representative_size_unit" + ), + "representative_pack_qty": representative_value( + group_rows, "representative_pack_qty" + ), + "representative_measure_type": representative_value( + group_rows, "representative_measure_type" + ), + } + ) + return { + "canonical_product_id": canonical_product_id, + "canonical_name": representative_value(group_rows, "representative_name_norm"), + "product_type": "", + "brand": representative_value(group_rows, "representative_brand"), + "variant": representative_value(group_rows, "representative_variant"), + "size_value": representative_value(group_rows, "representative_size_value"), + "size_unit": representative_value(group_rows, "representative_size_unit"), + "pack_qty": representative_value(group_rows, "representative_pack_qty"), + "measure_type": representative_value(group_rows, "representative_measure_type"), + "normalized_quantity": quantity_value, + "normalized_quantity_unit": quantity_unit, + "notes": f"auto-linked via {link_method}", + "created_at": "", + "updated_at": "", + } + + def build_canonical_layer(observed_rows): canonical_rows = [] link_rows = [] + groups = {} for observed_row in sorted(observed_rows, key=lambda row: row["observed_product_id"]): - canonical_product_id = stable_id( - "gcan", f"seed|{observed_row['observed_product_id']}" - ) - quantity_value, quantity_unit = normalized_quantity(observed_row) + link_method, group_key, confidence = auto_link_rule(observed_row) + if not group_key: + continue - canonical_rows.append( - { - "canonical_product_id": canonical_product_id, - "canonical_name": observed_row["representative_name_norm"], - "product_type": "", - "brand": observed_row["representative_brand"], - "variant": observed_row["representative_variant"], - "size_value": observed_row["representative_size_value"], - "size_unit": observed_row["representative_size_unit"], - "pack_qty": observed_row["representative_pack_qty"], - "measure_type": observed_row["representative_measure_type"], - "normalized_quantity": quantity_value, - "normalized_quantity_unit": quantity_unit, - "notes": f"seeded from {observed_row['observed_product_id']}", - "created_at": "", - "updated_at": "", - } - ) + canonical_product_id = stable_id("gcan", f"{link_method}|{group_key}") + groups.setdefault(canonical_product_id, {"method": link_method, "rows": []}) + groups[canonical_product_id]["rows"].append(observed_row) link_rows.append( { "observed_product_id": observed_row["observed_product_id"], "canonical_product_id": canonical_product_id, - "link_method": "seed_observed_product", - "link_confidence": "", + "link_method": link_method, + "link_confidence": confidence, "review_status": "", "reviewed_by": "", "reviewed_at": "", @@ -100,6 +168,13 @@ def build_canonical_layer(observed_rows): } ) + for canonical_product_id, group in sorted(groups.items()): + canonical_rows.append( + canonical_row_for_group( + canonical_product_id, group["rows"], group["method"] + ) + ) + return canonical_rows, link_rows diff --git a/tests/test_canonical_layer.py b/tests/test_canonical_layer.py index 8245c16..0bb67c6 100644 --- a/tests/test_canonical_layer.py +++ b/tests/test_canonical_layer.py @@ -4,10 +4,11 @@ import build_canonical_layer class CanonicalLayerTests(unittest.TestCase): - def test_build_canonical_layer_seeds_one_canonical_per_observed_product(self): + def test_build_canonical_layer_auto_links_exact_upc_and_name_size(self): observed_rows = [ { "observed_product_id": "gobs_1", + "representative_upc": "111", "representative_name_norm": "GALA APPLE", "representative_brand": "SB", "representative_variant": "", @@ -15,9 +16,23 @@ class CanonicalLayerTests(unittest.TestCase): "representative_size_unit": "lb", "representative_pack_qty": "", "representative_measure_type": "weight", + "is_fee": "false", }, { "observed_product_id": "gobs_2", + "representative_upc": "111", + "representative_name_norm": "LARGE WHITE EGGS", + "representative_brand": "SB", + "representative_variant": "", + "representative_size_value": "", + "representative_size_unit": "", + "representative_pack_qty": "18", + "representative_measure_type": "count", + "is_fee": "false", + }, + { + "observed_product_id": "gobs_3", + "representative_upc": "", "representative_name_norm": "ROTINI", "representative_brand": "", "representative_variant": "", @@ -25,17 +40,44 @@ class CanonicalLayerTests(unittest.TestCase): "representative_size_unit": "oz", "representative_pack_qty": "", "representative_measure_type": "weight", + "is_fee": "false", + }, + { + "observed_product_id": "gobs_4", + "representative_upc": "", + "representative_name_norm": "ROTINI", + "representative_brand": "SB", + "representative_variant": "", + "representative_size_value": "16", + "representative_size_unit": "oz", + "representative_pack_qty": "", + "representative_measure_type": "weight", + "is_fee": "false", + }, + { + "observed_product_id": "gobs_5", + "representative_upc": "", + "representative_name_norm": "GL BAG CHARGE", + "representative_brand": "", + "representative_variant": "", + "representative_size_value": "", + "representative_size_unit": "", + "representative_pack_qty": "", + "representative_measure_type": "each", + "is_fee": "true", }, ] canonicals, links = build_canonical_layer.build_canonical_layer(observed_rows) self.assertEqual(2, len(canonicals)) - self.assertEqual(2, len(links)) - self.assertEqual("GALA APPLE", canonicals[0]["canonical_name"]) - self.assertEqual("5", canonicals[0]["normalized_quantity"]) - self.assertEqual("lb", canonicals[0]["normalized_quantity_unit"]) - self.assertEqual("seed_observed_product", links[0]["link_method"]) + self.assertEqual(4, len(links)) + methods = {row["observed_product_id"]: row["link_method"] for row in links} + self.assertEqual("exact_upc", methods["gobs_1"]) + self.assertEqual("exact_upc", methods["gobs_2"]) + self.assertEqual("exact_name_size", methods["gobs_3"]) + self.assertEqual("exact_name_size", methods["gobs_4"]) + self.assertNotIn("gobs_5", methods) if __name__ == "__main__":