From dc392149b54ff6e8688bb32b5f1d8907aa68f2b8 Mon Sep 17 00:00:00 2001
From: ben
Date: Mon, 16 Mar 2026 00:43:11 -0400
Subject: [PATCH] Generate Giant observed products

---
Review notes (free text after the "---" separator; not part of the commit
message):

 * build_observed_products.py is a click command. By default it reads
   giant_output/items_enriched.csv and writes
   giant_output/products_observed.csv with the OUTPUT_FIELDS columns.
 * Rows are grouped by build_observed_key(): "retailer|upc=..|name=.." when
   the row has a non-blank upc, otherwise retailer, normalized name, size
   value, size unit, pack qty, measure type, store-brand flag and fee flag
   joined with "|".
 * Each group becomes one output row. observed_product_id is "gobs_" plus
   the first 12 hex characters of the SHA-1 of the observed key.
   representative_* columns hold the most frequent non-blank value in the
   group, ties broken by the smallest string; representative_image_url is
   the first non-blank image_url in sorted order.
 * first_seen_date / last_seen_date / example_* come from the group sorted
   by (order_date, order_id, int(line_no)). The final output is sorted by
   observed_product_id.
 * layer_helpers.py holds the shared helpers: UTF-8 CSV read/write (the
   writer creates the parent directory), stable_id, first_nonblank,
   representative_value, distinct_values and compact_join.
 * tests/test_observed_products.py has one unittest case: two rows with the
   same retailer, upc and normalized name collapse into a single observed
   product.

 NOTE(review): int(row["line_no"]) raises ValueError when line_no is blank
 or non-numeric -- confirm the enriched CSV always fills it.
 NOTE(review): order_date and order_id are compared as strings; this
 presumably relies on ISO-formatted dates -- confirm against the producer
 of items_enriched.csv.
 NOTE(review): the test imports build_observed_products as a top-level
 module, so it presumably has to run with the repository root importable.

 build_observed_products.py      | 147 ++++++++++++++++++++++++++++++++
 layer_helpers.py                |  54 ++++++++++++
 tests/test_observed_products.py |  60 +++++++++++++
 3 files changed, 261 insertions(+)
 create mode 100644 build_observed_products.py
 create mode 100644 layer_helpers.py
 create mode 100644 tests/test_observed_products.py

diff --git a/build_observed_products.py b/build_observed_products.py
new file mode 100644
index 0000000..3874d7b
--- /dev/null
+++ b/build_observed_products.py
@@ -0,0 +1,147 @@
+from collections import defaultdict
+
+import click
+
+from layer_helpers import (
+    compact_join,
+    distinct_values,
+    first_nonblank,
+    read_csv_rows,
+    representative_value,
+    stable_id,
+    write_csv_rows,
+)
+
+
+OUTPUT_FIELDS = [
+    "observed_product_id",
+    "retailer",
+    "observed_key",
+    "representative_upc",
+    "representative_item_name",
+    "representative_name_norm",
+    "representative_brand",
+    "representative_variant",
+    "representative_size_value",
+    "representative_size_unit",
+    "representative_pack_qty",
+    "representative_measure_type",
+    "representative_image_url",
+    "is_store_brand",
+    "is_fee",
+    "first_seen_date",
+    "last_seen_date",
+    "times_seen",
+    "example_order_id",
+    "example_item_name",
+    "raw_name_examples",
+    "normalized_name_examples",
+    "example_prices",
+    "distinct_item_names_count",
+    "distinct_upcs_count",
+]
+
+
+def build_observed_key(row):
+    if row.get("upc"):
+        return "|".join(
+            [
+                row["retailer"],
+                f"upc={row['upc']}",
+                f"name={row['item_name_norm']}",
+            ]
+        )
+
+    return "|".join(
+        [
+            row["retailer"],
+            f"name={row['item_name_norm']}",
+            f"size={row['size_value']}",
+            f"unit={row['size_unit']}",
+            f"pack={row['pack_qty']}",
+            f"measure={row['measure_type']}",
+            f"store_brand={row['is_store_brand']}",
+            f"fee={row['is_fee']}",
+        ]
+    )
+
+
+def build_observed_products(rows):
+    grouped = defaultdict(list)
+    for row in rows:
+        grouped[build_observed_key(row)].append(row)
+
+    observed_rows = []
+    for observed_key, group_rows in sorted(grouped.items()):
+        ordered = sorted(
+            group_rows,
+            key=lambda row: (row["order_date"], row["order_id"], int(row["line_no"])),
+        )
+        observed_rows.append(
+            {
+                "observed_product_id": stable_id("gobs", observed_key),
+                "retailer": ordered[0]["retailer"],
+                "observed_key": observed_key,
+                "representative_upc": representative_value(ordered, "upc"),
+                "representative_item_name": representative_value(ordered, "item_name"),
+                "representative_name_norm": representative_value(
+                    ordered, "item_name_norm"
+                ),
+                "representative_brand": representative_value(ordered, "brand_guess"),
+                "representative_variant": representative_value(ordered, "variant"),
+                "representative_size_value": representative_value(ordered, "size_value"),
+                "representative_size_unit": representative_value(ordered, "size_unit"),
+                "representative_pack_qty": representative_value(ordered, "pack_qty"),
+                "representative_measure_type": representative_value(
+                    ordered, "measure_type"
+                ),
+                "representative_image_url": first_nonblank(ordered, "image_url"),
+                "is_store_brand": representative_value(ordered, "is_store_brand"),
+                "is_fee": representative_value(ordered, "is_fee"),
+                "first_seen_date": ordered[0]["order_date"],
+                "last_seen_date": ordered[-1]["order_date"],
+                "times_seen": str(len(ordered)),
+                "example_order_id": ordered[0]["order_id"],
+                "example_item_name": ordered[0]["item_name"],
+                "raw_name_examples": compact_join(
+                    distinct_values(ordered, "item_name"), limit=4
+                ),
+                "normalized_name_examples": compact_join(
+                    distinct_values(ordered, "item_name_norm"), limit=4
+                ),
+                "example_prices": compact_join(
+                    distinct_values(ordered, "line_total"), limit=4
+                ),
+                "distinct_item_names_count": str(
+                    len(distinct_values(ordered, "item_name"))
+                ),
+                "distinct_upcs_count": str(len(distinct_values(ordered, "upc"))),
+            }
+        )
+
+    observed_rows.sort(key=lambda row: row["observed_product_id"])
+    return observed_rows
+
+
+@click.command()
+@click.option(
+    "--items-enriched-csv",
+    default="giant_output/items_enriched.csv",
+    show_default=True,
+    help="Path to enriched Giant item rows.",
+)
+@click.option(
+    "--output-csv",
+    default="giant_output/products_observed.csv",
+    show_default=True,
+    help="Path to observed product output.",
+)
+def main(items_enriched_csv, output_csv):
+    rows = read_csv_rows(items_enriched_csv)
+    observed_rows = build_observed_products(rows)
+    write_csv_rows(output_csv, observed_rows, OUTPUT_FIELDS)
+    click.echo(f"wrote {len(observed_rows)} rows to {output_csv}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/layer_helpers.py b/layer_helpers.py
new file mode 100644
index 0000000..fa3df63
--- /dev/null
+++ b/layer_helpers.py
@@ -0,0 +1,54 @@
+import csv
+import hashlib
+from collections import Counter
+from pathlib import Path
+
+
+def read_csv_rows(path):
+    path = Path(path)
+    with path.open(newline="", encoding="utf-8") as handle:
+        return list(csv.DictReader(handle))
+
+
+def write_csv_rows(path, rows, fieldnames):
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", newline="", encoding="utf-8") as handle:
+        writer = csv.DictWriter(handle, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(rows)
+
+
+def stable_id(prefix, raw_key):
+    digest = hashlib.sha1(str(raw_key).encode("utf-8")).hexdigest()[:12]
+    return f"{prefix}_{digest}"
+
+
+def first_nonblank(rows, field):
+    for row in rows:
+        value = row.get(field, "")
+        if value:
+            return value
+    return ""
+
+
+def representative_value(rows, field):
+    values = [row.get(field, "") for row in rows if row.get(field, "")]
+    if not values:
+        return ""
+    counts = Counter(values)
+    return sorted(counts.items(), key=lambda item: (-item[1], item[0]))[0][0]
+
+
+def distinct_values(rows, field):
+    return sorted({row.get(field, "") for row in rows if row.get(field, "")})
+
+
+def compact_join(values, limit=3):
+    unique = []
+    seen = set()
+    for value in values:
+        if value and value not in seen:
+            seen.add(value)
+            unique.append(value)
+    return " | ".join(unique[:limit])
diff --git a/tests/test_observed_products.py b/tests/test_observed_products.py
new file mode 100644
index 0000000..753babd
--- /dev/null
+++ b/tests/test_observed_products.py
@@ -0,0 +1,60 @@
+import unittest
+
+import build_observed_products
+
+
+class ObservedProductTests(unittest.TestCase):
+    def test_build_observed_products_aggregates_rows_with_same_key(self):
+        rows = [
+            {
+                "retailer": "giant",
+                "order_id": "1",
+                "line_no": "1",
+                "order_date": "2026-01-01",
+                "item_name": "SB GALA APPLE 5LB",
+                "item_name_norm": "GALA APPLE",
+                "upc": "111",
+                "brand_guess": "SB",
+                "variant": "",
+                "size_value": "5",
+                "size_unit": "lb",
+                "pack_qty": "",
+                "measure_type": "weight",
+                "image_url": "https://example.test/a.jpg",
+                "is_store_brand": "true",
+                "is_fee": "false",
+                "line_total": "7.99",
+            },
+            {
+                "retailer": "giant",
+                "order_id": "2",
+                "line_no": "1",
+                "order_date": "2026-01-10",
+                "item_name": "SB GALA APPLE 5 LB",
+                "item_name_norm": "GALA APPLE",
+                "upc": "111",
+                "brand_guess": "SB",
+                "variant": "",
+                "size_value": "5",
+                "size_unit": "lb",
+                "pack_qty": "",
+                "measure_type": "weight",
+                "image_url": "",
+                "is_store_brand": "true",
+                "is_fee": "false",
+                "line_total": "8.49",
+            },
+        ]
+
+        observed = build_observed_products.build_observed_products(rows)
+
+        self.assertEqual(1, len(observed))
+        self.assertEqual("2", observed[0]["times_seen"])
+        self.assertEqual("2026-01-01", observed[0]["first_seen_date"])
+        self.assertEqual("2026-01-10", observed[0]["last_seen_date"])
+        self.assertEqual("111", observed[0]["representative_upc"])
+        self.assertIn("SB GALA APPLE 5LB", observed[0]["raw_name_examples"])
+
+
+if __name__ == "__main__":
+    unittest.main()