Generate Giant observed products
This commit is contained in:
147
build_observed_products.py
Normal file
147
build_observed_products.py
Normal file
@@ -0,0 +1,147 @@
|
||||
from collections import defaultdict
|
||||
|
||||
import click
|
||||
|
||||
from layer_helpers import (
|
||||
compact_join,
|
||||
distinct_values,
|
||||
first_nonblank,
|
||||
read_csv_rows,
|
||||
representative_value,
|
||||
stable_id,
|
||||
write_csv_rows,
|
||||
)
|
||||
|
||||
|
||||
# Column layout of the observed-products CSV. main() hands this list to
# write_csv_rows as the header, so the order here is the on-disk column order.
OUTPUT_FIELDS = [
    # Identity of the observed product.
    "observed_product_id",
    "retailer",
    "observed_key",
    # One chosen value per attribute for the whole group of rows.
    "representative_upc",
    "representative_item_name",
    "representative_name_norm",
    "representative_brand",
    "representative_variant",
    "representative_size_value",
    "representative_size_unit",
    "representative_pack_qty",
    "representative_measure_type",
    "representative_image_url",
    "is_store_brand",
    "is_fee",
    # Sighting history.
    "first_seen_date",
    "last_seen_date",
    "times_seen",
    # Samples taken from the underlying rows.
    "example_order_id",
    "example_item_name",
    "raw_name_examples",
    "normalized_name_examples",
    "example_prices",
    # Variety counts, stored as strings.
    "distinct_item_names_count",
    "distinct_upcs_count",
]
|
||||
|
||||
|
||||
def build_observed_key(row):
    """Compose the pipe-delimited grouping key for one enriched item row.

    A row with a non-empty ``upc`` is keyed on retailer, UPC and normalized
    name. A row without one falls back to retailer, normalized name and the
    size / unit / pack / measure / store-brand / fee fields.

    Raises:
        KeyError: if a field required by the chosen branch is missing.
    """
    if not row.get("upc"):
        # No UPC to anchor on, so the descriptive attributes become the key.
        segments = [
            row["retailer"],
            f"name={row['item_name_norm']}",
            f"size={row['size_value']}",
            f"unit={row['size_unit']}",
            f"pack={row['pack_qty']}",
            f"measure={row['measure_type']}",
            f"store_brand={row['is_store_brand']}",
            f"fee={row['is_fee']}",
        ]
    else:
        segments = [
            row["retailer"],
            f"upc={row['upc']}",
            f"name={row['item_name_norm']}",
        ]
    return "|".join(segments)
|
||||
|
||||
|
||||
def build_observed_products(rows):
    """Collapse enriched item rows into one summary dict per observed key.

    Rows are bucketed by ``build_observed_key``. Each bucket is ordered by
    ``(order_date, order_id, int(line_no))`` and condensed into a single dict
    of representative values, first/last sighting dates, examples and counts.
    Counts are emitted as strings.

    Returns:
        The summary dicts sorted by ``observed_product_id``.
    """
    buckets = defaultdict(list)
    for record in rows:
        buckets[build_observed_key(record)].append(record)

    def sighting_order(record):
        # line_no is compared as an integer, not as text.
        return (record["order_date"], record["order_id"], int(record["line_no"]))

    summaries = []
    for key in sorted(buckets):
        sightings = sorted(buckets[key], key=sighting_order)
        earliest = sightings[0]
        latest = sightings[-1]
        # Used for both the example list and the distinct-name count.
        raw_names = distinct_values(sightings, "item_name")

        summary = {
            "observed_product_id": stable_id("gobs", key),
            "retailer": earliest["retailer"],
            "observed_key": key,
            "representative_upc": representative_value(sightings, "upc"),
            "representative_item_name": representative_value(sightings, "item_name"),
            "representative_name_norm": representative_value(
                sightings, "item_name_norm"
            ),
            "representative_brand": representative_value(sightings, "brand_guess"),
            "representative_variant": representative_value(sightings, "variant"),
            "representative_size_value": representative_value(
                sightings, "size_value"
            ),
            "representative_size_unit": representative_value(sightings, "size_unit"),
            "representative_pack_qty": representative_value(sightings, "pack_qty"),
            "representative_measure_type": representative_value(
                sightings, "measure_type"
            ),
            # Image is the first non-blank one in sighting order, not the mode.
            "representative_image_url": first_nonblank(sightings, "image_url"),
            "is_store_brand": representative_value(sightings, "is_store_brand"),
            "is_fee": representative_value(sightings, "is_fee"),
            "first_seen_date": earliest["order_date"],
            "last_seen_date": latest["order_date"],
            "times_seen": str(len(sightings)),
            "example_order_id": earliest["order_id"],
            "example_item_name": earliest["item_name"],
            "raw_name_examples": compact_join(raw_names, limit=4),
            "normalized_name_examples": compact_join(
                distinct_values(sightings, "item_name_norm"), limit=4
            ),
            "example_prices": compact_join(
                distinct_values(sightings, "line_total"), limit=4
            ),
            "distinct_item_names_count": str(len(raw_names)),
            "distinct_upcs_count": str(len(distinct_values(sightings, "upc"))),
        }
        summaries.append(summary)

    summaries.sort(key=lambda summary: summary["observed_product_id"])
    return summaries
|
||||
|
||||
|
||||
# Command-line entry point: read the enriched item CSV, group the rows into
# observed products, write them out and report the row count.
# Documented with comments rather than a docstring on purpose: click would
# surface a docstring as the command's --help text.
@click.command()
@click.option(
    "--items-enriched-csv",
    default="giant_output/items_enriched.csv",
    show_default=True,
    help="Path to enriched Giant item rows.",
)
@click.option(
    "--output-csv",
    default="giant_output/products_observed.csv",
    show_default=True,
    help="Path to observed product output.",
)
def main(items_enriched_csv, output_csv):
    observed_rows = build_observed_products(read_csv_rows(items_enriched_csv))
    write_csv_rows(output_csv, observed_rows, OUTPUT_FIELDS)
    click.echo(f"wrote {len(observed_rows)} rows to {output_csv}")


if __name__ == "__main__":
    main()
|
||||
54
layer_helpers.py
Normal file
54
layer_helpers.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import csv
|
||||
import hashlib
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def read_csv_rows(path):
    """Read a UTF-8 CSV file and return every data row as a dict.

    The first line of the file supplies the dict keys (``csv.DictReader``).
    ``path`` may be a string or a ``Path``.
    """
    with Path(path).open(encoding="utf-8", newline="") as source:
        reader = csv.DictReader(source)
        return list(reader)
|
||||
|
||||
|
||||
def write_csv_rows(path, rows, fieldnames):
    """Write ``rows`` (dicts) to ``path`` as UTF-8 CSV with a header line.

    Missing parent directories are created first, and an existing file is
    overwritten. ``fieldnames`` sets both the header and the column order.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", newline="", encoding="utf-8") as sink:
        writer = csv.DictWriter(sink, fieldnames=fieldnames)
        writer.writeheader()
        for row in rows:
            writer.writerow(row)
|
||||
|
||||
|
||||
def stable_id(prefix, raw_key):
    """Return ``<prefix>_<hash>`` for ``raw_key``.

    The hash part is the first 12 hex characters of the SHA-1 of
    ``str(raw_key)`` encoded as UTF-8, so equal keys always give equal ids.
    """
    hasher = hashlib.sha1()
    hasher.update(str(raw_key).encode("utf-8"))
    digest = hasher.hexdigest()[:12]
    return f"{prefix}_{digest}"
|
||||
|
||||
|
||||
def first_nonblank(rows, field):
    """Return the first truthy ``field`` value in ``rows``, in iteration order.

    Rows without the field count as blank. Returns ``""`` if none qualifies.
    """
    candidates = (row.get(field, "") for row in rows)
    return next((value for value in candidates if value), "")
|
||||
|
||||
|
||||
def representative_value(rows, field):
    """Return the most frequent truthy ``field`` value across ``rows``.

    Blank or missing values are ignored. Ties on frequency go to the value
    that sorts first. Returns ``""`` when no row has a usable value.
    """
    tally = Counter()
    for row in rows:
        value = row.get(field, "")
        if value:
            tally[value] += 1

    if not tally:
        return ""

    def rank(entry):
        # Higher counts first; the value itself breaks ties.
        value, occurrences = entry
        return (-occurrences, value)

    winner, _ = sorted(tally.items(), key=rank)[0]
    return winner
|
||||
|
||||
|
||||
def distinct_values(rows, field):
    """Return the sorted list of unique truthy ``field`` values in ``rows``.

    Rows where the field is missing or blank contribute nothing.
    """
    present = set()
    for row in rows:
        value = row.get(field, "")
        if value:
            present.add(value)
    return sorted(present)
|
||||
|
||||
|
||||
def compact_join(values, limit=3):
    """Join up to ``limit`` distinct truthy values with ``" | "``.

    Duplicates are dropped and first-seen order is kept. Blank values are
    skipped before the limit is applied.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    deduplicated = list(dict.fromkeys(value for value in values if value))
    return " | ".join(deduplicated[:limit])
|
||||
60
tests/test_observed_products.py
Normal file
60
tests/test_observed_products.py
Normal file
@@ -0,0 +1,60 @@
|
||||
import unittest
|
||||
|
||||
import build_observed_products
|
||||
|
||||
|
||||
class ObservedProductTests(unittest.TestCase):
    """Checks for grouping enriched item rows into observed products."""

    def test_build_observed_products_aggregates_rows_with_same_key(self):
        # Two sightings of the same product: same retailer, UPC and normalized
        # name, so both rows must land in one observed product.
        shared = {
            "retailer": "giant",
            "line_no": "1",
            "item_name_norm": "GALA APPLE",
            "upc": "111",
            "brand_guess": "SB",
            "variant": "",
            "size_value": "5",
            "size_unit": "lb",
            "pack_qty": "",
            "measure_type": "weight",
            "is_store_brand": "true",
            "is_fee": "false",
        }
        first_sighting = dict(
            shared,
            order_id="1",
            order_date="2026-01-01",
            item_name="SB GALA APPLE 5LB",
            image_url="https://example.test/a.jpg",
            line_total="7.99",
        )
        second_sighting = dict(
            shared,
            order_id="2",
            order_date="2026-01-10",
            item_name="SB GALA APPLE 5 LB",
            image_url="",
            line_total="8.49",
        )

        observed = build_observed_products.build_observed_products(
            [first_sighting, second_sighting]
        )

        self.assertEqual(1, len(observed))
        product = observed[0]
        self.assertEqual("2", product["times_seen"])
        self.assertEqual("2026-01-01", product["first_seen_date"])
        self.assertEqual("2026-01-10", product["last_seen_date"])
        self.assertEqual("111", product["representative_upc"])
        self.assertIn("SB GALA APPLE 5LB", product["raw_name_examples"])


if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user