Generate Giant observed products
This commit is contained in:
147
build_observed_products.py
Normal file
147
build_observed_products.py
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
import click
|
||||||
|
|
||||||
|
from layer_helpers import (
|
||||||
|
compact_join,
|
||||||
|
distinct_values,
|
||||||
|
first_nonblank,
|
||||||
|
read_csv_rows,
|
||||||
|
representative_value,
|
||||||
|
stable_id,
|
||||||
|
write_csv_rows,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Column order of the observed-products CSV. main() hands this list to
# write_csv_rows() as the header, and build_observed_products() emits one
# dict per product carrying exactly these keys.
OUTPUT_FIELDS = [
    # Identity of the observed product.
    "observed_product_id",
    "retailer",
    "observed_key",
    # One representative value per attribute, chosen across the group's rows.
    "representative_upc",
    "representative_item_name",
    "representative_name_norm",
    "representative_brand",
    "representative_variant",
    "representative_size_value",
    "representative_size_unit",
    "representative_pack_qty",
    "representative_measure_type",
    "representative_image_url",
    "is_store_brand",
    "is_fee",
    # When and how often the product appeared.
    "first_seen_date",
    "last_seen_date",
    "times_seen",
    # Sample values taken from the group's rows.
    "example_order_id",
    "example_item_name",
    "raw_name_examples",
    "normalized_name_examples",
    "example_prices",
    # Counts of distinct non-empty values within the group.
    "distinct_item_names_count",
    "distinct_upcs_count",
]
|
||||||
|
|
||||||
|
|
||||||
|
def build_observed_key(row):
    """Return the pipe-delimited grouping key for one enriched item row.

    Rows with a non-empty ``upc`` are keyed on retailer, UPC and normalized
    name. Rows without one fall back to retailer, normalized name and the
    size / pack / measure / store-brand / fee columns.

    Args:
        row: Mapping for a single item row. ``retailer`` and
            ``item_name_norm`` are always read; the fallback columns are
            read only when ``upc`` is missing or empty.

    Returns:
        The key as a single ``"|"``-joined string.

    Raises:
        KeyError: If a column needed for the chosen key shape is absent.
    """
    retailer = row["retailer"]
    name_segment = f"name={row['item_name_norm']}"

    upc = row.get("upc")
    if upc:
        segments = [retailer, f"upc={upc}", name_segment]
    else:
        # (label in the key, column in the row) for the UPC-less fallback.
        fallback_columns = (
            ("size", "size_value"),
            ("unit", "size_unit"),
            ("pack", "pack_qty"),
            ("measure", "measure_type"),
            ("store_brand", "is_store_brand"),
            ("fee", "is_fee"),
        )
        segments = [retailer, name_segment]
        segments.extend(
            f"{label}={row[column]}" for label, column in fallback_columns
        )

    return "|".join(segments)
|
||||||
|
|
||||||
|
|
||||||
|
def build_observed_products(rows):
    """Collapse enriched item rows into one summary dict per observed product.

    Rows are bucketed by ``build_observed_key``. Within a bucket they are
    ordered by ``(order_date, order_id, int(line_no))`` so the first and last
    entries give the first/last sighting.

    Args:
        rows: Iterable of item-row mappings (string values, as read from CSV).

    Returns:
        A list of dicts keyed by the ``OUTPUT_FIELDS`` column names, sorted by
        ``observed_product_id``.
    """
    buckets = defaultdict(list)
    for item in rows:
        buckets[build_observed_key(item)].append(item)

    def sighting_order(item):
        return (item["order_date"], item["order_id"], int(item["line_no"]))

    # Output column -> source column, each resolved with representative_value.
    representative_columns = (
        ("representative_upc", "upc"),
        ("representative_item_name", "item_name"),
        ("representative_name_norm", "item_name_norm"),
        ("representative_brand", "brand_guess"),
        ("representative_variant", "variant"),
        ("representative_size_value", "size_value"),
        ("representative_size_unit", "size_unit"),
        ("representative_pack_qty", "pack_qty"),
        ("representative_measure_type", "measure_type"),
    )

    products = []
    for key in sorted(buckets):
        sightings = sorted(buckets[key], key=sighting_order)
        earliest = sightings[0]
        latest = sightings[-1]
        raw_names = distinct_values(sightings, "item_name")

        product = {
            "observed_product_id": stable_id("gobs", key),
            "retailer": earliest["retailer"],
            "observed_key": key,
        }
        for out_column, source_column in representative_columns:
            product[out_column] = representative_value(sightings, source_column)

        # The image is the earliest non-empty URL, not the most frequent one.
        product["representative_image_url"] = first_nonblank(sightings, "image_url")
        product["is_store_brand"] = representative_value(sightings, "is_store_brand")
        product["is_fee"] = representative_value(sightings, "is_fee")

        product["first_seen_date"] = earliest["order_date"]
        product["last_seen_date"] = latest["order_date"]
        product["times_seen"] = str(len(sightings))
        product["example_order_id"] = earliest["order_id"]
        product["example_item_name"] = earliest["item_name"]

        product["raw_name_examples"] = compact_join(raw_names, limit=4)
        product["normalized_name_examples"] = compact_join(
            distinct_values(sightings, "item_name_norm"), limit=4
        )
        product["example_prices"] = compact_join(
            distinct_values(sightings, "line_total"), limit=4
        )
        product["distinct_item_names_count"] = str(len(raw_names))
        product["distinct_upcs_count"] = str(
            len(distinct_values(sightings, "upc"))
        )

        products.append(product)

    return sorted(products, key=lambda product: product["observed_product_id"])
|
||||||
|
|
||||||
|
|
||||||
|
# CLI entry point: read the enriched item rows, aggregate them into observed
# products, write the result as CSV and report the row count.
# (Kept as comments rather than a docstring so the command's --help text is
# unchanged.)
@click.command()
@click.option(
    "--items-enriched-csv",
    default="giant_output/items_enriched.csv",
    show_default=True,
    help="Path to enriched Giant item rows.",
)
@click.option(
    "--output-csv",
    default="giant_output/products_observed.csv",
    show_default=True,
    help="Path to observed product output.",
)
def main(items_enriched_csv, output_csv):
    observed_rows = build_observed_products(read_csv_rows(items_enriched_csv))
    write_csv_rows(output_csv, observed_rows, OUTPUT_FIELDS)

    row_count = len(observed_rows)
    click.echo(f"wrote {row_count} rows to {output_csv}")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||||
54
layer_helpers.py
Normal file
54
layer_helpers.py
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
import csv
|
||||||
|
import hashlib
|
||||||
|
from collections import Counter
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def read_csv_rows(path):
    """Load a CSV file into a list of dicts keyed by its header row.

    The file is decoded as ``utf-8-sig``. A leading byte-order mark is
    dropped instead of being glued onto the first column name, which would
    otherwise make lookups on that column fail. Files without a BOM decode
    exactly as they do under plain UTF-8.

    Args:
        path: Location of the CSV file (``str`` or ``pathlib.Path``).

    Returns:
        One dict per data row, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline="" lets the csv module handle line endings inside quoted fields.
    with Path(path).open(newline="", encoding="utf-8-sig") as handle:
        return list(csv.DictReader(handle))
|
||||||
|
|
||||||
|
|
||||||
|
def write_csv_rows(path, rows, fieldnames):
    """Write dict rows to a UTF-8 CSV file, creating parent folders as needed.

    Args:
        path: Destination file (``str`` or ``pathlib.Path``); overwritten if
            it already exists.
        rows: Iterable of dicts to write, one per CSV line.
        fieldnames: Column names, in output order, written as the header.
    """
    target = Path(path)
    # Make sure the output directory exists before opening the file.
    target.parent.mkdir(parents=True, exist_ok=True)

    with open(target, "w", newline="", encoding="utf-8") as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(rows)
|
||||||
|
|
||||||
|
|
||||||
|
def stable_id(prefix, raw_key):
    """Derive a short deterministic identifier from a key.

    Args:
        prefix: Text placed before the underscore in the identifier.
        raw_key: Value to hash; it is converted with ``str()`` and encoded as
            UTF-8 first.

    Returns:
        ``"<prefix>_<first 12 hex characters of the SHA-1 digest>"``.
    """
    hasher = hashlib.sha1()
    hasher.update(str(raw_key).encode("utf-8"))
    short_hash = hasher.hexdigest()[:12]
    return "{}_{}".format(prefix, short_hash)
|
||||||
|
|
||||||
|
|
||||||
|
def first_nonblank(rows, field):
    """Return the first truthy value of ``field`` across ``rows``.

    Args:
        rows: Iterable of dict rows, scanned in order.
        field: Key to look up in each row; a missing key counts as empty.

    Returns:
        The first non-empty value found, or ``""`` when there is none.
    """
    candidates = (row.get(field, "") for row in rows)
    # filter(None, ...) drops falsy entries; next() takes the first survivor.
    return next(filter(None, candidates), "")
|
||||||
|
|
||||||
|
|
||||||
|
def representative_value(rows, field):
    """Return the most frequent non-empty value of ``field`` across ``rows``.

    Ties on frequency go to the smallest value in sort order, so the result
    does not depend on row order.

    Args:
        rows: Iterable of dict rows.
        field: Key to tally; missing or empty values are ignored.

    Returns:
        The winning value, or ``""`` when no row has a non-empty value.
    """
    tally = Counter()
    for row in rows:
        value = row.get(field, "")
        if value:
            tally[value] += 1

    if not tally:
        return ""

    # Highest count first; on equal counts, the smallest value wins.
    winner, _count = min(tally.items(), key=lambda pair: (-pair[1], pair[0]))
    return winner
|
||||||
|
|
||||||
|
|
||||||
|
def distinct_values(rows, field):
    """Return the sorted unique non-empty values of ``field`` across ``rows``.

    Args:
        rows: Iterable of dict rows.
        field: Key to collect; missing or empty values are skipped.

    Returns:
        A sorted list with each non-empty value appearing once.
    """
    found = set()
    for row in rows:
        value = row.get(field, "")
        if value:
            found.add(value)
    return sorted(found)
|
||||||
|
|
||||||
|
|
||||||
|
def compact_join(values, limit=3):
    """Join the first ``limit`` distinct non-empty values with ``" | "``.

    Args:
        values: Iterable of strings; empty entries and repeats are dropped,
            keeping the first occurrence of each value in its original
            position.
        limit: Maximum number of values to include (applied as a slice).

    Returns:
        The joined string, or ``""`` when nothing remains.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    ordered_unique = list(dict.fromkeys(value for value in values if value))
    return " | ".join(ordered_unique[:limit])
|
||||||
60
tests/test_observed_products.py
Normal file
60
tests/test_observed_products.py
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
import unittest
|
||||||
|
|
||||||
|
import build_observed_products
|
||||||
|
|
||||||
|
|
||||||
|
class ObservedProductTests(unittest.TestCase):
    """Aggregation checks for ``build_observed_products``."""

    def test_build_observed_products_aggregates_rows_with_same_key(self):
        # Two purchases share retailer, UPC and normalized name, so they must
        # collapse into a single observed product.
        shared_columns = {
            "retailer": "giant",
            "line_no": "1",
            "item_name_norm": "GALA APPLE",
            "upc": "111",
            "brand_guess": "SB",
            "variant": "",
            "size_value": "5",
            "size_unit": "lb",
            "pack_qty": "",
            "measure_type": "weight",
            "is_store_brand": "true",
            "is_fee": "false",
        }
        first_purchase = dict(
            shared_columns,
            order_id="1",
            order_date="2026-01-01",
            item_name="SB GALA APPLE 5LB",
            image_url="https://example.test/a.jpg",
            line_total="7.99",
        )
        second_purchase = dict(
            shared_columns,
            order_id="2",
            order_date="2026-01-10",
            item_name="SB GALA APPLE 5 LB",
            image_url="",
            line_total="8.49",
        )

        observed = build_observed_products.build_observed_products(
            [first_purchase, second_purchase]
        )

        self.assertEqual(1, len(observed))
        product = observed[0]
        self.assertEqual("2", product["times_seen"])
        self.assertEqual("2026-01-01", product["first_seen_date"])
        self.assertEqual("2026-01-10", product["last_seen_date"])
        self.assertEqual("111", product["representative_upc"])
        self.assertIn("SB GALA APPLE 5LB", product["raw_name_examples"])


# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||||
Reference in New Issue
Block a user