Generate Giant observed products
This commit is contained in:
147
build_observed_products.py
Normal file
147
build_observed_products.py
Normal file
@@ -0,0 +1,147 @@
|
||||
from collections import defaultdict
|
||||
|
||||
import click
|
||||
|
||||
from layer_helpers import (
|
||||
compact_join,
|
||||
distinct_values,
|
||||
first_nonblank,
|
||||
read_csv_rows,
|
||||
representative_value,
|
||||
stable_id,
|
||||
write_csv_rows,
|
||||
)
|
||||
|
||||
|
||||
# Column order handed to write_csv_rows() for the observed-products output.
# Every name here is a key of the dicts produced by build_observed_products().
OUTPUT_FIELDS = [
    # Identity of the observed product.
    "observed_product_id",
    "retailer",
    "observed_key",
    # Representative attributes chosen from the grouped rows.
    "representative_upc",
    "representative_item_name",
    "representative_name_norm",
    "representative_brand",
    "representative_variant",
    "representative_size_value",
    "representative_size_unit",
    "representative_pack_qty",
    "representative_measure_type",
    "representative_image_url",
    "is_store_brand",
    "is_fee",
    # Observation history.
    "first_seen_date",
    "last_seen_date",
    "times_seen",
    # Examples kept for manual review.
    "example_order_id",
    "example_item_name",
    "raw_name_examples",
    "normalized_name_examples",
    "example_prices",
    # Variety counters within the group.
    "distinct_item_names_count",
    "distinct_upcs_count",
]
|
||||
|
||||
|
||||
def build_observed_key(row):
    """Return the grouping key string for one enriched item row.

    Rows with a non-empty ``upc`` are keyed by retailer, UPC and normalized
    item name.  Rows without one fall back to a wider key built from the
    retailer, normalized name, size, unit, pack quantity, measure type and
    the store-brand / fee flags.

    Args:
        row: Mapping of column name to value for a single item row.

    Returns:
        The key parts joined with ``"|"``.

    Raises:
        KeyError: If a column needed by the selected branch is missing
            (``upc`` itself is optional and read with ``.get``).
    """
    if row.get("upc"):
        parts = [
            row["retailer"],
            f"upc={row['upc']}",
            f"name={row['item_name_norm']}",
        ]
        return "|".join(parts)

    # No UPC: widen the key with the descriptive attributes instead.
    parts = [
        row["retailer"],
        f"name={row['item_name_norm']}",
        f"size={row['size_value']}",
        f"unit={row['size_unit']}",
        f"pack={row['pack_qty']}",
        f"measure={row['measure_type']}",
        f"store_brand={row['is_store_brand']}",
        f"fee={row['is_fee']}",
    ]
    return "|".join(parts)
|
||||
|
||||
|
||||
def _summarize_group(observed_key, group_rows):
    """Collapse all rows sharing ``observed_key`` into one output dict."""
    # line_no is cast to int so the tie-break is numeric, not lexicographic.
    # order_date / order_id are compared as the raw values from the row.
    ordered = sorted(
        group_rows,
        key=lambda row: (row["order_date"], row["order_id"], int(row["line_no"])),
    )
    earliest = ordered[0]
    latest = ordered[-1]

    # Computed once and reused for both the examples column and the count.
    item_names = distinct_values(ordered, "item_name")

    return {
        "observed_product_id": stable_id("gobs", observed_key),
        "retailer": earliest["retailer"],
        "observed_key": observed_key,
        "representative_upc": representative_value(ordered, "upc"),
        "representative_item_name": representative_value(ordered, "item_name"),
        "representative_name_norm": representative_value(
            ordered, "item_name_norm"
        ),
        "representative_brand": representative_value(ordered, "brand_guess"),
        "representative_variant": representative_value(ordered, "variant"),
        "representative_size_value": representative_value(ordered, "size_value"),
        "representative_size_unit": representative_value(ordered, "size_unit"),
        "representative_pack_qty": representative_value(ordered, "pack_qty"),
        "representative_measure_type": representative_value(
            ordered, "measure_type"
        ),
        "representative_image_url": first_nonblank(ordered, "image_url"),
        "is_store_brand": representative_value(ordered, "is_store_brand"),
        "is_fee": representative_value(ordered, "is_fee"),
        "first_seen_date": earliest["order_date"],
        "last_seen_date": latest["order_date"],
        "times_seen": str(len(ordered)),
        "example_order_id": earliest["order_id"],
        "example_item_name": earliest["item_name"],
        "raw_name_examples": compact_join(item_names, limit=4),
        "normalized_name_examples": compact_join(
            distinct_values(ordered, "item_name_norm"), limit=4
        ),
        "example_prices": compact_join(
            distinct_values(ordered, "line_total"), limit=4
        ),
        "distinct_item_names_count": str(len(item_names)),
        "distinct_upcs_count": str(len(distinct_values(ordered, "upc"))),
    }


def build_observed_products(rows):
    """Group item rows by observed key and emit one summary dict per group.

    Each row is bucketed by ``build_observed_key(row)``.  Within a bucket the
    rows are sorted by ``(order_date, order_id, int(line_no))``; the first
    sorted row supplies the retailer, first-seen date and example order/name,
    and the last supplies the last-seen date.  The remaining columns are
    delegated to the ``layer_helpers`` functions (``representative_value``,
    ``first_nonblank``, ``distinct_values``, ``compact_join``, ``stable_id``).

    Args:
        rows: Iterable of row mappings; each needs the columns read by
            ``build_observed_key`` plus the ones summarized here.

    Returns:
        A list of dicts keyed by the names in ``OUTPUT_FIELDS``, sorted by
        ``observed_product_id``.  Counts are returned as strings.

    Raises:
        KeyError: If a row lacks a directly indexed column.
        ValueError: If a row's ``line_no`` cannot be converted with ``int``.
    """
    grouped = defaultdict(list)
    for row in rows:
        grouped[build_observed_key(row)].append(row)

    # Visit keys in sorted order so that the stable sort below keeps a
    # deterministic order even if two keys map to the same product id.
    observed_rows = [
        _summarize_group(observed_key, grouped[observed_key])
        for observed_key in sorted(grouped)
    ]
    observed_rows.sort(key=lambda row: row["observed_product_id"])
    return observed_rows
|
||||
|
||||
|
||||
@click.command()
@click.option(
    "--items-enriched-csv",
    default="giant_output/items_enriched.csv",
    show_default=True,
    help="Path to enriched Giant item rows.",
)
@click.option(
    "--output-csv",
    default="giant_output/products_observed.csv",
    show_default=True,
    help="Path to observed product output.",
)
def main(items_enriched_csv, output_csv):
    """Build the observed-products CSV from enriched Giant item rows."""
    # Read -> aggregate -> write, then report how many rows were emitted.
    rows = read_csv_rows(items_enriched_csv)
    observed_rows = build_observed_products(rows)
    write_csv_rows(output_csv, observed_rows, OUTPUT_FIELDS)
    click.echo(f"wrote {len(observed_rows)} rows to {output_csv}")
|
||||
|
||||
|
||||
# Run the click command only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user