# 148 lines, 4.8 KiB, Python
from collections import defaultdict
|
|
|
|
import click
|
|
|
|
from layer_helpers import (
|
|
compact_join,
|
|
distinct_values,
|
|
first_nonblank,
|
|
read_csv_rows,
|
|
representative_value,
|
|
stable_id,
|
|
write_csv_rows,
|
|
)
|
|
|
|
|
|
# Column order for the observed-products CSV; main() hands this list to
# write_csv_rows together with the rows from build_observed_products().
OUTPUT_FIELDS = [
    # Identity of the observed product.
    "observed_product_id",
    "retailer",
    "observed_key",
    # Representative attributes chosen from the grouped rows.
    "representative_upc",
    "representative_item_name",
    "representative_name_norm",
    "representative_brand",
    "representative_variant",
    "representative_size_value",
    "representative_size_unit",
    "representative_pack_qty",
    "representative_measure_type",
    "representative_image_url",
    "is_store_brand",
    "is_fee",
    # Sighting window and frequency.
    "first_seen_date",
    "last_seen_date",
    "times_seen",
    # Examples taken from the grouped rows.
    "example_order_id",
    "example_item_name",
    "raw_name_examples",
    "normalized_name_examples",
    "example_prices",
    # Distinct-value counts within the group.
    "distinct_item_names_count",
    "distinct_upcs_count",
]
|
|
|
|
|
|
def build_observed_key(row):
    """Compose the pipe-delimited grouping key for one item row.

    A row with a truthy ``upc`` is keyed by retailer, UPC and normalized
    name. Any other row (missing or blank ``upc``) falls back to retailer,
    normalized name, and the size, unit, pack, measure, store-brand and fee
    columns.

    Args:
        row: Mapping for one item row; it must contain every column used by
            the branch that applies (a missing column raises ``KeyError``).

    Returns:
        The key as a single string whose segments are joined with ``|``.
    """
    if not row.get("upc"):
        # No usable UPC: identify the product by its descriptive attributes.
        segments = (
            row["retailer"],
            f"name={row['item_name_norm']}",
            f"size={row['size_value']}",
            f"unit={row['size_unit']}",
            f"pack={row['pack_qty']}",
            f"measure={row['measure_type']}",
            f"store_brand={row['is_store_brand']}",
            f"fee={row['is_fee']}",
        )
    else:
        segments = (
            row["retailer"],
            f"upc={row['upc']}",
            f"name={row['item_name_norm']}",
        )
    return "|".join(segments)
|
|
|
|
|
|
def build_observed_products(rows):
    """Collapse item rows into one summary record per observed key.

    Rows are bucketed by ``build_observed_key``. Each bucket is ordered by
    ``(order_date, order_id, int(line_no))`` and summarized into a dict whose
    keys are listed in insertion order below. The first row of that ordering
    supplies the retailer, first-seen date and example fields; the last row
    supplies the last-seen date.

    Args:
        rows: Iterable of mapping rows. Besides the columns read by
            ``build_observed_key``, each row is indexed directly for
            ``order_date``, ``order_id``, ``line_no``, ``retailer`` and
            ``item_name``; the remaining columns are read via the
            ``layer_helpers`` functions.

    Returns:
        A list of summary dicts sorted by ``observed_product_id``. The
        ``times_seen`` and ``distinct_*_count`` values are strings.
    """

    def chronological(entry):
        # line_no is converted with int() so it orders numerically.
        return (entry["order_date"], entry["order_id"], int(entry["line_no"]))

    # (output field, source column) pairs resolved through representative_value.
    representative_columns = (
        ("representative_upc", "upc"),
        ("representative_item_name", "item_name"),
        ("representative_name_norm", "item_name_norm"),
        ("representative_brand", "brand_guess"),
        ("representative_variant", "variant"),
        ("representative_size_value", "size_value"),
        ("representative_size_unit", "size_unit"),
        ("representative_pack_qty", "pack_qty"),
        ("representative_measure_type", "measure_type"),
    )

    buckets = defaultdict(list)
    for entry in rows:
        buckets[build_observed_key(entry)].append(entry)

    products = []
    for key in sorted(buckets):
        sightings = sorted(buckets[key], key=chronological)
        earliest = sightings[0]
        latest = sightings[-1]

        record = {
            "observed_product_id": stable_id("gobs", key),
            "retailer": earliest["retailer"],
            "observed_key": key,
        }
        for field, column in representative_columns:
            record[field] = representative_value(sightings, column)
        record.update(
            {
                "representative_image_url": first_nonblank(sightings, "image_url"),
                "is_store_brand": representative_value(sightings, "is_store_brand"),
                "is_fee": representative_value(sightings, "is_fee"),
                "first_seen_date": earliest["order_date"],
                "last_seen_date": latest["order_date"],
                "times_seen": str(len(sightings)),
                "example_order_id": earliest["order_id"],
                "example_item_name": earliest["item_name"],
                "raw_name_examples": compact_join(
                    distinct_values(sightings, "item_name"), limit=4
                ),
                "normalized_name_examples": compact_join(
                    distinct_values(sightings, "item_name_norm"), limit=4
                ),
                "example_prices": compact_join(
                    distinct_values(sightings, "line_total"), limit=4
                ),
                "distinct_item_names_count": str(
                    len(distinct_values(sightings, "item_name"))
                ),
                "distinct_upcs_count": str(len(distinct_values(sightings, "upc"))),
            }
        )
        products.append(record)

    # Records were built in key order; the stable sort keeps that order
    # among any records that share an observed_product_id.
    return sorted(products, key=lambda product: product["observed_product_id"])
|
|
|
|
|
|
@click.command()
@click.option(
    "--items-enriched-csv",
    help="Path to enriched Giant item rows.",
    default="giant_output/items_enriched.csv",
    show_default=True,
)
@click.option(
    "--output-csv",
    help="Path to observed product output.",
    default="giant_output/products_observed.csv",
    show_default=True,
)
def main(items_enriched_csv, output_csv):
    # NOTE: described with comments rather than a docstring, because click
    # would surface a docstring as this command's --help text.
    #
    # Reads the enriched item rows from items_enriched_csv, aggregates them
    # with build_observed_products, writes the result to output_csv using the
    # OUTPUT_FIELDS column order, then echoes the number of rows written.
    observed_rows = build_observed_products(read_csv_rows(items_enriched_csv))
    write_csv_rows(output_csv, observed_rows, OUTPUT_FIELDS)
    click.echo(f"wrote {len(observed_rows)} rows to {output_csv}")
|
|
|
|
|
|
# Invoke the click command only when this file is executed as a script,
# so importing the module does not trigger the CLI.
if __name__ == "__main__":
    main()
|