"""Collapse enriched item rows into one summary row per observed product.

Rows are read from a CSV, grouped by the key from ``build_observed_key``,
summarized per group, and written back out using ``OUTPUT_FIELDS`` as the
column list.
"""

from collections import defaultdict

import click

from layer_helpers import (
    compact_join,
    distinct_values,
    first_nonblank,
    read_csv_rows,
    representative_value,
    stable_id,
    write_csv_rows,
)

# Field list handed to write_csv_rows for the observed-product output.
OUTPUT_FIELDS = [
    "observed_product_id",
    "retailer",
    "observed_key",
    "representative_upc",
    "representative_item_name",
    "representative_name_norm",
    "representative_brand",
    "representative_variant",
    "representative_size_value",
    "representative_size_unit",
    "representative_pack_qty",
    "representative_measure_type",
    "representative_image_url",
    "is_store_brand",
    "is_fee",
    "first_seen_date",
    "last_seen_date",
    "times_seen",
    "example_order_id",
    "example_item_name",
    "raw_name_examples",
    "normalized_name_examples",
    "example_prices",
    "distinct_item_names_count",
    "distinct_upcs_count",
]


def build_observed_key(row):
    """Return the pipe-delimited grouping key for a single item row.

    A row with a truthy ``upc`` is keyed on retailer, UPC and normalized
    name. Any other row is keyed on retailer, normalized name, size value,
    size unit, pack quantity, measure type, and the store-brand and fee
    flags.

    Raises:
        KeyError: if a field needed by the chosen key shape is missing.
    """
    if not row.get("upc"):
        # No usable UPC: fall back to the descriptive attributes.
        parts = [
            row["retailer"],
            f"name={row['item_name_norm']}",
            f"size={row['size_value']}",
            f"unit={row['size_unit']}",
            f"pack={row['pack_qty']}",
            f"measure={row['measure_type']}",
            f"store_brand={row['is_store_brand']}",
            f"fee={row['is_fee']}",
        ]
    else:
        parts = [
            row["retailer"],
            f"upc={row['upc']}",
            f"name={row['item_name_norm']}",
        ]
    return "|".join(parts)


def _chronological_key(row):
    # Sort by order date, then order id, then line number compared as an
    # integer (so line 10 sorts after line 9).
    return (row["order_date"], row["order_id"], int(row["line_no"]))


def _summarize_group(observed_key, history):
    """Build the output row for one group of item rows.

    ``history`` must be non-empty and already in chronological order: its
    first and last entries supply the first-seen / last-seen dates and the
    example order and item name. The ``representative_*`` columns, the
    example lists and the distinct counts are delegated to ``layer_helpers``
    functions; their selection rules are not defined in this module.
    """
    earliest = history[0]
    latest = history[-1]
    return {
        "observed_product_id": stable_id("gobs", observed_key),
        "retailer": earliest["retailer"],
        "observed_key": observed_key,
        "representative_upc": representative_value(history, "upc"),
        "representative_item_name": representative_value(history, "item_name"),
        "representative_name_norm": representative_value(
            history, "item_name_norm"
        ),
        "representative_brand": representative_value(history, "brand_guess"),
        "representative_variant": representative_value(history, "variant"),
        "representative_size_value": representative_value(history, "size_value"),
        "representative_size_unit": representative_value(history, "size_unit"),
        "representative_pack_qty": representative_value(history, "pack_qty"),
        "representative_measure_type": representative_value(
            history, "measure_type"
        ),
        "representative_image_url": first_nonblank(history, "image_url"),
        "is_store_brand": representative_value(history, "is_store_brand"),
        "is_fee": representative_value(history, "is_fee"),
        "first_seen_date": earliest["order_date"],
        "last_seen_date": latest["order_date"],
        # Counts are stringified here, before the row reaches the writer.
        "times_seen": str(len(history)),
        "example_order_id": earliest["order_id"],
        "example_item_name": earliest["item_name"],
        "raw_name_examples": compact_join(
            distinct_values(history, "item_name"), limit=4
        ),
        "normalized_name_examples": compact_join(
            distinct_values(history, "item_name_norm"), limit=4
        ),
        "example_prices": compact_join(
            distinct_values(history, "line_total"), limit=4
        ),
        "distinct_item_names_count": str(
            len(distinct_values(history, "item_name"))
        ),
        "distinct_upcs_count": str(len(distinct_values(history, "upc"))),
    }


def build_observed_products(rows):
    """Group item rows by observed key and return one summary row per group.

    Args:
        rows: iterable of item-row mappings carrying the fields read by
            ``build_observed_key`` plus the order and line fields used for
            sorting and summarizing.

    Returns:
        A list of summary dicts sorted by ``observed_product_id``.
    """
    buckets = defaultdict(list)
    for item in rows:
        buckets[build_observed_key(item)].append(item)

    # Walk the keys in sorted order so that the stable sort below breaks any
    # tie on observed_product_id by observed key.
    summaries = [
        _summarize_group(key, sorted(buckets[key], key=_chronological_key))
        for key in sorted(buckets)
    ]
    return sorted(summaries, key=lambda summary: summary["observed_product_id"])


# CLI entry point: read the enriched rows, build the observed products, write
# them out, and report the row count. Documented with a comment rather than a
# docstring because click would display a docstring as the command's help text.
@click.command()
@click.option(
    "--items-enriched-csv",
    default="giant_output/items_enriched.csv",
    show_default=True,
    help="Path to enriched Giant item rows.",
)
@click.option(
    "--output-csv",
    default="giant_output/products_observed.csv",
    show_default=True,
    help="Path to observed product output.",
)
def main(items_enriched_csv, output_csv):
    observed_rows = build_observed_products(read_csv_rows(items_enriched_csv))
    write_csv_rows(output_csv, observed_rows, OUTPUT_FIELDS)
    click.echo(f"wrote {len(observed_rows)} rows to {output_csv}")


if __name__ == "__main__":
    main()