"""Build combined Giant/Costco product outputs with a manual BANANA proof merge.

Reads the enriched item CSVs for both retailers, builds the observed and
canonical product layers, force-links the Giant and Costco BANANA observed
products to a single proof canonical product, and writes the resulting CSVs.
"""

import csv
import json
from pathlib import Path

import click

import build_canonical_layer
import build_observed_products
from layer_helpers import stable_id, write_csv_rows

# Column order of proof_examples.csv.
PROOF_FIELDS = [
    "proof_name",
    "canonical_product_id",
    "giant_observed_product_id",
    "costco_observed_product_id",
    "giant_example_item",
    "costco_example_item",
    "notes",
]


def read_rows(path):
    """Return every row of the UTF-8 CSV at *path* as a list of dicts."""
    with Path(path).open(newline="", encoding="utf-8") as handle:
        return list(csv.DictReader(handle))


def find_proof_pair(observed_rows):
    """Locate the Giant and Costco observed products named ``BANANA``.

    Returns:
        A ``(giant_row, costco_row)`` tuple. Either element is ``None`` when
        that retailer has no row whose ``representative_name_norm`` is
        ``"BANANA"``. If a retailer has several matching rows, the last one
        in *observed_rows* is returned.
    """
    giant = None
    costco = None
    for row in observed_rows:
        retailer = row["retailer"]
        if retailer not in ("giant", "costco"):
            continue
        if row["representative_name_norm"] != "BANANA":
            continue
        if retailer == "giant":
            giant = row
        else:
            costco = row
    return giant, costco


def merge_proof_pair(canonical_rows, link_rows, giant_row, costco_row):
    """Link both proof observed products to one shared canonical product.

    Args:
        canonical_rows: Canonical product rows (list of dicts).
        link_rows: Observed-to-canonical link rows (list of dicts).
        giant_row: Giant observed product row, or a falsy value if missing.
        costco_row: Costco observed product row, or a falsy value if missing.

    Returns:
        ``(canonical_rows, link_rows, proof_rows)``. When either retailer row
        is missing the inputs are returned untouched with an empty
        ``proof_rows``. Otherwise new lists are returned; the input lists are
        not mutated.
    """
    if not giant_row or not costco_row:
        return canonical_rows, link_rows, []

    proof_canonical_id = stable_id("gcan", "proof|banana")
    giant_observed_id = giant_row["observed_product_id"]
    costco_observed_id = costco_row["observed_product_id"]
    # Built once instead of once per link row.
    proof_observed_ids = {giant_observed_id, costco_observed_id}

    # Drop any existing links for the proof pair so that each observed product
    # ends up linked only to the proof canonical product.
    # NOTE(review): canonical rows that were referenced only by the dropped
    # links are left in place -- confirm whether such orphans should be pruned.
    link_rows = [
        row
        for row in link_rows
        if row["observed_product_id"] not in proof_observed_ids
    ]
    # Remove a previously written proof canonical row so it is never duplicated.
    canonical_rows = [
        row
        for row in canonical_rows
        if row["canonical_product_id"] != proof_canonical_id
    ]

    canonical_rows.append(
        {
            "canonical_product_id": proof_canonical_id,
            "canonical_name": "BANANA",
            "product_type": "banana",
            "brand": "",
            "variant": "",
            "size_value": "",
            "size_unit": "",
            "pack_qty": "",
            "measure_type": "weight",
            "normalized_quantity": "",
            "normalized_quantity_unit": "",
            "notes": "manual proof merge for cross-retailer validation",
            "created_at": "",
            "updated_at": "",
        }
    )

    link_rows.extend(
        {
            "observed_product_id": observed_id,
            "canonical_product_id": proof_canonical_id,
            "link_method": "manual_proof_merge",
            "link_confidence": "medium",
            "review_status": "",
            "reviewed_by": "",
            "reviewed_at": "",
            "link_notes": "cross-retailer validation proof",
        }
        for observed_id in (giant_observed_id, costco_observed_id)
    )

    proof_rows = [
        {
            "proof_name": "banana",
            "canonical_product_id": proof_canonical_id,
            "giant_observed_product_id": giant_observed_id,
            "costco_observed_product_id": costco_observed_id,
            "giant_example_item": giant_row["example_item_name"],
            "costco_example_item": costco_row["example_item_name"],
            "notes": "BANANA proof pair built from Giant and Costco enriched rows",
        }
    ]
    return canonical_rows, link_rows, proof_rows


@click.command()
@click.option(
    "--giant-items-enriched-csv",
    default="giant_output/items_enriched.csv",
    show_default=True,
)
@click.option(
    "--costco-items-enriched-csv",
    default="costco_output/items_enriched.csv",
    show_default=True,
)
@click.option(
    "--outdir",
    default="combined_output",
    show_default=True,
)
def main(giant_items_enriched_csv, costco_items_enriched_csv, outdir):
    """Write combined observed, canonical, link and proof CSVs into OUTDIR.

    Raises:
        click.ClickException: If no BANANA observed product exists for both
            Giant and Costco.
    """
    outdir = Path(outdir)

    rows = read_rows(giant_items_enriched_csv) + read_rows(costco_items_enriched_csv)
    observed_rows = build_observed_products.build_observed_products(rows)
    canonical_rows, link_rows = build_canonical_layer.build_canonical_layer(
        observed_rows
    )

    giant_row, costco_row = find_proof_pair(observed_rows)
    if not giant_row or not costco_row:
        raise click.ClickException(
            "could not find BANANA proof pair across Giant and Costco observed products"
        )

    canonical_rows, link_rows, proof_rows = merge_proof_pair(
        canonical_rows, link_rows, giant_row, costco_row
    )

    # Create the output directory up front rather than depending on
    # write_csv_rows to do so; done after the proof check so that a failed run
    # leaves nothing behind.
    outdir.mkdir(parents=True, exist_ok=True)

    write_csv_rows(
        outdir / "products_observed.csv",
        observed_rows,
        build_observed_products.OUTPUT_FIELDS,
    )
    write_csv_rows(
        outdir / "products_canonical.csv",
        canonical_rows,
        build_canonical_layer.CANONICAL_FIELDS,
    )
    write_csv_rows(
        outdir / "product_links.csv",
        link_rows,
        build_canonical_layer.LINK_FIELDS,
    )
    write_csv_rows(outdir / "proof_examples.csv", proof_rows, PROOF_FIELDS)

    click.echo(
        f"wrote combined outputs to {outdir} using {len(observed_rows)} observed rows"
    )


if __name__ == "__main__":
    main()