Add Costco acquisition and enrich flow
This commit is contained in:
154
validate_cross_retailer_flow.py
Normal file
154
validate_cross_retailer_flow.py
Normal file
@@ -0,0 +1,154 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
|
||||
import build_canonical_layer
|
||||
import build_observed_products
|
||||
from layer_helpers import stable_id, write_csv_rows
|
||||
|
||||
|
||||
# Column order used when writing proof_examples.csv; the keys match the
# proof row dict assembled in merge_proof_pair().
PROOF_FIELDS = [
    "proof_name",
    "canonical_product_id",
    "giant_observed_product_id",
    "costco_observed_product_id",
    "giant_example_item",
    "costco_example_item",
    "notes",
]
|
||||
|
||||
|
||||
def read_rows(path):
    """Load a CSV file into a list of dicts keyed by its header row.

    Args:
        path: Location of the CSV file; anything ``pathlib.Path`` accepts.

    Returns:
        A list with one dict per data row, in file order. Every value is a
        string, exactly as ``csv.DictReader`` yields it.

    Raises:
        OSError: If the file cannot be opened (for example, it is missing).
    """
    # Function-local import keeps this helper self-contained.
    import csv

    # newline="" hands line-ending handling to the csv module so quoted
    # fields containing embedded newlines are parsed correctly.
    with Path(path).open(newline="", encoding="utf-8") as handle:
        return list(csv.DictReader(handle))
|
||||
|
||||
|
||||
def find_proof_pair(observed_rows, name_norm="BANANA"):
    """Pick one Giant row and one Costco row that share a normalized name.

    Args:
        observed_rows: Iterable of observed-product dicts. Each must carry a
            ``"retailer"`` key; rows whose retailer is ``"giant"`` or
            ``"costco"`` must also carry ``"representative_name_norm"``.
        name_norm: Normalized product name to look for. Defaults to
            ``"BANANA"``, the only name the original implementation matched.

    Returns:
        A ``(giant_row, costco_row)`` tuple. Either element is ``None`` when
        that retailer has no matching row. When a retailer has several
        matches, the last one in iteration order is returned.
    """
    giant = None
    costco = None
    for row in observed_rows:
        retailer = row["retailer"]
        # Only rows from the two retailers of interest are inspected further,
        # so other rows never need a representative_name_norm key.
        if retailer not in ("giant", "costco"):
            continue
        if row["representative_name_norm"] != name_norm:
            continue
        # No early exit: a later match deliberately overwrites an earlier one.
        if retailer == "giant":
            giant = row
        else:
            costco = row
    return giant, costco
|
||||
|
||||
|
||||
def merge_proof_pair(canonical_rows, link_rows, giant_row, costco_row):
    """Link a Giant and a Costco observed product to one shared canonical row.

    Any existing links for the two observed products are dropped, any
    canonical row already using the proof id is dropped, and a fresh BANANA
    canonical row plus one link per retailer is appended. The input lists are
    not mutated; new lists are built and returned.

    Args:
        canonical_rows: List of canonical-product dicts, each with a
            ``"canonical_product_id"`` key.
        link_rows: List of link dicts, each with an ``"observed_product_id"``
            key.
        giant_row: Giant observed-product dict, with ``"observed_product_id"``
            and ``"example_item_name"`` keys, or a falsy value.
        costco_row: Costco observed-product dict with the same keys, or a
            falsy value.

    Returns:
        A ``(canonical_rows, link_rows, proof_rows)`` tuple. When either
        observed row is falsy the two input lists are returned unchanged
        together with an empty ``proof_rows`` list; otherwise ``proof_rows``
        holds a single dict describing the merged pair.
    """
    if not giant_row or not costco_row:
        return canonical_rows, link_rows, []

    proof_canonical_id = stable_id("gcan", "proof|banana")

    # Built once up front rather than once per link row inside the filter.
    paired_observed_ids = {
        giant_row["observed_product_id"],
        costco_row["observed_product_id"],
    }

    # Drop the links the two observed products already had ...
    link_rows = [
        row
        for row in link_rows
        if row["observed_product_id"] not in paired_observed_ids
    ]
    # ... and any earlier copy of the proof canonical row, so the row
    # appended below is the only one carrying the proof id.
    canonical_rows = [
        row
        for row in canonical_rows
        if row["canonical_product_id"] != proof_canonical_id
    ]

    canonical_rows.append(
        {
            "canonical_product_id": proof_canonical_id,
            "canonical_name": "BANANA",
            "product_type": "banana",
            "brand": "",
            "variant": "",
            "size_value": "",
            "size_unit": "",
            "pack_qty": "",
            "measure_type": "weight",
            "normalized_quantity": "",
            "normalized_quantity_unit": "",
            "notes": "manual proof merge for cross-retailer validation",
            "created_at": "",
            "updated_at": "",
        }
    )

    # One link per retailer, Giant first, both pointing at the proof row.
    for observed_row in [giant_row, costco_row]:
        link_rows.append(
            {
                "observed_product_id": observed_row["observed_product_id"],
                "canonical_product_id": proof_canonical_id,
                "link_method": "manual_proof_merge",
                "link_confidence": "medium",
                "review_status": "",
                "reviewed_by": "",
                "reviewed_at": "",
                "link_notes": "cross-retailer validation proof",
            }
        )

    proof_rows = [
        {
            "proof_name": "banana",
            "canonical_product_id": proof_canonical_id,
            "giant_observed_product_id": giant_row["observed_product_id"],
            "costco_observed_product_id": costco_row["observed_product_id"],
            "giant_example_item": giant_row["example_item_name"],
            "costco_example_item": costco_row["example_item_name"],
            "notes": "BANANA proof pair built from Giant and Costco enriched rows",
        }
    ]
    return canonical_rows, link_rows, proof_rows
|
||||
|
||||
|
||||
@click.command()
@click.option(
    "--giant-items-enriched-csv",
    default="giant_output/items_enriched.csv",
    show_default=True,
)
@click.option(
    "--costco-items-enriched-csv",
    default="costco_output/items_enriched.csv",
    show_default=True,
)
@click.option(
    "--outdir",
    default="combined_output",
    show_default=True,
)
def main(giant_items_enriched_csv, costco_items_enriched_csv, outdir):
    """Build combined Giant and Costco product layers with a BANANA proof pair."""
    outdir = Path(outdir)

    # Giant rows first, then Costco rows, fed through the shared builders.
    rows = read_rows(giant_items_enriched_csv) + read_rows(costco_items_enriched_csv)
    observed_rows = build_observed_products.build_observed_products(rows)
    canonical_rows, link_rows = build_canonical_layer.build_canonical_layer(
        observed_rows
    )

    giant_row, costco_row = find_proof_pair(observed_rows)
    if not giant_row or not costco_row:
        # Abort before writing anything: without the pair there is no proof.
        raise click.ClickException(
            "could not find BANANA proof pair across Giant and Costco observed products"
        )
    canonical_rows, link_rows, proof_rows = merge_proof_pair(
        canonical_rows, link_rows, giant_row, costco_row
    )

    # Make sure the target directory exists before the first write.
    # NOTE(review): harmless if write_csv_rows already creates it; confirm.
    outdir.mkdir(parents=True, exist_ok=True)

    write_csv_rows(
        outdir / "products_observed.csv",
        observed_rows,
        build_observed_products.OUTPUT_FIELDS,
    )
    write_csv_rows(
        outdir / "products_canonical.csv",
        canonical_rows,
        build_canonical_layer.CANONICAL_FIELDS,
    )
    write_csv_rows(
        outdir / "product_links.csv",
        link_rows,
        build_canonical_layer.LINK_FIELDS,
    )
    write_csv_rows(outdir / "proof_examples.csv", proof_rows, PROOF_FIELDS)

    click.echo(
        f"wrote combined outputs to {outdir} using {len(observed_rows)} observed rows"
    )
|
||||
|
||||
|
||||
# Run the click command only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user