"""Build the combined purchases table, catalog, product links and examples.

Reads enriched item rows and order rows for the "giant" and "costco"
retailers, applies review resolutions and product links, derives
normalized price metrics, and writes the resulting CSV files.
"""

from decimal import Decimal
from pathlib import Path

import click

from enrich_giant import format_decimal, to_decimal
from layer_helpers import read_csv_rows, write_csv_rows

# Column order for the purchases output CSV.
PURCHASE_FIELDS = [
    "purchase_date",
    "retailer",
    "order_id",
    "line_no",
    "normalized_row_id",
    "normalized_item_id",
    "catalog_id",
    "review_status",
    "resolution_action",
    "raw_item_name",
    "normalized_item_name",
    "catalog_name",
    "category",
    "product_type",
    "brand",
    "variant",
    "image_url",
    "retailer_item_id",
    "upc",
    "qty",
    "unit",
    "normalized_quantity",
    "normalized_quantity_unit",
    "pack_qty",
    "size_value",
    "size_unit",
    "measure_type",
    "line_total",
    "unit_price",
    "matched_discount_amount",
    "net_line_total",
    "store_name",
    "store_number",
    "store_city",
    "store_state",
    "price_per_each",
    "price_per_each_basis",
    "price_per_count",
    "price_per_count_basis",
    "price_per_lb",
    "price_per_lb_basis",
    "price_per_oz",
    "price_per_oz_basis",
    "is_discount_line",
    "is_coupon_line",
    "is_fee",
    "raw_order_path",
]

# Column order for the comparison examples CSV.
EXAMPLE_FIELDS = [
    "example_name",
    "catalog_id",
    "giant_purchase_date",
    "giant_raw_item_name",
    "giant_price_per_lb",
    "costco_purchase_date",
    "costco_raw_item_name",
    "costco_price_per_lb",
    "notes",
]

# Column order for the catalog CSV.
CATALOG_FIELDS = [
    "catalog_id",
    "catalog_name",
    "category",
    "product_type",
    "brand",
    "variant",
    "size_value",
    "size_unit",
    "pack_qty",
    "measure_type",
    "notes",
    "created_at",
    "updated_at",
]

# Column order for the product links CSV.
PRODUCT_LINK_FIELDS = [
    "normalized_item_id",
    "catalog_id",
    "link_method",
    "link_confidence",
    "review_status",
    "reviewed_by",
    "reviewed_at",
    "link_notes",
]

# Column names of a review resolution row.
RESOLUTION_FIELDS = [
    "normalized_item_id",
    "catalog_id",
    "resolution_action",
    "status",
    "resolution_notes",
    "reviewed_at",
]


def derive_metrics(row):
    """Return the price-per-unit metrics and their basis labels for one row.

    The line total is taken from ``net_line_total`` when that value is truthy
    and from ``line_total`` otherwise. Values already present on the row for
    ``price_per_each``, ``price_per_lb`` and ``price_per_oz`` are used as the
    starting point and are only replaced when a branch below recomputes them.

    Returns a dict with the four ``price_per_*`` values and the matching
    ``price_per_*_basis`` labels; entries that could not be derived are "".
    """
    # NOTE(review): to_decimal presumably returns a Decimal, or None for
    # blank/unparseable input - confirm against enrich_giant.
    line_total = to_decimal(row.get("net_line_total") or row.get("line_total"))
    qty = to_decimal(row.get("qty"))
    pack_qty = to_decimal(row.get("pack_qty"))
    size_value = to_decimal(row.get("size_value"))
    picked_weight = to_decimal(row.get("picked_weight"))
    size_unit = row.get("size_unit", "")

    price_per_each = row.get("price_per_each", "")
    price_per_lb = row.get("price_per_lb", "")
    price_per_oz = row.get("price_per_oz", "")
    price_per_count = ""
    basis_each = ""
    basis_count = ""
    basis_lb = ""
    basis_oz = ""

    # A pre-existing per-each price is kept and labelled with the same basis
    # as a freshly computed one.
    if price_per_each:
        basis_each = "line_total_over_qty"
    elif line_total is not None and qty not in (None, 0):
        price_per_each = format_decimal(line_total / qty)
        basis_each = "line_total_over_qty"

    if line_total is not None and pack_qty not in (None, 0):
        # A missing or zero qty is treated as a single pack.
        total_count = pack_qty * (qty or Decimal("1"))
        if total_count not in (None, 0):
            price_per_count = format_decimal(line_total / total_count)
            basis_count = "line_total_over_pack_qty"

    if picked_weight not in (None, 0):
        # A picked weight takes precedence over the parsed size. When there is
        # no line total the lb/oz prices are blanked, but the basis labels are
        # still set.
        price_per_lb = (
            format_decimal(line_total / picked_weight)
            if line_total is not None
            else ""
        )
        price_per_oz = (
            format_decimal((line_total / picked_weight) / Decimal("16"))
            if line_total is not None
            else ""
        )
        basis_lb = "picked_weight_lb"
        basis_oz = "picked_weight_lb_to_oz"
    elif line_total is not None and size_value not in (None, 0):
        # Total size purchased = size per unit * units per pack * packs.
        total_units = (
            size_value * (pack_qty or Decimal("1")) * (qty or Decimal("1"))
        )
        if size_unit == "lb" and total_units not in (None, 0):
            per_lb = line_total / total_units
            price_per_lb = format_decimal(per_lb)
            price_per_oz = format_decimal(per_lb / Decimal("16"))
            basis_lb = "parsed_size_lb"
            basis_oz = "parsed_size_lb_to_oz"
        elif size_unit == "oz" and total_units not in (None, 0):
            per_oz = line_total / total_units
            price_per_oz = format_decimal(per_oz)
            price_per_lb = format_decimal(per_oz * Decimal("16"))
            basis_lb = "parsed_size_oz_to_lb"
            basis_oz = "parsed_size_oz"

    return {
        "price_per_each": price_per_each,
        "price_per_each_basis": basis_each,
        "price_per_count": price_per_count,
        "price_per_count_basis": basis_count,
        "price_per_lb": price_per_lb,
        "price_per_lb_basis": basis_lb,
        "price_per_oz": price_per_oz,
        "price_per_oz_basis": basis_oz,
    }


def order_lookup(rows, retailer):
    """Index order rows by ``(retailer, order_id)``.

    Later rows with the same ``order_id`` replace earlier ones.
    """
    return {(retailer, row["order_id"]): row for row in rows}


def read_optional_csv_rows(path):
    """Read CSV rows from *path*, returning ``[]`` when the file is absent."""
    path = Path(path)
    if not path.exists():
        return []
    return read_csv_rows(path)


def normalize_catalog_row(row):
    """Project *row* onto the catalog columns.

    ``catalog_id`` and ``catalog_name`` fall back to the alternate column
    names ``canonical_product_id`` and ``canonical_name`` when empty.
    """
    return {
        "catalog_id": row.get("catalog_id") or row.get("canonical_product_id", ""),
        "catalog_name": row.get("catalog_name") or row.get("canonical_name", ""),
        "category": row.get("category", ""),
        "product_type": row.get("product_type", ""),
        "brand": row.get("brand", ""),
        "variant": row.get("variant", ""),
        "size_value": row.get("size_value", ""),
        "size_unit": row.get("size_unit", ""),
        "pack_qty": row.get("pack_qty", ""),
        "measure_type": row.get("measure_type", ""),
        "notes": row.get("notes", ""),
        "created_at": row.get("created_at", ""),
        "updated_at": row.get("updated_at", ""),
    }


def is_review_first_catalog_row(row):
    """Return False for rows whose notes start with "auto-linked via".

    The comparison ignores surrounding whitespace and case. A missing or
    ``None`` ``notes`` value is treated as empty, so such rows are kept.
    """
    # `or ""` guards against a key that is present but holds None, which
    # would otherwise raise AttributeError on .strip().
    notes = (row.get("notes") or "").strip().lower()
    return not notes.startswith("auto-linked via")


def normalize_link_row(row):
    """Project *row* onto the product link columns.

    ``catalog_id`` falls back to ``canonical_product_id`` when empty.
    """
    return {
        "normalized_item_id": row.get("normalized_item_id", ""),
        "catalog_id": row.get("catalog_id") or row.get("canonical_product_id", ""),
        "link_method": row.get("link_method", ""),
        "link_confidence": row.get("link_confidence", ""),
        "review_status": row.get("review_status", ""),
        "reviewed_by": row.get("reviewed_by", ""),
        "reviewed_at": row.get("reviewed_at", ""),
        "link_notes": row.get("link_notes", ""),
    }


def normalize_resolution_row(row):
    """Project *row* onto the review resolution columns.

    ``catalog_id`` falls back to ``canonical_product_id`` when empty.
    """
    return {
        "normalized_item_id": row.get("normalized_item_id", ""),
        "catalog_id": row.get("catalog_id") or row.get("canonical_product_id", ""),
        "resolution_action": row.get("resolution_action", ""),
        "status": row.get("status", ""),
        "resolution_notes": row.get("resolution_notes", ""),
        "reviewed_at": row.get("reviewed_at", ""),
    }


def load_resolution_lookup(resolution_rows):
    """Index normalized resolution rows by ``normalized_item_id``.

    Rows without an id are skipped; the last row for an id wins.
    """
    lookup = {}
    for row in resolution_rows:
        normalized_row = normalize_resolution_row(row)
        normalized_item_id = normalized_row.get("normalized_item_id", "")
        if not normalized_item_id:
            continue
        lookup[normalized_item_id] = normalized_row
    return lookup


def merge_catalog_rows(existing_rows, new_rows):
    """Merge two lists of catalog rows into one list sorted by ``catalog_id``.

    Rows are normalized first. Rows without a catalog id are dropped, and for
    duplicate ids the later row wins (``new_rows`` override ``existing_rows``).
    """
    merged = {}
    for row in existing_rows + new_rows:
        normalized_row = normalize_catalog_row(row)
        catalog_id = normalized_row.get("catalog_id", "")
        if catalog_id:
            merged[catalog_id] = normalized_row
    return sorted(merged.values(), key=lambda row: row["catalog_id"])


def load_link_lookup(link_rows):
    """Index normalized link rows by ``normalized_item_id``.

    Rows without an id are skipped; the last row for an id wins.
    """
    lookup = {}
    for row in link_rows:
        normalized_row = normalize_link_row(row)
        normalized_item_id = normalized_row.get("normalized_item_id", "")
        if not normalized_item_id:
            continue
        lookup[normalized_item_id] = normalized_row
    return lookup


def build_purchase_rows(
    giant_enriched_rows,
    costco_enriched_rows,
    giant_orders,
    costco_orders,
    resolution_rows,
    link_rows=None,
    catalog_rows=None,
):
    """Build the purchases table and the effective product links.

    Approved resolutions are applied on top of the existing links: "link" and
    "create" actions with a catalog id add or replace the link for the item,
    and "exclude" removes it. Each enriched row is then joined with its link,
    catalog entry, order (for store details) and derived price metrics.

    Returns a ``(purchase_rows, link_rows)`` tuple. Purchase rows are ordered
    by order date, retailer, order id and numeric line number; link rows are
    ordered by ``normalized_item_id``.

    Raises ``KeyError`` when an enriched row lacks a required column and
    ``ValueError`` when ``line_no`` is not an integer string.
    """
    all_enriched_rows = giant_enriched_rows + costco_enriched_rows
    resolution_lookup = load_resolution_lookup(resolution_rows)
    link_lookup = load_link_lookup(link_rows or [])

    # Key on the *normalized* id so rows that only carry the alternate
    # `canonical_product_id` column resolve instead of raising KeyError.
    catalog_lookup = {}
    for row in catalog_rows or []:
        normalized_catalog_row = normalize_catalog_row(row)
        catalog_id = normalized_catalog_row["catalog_id"]
        if catalog_id:
            catalog_lookup[catalog_id] = normalized_catalog_row

    for normalized_item_id, resolution in resolution_lookup.items():
        action = resolution.get("resolution_action", "")
        status = resolution.get("status", "")
        if status != "approved":
            continue
        if action in {"link", "create"} and resolution.get("catalog_id"):
            link_lookup[normalized_item_id] = {
                "normalized_item_id": normalized_item_id,
                "catalog_id": resolution["catalog_id"],
                "link_method": f"manual_{action}",
                "link_confidence": "high",
                "review_status": status,
                "reviewed_by": "",
                "reviewed_at": resolution.get("reviewed_at", ""),
                "link_notes": resolution.get("resolution_notes", ""),
            }
        elif action == "exclude":
            link_lookup.pop(normalized_item_id, None)

    orders_by_id = {}
    orders_by_id.update(order_lookup(giant_orders, "giant"))
    orders_by_id.update(order_lookup(costco_orders, "costco"))

    purchase_rows = []
    for row in sorted(
        all_enriched_rows,
        key=lambda item: (
            item["order_date"],
            item["retailer"],
            item["order_id"],
            int(item["line_no"]),
        ),
    ):
        normalized_item_id = row.get("normalized_item_id", "")
        resolution = resolution_lookup.get(normalized_item_id, {})
        link_row = link_lookup.get(normalized_item_id, {})
        catalog_row = catalog_lookup.get(link_row.get("catalog_id", ""), {})
        order_row = orders_by_id.get((row["retailer"], row["order_id"]), {})
        metrics = derive_metrics(row)
        purchase_rows.append(
            {
                "purchase_date": row["order_date"],
                "retailer": row["retailer"],
                "order_id": row["order_id"],
                "line_no": row["line_no"],
                "normalized_row_id": row.get("normalized_row_id", ""),
                "normalized_item_id": normalized_item_id,
                "catalog_id": link_row.get("catalog_id", ""),
                "review_status": resolution.get("status", ""),
                "resolution_action": resolution.get("resolution_action", ""),
                "raw_item_name": row["item_name"],
                "normalized_item_name": row["item_name_norm"],
                "catalog_name": catalog_row.get("catalog_name", ""),
                "category": catalog_row.get("category", ""),
                "product_type": catalog_row.get("product_type", ""),
                "brand": catalog_row.get("brand", ""),
                "variant": catalog_row.get("variant", ""),
                "image_url": row.get("image_url", ""),
                "retailer_item_id": row["retailer_item_id"],
                "upc": row["upc"],
                "qty": row["qty"],
                "unit": row["unit"],
                "normalized_quantity": row.get("normalized_quantity", ""),
                "normalized_quantity_unit": row.get("normalized_quantity_unit", ""),
                "pack_qty": row["pack_qty"],
                "size_value": row["size_value"],
                "size_unit": row["size_unit"],
                "measure_type": row["measure_type"],
                "line_total": row["line_total"],
                "unit_price": row["unit_price"],
                "matched_discount_amount": row.get("matched_discount_amount", ""),
                "net_line_total": row.get("net_line_total", ""),
                "store_name": order_row.get("store_name", ""),
                "store_number": order_row.get("store_number", ""),
                "store_city": order_row.get("store_city", ""),
                "store_state": order_row.get("store_state", ""),
                "is_discount_line": row["is_discount_line"],
                "is_coupon_line": row["is_coupon_line"],
                "is_fee": row["is_fee"],
                "raw_order_path": row["raw_order_path"],
                # Derived price_per_* values and their basis labels.
                **metrics,
            }
        )
    return purchase_rows, sorted(
        link_lookup.values(), key=lambda row: row["normalized_item_id"]
    )


def build_comparison_examples(purchase_rows):
    """Build the banana price-per-lb comparison between Giant and Costco.

    Only rows named exactly "BANANA" that have a catalog id and a
    ``price_per_lb`` are considered; the last such row per retailer is used.
    Returns a one-element list, or ``[]`` when either retailer has no match.
    The example's ``catalog_id`` is taken from the Giant row.
    """
    giant_banana = None
    costco_banana = None
    for row in purchase_rows:
        if row.get("normalized_item_name") != "BANANA":
            continue
        if not row.get("catalog_id"):
            continue
        if row["retailer"] == "giant" and row.get("price_per_lb"):
            giant_banana = row
        if row["retailer"] == "costco" and row.get("price_per_lb"):
            costco_banana = row

    if not giant_banana or not costco_banana:
        return []

    return [
        {
            "example_name": "banana_price_per_lb",
            "catalog_id": giant_banana["catalog_id"],
            "giant_purchase_date": giant_banana["purchase_date"],
            "giant_raw_item_name": giant_banana["raw_item_name"],
            "giant_price_per_lb": giant_banana["price_per_lb"],
            "costco_purchase_date": costco_banana["purchase_date"],
            "costco_raw_item_name": costco_banana["raw_item_name"],
            "costco_price_per_lb": costco_banana["price_per_lb"],
            "notes": "Example comparison using normalized price_per_lb across Giant and Costco",
        }
    ]


@click.command()
@click.option("--giant-items-enriched-csv", default="data/giant-web/normalized_items.csv", show_default=True)
@click.option("--costco-items-enriched-csv", default="data/costco-web/normalized_items.csv", show_default=True)
@click.option("--giant-orders-csv", default="data/giant-web/collected_orders.csv", show_default=True)
@click.option("--costco-orders-csv", default="data/costco-web/collected_orders.csv", show_default=True)
@click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True)
@click.option("--catalog-csv", default="data/catalog.csv", show_default=True)
@click.option("--links-csv", default="data/review/product_links.csv", show_default=True)
@click.option("--output-csv", default="data/review/purchases.csv", show_default=True)
@click.option("--examples-csv", default="data/review/comparison_examples.csv", show_default=True)
def main(
    giant_items_enriched_csv,
    costco_items_enriched_csv,
    giant_orders_csv,
    costco_orders_csv,
    resolutions_csv,
    catalog_csv,
    links_csv,
    output_csv,
    examples_csv,
):
    """Build and write the purchases, catalog, links and examples CSVs.

    The resolutions, catalog and links inputs are optional (a missing file is
    read as empty); the item and order inputs are read unconditionally. The
    catalog and links files are rewritten in place, and catalog rows whose
    notes start with "auto-linked via" are dropped in the process.
    """
    resolution_rows = read_optional_csv_rows(resolutions_csv)
    catalog_rows = merge_catalog_rows(
        [
            row
            for row in read_optional_csv_rows(catalog_csv)
            if is_review_first_catalog_row(row)
        ],
        [],
    )
    existing_links = [
        normalize_link_row(row) for row in read_optional_csv_rows(links_csv)
    ]

    purchase_rows, link_rows = build_purchase_rows(
        read_csv_rows(giant_items_enriched_csv),
        read_csv_rows(costco_items_enriched_csv),
        read_csv_rows(giant_orders_csv),
        read_csv_rows(costco_orders_csv),
        resolution_rows,
        existing_links,
        catalog_rows,
    )
    example_rows = build_comparison_examples(purchase_rows)

    write_csv_rows(catalog_csv, catalog_rows, CATALOG_FIELDS)
    write_csv_rows(links_csv, link_rows, PRODUCT_LINK_FIELDS)
    write_csv_rows(output_csv, purchase_rows, PURCHASE_FIELDS)
    write_csv_rows(examples_csv, example_rows, EXAMPLE_FIELDS)

    click.echo(
        f"wrote {len(purchase_rows)} purchase rows to {output_csv}, "
        f"{len(catalog_rows)} catalog rows to {catalog_csv}, "
        f"{len(link_rows)} product links to {links_csv}, "
        f"and {len(example_rows)} comparison examples to {examples_csv}"
    )


if __name__ == "__main__":
    main()