Refactor review pipeline around normalized items

2026-03-20 11:27:46 -04:00
parent 607c51038a
commit 9104781b93
6 changed files with 512 additions and 361 deletions
--- a/review_products.py
+++ b/review_products.py
@@ -10,8 +10,8 @@ from layer_helpers import compact_join, stable_id, write_csv_rows
 QUEUE_FIELDS = [
    "review_id",
    "retailer",
-    "observed_product_id",
-    "canonical_product_id",
+    "normalized_item_id",
+    "catalog_id",
    "reason_code",
    "priority",
    "raw_item_names",
@@ -26,36 +26,49 @@ QUEUE_FIELDS = [
    "updated_at",
 ]

+INFO_COLOR = "cyan"
+PROMPT_COLOR = "bright_yellow"
+WARNING_COLOR = "magenta"
+

 def build_review_queue(purchase_rows, resolution_rows):
-    by_observed = defaultdict(list)
+    by_normalized = defaultdict(list)
    resolution_lookup = build_purchases.load_resolution_lookup(resolution_rows)

    for row in purchase_rows:
-        observed_product_id = row.get("observed_product_id", "")
-        if not observed_product_id:
+        normalized_item_id = row.get("normalized_item_id", "")
+        if not normalized_item_id:
            continue
-        by_observed[observed_product_id].append(row)
+        by_normalized[normalized_item_id].append(row)

    today_text = str(date.today())
    queue_rows = []
-    for observed_product_id, rows in sorted(by_observed.items()):
-        current_resolution = resolution_lookup.get(observed_product_id, {})
+    for normalized_item_id, rows in sorted(by_normalized.items()):
+        current_resolution = resolution_lookup.get(normalized_item_id, {})
        if current_resolution.get("status") == "approved":
            continue
-        unresolved_rows = [row for row in rows if not row.get("canonical_product_id")]
+
+        unresolved_rows = [
+            row
+            for row in rows
+            if not row.get("catalog_id")
+            and row.get("is_item", "true") != "false"
+            and row.get("is_fee") != "true"
+            and row.get("is_discount_line") != "true"
+            and row.get("is_coupon_line") != "true"
+        ]
        if not unresolved_rows:
            continue

        retailers = sorted({row["retailer"] for row in rows})
-        review_id = stable_id("rvw", observed_product_id)
+        review_id = stable_id("rvw", normalized_item_id)
        queue_rows.append(
            {
                "review_id": review_id,
                "retailer": " | ".join(retailers),
-                "observed_product_id": observed_product_id,
-                "canonical_product_id": current_resolution.get("canonical_product_id", ""),
-                "reason_code": "missing_canonical_link",
+                "normalized_item_id": normalized_item_id,
+                "catalog_id": current_resolution.get("catalog_id", ""),
+                "reason_code": "missing_catalog_link",
                "priority": "high",
                "raw_item_names": compact_join(
                    sorted({row["raw_item_name"] for row in rows if row["raw_item_name"]}),
@@ -98,11 +111,6 @@ def save_catalog_rows(path, rows):
    write_csv_rows(path, rows, build_purchases.CATALOG_FIELDS)


-INFO_COLOR = "cyan"
-PROMPT_COLOR = "bright_yellow"
-WARNING_COLOR = "magenta"
-
-
 def sort_related_items(rows):
    return sorted(
        rows,
@@ -115,7 +123,7 @@ def sort_related_items(rows):
    )


-def build_canonical_suggestions(related_rows, catalog_rows, limit=3):
+def build_catalog_suggestions(related_rows, purchase_rows, catalog_rows, limit=3):
    normalized_names = {
        row.get("normalized_item_name", "").strip().upper()
        for row in related_rows
@@ -126,56 +134,52 @@ def build_canonical_suggestions(related_rows, catalog_rows, limit=3):
        for row in related_rows
        if row.get("upc", "").strip()
    }
+    catalog_by_id = {
+        row.get("catalog_id", ""): row for row in catalog_rows if row.get("catalog_id", "")
+    }
    suggestions = []
    seen_ids = set()

-    def add_matches(rows, reason):
-        for row in rows:
-            canonical_product_id = row.get("canonical_product_id", "")
-            if not canonical_product_id or canonical_product_id in seen_ids:
-                continue
-            seen_ids.add(canonical_product_id)
-            suggestions.append(
-                {
-                    "canonical_product_id": canonical_product_id,
-                    "canonical_name": row.get("canonical_name", ""),
-                    "reason": reason,
-                }
-            )
-            if len(suggestions) >= limit:
-                return True
-        return False
+    def add_catalog_id(catalog_id, reason):
+        if not catalog_id or catalog_id in seen_ids or catalog_id not in catalog_by_id:
+            return False
+        seen_ids.add(catalog_id)
+        catalog_row = catalog_by_id[catalog_id]
+        suggestions.append(
+            {
+                "catalog_id": catalog_id,
+                "catalog_name": catalog_row.get("catalog_name", ""),
+                "reason": reason,
+            }
+        )
+        return len(suggestions) >= limit

-    exact_upc_rows = [
-        row
-        for row in catalog_rows
-        if row.get("upc", "").strip() and row.get("upc", "").strip() in upcs
+    reviewed_purchase_rows = [
+        row for row in purchase_rows if row.get("catalog_id") and row.get("normalized_item_id")
    ]
-    if add_matches(exact_upc_rows, "exact upc"):
-        return suggestions
+    for row in reviewed_purchase_rows:
+        if row.get("upc", "").strip() and row.get("upc", "").strip() in upcs:
+            if add_catalog_id(row.get("catalog_id", ""), "exact upc"):
+                return suggestions

-    exact_name_rows = [
-        row
-        for row in catalog_rows
-        if row.get("canonical_name", "").strip().upper() in normalized_names
-    ]
-    if add_matches(exact_name_rows, "exact normalized name"):
-        return suggestions
+    for row in reviewed_purchase_rows:
+        if row.get("normalized_item_name", "").strip().upper() in normalized_names:
+            if add_catalog_id(row.get("catalog_id", ""), "exact normalized name"):
+                return suggestions

-    contains_rows = []
-    for row in catalog_rows:
-        canonical_name = row.get("canonical_name", "").strip().upper()
-        if not canonical_name:
+    for catalog_row in catalog_rows:
+        catalog_name = catalog_row.get("catalog_name", "").strip().upper()
+        if not catalog_name:
            continue
        for normalized_name in normalized_names:
-            if normalized_name in canonical_name or canonical_name in normalized_name:
-                contains_rows.append(row)
+            if normalized_name in catalog_name or catalog_name in normalized_name:
+                if add_catalog_id(catalog_row.get("catalog_id", ""), "catalog name contains match"):
+                    return suggestions
                break
-    add_matches(contains_rows, "canonical name contains match")
    return suggestions


-def build_display_lines(queue_row, related_rows):
+def build_display_lines(related_rows):
    lines = []
    for index, row in enumerate(sort_related_items(related_rows), start=1):
        lines.append(
@@ -197,41 +201,38 @@ def build_display_lines(queue_row, related_rows):
    return lines


-def observed_name(queue_row, related_rows):
+def normalized_label(queue_row, related_rows):
    if queue_row.get("normalized_names"):
        return queue_row["normalized_names"].split(" | ")[0]
    for row in related_rows:
        if row.get("normalized_item_name"):
            return row["normalized_item_name"]
-    return queue_row.get("observed_product_id", "")
+    return queue_row.get("normalized_item_id", "")


-def choose_existing_canonical(display_rows, observed_label, matched_count):
+def choose_existing_catalog(display_rows, normalized_name, matched_count):
    click.secho(
-        f"Select the canonical_name to associate {matched_count} items with:",
+        f"Select the catalog_name to associate {matched_count} items with:",
        fg=INFO_COLOR,
    )
    for index, row in enumerate(display_rows, start=1):
-        click.echo(f"  [{index}] {row['canonical_name']} | {row['canonical_product_id']}")
+        click.echo(f"  [{index}] {row['catalog_name']} | {row['catalog_id']}")
    choice = click.prompt(
        click.style("selection", fg=PROMPT_COLOR),
        type=click.IntRange(1, len(display_rows)),
    )
    chosen_row = display_rows[choice - 1]
    click.echo(
-        f'{matched_count} "{observed_label}" items and future matches will be associated '
-        f'with "{chosen_row["canonical_name"]}".'
-    )
-    click.secho(
-        "actions: [y]es  [n]o  [b]ack  [s]kip  [q]uit",
-        fg=PROMPT_COLOR,
+        f'{matched_count} "{normalized_name}" items and future matches will be associated '
+        f'with "{chosen_row["catalog_name"]}".'
    )
+    click.secho("actions: [y]es  [n]o  [b]ack  [s]kip  [q]uit", fg=PROMPT_COLOR)
    confirm = click.prompt(
        click.style("confirm", fg=PROMPT_COLOR),
        type=click.Choice(["y", "n", "b", "s", "q"]),
    )
    if confirm == "y":
-        return chosen_row["canonical_product_id"], ""
+        return chosen_row["catalog_id"], ""
    if confirm == "s":
        return "", "skip"
    if confirm == "q":
@@ -239,54 +240,43 @@ def choose_existing_canonical(display_rows, observed_label, matched_count):
    return "", "back"


-def prompt_resolution(queue_row, related_rows, catalog_rows, queue_index, queue_total):
-    suggestions = build_canonical_suggestions(related_rows, catalog_rows)
-    observed_label = observed_name(queue_row, related_rows)
+def prompt_resolution(queue_row, related_rows, purchase_rows, catalog_rows, queue_index, queue_total):
+    suggestions = build_catalog_suggestions(related_rows, purchase_rows, catalog_rows)
+    normalized_name = normalized_label(queue_row, related_rows)
    matched_count = len(related_rows)
    click.echo("")
    click.secho(
-        f"Review {queue_index}/{queue_total}: Resolve observed_product {observed_label} "
-        "to canonical_name [__]?",
+        f"Review {queue_index}/{queue_total}: Resolve normalized_item {normalized_name} "
+        "to catalog_name [__]?",
        fg=INFO_COLOR,
    )
    click.echo(f"{matched_count} matched items:")
-    for line in build_display_lines(queue_row, related_rows):
+    for line in build_display_lines(related_rows):
        click.echo(line)
    if suggestions:
-        click.echo(f"{len(suggestions)} canonical suggestions found:")
+        click.echo(f"{len(suggestions)} catalog_name suggestions found:")
        for index, suggestion in enumerate(suggestions, start=1):
-            click.echo(f" [{index}] {suggestion['canonical_name']}")
+            click.echo(f" [{index}] {suggestion['catalog_name']}")
    else:
-        click.echo("no canonical_name suggestions found")
-    click.secho(
-        "[l]ink existing  [n]ew canonical  e[x]clude  [s]kip  [q]uit:",
-        fg=PROMPT_COLOR,
-    )
-    action = click.prompt(
-        "",
-        type=click.Choice(["l", "n", "x", "s", "q"]),
-        prompt_suffix=" ",
-    )
+        click.echo("no catalog_name suggestions found")
+    click.secho("[l]ink existing  [n]ew catalog  e[x]clude  [s]kip  [q]uit:", fg=PROMPT_COLOR)
+    action = click.prompt("", type=click.Choice(["l", "n", "x", "s", "q"]), prompt_suffix=" ")
    if action == "q":
        return None, None
    if action == "s":
        return {
-            "observed_product_id": queue_row["observed_product_id"],
-            "canonical_product_id": "",
+            "normalized_item_id": queue_row["normalized_item_id"],
+            "catalog_id": "",
            "resolution_action": "skip",
            "status": "pending",
            "resolution_notes": queue_row.get("resolution_notes", ""),
            "reviewed_at": str(date.today()),
        }, None
    if action == "x":
-        notes = click.prompt(
-            click.style("exclude notes", fg=PROMPT_COLOR),
-            default="",
-            show_default=False,
-        )
+        notes = click.prompt(click.style("exclude notes", fg=PROMPT_COLOR), default="", show_default=False)
        return {
-            "observed_product_id": queue_row["observed_product_id"],
-            "canonical_product_id": "",
+            "normalized_item_id": queue_row["normalized_item_id"],
+            "catalog_id": "",
            "resolution_action": "exclude",
            "status": "approved",
            "resolution_notes": notes,
@@ -295,22 +285,19 @@ def prompt_resolution(queue_row, related_rows, catalog_rows, queue_index, queue_
    if action == "l":
        display_rows = suggestions or [
            {
-                "canonical_product_id": row["canonical_product_id"],
-                "canonical_name": row["canonical_name"],
+                "catalog_id": row["catalog_id"],
+                "catalog_name": row["catalog_name"],
                "reason": "catalog sample",
            }
            for row in catalog_rows[:10]
+            if row.get("catalog_id")
        ]
        while True:
-            canonical_product_id, outcome = choose_existing_canonical(
-                display_rows,
-                observed_label,
-                matched_count,
-            )
+            catalog_id, outcome = choose_existing_catalog(display_rows, normalized_name, matched_count)
            if outcome == "skip":
                return {
-                    "observed_product_id": queue_row["observed_product_id"],
-                    "canonical_product_id": "",
+                    "normalized_item_id": queue_row["normalized_item_id"],
+                    "catalog_id": "",
                    "resolution_action": "skip",
                    "status": "pending",
                    "resolution_notes": queue_row.get("resolution_notes", ""),
@@ -323,34 +310,22 @@ def prompt_resolution(queue_row, related_rows, catalog_rows, queue_index, queue_
            break
        notes = click.prompt(click.style("link notes", fg=PROMPT_COLOR), default="", show_default=False)
        return {
-            "observed_product_id": queue_row["observed_product_id"],
-            "canonical_product_id": canonical_product_id,
+            "normalized_item_id": queue_row["normalized_item_id"],
+            "catalog_id": catalog_id,
            "resolution_action": "link",
            "status": "approved",
            "resolution_notes": notes,
            "reviewed_at": str(date.today()),
        }, None

-    canonical_name = click.prompt(click.style("canonical name", fg=PROMPT_COLOR), type=str)
-    category = click.prompt(
-        click.style("category", fg=PROMPT_COLOR),
-        default="",
-        show_default=False,
-    )
-    product_type = click.prompt(
-        click.style("product type", fg=PROMPT_COLOR),
-        default="",
-        show_default=False,
-    )
-    notes = click.prompt(
-        click.style("notes", fg=PROMPT_COLOR),
-        default="",
-        show_default=False,
-    )
-    canonical_product_id = stable_id("gcan", f"manual|{canonical_name}|{category}|{product_type}")
-    canonical_row = {
-        "canonical_product_id": canonical_product_id,
-        "canonical_name": canonical_name,
+    catalog_name = click.prompt(click.style("catalog name", fg=PROMPT_COLOR), type=str)
+    category = click.prompt(click.style("category", fg=PROMPT_COLOR), default="", show_default=False)
+    product_type = click.prompt(click.style("product type", fg=PROMPT_COLOR), default="", show_default=False)
+    notes = click.prompt(click.style("notes", fg=PROMPT_COLOR), default="", show_default=False)
+    catalog_id = stable_id("cat", f"manual|{catalog_name}|{category}|{product_type}")
+    catalog_row = {
+        "catalog_id": catalog_id,
+        "catalog_name": catalog_name,
        "category": category,
        "product_type": product_type,
        "brand": "",
@@ -364,14 +339,14 @@ def prompt_resolution(queue_row, related_rows, catalog_rows, queue_index, queue_
        "updated_at": str(date.today()),
    }
    resolution_row = {
-        "observed_product_id": queue_row["observed_product_id"],
-        "canonical_product_id": canonical_product_id,
+        "normalized_item_id": queue_row["normalized_item_id"],
+        "catalog_id": catalog_id,
        "resolution_action": "create",
        "status": "approved",
        "resolution_notes": notes,
        "reviewed_at": str(date.today()),
    }
-    return resolution_row, canonical_row
+    return resolution_row, catalog_row


@click.command()
@@ -384,7 +359,7 @@ def prompt_resolution(queue_row, related_rows, catalog_rows, queue_index, queue_
 def main(purchases_csv, queue_csv, resolutions_csv, catalog_csv, limit, refresh_only):
    purchase_rows = build_purchases.read_optional_csv_rows(purchases_csv)
    resolution_rows = build_purchases.read_optional_csv_rows(resolutions_csv)
-    catalog_rows = build_purchases.read_optional_csv_rows(catalog_csv)
+    catalog_rows = build_purchases.merge_catalog_rows(build_purchases.read_optional_csv_rows(catalog_csv), [])
    queue_rows = build_review_queue(purchase_rows, resolution_rows)
    write_csv_rows(queue_csv, queue_rows, QUEUE_FIELDS)
    click.echo(f"wrote {len(queue_rows)} rows to {queue_csv}")
@@ -393,29 +368,33 @@ def main(purchases_csv, queue_csv, resolutions_csv, catalog_csv, limit, refresh_
        return

    resolution_lookup = build_purchases.load_resolution_lookup(resolution_rows)
-    catalog_by_id = {row["canonical_product_id"]: row for row in catalog_rows if row.get("canonical_product_id")}
-    rows_by_observed = defaultdict(list)
+    catalog_by_id = {row["catalog_id"]: row for row in catalog_rows if row.get("catalog_id")}
+    rows_by_normalized = defaultdict(list)
    for row in purchase_rows:
-        observed_product_id = row.get("observed_product_id", "")
-        if observed_product_id:
-            rows_by_observed[observed_product_id].append(row)
+        normalized_item_id = row.get("normalized_item_id", "")
+        if normalized_item_id:
+            rows_by_normalized[normalized_item_id].append(row)
+
    reviewed = 0
    for index, queue_row in enumerate(queue_rows, start=1):
        if limit and reviewed >= limit:
            break
-        related_rows = rows_by_observed.get(queue_row["observed_product_id"], [])
-        result = prompt_resolution(queue_row, related_rows, catalog_rows, index, len(queue_rows))
+        related_rows = rows_by_normalized.get(queue_row["normalized_item_id"], [])
+        result = prompt_resolution(queue_row, related_rows, purchase_rows, catalog_rows, index, len(queue_rows))
        if result == (None, None):
            break
-        resolution_row, canonical_row = result
-        resolution_lookup[resolution_row["observed_product_id"]] = resolution_row
-        if canonical_row and canonical_row["canonical_product_id"] not in catalog_by_id:
-            catalog_by_id[canonical_row["canonical_product_id"]] = canonical_row
-            catalog_rows.append(canonical_row)
+        resolution_row, catalog_row = result
+        resolution_lookup[resolution_row["normalized_item_id"]] = resolution_row
+        if catalog_row and catalog_row["catalog_id"] not in catalog_by_id:
+            catalog_by_id[catalog_row["catalog_id"]] = catalog_row
+            catalog_rows.append(catalog_row)
        reviewed += 1

-    save_resolution_rows(resolutions_csv, sorted(resolution_lookup.values(), key=lambda row: row["observed_product_id"]))
-    save_catalog_rows(catalog_csv, sorted(catalog_by_id.values(), key=lambda row: row["canonical_product_id"]))
+    save_resolution_rows(
+        resolutions_csv,
+        sorted(resolution_lookup.values(), key=lambda row: row["normalized_item_id"]),
+    )
+    save_catalog_rows(catalog_csv, sorted(catalog_by_id.values(), key=lambda row: row["catalog_id"]))
    click.echo(
        f"saved {len(resolution_lookup)} resolution rows to {resolutions_csv} "
        f"and {len(catalog_by_id)} catalog rows to {catalog_csv}"