Improve product review display workflow

2026-03-17 12:25:47 -04:00
parent e494386e64
commit 7b8141cd42
5 changed files with 372 additions and 46 deletions
--- a/build_purchases.py
+++ b/build_purchases.py
@@ -22,6 +22,7 @@ PURCHASE_FIELDS = [
    "resolution_action",
    "raw_item_name",
    "normalized_item_name",
+    "image_url",
    "retailer_item_id",
    "upc",
    "qty",
@@ -280,6 +281,7 @@ def build_purchase_rows(
                "resolution_action": resolution.get("resolution_action", ""),
                "raw_item_name": row["item_name"],
                "normalized_item_name": row["item_name_norm"],
+                "image_url": row.get("image_url", ""),
                "retailer_item_id": row["retailer_item_id"],
                "upc": row["upc"],
                "qty": row["qty"],
--- a/pm/tasks.org
+++ b/pm/tasks.org
@@ -367,7 +367,7 @@
 - commit: `c7dad54` on branch `cx`
 - tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python build_purchases.py`; `./venv/bin/python review_products.py --refresh-only`; verified `combined_output/review_queue.csv`, `combined_output/review_resolutions.csv` workflow, and `combined_output/canonical_catalog.csv`
 - date: 2026-03-16
-* [ ] t1.12: simplify review process display
+* [X] t1.12: simplify review process display
 Clearly show current state separate from proposed future state.
 ** acceptance criteria
 1. Display position in review queue, e.g., (1/22)
@@ -386,10 +386,13 @@ Clearly show current state separate from proposed future state.
  
 ** evidence
 - commit:
- tests:
- date:
+- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python review_products.py --help`; verified review prompt now shows queue position, matched item history, image URLs when present, and deterministic canonical suggestions
+- date: 2026-03-17

 ** notes
+- The big win here was clarifying terminology in the prompt itself: the reviewer is resolving one observed product group to a canonical_name, not linking raw rows to each other.
+- Showing the full matched-item list by descending date made the review context much more legible than the old compact summary fields.
+- Deterministic suggestions help, but they are intentionally conservative and shallow; this improves reviewer speed without pretending to solve product matching automatically.

 * [ ] t1.10: add optional llm-assisted suggestion workflow for unresolved products (2-4 commits)

--- a/review_products.py
+++ b/review_products.py
@@ -1,6 +1,5 @@
 from collections import defaultdict
 from datetime import date
-from pathlib import Path

 import click

@@ -99,17 +98,140 @@ def save_catalog_rows(path, rows):
    write_csv_rows(path, rows, build_purchases.CATALOG_FIELDS)


-def prompt_resolution(queue_row, catalog_rows):
+INFO_COLOR = "cyan"
+PROMPT_COLOR = "bright_yellow"
+WARNING_COLOR = "magenta"
+
+
+def sort_related_items(rows):
+    return sorted(
+        rows,
+        key=lambda row: (
+            row.get("purchase_date", ""),
+            row.get("order_id", ""),
+            int(row.get("line_no", "0") or "0"),
+        ),
+        reverse=True,
+    )
+
+
+def build_canonical_suggestions(related_rows, catalog_rows, limit=3):
+    normalized_names = {
+        row.get("normalized_item_name", "").strip().upper()
+        for row in related_rows
+        if row.get("normalized_item_name", "").strip()
+    }
+    upcs = {
+        row.get("upc", "").strip()
+        for row in related_rows
+        if row.get("upc", "").strip()
+    }
+    suggestions = []
+    seen_ids = set()
+
+    def add_matches(rows, reason):
+        for row in rows:
+            canonical_product_id = row.get("canonical_product_id", "")
+            if not canonical_product_id or canonical_product_id in seen_ids:
+                continue
+            seen_ids.add(canonical_product_id)
+            suggestions.append(
+                {
+                    "canonical_product_id": canonical_product_id,
+                    "canonical_name": row.get("canonical_name", ""),
+                    "reason": reason,
+                }
+            )
+            if len(suggestions) >= limit:
+                return True
+        return False
+
+    exact_upc_rows = [
+        row
+        for row in catalog_rows
+        if row.get("upc", "").strip() and row.get("upc", "").strip() in upcs
+    ]
+    if add_matches(exact_upc_rows, "exact upc"):
+        return suggestions
+
+    exact_name_rows = [
+        row
+        for row in catalog_rows
+        if row.get("canonical_name", "").strip().upper() in normalized_names
+    ]
+    if add_matches(exact_name_rows, "exact normalized name"):
+        return suggestions
+
+    contains_rows = []
+    for row in catalog_rows:
+        canonical_name = row.get("canonical_name", "").strip().upper()
+        if not canonical_name:
+            continue
+        for normalized_name in normalized_names:
+            if normalized_name in canonical_name or canonical_name in normalized_name:
+                contains_rows.append(row)
+                break
+    add_matches(contains_rows, "canonical name contains match")
+    return suggestions
+
+
+def build_display_lines(queue_row, related_rows):
+    lines = []
+    for row in sort_related_items(related_rows):
+        lines.append(
+            "  - {purchase_date} | {line_total} | {raw_item_name} | {normalized_item_name} | "
+            "{upc} | {retailer}".format(
+                purchase_date=row.get("purchase_date", ""),
+                line_total=row.get("line_total", ""),
+                raw_item_name=row.get("raw_item_name", ""),
+                normalized_item_name=row.get("normalized_item_name", ""),
+                upc=row.get("upc", ""),
+                retailer=row.get("retailer", ""),
+            )
+        )
+        if row.get("image_url"):
+            lines.append(f"    image: {row['image_url']}")
+    if not lines:
+        lines.append("  - no matched item rows found")
+    return lines
+
+
+def prompt_resolution(queue_row, related_rows, catalog_rows, queue_index, queue_total):
+    suggestions = build_canonical_suggestions(related_rows, catalog_rows)
    click.echo("")
+    click.secho(
+        f"Review observed product ({queue_index}/{queue_total})",
+        fg=INFO_COLOR,
+    )
+    click.echo(
+        "Resolve this observed product group to an existing canonical_name, "
+        "a new canonical_name, exclude it, or skip it."
+    )
    click.echo(f"observed_product_id: {queue_row['observed_product_id']}")
    click.echo(f"retailer: {queue_row['retailer']}")
-    click.echo(f"raw names: {queue_row['raw_item_names']}")
-    click.echo(f"normalized names: {queue_row['normalized_names']}")
-    click.echo(f"upcs: {queue_row['upc_values']}")
-    click.echo(f"example prices: {queue_row['example_prices']}")
-    click.echo(f"seen count: {queue_row['seen_count']}")
-    click.echo("actions: [l]ink existing  [n]ew canonical  [x]exclude  [s]kip  [q]uit")
-    action = click.prompt("action", type=click.Choice(["l", "n", "x", "s", "q"]))
+    click.echo(f"observed_name(s): {queue_row['normalized_names']}")
+    click.echo(f"upc(s): {queue_row['upc_values']}")
+    click.echo(f"seen_count: {queue_row['seen_count']}")
+    click.secho("matched items:", fg=INFO_COLOR)
+    for line in build_display_lines(queue_row, related_rows):
+        click.echo(line)
+    if suggestions:
+        click.secho("suggested canonical_name values:", fg=INFO_COLOR)
+        for suggestion in suggestions:
+            click.echo(
+                f"  - {suggestion['canonical_product_id']} | {suggestion['canonical_name']} "
+                f"({suggestion['reason']})"
+            )
+    else:
+        click.secho("no deterministic canonical suggestions found", fg=WARNING_COLOR)
+    click.secho(
+        "actions: [l]ink existing  [n]ew canonical  e[x]clude  [s]kip  [q]uit",
+        fg=PROMPT_COLOR,
+    )
+    action = click.prompt(
+        click.style("action", fg=PROMPT_COLOR),
+        type=click.Choice(["l", "n", "x", "s", "q"]),
+    )
    if action == "q":
        return None, None
    if action == "s":
@@ -122,7 +244,11 @@ def prompt_resolution(queue_row, catalog_rows):
            "reviewed_at": str(date.today()),
        }, None
    if action == "x":
-        notes = click.prompt("exclude notes", default="", show_default=False)
+        notes = click.prompt(
+            click.style("exclude notes", fg=PROMPT_COLOR),
+            default="",
+            show_default=False,
+        )
        return {
            "observed_product_id": queue_row["observed_product_id"],
            "canonical_product_id": "",
@@ -132,11 +258,28 @@ def prompt_resolution(queue_row, catalog_rows):
            "reviewed_at": str(date.today()),
        }, None
    if action == "l":
-        click.echo("existing canonicals:")
-        for row in catalog_rows[:10]:
-            click.echo(f"  {row['canonical_product_id']}  {row['canonical_name']}")
-        canonical_product_id = click.prompt("canonical product id", type=str)
-        notes = click.prompt("link notes", default="", show_default=False)
+        display_rows = suggestions or [
+            {
+                "canonical_product_id": row["canonical_product_id"],
+                "canonical_name": row["canonical_name"],
+                "reason": "catalog sample",
+            }
+            for row in catalog_rows[:10]
+        ]
+        click.secho("existing canonical_name values:", fg=INFO_COLOR)
+        for row in display_rows:
+            click.echo(
+                f"  - {row['canonical_product_id']} | {row['canonical_name']} ({row['reason']})"
+            )
+        canonical_product_id = click.prompt(
+            click.style("canonical product id", fg=PROMPT_COLOR),
+            type=str,
+        )
+        notes = click.prompt(
+            click.style("link notes", fg=PROMPT_COLOR),
+            default="",
+            show_default=False,
+        )
        return {
            "observed_product_id": queue_row["observed_product_id"],
            "canonical_product_id": canonical_product_id,
@@ -146,10 +289,22 @@ def prompt_resolution(queue_row, catalog_rows):
            "reviewed_at": str(date.today()),
        }, None

-    canonical_name = click.prompt("canonical name", type=str)
-    category = click.prompt("category", default="", show_default=False)
-    product_type = click.prompt("product type", default="", show_default=False)
-    notes = click.prompt("notes", default="", show_default=False)
+    canonical_name = click.prompt(click.style("canonical name", fg=PROMPT_COLOR), type=str)
+    category = click.prompt(
+        click.style("category", fg=PROMPT_COLOR),
+        default="",
+        show_default=False,
+    )
+    product_type = click.prompt(
+        click.style("product type", fg=PROMPT_COLOR),
+        default="",
+        show_default=False,
+    )
+    notes = click.prompt(
+        click.style("notes", fg=PROMPT_COLOR),
+        default="",
+        show_default=False,
+    )
    canonical_product_id = stable_id("gcan", f"manual|{canonical_name}|{category}|{product_type}")
    canonical_row = {
        "canonical_product_id": canonical_product_id,
@@ -197,11 +352,17 @@ def main(purchases_csv, queue_csv, resolutions_csv, catalog_csv, limit, refresh_

    resolution_lookup = build_purchases.load_resolution_lookup(resolution_rows)
    catalog_by_id = {row["canonical_product_id"]: row for row in catalog_rows if row.get("canonical_product_id")}
+    rows_by_observed = defaultdict(list)
+    for row in purchase_rows:
+        observed_product_id = row.get("observed_product_id", "")
+        if observed_product_id:
+            rows_by_observed[observed_product_id].append(row)
    reviewed = 0
-    for queue_row in queue_rows:
+    for index, queue_row in enumerate(queue_rows, start=1):
        if limit and reviewed >= limit:
            break
-        result = prompt_resolution(queue_row, catalog_rows)
+        related_rows = rows_by_observed.get(queue_row["observed_product_id"], [])
+        result = prompt_resolution(queue_row, related_rows, catalog_rows, index, len(queue_rows))
        if result == (None, None):
            break
        resolution_row, canonical_row = result
--- a/tests/test_purchases.py
+++ b/tests/test_purchases.py
@@ -41,6 +41,7 @@ class PurchaseLogTests(unittest.TestCase):
                "order_date": "2026-03-01",
                "item_name": "FRESH BANANA",
                "item_name_norm": "BANANA",
+                "image_url": "https://example.test/banana.jpg",
                "retailer_item_id": "100",
                "upc": "4011",
                "qty": "1",
@@ -99,24 +100,18 @@ class PurchaseLogTests(unittest.TestCase):
            }
        ]

-<<<<<<< HEAD
        rows, _observed, _canon, _links = build_purchases.build_purchase_rows(
-=======
-        rows = build_purchases.build_purchase_rows(
->>>>>>> be1bf63 (Build pivot-ready purchase log)
            [giant_row],
            [costco_row],
            giant_orders,
            costco_orders,
-<<<<<<< HEAD
            [],
-=======
->>>>>>> be1bf63 (Build pivot-ready purchase log)
        )

        self.assertEqual(2, len(rows))
        self.assertTrue(all(row["canonical_product_id"] for row in rows))
        self.assertEqual({"giant", "costco"}, {row["retailer"] for row in rows})
+        self.assertEqual("https://example.test/banana.jpg", rows[0]["image_url"])

    def test_main_writes_purchase_and_example_csvs(self):
        with tempfile.TemporaryDirectory() as tmpdir:
@@ -124,11 +119,13 @@ class PurchaseLogTests(unittest.TestCase):
            costco_items = Path(tmpdir) / "costco_items.csv"
            giant_orders = Path(tmpdir) / "giant_orders.csv"
            costco_orders = Path(tmpdir) / "costco_orders.csv"
+            resolutions_csv = Path(tmpdir) / "review_resolutions.csv"
+            catalog_csv = Path(tmpdir) / "canonical_catalog.csv"
+            links_csv = Path(tmpdir) / "product_links.csv"
            purchases_csv = Path(tmpdir) / "combined" / "purchases.csv"
            examples_csv = Path(tmpdir) / "combined" / "comparison_examples.csv"

            fieldnames = enrich_costco.OUTPUT_FIELDS
-            rows = []
            giant_row = {field: "" for field in fieldnames}
            giant_row.update(
                {
@@ -178,7 +175,6 @@ class PurchaseLogTests(unittest.TestCase):
                    "is_fee": "false",
                }
            )
-            rows.extend([giant_row, costco_row])

            for path, source_rows in [
                (giant_items, [giant_row]),
@@ -189,12 +185,35 @@ class PurchaseLogTests(unittest.TestCase):
                    writer.writeheader()
                    writer.writerows(source_rows)

+            order_fields = ["order_id", "store_name", "store_number", "store_city", "store_state"]
            for path, source_rows in [
-                (giant_orders, [{"order_id": "g1", "store_name": "Giant", "store_number": "42", "store_city": "Springfield", "store_state": "VA"}]),
-                (costco_orders, [{"order_id": "c1", "store_name": "MT VERNON", "store_number": "1115", "store_city": "ALEXANDRIA", "store_state": "VA"}]),
+                (
+                    giant_orders,
+                    [
+                        {
+                            "order_id": "g1",
+                            "store_name": "Giant",
+                            "store_number": "42",
+                            "store_city": "Springfield",
+                            "store_state": "VA",
+                        }
+                    ],
+                ),
+                (
+                    costco_orders,
+                    [
+                        {
+                            "order_id": "c1",
+                            "store_name": "MT VERNON",
+                            "store_number": "1115",
+                            "store_city": "ALEXANDRIA",
+                            "store_state": "VA",
+                        }
+                    ],
+                ),
            ]:
                with path.open("w", newline="", encoding="utf-8") as handle:
-                    writer = csv.DictWriter(handle, fieldnames=["order_id", "store_name", "store_number", "store_city", "store_state"])
+                    writer = csv.DictWriter(handle, fieldnames=order_fields)
                    writer.writeheader()
                    writer.writerows(source_rows)

@@ -203,12 +222,9 @@ class PurchaseLogTests(unittest.TestCase):
                costco_items_enriched_csv=str(costco_items),
                giant_orders_csv=str(giant_orders),
                costco_orders_csv=str(costco_orders),
-<<<<<<< HEAD
-                resolutions_csv=str(Path(tmpdir) / "review_resolutions.csv"),
-                catalog_csv=str(Path(tmpdir) / "canonical_catalog.csv"),
-                links_csv=str(Path(tmpdir) / "product_links.csv"),
-=======
->>>>>>> be1bf63 (Build pivot-ready purchase log)
+                resolutions_csv=str(resolutions_csv),
+                catalog_csv=str(catalog_csv),
+                links_csv=str(links_csv),
                output_csv=str(purchases_csv),
                examples_csv=str(examples_csv),
            )
@@ -222,7 +238,6 @@ class PurchaseLogTests(unittest.TestCase):
            self.assertEqual(2, len(purchase_rows))
            self.assertEqual(1, len(example_rows))

-<<<<<<< HEAD
    def test_build_purchase_rows_applies_manual_resolution(self):
        fieldnames = enrich_costco.OUTPUT_FIELDS
        giant_row = {field: "" for field in fieldnames}
@@ -255,7 +270,15 @@ class PurchaseLogTests(unittest.TestCase):
        rows, _observed, _canon, _links = build_purchases.build_purchase_rows(
            [giant_row],
            [],
-            [{"order_id": "g1", "store_name": "Giant", "store_number": "42", "store_city": "Springfield", "store_state": "VA"}],
+            [
+                {
+                    "order_id": "g1",
+                    "store_name": "Giant",
+                    "store_number": "42",
+                    "store_city": "Springfield",
+                    "store_state": "VA",
+                }
+            ],
            [],
            [
                {
@@ -273,8 +296,6 @@ class PurchaseLogTests(unittest.TestCase):
        self.assertEqual("approved", rows[0]["review_status"])
        self.assertEqual("create", rows[0]["resolution_action"])

-=======
->>>>>>> be1bf63 (Build pivot-ready purchase log)

 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_review_workflow.py
+++ b/tests/test_review_workflow.py
@@ -4,6 +4,8 @@ import unittest
 from pathlib import Path
 from unittest import mock

+from click.testing import CliRunner
+
 import review_products


@@ -37,6 +39,135 @@ class ReviewWorkflowTests(unittest.TestCase):
        self.assertEqual("gobs_1", queue_rows[0]["observed_product_id"])
        self.assertIn("SB BAGGED ICE 20LB", queue_rows[0]["raw_item_names"])

+    def test_build_canonical_suggestions_prefers_upc_then_name(self):
+        suggestions = review_products.build_canonical_suggestions(
+            [
+                {
+                    "normalized_item_name": "MIXED PEPPER",
+                    "upc": "12345",
+                }
+            ],
+            [
+                {
+                    "canonical_product_id": "gcan_1",
+                    "canonical_name": "MIXED PEPPER",
+                    "upc": "",
+                },
+                {
+                    "canonical_product_id": "gcan_2",
+                    "canonical_name": "MIXED PEPPER 6 PACK",
+                    "upc": "12345",
+                },
+            ],
+        )
+
+        self.assertEqual("gcan_2", suggestions[0]["canonical_product_id"])
+        self.assertEqual("exact upc", suggestions[0]["reason"])
+        self.assertEqual("gcan_1", suggestions[1]["canonical_product_id"])
+
+    def test_review_products_displays_position_items_and_suggestions(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            purchases_csv = Path(tmpdir) / "purchases.csv"
+            queue_csv = Path(tmpdir) / "review_queue.csv"
+            resolutions_csv = Path(tmpdir) / "review_resolutions.csv"
+            catalog_csv = Path(tmpdir) / "canonical_catalog.csv"
+
+            purchase_fields = [
+                "purchase_date",
+                "retailer",
+                "order_id",
+                "line_no",
+                "observed_product_id",
+                "canonical_product_id",
+                "raw_item_name",
+                "normalized_item_name",
+                "image_url",
+                "upc",
+                "line_total",
+            ]
+            with purchases_csv.open("w", newline="", encoding="utf-8") as handle:
+                writer = csv.DictWriter(handle, fieldnames=purchase_fields)
+                writer.writeheader()
+                writer.writerows(
+                    [
+                        {
+                            "purchase_date": "2026-03-14",
+                            "retailer": "costco",
+                            "order_id": "c2",
+                            "line_no": "2",
+                            "observed_product_id": "gobs_mix",
+                            "canonical_product_id": "",
+                            "raw_item_name": "MIXED PEPPER 6-PACK",
+                            "normalized_item_name": "MIXED PEPPER",
+                            "image_url": "",
+                            "upc": "",
+                            "line_total": "7.49",
+                        },
+                        {
+                            "purchase_date": "2026-03-12",
+                            "retailer": "costco",
+                            "order_id": "c1",
+                            "line_no": "1",
+                            "observed_product_id": "gobs_mix",
+                            "canonical_product_id": "",
+                            "raw_item_name": "MIXED PEPPER 6-PACK",
+                            "normalized_item_name": "MIXED PEPPER",
+                            "image_url": "https://example.test/mixed-pepper.jpg",
+                            "upc": "",
+                            "line_total": "6.99",
+                        },
+                    ]
+                )
+
+            with catalog_csv.open("w", newline="", encoding="utf-8") as handle:
+                writer = csv.DictWriter(handle, fieldnames=review_products.build_purchases.CATALOG_FIELDS)
+                writer.writeheader()
+                writer.writerow(
+                    {
+                        "canonical_product_id": "gcan_mix",
+                        "canonical_name": "MIXED PEPPER",
+                        "category": "produce",
+                        "product_type": "pepper",
+                        "brand": "",
+                        "variant": "",
+                        "size_value": "",
+                        "size_unit": "",
+                        "pack_qty": "",
+                        "measure_type": "",
+                        "notes": "",
+                        "created_at": "",
+                        "updated_at": "",
+                    }
+                )
+
+            runner = CliRunner()
+            result = runner.invoke(
+                review_products.main,
+                [
+                    "--purchases-csv",
+                    str(purchases_csv),
+                    "--queue-csv",
+                    str(queue_csv),
+                    "--resolutions-csv",
+                    str(resolutions_csv),
+                    "--catalog-csv",
+                    str(catalog_csv),
+                ],
+                input="q\n",
+                color=True,
+            )
+
+            self.assertEqual(0, result.exit_code)
+            self.assertIn("Review observed product (1/1)", result.output)
+            self.assertIn("Resolve this observed product group", result.output)
+            self.assertIn("actions: [l]ink existing  [n]ew canonical  e[x]clude  [s]kip  [q]uit", result.output)
+            first_item = result.output.index("2026-03-14 | 7.49")
+            second_item = result.output.index("2026-03-12 | 6.99")
+            self.assertLess(first_item, second_item)
+            self.assertIn("image: https://example.test/mixed-pepper.jpg", result.output)
+            self.assertIn("gcan_mix | MIXED PEPPER (exact normalized name)", result.output)
+            self.assertIn("\x1b[", result.output)
+
    def test_review_products_creates_canonical_and_resolution(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            purchases_csv = Path(tmpdir) / "purchases.csv"
@@ -48,25 +179,33 @@ class ReviewWorkflowTests(unittest.TestCase):
                writer = csv.DictWriter(
                    handle,
                    fieldnames=[
+                        "purchase_date",
                        "observed_product_id",
                        "canonical_product_id",
                        "retailer",
                        "raw_item_name",
                        "normalized_item_name",
+                        "image_url",
                        "upc",
                        "line_total",
+                        "order_id",
+                        "line_no",
                    ],
                )
                writer.writeheader()
                writer.writerow(
                    {
+                        "purchase_date": "2026-03-15",
                        "observed_product_id": "gobs_ice",
                        "canonical_product_id": "",
                        "retailer": "giant",
                        "raw_item_name": "SB BAGGED ICE 20LB",
                        "normalized_item_name": "BAGGED ICE",
+                        "image_url": "",
                        "upc": "",
                        "line_total": "3.50",
+                        "order_id": "g1",
+                        "line_no": "1",
                    }
                )