# scrape-giant/review_products.py

from collections import defaultdict
from datetime import date
import re

import click

import build_purchases
from layer_helpers import compact_join, stable_id, write_csv_rows


# Column names for the review queue CSV, in output order; passed as the field
# list whenever the queue is written with write_csv_rows.
QUEUE_FIELDS = [
    "review_id",
    "retailer",
    "normalized_item_id",
    "catalog_id",
    "reason_code",
    "priority",
    "raw_item_names",
    "normalized_names",
    "upc_values",
    "example_prices",
    "seen_count",
    "status",
    "resolution_action",
    "resolution_notes",
    "created_at",
    "updated_at",
]

# Color names handed to click.secho / click.style via fg=.
INFO_COLOR = "cyan"
PROMPT_COLOR = "bright_yellow"
WARNING_COLOR = "magenta"
# Matches runs of uppercase ASCII letters and digits; tokenize_match_text
# upper-cases its input before applying it.
TOKEN_RE = re.compile(r"[A-Z0-9]+")


def print_intro_text():
    """Print the short glossary of review terms shown before prompting starts."""
    click.secho("Review guide:", fg=INFO_COLOR)
    guide_lines = (
        "  catalog name: unique product identity including variant, but not packaging",
        "  product type: general product you want to compare across purchases",
        "  category: broad analysis bucket such as dairy, produce, or frozen",
    )
    for guide_line in guide_lines:
        click.echo(guide_line)


def build_review_queue(purchase_rows, resolution_rows):
    """Build one review-queue row per normalized item that still needs a catalog link.

    Purchase rows are grouped by ``normalized_item_id`` (rows without one are
    ignored).  A group is queued when its stored resolution is not
    ``approved`` and at least one of its rows has no ``catalog_id`` and is not
    flagged as a non-item, fee, discount line, or coupon line.

    The summary columns (names, UPCs, prices, retailers, ``seen_count``) are
    computed from every row in the group, not only the unresolved ones.

    Returns a list of dicts, ordered by ``normalized_item_id``.
    """
    resolution_lookup = build_purchases.load_resolution_lookup(resolution_rows)

    grouped = defaultdict(list)
    for purchase in purchase_rows:
        item_id = purchase.get("normalized_item_id", "")
        if item_id:
            grouped[item_id].append(purchase)

    def needs_review(purchase):
        # A row needs review when it is an unlinked, real item line.
        if purchase.get("catalog_id"):
            return False
        if purchase.get("is_item", "true") == "false":
            return False
        flags = ("is_fee", "is_discount_line", "is_coupon_line")
        return all(purchase.get(flag) != "true" for flag in flags)

    def distinct_values(group, field):
        # Sorted, de-duplicated, non-empty values of one column.
        return sorted({purchase[field] for purchase in group if purchase[field]})

    # (queue column, purchase column) pairs summarised with compact_join.
    summary_columns = (
        ("raw_item_names", "raw_item_name"),
        ("normalized_names", "normalized_item_name"),
        ("upc_values", "upc"),
        ("example_prices", "line_total"),
    )

    today_text = str(date.today())
    queue_rows = []
    for item_id in sorted(grouped):
        group = grouped[item_id]
        resolution = resolution_lookup.get(item_id, {})
        if resolution.get("status") == "approved":
            continue
        if not any(needs_review(purchase) for purchase in group):
            continue

        retailer_text = " | ".join(sorted({purchase["retailer"] for purchase in group}))
        entry = {
            "review_id": stable_id("rvw", item_id),
            "retailer": retailer_text,
            "normalized_item_id": item_id,
            "catalog_id": resolution.get("catalog_id", ""),
            "reason_code": "missing_catalog_link",
            "priority": "high",
        }
        for column, field in summary_columns:
            entry[column] = compact_join(distinct_values(group, field), limit=8)
        entry.update(
            {
                "seen_count": str(len(group)),
                "status": resolution.get("status", "pending"),
                "resolution_action": resolution.get("resolution_action", ""),
                "resolution_notes": resolution.get("resolution_notes", ""),
                "created_at": resolution.get("reviewed_at", today_text),
                "updated_at": today_text,
            }
        )
        queue_rows.append(entry)
    return queue_rows


def save_resolution_rows(path, rows):
    """Pass ``rows`` to ``write_csv_rows`` for ``path`` with ``build_purchases.RESOLUTION_FIELDS`` as the field list."""
    write_csv_rows(path, rows, build_purchases.RESOLUTION_FIELDS)


def save_catalog_rows(path, rows):
    """Pass ``rows`` to ``write_csv_rows`` for ``path`` with ``build_purchases.CATALOG_FIELDS`` as the field list."""
    write_csv_rows(path, rows, build_purchases.CATALOG_FIELDS)


def save_link_rows(path, rows):
    """Pass ``rows`` to ``write_csv_rows`` for ``path`` with ``build_purchases.PRODUCT_LINK_FIELDS`` as the field list."""
    write_csv_rows(path, rows, build_purchases.PRODUCT_LINK_FIELDS)


def sort_related_items(rows):
    """Return ``rows`` as a new list ordered newest first for display.

    Rows are sorted in descending order by ``purchase_date``, then
    ``order_id`` (both compared as text), then the integer value of
    ``line_no``.

    The sort is tolerant of messy input: a missing key, an empty string, or a
    ``None`` value (as ``csv.DictReader`` produces for short rows) is treated
    as ``""`` for the text keys, and a missing or non-integer ``line_no`` is
    treated as ``0`` instead of raising.

    Args:
        rows: iterable of dict-like purchase rows.

    Returns:
        A new list; the input is not modified.
    """

    def line_number(row):
        # Display ordering should never abort a review session because one
        # row carries an unparseable line number.
        try:
            return int(row.get("line_no") or "0")
        except (TypeError, ValueError):
            return 0

    return sorted(
        rows,
        key=lambda row: (
            # `or ""` also covers keys that exist with a None value, which
            # would otherwise fail when compared against strings.
            row.get("purchase_date") or "",
            row.get("order_id") or "",
            line_number(row),
        ),
        reverse=True,
    )


def tokenize_match_text(*values):
    """Return the set of ``TOKEN_RE`` matches found in the upper-cased ``values``.

    Falsy values (``None``, ``""``) contribute no tokens.
    """
    return {
        token
        for value in values
        for token in TOKEN_RE.findall((value or "").upper())
    }


def build_catalog_suggestions(related_rows, purchase_rows, catalog_rows, limit=3):
    """Suggest existing catalog entries for the item described by ``related_rows``.

    Candidates are gathered in three passes, in this priority order:

    1. already-linked purchase rows sharing a UPC with ``related_rows``
       (reason ``"exact upc"``);
    2. already-linked purchase rows with the same normalized name, compared
       stripped and case-insensitively (``"exact normalized name"``);
    3. catalog rows whose name contains, or is contained in, one of the
       normalized names (``"catalog name contains match"``).

    A candidate is kept only if its ``catalog_id`` is non-empty, present in
    ``catalog_rows``, and not already suggested.  Collection stops as soon as
    the number of suggestions reaches ``limit``.

    Returns a list of ``{"catalog_id", "catalog_name", "reason"}`` dicts.
    """

    def cleaned(row, field):
        return row.get(field, "").strip()

    wanted_names = {cleaned(row, "normalized_item_name").upper() for row in related_rows}
    wanted_names.discard("")
    wanted_upcs = {cleaned(row, "upc") for row in related_rows}
    wanted_upcs.discard("")

    known_catalog = {}
    for entry in catalog_rows:
        entry_id = entry.get("catalog_id", "")
        if entry_id:
            known_catalog[entry_id] = entry

    def candidates():
        # Lazily yield (catalog_id, reason) pairs in priority order so later
        # passes are never evaluated once the limit has been reached.
        linked = [
            purchase
            for purchase in purchase_rows
            if purchase.get("catalog_id") and purchase.get("normalized_item_id")
        ]
        for purchase in linked:
            upc = cleaned(purchase, "upc")
            if upc and upc in wanted_upcs:
                yield purchase.get("catalog_id", ""), "exact upc"
        for purchase in linked:
            if cleaned(purchase, "normalized_item_name").upper() in wanted_names:
                yield purchase.get("catalog_id", ""), "exact normalized name"
        for entry in catalog_rows:
            name = cleaned(entry, "catalog_name").upper()
            if not name:
                continue
            if any(wanted in name or name in wanted for wanted in wanted_names):
                yield entry.get("catalog_id", ""), "catalog name contains match"

    # Insertion-ordered dict doubles as the "already suggested" set.
    picked = {}
    for catalog_id, reason in candidates():
        if not catalog_id or catalog_id in picked or catalog_id not in known_catalog:
            continue
        picked[catalog_id] = {
            "catalog_id": catalog_id,
            "catalog_name": known_catalog[catalog_id].get("catalog_name", ""),
            "reason": reason,
        }
        if len(picked) >= limit:
            break
    return list(picked.values())


def search_catalog_rows(query, catalog_rows, purchase_rows, current_normalized_item_id, limit=10):
    """Rank catalog rows by how many tokens they share with ``query``.

    Each catalog row is tokenized from its ``catalog_name``, ``product_type``
    and ``variant``; rows sharing no token with the query are dropped, as is
    the catalog entry already linked to ``current_normalized_item_id`` in
    ``purchase_rows``.  Results are ordered by descending token overlap, then
    ``catalog_name``, then ``catalog_id``, and truncated to ``limit``.

    Each result also carries how many purchase rows and distinct normalized
    items are linked to that catalog entry.  Returns ``[]`` when the query
    yields no tokens.
    """
    wanted_tokens = tokenize_match_text(query)
    if not wanted_tokens:
        return []

    row_counts = defaultdict(int)
    item_ids_by_catalog = defaultdict(set)
    excluded_catalog_id = ""
    for purchase in purchase_rows:
        linked_catalog = purchase.get("catalog_id", "")
        if not linked_catalog:
            continue
        item_id = purchase.get("normalized_item_id", "")
        if item_id:
            row_counts[linked_catalog] += 1
            item_ids_by_catalog[linked_catalog].add(item_id)
        if item_id == current_normalized_item_id:
            # The item under review is already linked here; hide that entry.
            excluded_catalog_id = linked_catalog

    matches = []
    for entry in catalog_rows:
        entry_id = entry.get("catalog_id", "")
        if not entry_id or entry_id == excluded_catalog_id:
            continue
        shared = wanted_tokens & tokenize_match_text(
            entry.get("catalog_name", ""),
            entry.get("product_type", ""),
            entry.get("variant", ""),
        )
        if shared:
            matches.append(
                {
                    "catalog_id": entry_id,
                    "catalog_name": entry.get("catalog_name", ""),
                    "product_type": entry.get("product_type", ""),
                    "category": entry.get("category", ""),
                    "variant": entry.get("variant", ""),
                    "linked_normalized_items": len(item_ids_by_catalog.get(entry_id, set())),
                    "linked_purchase_rows": row_counts.get(entry_id, 0),
                    "score": len(shared),
                }
            )

    ordered = sorted(
        matches,
        key=lambda match: (-match["score"], match["catalog_name"], match["catalog_id"]),
    )
    return ordered[:limit]


def suggestion_display_rows(suggestions, purchase_rows, catalog_rows):
    """Enrich each suggestion with catalog details and link counts for display.

    ``product_type`` and ``category`` come from ``catalog_rows``; when a
    catalog id is absent there, the first purchase row carrying that id
    supplies them.  ``linked_purchase_rows`` counts purchase rows that have
    both a ``catalog_id`` and a ``normalized_item_id``, and
    ``linked_normalized_items`` counts the distinct item ids among them.

    Returns a new list of dicts; the input suggestions are not modified.
    """
    details_by_catalog = {}
    for entry in catalog_rows:
        if entry.get("catalog_id"):
            details_by_catalog[entry["catalog_id"]] = {
                "product_type": entry.get("product_type", ""),
                "category": entry.get("category", ""),
            }

    row_counts = defaultdict(int)
    item_ids_by_catalog = defaultdict(set)
    for purchase in purchase_rows:
        linked_catalog = purchase.get("catalog_id", "")
        if not linked_catalog:
            continue
        if linked_catalog not in details_by_catalog:
            # Catalog rows take precedence; purchases only fill the gaps.
            details_by_catalog[linked_catalog] = {
                "product_type": purchase.get("product_type", ""),
                "category": purchase.get("category", ""),
            }
        item_id = purchase.get("normalized_item_id", "")
        if item_id:
            row_counts[linked_catalog] += 1
            item_ids_by_catalog[linked_catalog].add(item_id)

    display_rows = []
    for suggestion in suggestions:
        catalog_id = suggestion["catalog_id"]
        details = details_by_catalog.get(catalog_id, {})
        enriched = dict(suggestion)
        enriched["product_type"] = details.get("product_type", "")
        enriched["category"] = details.get("category", "")
        enriched["linked_purchase_rows"] = row_counts.get(catalog_id, 0)
        enriched["linked_normalized_items"] = len(item_ids_by_catalog.get(catalog_id, ()))
        display_rows.append(enriched)
    return display_rows


def print_catalog_rows(rows):
    """Echo ``rows`` as a 1-based numbered list of catalog entries.

    Each row must provide ``catalog_name``, ``linked_normalized_items`` and
    ``linked_purchase_rows``; ``product_type`` and ``category`` are optional
    and print as empty text when absent.
    """
    for position, entry in enumerate(rows, start=1):
        name = entry["catalog_name"]
        product_type = entry.get("product_type", "")
        category = entry.get("category", "")
        item_count = entry["linked_normalized_items"]
        purchase_count = entry["linked_purchase_rows"]
        click.echo(
            f" [{position}] {name}, {product_type}, {category} "
            f"({item_count} items, {purchase_count} rows)"
        )


def build_display_lines(related_rows):
    """Format ``related_rows`` as numbered one-line summaries, in ``sort_related_items`` order.

    Missing fields render as empty text.  When there are no rows, a single
    placeholder line is returned instead of an empty list.
    """
    template = " [{index}] {raw_item_name} | {retailer} | {purchase_date} | {line_total} | {image_url}"
    shown_fields = ("raw_item_name", "retailer", "purchase_date", "line_total", "image_url")
    lines = [
        template.format(
            index=position,
            **{field: item.get(field, "") for field in shown_fields},
        )
        for position, item in enumerate(sort_related_items(related_rows), start=1)
    ]
    return lines or [" [1] no matched item rows found"]


def normalized_label(queue_row, related_rows):
    """Pick a human-readable label for the item under review.

    Preference order: the first ``" | "``-separated entry of the queue row's
    ``normalized_names``, then the first non-empty ``normalized_item_name``
    among ``related_rows``, then the queue row's ``normalized_item_id``
    (``""`` when even that is missing).
    """
    joined_names = queue_row.get("normalized_names")
    if joined_names:
        return joined_names.partition(" | ")[0]
    first_named = next(
        (
            item["normalized_item_name"]
            for item in related_rows
            if item.get("normalized_item_name")
        ),
        None,
    )
    if first_named is not None:
        return first_named
    return queue_row.get("normalized_item_id", "")


def choose_existing_catalog(display_rows, normalized_name, matched_count):
    """Ask the reviewer to pick one of ``display_rows`` and confirm the link.

    Returns a ``(catalog_id, outcome)`` pair: the chosen id with ``""`` on
    confirmation, otherwise ``""`` with ``"skip"``, ``"quit"`` or ``"back"``
    (both the ``n`` and ``b`` answers map to ``"back"``).
    """
    click.secho(
        f"Select the catalog_name to associate {matched_count} items with:",
        fg=INFO_COLOR,
    )
    print_catalog_rows(display_rows)

    selection_label = click.style("selection", fg=PROMPT_COLOR)
    picked_number = click.prompt(
        selection_label,
        type=click.IntRange(1, len(display_rows)),
    )
    picked = display_rows[picked_number - 1]
    target_name = picked["catalog_name"]
    click.echo(
        f'{matched_count} "{normalized_name}" items and future matches will be associated '
        f'with "{target_name}".'
    )

    click.secho("actions: [y]es  [n]o  [b]ack  [s]kip  [q]uit", fg=PROMPT_COLOR)
    answer = click.prompt(
        click.style("confirm", fg=PROMPT_COLOR),
        type=click.Choice(["y", "n", "b", "s", "q"]),
    )
    if answer == "y":
        return picked["catalog_id"], ""
    # Anything that is not an explicit skip or quit sends the reviewer back.
    outcomes = {"s": "skip", "q": "quit"}
    return "", outcomes.get(answer, "back")


def _make_resolution_row(queue_row, catalog_id, action, status, notes):
    """Build one resolution record for ``queue_row``, stamped with today's date."""
    return {
        "normalized_item_id": queue_row["normalized_item_id"],
        "catalog_id": catalog_id,
        "resolution_action": action,
        "status": status,
        "resolution_notes": notes,
        "reviewed_at": str(date.today()),
    }


def _prompt_optional_text(label):
    """Prompt for free text where just pressing enter yields an empty string."""
    return click.prompt(click.style(label, fg=PROMPT_COLOR), default="", show_default=False)


def _prompt_catalog_search(queue_row, purchase_rows, catalog_rows):
    """Run the interactive catalog search; return the chosen row, or None to go back."""
    while True:
        query = _prompt_optional_text("search").strip()
        if not query:
            return None
        search_rows = search_catalog_rows(
            query,
            catalog_rows,
            purchase_rows,
            queue_row["normalized_item_id"],
        )
        if not search_rows:
            click.echo("no matches found")
            retry = _prompt_optional_text("search again? [enter=yes, q=no]").strip().lower()
            if retry == "q":
                return None
            continue
        click.echo(f"{len(search_rows)} search results found:")
        print_catalog_rows(search_rows)
        choice = click.prompt(
            click.style("selection", fg=PROMPT_COLOR),
            type=click.IntRange(1, len(search_rows)),
        )
        return search_rows[choice - 1]


def prompt_resolution(queue_row, related_rows, purchase_rows, catalog_rows, queue_index, queue_total):
    """Interactively resolve one review-queue item.

    Shows the matched purchase rows and any catalog suggestions, then asks the
    reviewer for an action: a suggestion number, ``f`` (search the catalog),
    ``n`` (create a new catalog entry), ``s`` (skip), ``x`` (exclude) or
    ``q`` (quit).  Invalid input, and backing out of a search, redisplay the
    item and ask again.

    Args:
        queue_row: queue dict; must contain ``normalized_item_id``.
        related_rows: purchase rows belonging to this normalized item.
        purchase_rows: all purchase rows (used for suggestions and search).
        catalog_rows: all catalog rows.
        queue_index: 1-based position shown in the header.
        queue_total: queue length shown in the header.

    Returns:
        ``(resolution_row, catalog_row)`` where ``catalog_row`` is a new
        catalog dict for the ``n`` action and ``None`` otherwise, or
        ``(None, None)`` when the reviewer quits.
    """
    # The inputs do not change while this item is on screen, so compute the
    # suggestions and menu once and reuse them on every redisplay.
    suggestions = suggestion_display_rows(
        build_catalog_suggestions(related_rows, purchase_rows, catalog_rows),
        purchase_rows,
        catalog_rows,
    )
    normalized_name = normalized_label(queue_row, related_rows)
    matched_count = len(related_rows)
    prompt_bits = []
    if suggestions:
        prompt_bits.append("[#] link to suggestion")
    prompt_bits.extend(["[f]ind", "[n]ew", "[s]kip", "e[x]clude", "[q]uit"])
    menu_text = "  ".join(prompt_bits) + " >"

    # Loop rather than recurse on invalid input so that repeated mistakes
    # cannot grow the call stack.
    while True:
        click.echo("")
        click.secho(
            f"Review {queue_index}/{queue_total}: {normalized_name}",
            fg=INFO_COLOR,
        )
        click.echo(f"{matched_count} matched items:")
        for line in build_display_lines(related_rows):
            click.echo(line)
        if suggestions:
            click.echo(f"{len(suggestions)} catalog_name suggestions found:")
            print_catalog_rows(suggestions)
        else:
            click.echo("no catalog_name suggestions found")
        click.secho(menu_text, fg=PROMPT_COLOR)
        action = click.prompt("", type=str, prompt_suffix=" ").strip().lower()

        # isdecimal(), not isdigit(): isdigit() also accepts characters such
        # as superscript digits that int() rejects with ValueError.
        if action.isdecimal() and suggestions:
            choice = int(action)
            if 1 <= choice <= len(suggestions):
                chosen_row = suggestions[choice - 1]
                notes = _prompt_optional_text("link notes")
                return (
                    _make_resolution_row(queue_row, chosen_row["catalog_id"], "link", "approved", notes),
                    None,
                )
            click.secho("invalid suggestion number", fg=WARNING_COLOR)
            continue

        if action == "q":
            return None, None

        if action == "s":
            # A skip stays pending and keeps whatever notes the queue row had.
            return (
                _make_resolution_row(
                    queue_row, "", "skip", "pending", queue_row.get("resolution_notes", "")
                ),
                None,
            )

        if action == "f":
            chosen_row = _prompt_catalog_search(queue_row, purchase_rows, catalog_rows)
            if chosen_row is None:
                continue
            notes = _prompt_optional_text("link notes")
            return (
                _make_resolution_row(queue_row, chosen_row["catalog_id"], "link", "approved", notes),
                None,
            )

        if action == "x":
            notes = _prompt_optional_text("exclude notes")
            return _make_resolution_row(queue_row, "", "exclude", "approved", notes), None

        if action == "n":
            catalog_name = click.prompt(click.style("catalog name", fg=PROMPT_COLOR), type=str)
            product_type = _prompt_optional_text("product type")
            category = _prompt_optional_text("category")
            notes = _prompt_optional_text("notes")
            # The id is derived from name, category and product type via
            # stable_id, so the same answers map to the same catalog id.
            catalog_id = stable_id("cat", f"manual|{catalog_name}|{category}|{product_type}")
            today_text = str(date.today())
            catalog_row = {
                "catalog_id": catalog_id,
                "catalog_name": catalog_name,
                "category": category,
                "product_type": product_type,
                "brand": "",
                "variant": "",
                "size_value": "",
                "size_unit": "",
                "pack_qty": "",
                "measure_type": "",
                "notes": notes,
                "created_at": today_text,
                "updated_at": today_text,
            }
            return (
                _make_resolution_row(queue_row, catalog_id, "create", "approved", notes),
                catalog_row,
            )

        click.secho("invalid action", fg=WARNING_COLOR)


def apply_resolution_to_queue(queue_rows, resolution_lookup):
    """Return copies of ``queue_rows`` with their stored resolutions merged in.

    For a row whose ``normalized_item_id`` has a non-empty entry in
    ``resolution_lookup``, the catalog id, status, action, notes and
    ``updated_at`` are overwritten from the resolution (``updated_at`` falls
    back to today's date when ``reviewed_at`` is absent).  An approved
    resolution also fills in a missing or empty ``created_at``.  Rows without
    a resolution are copied unchanged; the input rows are never mutated.
    """
    fallback_date = str(date.today())

    def with_resolution(queue_row):
        decision = resolution_lookup.get(queue_row["normalized_item_id"], {})
        merged = dict(queue_row)
        if not decision:
            return merged
        reviewed_at = decision.get("reviewed_at", fallback_date)
        merged.update(
            catalog_id=decision.get("catalog_id", ""),
            status=decision.get("status", merged.get("status", "pending")),
            resolution_action=decision.get("resolution_action", ""),
            resolution_notes=decision.get("resolution_notes", ""),
            updated_at=reviewed_at,
        )
        if decision.get("status") == "approved" and not merged.get("created_at"):
            merged["created_at"] = reviewed_at
        return merged

    return [with_resolution(queue_row) for queue_row in queue_rows]


def link_rows_from_state(link_lookup):
    """Return the values of ``link_lookup`` as a list sorted by ``normalized_item_id``."""
    link_rows = list(link_lookup.values())
    link_rows.sort(key=lambda entry: entry["normalized_item_id"])
    return link_rows


# Interactive entry point: rebuilds the review queue CSV, then (unless
# --refresh-only is given) prompts for a resolution per queued item and writes
# the queue, resolution, catalog and link CSVs after every answer.
# NOTE(review): click uses a command function's docstring as its --help text,
# so the documentation here is kept in comments to leave that output unchanged.
@click.command()
@click.option("--purchases-csv", default="data/review/purchases.csv", show_default=True)
@click.option("--queue-csv", default="data/review/review_queue.csv", show_default=True)
@click.option("--resolutions-csv", default="data/review/review_resolutions.csv", show_default=True)
@click.option("--catalog-csv", default="data/catalog.csv", show_default=True)
@click.option("--links-csv", default="data/review/product_links.csv", show_default=True)
@click.option("--limit", default=0, show_default=True, type=int)
@click.option("--refresh-only", is_flag=True, help="Only rebuild review_queue.csv without prompting.")
def main(purchases_csv, queue_csv, resolutions_csv, catalog_csv, links_csv, limit, refresh_only):
    purchase_rows = build_purchases.read_optional_csv_rows(purchases_csv)
    resolution_rows = build_purchases.read_optional_csv_rows(resolutions_csv)
    # merge_catalog_rows is called with an empty second list; presumably this
    # just normalizes the loaded rows — confirm against build_purchases.
    catalog_rows = build_purchases.merge_catalog_rows(build_purchases.read_optional_csv_rows(catalog_csv), [])
    link_lookup = build_purchases.load_link_lookup(build_purchases.read_optional_csv_rows(links_csv))
    # The queue is written before any prompting, so --refresh-only can stop here.
    queue_rows = build_review_queue(purchase_rows, resolution_rows)
    write_csv_rows(queue_csv, queue_rows, QUEUE_FIELDS)
    click.echo(f"wrote {len(queue_rows)} rows to {queue_csv}")

    if refresh_only:
        return

    print_intro_text()
    resolution_lookup = build_purchases.load_resolution_lookup(resolution_rows)
    catalog_by_id = {row["catalog_id"]: row for row in catalog_rows if row.get("catalog_id")}
    # Index purchase rows by normalized item so each prompt can show its matches.
    rows_by_normalized = defaultdict(list)
    for row in purchase_rows:
        normalized_item_id = row.get("normalized_item_id", "")
        if normalized_item_id:
            rows_by_normalized[normalized_item_id].append(row)

    reviewed = 0
    for index, queue_row in enumerate(queue_rows, start=1):
        # A limit of 0 (the default) is falsy, so it means "no cap".
        if limit and reviewed >= limit:
            break
        related_rows = rows_by_normalized.get(queue_row["normalized_item_id"], [])
        result = prompt_resolution(queue_row, related_rows, purchase_rows, catalog_rows, index, len(queue_rows))
        # (None, None) is what prompt_resolution returns when the reviewer quits.
        if result == (None, None):
            break
        resolution_row, catalog_row = result
        resolution_lookup[resolution_row["normalized_item_id"]] = resolution_row
        # Only catalog ids not seen before are added; existing rows are left as they are.
        if catalog_row and catalog_row["catalog_id"] not in catalog_by_id:
            catalog_by_id[catalog_row["catalog_id"]] = catalog_row
            catalog_rows.append(catalog_row)
        normalized_item_id = resolution_row["normalized_item_id"]
        if resolution_row["status"] == "approved":
            if resolution_row["resolution_action"] in {"link", "create"} and resolution_row.get("catalog_id"):
                link_lookup[normalized_item_id] = {
                    "normalized_item_id": normalized_item_id,
                    "catalog_id": resolution_row["catalog_id"],
                    "link_method": f"manual_{resolution_row['resolution_action']}",
                    "link_confidence": "high",
                    "review_status": "approved",
                    "reviewed_by": "",
                    "reviewed_at": resolution_row.get("reviewed_at", ""),
                    "link_notes": resolution_row.get("resolution_notes", ""),
                }
            elif resolution_row["resolution_action"] == "exclude":
                # Excluding an item drops any link previously stored for it.
                link_lookup.pop(normalized_item_id, None)
        # Rebinding queue_rows does not affect the enumerate() above, which
        # keeps iterating the list it started with.
        queue_rows = apply_resolution_to_queue(queue_rows, resolution_lookup)
        # Write all four files after each answer so every decision is on disk
        # before the next prompt.
        write_csv_rows(queue_csv, queue_rows, QUEUE_FIELDS)
        save_resolution_rows(
            resolutions_csv,
            sorted(resolution_lookup.values(), key=lambda row: row["normalized_item_id"]),
        )
        save_catalog_rows(catalog_csv, sorted(catalog_by_id.values(), key=lambda row: row["catalog_id"]))
        save_link_rows(links_csv, link_rows_from_state(link_lookup))
        reviewed += 1

    # Final save; this also runs when the loop handled zero items.
    save_resolution_rows(resolutions_csv, sorted(resolution_lookup.values(), key=lambda row: row["normalized_item_id"]))
    save_catalog_rows(catalog_csv, sorted(catalog_by_id.values(), key=lambda row: row["catalog_id"]))
    save_link_rows(links_csv, link_rows_from_state(link_lookup))
    click.echo(
        f"saved {len(resolution_lookup)} resolution rows to {resolutions_csv}, "
        f"{len(catalog_by_id)} catalog rows to {catalog_csv}, "
        f"and {len(link_lookup)} product links to {links_csv}"
    )


# Invoke the click command only when this file is run as a script.
if __name__ == "__main__":
    main()