Refactor review pipeline around normalized items
This commit is contained in:
@@ -10,8 +10,8 @@ from layer_helpers import compact_join, stable_id, write_csv_rows
|
||||
QUEUE_FIELDS = [
|
||||
"review_id",
|
||||
"retailer",
|
||||
"observed_product_id",
|
||||
"canonical_product_id",
|
||||
"normalized_item_id",
|
||||
"catalog_id",
|
||||
"reason_code",
|
||||
"priority",
|
||||
"raw_item_names",
|
||||
@@ -26,36 +26,49 @@ QUEUE_FIELDS = [
|
||||
"updated_at",
|
||||
]
|
||||
|
||||
INFO_COLOR = "cyan"
|
||||
PROMPT_COLOR = "bright_yellow"
|
||||
WARNING_COLOR = "magenta"
|
||||
|
||||
|
||||
def build_review_queue(purchase_rows, resolution_rows):
|
||||
by_observed = defaultdict(list)
|
||||
by_normalized = defaultdict(list)
|
||||
resolution_lookup = build_purchases.load_resolution_lookup(resolution_rows)
|
||||
|
||||
for row in purchase_rows:
|
||||
observed_product_id = row.get("observed_product_id", "")
|
||||
if not observed_product_id:
|
||||
normalized_item_id = row.get("normalized_item_id", "")
|
||||
if not normalized_item_id:
|
||||
continue
|
||||
by_observed[observed_product_id].append(row)
|
||||
by_normalized[normalized_item_id].append(row)
|
||||
|
||||
today_text = str(date.today())
|
||||
queue_rows = []
|
||||
for observed_product_id, rows in sorted(by_observed.items()):
|
||||
current_resolution = resolution_lookup.get(observed_product_id, {})
|
||||
for normalized_item_id, rows in sorted(by_normalized.items()):
|
||||
current_resolution = resolution_lookup.get(normalized_item_id, {})
|
||||
if current_resolution.get("status") == "approved":
|
||||
continue
|
||||
unresolved_rows = [row for row in rows if not row.get("canonical_product_id")]
|
||||
|
||||
unresolved_rows = [
|
||||
row
|
||||
for row in rows
|
||||
if not row.get("catalog_id")
|
||||
and row.get("is_item", "true") != "false"
|
||||
and row.get("is_fee") != "true"
|
||||
and row.get("is_discount_line") != "true"
|
||||
and row.get("is_coupon_line") != "true"
|
||||
]
|
||||
if not unresolved_rows:
|
||||
continue
|
||||
|
||||
retailers = sorted({row["retailer"] for row in rows})
|
||||
review_id = stable_id("rvw", observed_product_id)
|
||||
review_id = stable_id("rvw", normalized_item_id)
|
||||
queue_rows.append(
|
||||
{
|
||||
"review_id": review_id,
|
||||
"retailer": " | ".join(retailers),
|
||||
"observed_product_id": observed_product_id,
|
||||
"canonical_product_id": current_resolution.get("canonical_product_id", ""),
|
||||
"reason_code": "missing_canonical_link",
|
||||
"normalized_item_id": normalized_item_id,
|
||||
"catalog_id": current_resolution.get("catalog_id", ""),
|
||||
"reason_code": "missing_catalog_link",
|
||||
"priority": "high",
|
||||
"raw_item_names": compact_join(
|
||||
sorted({row["raw_item_name"] for row in rows if row["raw_item_name"]}),
|
||||
@@ -98,11 +111,6 @@ def save_catalog_rows(path, rows):
|
||||
write_csv_rows(path, rows, build_purchases.CATALOG_FIELDS)
|
||||
|
||||
|
||||
INFO_COLOR = "cyan"
|
||||
PROMPT_COLOR = "bright_yellow"
|
||||
WARNING_COLOR = "magenta"
|
||||
|
||||
|
||||
def sort_related_items(rows):
|
||||
return sorted(
|
||||
rows,
|
||||
@@ -115,7 +123,7 @@ def sort_related_items(rows):
|
||||
)
|
||||
|
||||
|
||||
def build_canonical_suggestions(related_rows, catalog_rows, limit=3):
|
||||
def build_catalog_suggestions(related_rows, purchase_rows, catalog_rows, limit=3):
|
||||
normalized_names = {
|
||||
row.get("normalized_item_name", "").strip().upper()
|
||||
for row in related_rows
|
||||
@@ -126,56 +134,52 @@ def build_canonical_suggestions(related_rows, catalog_rows, limit=3):
|
||||
for row in related_rows
|
||||
if row.get("upc", "").strip()
|
||||
}
|
||||
catalog_by_id = {
|
||||
row.get("catalog_id", ""): row for row in catalog_rows if row.get("catalog_id", "")
|
||||
}
|
||||
suggestions = []
|
||||
seen_ids = set()
|
||||
|
||||
def add_matches(rows, reason):
|
||||
for row in rows:
|
||||
canonical_product_id = row.get("canonical_product_id", "")
|
||||
if not canonical_product_id or canonical_product_id in seen_ids:
|
||||
continue
|
||||
seen_ids.add(canonical_product_id)
|
||||
suggestions.append(
|
||||
{
|
||||
"canonical_product_id": canonical_product_id,
|
||||
"canonical_name": row.get("canonical_name", ""),
|
||||
"reason": reason,
|
||||
}
|
||||
)
|
||||
if len(suggestions) >= limit:
|
||||
return True
|
||||
return False
|
||||
def add_catalog_id(catalog_id, reason):
|
||||
if not catalog_id or catalog_id in seen_ids or catalog_id not in catalog_by_id:
|
||||
return False
|
||||
seen_ids.add(catalog_id)
|
||||
catalog_row = catalog_by_id[catalog_id]
|
||||
suggestions.append(
|
||||
{
|
||||
"catalog_id": catalog_id,
|
||||
"catalog_name": catalog_row.get("catalog_name", ""),
|
||||
"reason": reason,
|
||||
}
|
||||
)
|
||||
return len(suggestions) >= limit
|
||||
|
||||
exact_upc_rows = [
|
||||
row
|
||||
for row in catalog_rows
|
||||
if row.get("upc", "").strip() and row.get("upc", "").strip() in upcs
|
||||
reviewed_purchase_rows = [
|
||||
row for row in purchase_rows if row.get("catalog_id") and row.get("normalized_item_id")
|
||||
]
|
||||
if add_matches(exact_upc_rows, "exact upc"):
|
||||
return suggestions
|
||||
for row in reviewed_purchase_rows:
|
||||
if row.get("upc", "").strip() and row.get("upc", "").strip() in upcs:
|
||||
if add_catalog_id(row.get("catalog_id", ""), "exact upc"):
|
||||
return suggestions
|
||||
|
||||
exact_name_rows = [
|
||||
row
|
||||
for row in catalog_rows
|
||||
if row.get("canonical_name", "").strip().upper() in normalized_names
|
||||
]
|
||||
if add_matches(exact_name_rows, "exact normalized name"):
|
||||
return suggestions
|
||||
for row in reviewed_purchase_rows:
|
||||
if row.get("normalized_item_name", "").strip().upper() in normalized_names:
|
||||
if add_catalog_id(row.get("catalog_id", ""), "exact normalized name"):
|
||||
return suggestions
|
||||
|
||||
contains_rows = []
|
||||
for row in catalog_rows:
|
||||
canonical_name = row.get("canonical_name", "").strip().upper()
|
||||
if not canonical_name:
|
||||
for catalog_row in catalog_rows:
|
||||
catalog_name = catalog_row.get("catalog_name", "").strip().upper()
|
||||
if not catalog_name:
|
||||
continue
|
||||
for normalized_name in normalized_names:
|
||||
if normalized_name in canonical_name or canonical_name in normalized_name:
|
||||
contains_rows.append(row)
|
||||
if normalized_name in catalog_name or catalog_name in normalized_name:
|
||||
if add_catalog_id(catalog_row.get("catalog_id", ""), "catalog name contains match"):
|
||||
return suggestions
|
||||
break
|
||||
add_matches(contains_rows, "canonical name contains match")
|
||||
return suggestions
|
||||
|
||||
|
||||
def build_display_lines(queue_row, related_rows):
|
||||
def build_display_lines(related_rows):
|
||||
lines = []
|
||||
for index, row in enumerate(sort_related_items(related_rows), start=1):
|
||||
lines.append(
|
||||
@@ -197,41 +201,38 @@ def build_display_lines(queue_row, related_rows):
|
||||
return lines
|
||||
|
||||
|
||||
def observed_name(queue_row, related_rows):
|
||||
def normalized_label(queue_row, related_rows):
|
||||
if queue_row.get("normalized_names"):
|
||||
return queue_row["normalized_names"].split(" | ")[0]
|
||||
for row in related_rows:
|
||||
if row.get("normalized_item_name"):
|
||||
return row["normalized_item_name"]
|
||||
return queue_row.get("observed_product_id", "")
|
||||
return queue_row.get("normalized_item_id", "")
|
||||
|
||||
|
||||
def choose_existing_canonical(display_rows, observed_label, matched_count):
|
||||
def choose_existing_catalog(display_rows, normalized_name, matched_count):
|
||||
click.secho(
|
||||
f"Select the canonical_name to associate {matched_count} items with:",
|
||||
f"Select the catalog_name to associate {matched_count} items with:",
|
||||
fg=INFO_COLOR,
|
||||
)
|
||||
for index, row in enumerate(display_rows, start=1):
|
||||
click.echo(f" [{index}] {row['canonical_name']} | {row['canonical_product_id']}")
|
||||
click.echo(f" [{index}] {row['catalog_name']} | {row['catalog_id']}")
|
||||
choice = click.prompt(
|
||||
click.style("selection", fg=PROMPT_COLOR),
|
||||
type=click.IntRange(1, len(display_rows)),
|
||||
)
|
||||
chosen_row = display_rows[choice - 1]
|
||||
click.echo(
|
||||
f'{matched_count} "{observed_label}" items and future matches will be associated '
|
||||
f'with "{chosen_row["canonical_name"]}".'
|
||||
)
|
||||
click.secho(
|
||||
"actions: [y]es [n]o [b]ack [s]kip [q]uit",
|
||||
fg=PROMPT_COLOR,
|
||||
f'{matched_count} "{normalized_name}" items and future matches will be associated '
|
||||
f'with "{chosen_row["catalog_name"]}".'
|
||||
)
|
||||
click.secho("actions: [y]es [n]o [b]ack [s]kip [q]uit", fg=PROMPT_COLOR)
|
||||
confirm = click.prompt(
|
||||
click.style("confirm", fg=PROMPT_COLOR),
|
||||
type=click.Choice(["y", "n", "b", "s", "q"]),
|
||||
)
|
||||
if confirm == "y":
|
||||
return chosen_row["canonical_product_id"], ""
|
||||
return chosen_row["catalog_id"], ""
|
||||
if confirm == "s":
|
||||
return "", "skip"
|
||||
if confirm == "q":
|
||||
@@ -239,54 +240,43 @@ def choose_existing_canonical(display_rows, observed_label, matched_count):
|
||||
return "", "back"
|
||||
|
||||
|
||||
def prompt_resolution(queue_row, related_rows, catalog_rows, queue_index, queue_total):
|
||||
suggestions = build_canonical_suggestions(related_rows, catalog_rows)
|
||||
observed_label = observed_name(queue_row, related_rows)
|
||||
def prompt_resolution(queue_row, related_rows, purchase_rows, catalog_rows, queue_index, queue_total):
|
||||
suggestions = build_catalog_suggestions(related_rows, purchase_rows, catalog_rows)
|
||||
normalized_name = normalized_label(queue_row, related_rows)
|
||||
matched_count = len(related_rows)
|
||||
click.echo("")
|
||||
click.secho(
|
||||
f"Review {queue_index}/{queue_total}: Resolve observed_product {observed_label} "
|
||||
"to canonical_name [__]?",
|
||||
f"Review {queue_index}/{queue_total}: Resolve normalized_item {normalized_name} "
|
||||
"to catalog_name [__]?",
|
||||
fg=INFO_COLOR,
|
||||
)
|
||||
click.echo(f"{matched_count} matched items:")
|
||||
for line in build_display_lines(queue_row, related_rows):
|
||||
for line in build_display_lines(related_rows):
|
||||
click.echo(line)
|
||||
if suggestions:
|
||||
click.echo(f"{len(suggestions)} canonical suggestions found:")
|
||||
click.echo(f"{len(suggestions)} catalog_name suggestions found:")
|
||||
for index, suggestion in enumerate(suggestions, start=1):
|
||||
click.echo(f" [{index}] {suggestion['canonical_name']}")
|
||||
click.echo(f" [{index}] {suggestion['catalog_name']}")
|
||||
else:
|
||||
click.echo("no canonical_name suggestions found")
|
||||
click.secho(
|
||||
"[l]ink existing [n]ew canonical e[x]clude [s]kip [q]uit:",
|
||||
fg=PROMPT_COLOR,
|
||||
)
|
||||
action = click.prompt(
|
||||
"",
|
||||
type=click.Choice(["l", "n", "x", "s", "q"]),
|
||||
prompt_suffix=" ",
|
||||
)
|
||||
click.echo("no catalog_name suggestions found")
|
||||
click.secho("[l]ink existing [n]ew catalog e[x]clude [s]kip [q]uit:", fg=PROMPT_COLOR)
|
||||
action = click.prompt("", type=click.Choice(["l", "n", "x", "s", "q"]), prompt_suffix=" ")
|
||||
if action == "q":
|
||||
return None, None
|
||||
if action == "s":
|
||||
return {
|
||||
"observed_product_id": queue_row["observed_product_id"],
|
||||
"canonical_product_id": "",
|
||||
"normalized_item_id": queue_row["normalized_item_id"],
|
||||
"catalog_id": "",
|
||||
"resolution_action": "skip",
|
||||
"status": "pending",
|
||||
"resolution_notes": queue_row.get("resolution_notes", ""),
|
||||
"reviewed_at": str(date.today()),
|
||||
}, None
|
||||
if action == "x":
|
||||
notes = click.prompt(
|
||||
click.style("exclude notes", fg=PROMPT_COLOR),
|
||||
default="",
|
||||
show_default=False,
|
||||
)
|
||||
notes = click.prompt(click.style("exclude notes", fg=PROMPT_COLOR), default="", show_default=False)
|
||||
return {
|
||||
"observed_product_id": queue_row["observed_product_id"],
|
||||
"canonical_product_id": "",
|
||||
"normalized_item_id": queue_row["normalized_item_id"],
|
||||
"catalog_id": "",
|
||||
"resolution_action": "exclude",
|
||||
"status": "approved",
|
||||
"resolution_notes": notes,
|
||||
@@ -295,22 +285,19 @@ def prompt_resolution(queue_row, related_rows, catalog_rows, queue_index, queue_
|
||||
if action == "l":
|
||||
display_rows = suggestions or [
|
||||
{
|
||||
"canonical_product_id": row["canonical_product_id"],
|
||||
"canonical_name": row["canonical_name"],
|
||||
"catalog_id": row["catalog_id"],
|
||||
"catalog_name": row["catalog_name"],
|
||||
"reason": "catalog sample",
|
||||
}
|
||||
for row in catalog_rows[:10]
|
||||
if row.get("catalog_id")
|
||||
]
|
||||
while True:
|
||||
canonical_product_id, outcome = choose_existing_canonical(
|
||||
display_rows,
|
||||
observed_label,
|
||||
matched_count,
|
||||
)
|
||||
catalog_id, outcome = choose_existing_catalog(display_rows, normalized_name, matched_count)
|
||||
if outcome == "skip":
|
||||
return {
|
||||
"observed_product_id": queue_row["observed_product_id"],
|
||||
"canonical_product_id": "",
|
||||
"normalized_item_id": queue_row["normalized_item_id"],
|
||||
"catalog_id": "",
|
||||
"resolution_action": "skip",
|
||||
"status": "pending",
|
||||
"resolution_notes": queue_row.get("resolution_notes", ""),
|
||||
@@ -323,34 +310,22 @@ def prompt_resolution(queue_row, related_rows, catalog_rows, queue_index, queue_
|
||||
break
|
||||
notes = click.prompt(click.style("link notes", fg=PROMPT_COLOR), default="", show_default=False)
|
||||
return {
|
||||
"observed_product_id": queue_row["observed_product_id"],
|
||||
"canonical_product_id": canonical_product_id,
|
||||
"normalized_item_id": queue_row["normalized_item_id"],
|
||||
"catalog_id": catalog_id,
|
||||
"resolution_action": "link",
|
||||
"status": "approved",
|
||||
"resolution_notes": notes,
|
||||
"reviewed_at": str(date.today()),
|
||||
}, None
|
||||
|
||||
canonical_name = click.prompt(click.style("canonical name", fg=PROMPT_COLOR), type=str)
|
||||
category = click.prompt(
|
||||
click.style("category", fg=PROMPT_COLOR),
|
||||
default="",
|
||||
show_default=False,
|
||||
)
|
||||
product_type = click.prompt(
|
||||
click.style("product type", fg=PROMPT_COLOR),
|
||||
default="",
|
||||
show_default=False,
|
||||
)
|
||||
notes = click.prompt(
|
||||
click.style("notes", fg=PROMPT_COLOR),
|
||||
default="",
|
||||
show_default=False,
|
||||
)
|
||||
canonical_product_id = stable_id("gcan", f"manual|{canonical_name}|{category}|{product_type}")
|
||||
canonical_row = {
|
||||
"canonical_product_id": canonical_product_id,
|
||||
"canonical_name": canonical_name,
|
||||
catalog_name = click.prompt(click.style("catalog name", fg=PROMPT_COLOR), type=str)
|
||||
category = click.prompt(click.style("category", fg=PROMPT_COLOR), default="", show_default=False)
|
||||
product_type = click.prompt(click.style("product type", fg=PROMPT_COLOR), default="", show_default=False)
|
||||
notes = click.prompt(click.style("notes", fg=PROMPT_COLOR), default="", show_default=False)
|
||||
catalog_id = stable_id("cat", f"manual|{catalog_name}|{category}|{product_type}")
|
||||
catalog_row = {
|
||||
"catalog_id": catalog_id,
|
||||
"catalog_name": catalog_name,
|
||||
"category": category,
|
||||
"product_type": product_type,
|
||||
"brand": "",
|
||||
@@ -364,14 +339,14 @@ def prompt_resolution(queue_row, related_rows, catalog_rows, queue_index, queue_
|
||||
"updated_at": str(date.today()),
|
||||
}
|
||||
resolution_row = {
|
||||
"observed_product_id": queue_row["observed_product_id"],
|
||||
"canonical_product_id": canonical_product_id,
|
||||
"normalized_item_id": queue_row["normalized_item_id"],
|
||||
"catalog_id": catalog_id,
|
||||
"resolution_action": "create",
|
||||
"status": "approved",
|
||||
"resolution_notes": notes,
|
||||
"reviewed_at": str(date.today()),
|
||||
}
|
||||
return resolution_row, canonical_row
|
||||
return resolution_row, catalog_row
|
||||
|
||||
|
||||
@click.command()
|
||||
@@ -384,7 +359,7 @@ def prompt_resolution(queue_row, related_rows, catalog_rows, queue_index, queue_
|
||||
def main(purchases_csv, queue_csv, resolutions_csv, catalog_csv, limit, refresh_only):
|
||||
purchase_rows = build_purchases.read_optional_csv_rows(purchases_csv)
|
||||
resolution_rows = build_purchases.read_optional_csv_rows(resolutions_csv)
|
||||
catalog_rows = build_purchases.read_optional_csv_rows(catalog_csv)
|
||||
catalog_rows = build_purchases.merge_catalog_rows(build_purchases.read_optional_csv_rows(catalog_csv), [])
|
||||
queue_rows = build_review_queue(purchase_rows, resolution_rows)
|
||||
write_csv_rows(queue_csv, queue_rows, QUEUE_FIELDS)
|
||||
click.echo(f"wrote {len(queue_rows)} rows to {queue_csv}")
|
||||
@@ -393,29 +368,33 @@ def main(purchases_csv, queue_csv, resolutions_csv, catalog_csv, limit, refresh_
|
||||
return
|
||||
|
||||
resolution_lookup = build_purchases.load_resolution_lookup(resolution_rows)
|
||||
catalog_by_id = {row["canonical_product_id"]: row for row in catalog_rows if row.get("canonical_product_id")}
|
||||
rows_by_observed = defaultdict(list)
|
||||
catalog_by_id = {row["catalog_id"]: row for row in catalog_rows if row.get("catalog_id")}
|
||||
rows_by_normalized = defaultdict(list)
|
||||
for row in purchase_rows:
|
||||
observed_product_id = row.get("observed_product_id", "")
|
||||
if observed_product_id:
|
||||
rows_by_observed[observed_product_id].append(row)
|
||||
normalized_item_id = row.get("normalized_item_id", "")
|
||||
if normalized_item_id:
|
||||
rows_by_normalized[normalized_item_id].append(row)
|
||||
|
||||
reviewed = 0
|
||||
for index, queue_row in enumerate(queue_rows, start=1):
|
||||
if limit and reviewed >= limit:
|
||||
break
|
||||
related_rows = rows_by_observed.get(queue_row["observed_product_id"], [])
|
||||
result = prompt_resolution(queue_row, related_rows, catalog_rows, index, len(queue_rows))
|
||||
related_rows = rows_by_normalized.get(queue_row["normalized_item_id"], [])
|
||||
result = prompt_resolution(queue_row, related_rows, purchase_rows, catalog_rows, index, len(queue_rows))
|
||||
if result == (None, None):
|
||||
break
|
||||
resolution_row, canonical_row = result
|
||||
resolution_lookup[resolution_row["observed_product_id"]] = resolution_row
|
||||
if canonical_row and canonical_row["canonical_product_id"] not in catalog_by_id:
|
||||
catalog_by_id[canonical_row["canonical_product_id"]] = canonical_row
|
||||
catalog_rows.append(canonical_row)
|
||||
resolution_row, catalog_row = result
|
||||
resolution_lookup[resolution_row["normalized_item_id"]] = resolution_row
|
||||
if catalog_row and catalog_row["catalog_id"] not in catalog_by_id:
|
||||
catalog_by_id[catalog_row["catalog_id"]] = catalog_row
|
||||
catalog_rows.append(catalog_row)
|
||||
reviewed += 1
|
||||
|
||||
save_resolution_rows(resolutions_csv, sorted(resolution_lookup.values(), key=lambda row: row["observed_product_id"]))
|
||||
save_catalog_rows(catalog_csv, sorted(catalog_by_id.values(), key=lambda row: row["canonical_product_id"]))
|
||||
save_resolution_rows(
|
||||
resolutions_csv,
|
||||
sorted(resolution_lookup.values(), key=lambda row: row["normalized_item_id"]),
|
||||
)
|
||||
save_catalog_rows(catalog_csv, sorted(catalog_by_id.values(), key=lambda row: row["catalog_id"]))
|
||||
click.echo(
|
||||
f"saved {len(resolution_lookup)} resolution rows to {resolutions_csv} "
|
||||
f"and {len(catalog_by_id)} catalog rows to {catalog_csv}"
|
||||
|
||||
Reference in New Issue
Block a user