Refactor retailer normalization outputs
This commit is contained in:
111
enrich_giant.py
111
enrich_giant.py
@@ -16,6 +16,9 @@ OUTPUT_FIELDS = [
|
||||
"retailer",
|
||||
"order_id",
|
||||
"line_no",
|
||||
"normalized_row_id",
|
||||
"normalized_item_id",
|
||||
"normalization_basis",
|
||||
"observed_item_key",
|
||||
"order_date",
|
||||
"retailer_item_id",
|
||||
@@ -44,13 +47,21 @@ OUTPUT_FIELDS = [
|
||||
"size_unit",
|
||||
"pack_qty",
|
||||
"measure_type",
|
||||
"normalized_quantity",
|
||||
"normalized_quantity_unit",
|
||||
"is_store_brand",
|
||||
"is_item",
|
||||
"is_fee",
|
||||
"is_discount_line",
|
||||
"is_coupon_line",
|
||||
"price_per_each",
|
||||
"price_per_each_basis",
|
||||
"price_per_count",
|
||||
"price_per_count_basis",
|
||||
"price_per_lb",
|
||||
"price_per_lb_basis",
|
||||
"price_per_oz",
|
||||
"price_per_oz_basis",
|
||||
"parse_version",
|
||||
"parse_notes",
|
||||
]
|
||||
@@ -329,6 +340,65 @@ def derive_prices(item, measure_type, size_value="", size_unit="", pack_qty=""):
|
||||
return price_per_each, price_per_lb, price_per_oz
|
||||
|
||||
|
||||
def derive_normalized_quantity(size_value, size_unit, pack_qty, measure_type):
    """Return a ``(quantity, unit)`` pair of strings for one item line.

    The first matching rule wins:

    1. A parsed, non-zero size together with a truthy ``size_unit`` yields
       ``size * pack`` (formatted with ``format_decimal``) in ``size_unit``.
    2. ``measure_type == "count"`` yields the pack quantity with unit
       ``"count"``.
    3. ``measure_type == "each"`` yields ``("1", "each")``.
    4. Anything else yields ``("", "")``.
    """
    unit_size = to_decimal(size_value)
    # A pack quantity that parses to a falsy value (e.g. zero, or None if
    # to_decimal signals "unparseable" that way) is treated as a single pack.
    pack_count = to_decimal(pack_qty) or Decimal("1")

    size_is_usable = unit_size not in (None, Decimal("0"))
    if size_is_usable and size_unit:
        total_size = unit_size * pack_count
        return format_decimal(total_size), size_unit

    # NOTE(review): because of the Decimal("1") default above, this pack guard
    # appears to always hold when to_decimal returns Decimal or None -- confirm
    # whether "count" rows without a parsed pack quantity should match here.
    if pack_count not in (None, Decimal("0")) and measure_type == "count":
        return format_decimal(pack_count), "count"

    return ("1", "each") if measure_type == "each" else ("", "")
|
||||
|
||||
|
||||
def derive_price_fields(price_per_each, price_per_lb, price_per_oz, line_total, qty, pack_qty):
    """Build the dict of price columns and their ``*_basis`` labels.

    ``price_per_each``, ``price_per_lb`` and ``price_per_oz`` are passed
    through unchanged; each gets a basis label only when the value is truthy.

    ``price_per_count`` is computed here as
    ``line_total / (qty * pack_qty)`` when the line total parses and both
    ``qty`` and ``pack_qty`` parse to non-zero values; otherwise it and its
    basis are empty strings.
    """
    total = to_decimal(line_total)
    units = to_decimal(qty)
    packs = to_decimal(pack_qty)

    # Values that cannot be used as a divisor.
    unusable = (None, Decimal("0"))

    per_count = ""
    per_count_basis = ""
    if total is not None and units not in unusable and packs not in unusable:
        per_count = format_decimal(total / (units * packs))
        per_count_basis = "line_total_over_pack_qty"

    def basis_for(value, label):
        # Only label a price that is actually present.
        return label if value else ""

    return dict(
        price_per_each=price_per_each,
        price_per_each_basis=basis_for(price_per_each, "line_total_over_qty"),
        price_per_count=per_count,
        price_per_count_basis=per_count_basis,
        price_per_lb=price_per_lb,
        price_per_lb_basis=basis_for(price_per_lb, "parsed_or_picked_weight"),
        price_per_oz=price_per_oz,
        price_per_oz_basis=basis_for(price_per_oz, "parsed_or_picked_weight"),
    )
|
||||
|
||||
|
||||
def normalization_identity(row):
    """Return ``(identity_key, basis)`` for a row, using the strongest key available.

    Precedence, first truthy field wins:

    * ``upc`` -> ``"<retailer>|upc=<upc>"``, basis ``"exact_upc"``
    * ``retailer_item_id`` -> ``"<retailer>|retailer_item_id=<id>"``,
      basis ``"exact_retailer_item_id"``
    * ``item_name_norm`` -> retailer, name, size, unit and pack joined with
      ``"|"``, basis ``"exact_name_size_pack"`` (missing size/unit/pack
      render as empty)
    * otherwise ``row["normalized_row_id"]``, basis ``"row_identity"``

    ``row["retailer"]`` is only read on the first three paths.
    """
    exact_identifiers = (
        ("upc", "exact_upc"),
        ("retailer_item_id", "exact_retailer_item_id"),
    )
    for field, basis in exact_identifiers:
        identifier = row.get(field)
        if identifier:
            return f"{row['retailer']}|{field}={identifier}", basis

    name = row.get("item_name_norm")
    if not name:
        # Nothing stable to key on: fall back to the per-row identity.
        return row["normalized_row_id"], "row_identity"

    parts = [row["retailer"], f"name={name}"]
    for label, field in (("size", "size_value"), ("unit", "size_unit"), ("pack", "pack_qty")):
        parts.append(f"{label}={row.get(field, '')}")
    return "|".join(parts), "exact_name_size_pack"
|
||||
|
||||
|
||||
def parse_item(order_id, order_date, raw_path, line_no, item):
|
||||
cleaned_name = clean_item_name(item.get("itemName", ""))
|
||||
size_value, size_unit, pack_qty = parse_size_and_pack(cleaned_name)
|
||||
@@ -352,11 +422,42 @@ def parse_item(order_id, order_date, raw_path, line_no, item):
|
||||
if size_value and not size_unit:
|
||||
parse_notes.append("size_without_unit")
|
||||
|
||||
normalized_row_id = f"{RETAILER}:{order_id}:{line_no}"
|
||||
normalized_quantity, normalized_quantity_unit = derive_normalized_quantity(
|
||||
size_value,
|
||||
size_unit,
|
||||
pack_qty,
|
||||
measure_type,
|
||||
)
|
||||
identity_key, normalization_basis = normalization_identity(
|
||||
{
|
||||
"retailer": RETAILER,
|
||||
"normalized_row_id": normalized_row_id,
|
||||
"upc": stringify(item.get("primUpcCd")),
|
||||
"retailer_item_id": stringify(item.get("podId")),
|
||||
"item_name_norm": normalized_name,
|
||||
"size_value": size_value,
|
||||
"size_unit": size_unit,
|
||||
"pack_qty": pack_qty,
|
||||
}
|
||||
)
|
||||
price_fields = derive_price_fields(
|
||||
price_per_each,
|
||||
price_per_lb,
|
||||
price_per_oz,
|
||||
stringify(item.get("groceryAmount")),
|
||||
stringify(item.get("shipQy")),
|
||||
pack_qty,
|
||||
)
|
||||
|
||||
return {
|
||||
"retailer": RETAILER,
|
||||
"order_id": str(order_id),
|
||||
"line_no": str(line_no),
|
||||
"observed_item_key": f"{RETAILER}:{order_id}:{line_no}",
|
||||
"normalized_row_id": normalized_row_id,
|
||||
"normalized_item_id": f"gnorm:{identity_key}",
|
||||
"normalization_basis": normalization_basis,
|
||||
"observed_item_key": normalized_row_id,
|
||||
"order_date": normalize_whitespace(order_date),
|
||||
"retailer_item_id": stringify(item.get("podId")),
|
||||
"pod_id": stringify(item.get("podId")),
|
||||
@@ -384,13 +485,14 @@ def parse_item(order_id, order_date, raw_path, line_no, item):
|
||||
"size_unit": size_unit,
|
||||
"pack_qty": pack_qty,
|
||||
"measure_type": measure_type,
|
||||
"normalized_quantity": normalized_quantity,
|
||||
"normalized_quantity_unit": normalized_quantity_unit,
|
||||
"is_store_brand": "true" if bool(prefix) else "false",
|
||||
"is_item": "false" if is_fee else "true",
|
||||
"is_fee": "true" if is_fee else "false",
|
||||
"is_discount_line": "false",
|
||||
"is_coupon_line": "false",
|
||||
"price_per_each": price_per_each,
|
||||
"price_per_lb": price_per_lb,
|
||||
"price_per_oz": price_per_oz,
|
||||
**price_fields,
|
||||
"parse_version": PARSER_VERSION,
|
||||
"parse_notes": ";".join(parse_notes),
|
||||
}
|
||||
@@ -443,6 +545,7 @@ def write_csv(path, rows):
|
||||
help="CSV path for enriched Giant item rows.",
|
||||
)
|
||||
def main(input_dir, output_csv):
|
||||
click.echo("legacy entrypoint: prefer normalize_giant_web.py for data-model outputs")
|
||||
raw_dir = Path(input_dir)
|
||||
output_path = Path(output_csv)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user