# Giant order-history enrichment CLI.
#
# Reads raw Giant order JSON files (one per order) from --input-dir, parses each
# line item into a flat row (columns listed in OUTPUT_FIELDS), and writes them
# sorted by (order_date, order_id, line_no) to --output-csv via csv.DictWriter.
#
# NOTE(review): this file's text has been mangled in transit — the module is
# collapsed onto a few physical lines, and a span is MISSING between
# `SIZE_RE = re.compile(r"(?` and the tail of the measure-type helper. The
# following names are referenced below but their definitions are not visible in
# this copy: clean_item_name, parse_size_and_pack, extract_store_brand_prefix,
# normalize_item_name, normalize_whitespace, extract_image_url, to_decimal,
# format_decimal, and the `def guess_measure_type(...)` header. The second
# FEE_PATTERNS regex literal (`DISC ... AT TOTAL`) is also split across the
# damaged line break. Code is left byte-identical below; restore this file from
# version control before making functional edits.
#
# Inventory of what IS visible (all claims grounded in the code below):
#   Constants:
#     PARSER_VERSION / RETAILER    - provenance stamps written into every row.
#     DEFAULT_INPUT_DIR/_OUTPUT_CSV- default CLI paths.
#     OUTPUT_FIELDS                - exact CSV column order for DictWriter.
#     STORE_BRAND_PREFIXES         - store-brand name prefixes ("SB", "NP").
#     ABBREVIATIONS                - uppercase token -> expanded word map,
#                                    presumably used by normalize_item_name
#                                    (not visible here) -- TODO confirm.
#     FEE_PATTERNS                 - compiled regexes marking fee lines
#                                    ("BAG CHARGE", "DISC ... AT TOTAL").
#   guess_measure_type (tail only) - classifies a line as "weight" / "volume" /
#     "count" / "each" / "" from the item's unit code, picked weight, parsed
#     size unit, and pack qty (header lost to the mangling).
#   is_fee_item(cleaned_name)      - True iff any FEE_PATTERNS regex matches.
#   derive_prices(item, measure_type, size_value, size_unit, pack_qty)
#     - returns (price_per_each, price_per_lb, price_per_oz) as strings;
#       "" when not derivable. Uses Decimal math: per-each = total/qty for
#       "each"/"count"; for "weight" prefers total/totalPickedWeight (early
#       return), else falls back to label size: total / (qty*pack*size) with
#       lb<->oz converted via the factor 16.
#   parse_item(order_id, order_date, raw_path, line_no, item)
#     - builds one output dict keyed exactly by OUTPUT_FIELDS; stringifies raw
#       payload fields (podId, primUpcCd, shipQy, lbEachCd, unitPrice,
#       groceryAmount, totalPickedWeight, savings fields, ...), records parse
#       notes ("store_brand_prefix=...", "fee_item", "size_without_unit"),
#       and stamps parse_version = PARSER_VERSION.
#   stringify(value)               - None -> "", else str(value).
#   iter_order_rows(raw_dir)       - for each sorted *.json (skipping
#     history.json), json-loads it and yields parse_item(...) per item,
#     line_no starting at 1.
#   build_items_enriched(raw_dir)  - materializes and sorts rows by
#     (order_date, order_id, int(line_no)).
#   write_csv(path, rows)          - mkdir -p the parent, then writes header +
#     rows with csv.DictWriter(fieldnames=OUTPUT_FIELDS).
#   main                           - click CLI: validates input dir exists,
#     builds rows, writes CSV, echoes a summary line.
import csv import json import re from decimal import Decimal, InvalidOperation, ROUND_HALF_UP from pathlib import Path import click PARSER_VERSION = "giant-enrich-v1" RETAILER = "giant" DEFAULT_INPUT_DIR = Path("giant_output/raw") DEFAULT_OUTPUT_CSV = Path("giant_output/items_enriched.csv") OUTPUT_FIELDS = [ "retailer", "order_id", "line_no", "observed_item_key", "order_date", "pod_id", "item_name", "upc", "category_id", "category", "qty", "unit", "unit_price", "line_total", "picked_weight", "mvp_savings", "reward_savings", "coupon_savings", "coupon_price", "image_url", "raw_order_path", "item_name_norm", "brand_guess", "variant", "size_value", "size_unit", "pack_qty", "measure_type", "is_store_brand", "is_fee", "price_per_each", "price_per_lb", "price_per_oz", "parse_version", "parse_notes", ] STORE_BRAND_PREFIXES = { "SB": "SB", "NP": "NP", } ABBREVIATIONS = { "APPLE": "APPLE", "APPLES": "APPLES", "APLE": "APPLE", "BASIL": "BASIL", "BLK": "BLACK", "BNLS": "BONELESS", "BRWN": "BROWN", "CARROTS": "CARROTS", "CHDR": "CHEDDAR", "CHICKEN": "CHICKEN", "CHOC": "CHOCOLATE", "CHS": "CHEESE", "CHSE": "CHEESE", "CHZ": "CHEESE", "CILANTRO": "CILANTRO", "CKI": "COOKIE", "CRSHD": "CRUSHED", "FLR": "FLOUR", "FRSH": "FRESH", "GALA": "GALA", "GRAHM": "GRAHAM", "HOT": "HOT", "HRSRDSH": "HORSERADISH", "IMP": "IMPORTED", "IQF": "IQF", "LENTILS": "LENTILS", "LG": "LARGE", "MLK": "MILK", "MSTRD": "MUSTARD", "ONION": "ONION", "ORG": "ORGANIC", "PEPPER": "PEPPER", "PEPPERS": "PEPPERS", "POT": "POTATO", "POTATO": "POTATO", "PPR": "PEPPER", "RICOTTA": "RICOTTA", "ROASTER": "ROASTER", "ROTINI": "ROTINI", "SCE": "SAUCE", "SLC": "SLICED", "SPINCH": "SPINACH", "SPNC": "SPINACH", "SPINACH": "SPINACH", "SQZ": "SQUEEZE", "SWT": "SWEET", "THYME": "THYME", "TOM": "TOMATO", "TOMS": "TOMATOES", "TRTL": "TORTILLA", "VEG": "VEGETABLE", "VINEGAR": "VINEGAR", "WHT": "WHITE", "WHOLE": "WHOLE", "YLW": "YELLOW", "YLWGLD": "YELLOW_GOLD", } FEE_PATTERNS = [ re.compile(r"\bBAG CHARGE\b"), re.compile(r"\bDISC 
AT TOTAL\b"), ] SIZE_RE = re.compile(r"(? 0 and unit != "EA"): return "weight" if size_unit in {"lb", "oz"}: return "weight" if size_unit in {"ml", "l", "qt", "pt", "gal", "fl_oz"}: return "volume" if pack_qty: return "count" if unit == "EA" or (qty is not None and qty > 0): return "each" return "" def is_fee_item(cleaned_name): return any(pattern.search(cleaned_name) for pattern in FEE_PATTERNS) def derive_prices(item, measure_type, size_value="", size_unit="", pack_qty=""): qty = to_decimal(item.get("shipQy")) line_total = to_decimal(item.get("groceryAmount")) picked_weight = to_decimal(item.get("totalPickedWeight")) parsed_size = to_decimal(size_value) parsed_pack = to_decimal(pack_qty) or Decimal("1") price_per_each = "" price_per_lb = "" price_per_oz = "" if line_total is None: return price_per_each, price_per_lb, price_per_oz if measure_type == "each" and qty not in (None, Decimal("0")): price_per_each = format_decimal(line_total / qty) if measure_type == "count" and qty not in (None, Decimal("0")): price_per_each = format_decimal(line_total / qty) if measure_type == "weight" and picked_weight not in (None, Decimal("0")): per_lb = line_total / picked_weight price_per_lb = format_decimal(per_lb) price_per_oz = format_decimal(per_lb / Decimal("16")) return price_per_each, price_per_lb, price_per_oz if measure_type == "weight" and parsed_size not in (None, Decimal("0")) and qty not in (None, Decimal("0")): total_units = qty * parsed_pack * parsed_size if size_unit == "lb": per_lb = line_total / total_units price_per_lb = format_decimal(per_lb) price_per_oz = format_decimal(per_lb / Decimal("16")) elif size_unit == "oz": per_oz = line_total / total_units price_per_oz = format_decimal(per_oz) price_per_lb = format_decimal(per_oz * Decimal("16")) return price_per_each, price_per_lb, price_per_oz def parse_item(order_id, order_date, raw_path, line_no, item): cleaned_name = clean_item_name(item.get("itemName", "")) size_value, size_unit, pack_qty = 
parse_size_and_pack(cleaned_name) prefix, brand_guess = extract_store_brand_prefix(cleaned_name) normalized_name = normalize_item_name(cleaned_name) measure_type = guess_measure_type(item, size_unit, pack_qty) price_per_each, price_per_lb, price_per_oz = derive_prices( item, measure_type, size_value=size_value, size_unit=size_unit, pack_qty=pack_qty, ) is_fee = is_fee_item(cleaned_name) parse_notes = [] if prefix: parse_notes.append(f"store_brand_prefix={prefix}") if is_fee: parse_notes.append("fee_item") if size_value and not size_unit: parse_notes.append("size_without_unit") return { "retailer": RETAILER, "order_id": str(order_id), "line_no": str(line_no), "observed_item_key": f"{RETAILER}:{order_id}:{line_no}", "order_date": normalize_whitespace(order_date), "pod_id": stringify(item.get("podId")), "item_name": stringify(item.get("itemName")), "upc": stringify(item.get("primUpcCd")), "category_id": stringify(item.get("categoryId")), "category": stringify(item.get("categoryDesc")), "qty": stringify(item.get("shipQy")), "unit": stringify(item.get("lbEachCd")), "unit_price": stringify(item.get("unitPrice")), "line_total": stringify(item.get("groceryAmount")), "picked_weight": stringify(item.get("totalPickedWeight")), "mvp_savings": stringify(item.get("mvpSavings")), "reward_savings": stringify(item.get("rewardSavings")), "coupon_savings": stringify(item.get("couponSavings")), "coupon_price": stringify(item.get("couponPrice")), "image_url": extract_image_url(item), "raw_order_path": raw_path.as_posix(), "item_name_norm": normalized_name, "brand_guess": brand_guess, "variant": "", "size_value": size_value, "size_unit": size_unit, "pack_qty": pack_qty, "measure_type": measure_type, "is_store_brand": "true" if bool(prefix) else "false", "is_fee": "true" if is_fee else "false", "price_per_each": price_per_each, "price_per_lb": price_per_lb, "price_per_oz": price_per_oz, "parse_version": PARSER_VERSION, "parse_notes": ";".join(parse_notes), } def stringify(value): if 
value is None: return "" return str(value) def iter_order_rows(raw_dir): for path in sorted(raw_dir.glob("*.json")): if path.name == "history.json": continue payload = json.loads(path.read_text(encoding="utf-8")) order_id = payload.get("orderId", path.stem) order_date = payload.get("orderDate", "") for line_no, item in enumerate(payload.get("items", []), start=1): yield parse_item(order_id, order_date, path, line_no, item) def build_items_enriched(raw_dir): rows = list(iter_order_rows(raw_dir)) rows.sort(key=lambda row: (row["order_date"], row["order_id"], int(row["line_no"]))) return rows def write_csv(path, rows): path.parent.mkdir(parents=True, exist_ok=True) with path.open("w", newline="", encoding="utf-8") as handle: writer = csv.DictWriter(handle, fieldnames=OUTPUT_FIELDS) writer.writeheader() writer.writerows(rows) @click.command() @click.option( "--input-dir", default=str(DEFAULT_INPUT_DIR), show_default=True, help="Directory containing Giant raw order json files.", ) @click.option( "--output-csv", default=str(DEFAULT_OUTPUT_CSV), show_default=True, help="CSV path for enriched Giant item rows.", ) def main(input_dir, output_csv): raw_dir = Path(input_dir) output_path = Path(output_csv) if not raw_dir.exists(): raise click.ClickException(f"input dir does not exist: {raw_dir}") rows = build_items_enriched(raw_dir) write_csv(output_path, rows) click.echo(f"wrote {len(rows)} rows to {output_path}")  # end of CLI; entry-point guard follows
if __name__ == "__main__": main()