Fix normalized quantity basis
This commit is contained in:
@@ -29,6 +29,8 @@ PURCHASE_FIELDS = [
|
||||
"upc",
|
||||
"qty",
|
||||
"unit",
|
||||
"normalized_quantity",
|
||||
"normalized_quantity_unit",
|
||||
"pack_qty",
|
||||
"size_value",
|
||||
"size_unit",
|
||||
@@ -337,6 +339,8 @@ def build_purchase_rows(
|
||||
"upc": row["upc"],
|
||||
"qty": row["qty"],
|
||||
"unit": row["unit"],
|
||||
"normalized_quantity": row.get("normalized_quantity", ""),
|
||||
"normalized_quantity_unit": row.get("normalized_quantity_unit", ""),
|
||||
"pack_qty": row["pack_qty"],
|
||||
"size_value": row["size_value"],
|
||||
"size_unit": row["size_unit"],
|
||||
|
||||
@@ -37,7 +37,9 @@ DUAL_WEIGHT_RE = re.compile(
|
||||
LOGISTICS_SLASH_RE = re.compile(r"\b(?:T\d+/H\d+(?:/P\d+)?/?|H\d+/P\d+/?|T\d+/H\d+/?)\b")
|
||||
PACK_DASH_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*-\s*PACK\b")
|
||||
PACK_WORD_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*PACK\b")
|
||||
# Matches a numeric size immediately followed by a unit token, e.g. "1.74QTS",
# "7.5 OZ" or "16.9 FL OZ". Group 1 is the number, group 2 the raw unit text.
# The lookbehind stops the number from starting in the middle of an
# alphanumeric token (so "A12OZ" does not match), and the trailing \b makes the
# alternation backtrack to the longest whole unit ("GAL" rather than "G").
SIZE_RE = re.compile(
    r"(?<![A-Z0-9])(\d+(?:\.\d+)?)\s*(OZ|LB|LBS|CT|KG|G|QT|QTS|PT|PTS|GAL|GALS|FL OZ|FLOZ)\b"
)
|
||||
DISCOUNT_TARGET_RE = re.compile(r"^/\s*(\d+)\b")
|
||||
|
||||
|
||||
@@ -192,6 +194,7 @@ def parse_costco_item(order_id, order_date, raw_path, line_no, item):
|
||||
)
|
||||
normalized_row_id = f"{RETAILER}:{order_id}:{line_no}"
|
||||
normalized_quantity, normalized_quantity_unit = derive_normalized_quantity(
|
||||
item.get("unit"),
|
||||
size_value,
|
||||
size_unit,
|
||||
pack_qty,
|
||||
|
||||
@@ -224,13 +224,17 @@ def normalize_unit(unit):
|
||||
"OZ": "oz",
|
||||
"FZ": "fl_oz",
|
||||
"FL OZ": "fl_oz",
|
||||
"FLOZ": "fl_oz",
|
||||
"LB": "lb",
|
||||
"LBS": "lb",
|
||||
"ML": "ml",
|
||||
"L": "l",
|
||||
"QT": "qt",
|
||||
"QTS": "qt",
|
||||
"PT": "pt",
|
||||
"PTS": "pt",
|
||||
"GAL": "gal",
|
||||
"GALS": "gal",
|
||||
"GA": "gal",
|
||||
}.get(collapsed, collapsed.lower())
|
||||
|
||||
@@ -340,16 +344,24 @@ def derive_prices(item, measure_type, size_value="", size_unit="", pack_qty=""):
|
||||
return price_per_each, price_per_lb, price_per_oz
|
||||
|
||||
|
||||
def derive_normalized_quantity(qty, size_value, size_unit, pack_qty, measure_type):
    """Derive the total purchased quantity and its unit for one line item.

    The total is based on the purchased ``qty``; a missing or zero
    ``pack_qty`` is treated as 1. No unit conversion is performed: the
    returned unit is ``size_unit`` exactly as passed in, ``"count"`` or
    ``"each"``.

    Args:
        qty: Number of units purchased on the line.
        size_value: Size of a single unit (for example ``"1.74"``).
        size_unit: Unit that ``size_value`` is expressed in (for example ``"qt"``).
        pack_qty: Number of items per pack.
        measure_type: Measure basis of the item; only ``"count"`` and
            ``"each"`` are inspected here.

    Returns:
        A ``(quantity, unit)`` tuple as produced by ``format_decimal``, chosen
        in this order:

        1. ``qty * pack_qty * size_value`` with ``size_unit`` when qty, size
           and size unit are all usable;
        2. ``qty * pack_qty`` with ``"count"`` when ``measure_type`` is
           ``"count"``;
        3. ``qty`` with ``"each"`` when ``measure_type`` is ``"each"``;
        4. ``("", "")`` otherwise.
    """
    # NOTE(review): to_decimal presumably returns None for blank or
    # unparseable input -- confirm against its definition.
    parsed_qty = to_decimal(qty)
    parsed_size = to_decimal(size_value)
    parsed_pack = to_decimal(pack_qty)

    # Without a usable qty there is no multiplier, so the size and count
    # branches below are skipped entirely.
    total_multiplier = None
    if parsed_qty not in (None, Decimal("0")):
        total_multiplier = parsed_qty * (parsed_pack or Decimal("1"))

    if (
        parsed_size not in (None, Decimal("0"))
        and size_unit
        and total_multiplier not in (None, Decimal("0"))
    ):
        return format_decimal(parsed_size * total_multiplier), size_unit
    if measure_type == "count" and total_multiplier not in (None, Decimal("0")):
        return format_decimal(total_multiplier), "count"
    if measure_type == "each" and parsed_qty not in (None, Decimal("0")):
        return format_decimal(parsed_qty), "each"
    return "", ""
|
||||
|
||||
|
||||
@@ -424,6 +436,7 @@ def parse_item(order_id, order_date, raw_path, line_no, item):
|
||||
|
||||
normalized_row_id = f"{RETAILER}:{order_id}:{line_no}"
|
||||
normalized_quantity, normalized_quantity_unit = derive_normalized_quantity(
|
||||
item.get("shipQy"),
|
||||
size_value,
|
||||
size_unit,
|
||||
pack_qty,
|
||||
|
||||
@@ -344,3 +344,9 @@ Notes:
|
||||
- review/link decisions should apply at the `normalized_item_id` level, then fan out to all purchase rows sharing that id.
|
||||
|
||||
* /
|
||||
Normalized quantity is deterministic and conservative:
|
||||
- if `qty`, `size_value`, and `size_unit` are available, use `qty * pack_qty * size_value` with `size_unit` (a missing `pack_qty` counts as 1)
|
||||
- else if count basis is explicit, use `qty * pack_qty` with unit `count`
|
||||
- else if `measure_type` is `each`, use `qty` with unit `each`
|
||||
- else leave both fields blank
|
||||
- no hidden unit conversion is applied inside normalization; values stay in their parsed units such as `oz`, `lb`, `qt`, or `count`
|
||||
|
||||
@@ -264,6 +264,26 @@ class CostcoPipelineTests(unittest.TestCase):
|
||||
self.assertEqual("6", row["normalized_quantity"])
|
||||
self.assertEqual("count", row["normalized_quantity_unit"])
|
||||
|
||||
volume_row = enrich_costco.parse_costco_item(
|
||||
order_id="abc",
|
||||
order_date="2026-03-12",
|
||||
raw_path=Path("costco_output/raw/abc.json"),
|
||||
line_no=3,
|
||||
item={
|
||||
"itemNumber": "1185912",
|
||||
"itemDescription01": "KS ALMND BAR US 1.74QTS CN",
|
||||
"itemDescription02": None,
|
||||
"itemDepartmentNumber": 18,
|
||||
"transDepartmentNumber": 18,
|
||||
"unit": 2,
|
||||
"itemIdentifier": "E",
|
||||
"amount": 21.98,
|
||||
"itemUnitPriceAmount": 10.99,
|
||||
},
|
||||
)
|
||||
self.assertEqual("3.48", volume_row["normalized_quantity"])
|
||||
self.assertEqual("qt", volume_row["normalized_quantity_unit"])
|
||||
|
||||
discount = enrich_costco.parse_costco_item(
|
||||
order_id="abc",
|
||||
order_date="2026-03-12",
|
||||
|
||||
@@ -111,9 +111,25 @@ class EnrichGiantTests(unittest.TestCase):
|
||||
self.assertEqual("weight", row["measure_type"])
|
||||
self.assertEqual("6", row["pack_qty"])
|
||||
self.assertEqual("7.5", row["size_value"])
|
||||
self.assertEqual("90", row["normalized_quantity"])
|
||||
self.assertEqual("oz", row["normalized_quantity_unit"])
|
||||
self.assertEqual("0.0667", row["price_per_oz"])
|
||||
self.assertEqual("1.0667", row["price_per_lb"])
|
||||
|
||||
def test_derive_normalized_quantity_handles_count_volume_and_each(self):
    """Exercise the count, volume, and each outcomes of derive_normalized_quantity."""
    # Each scenario pairs the expected (quantity, unit) result with the
    # positional arguments: qty, size_value, size_unit, pack_qty, measure_type.
    scenarios = [
        (("18", "count"), ("1", "", "", "18", "count")),
        (("3.48", "qt"), ("2", "1.74", "qt", "", "volume")),
        (("2", "each"), ("2", "", "", "", "each")),
    ]
    for expected, call_args in scenarios:
        self.assertEqual(
            expected,
            enrich_giant.derive_normalized_quantity(*call_args),
        )
|
||||
|
||||
def test_build_items_enriched_reads_raw_order_files_and_writes_csv(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
raw_dir = Path(tmpdir) / "raw"
|
||||
|
||||
@@ -47,6 +47,8 @@ class PurchaseLogTests(unittest.TestCase):
|
||||
"upc": "4011",
|
||||
"qty": "1",
|
||||
"unit": "LB",
|
||||
"normalized_quantity": "1",
|
||||
"normalized_quantity_unit": "lb",
|
||||
"line_total": "1.29",
|
||||
"unit_price": "1.29",
|
||||
"measure_type": "weight",
|
||||
@@ -71,6 +73,8 @@ class PurchaseLogTests(unittest.TestCase):
|
||||
"retailer_item_id": "30669",
|
||||
"qty": "1",
|
||||
"unit": "E",
|
||||
"normalized_quantity": "3",
|
||||
"normalized_quantity_unit": "lb",
|
||||
"line_total": "2.98",
|
||||
"unit_price": "2.98",
|
||||
"size_value": "3",
|
||||
@@ -155,6 +159,8 @@ class PurchaseLogTests(unittest.TestCase):
|
||||
self.assertTrue(all(row["catalog_id"] == "cat_banana" for row in rows))
|
||||
self.assertEqual({"giant", "costco"}, {row["retailer"] for row in rows})
|
||||
self.assertEqual("https://example.test/banana.jpg", rows[0]["image_url"])
|
||||
self.assertEqual("1", rows[0]["normalized_quantity"])
|
||||
self.assertEqual("lb", rows[0]["normalized_quantity_unit"])
|
||||
|
||||
def test_main_writes_purchase_and_example_csvs(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
@@ -184,6 +190,8 @@ class PurchaseLogTests(unittest.TestCase):
|
||||
"upc": "4011",
|
||||
"qty": "1",
|
||||
"unit": "LB",
|
||||
"normalized_quantity": "1",
|
||||
"normalized_quantity_unit": "lb",
|
||||
"line_total": "1.29",
|
||||
"unit_price": "1.29",
|
||||
"measure_type": "weight",
|
||||
@@ -208,6 +216,8 @@ class PurchaseLogTests(unittest.TestCase):
|
||||
"retailer_item_id": "30669",
|
||||
"qty": "1",
|
||||
"unit": "E",
|
||||
"normalized_quantity": "3",
|
||||
"normalized_quantity_unit": "lb",
|
||||
"line_total": "2.98",
|
||||
"unit_price": "2.98",
|
||||
"size_value": "3",
|
||||
@@ -346,6 +356,8 @@ class PurchaseLogTests(unittest.TestCase):
|
||||
"upc": "",
|
||||
"qty": "1",
|
||||
"unit": "EA",
|
||||
"normalized_quantity": "1",
|
||||
"normalized_quantity_unit": "each",
|
||||
"line_total": "3.50",
|
||||
"unit_price": "3.50",
|
||||
"measure_type": "each",
|
||||
@@ -403,6 +415,8 @@ class PurchaseLogTests(unittest.TestCase):
|
||||
self.assertEqual("approved", rows[0]["review_status"])
|
||||
self.assertEqual("create", rows[0]["resolution_action"])
|
||||
self.assertEqual("cat_ice", links[0]["catalog_id"])
|
||||
self.assertEqual("1", rows[0]["normalized_quantity"])
|
||||
self.assertEqual("each", rows[0]["normalized_quantity_unit"])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user