Document visit-level purchase analysis

This commit is contained in:
ben
2026-03-24 08:29:26 -04:00
parent de8ff535b8
commit 6940f165fb
2 changed files with 106 additions and 0 deletions

View File

@@ -129,6 +129,19 @@ Combined:
- `data/review/pipeline_status.json`
- `data/catalog.csv`
`data/review/purchases.csv` is the main analysis artifact. It is designed to support both:
- item-level price analysis
- visit-level analysis such as spend by visit, items per visit, category spend by visit, and retailer/store breakdown
The visit fields are carried directly in `purchases.csv`, so you can pivot on them without extra joins:
- `order_id`
- `purchase_date`
- `retailer`
- `store_name`
- `store_number`
- `store_city`
- `store_state`
## Review Workflow
Run `review_products.py` to clean up unresolved or weakly unified items:

View File

@@ -167,6 +167,11 @@ class PurchaseLogTests(unittest.TestCase):
self.assertEqual("1", rows[0]["normalized_quantity"])
self.assertEqual("lb", rows[0]["normalized_quantity_unit"])
self.assertEqual("lb", rows[0]["effective_price_unit"])
self.assertEqual("g1", rows[0]["order_id"])
self.assertEqual("Giant", rows[0]["store_name"])
self.assertEqual("42", rows[0]["store_number"])
self.assertEqual("Springfield", rows[0]["store_city"])
self.assertEqual("VA", rows[0]["store_state"])
def test_main_writes_purchase_and_example_csvs(self):
with tempfile.TemporaryDirectory() as tmpdir:
@@ -624,6 +629,94 @@ class PurchaseLogTests(unittest.TestCase):
self.assertEqual("", rows[0]["effective_price"])
self.assertEqual("", rows[0]["effective_price_unit"])
def test_purchase_rows_support_visit_level_grouping_without_extra_joins(self):
    """Check that purchase rows alone are enough to group lines into a visit.

    Two item lines sharing retailer ``giant`` and order ``g1`` are passed to
    ``build_purchases.build_purchase_rows`` together with one record that
    carries the store fields for ``g1``. Every resulting row must produce the
    same (retailer, order, date, store) tuple, and the rows'
    ``net_line_total`` values must add up to 5.5.
    """
    output_fields = enrich_costco.OUTPUT_FIELDS

    def make_line(line_no, row_id, item_id, item_name, line_total):
        # Blank out every output field first so the fixture always carries
        # the full column set, then fill in the values this test relies on.
        line = dict.fromkeys(output_fields, "")
        line.update(
            {
                "retailer": "giant",
                "order_id": "g1",
                "line_no": line_no,
                "normalized_row_id": row_id,
                "normalized_item_id": item_id,
                "order_date": "2026-03-01",
                "item_name": item_name,
                "item_name_norm": item_name,
                "qty": "1",
                "unit": "EA",
                "normalized_quantity": "1",
                "normalized_quantity_unit": "each",
                "line_total": line_total,
                "measure_type": "each",
                "raw_order_path": "data/giant-web/raw/g1.json",
                "is_discount_line": "false",
                "is_coupon_line": "false",
                "is_fee": "false",
            }
        )
        return line

    first_line = make_line(
        line_no="1",
        row_id="giant:g1:1",
        item_id="gnorm:first",
        item_name="FIRST ITEM",
        line_total="3.50",
    )
    second_line = make_line(
        line_no="2",
        row_id="giant:g1:2",
        item_id="gnorm:second",
        item_name="SECOND ITEM",
        line_total="2.00",
    )

    # NOTE(review): presumably the third positional argument is the list of
    # order-level records that supply the store fields; confirm against the
    # signature of build_purchase_rows.
    visit_record = {
        "order_id": "g1",
        "store_name": "Giant",
        "store_number": "42",
        "store_city": "Springfield",
        "store_state": "VA",
    }

    rows, _links = build_purchases.build_purchase_rows(
        [first_line, second_line],
        [],
        [visit_record],
        [],
        [],
        [],
        [],
    )

    # Columns that together identify one store visit.
    visit_fields = (
        "retailer",
        "order_id",
        "purchase_date",
        "store_name",
        "store_number",
        "store_city",
        "store_state",
    )
    distinct_visits = set()
    for row in rows:
        distinct_visits.add(tuple(row[field] for field in visit_fields))

    visit_total = 0
    for row in rows:
        visit_total += float(row["net_line_total"])

    # Both lines belong to the same visit, and 3.50 + 2.00 is spent on it.
    self.assertEqual(1, len(distinct_visits))
    self.assertEqual(5.5, visit_total)
# Run the unittest test runner when this module is executed directly as a script.
if __name__ == "__main__":
unittest.main()