From 6940f165fbc37062da64aeab9cba808575b167a9 Mon Sep 17 00:00:00 2001
From: ben
Date: Tue, 24 Mar 2026 08:29:26 -0400
Subject: [PATCH] Document visit-level purchase analysis

---
 README.md               | 13 ++++++
 tests/test_purchases.py | 94 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+)

diff --git a/README.md b/README.md
index 3c615a3..7b01541 100644
--- a/README.md
+++ b/README.md
@@ -129,6 +129,19 @@ Combined:
 - `data/review/pipeline_status.json`
 - `data/catalog.csv`
 
+`data/review/purchases.csv` is the main analysis artifact. It is designed to support both:
+- item-level price analysis
+- visit-level analysis such as spend by visit, items per visit, category spend by visit, and retailer/store breakdown
+
+The visit fields are carried directly in `purchases.csv`, so you can pivot on them without extra joins:
+- `order_id`
+- `purchase_date`
+- `retailer`
+- `store_name`
+- `store_number`
+- `store_city`
+- `store_state`
+
 ## Review Workflow
 
 Run `review_products.py` to cleanup unresolved or weakly unified items:
diff --git a/tests/test_purchases.py b/tests/test_purchases.py
index 951334d..41576fb 100644
--- a/tests/test_purchases.py
+++ b/tests/test_purchases.py
@@ -167,6 +167,11 @@ class PurchaseLogTests(unittest.TestCase):
         self.assertEqual("1", rows[0]["normalized_quantity"])
         self.assertEqual("lb", rows[0]["normalized_quantity_unit"])
         self.assertEqual("lb", rows[0]["effective_price_unit"])
+        self.assertEqual("g1", rows[0]["order_id"])
+        self.assertEqual("Giant", rows[0]["store_name"])
+        self.assertEqual("42", rows[0]["store_number"])
+        self.assertEqual("Springfield", rows[0]["store_city"])
+        self.assertEqual("VA", rows[0]["store_state"])
 
     def test_main_writes_purchase_and_example_csvs(self):
         with tempfile.TemporaryDirectory() as tmpdir:
@@ -624,6 +629,95 @@ class PurchaseLogTests(unittest.TestCase):
         self.assertEqual("", rows[0]["effective_price"])
         self.assertEqual("", rows[0]["effective_price_unit"])
 
+    def test_purchase_rows_support_visit_level_grouping_without_extra_joins(self):
+        """Rows from one order share visit fields and sum to the visit total."""
+        fieldnames = enrich_costco.OUTPUT_FIELDS
+
+        def base_row():
+            return {field: "" for field in fieldnames}
+
+        row_one = base_row()
+        row_one.update(
+            {
+                "retailer": "giant",
+                "order_id": "g1",
+                "line_no": "1",
+                "normalized_row_id": "giant:g1:1",
+                "normalized_item_id": "gnorm:first",
+                "order_date": "2026-03-01",
+                "item_name": "FIRST ITEM",
+                "item_name_norm": "FIRST ITEM",
+                "qty": "1",
+                "unit": "EA",
+                "normalized_quantity": "1",
+                "normalized_quantity_unit": "each",
+                "line_total": "3.50",
+                "measure_type": "each",
+                "raw_order_path": "data/giant-web/raw/g1.json",
+                "is_discount_line": "false",
+                "is_coupon_line": "false",
+                "is_fee": "false",
+            }
+        )
+        row_two = base_row()
+        row_two.update(
+            {
+                "retailer": "giant",
+                "order_id": "g1",
+                "line_no": "2",
+                "normalized_row_id": "giant:g1:2",
+                "normalized_item_id": "gnorm:second",
+                "order_date": "2026-03-01",
+                "item_name": "SECOND ITEM",
+                "item_name_norm": "SECOND ITEM",
+                "qty": "1",
+                "unit": "EA",
+                "normalized_quantity": "1",
+                "normalized_quantity_unit": "each",
+                "line_total": "2.00",
+                "measure_type": "each",
+                "raw_order_path": "data/giant-web/raw/g1.json",
+                "is_discount_line": "false",
+                "is_coupon_line": "false",
+                "is_fee": "false",
+            }
+        )
+
+        rows, _links = build_purchases.build_purchase_rows(
+            [row_one, row_two],
+            [],
+            [
+                {
+                    "order_id": "g1",
+                    "store_name": "Giant",
+                    "store_number": "42",
+                    "store_city": "Springfield",
+                    "store_state": "VA",
+                }
+            ],
+            [],
+            [],
+            [],
+            [],
+        )
+
+        visit_keys = {
+            (
+                row["retailer"],
+                row["order_id"],
+                row["purchase_date"],
+                row["store_name"],
+                row["store_number"],
+                row["store_city"],
+                row["store_state"],
+            )
+            for row in rows
+        }
+        visit_total = sum(float(row["net_line_total"]) for row in rows)
+
+        self.assertEqual(1, len(visit_keys))
+        self.assertAlmostEqual(5.5, visit_total)  # 3.50 + 2.00; tolerate float rounding
+
 
 if __name__ == "__main__":
     unittest.main()