Document visit-level purchase analysis

2026-03-24 08:29:26 -04:00
parent de8ff535b8
commit 6940f165fb
2 changed files with 106 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -129,6 +129,19 @@ Combined:
 - `data/review/pipeline_status.json`
 - `data/catalog.csv`

+`data/review/purchases.csv` is the main analysis artifact. It is designed to support both:
+- item-level price analysis
+- visit-level analysis such as spend by visit, items per visit, category spend by visit, and retailer/store breakdown
+
+The visit fields are carried directly in `purchases.csv`, so you can pivot on them without extra joins:
+- `order_id`
+- `purchase_date`
+- `retailer`
+- `store_name`
+- `store_number`
+- `store_city`
+- `store_state`
+
 ## Review Workflow

 Run `review_products.py` to cleanup unresolved or weakly unified items:
--- a/tests/test_purchases.py
+++ b/tests/test_purchases.py
@@ -167,6 +167,11 @@ class PurchaseLogTests(unittest.TestCase):
        self.assertEqual("1", rows[0]["normalized_quantity"])
        self.assertEqual("lb", rows[0]["normalized_quantity_unit"])
        self.assertEqual("lb", rows[0]["effective_price_unit"])
+        self.assertEqual("g1", rows[0]["order_id"])
+        self.assertEqual("Giant", rows[0]["store_name"])
+        self.assertEqual("42", rows[0]["store_number"])
+        self.assertEqual("Springfield", rows[0]["store_city"])
+        self.assertEqual("VA", rows[0]["store_state"])

    def test_main_writes_purchase_and_example_csvs(self):
        with tempfile.TemporaryDirectory() as tmpdir:
@@ -624,6 +629,94 @@ class PurchaseLogTests(unittest.TestCase):
        self.assertEqual("", rows[0]["effective_price"])
        self.assertEqual("", rows[0]["effective_price_unit"])

+    def test_purchase_rows_support_visit_level_grouping_without_extra_joins(self):
+        fieldnames = enrich_costco.OUTPUT_FIELDS  # column list used to build blank input rows
+
+        def base_row():
+            return {field: "" for field in fieldnames}  # every column present, all blank
+
+        row_one = base_row()
+        row_one.update(
+            {
+                "retailer": "giant",
+                "order_id": "g1",  # same order_id as row_two below
+                "line_no": "1",
+                "normalized_row_id": "giant:g1:1",
+                "normalized_item_id": "gnorm:first",
+                "order_date": "2026-03-01",
+                "item_name": "FIRST ITEM",
+                "item_name_norm": "FIRST ITEM",
+                "qty": "1",
+                "unit": "EA",
+                "normalized_quantity": "1",
+                "normalized_quantity_unit": "each",
+                "line_total": "3.50",
+                "measure_type": "each",
+                "raw_order_path": "data/giant-web/raw/g1.json",
+                "is_discount_line": "false",
+                "is_coupon_line": "false",
+                "is_fee": "false",
+            }
+        )
+        row_two = base_row()
+        row_two.update(
+            {
+                "retailer": "giant",
+                "order_id": "g1",  # same order_id as row_one above
+                "line_no": "2",
+                "normalized_row_id": "giant:g1:2",
+                "normalized_item_id": "gnorm:second",
+                "order_date": "2026-03-01",
+                "item_name": "SECOND ITEM",
+                "item_name_norm": "SECOND ITEM",
+                "qty": "1",
+                "unit": "EA",
+                "normalized_quantity": "1",
+                "normalized_quantity_unit": "each",
+                "line_total": "2.00",
+                "measure_type": "each",
+                "raw_order_path": "data/giant-web/raw/g1.json",
+                "is_discount_line": "false",
+                "is_coupon_line": "false",
+                "is_fee": "false",
+            }
+        )
+
+        rows, _links = build_purchases.build_purchase_rows(  # second return value is unused here
+            [row_one, row_two],
+            [],
+            [
+                {  # store fields for order g1; read back from the output rows below
+                    "order_id": "g1",
+                    "store_name": "Giant",
+                    "store_number": "42",
+                    "store_city": "Springfield",
+                    "store_state": "VA",
+                }
+            ],
+            [],
+            [],
+            [],
+            [],
+        )
+
+        visit_key = {  # set of distinct visit tuples across all output rows
+            (
+                row["retailer"],
+                row["order_id"],
+                row["purchase_date"],  # inputs supply order_date; output is read via purchase_date
+                row["store_name"],
+                row["store_number"],
+                row["store_city"],
+                row["store_state"],
+            )
+            for row in rows
+        }
+        visit_total = sum(float(row["net_line_total"]) for row in rows)  # float() raises on a blank value
+
+        self.assertEqual(1, len(visit_key))  # both lines must share one visit tuple
+        self.assertEqual(5.5, visit_total)  # 3.50 + 2.00, the two input line_total values
+

 if __name__ == "__main__":
    unittest.main()