From ac82fa64fbe69ca6d18170c4e8882aa58d906fb4 Mon Sep 17 00:00:00 2001
From: ben
Date: Mon, 16 Mar 2026 11:39:45 -0400
Subject: [PATCH] Fix Costco receipt enumeration windows

---
 scrape_costco.py              | 140 ++++++++++++++++++++++++++--
 tests/test_costco_pipeline.py | 170 ++++++++++++++++++++++++++++++++++
 2 files changed, 300 insertions(+), 10 deletions(-)

diff --git a/scrape_costco.py b/scrape_costco.py
index fc16a1f..0c36d9b 100644
--- a/scrape_costco.py
+++ b/scrape_costco.py
@@ -1,6 +1,7 @@
 import csv
 import json
 import os
+from datetime import datetime, timedelta
 from pathlib import Path
 
 import click
@@ -260,6 +261,120 @@ def detail_receipts(payload):
     return payload.get("data", {}).get("receiptsWithCounts", {}).get("receipts", [])
 
 
+def summary_counts(payload):
+    counts = payload.get("data", {}).get("receiptsWithCounts", {})
+    return {
+        "inWarehouse": counts.get("inWarehouse", 0) or 0,
+        "gasStation": counts.get("gasStation", 0) or 0,
+        "carWash": counts.get("carWash", 0) or 0,
+        "gasAndCarWash": counts.get("gasAndCarWash", 0) or 0,
+    }
+
+
+def parse_cli_date(value):
+    return datetime.strptime(value, "%m/%d/%Y").date()
+
+
+def format_cli_date(value):
+    return f"{value.month}/{value.day:02d}/{value.year}"
+
+
+def build_date_windows(start_date, end_date, window_days):
+    start = parse_cli_date(start_date)
+    end = parse_cli_date(end_date)
+    if end < start:
+        raise click.ClickException("end-date must be on or after start-date")
+    if window_days < 1:
+        raise click.ClickException("window-days must be at least 1")
+
+    windows = []
+    current = start
+    while current <= end:
+        window_end = min(current + timedelta(days=window_days - 1), end)
+        windows.append(
+            {
+                "startDate": format_cli_date(current),
+                "endDate": format_cli_date(window_end),
+            }
+        )
+        current = window_end + timedelta(days=1)
+    return windows
+
+
+def unique_receipts(receipts):
+    by_barcode = {}
+    for receipt in receipts:
+        barcode = receipt.get("transactionBarcode")
+        if barcode:
+            by_barcode[barcode] = receipt
+    return list(by_barcode.values())
+
+
+def fetch_summary_windows(
+    session,
+    start_date,
+    end_date,
+    document_type,
+    document_sub_type,
+    window_days,
+):
+    requests_metadata = []
+    combined_receipts = []
+
+    for window in build_date_windows(start_date, end_date, window_days):
+        variables = {
+            "startDate": window["startDate"],
+            "endDate": window["endDate"],
+            "text": "custom",
+            "documentType": document_type,
+            "documentSubType": document_sub_type,
+        }
+        payload = graphql_post(session, SUMMARY_QUERY, variables)
+        receipts = summary_receipts(payload)
+        counts = summary_counts(payload)
+        warehouse_count = sum(
+            1 for receipt in receipts if receipt.get("receiptType") == "In-Warehouse"
+        )
+        mismatch = counts["inWarehouse"] != warehouse_count
+        requests_metadata.append(
+            {
+                **variables,
+                "returnedReceipts": len(receipts),
+                "returnedInWarehouseReceipts": warehouse_count,
+                "inWarehouse": counts["inWarehouse"],
+                "gasStation": counts["gasStation"],
+                "carWash": counts["carWash"],
+                "gasAndCarWash": counts["gasAndCarWash"],
+                "countMismatch": mismatch,
+            }
+        )
+        if mismatch:
+            click.echo(
+                (
+                    "warning: summary count mismatch for "
+                    f"{window['startDate']} to {window['endDate']}: "
+                    f"inWarehouse={counts['inWarehouse']} "
+                    f"returnedInWarehouseReceipts={warehouse_count}"
+                ),
+                err=True,
+            )
+        combined_receipts.extend(receipts)
+
+    unique = unique_receipts(combined_receipts)
+    aggregate_payload = {
+        "data": {
+            "receiptsWithCounts": {
+                "inWarehouse": sum(row["inWarehouse"] for row in requests_metadata),
+                "gasStation": sum(row["gasStation"] for row in requests_metadata),
+                "carWash": sum(row["carWash"] for row in requests_metadata),
+                "gasAndCarWash": sum(row["gasAndCarWash"] for row in requests_metadata),
+                "receipts": unique,
+            }
+        }
+    }
+    return aggregate_payload, requests_metadata
+
+
 def flatten_costco_data(summary_payload, detail_payloads, raw_dir):
     summary_lookup = {
         receipt["transactionBarcode"]: receipt
@@ -415,7 +530,14 @@ def write_csv(path, rows, fieldnames):
     show_default=True,
     help="Summary document sub type.",
 )
+@click.option(
+    "--window-days",
+    default=92,
+    show_default=True,
+    type=int,
+    help="Maximum number of days to request per summary window.",
+)
-def main(start_date, end_date, outdir, document_type, document_sub_type):
+def main(start_date, end_date, outdir, document_type, document_sub_type, window_days):
     config = load_config()
     required = ["authorization", "client_id", "client_identifier"]
     missing = [key for key in required if not config[key]]
@@ -428,18 +550,16 @@ def main(start_date, end_date, outdir, document_type, document_sub_type):
     raw_dir = outdir / "raw"
 
     session = build_session(config)
-    summary_payload = graphql_post(
+    summary_payload, request_metadata = fetch_summary_windows(
         session,
-        SUMMARY_QUERY,
-        {
-            "startDate": start_date,
-            "endDate": end_date,
-            "text": "custom",
-            "documentType": document_type,
-            "documentSubType": document_sub_type,
-        },
+        start_date,
+        end_date,
+        document_type,
+        document_sub_type,
+        window_days,
     )
     write_json(raw_dir / "summary.json", summary_payload)
+    write_json(raw_dir / "summary_requests.json", request_metadata)
 
     receipts = summary_receipts(summary_payload)
     detail_payloads = []
diff --git a/tests/test_costco_pipeline.py b/tests/test_costco_pipeline.py
index ade5321..60b0738 100644
--- a/tests/test_costco_pipeline.py
+++ b/tests/test_costco_pipeline.py
@@ -3,6 +3,7 @@ import json
 import tempfile
 import unittest
 from pathlib import Path
+from unittest import mock
 
 import enrich_costco
 import scrape_costco
@@ -10,6 +11,82 @@ import validate_cross_retailer_flow
 
 
 class CostcoPipelineTests(unittest.TestCase):
+    def test_build_date_windows_splits_long_ranges(self):
+        windows = scrape_costco.build_date_windows("1/01/2026", "6/30/2026", 92)
+
+        self.assertEqual(
+            [
+                {"startDate": "1/01/2026", "endDate": "4/02/2026"},
+                {"startDate": "4/03/2026", "endDate": "6/30/2026"},
+            ],
+            windows,
+        )
+
+    def test_fetch_summary_windows_records_metadata_and_warns_on_mismatch(self):
+        payloads = [
+            {
+                "data": {
+                    "receiptsWithCounts": {
+                        "inWarehouse": 2,
+                        "gasStation": 0,
+                        "carWash": 0,
+                        "gasAndCarWash": 0,
+                        "receipts": [
+                            {
+                                "transactionBarcode": "abc",
+                                "receiptType": "In-Warehouse",
+                            }
+                        ],
+                    }
+                }
+            },
+            {
+                "data": {
+                    "receiptsWithCounts": {
+                        "inWarehouse": 1,
+                        "gasStation": 0,
+                        "carWash": 0,
+                        "gasAndCarWash": 0,
+                        "receipts": [
+                            {
+                                "transactionBarcode": "def",
+                                "receiptType": "In-Warehouse",
+                            }
+                        ],
+                    }
+                }
+            },
+        ]
+
+        with mock.patch.object(
+            scrape_costco, "graphql_post", side_effect=payloads
+        ) as mocked_post, mock.patch.object(scrape_costco.click, "echo") as mocked_echo:
+            summary_payload, metadata = scrape_costco.fetch_summary_windows(
+                session=object(),
+                start_date="1/01/2026",
+                end_date="6/30/2026",
+                document_type="all",
+                document_sub_type="all",
+                window_days=92,
+            )
+
+        self.assertEqual(2, mocked_post.call_count)
+        self.assertEqual(2, len(metadata))
+        self.assertTrue(metadata[0]["countMismatch"])
+        self.assertFalse(metadata[1]["countMismatch"])
+        self.assertEqual("1/01/2026", metadata[0]["startDate"])
+        self.assertEqual("4/03/2026", metadata[1]["startDate"])
+        self.assertEqual(
+            ["abc", "def"],
+            [
+                row["transactionBarcode"]
+                for row in scrape_costco.summary_receipts(summary_payload)
+            ],
+        )
+        mocked_echo.assert_called_once()
+        warning_text = mocked_echo.call_args.args[0]
+        self.assertIn("warning: summary count mismatch", warning_text)
+
     def test_flatten_costco_data_preserves_discount_rows(self):
         summary_payload = {
             "data": {
@@ -196,6 +273,99 @@ class CostcoPipelineTests(unittest.TestCase):
         self.assertEqual(1, len(rows))
         self.assertEqual("banana", rows[0]["proof_name"])
 
+    def test_main_writes_summary_request_metadata(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            outdir = Path(tmpdir) / "costco_output"
+            summary_payload = {
+                "data": {
+                    "receiptsWithCounts": {
+                        "inWarehouse": 1,
+                        "gasStation": 0,
+                        "carWash": 0,
+                        "gasAndCarWash": 0,
+                        "receipts": [
+                            {
+                                "transactionBarcode": "abc",
+                                "receiptType": "In-Warehouse",
+                                "tenderArray": [],
+                                "couponArray": [],
+                            }
+                        ],
+                    }
+                }
+            }
+            detail_payload = {
+                "data": {
+                    "receiptsWithCounts": {
+                        "receipts": [
+                            {
+                                "transactionBarcode": "abc",
+                                "transactionDate": "2026-03-12",
+                                "receiptType": "In-Warehouse",
+                                "total": 10.0,
+                                "totalItemCount": 1,
+                                "instantSavings": 0,
+                                "warehouseName": "MT VERNON",
+                                "warehouseNumber": 1115,
+                                "warehouseAddress1": "7940 RICHMOND HWY",
+                                "warehouseCity": "ALEXANDRIA",
+                                "warehouseState": "VA",
+                                "warehousePostalCode": "22306",
+                                "itemArray": [],
+                            }
+                        ]
+                    }
+                }
+            }
+            metadata = [
+                {
+                    "startDate": "1/01/2026",
+                    "endDate": "3/31/2026",
+                    "text": "custom",
+                    "documentType": "all",
+                    "documentSubType": "all",
+                    "returnedReceipts": 1,
+                    "returnedInWarehouseReceipts": 1,
+                    "inWarehouse": 1,
+                    "gasStation": 0,
+                    "carWash": 0,
+                    "gasAndCarWash": 0,
+                    "countMismatch": False,
+                }
+            ]
+
+            with mock.patch.object(
+                scrape_costco, "load_config",
+                return_value={
+                    "authorization": "token",
+                    "client_id": "client",
+                    "client_identifier": "identifier",
+                },
+            ), mock.patch.object(
+                scrape_costco, "build_session", return_value=object()
+            ), mock.patch.object(
+                scrape_costco,
+                "fetch_summary_windows",
+                return_value=(summary_payload, metadata),
+            ), mock.patch.object(
+                scrape_costco,
+                "graphql_post",
+                return_value=detail_payload,
+            ):
+                scrape_costco.main.callback(
+                    start_date="1/01/2026",
+                    end_date="3/31/2026",
+                    outdir=str(outdir),
+                    document_type="all",
+                    document_sub_type="all",
+                    window_days=92,
+                )
+
+            metadata_path = outdir / "raw" / "summary_requests.json"
+            self.assertTrue(metadata_path.exists())
+            saved_metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
+            self.assertEqual(metadata, saved_metadata)
+
 
 if __name__ == "__main__":
     unittest.main()