Fix Costco receipt enumeration windows
This commit is contained in:
140
scrape_costco.py
140
scrape_costco.py
@@ -1,6 +1,7 @@
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
@@ -260,6 +261,120 @@ def detail_receipts(payload):
|
||||
return payload.get("data", {}).get("receiptsWithCounts", {}).get("receipts", [])
|
||||
|
||||
|
||||
def summary_counts(payload):
    """Extract the per-category receipt counts from a summary response.

    Reads ``payload["data"]["receiptsWithCounts"]`` and returns a dict with
    the keys ``inWarehouse``, ``gasStation``, ``carWash`` and
    ``gasAndCarWash``.  A count that is missing or null is reported as 0.

    A ``data`` or ``receiptsWithCounts`` entry that is missing *or null* is
    treated as an empty object, so the result is all zeros rather than an
    AttributeError from calling ``.get`` on ``None``.
    """
    # ``.get(key, {})`` only covers a missing key; ``or {}`` also covers an
    # explicit null value.
    data = payload.get("data") or {}
    counts = data.get("receiptsWithCounts") or {}
    count_keys = ("inWarehouse", "gasStation", "carWash", "gasAndCarWash")
    return {key: counts.get(key) or 0 for key in count_keys}
|
||||
|
||||
|
||||
def parse_cli_date(value):
    """Convert a ``%m/%d/%Y`` command-line date string into a ``date``.

    Raises ValueError (from ``datetime.strptime``) when *value* does not
    match that format.
    """
    parsed = datetime.strptime(value, "%m/%d/%Y")
    return parsed.date()
|
||||
|
||||
|
||||
def format_cli_date(value):
    """Render a date as ``M/DD/YYYY``.

    The month is written without padding; the day is zero-padded to two
    digits (for example ``1/01/2026``).
    """
    month, day, year = value.month, value.day, value.year
    return "{}/{:02d}/{}".format(month, day, year)
|
||||
|
||||
|
||||
def build_date_windows(start_date, end_date, window_days):
    """Split an inclusive date range into consecutive request windows.

    Args:
        start_date: First day of the range, as a string accepted by
            ``parse_cli_date``.
        end_date: Last day of the range (inclusive), same format.
        window_days: Maximum number of days covered by a single window.

    Returns:
        A list of ``{"startDate": ..., "endDate": ...}`` dicts whose values
        are produced by ``format_cli_date``.  Windows are in chronological
        order, do not overlap, and together cover every day from the start
        date through the end date.

    Raises:
        click.ClickException: If the end date precedes the start date, or
            ``window_days`` is less than 1.
    """
    start = parse_cli_date(start_date)
    end = parse_cli_date(end_date)
    if end < start:
        raise click.ClickException("end-date must be on or after start-date")
    if window_days < 1:
        raise click.ClickException("window-days must be at least 1")

    windows = []
    current = start
    while current <= end:
        # Clamp the span to the days that remain *before* adding it to the
        # date; otherwise a very large window_days overflows date arithmetic
        # even though the window would be truncated to ``end`` anyway.
        span_days = min(window_days - 1, (end - current).days)
        window_end = current + timedelta(days=span_days)
        windows.append(
            {
                "startDate": format_cli_date(current),
                "endDate": format_cli_date(window_end),
            }
        )
        if window_end >= end:
            # Stop here rather than stepping past ``end``: the step would
            # overflow when the range finishes on the last representable date.
            break
        current = window_end + timedelta(days=1)
    return windows
|
||||
|
||||
|
||||
def unique_receipts(receipts):
    """Deduplicate receipts by their ``transactionBarcode``.

    Receipts whose barcode is missing or falsy are dropped.  When a barcode
    repeats, the later receipt replaces the earlier one but keeps the
    earlier one's position in the returned list.
    """
    keyed = ((item.get("transactionBarcode"), item) for item in receipts)
    deduped = {code: item for code, item in keyed if code}
    return list(deduped.values())
|
||||
|
||||
|
||||
def fetch_summary_windows(
    session,
    start_date,
    end_date,
    document_type,
    document_sub_type,
    window_days,
):
    """Fetch the receipt summary one date window at a time and merge it.

    The range from ``start_date`` to ``end_date`` is split with
    ``build_date_windows``; for each window ``SUMMARY_QUERY`` is posted via
    ``graphql_post`` and the receipts and counts are read from the response.
    When the reported ``inWarehouse`` count differs from the number of
    returned receipts whose ``receiptType`` is ``"In-Warehouse"``, a warning
    is echoed to stderr.

    Returns:
        A ``(aggregate_payload, requests_metadata)`` tuple.
        ``aggregate_payload`` has the ``data.receiptsWithCounts`` layout,
        with each count summed across windows and ``receipts`` deduplicated
        by ``unique_receipts``.  ``requests_metadata`` holds one dict per
        window: the query variables, the returned-receipt tallies, the
        reported counts and a ``countMismatch`` flag.
    """
    count_keys = ("inWarehouse", "gasStation", "carWash", "gasAndCarWash")
    window_rows = []
    collected = []

    for span in build_date_windows(start_date, end_date, window_days):
        query_vars = {
            "startDate": span["startDate"],
            "endDate": span["endDate"],
            "text": "custom",
            "documentType": document_type,
            "documentSubType": document_sub_type,
        }
        response = graphql_post(session, SUMMARY_QUERY, query_vars)
        window_receipts = summary_receipts(response)
        reported = summary_counts(response)

        in_warehouse_seen = len(
            [
                item
                for item in window_receipts
                if item.get("receiptType") == "In-Warehouse"
            ]
        )
        has_mismatch = reported["inWarehouse"] != in_warehouse_seen

        # Build the row key by key so the key order matches the saved
        # metadata layout: variables, tallies, reported counts, flag.
        row = dict(query_vars)
        row["returnedReceipts"] = len(window_receipts)
        row["returnedInWarehouseReceipts"] = in_warehouse_seen
        for key in count_keys:
            row[key] = reported[key]
        row["countMismatch"] = has_mismatch
        window_rows.append(row)

        if has_mismatch:
            message = (
                "warning: summary count mismatch for "
                f"{span['startDate']} to {span['endDate']}: "
                f"inWarehouse={reported['inWarehouse']} "
                f"returnedInWarehouseReceipts={in_warehouse_seen}"
            )
            click.echo(message, err=True)

        collected.extend(window_receipts)

    deduped = unique_receipts(collected)
    totals = {key: sum(row[key] for row in window_rows) for key in count_keys}
    totals["receipts"] = deduped
    return {"data": {"receiptsWithCounts": totals}}, window_rows
|
||||
|
||||
|
||||
def flatten_costco_data(summary_payload, detail_payloads, raw_dir):
|
||||
summary_lookup = {
|
||||
receipt["transactionBarcode"]: receipt
|
||||
@@ -415,7 +530,14 @@ def write_csv(path, rows, fieldnames):
|
||||
show_default=True,
|
||||
help="Summary document sub type.",
|
||||
)
|
||||
def main(start_date, end_date, outdir, document_type, document_sub_type):
|
||||
@click.option(
|
||||
"--window-days",
|
||||
default=92,
|
||||
show_default=True,
|
||||
type=int,
|
||||
help="Maximum number of days to request per summary window.",
|
||||
)
|
||||
def main(start_date, end_date, outdir, document_type, document_sub_type, window_days):
|
||||
config = load_config()
|
||||
required = ["authorization", "client_id", "client_identifier"]
|
||||
missing = [key for key in required if not config[key]]
|
||||
@@ -428,18 +550,16 @@ def main(start_date, end_date, outdir, document_type, document_sub_type):
|
||||
raw_dir = outdir / "raw"
|
||||
session = build_session(config)
|
||||
|
||||
summary_payload = graphql_post(
|
||||
summary_payload, request_metadata = fetch_summary_windows(
|
||||
session,
|
||||
SUMMARY_QUERY,
|
||||
{
|
||||
"startDate": start_date,
|
||||
"endDate": end_date,
|
||||
"text": "custom",
|
||||
"documentType": document_type,
|
||||
"documentSubType": document_sub_type,
|
||||
},
|
||||
start_date,
|
||||
end_date,
|
||||
document_type,
|
||||
document_sub_type,
|
||||
window_days,
|
||||
)
|
||||
write_json(raw_dir / "summary.json", summary_payload)
|
||||
write_json(raw_dir / "summary_requests.json", request_metadata)
|
||||
receipts = summary_receipts(summary_payload)
|
||||
|
||||
detail_payloads = []
|
||||
|
||||
@@ -3,6 +3,7 @@ import json
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
import enrich_costco
|
||||
import scrape_costco
|
||||
@@ -10,6 +11,82 @@ import validate_cross_retailer_flow
|
||||
|
||||
|
||||
class CostcoPipelineTests(unittest.TestCase):
|
||||
def test_build_date_windows_splits_long_ranges(self):
    """A 1/01-6/30 range with 92-day windows yields two adjacent windows."""
    expected = [
        {"startDate": "1/01/2026", "endDate": "4/02/2026"},
        {"startDate": "4/03/2026", "endDate": "6/30/2026"},
    ]

    actual = scrape_costco.build_date_windows("1/01/2026", "6/30/2026", 92)

    self.assertEqual(expected, actual)
|
||||
|
||||
def test_fetch_summary_windows_records_metadata_and_warns_on_mismatch(self):
    """Two windows are fetched; only the first reports a count mismatch."""

    def window_payload(barcode, in_warehouse):
        # Summary response carrying exactly one In-Warehouse receipt.
        return {
            "data": {
                "receiptsWithCounts": {
                    "inWarehouse": in_warehouse,
                    "gasStation": 0,
                    "carWash": 0,
                    "gasAndCarWash": 0,
                    "receipts": [
                        {
                            "transactionBarcode": barcode,
                            "receiptType": "In-Warehouse",
                        }
                    ],
                }
            }
        }

    # The first window claims 2 in-warehouse receipts but returns 1, which
    # must be flagged; the second window is consistent.
    responses = [window_payload("abc", 2), window_payload("def", 1)]

    post_patch = mock.patch.object(
        scrape_costco, "graphql_post", side_effect=responses
    )
    echo_patch = mock.patch.object(scrape_costco.click, "echo")
    with post_patch as mocked_post, echo_patch as mocked_echo:
        merged_payload, rows = scrape_costco.fetch_summary_windows(
            session=object(),
            start_date="1/01/2026",
            end_date="6/30/2026",
            document_type="all",
            document_sub_type="all",
            window_days=92,
        )

    self.assertEqual(2, mocked_post.call_count)
    self.assertEqual(2, len(rows))
    first_row, second_row = rows[0], rows[1]
    self.assertTrue(first_row["countMismatch"])
    self.assertFalse(second_row["countMismatch"])
    self.assertEqual("1/01/2026", first_row["startDate"])
    self.assertEqual("4/03/2026", second_row["startDate"])

    barcodes = [
        receipt["transactionBarcode"]
        for receipt in scrape_costco.summary_receipts(merged_payload)
    ]
    self.assertEqual(["abc", "def"], barcodes)

    mocked_echo.assert_called_once()
    warning_text = mocked_echo.call_args.args[0]
    self.assertIn("warning: summary count mismatch", warning_text)
|
||||
|
||||
def test_flatten_costco_data_preserves_discount_rows(self):
|
||||
summary_payload = {
|
||||
"data": {
|
||||
@@ -196,6 +273,99 @@ class CostcoPipelineTests(unittest.TestCase):
|
||||
self.assertEqual(1, len(rows))
|
||||
self.assertEqual("banana", rows[0]["proof_name"])
|
||||
|
||||
def test_main_writes_summary_request_metadata(self):
    """``main`` saves the per-window metadata to raw/summary_requests.json."""
    summary_receipt = {
        "transactionBarcode": "abc",
        "receiptType": "In-Warehouse",
        "tenderArray": [],
        "couponArray": [],
    }
    summary_payload = {
        "data": {
            "receiptsWithCounts": {
                "inWarehouse": 1,
                "gasStation": 0,
                "carWash": 0,
                "gasAndCarWash": 0,
                "receipts": [summary_receipt],
            }
        }
    }

    detail_receipt = {
        "transactionBarcode": "abc",
        "transactionDate": "2026-03-12",
        "receiptType": "In-Warehouse",
        "total": 10.0,
        "totalItemCount": 1,
        "instantSavings": 0,
        "warehouseName": "MT VERNON",
        "warehouseNumber": 1115,
        "warehouseAddress1": "7940 RICHMOND HWY",
        "warehouseCity": "ALEXANDRIA",
        "warehouseState": "VA",
        "warehousePostalCode": "22306",
        "itemArray": [],
    }
    detail_payload = {
        "data": {"receiptsWithCounts": {"receipts": [detail_receipt]}}
    }

    metadata = [
        {
            "startDate": "1/01/2026",
            "endDate": "3/31/2026",
            "text": "custom",
            "documentType": "all",
            "documentSubType": "all",
            "returnedReceipts": 1,
            "returnedInWarehouseReceipts": 1,
            "inWarehouse": 1,
            "gasStation": 0,
            "carWash": 0,
            "gasAndCarWash": 0,
            "countMismatch": False,
        }
    ]

    fake_config = {
        "authorization": "token",
        "client_id": "client",
        "client_identifier": "identifier",
    }

    with tempfile.TemporaryDirectory() as tmpdir:
        outdir = Path(tmpdir) / "costco_output"

        # Replace every network- and config-facing collaborator so that
        # only main's own orchestration and file writing are exercised.
        config_patch = mock.patch.object(
            scrape_costco, "load_config", return_value=fake_config
        )
        session_patch = mock.patch.object(
            scrape_costco, "build_session", return_value=object()
        )
        summary_patch = mock.patch.object(
            scrape_costco,
            "fetch_summary_windows",
            return_value=(summary_payload, metadata),
        )
        detail_patch = mock.patch.object(
            scrape_costco, "graphql_post", return_value=detail_payload
        )
        with config_patch, session_patch, summary_patch, detail_patch:
            scrape_costco.main.callback(
                start_date="1/01/2026",
                end_date="3/31/2026",
                outdir=str(outdir),
                document_type="all",
                document_sub_type="all",
                window_days=92,
            )

        metadata_path = outdir / "raw" / "summary_requests.json"
        self.assertTrue(metadata_path.exists())
        saved_text = metadata_path.read_text(encoding="utf-8")
        self.assertEqual(metadata, json.loads(saved_text))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user