Fix Costco receipt enumeration windows
This commit is contained in:
140
scrape_costco.py
140
scrape_costco.py
@@ -1,6 +1,7 @@
|
|||||||
import csv
|
import csv
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
from datetime import datetime, timedelta
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import click
|
import click
|
||||||
@@ -260,6 +261,120 @@ def detail_receipts(payload):
|
|||||||
return payload.get("data", {}).get("receiptsWithCounts", {}).get("receipts", [])
|
return payload.get("data", {}).get("receiptsWithCounts", {}).get("receipts", [])
|
||||||
|
|
||||||
|
|
||||||
|
def summary_counts(payload):
    """Extract the per-category receipt counts from a summary response.

    Args:
        payload: Decoded summary response, expected to look like
            ``{"data": {"receiptsWithCounts": {...}}}``.

    Returns:
        A dict with the keys ``inWarehouse``, ``gasStation``, ``carWash`` and
        ``gasAndCarWash``. A count that is missing or null is reported as 0.
    """
    # ``dict.get(key, {})`` only covers a *missing* key; a key that is present
    # with a JSON null would hand back None and break the next lookup. Use
    # ``or {}`` so null containers are treated the same as null counts below.
    data = payload.get("data") or {}
    counts = data.get("receiptsWithCounts") or {}
    categories = ("inWarehouse", "gasStation", "carWash", "gasAndCarWash")
    return {category: counts.get(category, 0) or 0 for category in categories}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_cli_date(value):
    """Convert a ``month/day/year`` string into a ``datetime.date``.

    Parsing uses the ``%m/%d/%Y`` format; ``datetime.strptime`` raises
    ``ValueError`` for text that does not match it.
    """
    parsed = datetime.strptime(value, "%m/%d/%Y")
    return parsed.date()
|
||||||
|
|
||||||
|
|
||||||
|
def format_cli_date(value):
    """Render a date as ``month/day/year`` text.

    The month is written without padding and the day is zero-padded to two
    digits, e.g. ``1/01/2026``.
    """
    return "{}/{:02d}/{}".format(value.month, value.day, value.year)
|
||||||
|
|
||||||
|
|
||||||
|
def build_date_windows(start_date, end_date, window_days):
    """Split an inclusive date range into consecutive, non-overlapping windows.

    Args:
        start_date: First day of the range, as text accepted by
            ``parse_cli_date``.
        end_date: Last day of the range (inclusive), same format.
        window_days: Maximum number of days covered by a single window.

    Returns:
        A list of ``{"startDate": ..., "endDate": ...}`` dicts whose values
        are produced by ``format_cli_date``. Each window starts the day after
        the previous one ends, and the last window is clipped to ``end_date``.

    Raises:
        click.ClickException: If ``end_date`` precedes ``start_date`` or
            ``window_days`` is less than 1.
    """
    first_day = parse_cli_date(start_date)
    last_day = parse_cli_date(end_date)
    if last_day < first_day:
        raise click.ClickException("end-date must be on or after start-date")
    if window_days < 1:
        raise click.ClickException("window-days must be at least 1")

    # Both bounds of a window are inclusive, so a window of N days ends N - 1
    # days after it starts.
    span = timedelta(days=window_days - 1)
    one_day = timedelta(days=1)

    windows = []
    cursor = first_day
    while cursor <= last_day:
        stop = cursor + span
        if stop > last_day:
            stop = last_day
        windows.append(
            {
                "startDate": format_cli_date(cursor),
                "endDate": format_cli_date(stop),
            }
        )
        cursor = stop + one_day
    return windows
|
||||||
|
|
||||||
|
|
||||||
|
def unique_receipts(receipts):
    """Collapse receipts that share a ``transactionBarcode``.

    Receipts with a missing or falsy barcode are left out. When a barcode
    occurs more than once, the last receipt seen is kept, in the position
    where that barcode first appeared.
    """
    deduped = {}
    for entry in receipts:
        key = entry.get("transactionBarcode")
        if not key:
            continue
        deduped[key] = entry
    return [*deduped.values()]
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_summary_windows(
    session,
    start_date,
    end_date,
    document_type,
    document_sub_type,
    window_days,
):
    """Fetch receipt summaries one date window at a time and merge them.

    The requested range is split with ``build_date_windows`` and one summary
    query is posted per window. For every window the reported ``inWarehouse``
    count is compared with the number of returned receipts whose
    ``receiptType`` is ``"In-Warehouse"``; a difference is recorded in the
    metadata and echoed as a warning on stderr.

    Returns:
        A ``(aggregate_payload, requests_metadata)`` tuple.
        ``aggregate_payload`` has the same ``data.receiptsWithCounts`` layout
        as a single response: the counts are summed over all windows and
        ``receipts`` is the combined list de-duplicated by
        ``unique_receipts``. ``requests_metadata`` holds one dict per window
        with the query variables, returned/reported counts and the
        ``countMismatch`` flag.
    """
    count_keys = ("inWarehouse", "gasStation", "carWash", "gasAndCarWash")
    requests_metadata = []
    all_receipts = []

    for window in build_date_windows(start_date, end_date, window_days):
        variables = {
            "startDate": window["startDate"],
            "endDate": window["endDate"],
            "text": "custom",
            "documentType": document_type,
            "documentSubType": document_sub_type,
        }
        payload = graphql_post(session, SUMMARY_QUERY, variables)
        window_receipts = summary_receipts(payload)
        counts = summary_counts(payload)
        warehouse_count = len(
            [
                entry
                for entry in window_receipts
                if entry.get("receiptType") == "In-Warehouse"
            ]
        )
        mismatch = counts["inWarehouse"] != warehouse_count

        # Build the row incrementally; insertion order fixes the key order of
        # the metadata that is later serialized.
        row = dict(variables)
        row["returnedReceipts"] = len(window_receipts)
        row["returnedInWarehouseReceipts"] = warehouse_count
        for key in count_keys:
            row[key] = counts[key]
        row["countMismatch"] = mismatch
        requests_metadata.append(row)

        if mismatch:
            message = (
                "warning: summary count mismatch for "
                f"{window['startDate']} to {window['endDate']}: "
                f"inWarehouse={counts['inWarehouse']} "
                f"returnedInWarehouseReceipts={warehouse_count}"
            )
            click.echo(message, err=True)

        all_receipts.extend(window_receipts)

    merged = {
        key: sum(row[key] for row in requests_metadata) for key in count_keys
    }
    merged["receipts"] = unique_receipts(all_receipts)
    aggregate_payload = {"data": {"receiptsWithCounts": merged}}
    return aggregate_payload, requests_metadata
|
||||||
|
|
||||||
|
|
||||||
def flatten_costco_data(summary_payload, detail_payloads, raw_dir):
|
def flatten_costco_data(summary_payload, detail_payloads, raw_dir):
|
||||||
summary_lookup = {
|
summary_lookup = {
|
||||||
receipt["transactionBarcode"]: receipt
|
receipt["transactionBarcode"]: receipt
|
||||||
@@ -415,7 +530,14 @@ def write_csv(path, rows, fieldnames):
|
|||||||
show_default=True,
|
show_default=True,
|
||||||
help="Summary document sub type.",
|
help="Summary document sub type.",
|
||||||
)
|
)
|
||||||
def main(start_date, end_date, outdir, document_type, document_sub_type):
|
@click.option(
|
||||||
|
"--window-days",
|
||||||
|
default=92,
|
||||||
|
show_default=True,
|
||||||
|
type=int,
|
||||||
|
help="Maximum number of days to request per summary window.",
|
||||||
|
)
|
||||||
|
def main(start_date, end_date, outdir, document_type, document_sub_type, window_days):
|
||||||
config = load_config()
|
config = load_config()
|
||||||
required = ["authorization", "client_id", "client_identifier"]
|
required = ["authorization", "client_id", "client_identifier"]
|
||||||
missing = [key for key in required if not config[key]]
|
missing = [key for key in required if not config[key]]
|
||||||
@@ -428,18 +550,16 @@ def main(start_date, end_date, outdir, document_type, document_sub_type):
|
|||||||
raw_dir = outdir / "raw"
|
raw_dir = outdir / "raw"
|
||||||
session = build_session(config)
|
session = build_session(config)
|
||||||
|
|
||||||
summary_payload = graphql_post(
|
summary_payload, request_metadata = fetch_summary_windows(
|
||||||
session,
|
session,
|
||||||
SUMMARY_QUERY,
|
start_date,
|
||||||
{
|
end_date,
|
||||||
"startDate": start_date,
|
document_type,
|
||||||
"endDate": end_date,
|
document_sub_type,
|
||||||
"text": "custom",
|
window_days,
|
||||||
"documentType": document_type,
|
|
||||||
"documentSubType": document_sub_type,
|
|
||||||
},
|
|
||||||
)
|
)
|
||||||
write_json(raw_dir / "summary.json", summary_payload)
|
write_json(raw_dir / "summary.json", summary_payload)
|
||||||
|
write_json(raw_dir / "summary_requests.json", request_metadata)
|
||||||
receipts = summary_receipts(summary_payload)
|
receipts = summary_receipts(summary_payload)
|
||||||
|
|
||||||
detail_payloads = []
|
detail_payloads = []
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ import json
|
|||||||
import tempfile
|
import tempfile
|
||||||
import unittest
|
import unittest
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from unittest import mock
|
||||||
|
|
||||||
import enrich_costco
|
import enrich_costco
|
||||||
import scrape_costco
|
import scrape_costco
|
||||||
@@ -10,6 +11,82 @@ import validate_cross_retailer_flow
|
|||||||
|
|
||||||
|
|
||||||
class CostcoPipelineTests(unittest.TestCase):
|
class CostcoPipelineTests(unittest.TestCase):
|
||||||
|
def test_build_date_windows_splits_long_ranges(self):
    """A six-month range capped at 92 days yields two adjacent windows."""
    expected = [
        {"startDate": "1/01/2026", "endDate": "4/02/2026"},
        {"startDate": "4/03/2026", "endDate": "6/30/2026"},
    ]

    actual = scrape_costco.build_date_windows("1/01/2026", "6/30/2026", 92)

    self.assertEqual(expected, actual)
|
||||||
|
|
||||||
|
def test_fetch_summary_windows_records_metadata_and_warns_on_mismatch(self):
    """Two windows are fetched; only the first reports a count mismatch."""

    def window_payload(in_warehouse, barcode):
        # One In-Warehouse receipt per window; only the reported count varies.
        return {
            "data": {
                "receiptsWithCounts": {
                    "inWarehouse": in_warehouse,
                    "gasStation": 0,
                    "carWash": 0,
                    "gasAndCarWash": 0,
                    "receipts": [
                        {
                            "transactionBarcode": barcode,
                            "receiptType": "In-Warehouse",
                        }
                    ],
                }
            }
        }

    # First window claims 2 receipts but returns 1 (mismatch); second agrees.
    payloads = [window_payload(2, "abc"), window_payload(1, "def")]

    post_patch = mock.patch.object(
        scrape_costco, "graphql_post", side_effect=payloads
    )
    echo_patch = mock.patch.object(scrape_costco.click, "echo")
    with post_patch as mocked_post, echo_patch as mocked_echo:
        summary_payload, metadata = scrape_costco.fetch_summary_windows(
            session=object(),
            start_date="1/01/2026",
            end_date="6/30/2026",
            document_type="all",
            document_sub_type="all",
            window_days=92,
        )

    self.assertEqual(2, mocked_post.call_count)
    self.assertEqual(2, len(metadata))
    first_window, second_window = metadata
    self.assertTrue(first_window["countMismatch"])
    self.assertFalse(second_window["countMismatch"])
    self.assertEqual("1/01/2026", first_window["startDate"])
    self.assertEqual("4/03/2026", second_window["startDate"])

    merged_barcodes = [
        row["transactionBarcode"]
        for row in scrape_costco.summary_receipts(summary_payload)
    ]
    self.assertEqual(["abc", "def"], merged_barcodes)

    mocked_echo.assert_called_once()
    warning_text = mocked_echo.call_args[0][0]
    self.assertIn("warning: summary count mismatch", warning_text)
|
||||||
|
|
||||||
def test_flatten_costco_data_preserves_discount_rows(self):
|
def test_flatten_costco_data_preserves_discount_rows(self):
|
||||||
summary_payload = {
|
summary_payload = {
|
||||||
"data": {
|
"data": {
|
||||||
@@ -196,6 +273,99 @@ class CostcoPipelineTests(unittest.TestCase):
|
|||||||
self.assertEqual(1, len(rows))
|
self.assertEqual(1, len(rows))
|
||||||
self.assertEqual("banana", rows[0]["proof_name"])
|
self.assertEqual("banana", rows[0]["proof_name"])
|
||||||
|
|
||||||
|
def test_main_writes_summary_request_metadata(self):
    """main() saves the per-window request metadata as raw/summary_requests.json."""
    summary_receipt = {
        "transactionBarcode": "abc",
        "receiptType": "In-Warehouse",
        "tenderArray": [],
        "couponArray": [],
    }
    summary_payload = {
        "data": {
            "receiptsWithCounts": {
                "inWarehouse": 1,
                "gasStation": 0,
                "carWash": 0,
                "gasAndCarWash": 0,
                "receipts": [summary_receipt],
            }
        }
    }
    detail_receipt = {
        "transactionBarcode": "abc",
        "transactionDate": "2026-03-12",
        "receiptType": "In-Warehouse",
        "total": 10.0,
        "totalItemCount": 1,
        "instantSavings": 0,
        "warehouseName": "MT VERNON",
        "warehouseNumber": 1115,
        "warehouseAddress1": "7940 RICHMOND HWY",
        "warehouseCity": "ALEXANDRIA",
        "warehouseState": "VA",
        "warehousePostalCode": "22306",
        "itemArray": [],
    }
    detail_payload = {"data": {"receiptsWithCounts": {"receipts": [detail_receipt]}}}
    metadata = [
        {
            "startDate": "1/01/2026",
            "endDate": "3/31/2026",
            "text": "custom",
            "documentType": "all",
            "documentSubType": "all",
            "returnedReceipts": 1,
            "returnedInWarehouseReceipts": 1,
            "inWarehouse": 1,
            "gasStation": 0,
            "carWash": 0,
            "gasAndCarWash": 0,
            "countMismatch": False,
        }
    ]
    config = {
        "authorization": "token",
        "client_id": "client",
        "client_identifier": "identifier",
    }

    # Replace configuration, session creation and both network entry points so
    # main() runs entirely against the canned payloads above.
    config_patch = mock.patch.object(
        scrape_costco, "load_config", return_value=config
    )
    session_patch = mock.patch.object(
        scrape_costco, "build_session", return_value=object()
    )
    summary_patch = mock.patch.object(
        scrape_costco,
        "fetch_summary_windows",
        return_value=(summary_payload, metadata),
    )
    detail_patch = mock.patch.object(
        scrape_costco, "graphql_post", return_value=detail_payload
    )

    with tempfile.TemporaryDirectory() as tmpdir:
        outdir = Path(tmpdir) / "costco_output"

        with config_patch, session_patch, summary_patch, detail_patch:
            scrape_costco.main.callback(
                start_date="1/01/2026",
                end_date="3/31/2026",
                outdir=str(outdir),
                document_type="all",
                document_sub_type="all",
                window_days=92,
            )

        # Check the file while the temporary directory still exists.
        metadata_path = outdir / "raw" / "summary_requests.json"
        self.assertTrue(metadata_path.exists())
        saved_metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
        self.assertEqual(metadata, saved_metadata)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
Reference in New Issue
Block a user