Fix Costco receipt enumeration windows
This commit is contained in:
140
scrape_costco.py
140
scrape_costco.py
@@ -1,6 +1,7 @@
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
@@ -260,6 +261,120 @@ def detail_receipts(payload):
|
||||
return payload.get("data", {}).get("receiptsWithCounts", {}).get("receipts", [])
|
||||
|
||||
|
||||
def summary_counts(payload):
    """Extract the per-category receipt counts from a summary response.

    Args:
        payload: Decoded response dict; the counts are read from
            ``payload["data"]["receiptsWithCounts"]``.

    Returns:
        A dict with the keys ``inWarehouse``, ``gasStation``, ``carWash`` and
        ``gasAndCarWash`` (in that order). A count that is missing or falsy
        (for example ``None``) is reported as ``0``.
    """
    # ``or {}`` instead of a ``.get`` default: a default only applies when the
    # key is absent, so a key that is present but null (e.g. ``"data": None``)
    # would otherwise make the next ``.get`` raise AttributeError.
    data = payload.get("data") or {}
    counts = data.get("receiptsWithCounts") or {}
    return {
        key: counts.get(key, 0) or 0
        for key in ("inWarehouse", "gasStation", "carWash", "gasAndCarWash")
    }
|
||||
|
||||
|
||||
def parse_cli_date(value):
    """Convert a ``month/day/year`` string into a ``datetime.date``.

    Args:
        value: Date text matching the ``%m/%d/%Y`` format.

    Returns:
        The corresponding ``datetime.date``.

    Raises:
        ValueError: If ``value`` does not match ``%m/%d/%Y``.
    """
    parsed = datetime.strptime(value, "%m/%d/%Y")
    return parsed.date()
|
||||
|
||||
|
||||
def format_cli_date(value):
    """Render a date as ``month/day/year`` text.

    The month is written without padding, the day is zero-padded to two
    digits, and the year is written as-is (e.g. ``1/05/2024``).

    Args:
        value: Object exposing integer ``month``, ``day`` and ``year``
            attributes, such as a ``datetime.date``.

    Returns:
        The formatted date string.
    """
    return "{}/{:02d}/{}".format(value.month, value.day, value.year)
|
||||
|
||||
|
||||
def build_date_windows(start_date, end_date, window_days):
    """Split an inclusive date range into consecutive request windows.

    Args:
        start_date: First day of the range, as text accepted by
            ``parse_cli_date``.
        end_date: Last day of the range (inclusive), in the same format.
        window_days: Maximum number of days a single window may cover.

    Returns:
        A chronological list of ``{"startDate": ..., "endDate": ...}`` dicts
        whose values are produced by ``format_cli_date``. The windows do not
        overlap and together cover every day from ``start_date`` to
        ``end_date``; only the last window may be shorter than
        ``window_days``.

    Raises:
        click.ClickException: If ``end_date`` is before ``start_date`` or
            ``window_days`` is less than 1.
    """
    start = parse_cli_date(start_date)
    end = parse_cli_date(end_date)
    if end < start:
        raise click.ClickException("end-date must be on or after start-date")
    if window_days < 1:
        raise click.ClickException("window-days must be at least 1")

    windows = []
    current = start
    while True:
        # Clamp the step to the days that are left *before* adding it, so an
        # oversized window_days cannot push the date arithmetic out of range.
        remaining = (end - current).days
        window_end = current + timedelta(days=min(window_days - 1, remaining))
        windows.append(
            {
                "startDate": format_cli_date(current),
                "endDate": format_cli_date(window_end),
            }
        )
        # Stop before stepping past the end: adding one more day would
        # overflow when the range finishes on the last representable date.
        if window_end >= end:
            break
        current = window_end + timedelta(days=1)
    return windows
|
||||
|
||||
|
||||
def unique_receipts(receipts):
    """Collapse receipts that share a ``transactionBarcode``.

    Receipts whose barcode is missing or falsy are left out. When several
    receipts carry the same barcode, the last one seen is kept, placed at the
    position where that barcode first appeared.

    Args:
        receipts: Iterable of receipt dicts.

    Returns:
        A list with one receipt per distinct barcode.
    """
    keyed = ((item.get("transactionBarcode"), item) for item in receipts)
    deduped = {code: item for code, item in keyed if code}
    return list(deduped.values())
|
||||
|
||||
|
||||
def fetch_summary_windows(
    session,
    start_date,
    end_date,
    document_type,
    document_sub_type,
    window_days,
):
    """Fetch the receipt summary window by window and merge the results.

    The date range is split with ``build_date_windows`` and one
    ``SUMMARY_QUERY`` request is sent through ``graphql_post`` per window.
    For each window the reported ``inWarehouse`` count is compared with the
    number of returned receipts whose ``receiptType`` is ``"In-Warehouse"``;
    a difference is recorded and reported on the error stream.

    Args:
        session: Session object handed to ``graphql_post``.
        start_date: First day of the range, passed to ``build_date_windows``.
        end_date: Last day of the range, passed to ``build_date_windows``.
        document_type: Value sent as the ``documentType`` query variable.
        document_sub_type: Value sent as the ``documentSubType`` query
            variable.
        window_days: Maximum number of days per request window.

    Returns:
        A ``(aggregate_payload, requests_metadata)`` tuple. The payload has
        the ``data.receiptsWithCounts`` layout, with each count summed over
        all windows and ``receipts`` de-duplicated by ``unique_receipts``.
        The metadata is a list with one dict per request holding the query
        variables, the returned and reported counts, and a ``countMismatch``
        flag.
    """
    count_keys = ("inWarehouse", "gasStation", "carWash", "gasAndCarWash")
    per_window = []
    gathered = []

    for span in build_date_windows(start_date, end_date, window_days):
        first_day = span["startDate"]
        last_day = span["endDate"]
        variables = {
            "startDate": first_day,
            "endDate": last_day,
            "text": "custom",
            "documentType": document_type,
            "documentSubType": document_sub_type,
        }
        payload = graphql_post(session, SUMMARY_QUERY, variables)
        receipts = summary_receipts(payload)
        counts = summary_counts(payload)

        returned = len(
            [item for item in receipts if item.get("receiptType") == "In-Warehouse"]
        )
        reported = counts["inWarehouse"]
        differs = reported != returned

        # Built step by step so the key order of the stored record is
        # explicit: query variables, returned totals, reported counts, flag.
        entry = dict(variables)
        entry["returnedReceipts"] = len(receipts)
        entry["returnedInWarehouseReceipts"] = returned
        for key in count_keys:
            entry[key] = counts[key]
        entry["countMismatch"] = differs
        per_window.append(entry)

        if differs:
            message = (
                "warning: summary count mismatch for "
                f"{first_day} to {last_day}: "
                f"inWarehouse={reported} "
                f"returnedInWarehouseReceipts={returned}"
            )
            click.echo(message, err=True)

        gathered.extend(receipts)

    distinct = unique_receipts(gathered)
    merged = {key: sum(entry[key] for entry in per_window) for key in count_keys}
    merged["receipts"] = distinct
    return {"data": {"receiptsWithCounts": merged}}, per_window
|
||||
|
||||
|
||||
def flatten_costco_data(summary_payload, detail_payloads, raw_dir):
|
||||
summary_lookup = {
|
||||
receipt["transactionBarcode"]: receipt
|
||||
@@ -415,7 +530,14 @@ def write_csv(path, rows, fieldnames):
|
||||
show_default=True,
|
||||
help="Summary document sub type.",
|
||||
)
|
||||
def main(start_date, end_date, outdir, document_type, document_sub_type):
|
||||
@click.option(
|
||||
"--window-days",
|
||||
default=92,
|
||||
show_default=True,
|
||||
type=int,
|
||||
help="Maximum number of days to request per summary window.",
|
||||
)
|
||||
def main(start_date, end_date, outdir, document_type, document_sub_type, window_days):
|
||||
config = load_config()
|
||||
required = ["authorization", "client_id", "client_identifier"]
|
||||
missing = [key for key in required if not config[key]]
|
||||
@@ -428,18 +550,16 @@ def main(start_date, end_date, outdir, document_type, document_sub_type):
|
||||
raw_dir = outdir / "raw"
|
||||
session = build_session(config)
|
||||
|
||||
summary_payload = graphql_post(
|
||||
summary_payload, request_metadata = fetch_summary_windows(
|
||||
session,
|
||||
SUMMARY_QUERY,
|
||||
{
|
||||
"startDate": start_date,
|
||||
"endDate": end_date,
|
||||
"text": "custom",
|
||||
"documentType": document_type,
|
||||
"documentSubType": document_sub_type,
|
||||
},
|
||||
start_date,
|
||||
end_date,
|
||||
document_type,
|
||||
document_sub_type,
|
||||
window_days,
|
||||
)
|
||||
write_json(raw_dir / "summary.json", summary_payload)
|
||||
write_json(raw_dir / "summary_requests.json", request_metadata)
|
||||
receipts = summary_receipts(summary_payload)
|
||||
|
||||
detail_payloads = []
|
||||
|
||||
Reference in New Issue
Block a user