Align Costco scraper with browser session flow

This commit is contained in:
ben
2026-03-16 12:28:19 -04:00
parent 58d6efb7bb
commit c0054dc51e
2 changed files with 159 additions and 56 deletions

View File

@@ -1,12 +1,13 @@
import csv
import json
import os
import time
from calendar import monthrange
from datetime import datetime, timedelta
from pathlib import Path
import click
from dotenv import load_dotenv
import browser_cookie3
from curl_cffi import requests
BASE_URL = "https://ecom-api.costco.com/ebusiness/order/v1/orders/graphql"
RETAILER = "costco"
@@ -207,24 +208,12 @@ ITEM_FIELDS = [
]
def load_config():
    """Read the Costco auth settings from the environment.

    ``load_dotenv`` is called first, then each setting is read with
    ``os.getenv`` (defaulting to an empty string) and stripped of
    surrounding whitespace.

    Returns:
        A dict with the keys ``authorization``, ``client_id`` and
        ``client_identifier``.
    """
    load_dotenv()
    env_names = (
        ("authorization", "COSTCO_X_AUTHORIZATION"),
        ("client_id", "COSTCO_WCS_CLIENT_ID"),
        ("client_identifier", "COSTCO_CLIENT_IDENTIFIER"),
    )
    settings = {}
    for setting, env_name in env_names:
        settings[setting] = os.getenv(env_name, "").strip()
    return settings
def build_headers(config):
def build_headers():
return {
"accept": "*/*",
"content-type": "application/json-patch+json",
"costco.service": "restOrders",
"costco.env": "ecom",
"costco-x-authorization": config["authorization"],
"costco-x-wcs-clientId": config["client_id"],
"client-identifier": config["client_identifier"],
"origin": "https://www.costco.com",
"referer": "https://www.costco.com/",
"user-agent": (
@@ -234,23 +223,36 @@ def build_headers(config):
}
def build_session(config):
from curl_cffi import requests
def build_session():
session = requests.Session()
session.headers.update(build_headers(config))
session.cookies.update(browser_cookie3.firefox(domain_name="costco.com"))
session.headers.update(build_headers())
return session
def graphql_post(session, query, variables, attempts=3, retry_delay=3):
    """POST a GraphQL query to ``BASE_URL``, retrying when a request fails.

    Args:
        session: Object exposing ``post``; each request is sent with
            ``impersonate="firefox"`` and a 30 second timeout.
        query: GraphQL query text, sent as the ``query`` field.
        variables: Value sent as the ``variables`` field.
        attempts: Maximum number of requests to make.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The decoded JSON body of the first response whose status is 200.

    Raises:
        RuntimeError: If no attempt produced a payload and the last
            response received (if any) did not itself raise from
            ``raise_for_status``. The last caught exception, if any, is
            attached as the cause.
        Exception: Whatever ``raise_for_status`` raises for the last
            response received when that response was an HTTP error.
    """
    last_response = None
    last_error = None
    for attempt in range(1, attempts + 1):
        try:
            response = session.post(
                BASE_URL,
                json={"query": query, "variables": variables},
                impersonate="firefox",
                timeout=30,
            )
            # The status is checked here instead of being raised inside the
            # try block, so a failing response is remembered and its own
            # error can be surfaced once the retries are exhausted.
            last_response = response
            if response.status_code == 200:
                return response.json()
            click.echo(
                f"retry {attempt}/{attempts} status={response.status_code}"
            )
        except Exception as exc:  # pragma: no cover - network error path
            last_error = exc
            click.echo(f"retry {attempt}/{attempts} error={exc}")
        # Waiting after the final attempt would only delay the failure.
        if attempt < attempts:
            time.sleep(retry_delay)
    if last_response is not None:
        last_response.raise_for_status()
    raise RuntimeError("failed to fetch Costco GraphQL payload") from last_error
def summary_receipts(payload):
@@ -279,6 +281,25 @@ def format_cli_date(value):
return f"{value.month}/{value.day:02d}/{value.year}"
def subtract_months(value, months):
    """Return *value* moved back by *months* calendar months.

    The day of month is clamped to the length of the target month, so one
    month before March 31 is the last day of February. A negative *months*
    moves the date forward.

    Args:
        value: Object with ``year``, ``month`` and ``day`` attributes and a
            ``replace`` method accepting those keywords (e.g. a date).
        months: Whole number of months to go back.

    Returns:
        ``value.replace(...)`` with the adjusted year, month and day.
    """
    # Count months from year 0 with a zero-based month so divmod can cross
    # any number of year boundaries in either direction.
    month_index = value.year * 12 + (value.month - 1) - months
    year, zero_based_month = divmod(month_index, 12)
    month = zero_based_month + 1
    day = min(value.day, monthrange(year, month)[1])
    return value.replace(year=year, month=month, day=day)
def resolve_date_range(months_back, today=None):
    """Return the formatted start and end dates of a look-back window.

    Args:
        months_back: Number of months to look back; must be at least 1.
        today: End of the window. When falsy, ``datetime.now().date()``
            is used.

    Returns:
        A ``(start, end)`` tuple, each element produced by
        ``format_cli_date``.

    Raises:
        click.ClickException: If *months_back* is less than 1.
    """
    if months_back < 1:
        raise click.ClickException("months-back must be at least 1")

    window_end = today if today else datetime.now().date()
    window_start = subtract_months(window_end, months_back)
    return tuple(
        format_cli_date(edge) for edge in (window_start, window_end)
    )
def build_date_windows(start_date, end_date, window_days):
start = parse_cli_date(start_date)
end = parse_cli_date(end_date)
@@ -304,12 +325,20 @@ def build_date_windows(start_date, end_date, window_days):
def unique_receipts(receipts):
    """Deduplicate receipts by the composite key from ``receipt_key``.

    Receipts for which ``receipt_key`` returns an empty value are dropped.
    When several receipts share a key, the last one seen is kept, in the
    position of the first occurrence.

    Args:
        receipts: Iterable of receipt mappings.

    Returns:
        A list with one receipt per distinct key.
    """
    # Index only by the composite key: also indexing by the bare barcode
    # would return every receipt twice and merge distinct transactions
    # that happen to share a barcode.
    by_key = {}
    for receipt in receipts:
        key = receipt_key(receipt)
        if key:
            by_key[key] = receipt
    return list(by_key.values())
def receipt_key(receipt):
    """Build the composite identifier ``"<barcode>::<datetime>"`` for a receipt.

    Args:
        receipt: Mapping that may hold ``transactionBarcode`` and
            ``transactionDateTime``.

    Returns:
        An empty string when the barcode is missing or falsy; otherwise
        the barcode and the transaction date-time joined by ``"::"``. A
        missing or null date-time contributes an empty suffix.
    """
    barcode = receipt.get("transactionBarcode")
    if not barcode:
        return ""
    transaction_date_time = receipt.get("transactionDateTime")
    # An explicit null must not be rendered as the text "None" in the key.
    if transaction_date_time is None:
        transaction_date_time = ""
    return f"{barcode}::{transaction_date_time}"
def fetch_summary_windows(
session,
start_date,
@@ -377,8 +406,9 @@ def fetch_summary_windows(
def flatten_costco_data(summary_payload, detail_payloads, raw_dir):
summary_lookup = {
receipt["transactionBarcode"]: receipt
receipt_key(receipt): receipt
for receipt in summary_receipts(summary_payload)
if receipt_key(receipt)
}
orders = []
items = []
@@ -386,13 +416,14 @@ def flatten_costco_data(summary_payload, detail_payloads, raw_dir):
for detail_payload in detail_payloads:
for receipt in detail_receipts(detail_payload):
order_id = receipt["transactionBarcode"]
summary_row = summary_lookup.get(order_id, {})
receipt_id = receipt_key(receipt)
summary_row = summary_lookup.get(receipt_id, {})
coupon_numbers = {
row.get("upcnumberCoupon", "")
for row in summary_row.get("couponArray", []) or []
if row.get("upcnumberCoupon")
}
raw_order_path = raw_dir / f"{order_id}.json"
raw_order_path = raw_dir / f"{receipt_id or order_id}.json"
orders.append(
{
@@ -510,8 +541,6 @@ def write_csv(path, rows, fieldnames):
@click.command()
@click.option("--start-date", required=True, help="Start date like 1/01/2026.")
@click.option("--end-date", required=True, help="End date like 3/31/2026.")
@click.option(
"--outdir",
default="costco_output",
@@ -537,18 +566,23 @@ def write_csv(path, rows, fieldnames):
type=int,
help="Maximum number of days to request per summary window.",
)
def main(start_date, end_date, outdir, document_type, document_sub_type, window_days):
config = load_config()
required = ["authorization", "client_id", "client_identifier"]
missing = [key for key in required if not config[key]]
if missing:
raise click.ClickException(
f"missing Costco auth config: {', '.join(missing)}"
)
@click.option(
"--months-back",
default=3,
show_default=True,
type=int,
help="How many months of receipts to enumerate back from today.",
)
def main(outdir, document_type, document_sub_type, window_days, months_back):
outdir = Path(outdir)
raw_dir = outdir / "raw"
session = build_session(config)
try:
session = build_session()
except Exception as exc:
raise click.ClickException(
f"failed to load Costco Firefox cookies: {exc}"
) from exc
start_date, end_date = resolve_date_range(months_back)
summary_payload, request_metadata = fetch_summary_windows(
session,
@@ -565,6 +599,7 @@ def main(start_date, end_date, outdir, document_type, document_sub_type, window_
detail_payloads = []
for receipt in receipts:
barcode = receipt["transactionBarcode"]
receipt_id = receipt_key(receipt) or barcode
click.echo(f"fetching {barcode}")
detail_payload = graphql_post(
session,
@@ -572,7 +607,7 @@ def main(start_date, end_date, outdir, document_type, document_sub_type, window_
{"barcode": barcode, "documentType": "warehouse"},
)
detail_payloads.append(detail_payload)
write_json(raw_dir / f"{barcode}.json", detail_payload)
write_json(raw_dir / f"{receipt_id}.json", detail_payload)
orders, items = flatten_costco_data(summary_payload, detail_payloads, raw_dir)
write_csv(outdir / "orders.csv", orders, ORDER_FIELDS)

View File

@@ -11,6 +11,14 @@ import validate_cross_retailer_flow
class CostcoPipelineTests(unittest.TestCase):
def test_resolve_date_range_uses_months_back(self):
    # Pin "today" so the expected window does not depend on the clock.
    anchor = scrape_costco.parse_cli_date("3/16/2026")
    resolved = scrape_costco.resolve_date_range(3, today=anchor)
    start_date, end_date = resolved
    # Three months back from mid-March lands in mid-December.
    self.assertEqual("12/16/2025", start_date)
    self.assertEqual("3/16/2026", end_date)
def test_build_date_windows_splits_long_ranges(self):
windows = scrape_costco.build_date_windows("1/01/2026", "6/30/2026", 92)
@@ -160,6 +168,74 @@ class CostcoPipelineTests(unittest.TestCase):
self.assertEqual("true", items[1]["is_discount_line"])
self.assertEqual("true", items[1]["is_coupon_line"])
def test_flatten_costco_data_uses_composite_summary_lookup_key(self):
    # Two summary receipts share the barcode "dup" and differ only in
    # their transactionDateTime; the single detail receipt below carries
    # the March timestamp.
    march_summary = {
        "transactionBarcode": "dup",
        "transactionDateTime": "2026-03-12T16:16:00",
        "tenderArray": [{"tenderDescription": "VISA"}],
        "couponArray": [{"upcnumberCoupon": "111"}],
    }
    february_summary = {
        "transactionBarcode": "dup",
        "transactionDateTime": "2026-02-14T16:25:00",
        "tenderArray": [{"tenderDescription": "MASTERCARD"}],
        "couponArray": [],
    }
    summary_payload = {
        "data": {
            "receiptsWithCounts": {
                "receipts": [march_summary, february_summary],
            }
        }
    }

    coupon_item = {
        "itemNumber": "111",
        "itemDescription01": "/ 111",
        "itemDescription02": None,
        "itemDepartmentNumber": 14,
        "transDepartmentNumber": 14,
        "unit": -1,
        "itemIdentifier": None,
        "amount": -5,
        "itemUnitPriceAmount": 0,
    }
    march_detail = {
        "transactionBarcode": "dup",
        "transactionDateTime": "2026-03-12T16:16:00",
        "transactionDate": "2026-03-12",
        "receiptType": "In-Warehouse",
        "total": 10.0,
        "totalItemCount": 1,
        "instantSavings": 5.0,
        "warehouseName": "MT VERNON",
        "warehouseNumber": 1115,
        "warehouseAddress1": "7940 RICHMOND HWY",
        "warehouseCity": "ALEXANDRIA",
        "warehouseState": "VA",
        "warehousePostalCode": "22306",
        "itemArray": [coupon_item],
    }
    detail_payloads = [
        {"data": {"receiptsWithCounts": {"receipts": [march_detail]}}}
    ]

    raw_dir = Path("costco_output/raw")
    orders, items = scrape_costco.flatten_costco_data(
        summary_payload, detail_payloads, raw_dir
    )

    first_order = orders[0]
    first_item = items[0]
    # The tender and coupon values expected here appear only on the March
    # summary receipt, not on the February one with the same barcode.
    self.assertEqual("VISA", first_order["payment_method"])
    self.assertEqual("true", first_item["is_coupon_line"])
    self.assertIn("dup::2026-03-12T16:16:00.json", first_item["raw_order_path"])
def test_costco_enricher_parses_size_pack_and_discount(self):
row = enrich_costco.parse_costco_item(
order_id="abc",
@@ -335,13 +411,6 @@ class CostcoPipelineTests(unittest.TestCase):
]
with mock.patch.object(
scrape_costco, "load_config",
return_value={
"authorization": "token",
"client_id": "client",
"client_identifier": "identifier",
},
), mock.patch.object(
scrape_costco, "build_session", return_value=object()
), mock.patch.object(
scrape_costco,
@@ -353,12 +422,11 @@ class CostcoPipelineTests(unittest.TestCase):
return_value=detail_payload,
):
scrape_costco.main.callback(
start_date="1/01/2026",
end_date="3/31/2026",
outdir=str(outdir),
document_type="all",
document_sub_type="all",
window_days=92,
months_back=3,
)
metadata_path = outdir / "raw" / "summary_requests.json"