Align Costco scraper with browser session flow

This commit is contained in:
ben
2026-03-16 12:28:19 -04:00
parent 58d6efb7bb
commit c0054dc51e
2 changed files with 159 additions and 56 deletions

View File

@@ -1,12 +1,13 @@
import csv
import json
import os
import time
from calendar import monthrange
from datetime import datetime, timedelta
from pathlib import Path
import click
from dotenv import load_dotenv
import browser_cookie3
from curl_cffi import requests
BASE_URL = "https://ecom-api.costco.com/ebusiness/order/v1/orders/graphql"
RETAILER = "costco"
@@ -207,24 +208,12 @@ ITEM_FIELDS = [
]
def load_config():
    """Read the Costco auth settings from the environment.

    Calls ``load_dotenv()`` first, then looks up each variable with
    ``os.getenv``. Unset variables come back as empty strings, and every
    value is stripped of surrounding whitespace.

    Returns:
        A dict with the keys ``authorization``, ``client_id`` and
        ``client_identifier``.
    """
    load_dotenv()
    env_names = (
        ("authorization", "COSTCO_X_AUTHORIZATION"),
        ("client_id", "COSTCO_WCS_CLIENT_ID"),
        ("client_identifier", "COSTCO_CLIENT_IDENTIFIER"),
    )
    config = {}
    for key, env_name in env_names:
        config[key] = os.getenv(env_name, "").strip()
    return config
def build_headers(config):
def build_headers():
return {
"accept": "*/*",
"content-type": "application/json-patch+json",
"costco.service": "restOrders",
"costco.env": "ecom",
"costco-x-authorization": config["authorization"],
"costco-x-wcs-clientId": config["client_id"],
"client-identifier": config["client_identifier"],
"origin": "https://www.costco.com",
"referer": "https://www.costco.com/",
"user-agent": (
@@ -234,23 +223,36 @@ def build_headers(config):
}
def build_session():
    """Create an HTTP session that reuses the local Firefox Costco login.

    The session is a ``requests.Session`` (``requests`` is the curl_cffi
    module imported at the top of the file). Its cookie jar is seeded with
    the cookies ``browser_cookie3.firefox`` returns for ``costco.com``, and
    its default headers come from ``build_headers()``.

    Returns:
        The configured session.

    Raises:
        Whatever ``browser_cookie3.firefox`` raises when the cookie store
        cannot be read; nothing is caught here.
    """
    session = requests.Session()
    # Cookies go in first so a failure to read the browser profile surfaces
    # before any other session state is built.
    session.cookies.update(browser_cookie3.firefox(domain_name="costco.com"))
    session.headers.update(build_headers())
    return session
def graphql_post(session, query, variables, *, attempts=3, retry_delay=3):
    """POST a GraphQL query to ``BASE_URL`` and return the decoded JSON body.

    The request is retried when the response status is not 200 or when the
    call (including JSON decoding) raises. Each failed attempt is reported
    with ``click.echo``.

    Args:
        session: Object exposing ``post(url, json=..., impersonate=...,
            timeout=...)``.
        query: GraphQL query text.
        variables: Mapping sent as the GraphQL ``variables``.
        attempts: Maximum number of requests to make.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The value of ``response.json()`` for the first 200 response.

    Raises:
        The error from ``raise_for_status()`` of the most recent response
        when that response carries an error status.
        RuntimeError: When no attempt succeeded and no response error could
            be raised; chained to the last caught exception, if any.
    """
    last_response = None
    last_error = None
    for attempt in range(1, attempts + 1):
        try:
            response = session.post(
                BASE_URL,
                json={"query": query, "variables": variables},
                impersonate="firefox",
                timeout=30,
            )
            last_response = response
            if response.status_code == 200:
                return response.json()
            click.echo(f"retry {attempt}/{attempts} status={response.status_code}")
        except Exception as exc:  # pragma: no cover - network error path
            # Keep the error so the final failure can be chained to its cause.
            last_error = exc
            click.echo(f"retry {attempt}/{attempts} error={exc}")
        # Sleeping after the final attempt would only delay the failure.
        if attempt < attempts:
            time.sleep(retry_delay)
    if last_response is not None:
        last_response.raise_for_status()
    raise RuntimeError("failed to fetch Costco GraphQL payload") from last_error
def summary_receipts(payload):
@@ -279,6 +281,25 @@ def format_cli_date(value):
return f"{value.month}/{value.day:02d}/{value.year}"
def subtract_months(value, months):
    """Return ``value`` moved back by ``months`` calendar months.

    The day of month is clamped to the last day of the target month, so
    March 31 minus one month is the last day of February. A negative
    ``months`` moves the date forward instead.

    Args:
        value: A ``date`` or ``datetime``; anything with ``year``, ``month``,
            ``day`` and ``replace``.
        months: Whole number of months to subtract.

    Returns:
        A new object of the same type as ``value``.
    """
    # Work with a zero-based month index so divmod handles year rollover in
    # both directions.
    month_index = value.year * 12 + (value.month - 1) - months
    year, month_zero_based = divmod(month_index, 12)
    month = month_zero_based + 1
    day = min(value.day, monthrange(year, month)[1])
    return value.replace(year=year, month=month, day=day)
def resolve_date_range(months_back, today=None):
    """Build the (start, end) CLI date strings for a look-back window.

    Args:
        months_back: Number of months to go back from the end date; must be
            at least 1.
        today: End date of the range. Falls back to the current local date
            when omitted or falsy.

    Returns:
        A ``(start, end)`` tuple of strings produced by ``format_cli_date``.

    Raises:
        click.ClickException: If ``months_back`` is less than 1.
    """
    if months_back < 1:
        raise click.ClickException("months-back must be at least 1")
    range_end = today if today else datetime.now().date()
    range_start = subtract_months(range_end, months_back)
    return tuple(format_cli_date(bound) for bound in (range_start, range_end))
def build_date_windows(start_date, end_date, window_days):
start = parse_cli_date(start_date)
end = parse_cli_date(end_date)
@@ -304,12 +325,20 @@ def build_date_windows(start_date, end_date, window_days):
def unique_receipts(receipts):
    """Return ``receipts`` with duplicates removed, keyed by ``receipt_key``.

    Receipts for which ``receipt_key`` returns an empty value are dropped.
    When several receipts share a key the last one wins, but it keeps the
    position of the first occurrence (dict insertion order).

    Args:
        receipts: Iterable of receipt dicts.

    Returns:
        A list of the de-duplicated receipt dicts.
    """
    by_key = {}
    for receipt in receipts:
        key = receipt_key(receipt)
        if key:
            by_key[key] = receipt
    return list(by_key.values())
def receipt_key(receipt):
    """Return the identity key for a receipt, or ``""`` if it has no barcode.

    The key is ``"<transactionBarcode>::<transactionDateTime>"``. A missing
    or null ``transactionDateTime`` contributes an empty string, so the key
    never embeds the literal text ``None``.

    Args:
        receipt: Mapping with optional ``transactionBarcode`` and
            ``transactionDateTime`` entries.

    Returns:
        The composite key string, or ``""`` when the barcode is missing,
        null or empty.
    """
    # ``or ""`` also covers keys that are present with a null value, which
    # ``.get(key, "")`` alone does not.
    barcode = receipt.get("transactionBarcode") or ""
    if not barcode:
        return ""
    transaction_date_time = receipt.get("transactionDateTime") or ""
    # NOTE(review): callers in this file also use this key as a file name
    # stem; the "::" separator may not be valid on every filesystem - confirm.
    return f"{barcode}::{transaction_date_time}"
def fetch_summary_windows(
session,
start_date,
@@ -377,8 +406,9 @@ def fetch_summary_windows(
def flatten_costco_data(summary_payload, detail_payloads, raw_dir):
summary_lookup = {
receipt["transactionBarcode"]: receipt
receipt_key(receipt): receipt
for receipt in summary_receipts(summary_payload)
if receipt_key(receipt)
}
orders = []
items = []
@@ -386,13 +416,14 @@ def flatten_costco_data(summary_payload, detail_payloads, raw_dir):
for detail_payload in detail_payloads:
for receipt in detail_receipts(detail_payload):
order_id = receipt["transactionBarcode"]
summary_row = summary_lookup.get(order_id, {})
receipt_id = receipt_key(receipt)
summary_row = summary_lookup.get(receipt_id, {})
coupon_numbers = {
row.get("upcnumberCoupon", "")
for row in summary_row.get("couponArray", []) or []
if row.get("upcnumberCoupon")
}
raw_order_path = raw_dir / f"{order_id}.json"
raw_order_path = raw_dir / f"{receipt_id or order_id}.json"
orders.append(
{
@@ -510,8 +541,6 @@ def write_csv(path, rows, fieldnames):
@click.command()
@click.option("--start-date", required=True, help="Start date like 1/01/2026.")
@click.option("--end-date", required=True, help="End date like 3/31/2026.")
@click.option(
"--outdir",
default="costco_output",
@@ -537,18 +566,23 @@ def write_csv(path, rows, fieldnames):
type=int,
help="Maximum number of days to request per summary window.",
)
def main(start_date, end_date, outdir, document_type, document_sub_type, window_days):
config = load_config()
required = ["authorization", "client_id", "client_identifier"]
missing = [key for key in required if not config[key]]
if missing:
raise click.ClickException(
f"missing Costco auth config: {', '.join(missing)}"
)
@click.option(
"--months-back",
default=3,
show_default=True,
type=int,
help="How many months of receipts to enumerate back from today.",
)
def main(outdir, document_type, document_sub_type, window_days, months_back):
outdir = Path(outdir)
raw_dir = outdir / "raw"
session = build_session(config)
try:
session = build_session()
except Exception as exc:
raise click.ClickException(
f"failed to load Costco Firefox cookies: {exc}"
) from exc
start_date, end_date = resolve_date_range(months_back)
summary_payload, request_metadata = fetch_summary_windows(
session,
@@ -565,6 +599,7 @@ def main(start_date, end_date, outdir, document_type, document_sub_type, window_
detail_payloads = []
for receipt in receipts:
barcode = receipt["transactionBarcode"]
receipt_id = receipt_key(receipt) or barcode
click.echo(f"fetching {barcode}")
detail_payload = graphql_post(
session,
@@ -572,7 +607,7 @@ def main(start_date, end_date, outdir, document_type, document_sub_type, window_
{"barcode": barcode, "documentType": "warehouse"},
)
detail_payloads.append(detail_payload)
write_json(raw_dir / f"{barcode}.json", detail_payload)
write_json(raw_dir / f"{receipt_id}.json", detail_payload)
orders, items = flatten_costco_data(summary_payload, detail_payloads, raw_dir)
write_csv(outdir / "orders.csv", orders, ORDER_FIELDS)