Align Costco scraper with browser session flow

This commit is contained in:
ben
2026-03-16 12:28:19 -04:00
parent 58d6efb7bb
commit c0054dc51e
2 changed files with 159 additions and 56 deletions

View File

@@ -1,12 +1,13 @@
import csv import csv
import json import json
import os import time
from calendar import monthrange
from datetime import datetime, timedelta from datetime import datetime, timedelta
from pathlib import Path from pathlib import Path
import click import click
from dotenv import load_dotenv import browser_cookie3
from curl_cffi import requests
BASE_URL = "https://ecom-api.costco.com/ebusiness/order/v1/orders/graphql" BASE_URL = "https://ecom-api.costco.com/ebusiness/order/v1/orders/graphql"
RETAILER = "costco" RETAILER = "costco"
@@ -207,24 +208,12 @@ ITEM_FIELDS = [
] ]
def load_config(): def build_headers():
load_dotenv()
return {
"authorization": os.getenv("COSTCO_X_AUTHORIZATION", "").strip(),
"client_id": os.getenv("COSTCO_WCS_CLIENT_ID", "").strip(),
"client_identifier": os.getenv("COSTCO_CLIENT_IDENTIFIER", "").strip(),
}
def build_headers(config):
return { return {
"accept": "*/*", "accept": "*/*",
"content-type": "application/json-patch+json", "content-type": "application/json-patch+json",
"costco.service": "restOrders", "costco.service": "restOrders",
"costco.env": "ecom", "costco.env": "ecom",
"costco-x-authorization": config["authorization"],
"costco-x-wcs-clientId": config["client_id"],
"client-identifier": config["client_identifier"],
"origin": "https://www.costco.com", "origin": "https://www.costco.com",
"referer": "https://www.costco.com/", "referer": "https://www.costco.com/",
"user-agent": ( "user-agent": (
@@ -234,23 +223,36 @@ def build_headers(config):
} }
def build_session(config): def build_session():
from curl_cffi import requests
session = requests.Session() session = requests.Session()
session.headers.update(build_headers(config)) session.cookies.update(browser_cookie3.firefox(domain_name="costco.com"))
session.headers.update(build_headers())
return session return session
def graphql_post(session, query, variables):
    """POST a GraphQL query to ``BASE_URL`` and return the decoded JSON body.

    The request is attempted up to three times. A response whose status is
    not 200, or any exception raised while posting or decoding, is reported
    with ``click.echo`` and retried after a short pause.

    Args:
        session: Session object exposing ``post(url, json=..., impersonate=...,
            timeout=...)``.
        query: GraphQL query text sent under the ``"query"`` key.
        variables: Mapping sent under the ``"variables"`` key.

    Returns:
        The value of ``response.json()`` for the first 200 response.

    Raises:
        Whatever ``raise_for_status()`` raises for the last response received,
        when that response carries an error status.
        RuntimeError: When no attempt produced a usable payload and
            ``raise_for_status()`` did not raise.
    """
    max_attempts = 3
    retry_delay_seconds = 3
    last_response = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = session.post(
                BASE_URL,
                json={"query": query, "variables": variables},
                impersonate="firefox",
                timeout=30,
            )
            last_response = response
            if response.status_code == 200:
                return response.json()
            click.echo(
                f"retry {attempt}/{max_attempts} status={response.status_code}"
            )
        except Exception as exc:  # pragma: no cover - network error path
            # Deliberately broad: any transport or decode failure is retried.
            click.echo(f"retry {attempt}/{max_attempts} error={exc}")
        # Only pause when another attempt follows; sleeping after the final
        # failure would just delay the error below.
        if attempt < max_attempts:
            time.sleep(retry_delay_seconds)
    if last_response is not None:
        # Surface the HTTP error of the most recent response, if it has one.
        last_response.raise_for_status()
    raise RuntimeError("failed to fetch Costco GraphQL payload")
def summary_receipts(payload): def summary_receipts(payload):
@@ -279,6 +281,25 @@ def format_cli_date(value):
return f"{value.month}/{value.day:02d}/{value.year}" return f"{value.month}/{value.day:02d}/{value.year}"
def subtract_months(value, months):
    """Return *value* moved back by *months* calendar months.

    The day of month is clamped to the last valid day of the target month,
    so March 31 minus one month yields the last day of February. A negative
    *months* moves the date forward instead.

    Args:
        value: A ``date`` or ``datetime`` (anything with ``year``, ``month``,
            ``day`` and ``replace``).
        months: Whole number of months to subtract.

    Returns:
        An object of the same type as *value* with year, month and day
        adjusted.
    """
    # Use a zero-based absolute month index so divmod handles any number of
    # year rollovers, in either direction, without a normalisation loop.
    month_index = value.year * 12 + (value.month - 1) - months
    year, zero_based_month = divmod(month_index, 12)
    month = zero_based_month + 1
    # monthrange()[1] is the number of days in the target month.
    day = min(value.day, monthrange(year, month)[1])
    return value.replace(year=year, month=month, day=day)
def resolve_date_range(months_back, today=None):
    """Compute the start and end date strings for a look-back window.

    Args:
        months_back: Number of months to look back; must be at least 1.
        today: Optional end date. When omitted (or falsy) the current local
            date from ``datetime.now()`` is used.

    Returns:
        A ``(start, end)`` tuple of strings produced by ``format_cli_date``.

    Raises:
        click.ClickException: If *months_back* is less than 1.
    """
    if months_back < 1:
        raise click.ClickException("months-back must be at least 1")
    range_end = today if today else datetime.now().date()
    range_start = subtract_months(range_end, months_back)
    formatted_start = format_cli_date(range_start)
    formatted_end = format_cli_date(range_end)
    return formatted_start, formatted_end
def build_date_windows(start_date, end_date, window_days): def build_date_windows(start_date, end_date, window_days):
start = parse_cli_date(start_date) start = parse_cli_date(start_date)
end = parse_cli_date(end_date) end = parse_cli_date(end_date)
@@ -304,12 +325,20 @@ def build_date_windows(start_date, end_date, window_days):
def unique_receipts(receipts):
    """Return receipts de-duplicated by their ``receipt_key``.

    Receipts for which ``receipt_key`` returns an empty value are dropped.
    When several receipts share a key, the last one seen is kept, in the
    position where that key first appeared.
    """
    keyed_receipts = ((receipt_key(entry), entry) for entry in receipts)
    deduplicated = {key: entry for key, entry in keyed_receipts if key}
    return list(deduplicated.values())
def receipt_key(receipt):
    """Build the composite identity key for a receipt.

    The key is ``"<transactionBarcode>::<transactionDateTime>"``.

    Args:
        receipt: Mapping that may contain ``"transactionBarcode"`` and
            ``"transactionDateTime"``.

    Returns:
        The composite key, or ``""`` when the receipt has no barcode.
    """
    barcode = receipt.get("transactionBarcode", "")
    if not barcode:
        return ""
    transaction_date_time = receipt.get("transactionDateTime", "")
    if transaction_date_time is None:
        # An explicit null must produce the same key as a missing field;
        # otherwise the key would contain the literal text "None".
        transaction_date_time = ""
    return f"{barcode}::{transaction_date_time}"
def fetch_summary_windows( def fetch_summary_windows(
session, session,
start_date, start_date,
@@ -377,8 +406,9 @@ def fetch_summary_windows(
def flatten_costco_data(summary_payload, detail_payloads, raw_dir): def flatten_costco_data(summary_payload, detail_payloads, raw_dir):
summary_lookup = { summary_lookup = {
receipt["transactionBarcode"]: receipt receipt_key(receipt): receipt
for receipt in summary_receipts(summary_payload) for receipt in summary_receipts(summary_payload)
if receipt_key(receipt)
} }
orders = [] orders = []
items = [] items = []
@@ -386,13 +416,14 @@ def flatten_costco_data(summary_payload, detail_payloads, raw_dir):
for detail_payload in detail_payloads: for detail_payload in detail_payloads:
for receipt in detail_receipts(detail_payload): for receipt in detail_receipts(detail_payload):
order_id = receipt["transactionBarcode"] order_id = receipt["transactionBarcode"]
summary_row = summary_lookup.get(order_id, {}) receipt_id = receipt_key(receipt)
summary_row = summary_lookup.get(receipt_id, {})
coupon_numbers = { coupon_numbers = {
row.get("upcnumberCoupon", "") row.get("upcnumberCoupon", "")
for row in summary_row.get("couponArray", []) or [] for row in summary_row.get("couponArray", []) or []
if row.get("upcnumberCoupon") if row.get("upcnumberCoupon")
} }
raw_order_path = raw_dir / f"{order_id}.json" raw_order_path = raw_dir / f"{receipt_id or order_id}.json"
orders.append( orders.append(
{ {
@@ -510,8 +541,6 @@ def write_csv(path, rows, fieldnames):
@click.command() @click.command()
@click.option("--start-date", required=True, help="Start date like 1/01/2026.")
@click.option("--end-date", required=True, help="End date like 3/31/2026.")
@click.option( @click.option(
"--outdir", "--outdir",
default="costco_output", default="costco_output",
@@ -537,18 +566,23 @@ def write_csv(path, rows, fieldnames):
type=int, type=int,
help="Maximum number of days to request per summary window.", help="Maximum number of days to request per summary window.",
) )
def main(start_date, end_date, outdir, document_type, document_sub_type, window_days): @click.option(
config = load_config() "--months-back",
required = ["authorization", "client_id", "client_identifier"] default=3,
missing = [key for key in required if not config[key]] show_default=True,
if missing: type=int,
raise click.ClickException( help="How many months of receipts to enumerate back from today.",
f"missing Costco auth config: {', '.join(missing)}"
) )
def main(outdir, document_type, document_sub_type, window_days, months_back):
outdir = Path(outdir) outdir = Path(outdir)
raw_dir = outdir / "raw" raw_dir = outdir / "raw"
session = build_session(config) try:
session = build_session()
except Exception as exc:
raise click.ClickException(
f"failed to load Costco Firefox cookies: {exc}"
) from exc
start_date, end_date = resolve_date_range(months_back)
summary_payload, request_metadata = fetch_summary_windows( summary_payload, request_metadata = fetch_summary_windows(
session, session,
@@ -565,6 +599,7 @@ def main(start_date, end_date, outdir, document_type, document_sub_type, window_
detail_payloads = [] detail_payloads = []
for receipt in receipts: for receipt in receipts:
barcode = receipt["transactionBarcode"] barcode = receipt["transactionBarcode"]
receipt_id = receipt_key(receipt) or barcode
click.echo(f"fetching {barcode}") click.echo(f"fetching {barcode}")
detail_payload = graphql_post( detail_payload = graphql_post(
session, session,
@@ -572,7 +607,7 @@ def main(start_date, end_date, outdir, document_type, document_sub_type, window_
{"barcode": barcode, "documentType": "warehouse"}, {"barcode": barcode, "documentType": "warehouse"},
) )
detail_payloads.append(detail_payload) detail_payloads.append(detail_payload)
write_json(raw_dir / f"{barcode}.json", detail_payload) write_json(raw_dir / f"{receipt_id}.json", detail_payload)
orders, items = flatten_costco_data(summary_payload, detail_payloads, raw_dir) orders, items = flatten_costco_data(summary_payload, detail_payloads, raw_dir)
write_csv(outdir / "orders.csv", orders, ORDER_FIELDS) write_csv(outdir / "orders.csv", orders, ORDER_FIELDS)

View File

@@ -11,6 +11,14 @@ import validate_cross_retailer_flow
class CostcoPipelineTests(unittest.TestCase): class CostcoPipelineTests(unittest.TestCase):
def test_resolve_date_range_uses_months_back(self):
    """A three-month look-back from 3/16/2026 starts on 12/16/2025."""
    anchor_day = scrape_costco.parse_cli_date("3/16/2026")

    resolved = scrape_costco.resolve_date_range(3, today=anchor_day)
    start_date, end_date = resolved

    self.assertEqual("12/16/2025", start_date)
    self.assertEqual("3/16/2026", end_date)
def test_build_date_windows_splits_long_ranges(self): def test_build_date_windows_splits_long_ranges(self):
windows = scrape_costco.build_date_windows("1/01/2026", "6/30/2026", 92) windows = scrape_costco.build_date_windows("1/01/2026", "6/30/2026", 92)
@@ -160,6 +168,74 @@ class CostcoPipelineTests(unittest.TestCase):
self.assertEqual("true", items[1]["is_discount_line"]) self.assertEqual("true", items[1]["is_discount_line"])
self.assertEqual("true", items[1]["is_coupon_line"]) self.assertEqual("true", items[1]["is_coupon_line"])
def test_flatten_costco_data_uses_composite_summary_lookup_key(self):
    """Summary rows sharing a barcode are matched to details by timestamp.

    Two summary receipts carry the barcode "dup" with different
    transaction times; the single detail receipt must pick up the payment
    method and coupon data of the summary row with the same timestamp.
    """
    march_summary = {
        "transactionBarcode": "dup",
        "transactionDateTime": "2026-03-12T16:16:00",
        "tenderArray": [{"tenderDescription": "VISA"}],
        "couponArray": [{"upcnumberCoupon": "111"}],
    }
    february_summary = {
        "transactionBarcode": "dup",
        "transactionDateTime": "2026-02-14T16:25:00",
        "tenderArray": [{"tenderDescription": "MASTERCARD"}],
        "couponArray": [],
    }
    summary_payload = {
        "data": {
            "receiptsWithCounts": {
                "receipts": [march_summary, february_summary],
            }
        }
    }

    # A single negative-amount line whose item number matches the coupon
    # listed on the March summary receipt.
    coupon_line = {
        "itemNumber": "111",
        "itemDescription01": "/ 111",
        "itemDescription02": None,
        "itemDepartmentNumber": 14,
        "transDepartmentNumber": 14,
        "unit": -1,
        "itemIdentifier": None,
        "amount": -5,
        "itemUnitPriceAmount": 0,
    }
    march_detail = {
        "transactionBarcode": "dup",
        "transactionDateTime": "2026-03-12T16:16:00",
        "transactionDate": "2026-03-12",
        "receiptType": "In-Warehouse",
        "total": 10.0,
        "totalItemCount": 1,
        "instantSavings": 5.0,
        "warehouseName": "MT VERNON",
        "warehouseNumber": 1115,
        "warehouseAddress1": "7940 RICHMOND HWY",
        "warehouseCity": "ALEXANDRIA",
        "warehouseState": "VA",
        "warehousePostalCode": "22306",
        "itemArray": [coupon_line],
    }
    detail_payloads = [
        {"data": {"receiptsWithCounts": {"receipts": [march_detail]}}},
    ]

    raw_dir = Path("costco_output/raw")
    orders, items = scrape_costco.flatten_costco_data(
        summary_payload, detail_payloads, raw_dir
    )

    self.assertEqual("VISA", orders[0]["payment_method"])
    self.assertEqual("true", items[0]["is_coupon_line"])
    self.assertIn("dup::2026-03-12T16:16:00.json", items[0]["raw_order_path"])
def test_costco_enricher_parses_size_pack_and_discount(self): def test_costco_enricher_parses_size_pack_and_discount(self):
row = enrich_costco.parse_costco_item( row = enrich_costco.parse_costco_item(
order_id="abc", order_id="abc",
@@ -335,13 +411,6 @@ class CostcoPipelineTests(unittest.TestCase):
] ]
with mock.patch.object( with mock.patch.object(
scrape_costco, "load_config",
return_value={
"authorization": "token",
"client_id": "client",
"client_identifier": "identifier",
},
), mock.patch.object(
scrape_costco, "build_session", return_value=object() scrape_costco, "build_session", return_value=object()
), mock.patch.object( ), mock.patch.object(
scrape_costco, scrape_costco,
@@ -353,12 +422,11 @@ class CostcoPipelineTests(unittest.TestCase):
return_value=detail_payload, return_value=detail_payload,
): ):
scrape_costco.main.callback( scrape_costco.main.callback(
start_date="1/01/2026",
end_date="3/31/2026",
outdir=str(outdir), outdir=str(outdir),
document_type="all", document_type="all",
document_sub_type="all", document_sub_type="all",
window_days=92, window_days=92,
months_back=3,
) )
metadata_path = outdir / "raw" / "summary_requests.json" metadata_path = outdir / "raw" / "summary_requests.json"