Align Costco scraper with browser session flow
This commit is contained in:
113
scrape_costco.py
113
scrape_costco.py
@@ -1,12 +1,13 @@
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from calendar import monthrange
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
from dotenv import load_dotenv
|
||||
|
||||
import browser_cookie3
|
||||
from curl_cffi import requests
|
||||
|
||||
BASE_URL = "https://ecom-api.costco.com/ebusiness/order/v1/orders/graphql"
|
||||
RETAILER = "costco"
|
||||
@@ -207,24 +208,12 @@ ITEM_FIELDS = [
|
||||
]
|
||||
|
||||
|
||||
def load_config():
    """Read the Costco auth settings from the environment.

    Calls ``load_dotenv()`` first, then looks up three environment
    variables.

    Returns:
        dict: ``authorization``, ``client_id`` and ``client_identifier``,
        each a whitespace-stripped string; a variable that is not set
        yields ``""``.
    """
    load_dotenv()
    env_names = {
        "authorization": "COSTCO_X_AUTHORIZATION",
        "client_id": "COSTCO_WCS_CLIENT_ID",
        "client_identifier": "COSTCO_CLIENT_IDENTIFIER",
    }
    return {
        field: os.getenv(env_name, "").strip()
        for field, env_name in env_names.items()
    }
|
||||
|
||||
|
||||
def build_headers(config):
|
||||
def build_headers():
|
||||
return {
|
||||
"accept": "*/*",
|
||||
"content-type": "application/json-patch+json",
|
||||
"costco.service": "restOrders",
|
||||
"costco.env": "ecom",
|
||||
"costco-x-authorization": config["authorization"],
|
||||
"costco-x-wcs-clientId": config["client_id"],
|
||||
"client-identifier": config["client_identifier"],
|
||||
"origin": "https://www.costco.com",
|
||||
"referer": "https://www.costco.com/",
|
||||
"user-agent": (
|
||||
@@ -234,23 +223,36 @@ def build_headers(config):
|
||||
}
|
||||
|
||||
|
||||
def build_session():
    """Create an HTTP session that reuses the Costco cookies stored in Firefox.

    The session comes from the module-level ``requests`` import. Cookies
    are loaded with ``browser_cookie3.firefox`` restricted to the
    ``costco.com`` domain, and the static request headers from
    ``build_headers()`` are applied on top.

    Returns:
        The configured session object.

    Raises:
        Whatever ``browser_cookie3.firefox`` raises when the cookies cannot
        be loaded; the function does not catch it.
    """
    session = requests.Session()
    session.cookies.update(browser_cookie3.firefox(domain_name="costco.com"))
    session.headers.update(build_headers())
    return session
|
||||
|
||||
|
||||
def graphql_post(session, query, variables, *, attempts=3, retry_delay=3):
    """POST a GraphQL query to ``BASE_URL`` and return the decoded JSON body.

    Each attempt that raises, or that answers with a status other than 200,
    is reported through ``click.echo`` and retried after ``retry_delay``
    seconds.

    Args:
        session: Object exposing ``post(url, json=..., impersonate=...,
            timeout=...)``.
        query: GraphQL query text, sent as the ``query`` field.
        variables: Mapping sent as the ``variables`` field.
        attempts: Maximum number of requests to make.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The result of ``response.json()`` for the first 200 response.

    Raises:
        RuntimeError: If no attempt produced a 200 response. The exception
            from the last failing attempt, if any, is attached as
            ``__cause__``.
    """
    body = {"query": query, "variables": variables}
    last_error = None

    for attempt in range(1, attempts + 1):
        try:
            response = session.post(
                BASE_URL,
                json=body,
                impersonate="firefox",
                timeout=30,
            )
            response.raise_for_status()
            if response.status_code == 200:
                return response.json()
            # A non-200 success status is not an exception; forget any
            # earlier error so the final cause reflects the latest attempt.
            last_error = None
            click.echo(f"retry {attempt}/{attempts} status={response.status_code}")
        except Exception as exc:  # pragma: no cover - network error path
            # Deliberately broad: any failure (transport, HTTP status, JSON
            # decoding) is treated as retryable.
            last_error = exc
            click.echo(f"retry {attempt}/{attempts} error={exc}")
        # Nothing follows the final attempt, so waiting would only delay
        # the error.
        if attempt < attempts:
            time.sleep(retry_delay)

    raise RuntimeError("failed to fetch Costco GraphQL payload") from last_error
|
||||
|
||||
|
||||
def summary_receipts(payload):
|
||||
@@ -279,6 +281,25 @@ def format_cli_date(value):
|
||||
return f"{value.month}/{value.day:02d}/{value.year}"
|
||||
|
||||
|
||||
def subtract_months(value, months):
    """Return ``value`` moved back by ``months`` calendar months.

    The day of month is clamped to the last day of the target month, so
    31 March minus one month gives the last day of February.

    Args:
        value: A ``date``/``datetime``-like object with ``year``, ``month``,
            ``day`` and ``replace``.
        months: Whole months to go back. Zero returns an equal value and a
            negative number moves forward.

    Returns:
        A new object of the same type as ``value``.

    Raises:
        ValueError: From ``value.replace`` if the resulting year is outside
            the supported range.
    """
    # Count months from year 0 so one divmod normalizes any offset, in
    # either direction, without looping.
    absolute_month = value.year * 12 + (value.month - 1) - months
    year, month_index = divmod(absolute_month, 12)
    month = month_index + 1
    day = min(value.day, monthrange(year, month)[1])
    return value.replace(year=year, month=month, day=day)
|
||||
|
||||
|
||||
def resolve_date_range(months_back, today=None):
    """Build the (start, end) CLI date strings for a look-back window.

    Args:
        months_back: Number of months to look back; must be at least 1.
        today: End of the window. When falsy, the current local date is
            used.

    Returns:
        tuple: ``(start, end)``, each formatted by ``format_cli_date``.

    Raises:
        click.ClickException: If ``months_back`` is less than 1.
    """
    if months_back < 1:
        raise click.ClickException("months-back must be at least 1")

    range_end = today if today else datetime.now().date()
    range_start = subtract_months(range_end, months_back)
    return tuple(format_cli_date(day) for day in (range_start, range_end))
|
||||
|
||||
|
||||
def build_date_windows(start_date, end_date, window_days):
|
||||
start = parse_cli_date(start_date)
|
||||
end = parse_cli_date(end_date)
|
||||
@@ -304,12 +325,20 @@ def build_date_windows(start_date, end_date, window_days):
|
||||
def unique_receipts(receipts):
    """Drop duplicate receipts, identified by their composite receipt key.

    Receipts are keyed by ``receipt_key`` (barcode plus transaction
    date-time), so two receipts sharing a barcode but not a date-time are
    both kept. When a key repeats, the later receipt replaces the earlier
    one while keeping the earlier one's position. Receipts with an empty
    key are omitted.

    Args:
        receipts: Iterable of receipt dicts.

    Returns:
        list: One receipt per distinct key.
    """
    by_key = {}
    for receipt in receipts:
        key = receipt_key(receipt)
        if key:
            by_key[key] = receipt
    return list(by_key.values())
|
||||
|
||||
|
||||
def receipt_key(receipt):
    """Return the ``"<barcode>::<transactionDateTime>"`` key for a receipt.

    Args:
        receipt: Mapping that may hold ``transactionBarcode`` and
            ``transactionDateTime``.

    Returns:
        str: The composite key, or ``""`` when the barcode is missing or
        falsy. A missing date-time leaves the part after ``::`` empty.
    """
    code = receipt.get("transactionBarcode", "")
    stamp = receipt.get("transactionDateTime", "")
    if code:
        return f"{code}::{stamp}"
    return ""
|
||||
|
||||
|
||||
def fetch_summary_windows(
|
||||
session,
|
||||
start_date,
|
||||
@@ -377,8 +406,9 @@ def fetch_summary_windows(
|
||||
|
||||
def flatten_costco_data(summary_payload, detail_payloads, raw_dir):
|
||||
summary_lookup = {
|
||||
receipt["transactionBarcode"]: receipt
|
||||
receipt_key(receipt): receipt
|
||||
for receipt in summary_receipts(summary_payload)
|
||||
if receipt_key(receipt)
|
||||
}
|
||||
orders = []
|
||||
items = []
|
||||
@@ -386,13 +416,14 @@ def flatten_costco_data(summary_payload, detail_payloads, raw_dir):
|
||||
for detail_payload in detail_payloads:
|
||||
for receipt in detail_receipts(detail_payload):
|
||||
order_id = receipt["transactionBarcode"]
|
||||
summary_row = summary_lookup.get(order_id, {})
|
||||
receipt_id = receipt_key(receipt)
|
||||
summary_row = summary_lookup.get(receipt_id, {})
|
||||
coupon_numbers = {
|
||||
row.get("upcnumberCoupon", "")
|
||||
for row in summary_row.get("couponArray", []) or []
|
||||
if row.get("upcnumberCoupon")
|
||||
}
|
||||
raw_order_path = raw_dir / f"{order_id}.json"
|
||||
raw_order_path = raw_dir / f"{receipt_id or order_id}.json"
|
||||
|
||||
orders.append(
|
||||
{
|
||||
@@ -510,8 +541,6 @@ def write_csv(path, rows, fieldnames):
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option("--start-date", required=True, help="Start date like 1/01/2026.")
|
||||
@click.option("--end-date", required=True, help="End date like 3/31/2026.")
|
||||
@click.option(
|
||||
"--outdir",
|
||||
default="costco_output",
|
||||
@@ -537,18 +566,23 @@ def write_csv(path, rows, fieldnames):
|
||||
type=int,
|
||||
help="Maximum number of days to request per summary window.",
|
||||
)
|
||||
def main(start_date, end_date, outdir, document_type, document_sub_type, window_days):
|
||||
config = load_config()
|
||||
required = ["authorization", "client_id", "client_identifier"]
|
||||
missing = [key for key in required if not config[key]]
|
||||
if missing:
|
||||
raise click.ClickException(
|
||||
f"missing Costco auth config: {', '.join(missing)}"
|
||||
@click.option(
|
||||
"--months-back",
|
||||
default=3,
|
||||
show_default=True,
|
||||
type=int,
|
||||
help="How many months of receipts to enumerate back from today.",
|
||||
)
|
||||
|
||||
def main(outdir, document_type, document_sub_type, window_days, months_back):
|
||||
outdir = Path(outdir)
|
||||
raw_dir = outdir / "raw"
|
||||
session = build_session(config)
|
||||
try:
|
||||
session = build_session()
|
||||
except Exception as exc:
|
||||
raise click.ClickException(
|
||||
f"failed to load Costco Firefox cookies: {exc}"
|
||||
) from exc
|
||||
start_date, end_date = resolve_date_range(months_back)
|
||||
|
||||
summary_payload, request_metadata = fetch_summary_windows(
|
||||
session,
|
||||
@@ -565,6 +599,7 @@ def main(start_date, end_date, outdir, document_type, document_sub_type, window_
|
||||
detail_payloads = []
|
||||
for receipt in receipts:
|
||||
barcode = receipt["transactionBarcode"]
|
||||
receipt_id = receipt_key(receipt) or barcode
|
||||
click.echo(f"fetching {barcode}")
|
||||
detail_payload = graphql_post(
|
||||
session,
|
||||
@@ -572,7 +607,7 @@ def main(start_date, end_date, outdir, document_type, document_sub_type, window_
|
||||
{"barcode": barcode, "documentType": "warehouse"},
|
||||
)
|
||||
detail_payloads.append(detail_payload)
|
||||
write_json(raw_dir / f"{barcode}.json", detail_payload)
|
||||
write_json(raw_dir / f"{receipt_id}.json", detail_payload)
|
||||
|
||||
orders, items = flatten_costco_data(summary_payload, detail_payloads, raw_dir)
|
||||
write_csv(outdir / "orders.csv", orders, ORDER_FIELDS)
|
||||
|
||||
@@ -11,6 +11,14 @@ import validate_cross_retailer_flow
|
||||
|
||||
|
||||
class CostcoPipelineTests(unittest.TestCase):
|
||||
def test_resolve_date_range_uses_months_back(self):
    """Three months back from 3/16/2026 starts on 12/16/2025."""
    anchor = scrape_costco.parse_cli_date("3/16/2026")

    start_date, end_date = scrape_costco.resolve_date_range(3, today=anchor)

    self.assertEqual("12/16/2025", start_date)
    self.assertEqual("3/16/2026", end_date)
|
||||
|
||||
def test_build_date_windows_splits_long_ranges(self):
|
||||
windows = scrape_costco.build_date_windows("1/01/2026", "6/30/2026", 92)
|
||||
|
||||
@@ -160,6 +168,74 @@ class CostcoPipelineTests(unittest.TestCase):
|
||||
self.assertEqual("true", items[1]["is_discount_line"])
|
||||
self.assertEqual("true", items[1]["is_coupon_line"])
|
||||
|
||||
def test_flatten_costco_data_uses_composite_summary_lookup_key(self):
    """Summaries sharing a barcode are matched to details by date-time.

    Two summary receipts carry the barcode ``"dup"``. The single detail
    receipt has the March date-time, so the flattened rows must take the
    tender and coupon data from the March summary, not the February one.
    """
    march_summary = {
        "transactionBarcode": "dup",
        "transactionDateTime": "2026-03-12T16:16:00",
        "tenderArray": [{"tenderDescription": "VISA"}],
        "couponArray": [{"upcnumberCoupon": "111"}],
    }
    february_summary = {
        "transactionBarcode": "dup",
        "transactionDateTime": "2026-02-14T16:25:00",
        "tenderArray": [{"tenderDescription": "MASTERCARD"}],
        "couponArray": [],
    }
    # March is listed first, so a lookup keyed on barcode alone would let
    # the February entry replace it.
    summary_payload = {
        "data": {
            "receiptsWithCounts": {
                "receipts": [march_summary, february_summary],
            }
        }
    }

    coupon_line = {
        "itemNumber": "111",
        "itemDescription01": "/ 111",
        "itemDescription02": None,
        "itemDepartmentNumber": 14,
        "transDepartmentNumber": 14,
        "unit": -1,
        "itemIdentifier": None,
        "amount": -5,
        "itemUnitPriceAmount": 0,
    }
    march_detail = {
        "transactionBarcode": "dup",
        "transactionDateTime": "2026-03-12T16:16:00",
        "transactionDate": "2026-03-12",
        "receiptType": "In-Warehouse",
        "total": 10.0,
        "totalItemCount": 1,
        "instantSavings": 5.0,
        "warehouseName": "MT VERNON",
        "warehouseNumber": 1115,
        "warehouseAddress1": "7940 RICHMOND HWY",
        "warehouseCity": "ALEXANDRIA",
        "warehouseState": "VA",
        "warehousePostalCode": "22306",
        "itemArray": [coupon_line],
    }
    detail_payloads = [
        {"data": {"receiptsWithCounts": {"receipts": [march_detail]}}},
    ]

    orders, items = scrape_costco.flatten_costco_data(
        summary_payload, detail_payloads, Path("costco_output/raw")
    )

    first_order = orders[0]
    first_item = items[0]
    self.assertEqual("VISA", first_order["payment_method"])
    self.assertEqual("true", first_item["is_coupon_line"])
    self.assertIn("dup::2026-03-12T16:16:00.json", first_item["raw_order_path"])
|
||||
|
||||
def test_costco_enricher_parses_size_pack_and_discount(self):
|
||||
row = enrich_costco.parse_costco_item(
|
||||
order_id="abc",
|
||||
@@ -335,13 +411,6 @@ class CostcoPipelineTests(unittest.TestCase):
|
||||
]
|
||||
|
||||
with mock.patch.object(
|
||||
scrape_costco, "load_config",
|
||||
return_value={
|
||||
"authorization": "token",
|
||||
"client_id": "client",
|
||||
"client_identifier": "identifier",
|
||||
},
|
||||
), mock.patch.object(
|
||||
scrape_costco, "build_session", return_value=object()
|
||||
), mock.patch.object(
|
||||
scrape_costco,
|
||||
@@ -353,12 +422,11 @@ class CostcoPipelineTests(unittest.TestCase):
|
||||
return_value=detail_payload,
|
||||
):
|
||||
scrape_costco.main.callback(
|
||||
start_date="1/01/2026",
|
||||
end_date="3/31/2026",
|
||||
outdir=str(outdir),
|
||||
document_type="all",
|
||||
document_sub_type="all",
|
||||
window_days=92,
|
||||
months_back=3,
|
||||
)
|
||||
|
||||
metadata_path = outdir / "raw" / "summary_requests.json"
|
||||
|
||||
Reference in New Issue
Block a user