From c0054dc51e74555ef41bb3cb3239e3b5aab48221 Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 16 Mar 2026 12:28:19 -0400 Subject: [PATCH] Align Costco scraper with browser session flow --- scrape_costco.py | 129 +++++++++++++++++++++------------- tests/test_costco_pipeline.py | 86 ++++++++++++++++++++--- 2 files changed, 159 insertions(+), 56 deletions(-) diff --git a/scrape_costco.py b/scrape_costco.py index 0c36d9b..1f607b4 100644 --- a/scrape_costco.py +++ b/scrape_costco.py @@ -1,12 +1,13 @@ import csv import json -import os +import time +from calendar import monthrange from datetime import datetime, timedelta from pathlib import Path import click -from dotenv import load_dotenv - +import browser_cookie3 +from curl_cffi import requests BASE_URL = "https://ecom-api.costco.com/ebusiness/order/v1/orders/graphql" RETAILER = "costco" @@ -207,24 +208,12 @@ ITEM_FIELDS = [ ] -def load_config(): - load_dotenv() - return { - "authorization": os.getenv("COSTCO_X_AUTHORIZATION", "").strip(), - "client_id": os.getenv("COSTCO_WCS_CLIENT_ID", "").strip(), - "client_identifier": os.getenv("COSTCO_CLIENT_IDENTIFIER", "").strip(), - } - - -def build_headers(config): +def build_headers(): return { "accept": "*/*", "content-type": "application/json-patch+json", "costco.service": "restOrders", "costco.env": "ecom", - "costco-x-authorization": config["authorization"], - "costco-x-wcs-clientId": config["client_id"], - "client-identifier": config["client_identifier"], "origin": "https://www.costco.com", "referer": "https://www.costco.com/", "user-agent": ( @@ -234,23 +223,36 @@ def build_headers(config): } -def build_session(config): - from curl_cffi import requests - +def build_session(): session = requests.Session() - session.headers.update(build_headers(config)) + session.cookies.update(browser_cookie3.firefox(domain_name="costco.com")) + session.headers.update(build_headers()) return session def graphql_post(session, query, variables): - response = session.post( - BASE_URL, - json={"query": query, "variables": variables}, - impersonate="firefox", - timeout=30, - ) - response.raise_for_status() - return response.json() + last_response = None + + for attempt in range(3): + try: + response = session.post( + BASE_URL, + json={"query": query, "variables": variables}, + impersonate="firefox", + timeout=30, + ) + last_response = response + if response.status_code == 200: + return response.json() + click.echo(f"retry {attempt + 1}/3 status={response.status_code}") + except Exception as exc: # pragma: no cover - network error path + click.echo(f"retry {attempt + 1}/3 error={exc}") + time.sleep(3) + + if last_response is not None: + last_response.raise_for_status() + + raise RuntimeError("failed to fetch Costco GraphQL payload") def summary_receipts(payload): @@ -279,6 +281,25 @@ def format_cli_date(value): return f"{value.month}/{value.day:02d}/{value.year}" +def subtract_months(value, months): + year = value.year + month = value.month - months + while month <= 0: + month += 12 + year -= 1 + day = min(value.day, monthrange(year, month)[1]) + return value.replace(year=year, month=month, day=day) + + +def resolve_date_range(months_back, today=None): + if months_back < 1: + raise click.ClickException("months-back must be at least 1") + + end = today or datetime.now().date() + start = subtract_months(end, months_back) + return format_cli_date(start), format_cli_date(end) + + def build_date_windows(start_date, end_date, window_days): start = parse_cli_date(start_date) end = parse_cli_date(end_date) @@ -304,12 +325,20 @@ def build_date_windows(start_date, end_date, window_days): def unique_receipts(receipts): by_barcode = {} for receipt in receipts: - barcode = receipt.get("transactionBarcode") - if barcode: - by_barcode[barcode] = receipt + key = receipt_key(receipt) + if key: + by_barcode[key] = receipt return list(by_barcode.values()) +def receipt_key(receipt): + barcode = receipt.get("transactionBarcode", "") + transaction_date_time = receipt.get("transactionDateTime", "") + if not barcode: + return "" + return f"{barcode}::{transaction_date_time}" + + def fetch_summary_windows( session, start_date, @@ -377,8 +406,9 @@ def fetch_summary_windows( def flatten_costco_data(summary_payload, detail_payloads, raw_dir): summary_lookup = { - receipt["transactionBarcode"]: receipt + receipt_key(receipt): receipt for receipt in summary_receipts(summary_payload) + if receipt_key(receipt) } orders = [] items = [] @@ -386,13 +416,14 @@ def flatten_costco_data(summary_payload, detail_payloads, raw_dir): for detail_payload in detail_payloads: for receipt in detail_receipts(detail_payload): order_id = receipt["transactionBarcode"] - summary_row = summary_lookup.get(order_id, {}) + receipt_id = receipt_key(receipt) + summary_row = summary_lookup.get(receipt_id, {}) coupon_numbers = { row.get("upcnumberCoupon", "") for row in summary_row.get("couponArray", []) or [] if row.get("upcnumberCoupon") } - raw_order_path = raw_dir / f"{order_id}.json" + raw_order_path = raw_dir / f"{receipt_id or order_id}.json" orders.append( { @@ -510,8 +541,6 @@ def write_csv(path, rows, fieldnames): @click.command() -@click.option("--start-date", required=True, help="Start date like 1/01/2026.") -@click.option("--end-date", required=True, help="End date like 3/31/2026.") @click.option( "--outdir", default="costco_output", @@ -537,18 +566,23 @@ def write_csv(path, rows, fieldnames): type=int, help="Maximum number of days to request per summary window.", ) -def main(start_date, end_date, outdir, document_type, document_sub_type, window_days): - config = load_config() - required = ["authorization", "client_id", "client_identifier"] - missing = [key for key in required if not config[key]] - if missing: - raise click.ClickException( - f"missing Costco auth config: {', '.join(missing)}" - ) - +@click.option( + "--months-back", + default=3, + show_default=True, + type=int, + help="How many months of receipts to enumerate back from today.", +) +def main(outdir, document_type, document_sub_type, window_days, months_back): outdir = Path(outdir) raw_dir = outdir / "raw" - session = build_session(config) + try: + session = build_session() + except Exception as exc: + raise click.ClickException( + f"failed to load Costco Firefox cookies: {exc}" + ) from exc + start_date, end_date = resolve_date_range(months_back) summary_payload, request_metadata = fetch_summary_windows( session, @@ -565,6 +599,7 @@ def main(start_date, end_date, outdir, document_type, document_sub_type, window_ detail_payloads = [] for receipt in receipts: barcode = receipt["transactionBarcode"] + receipt_id = receipt_key(receipt) or barcode click.echo(f"fetching {barcode}") detail_payload = graphql_post( session, @@ -572,7 +607,7 @@ def main(start_date, end_date, outdir, document_type, document_sub_type, window_ {"barcode": barcode, "documentType": "warehouse"}, ) detail_payloads.append(detail_payload) - write_json(raw_dir / f"{barcode}.json", detail_payload) + write_json(raw_dir / f"{receipt_id}.json", detail_payload) orders, items = flatten_costco_data(summary_payload, detail_payloads, raw_dir) write_csv(outdir / "orders.csv", orders, ORDER_FIELDS) diff --git a/tests/test_costco_pipeline.py b/tests/test_costco_pipeline.py index 60b0738..5137f2c 100644 --- a/tests/test_costco_pipeline.py +++ b/tests/test_costco_pipeline.py @@ -11,6 +11,14 @@ import validate_cross_retailer_flow class CostcoPipelineTests(unittest.TestCase): + def test_resolve_date_range_uses_months_back(self): + start_date, end_date = scrape_costco.resolve_date_range( + 3, today=scrape_costco.parse_cli_date("3/16/2026") + ) + + self.assertEqual("12/16/2025", start_date) + self.assertEqual("3/16/2026", end_date) + def test_build_date_windows_splits_long_ranges(self): windows = scrape_costco.build_date_windows("1/01/2026", "6/30/2026", 92) @@ -160,6 +168,74 @@ class CostcoPipelineTests(unittest.TestCase): self.assertEqual("true", items[1]["is_discount_line"]) self.assertEqual("true", items[1]["is_coupon_line"]) + def test_flatten_costco_data_uses_composite_summary_lookup_key(self): + summary_payload = { + "data": { + "receiptsWithCounts": { + "receipts": [ + { + "transactionBarcode": "dup", + "transactionDateTime": "2026-03-12T16:16:00", + "tenderArray": [{"tenderDescription": "VISA"}], + "couponArray": [{"upcnumberCoupon": "111"}], + }, + { + "transactionBarcode": "dup", + "transactionDateTime": "2026-02-14T16:25:00", + "tenderArray": [{"tenderDescription": "MASTERCARD"}], + "couponArray": [], + }, + ] + } + } + } + detail_payloads = [ + { + "data": { + "receiptsWithCounts": { + "receipts": [ + { + "transactionBarcode": "dup", + "transactionDateTime": "2026-03-12T16:16:00", + "transactionDate": "2026-03-12", + "receiptType": "In-Warehouse", + "total": 10.0, + "totalItemCount": 1, + "instantSavings": 5.0, + "warehouseName": "MT VERNON", + "warehouseNumber": 1115, + "warehouseAddress1": "7940 RICHMOND HWY", + "warehouseCity": "ALEXANDRIA", + "warehouseState": "VA", + "warehousePostalCode": "22306", + "itemArray": [ + { + "itemNumber": "111", + "itemDescription01": "/ 111", + "itemDescription02": None, + "itemDepartmentNumber": 14, + "transDepartmentNumber": 14, + "unit": -1, + "itemIdentifier": None, + "amount": -5, + "itemUnitPriceAmount": 0, + } + ], + } + ] + } + } + } + ] + + orders, items = scrape_costco.flatten_costco_data( + summary_payload, detail_payloads, Path("costco_output/raw") + ) + + self.assertEqual("VISA", orders[0]["payment_method"]) + self.assertEqual("true", items[0]["is_coupon_line"]) + self.assertIn("dup::2026-03-12T16:16:00.json", items[0]["raw_order_path"]) + def test_costco_enricher_parses_size_pack_and_discount(self): row = enrich_costco.parse_costco_item( order_id="abc", @@ -335,13 +411,6 @@ class CostcoPipelineTests(unittest.TestCase): ] with mock.patch.object( - scrape_costco, "load_config", - return_value={ - "authorization": "token", - "client_id": "client", - "client_identifier": "identifier", - }, - ), mock.patch.object( scrape_costco, "build_session", return_value=object() ), mock.patch.object( scrape_costco, @@ -353,12 +422,11 @@ class CostcoPipelineTests(unittest.TestCase): return_value=detail_payload, ): scrape_costco.main.callback( - start_date="1/01/2026", - end_date="3/31/2026", outdir=str(outdir), document_type="all", document_sub_type="all", window_days=92, + months_back=3, ) metadata_path = outdir / "raw" / "summary_requests.json"