From e48dd6c4c2aeb9680e23e2e05891579a7e042452 Mon Sep 17 00:00:00 2001 From: eulaly Date: Mon, 16 Mar 2026 16:59:31 -0400 Subject: [PATCH] troubleshooting costco header extraction --- pm/scrape-giant.org | 8 ++++++ retailer_sessions.py | 61 ++++++++++++++++++++++++++++++++++---------- scrape_costco.py | 49 ++++++++++++++++++++--------------- 3 files changed, 83 insertions(+), 35 deletions(-) diff --git a/pm/scrape-giant.org b/pm/scrape-giant.org index 4275d3d..6617d23 100644 --- a/pm/scrape-giant.org +++ b/pm/scrape-giant.org @@ -125,6 +125,14 @@ request-context: appId=cid-v1:75750625-0c81-4f08-9f5d-ce4f73198e54 X-Firefox-Spdy: h2 * costco requests +- localStorage idToken has the auth token, but needs "Bearer " prepended +- localStorage clientID has the COSTCO_X_WCS_CLIENTID +- I don't see the client_identifier uuid anywhere. + +we will pull from .env first (may have to hardcode) +then overwrite with session data (token) +hopefully this doesn't change. + ** warehouse *** POST https://ecom-api.costco.com/ebusiness/order/v1/orders/graphql diff --git a/retailer_sessions.py b/retailer_sessions.py index 56aa8bb..6f5aad7 100644 --- a/retailer_sessions.py +++ b/retailer_sessions.py @@ -1,4 +1,6 @@ +import os from dataclasses import dataclass +from dotenv import load_dotenv from browser_session import ( find_json_storage_value, @@ -32,29 +34,60 @@ def load_giant_session(browser="firefox", profile_dir=None): ) return RetailerSession(cookies=context.cookies, headers={}) - def load_costco_session(browser="firefox", profile_dir=None): + load_dotenv() + + headers = { + "costco-x-authorization": os.getenv("COSTCO_X_AUTHORIZATION", "").strip(), + "costco-x-wcs-clientId": os.getenv("COSTCO_WCS_CLIENT_ID", "").strip(), + "client-identifier": os.getenv("COSTCO_CLIENT_IDENTIFIER", "").strip(), + } + context = load_browser_context( browser=browser, domain_name=".costco.com", - storage_origins=COSTCO_STORAGE_ORIGINS, + storage_origins=["costco.com"], profile_dir=profile_dir, ) - headers 
= extract_costco_headers(context.storage_entries) - missing = [ - header_name for header_name, value in headers.items() if not value - ] - if missing: - available_keys = ", ".join( - list_storage_keys(context.storage_entries, COSTCO_STORAGE_ORIGINS) - ) - raise ValueError( - "missing Costco browser session headers: " - f"{', '.join(missing)}. " - f"Available Costco storage keys: {available_keys or '(none)'}" + + storage = {entry.key: entry.value for entry in context.storage_entries} + + id_token = storage.get("idToken", "").strip() + client_id = storage.get("clientID", "").strip() + + if id_token: + headers["costco-x-authorization"] = ( + id_token if id_token.startswith("Bearer ") else f"Bearer {id_token}" ) + if client_id: + headers["costco-x-wcs-clientId"] = client_id + + headers = {k: v for k, v in headers.items() if v} + return RetailerSession(cookies=context.cookies, headers=headers) +#def load_costco_session(browser="firefox", profile_dir=None): +# context = load_browser_context( +# browser=browser, +# domain_name=".costco.com", +# storage_origins=COSTCO_STORAGE_ORIGINS, +# profile_dir=profile_dir, +# ) +# headers = extract_costco_headers(context.storage_entries) +# missing = [ +# header_name for header_name, value in headers.items() if not value +# ] +# if missing: +# available_keys = ", ".join( +# list_storage_keys(context.storage_entries, COSTCO_STORAGE_ORIGINS) +# ) +# raise ValueError( +# "missing Costco browser session headers: " +# f"{', '.join(missing)}. 
" +# f"Available Costco storage keys: {available_keys or '(none)'}" +# ) +# return RetailerSession(cookies=context.cookies, headers=headers) + def extract_costco_headers(storage_entries): headers = {} diff --git a/scrape_costco.py b/scrape_costco.py index 1beb062..ac58310 100644 --- a/scrape_costco.py +++ b/scrape_costco.py @@ -1,3 +1,4 @@ +import os import csv import json import time @@ -5,7 +6,7 @@ import re from calendar import monthrange from datetime import datetime, timedelta from pathlib import Path - +from dotenv import load_dotenv import click from curl_cffi import requests @@ -23,7 +24,7 @@ query receiptsWithCounts($startDate: String!, $endDate: String!, $documentType: gasAndCarWash receipts { warehouseName - receiptType + receiptType documentType transactionDateTime transactionBarcode @@ -225,11 +226,11 @@ def build_headers(auth_headers): headers.update(auth_headers) return headers -def build_session(profile_dir=None): - retailer_session = load_costco_session(profile_dir=profile_dir) +def build_session(retailer_session): session = requests.Session() session.cookies.update(retailer_session.cookies) - session.headers.update(build_headers(retailer_session.headers)) + session.headers.update(build_headers()) + session.headers.update(retailer_session.headers) return session @@ -593,23 +594,28 @@ def main( ): outdir = Path(outdir) raw_dir = outdir / "raw" - try: - session = build_session(profile_dir=firefox_profile_dir) - except Exception as exc: - if firefox_profile_dir: - raise click.ClickException( - f"failed to load Costco browser session: {exc}" - ) from exc - prompted_profile = click.prompt( - "Firefox profile dir", - type=click.Path(exists=True, file_okay=False, path_type=Path), + if firefox_profile_dir is None: + firefox_profile_dir = next( + (Path(os.getenv("APPDATA")) / "Mozilla" / "Firefox" / "Profiles").iterdir() ) - try: - session = build_session(profile_dir=prompted_profile) - except Exception as prompt_exc: - raise click.ClickException( - 
f"failed to load Costco browser session: {prompt_exc}" - ) from prompt_exc + try: + retailer_session = load_costco_session( + browser="firefox", + profile_dir=firefox_profile_dir, + ) + click.echo( + "session bootstrap: " + f"cookies={bool(retailer_session.cookies)}, " + f"authorization={'costco-x-authorization' in retailer_session.headers}, " + f"client_id={'costco-x-wcs-clientId' in retailer_session.headers}, " + f"client_identifier={'client-identifier' in retailer_session.headers}" + ) + session = build_session(retailer_session) + except Exception as exc: + raise click.ClickException( + f"failed to load Costco browser session: {exc}" + ) from exc + start_date, end_date = resolve_date_range(months_back) summary_payload, request_metadata = fetch_summary_windows( @@ -645,4 +651,5 @@ def main( if __name__ == "__main__": main() +