From d7a0329332d055feeb39b853d628860031c3b8f2 Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 16 Mar 2026 17:08:44 -0400 Subject: [PATCH] Simplify browser session bootstrap --- browser_session.py | 140 ++++------------------------------ retailer_sessions.py | 116 ---------------------------- scrape_costco.py | 107 +++++++++++++++++++------- scrape_giant.py | 6 +- tests/test_browser_session.py | 120 ++++++++++++++++------------- tests/test_costco_pipeline.py | 19 +++++ 6 files changed, 184 insertions(+), 324 deletions(-) delete mode 100644 retailer_sessions.py diff --git a/browser_session.py b/browser_session.py index fd4c415..247b750 100644 --- a/browser_session.py +++ b/browser_session.py @@ -1,47 +1,13 @@ import configparser -import json import os import shutil import sqlite3 import tempfile -from dataclasses import dataclass from pathlib import Path import browser_cookie3 -@dataclass -class StorageEntry: - origin: str - key: str - value: str - source: str - - -@dataclass -class BrowserContext: - cookies: object - storage_entries: list[StorageEntry] - - -def load_browser_context( - browser, - domain_name, - storage_origins=None, - profile_dir=None, -): - if browser != "firefox": - raise ValueError(f"unsupported browser: {browser}") - - profile = Path(profile_dir) if profile_dir else find_firefox_profile_dir() - cookies = load_firefox_cookies(domain_name, profile) - storage_entries = read_firefox_storage_entries( - profile, - origin_filters=storage_origins or [], - ) - return BrowserContext(cookies=cookies, storage_entries=storage_entries) - - def find_firefox_profile_dir(): profiles_ini = firefox_profiles_root() / "profiles.ini" parser = configparser.RawConfigParser() @@ -88,106 +54,37 @@ def load_firefox_cookies(domain_name, profile_dir): return browser_cookie3.firefox(cookie_file=str(cookie_file), domain_name=domain_name) -def read_firefox_storage_entries(profile_dir, origin_filters): - profile_dir = Path(profile_dir) - entries = [] - entries.extend(read_firefox_ls_entries(profile_dir, origin_filters)) - entries.extend(read_firefox_webapps_entries(profile_dir, origin_filters)) - - deduped = [] - seen = set() - for entry in entries: - key = (entry.origin, entry.key, entry.value, entry.source) - if key in seen: - continue - seen.add(key) - deduped.append(entry) - return deduped - - -def storage_entries_for_origin(storage_entries, origin_filters): - return [ - entry - for entry in storage_entries - if origin_matches(entry.origin, origin_filters) - ] - - -def find_storage_value(storage_entries, origin_filters, key): - for entry in storage_entries_for_origin(storage_entries, origin_filters): - if entry.key == key: - return entry.value - return "" - - -def find_json_storage_value(storage_entries, origin_filters, key, field): - raw_value = find_storage_value(storage_entries, origin_filters, key) - if not raw_value: - return "" - try: - payload = json.loads(raw_value) - except json.JSONDecodeError: - return "" - value = payload.get(field, "") - if value is None: - return "" - return str(value) - - -def list_storage_keys(storage_entries, origin_filters): - return sorted( - { - entry.key - for entry in storage_entries_for_origin(storage_entries, origin_filters) - if entry.key - } - ) - - -def read_firefox_ls_entries(profile_dir, origin_filters): - entries = [] +def read_firefox_local_storage(profile_dir, origin_filter): storage_root = profile_dir / "storage" / "default" if not storage_root.exists(): - return entries + return {} for ls_path in storage_root.glob("*/ls/data.sqlite"): origin = decode_firefox_origin(ls_path.parents[1].name) - if not origin_matches(origin, origin_filters): + if origin_filter.lower() not in origin.lower(): continue - for row in query_sqlite(ls_path, "SELECT key, value FROM data"): - entries.append( - StorageEntry( - origin=origin, - key=stringify_sql_value(row[0]), - value=stringify_sql_value(row[1]), - source=ls_path.as_posix(), - ) - ) - return entries + return { + stringify_sql_value(row[0]): stringify_sql_value(row[1]) + for row in query_sqlite(ls_path, "SELECT key, value FROM data") + } + return {} -def read_firefox_webapps_entries(profile_dir, origin_filters): +def read_firefox_webapps_store(profile_dir, origin_filter): webapps_path = profile_dir / "webappsstore.sqlite" if not webapps_path.exists(): - return [] + return {} - entries = [] + values = {} for row in query_sqlite( webapps_path, "SELECT originKey, key, value FROM webappsstore2", ): origin = stringify_sql_value(row[0]) - if not origin_matches(origin, origin_filters): + if origin_filter.lower() not in origin.lower(): continue - entries.append( - StorageEntry( - origin=origin, - key=stringify_sql_value(row[1]), - value=stringify_sql_value(row[2]), - source=webapps_path.as_posix(), - ) - ) - return entries + values[stringify_sql_value(row[1])] = stringify_sql_value(row[2]) + return values def query_sqlite(path, query): copied_path = copy_sqlite_to_temp(path) @@ -210,7 +107,6 @@ def query_sqlite(path, query): def copy_sqlite_to_temp(path): - import os, shutil, tempfile fd, tmp = tempfile.mkstemp(suffix=".sqlite") os.close(fd) shutil.copyfile(path, tmp) @@ -220,14 +116,6 @@ def decode_firefox_origin(raw_origin): origin = raw_origin.split("^", 1)[0] return origin.replace("+++", "://") - -def origin_matches(origin, origin_filters): - if not origin_filters: - return True - normalized_origin = origin.lower() - return any(filter_value.lower() in normalized_origin for filter_value in origin_filters) - - def stringify_sql_value(value): if value is None: return "" diff --git a/retailer_sessions.py b/retailer_sessions.py deleted file mode 100644 index 6f5aad7..0000000 --- a/retailer_sessions.py +++ /dev/null @@ -1,116 +0,0 @@ -import os -from dataclasses import dataclass -from dotenv import load_dotenv - -from browser_session import ( - find_json_storage_value, - find_storage_value, - list_storage_keys, - load_browser_context, -) - - -COSTCO_STORAGE_ORIGINS = ["costco.com"] -COSTCO_HEADER_FIELDS = [ - ("costco-x-authorization", "costco-x-authorization"), - ("costco-x-wcs-clientId", "costco-x-wcs-clientId"), - ("client-identifier", "client-identifier"), -] -COSTCO_JSON_HEADER_KEYS = ["headers", "costco.headers"] - - -@dataclass -class RetailerSession: - cookies: object - headers: dict[str, str] - - -def load_giant_session(browser="firefox", profile_dir=None): - context = load_browser_context( - browser=browser, - domain_name="giantfood.com", - storage_origins=["giantfood.com"], - profile_dir=profile_dir, - ) - return RetailerSession(cookies=context.cookies, headers={}) - -def load_costco_session(browser="firefox", profile_dir=None): - load_dotenv() - - headers = { - "costco-x-authorization": os.getenv("COSTCO_X_AUTHORIZATION", "").strip(), - "costco-x-wcs-clientId": os.getenv("COSTCO_WCS_CLIENT_ID", "").strip(), - "client-identifier": os.getenv("COSTCO_CLIENT_IDENTIFIER", "").strip(), - } - - context = load_browser_context( - browser=browser, - domain_name=".costco.com", - storage_origins=["costco.com"], - profile_dir=profile_dir, - ) - - storage = {entry.key: entry.value for entry in context.storage_entries} - - id_token = storage.get("idToken", "").strip() - client_id = storage.get("clientID", "").strip() - - if id_token: - headers["costco-x-authorization"] = ( - id_token if id_token.startswith("Bearer ") else f"Bearer {id_token}" - ) - if client_id: - headers["costco-x-wcs-clientId"] = client_id - - headers = {k: v for k, v in headers.items() if v} - - return RetailerSession(cookies=context.cookies, headers=headers) - -#def load_costco_session(browser="firefox", profile_dir=None): -# context = load_browser_context( -# browser=browser, -# domain_name=".costco.com", -# storage_origins=COSTCO_STORAGE_ORIGINS, -# profile_dir=profile_dir, -# ) -# headers = extract_costco_headers(context.storage_entries) -# missing = [ -# header_name for header_name, value in headers.items() if not value -# ] -# if missing: -# available_keys = ", ".join( -# list_storage_keys(context.storage_entries, COSTCO_STORAGE_ORIGINS) -# ) -# raise ValueError( -# "missing Costco browser session headers: " -# f"{', '.join(missing)}. " -# f"Available Costco storage keys: {available_keys or '(none)'}" -# ) -# return RetailerSession(cookies=context.cookies, headers=headers) - - -def extract_costco_headers(storage_entries): - headers = {} - for header_name, storage_key in COSTCO_HEADER_FIELDS: - value = find_storage_value( - storage_entries, - COSTCO_STORAGE_ORIGINS, - storage_key, - ) - if not value: - value = find_costco_header_in_json(storage_entries, header_name) - headers[header_name] = value - return headers - - -def find_costco_header_in_json(storage_entries, header_name): - for json_key in COSTCO_JSON_HEADER_KEYS: - value = find_json_storage_value( - storage_entries, - COSTCO_STORAGE_ORIGINS, - json_key, - header_name, - ) - if value: - return value - return "" diff --git a/scrape_costco.py b/scrape_costco.py index ac58310..bdb90bc 100644 --- a/scrape_costco.py +++ b/scrape_costco.py @@ -3,14 +3,19 @@ import csv import json import time import re +from pathlib import Path from calendar import monthrange from datetime import datetime, timedelta -from pathlib import Path from dotenv import load_dotenv import click from curl_cffi import requests -from retailer_sessions import load_costco_session +from browser_session import ( + find_firefox_profile_dir, + load_firefox_cookies, + read_firefox_local_storage, + read_firefox_webapps_store, +) BASE_URL = "https://ecom-api.costco.com/ebusiness/order/v1/orders/graphql" RETAILER = "costco" @@ -210,6 +215,18 @@ ITEM_FIELDS = [ "is_coupon_line", ] +COSTCO_STORAGE_ORIGIN = "costco.com" +COSTCO_AUTH_STORAGE_KEY = "costco-x-authorization" +COSTCO_HEADERS_BLOB_KEY = "headers" + +def load_config(): + load_dotenv() + return { + "client_id": os.getenv("COSTCO_X_WCS_CLIENTID", "").strip(), + "client_identifier": os.getenv("COSTCO_CLIENT_IDENTIFIER", "").strip(), + } + + def build_headers(auth_headers): headers = { "accept": "*/*", @@ -226,11 +243,52 @@ def build_headers(auth_headers): headers.update(auth_headers) return headers -def build_session(retailer_session): + +def load_costco_browser_headers(profile_dir, client_id, client_identifier): + local_storage = read_firefox_local_storage(profile_dir, COSTCO_STORAGE_ORIGIN) + webapps_store = read_firefox_webapps_store(profile_dir, COSTCO_STORAGE_ORIGIN) + auth_token = ( + local_storage.get(COSTCO_AUTH_STORAGE_KEY, "").strip() + or webapps_store.get(COSTCO_AUTH_STORAGE_KEY, "").strip() + ) + + if not auth_token: + header_blob = ( + local_storage.get(COSTCO_HEADERS_BLOB_KEY, "").strip() + or webapps_store.get(COSTCO_HEADERS_BLOB_KEY, "").strip() + ) + if header_blob: + try: + blob_data = json.loads(header_blob) + except json.JSONDecodeError: + blob_data = {} + auth_token = str(blob_data.get(COSTCO_AUTH_STORAGE_KEY, "")).strip() + client_id = client_id or str(blob_data.get("costco-x-wcs-clientId", "")).strip() + client_identifier = client_identifier or str( + blob_data.get("client-identifier", "") + ).strip() + + if not auth_token: + raise click.ClickException( + "could not find Costco auth token in Firefox session storage" + ) + if not client_id or not client_identifier: + raise click.ClickException( + "missing Costco client ids; set COSTCO_X_WCS_CLIENTID and COSTCO_CLIENT_IDENTIFIER" + ) + + return { + "costco-x-authorization": auth_token, + "costco-x-wcs-clientId": client_id, + "client-identifier": client_identifier, + } + + +def build_session(profile_dir, auth_headers): session = requests.Session() - session.cookies.update(retailer_session.cookies) + session.cookies.update(load_firefox_cookies(".costco.com", profile_dir)) session.headers.update(build_headers()) - session.headers.update(retailer_session.headers) + session.headers.update(auth_headers) return session @@ -594,27 +652,24 @@ def main( ): outdir = Path(outdir) raw_dir = outdir / "raw" - if firefox_profile_dir is None: - firefox_profile_dir = next( - (Path(os.getenv("APPDATA")) / "Mozilla" / "Firefox" / "Profiles").iterdir() - ) - try: - retailer_session = load_costco_session( - browser="firefox", - profile_dir=firefox_profile_dir, - ) - click.echo( - "session bootstrap: " - f"cookies={bool(retailer_session.cookies)}, " - f"authorization={'costco-x-authorization' in retailer_session.headers}, " - f"client_id={'costco-x-wcs-clientId' in retailer_session.headers}, " - f"client_identifier={'client-identifier' in retailer_session.headers}" - ) - session = build_session(retailer_session) - except Exception as exc: - raise click.ClickException( - f"failed to load Costco browser session: {exc}" - ) from exc + config = load_config() + + profile_dir = Path(firefox_profile_dir) if firefox_profile_dir else None + if profile_dir is None: + try: + profile_dir = find_firefox_profile_dir() + except Exception: + profile_dir = click.prompt( + "Firefox profile dir", + type=click.Path(exists=True, file_okay=False, path_type=Path), + ) + + auth_headers = load_costco_browser_headers( + profile_dir, + client_id=config["client_id"], + client_identifier=config["client_identifier"], + ) + session = build_session(profile_dir, auth_headers) start_date, end_date = resolve_date_range(months_back) diff --git a/scrape_giant.py b/scrape_giant.py index 55e9f9f..f1e0ed0 100644 --- a/scrape_giant.py +++ b/scrape_giant.py @@ -8,7 +8,7 @@ import click from dotenv import load_dotenv from curl_cffi import requests -from retailer_sessions import load_giant_session +from browser_session import find_firefox_profile_dir, load_firefox_cookies BASE = "https://giantfood.com" @@ -67,9 +67,9 @@ def load_config(): def build_session(): - browser_session = load_giant_session() + profile_dir = find_firefox_profile_dir() session = requests.Session() - session.cookies.update(browser_session.cookies) + session.cookies.update(load_firefox_cookies("giantfood.com", profile_dir)) session.headers.update( { "user-agent": ( diff --git a/tests/test_browser_session.py b/tests/test_browser_session.py index c7f6ff8..59f23bc 100644 --- a/tests/test_browser_session.py +++ b/tests/test_browser_session.py @@ -5,12 +5,11 @@ from pathlib import Path from unittest import mock import browser_session -import retailer_sessions import scrape_costco class BrowserSessionTests(unittest.TestCase): - def test_read_firefox_ls_entries_reads_storage_from_copied_sqlite(self): + def test_read_firefox_local_storage_reads_copied_sqlite(self): with tempfile.TemporaryDirectory() as tmpdir: profile_dir = Path(tmpdir) / "abcd.default-release" ls_dir = profile_dir / "storage" / "default" / "https+++www.costco.com" / "ls" @@ -24,38 +23,35 @@ class BrowserSessionTests(unittest.TestCase): ("costco-x-wcs-clientId", "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf"), ) - entries = browser_session.read_firefox_storage_entries( + values = browser_session.read_firefox_local_storage( profile_dir, - origin_filters=["costco.com"], + origin_filter="costco.com", ) - self.assertEqual(1, len(entries)) - self.assertEqual("https://www.costco.com", entries[0].origin) - self.assertEqual("costco-x-wcs-clientId", entries[0].key) + self.assertEqual( + "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf", + values["costco-x-wcs-clientId"], + ) - def test_extract_costco_headers_uses_exact_keys(self): - entries = [ - browser_session.StorageEntry( - origin="https://www.costco.com", - key="costco-x-authorization", - value="Bearer header.payload.signature", - source="memory", - ), - browser_session.StorageEntry( - origin="https://www.costco.com", - key="costco-x-wcs-clientId", - value="4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf", - source="memory", - ), - browser_session.StorageEntry( - origin="https://www.costco.com", - key="client-identifier", - value="481b1aec-aa3b-454b-b81b-48187e28f205", - source="memory", - ), - ] + def test_load_costco_browser_headers_reads_exact_auth_key(self): + with tempfile.TemporaryDirectory() as tmpdir: + profile_dir = Path(tmpdir) + storage_dir = profile_dir / "storage" / "default" / "https+++www.costco.com" / "ls" + storage_dir.mkdir(parents=True) + db_path = storage_dir / "data.sqlite" - headers = retailer_sessions.extract_costco_headers(entries) + with sqlite3.connect(db_path) as connection: + connection.execute("CREATE TABLE data (key TEXT, value TEXT)") + connection.execute( + "INSERT INTO data (key, value) VALUES (?, ?)", + ("costco-x-authorization", "Bearer header.payload.signature"), + ) + + headers = scrape_costco.load_costco_browser_headers( + profile_dir, + client_id="4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf", + client_identifier="481b1aec-aa3b-454b-b81b-48187e28f205", + ) self.assertEqual("Bearer header.payload.signature", headers["costco-x-authorization"]) self.assertEqual( @@ -67,42 +63,60 @@ class BrowserSessionTests(unittest.TestCase): headers["client-identifier"], ) - def test_extract_costco_headers_uses_exact_json_header_blob(self): - entries = [ - browser_session.StorageEntry( - origin="https://www.costco.com", - key="headers", - value=( - '{"costco-x-authorization":"Bearer header.payload.signature",' - '"costco-x-wcs-clientId":"4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",' - '"client-identifier":"481b1aec-aa3b-454b-b81b-48187e28f205"}' - ), - source="memory", - ) - ] + def test_load_costco_browser_headers_falls_back_to_exact_header_blob(self): + with tempfile.TemporaryDirectory() as tmpdir: + profile_dir = Path(tmpdir) + storage_dir = profile_dir / "storage" / "default" / "https+++www.costco.com" / "ls" + storage_dir.mkdir(parents=True) + db_path = storage_dir / "data.sqlite" - headers = retailer_sessions.extract_costco_headers(entries) + with sqlite3.connect(db_path) as connection: + connection.execute("CREATE TABLE data (key TEXT, value TEXT)") + connection.execute( + "INSERT INTO data (key, value) VALUES (?, ?)", + ( + "headers", + '{"costco-x-authorization":"Bearer header.payload.signature"}', + ), + ) + + headers = scrape_costco.load_costco_browser_headers( + profile_dir, + client_id="4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf", + client_identifier="481b1aec-aa3b-454b-b81b-48187e28f205", + ) self.assertEqual("Bearer header.payload.signature", headers["costco-x-authorization"]) - self.assertEqual( - "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf", - headers["costco-x-wcs-clientId"], - ) - self.assertEqual( - "481b1aec-aa3b-454b-b81b-48187e28f205", - headers["client-identifier"], - ) def test_scrape_costco_prompts_for_profile_dir_when_autodiscovery_fails(self): with mock.patch.object( scrape_costco, - "build_session", - side_effect=[FileNotFoundError("no default profile"), object()], + "find_firefox_profile_dir", + side_effect=FileNotFoundError("no default profile"), ), mock.patch.object( scrape_costco.click, "prompt", return_value=Path("/tmp/profile"), ) as mocked_prompt, mock.patch.object( + scrape_costco, + "load_config", + return_value={ + "client_id": "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf", + "client_identifier": "481b1aec-aa3b-454b-b81b-48187e28f205", + }, + ), mock.patch.object( + scrape_costco, + "load_costco_browser_headers", + return_value={ + "costco-x-authorization": "Bearer header.payload.signature", + "costco-x-wcs-clientId": "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf", + "client-identifier": "481b1aec-aa3b-454b-b81b-48187e28f205", + }, + ), mock.patch.object( + scrape_costco, + "build_session", + return_value=object(), + ), mock.patch.object( scrape_costco, "fetch_summary_windows", return_value=( diff --git a/tests/test_costco_pipeline.py b/tests/test_costco_pipeline.py index 21f644a..2755fe1 100644 --- a/tests/test_costco_pipeline.py +++ b/tests/test_costco_pipeline.py @@ -411,6 +411,25 @@ class CostcoPipelineTests(unittest.TestCase): ] with mock.patch.object( + scrape_costco, + "load_config", + return_value={ + "client_id": "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf", + "client_identifier": "481b1aec-aa3b-454b-b81b-48187e28f205", + }, + ), mock.patch.object( + scrape_costco, + "find_firefox_profile_dir", + return_value=Path("/tmp/profile"), + ), mock.patch.object( + scrape_costco, + "load_costco_browser_headers", + return_value={ + "costco-x-authorization": "Bearer header.payload.signature", + "costco-x-wcs-clientId": "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf", + "client-identifier": "481b1aec-aa3b-454b-b81b-48187e28f205", + }, + ), mock.patch.object( scrape_costco, "build_session", return_value=object() ), mock.patch.object( scrape_costco,