def storage_entries_for_origin(storage_entries, origin_filters):
    """Return the entries whose origin is accepted by ``origin_matches``.

    Args:
        storage_entries: Iterable of objects exposing an ``origin`` attribute.
        origin_filters: Filters forwarded unchanged to ``origin_matches``.

    Returns:
        A list of the accepted entries, in their original order.
    """
    return [
        entry
        for entry in storage_entries
        if origin_matches(entry.origin, origin_filters)
    ]


def find_storage_value(storage_entries, origin_filters, key):
    """Return the value of the first origin-matching entry stored under ``key``.

    The key comparison is exact. Returns ``""`` when no entry matches.
    """
    for entry in storage_entries_for_origin(storage_entries, origin_filters):
        if entry.key == key:
            return entry.value
    return ""


def find_json_storage_value(storage_entries, origin_filters, key, field):
    """Return ``field`` from the JSON object stored under ``key``, as a string.

    This is a best-effort lookup: every failure mode yields ``""`` rather
    than an exception.

    Args:
        storage_entries: Iterable of storage entries to search.
        origin_filters: Filters forwarded to ``find_storage_value``.
        key: Exact storage key whose value should hold a JSON object.
        field: Top-level member of that JSON object to read.

    Returns:
        ``str(value)`` for the member, or ``""`` when the key is absent, the
        stored value is empty or cannot be parsed as JSON, the parsed
        document is not an object, or the member is missing or ``null``.
    """
    raw_value = find_storage_value(storage_entries, origin_filters, key)
    if not raw_value:
        return ""
    try:
        payload = json.loads(raw_value)
    except (ValueError, TypeError):
        # ValueError covers JSONDecodeError and undecodable bytes; TypeError
        # covers stored values that are not str/bytes at all.
        return ""
    if not isinstance(payload, dict):
        # Valid JSON can also be a list, string, number or boolean, none of
        # which has named members to look up.
        return ""
    value = payload.get(field, "")
    if value is None:
        return ""
    return str(value)


def list_storage_keys(storage_entries, origin_filters):
    """Return the sorted, de-duplicated non-empty keys of origin-matching entries."""
    return sorted(
        {
            entry.key
            for entry in storage_entries_for_origin(storage_entries, origin_filters)
            if entry.key
        }
    )
COSTCO_STORAGE_ORIGINS = ["costco.com"]
# (request header name, storage key that holds its value)
COSTCO_HEADER_FIELDS = [
    ("costco-x-authorization", "costco-x-authorization"),
    ("costco-x-wcs-clientId", "costco-x-wcs-clientId"),
    ("client-identifier", "client-identifier"),
]
# Storage keys whose value may be a JSON object keyed by header name.
COSTCO_JSON_HEADER_KEYS = ["headers", "costco.headers"]


def load_costco_session(browser="firefox", profile_dir=None):
    """Build a ``RetailerSession`` from the browser's Costco cookies and storage.

    Raises:
        ValueError: If any header listed in ``COSTCO_HEADER_FIELDS`` has no
            value. The message names the missing headers and the storage
            keys that were available.
    """
    context = load_browser_context(
        browser=browser,
        domain_name=".costco.com",
        storage_origins=COSTCO_STORAGE_ORIGINS,
        profile_dir=profile_dir,
    )
    headers = extract_costco_headers(context.storage_entries)

    absent = [name for name in headers if not headers[name]]
    if not absent:
        return RetailerSession(cookies=context.cookies, headers=headers)

    # Listing what storage did contain makes a renamed key easy to spot.
    key_summary = ", ".join(
        list_storage_keys(context.storage_entries, COSTCO_STORAGE_ORIGINS)
    )
    absent_summary = ", ".join(absent)
    raise ValueError(
        "missing Costco browser session headers: "
        f"{absent_summary}. "
        f"Available Costco storage keys: {key_summary or '(none)'}"
    )


def extract_costco_headers(storage_entries):
    """Map every Costco header name to its stored value, or ``""`` if absent.

    Each header is read from its own storage key first; when that yields
    nothing, the JSON header blobs are consulted.
    """

    def resolve(header_name, storage_key):
        direct = find_storage_value(
            storage_entries,
            COSTCO_STORAGE_ORIGINS,
            storage_key,
        )
        return direct or find_costco_header_in_json(storage_entries, header_name)

    return {
        header_name: resolve(header_name, storage_key)
        for header_name, storage_key in COSTCO_HEADER_FIELDS
    }


def find_costco_header_in_json(storage_entries, header_name):
    """Return ``header_name`` from the first JSON header blob that supplies it.

    Blobs are tried in ``COSTCO_JSON_HEADER_KEYS`` order; ``""`` means none
    of them provided a non-empty value.
    """
    candidates = (
        find_json_storage_value(
            storage_entries,
            COSTCO_STORAGE_ORIGINS,
            blob_key,
            header_name,
        )
        for blob_key in COSTCO_JSON_HEADER_KEYS
    )
    # The generator is lazy, so lookups stop at the first non-empty hit.
    return next((found for found in candidates if found), "")
looks_like_client_id(key, value): - return "clientid" in key and "identifier" not in key and bool(UUID_RE.match(value)) - - -def looks_like_client_identifier(key, value): - return "clientidentifier" in key and bool(UUID_RE.match(value)) diff --git a/scrape_costco.py b/scrape_costco.py index 2d04012..1beb062 100644 --- a/scrape_costco.py +++ b/scrape_costco.py @@ -225,27 +225,8 @@ def build_headers(auth_headers): headers.update(auth_headers) return headers -def build_session(): - retailer_session = load_costco_session() - click.echo( - "session bootstrap: " - f"cookies={bool(retailer_session.cookies)}, " - f"authorization={'costco-x-authorization' in retailer_session.headers}, " - f"client_id={'costco-x-wcs-clientId' in retailer_session.headers}, " - f"client_identifier={'client-identifier' in retailer_session.headers}" - ) - - auth = retailer_session.headers.get("costco-x-authorization", "") - if auth: - click.echo( - f"auth prefix ok={auth.startswith('Bearer ')} len={len(auth)} token_prefix={auth[:24]}" - ) - - click.echo( - "header values: " - f"client_id={retailer_session.headers.get('costco-x-wcs-clientId', '')} " - f"client_identifier={retailer_session.headers.get('client-identifier', '')}" - ) +def build_session(profile_dir=None): + retailer_session = load_costco_session(profile_dir=profile_dir) session = requests.Session() session.cookies.update(retailer_session.cookies) session.headers.update(build_headers(retailer_session.headers)) @@ -597,15 +578,38 @@ def write_csv(path, rows, fieldnames): type=int, help="How many months of receipts to enumerate back from today.", ) -def main(outdir, document_type, document_sub_type, window_days, months_back): +@click.option( + "--firefox-profile-dir", + default=None, + help="Firefox profile directory to use for cookies and session storage.", +) +def main( + outdir, + document_type, + document_sub_type, + window_days, + months_back, + firefox_profile_dir, +): outdir = Path(outdir) - raw_dir = outdir / "raw" + raw_dir = 
outdir / "raw" try: - session = build_session() + session = build_session(profile_dir=firefox_profile_dir) except Exception as exc: - raise click.ClickException( - f"failed to load Costco browser session: {exc}" - ) from exc + if firefox_profile_dir: + raise click.ClickException( + f"failed to load Costco browser session: {exc}" + ) from exc + prompted_profile = click.prompt( + "Firefox profile dir", + type=click.Path(exists=True, file_okay=False, path_type=Path), + ) + try: + session = build_session(profile_dir=prompted_profile) + except Exception as prompt_exc: + raise click.ClickException( + f"failed to load Costco browser session: {prompt_exc}" + ) from prompt_exc start_date, end_date = resolve_date_range(months_back) summary_payload, request_metadata = fetch_summary_windows( diff --git a/tests/test_browser_session.py b/tests/test_browser_session.py index 5477d07..c7f6ff8 100644 --- a/tests/test_browser_session.py +++ b/tests/test_browser_session.py @@ -2,9 +2,11 @@ import sqlite3 import tempfile import unittest from pathlib import Path +from unittest import mock import browser_session import retailer_sessions +import scrape_costco class BrowserSessionTests(unittest.TestCase): @@ -15,17 +17,13 @@ class BrowserSessionTests(unittest.TestCase): ls_dir.mkdir(parents=True) db_path = ls_dir / "data.sqlite" - connection = sqlite3.connect(db_path) - try: + with sqlite3.connect(db_path) as connection: connection.execute("CREATE TABLE data (key TEXT, value TEXT)") connection.execute( "INSERT INTO data (key, value) VALUES (?, ?)", - ("session", '{"costco":{"clientIdentifier":"481b1aec-aa3b-454b-b81b-48187e28f205"}}'), + ("costco-x-wcs-clientId", "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf"), ) - connection.commit() - finally: - connection.close() - + entries = browser_session.read_firefox_storage_entries( profile_dir, origin_filters=["costco.com"], @@ -33,17 +31,51 @@ class BrowserSessionTests(unittest.TestCase): self.assertEqual(1, len(entries)) 
self.assertEqual("https://www.costco.com", entries[0].origin) - self.assertEqual("session", entries[0].key) + self.assertEqual("costco-x-wcs-clientId", entries[0].key) - def test_extract_costco_headers_from_storage_json(self): + def test_extract_costco_headers_uses_exact_keys(self): entries = [ browser_session.StorageEntry( origin="https://www.costco.com", - key="authState", + key="costco-x-authorization", + value="Bearer header.payload.signature", + source="memory", + ), + browser_session.StorageEntry( + origin="https://www.costco.com", + key="costco-x-wcs-clientId", + value="4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf", + source="memory", + ), + browser_session.StorageEntry( + origin="https://www.costco.com", + key="client-identifier", + value="481b1aec-aa3b-454b-b81b-48187e28f205", + source="memory", + ), + ] + + headers = retailer_sessions.extract_costco_headers(entries) + + self.assertEqual("Bearer header.payload.signature", headers["costco-x-authorization"]) + self.assertEqual( + "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf", + headers["costco-x-wcs-clientId"], + ) + self.assertEqual( + "481b1aec-aa3b-454b-b81b-48187e28f205", + headers["client-identifier"], + ) + + def test_extract_costco_headers_uses_exact_json_header_blob(self): + entries = [ + browser_session.StorageEntry( + origin="https://www.costco.com", + key="headers", value=( - '{"authorization":"Bearer header.payload.signature",' - '"wcsClientId":"4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",' - '"clientIdentifier":"481b1aec-aa3b-454b-b81b-48187e28f205"}' + '{"costco-x-authorization":"Bearer header.payload.signature",' + '"costco-x-wcs-clientId":"4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",' + '"client-identifier":"481b1aec-aa3b-454b-b81b-48187e28f205"}' ), source="memory", ) @@ -61,6 +93,40 @@ class BrowserSessionTests(unittest.TestCase): headers["client-identifier"], ) + def test_scrape_costco_prompts_for_profile_dir_when_autodiscovery_fails(self): + with mock.patch.object( + scrape_costco, + "build_session", + 
side_effect=[FileNotFoundError("no default profile"), object()], + ), mock.patch.object( + scrape_costco.click, + "prompt", + return_value=Path("/tmp/profile"), + ) as mocked_prompt, mock.patch.object( + scrape_costco, + "fetch_summary_windows", + return_value=( + {"data": {"receiptsWithCounts": {"receipts": []}}}, + [], + ), + ), mock.patch.object( + scrape_costco, + "write_json", + ), mock.patch.object( + scrape_costco, + "write_csv", + ): + scrape_costco.main.callback( + outdir="/tmp/costco_output", + document_type="all", + document_sub_type="all", + window_days=92, + months_back=3, + firefox_profile_dir=None, + ) + + mocked_prompt.assert_called_once() + if __name__ == "__main__": unittest.main() diff --git a/tests/test_costco_pipeline.py b/tests/test_costco_pipeline.py index 9648b76..21f644a 100644 --- a/tests/test_costco_pipeline.py +++ b/tests/test_costco_pipeline.py @@ -427,6 +427,7 @@ class CostcoPipelineTests(unittest.TestCase): document_sub_type="all", window_days=92, months_back=3, + firefox_profile_dir=None, ) metadata_path = outdir / "raw" / "summary_requests.json"