Simplify Costco browser header extraction

This commit is contained in:
ben
2026-03-16 16:23:38 -04:00
parent 5a331c9af4
commit 1b4c7dde25
5 changed files with 199 additions and 141 deletions

View File

@@ -1,4 +1,5 @@
import configparser import configparser
import json
import os import os
import shutil import shutil
import sqlite3 import sqlite3
@@ -104,6 +105,45 @@ def read_firefox_storage_entries(profile_dir, origin_filters):
return deduped return deduped
def storage_entries_for_origin(storage_entries, origin_filters):
    """Return the entries whose origin is accepted by ``origin_matches``.

    The relative order of ``storage_entries`` is preserved in the result.
    """
    selected = []
    for candidate in storage_entries:
        if origin_matches(candidate.origin, origin_filters):
            selected.append(candidate)
    return selected
def find_storage_value(storage_entries, origin_filters, key):
    """Return the value of the first origin-matching entry stored under ``key``.

    An empty string is returned when no matching entry has that key.
    """
    candidates = storage_entries_for_origin(storage_entries, origin_filters)
    return next(
        (candidate.value for candidate in candidates if candidate.key == key),
        "",
    )
def find_json_storage_value(storage_entries, origin_filters, key, field):
    """Return one top-level field of a JSON object stored under ``key``.

    The raw value is looked up with ``find_storage_value`` and decoded as
    JSON.

    Args:
        storage_entries: Storage entries to search.
        origin_filters: Origin filters forwarded to ``find_storage_value``.
        key: Storage key whose value holds the JSON document.
        field: Top-level member of the decoded object to return.

    Returns:
        ``str(value)`` for the requested field, or ``""`` when the key is
        absent or empty, the value cannot be decoded as JSON, the decoded
        document is not an object, or the field is missing or ``null``.
    """
    raw_value = find_storage_value(storage_entries, origin_filters, key)
    if not raw_value:
        return ""
    try:
        payload = json.loads(raw_value)
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError and undecodable bytes;
        # TypeError covers values that are not str/bytes at all.
        return ""
    if not isinstance(payload, dict):
        # Valid JSON that is a list, string or number has no ``.get``.
        return ""
    value = payload.get(field)
    return "" if value is None else str(value)
def list_storage_keys(storage_entries, origin_filters):
    """Return the sorted, de-duplicated, non-empty keys of matching entries."""
    unique_keys = set()
    for candidate in storage_entries_for_origin(storage_entries, origin_filters):
        if candidate.key:
            unique_keys.add(candidate.key)
    return sorted(unique_keys)
def read_firefox_ls_entries(profile_dir, origin_filters): def read_firefox_ls_entries(profile_dir, origin_filters):
entries = [] entries = []
storage_root = profile_dir / "storage" / "default" storage_root = profile_dir / "storage" / "default"

View File

@@ -1,18 +1,20 @@
import json
import re
from dataclasses import dataclass from dataclasses import dataclass
from browser_session import load_browser_context from browser_session import (
find_json_storage_value,
find_storage_value,
UUID_RE = re.compile( list_storage_keys,
r"^[0-9a-fA-F]{8}-" load_browser_context,
r"[0-9a-fA-F]{4}-"
r"[0-9a-fA-F]{4}-"
r"[0-9a-fA-F]{4}-"
r"[0-9a-fA-F]{12}$"
) )
JWT_RE = re.compile(r"^[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+$")
# Origin filters used both when loading the browser context and when
# looking up individual storage values for Costco.
COSTCO_STORAGE_ORIGINS = ["costco.com"]
# (header name, storage key) pairs consumed by extract_costco_headers:
# each header is read from the storage entry with the paired key.
COSTCO_HEADER_FIELDS = [
("costco-x-authorization", "costco-x-authorization"),
("costco-x-wcs-clientId", "costco-x-wcs-clientId"),
("client-identifier", "client-identifier"),
]
# Storage keys tried in order by find_costco_header_in_json; each value is
# parsed as JSON and the header is looked up in it by header name.
COSTCO_JSON_HEADER_KEYS = ["headers", "costco.headers"]
@dataclass @dataclass
@@ -35,102 +37,47 @@ def load_costco_session(browser="firefox", profile_dir=None):
context = load_browser_context( context = load_browser_context(
browser=browser, browser=browser,
domain_name=".costco.com", domain_name=".costco.com",
storage_origins=["costco.com"], storage_origins=COSTCO_STORAGE_ORIGINS,
profile_dir=profile_dir, profile_dir=profile_dir,
) )
return RetailerSession( headers = extract_costco_headers(context.storage_entries)
cookies=context.cookies, missing = [
headers=extract_costco_headers(context.storage_entries), header_name for header_name, value in headers.items() if not value
]
if missing:
available_keys = ", ".join(
list_storage_keys(context.storage_entries, COSTCO_STORAGE_ORIGINS)
) )
raise ValueError(
"missing Costco browser session headers: "
f"{', '.join(missing)}. "
f"Available Costco storage keys: {available_keys or '(none)'}"
)
return RetailerSession(cookies=context.cookies, headers=headers)
def extract_costco_headers(storage_entries): def extract_costco_headers(storage_entries):
authorization = ""
client_id = ""
client_identifier = ""
for key_path, value in iter_storage_candidates(storage_entries):
normalized_key = normalize_key(key_path)
normalized_value = str(value).strip()
if not normalized_value:
continue
if not authorization and looks_like_authorization(normalized_key, normalized_value):
authorization = normalize_authorization(normalized_value)
continue
if not client_identifier and looks_like_client_identifier(
normalized_key, normalized_value
):
client_identifier = normalized_value
continue
if not client_id and looks_like_client_id(normalized_key, normalized_value):
client_id = normalized_value
headers = {} headers = {}
if authorization: for header_name, storage_key in COSTCO_HEADER_FIELDS:
headers["costco-x-authorization"] = authorization value = find_storage_value(
if client_id: storage_entries,
headers["costco-x-wcs-clientId"] = client_id COSTCO_STORAGE_ORIGINS,
if client_identifier: storage_key,
headers["client-identifier"] = client_identifier )
if not value:
value = find_costco_header_in_json(storage_entries, header_name)
headers[header_name] = value
return headers return headers
def iter_storage_candidates(storage_entries): def find_costco_header_in_json(storage_entries, header_name):
for entry in storage_entries: for json_key in COSTCO_JSON_HEADER_KEYS:
yield entry.key, entry.value value = find_json_storage_value(
yield from walk_candidate_value(entry.key, parse_json_value(entry.value)) storage_entries,
COSTCO_STORAGE_ORIGINS,
json_key,
def walk_candidate_value(prefix, value): header_name,
if isinstance(value, dict):
for key, nested in value.items():
nested_prefix = f"{prefix}.{key}"
yield nested_prefix, nested
yield from walk_candidate_value(nested_prefix, nested)
elif isinstance(value, list):
for index, nested in enumerate(value):
nested_prefix = f"{prefix}[{index}]"
yield nested_prefix, nested
yield from walk_candidate_value(nested_prefix, nested)
def parse_json_value(value):
if not isinstance(value, str):
return value
text = value.strip()
if not text or text[0] not in "{[":
return value
try:
return json.loads(text)
except json.JSONDecodeError:
return value
def normalize_key(value):
return re.sub(r"[^a-z0-9]+", "", value.lower())
def looks_like_authorization(key, value):
return (
("authorization" in key or "token" in key)
and bool(normalize_authorization(value))
) )
if value:
return value
def normalize_authorization(value):
candidate = str(value).strip()
if candidate.lower().startswith("bearer "):
token = candidate.split(None, 1)[1].strip()
return f"Bearer {token}" if JWT_RE.match(token) else ""
if JWT_RE.match(candidate):
return f"Bearer {candidate}"
return "" return ""
def looks_like_client_id(key, value):
return "clientid" in key and "identifier" not in key and bool(UUID_RE.match(value))
def looks_like_client_identifier(key, value):
return "clientidentifier" in key and bool(UUID_RE.match(value))

View File

@@ -225,27 +225,8 @@ def build_headers(auth_headers):
headers.update(auth_headers) headers.update(auth_headers)
return headers return headers
def build_session(): def build_session(profile_dir=None):
retailer_session = load_costco_session() retailer_session = load_costco_session(profile_dir=profile_dir)
click.echo(
"session bootstrap: "
f"cookies={bool(retailer_session.cookies)}, "
f"authorization={'costco-x-authorization' in retailer_session.headers}, "
f"client_id={'costco-x-wcs-clientId' in retailer_session.headers}, "
f"client_identifier={'client-identifier' in retailer_session.headers}"
)
auth = retailer_session.headers.get("costco-x-authorization", "")
if auth:
click.echo(
f"auth prefix ok={auth.startswith('Bearer ')} len={len(auth)} token_prefix={auth[:24]}"
)
click.echo(
"header values: "
f"client_id={retailer_session.headers.get('costco-x-wcs-clientId', '')} "
f"client_identifier={retailer_session.headers.get('client-identifier', '')}"
)
session = requests.Session() session = requests.Session()
session.cookies.update(retailer_session.cookies) session.cookies.update(retailer_session.cookies)
session.headers.update(build_headers(retailer_session.headers)) session.headers.update(build_headers(retailer_session.headers))
@@ -597,15 +578,38 @@ def write_csv(path, rows, fieldnames):
type=int, type=int,
help="How many months of receipts to enumerate back from today.", help="How many months of receipts to enumerate back from today.",
) )
def main(outdir, document_type, document_sub_type, window_days, months_back): @click.option(
"--firefox-profile-dir",
default=None,
help="Firefox profile directory to use for cookies and session storage.",
)
def main(
outdir,
document_type,
document_sub_type,
window_days,
months_back,
firefox_profile_dir,
):
outdir = Path(outdir) outdir = Path(outdir)
raw_dir = outdir / "raw" raw_dir = outdir / "raw"
try: try:
session = build_session() session = build_session(profile_dir=firefox_profile_dir)
except Exception as exc: except Exception as exc:
if firefox_profile_dir:
raise click.ClickException( raise click.ClickException(
f"failed to load Costco browser session: {exc}" f"failed to load Costco browser session: {exc}"
) from exc ) from exc
prompted_profile = click.prompt(
"Firefox profile dir",
type=click.Path(exists=True, file_okay=False, path_type=Path),
)
try:
session = build_session(profile_dir=prompted_profile)
except Exception as prompt_exc:
raise click.ClickException(
f"failed to load Costco browser session: {prompt_exc}"
) from prompt_exc
start_date, end_date = resolve_date_range(months_back) start_date, end_date = resolve_date_range(months_back)
summary_payload, request_metadata = fetch_summary_windows( summary_payload, request_metadata = fetch_summary_windows(

View File

@@ -2,9 +2,11 @@ import sqlite3
import tempfile import tempfile
import unittest import unittest
from pathlib import Path from pathlib import Path
from unittest import mock
import browser_session import browser_session
import retailer_sessions import retailer_sessions
import scrape_costco
class BrowserSessionTests(unittest.TestCase): class BrowserSessionTests(unittest.TestCase):
@@ -15,16 +17,12 @@ class BrowserSessionTests(unittest.TestCase):
ls_dir.mkdir(parents=True) ls_dir.mkdir(parents=True)
db_path = ls_dir / "data.sqlite" db_path = ls_dir / "data.sqlite"
connection = sqlite3.connect(db_path) with sqlite3.connect(db_path) as connection:
try:
connection.execute("CREATE TABLE data (key TEXT, value TEXT)") connection.execute("CREATE TABLE data (key TEXT, value TEXT)")
connection.execute( connection.execute(
"INSERT INTO data (key, value) VALUES (?, ?)", "INSERT INTO data (key, value) VALUES (?, ?)",
("session", '{"costco":{"clientIdentifier":"481b1aec-aa3b-454b-b81b-48187e28f205"}}'), ("costco-x-wcs-clientId", "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf"),
) )
connection.commit()
finally:
connection.close()
entries = browser_session.read_firefox_storage_entries( entries = browser_session.read_firefox_storage_entries(
profile_dir, profile_dir,
@@ -33,17 +31,51 @@ class BrowserSessionTests(unittest.TestCase):
self.assertEqual(1, len(entries)) self.assertEqual(1, len(entries))
self.assertEqual("https://www.costco.com", entries[0].origin) self.assertEqual("https://www.costco.com", entries[0].origin)
self.assertEqual("session", entries[0].key) self.assertEqual("costco-x-wcs-clientId", entries[0].key)
def test_extract_costco_headers_from_storage_json(self): def test_extract_costco_headers_uses_exact_keys(self):
entries = [ entries = [
browser_session.StorageEntry( browser_session.StorageEntry(
origin="https://www.costco.com", origin="https://www.costco.com",
key="authState", key="costco-x-authorization",
value="Bearer header.payload.signature",
source="memory",
),
browser_session.StorageEntry(
origin="https://www.costco.com",
key="costco-x-wcs-clientId",
value="4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
source="memory",
),
browser_session.StorageEntry(
origin="https://www.costco.com",
key="client-identifier",
value="481b1aec-aa3b-454b-b81b-48187e28f205",
source="memory",
),
]
headers = retailer_sessions.extract_costco_headers(entries)
self.assertEqual("Bearer header.payload.signature", headers["costco-x-authorization"])
self.assertEqual(
"4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
headers["costco-x-wcs-clientId"],
)
self.assertEqual(
"481b1aec-aa3b-454b-b81b-48187e28f205",
headers["client-identifier"],
)
def test_extract_costco_headers_uses_exact_json_header_blob(self):
entries = [
browser_session.StorageEntry(
origin="https://www.costco.com",
key="headers",
value=( value=(
'{"authorization":"Bearer header.payload.signature",' '{"costco-x-authorization":"Bearer header.payload.signature",'
'"wcsClientId":"4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",' '"costco-x-wcs-clientId":"4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",'
'"clientIdentifier":"481b1aec-aa3b-454b-b81b-48187e28f205"}' '"client-identifier":"481b1aec-aa3b-454b-b81b-48187e28f205"}'
), ),
source="memory", source="memory",
) )
@@ -61,6 +93,40 @@ class BrowserSessionTests(unittest.TestCase):
headers["client-identifier"], headers["client-identifier"],
) )
def test_scrape_costco_prompts_for_profile_dir_when_autodiscovery_fails(self):
    """The profile-dir prompt is shown once when the first session build fails."""
    # First build_session call raises; the second one succeeds.
    session_outcomes = [FileNotFoundError("no default profile"), object()]
    empty_summary = (
        {"data": {"receiptsWithCounts": {"receipts": []}}},
        [],
    )

    patch_build = mock.patch.object(
        scrape_costco,
        "build_session",
        side_effect=session_outcomes,
    )
    patch_prompt = mock.patch.object(
        scrape_costco.click,
        "prompt",
        return_value=Path("/tmp/profile"),
    )
    patch_fetch = mock.patch.object(
        scrape_costco,
        "fetch_summary_windows",
        return_value=empty_summary,
    )
    patch_write_json = mock.patch.object(scrape_costco, "write_json")
    patch_write_csv = mock.patch.object(scrape_costco, "write_csv")

    with patch_build, patch_prompt as prompt_mock, patch_fetch, \
            patch_write_json, patch_write_csv:
        scrape_costco.main.callback(
            outdir="/tmp/costco_output",
            document_type="all",
            document_sub_type="all",
            window_days=92,
            months_back=3,
            firefox_profile_dir=None,
        )

    prompt_mock.assert_called_once()
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()

View File

@@ -427,6 +427,7 @@ class CostcoPipelineTests(unittest.TestCase):
document_sub_type="all", document_sub_type="all",
window_days=92, window_days=92,
months_back=3, months_back=3,
firefox_profile_dir=None,
) )
metadata_path = outdir / "raw" / "summary_requests.json" metadata_path = outdir / "raw" / "summary_requests.json"