Simplify browser session bootstrap

This commit is contained in:
ben
2026-03-16 17:08:44 -04:00
parent e48dd6c4c2
commit d7a0329332
6 changed files with 184 additions and 324 deletions

View File

@@ -1,47 +1,13 @@
import configparser
import json
import os
import shutil
import sqlite3
import tempfile
from dataclasses import dataclass
from pathlib import Path
import browser_cookie3
@dataclass
class StorageEntry:
    """A single key/value pair read out of a browser storage database."""

    # Origin string the pair was stored under.
    origin: str
    # Storage key and its value, both already converted to text.
    key: str
    value: str
    # Label for where the pair came from; the readers in this module pass
    # the POSIX path of the SQLite file.
    source: str
@dataclass
class BrowserContext:
    """Cookies and web-storage entries loaded together from one profile."""

    # Whatever the cookie loader returned; treated as opaque here.
    cookies: object
    # Storage pairs that survived origin filtering.
    storage_entries: list[StorageEntry]
def load_browser_context(
    browser,
    domain_name,
    storage_origins=None,
    profile_dir=None,
):
    """Load cookies and storage entries for one site from a browser profile.

    Args:
        browser: Browser name; only ``"firefox"`` is accepted.
        domain_name: Domain passed to the cookie loader.
        storage_origins: Optional list of origin substrings used to filter
            storage entries; ``None`` or empty means no filtering.
        profile_dir: Optional profile directory; when falsy the default
            Firefox profile is looked up.

    Returns:
        A ``BrowserContext`` holding the cookies and storage entries.

    Raises:
        ValueError: If ``browser`` is anything other than ``"firefox"``.
    """
    if browser != "firefox":
        raise ValueError(f"unsupported browser: {browser}")

    if profile_dir:
        profile = Path(profile_dir)
    else:
        profile = find_firefox_profile_dir()

    origin_filters = storage_origins or []
    # Keyword arguments are evaluated in order: cookies first, then storage.
    return BrowserContext(
        cookies=load_firefox_cookies(domain_name, profile),
        storage_entries=read_firefox_storage_entries(
            profile,
            origin_filters=origin_filters,
        ),
    )
def find_firefox_profile_dir():
profiles_ini = firefox_profiles_root() / "profiles.ini"
parser = configparser.RawConfigParser()
@@ -88,106 +54,37 @@ def load_firefox_cookies(domain_name, profile_dir):
return browser_cookie3.firefox(cookie_file=str(cookie_file), domain_name=domain_name)
def read_firefox_storage_entries(profile_dir, origin_filters):
    """Collect storage entries from both Firefox stores, without duplicates.

    Entries from the per-origin ``ls`` databases come first, followed by
    those from the webapps store. Two entries count as duplicates when
    origin, key, value and source are all equal; the first one is kept.
    """
    profile_root = Path(profile_dir)
    candidates = [
        *read_firefox_ls_entries(profile_root, origin_filters),
        *read_firefox_webapps_entries(profile_root, origin_filters),
    ]

    # dict preserves insertion order, so setdefault keeps the first entry
    # seen for each fingerprint and the original ordering of survivors.
    unique = {}
    for candidate in candidates:
        fingerprint = (
            candidate.origin,
            candidate.key,
            candidate.value,
            candidate.source,
        )
        unique.setdefault(fingerprint, candidate)
    return list(unique.values())
def storage_entries_for_origin(storage_entries, origin_filters):
    """Return the entries whose origin passes ``origin_matches``, in order."""
    matching = []
    for candidate in storage_entries:
        if origin_matches(candidate.origin, origin_filters):
            matching.append(candidate)
    return matching
def find_storage_value(storage_entries, origin_filters, key):
    """Return the value of the first origin-matching entry named ``key``.

    Returns an empty string when no matching entry has that key.
    """
    hits = (
        entry.value
        for entry in storage_entries_for_origin(storage_entries, origin_filters)
        if entry.key == key
    )
    return next(hits, "")
def find_json_storage_value(storage_entries, origin_filters, key, field):
    """Read ``field`` out of a JSON object stored under ``key``.

    Args:
        storage_entries: Entries to search.
        origin_filters: Origin substrings passed on to ``find_storage_value``.
        key: Storage key whose value is expected to hold a JSON object.
        field: Name of the member to extract from that object.

    Returns:
        The member converted with ``str``; an empty string when the key is
        absent, the value is not valid JSON, the JSON is not an object, or
        the member is missing or ``null``.
    """
    raw_value = find_storage_value(storage_entries, origin_filters, key)
    if not raw_value:
        return ""

    try:
        payload = json.loads(raw_value)
    except json.JSONDecodeError:
        return ""

    # Valid JSON may still be a list, string or number; those have no .get().
    if not isinstance(payload, dict):
        return ""

    value = payload.get(field)
    return "" if value is None else str(value)
def list_storage_keys(storage_entries, origin_filters):
    """Return the distinct non-empty keys of origin-matching entries, sorted."""
    distinct = set()
    for entry in storage_entries_for_origin(storage_entries, origin_filters):
        if entry.key:
            distinct.add(entry.key)
    return sorted(distinct)
def read_firefox_local_storage(profile_dir, origin_filter):
    """Read local-storage key/value pairs for origins matching a filter.

    Scans ``<profile>/storage/default/*/ls/data.sqlite`` and merges the
    ``data`` table of every origin directory whose decoded origin contains
    ``origin_filter`` (case-insensitive).

    Args:
        profile_dir: Firefox profile directory (``str`` or ``Path``).
        origin_filter: Substring the decoded origin must contain.

    Returns:
        A dict mapping storage key to value, both as text. Empty when the
        storage directory does not exist or no origin matches. When several
        matching origins define the same key, the value from the origin
        directory that sorts first is kept.
    """
    storage_root = Path(profile_dir) / "storage" / "default"
    if not storage_root.exists():
        return {}

    needle = origin_filter.lower()
    values = {}
    # Sorting makes the result independent of directory listing order, and
    # visiting every match avoids dropping keys that live under a sibling
    # origin of the first one found.
    for ls_path in sorted(storage_root.glob("*/ls/data.sqlite")):
        origin = decode_firefox_origin(ls_path.parents[1].name)
        if needle not in origin.lower():
            continue
        for row in query_sqlite(ls_path, "SELECT key, value FROM data"):
            values.setdefault(
                stringify_sql_value(row[0]),
                stringify_sql_value(row[1]),
            )
    return values
def read_firefox_webapps_store(profile_dir, origin_filter):
    """Read key/value pairs from ``webappsstore.sqlite`` for matching origins.

    Args:
        profile_dir: Firefox profile directory (``str`` or ``Path``).
        origin_filter: Substring to look for in each row's origin
            (case-insensitive).

    Returns:
        A dict mapping storage key to value, both as text. Empty when the
        database file does not exist or no row matches. Later rows overwrite
        earlier rows that share a key.
    """
    webapps_path = Path(profile_dir) / "webappsstore.sqlite"
    if not webapps_path.exists():
        return {}

    needle = origin_filter.lower()
    values = {}
    for row in query_sqlite(
        webapps_path,
        "SELECT originKey, key, value FROM webappsstore2",
    ):
        origin_key = stringify_sql_value(row[0]).lower()
        # Firefox writes originKey as the reversed host followed by
        # ":scheme:port" (e.g. "moc.ocstoc.www.:https:443"), so a filter such
        # as "costco.com" only matches after un-reversing the host part.
        # The raw key is still checked so filters that matched before keep
        # matching.
        host = origin_key.split(":", 1)[0][::-1]
        if needle not in origin_key and needle not in host:
            continue
        values[stringify_sql_value(row[1])] = stringify_sql_value(row[2])
    return values
def query_sqlite(path, query):
copied_path = copy_sqlite_to_temp(path)
@@ -210,7 +107,6 @@ def query_sqlite(path, query):
def copy_sqlite_to_temp(path):
import os, shutil, tempfile
fd, tmp = tempfile.mkstemp(suffix=".sqlite")
os.close(fd)
shutil.copyfile(path, tmp)
@@ -220,14 +116,6 @@ def decode_firefox_origin(raw_origin):
origin = raw_origin.split("^", 1)[0]
return origin.replace("+++", "://")
def origin_matches(origin, origin_filters):
    """Tell whether ``origin`` contains any of the filter substrings.

    Comparison is case-insensitive. An empty or ``None`` filter list matches
    every origin.
    """
    if not origin_filters:
        return True

    haystack = origin.lower()
    for candidate in origin_filters:
        if candidate.lower() in haystack:
            return True
    return False
def stringify_sql_value(value):
if value is None:
return ""

View File

@@ -1,116 +0,0 @@
import os
from dataclasses import dataclass
from dotenv import load_dotenv
from browser_session import (
find_json_storage_value,
find_storage_value,
list_storage_keys,
load_browser_context,
)
COSTCO_STORAGE_ORIGINS = ["costco.com"]
COSTCO_HEADER_FIELDS = [
("costco-x-authorization", "costco-x-authorization"),
("costco-x-wcs-clientId", "costco-x-wcs-clientId"),
("client-identifier", "client-identifier"),
]
COSTCO_JSON_HEADER_KEYS = ["headers", "costco.headers"]
@dataclass
class RetailerSession:
    """Cookies and request headers needed to call one retailer's API."""

    # Cookie container exactly as returned by the browser context loader.
    cookies: object
    # Header name -> header value.
    headers: dict[str, str]
def load_giant_session(browser="firefox", profile_dir=None):
    """Build a ``RetailerSession`` for giantfood.com from browser cookies.

    No headers are derived for this retailer; ``headers`` is always empty.
    """
    giant_domain = "giantfood.com"
    context = load_browser_context(
        browser=browser,
        domain_name=giant_domain,
        storage_origins=[giant_domain],
        profile_dir=profile_dir,
    )
    # Only the cookies are used; the loaded storage entries are ignored.
    return RetailerSession(cookies=context.cookies, headers={})
def load_costco_session(browser="firefox", profile_dir=None):
    """Build a ``RetailerSession`` for Costco from env vars and the browser.

    Header values start from environment variables (after ``load_dotenv``).
    The ``idToken`` and ``clientID`` storage keys, when present and
    non-blank, override the authorization and client-id headers. Headers
    that end up empty are left out of the result.
    """
    load_dotenv()
    env_defaults = (
        ("costco-x-authorization", "COSTCO_X_AUTHORIZATION"),
        ("costco-x-wcs-clientId", "COSTCO_WCS_CLIENT_ID"),
        ("client-identifier", "COSTCO_CLIENT_IDENTIFIER"),
    )
    headers = {name: os.getenv(var, "").strip() for name, var in env_defaults}

    context = load_browser_context(
        browser=browser,
        domain_name=".costco.com",
        storage_origins=["costco.com"],
        profile_dir=profile_dir,
    )

    # Flatten to key -> value; a later entry replaces an earlier one that
    # shares its key.
    stored = {}
    for entry in context.storage_entries:
        stored[entry.key] = entry.value

    id_token = stored.get("idToken", "").strip()
    if id_token:
        if not id_token.startswith("Bearer "):
            id_token = f"Bearer {id_token}"
        headers["costco-x-authorization"] = id_token

    client_id = stored.get("clientID", "").strip()
    if client_id:
        headers["costco-x-wcs-clientId"] = client_id

    populated = {name: value for name, value in headers.items() if value}
    return RetailerSession(cookies=context.cookies, headers=populated)
#def load_costco_session(browser="firefox", profile_dir=None):
# context = load_browser_context(
# browser=browser,
# domain_name=".costco.com",
# storage_origins=COSTCO_STORAGE_ORIGINS,
# profile_dir=profile_dir,
# )
# headers = extract_costco_headers(context.storage_entries)
# missing = [
# header_name for header_name, value in headers.items() if not value
# ]
# if missing:
# available_keys = ", ".join(
# list_storage_keys(context.storage_entries, COSTCO_STORAGE_ORIGINS)
# )
# raise ValueError(
# "missing Costco browser session headers: "
# f"{', '.join(missing)}. "
# f"Available Costco storage keys: {available_keys or '(none)'}"
# )
# return RetailerSession(cookies=context.cookies, headers=headers)
def extract_costco_headers(storage_entries):
    """Resolve every header listed in ``COSTCO_HEADER_FIELDS`` from storage.

    Each header is read from its own storage key first and, when that is
    empty, from a JSON header blob. Headers that cannot be found map to an
    empty string, so the result always has one item per configured header.
    """

    def resolve(header_name, storage_key):
        direct = find_storage_value(
            storage_entries,
            COSTCO_STORAGE_ORIGINS,
            storage_key,
        )
        return direct or find_costco_header_in_json(storage_entries, header_name)

    return {
        header_name: resolve(header_name, storage_key)
        for header_name, storage_key in COSTCO_HEADER_FIELDS
    }
def find_costco_header_in_json(storage_entries, header_name):
    """Look up ``header_name`` inside the JSON blobs in ``COSTCO_JSON_HEADER_KEYS``.

    Blob keys are tried in order and the first non-empty value is returned;
    an empty string means none of them held the header.
    """
    # Generator keeps the lookups lazy: later blob keys are not read once a
    # value has been found.
    candidates = (
        find_json_storage_value(
            storage_entries,
            COSTCO_STORAGE_ORIGINS,
            json_key,
            header_name,
        )
        for json_key in COSTCO_JSON_HEADER_KEYS
    )
    return next((value for value in candidates if value), "")

View File

@@ -3,14 +3,19 @@ import csv
import json
import time
import re
from pathlib import Path
from calendar import monthrange
from datetime import datetime, timedelta
from pathlib import Path
from dotenv import load_dotenv
import click
from curl_cffi import requests
from retailer_sessions import load_costco_session
from browser_session import (
find_firefox_profile_dir,
load_firefox_cookies,
read_firefox_local_storage,
read_firefox_webapps_store,
)
BASE_URL = "https://ecom-api.costco.com/ebusiness/order/v1/orders/graphql"
RETAILER = "costco"
@@ -210,6 +215,18 @@ ITEM_FIELDS = [
"is_coupon_line",
]
COSTCO_STORAGE_ORIGIN = "costco.com"
COSTCO_AUTH_STORAGE_KEY = "costco-x-authorization"
COSTCO_HEADERS_BLOB_KEY = "headers"
def load_config():
    """Return the Costco client ids taken from the environment.

    Calls ``load_dotenv`` first. Each value is whitespace-stripped and is an
    empty string when the variable is unset.
    """
    load_dotenv()
    env_names = {
        "client_id": "COSTCO_X_WCS_CLIENTID",
        "client_identifier": "COSTCO_CLIENT_IDENTIFIER",
    }
    return {
        field: os.getenv(env_name, "").strip()
        for field, env_name in env_names.items()
    }
def build_headers(auth_headers):
headers = {
"accept": "*/*",
@@ -226,11 +243,52 @@ def build_headers(auth_headers):
headers.update(auth_headers)
return headers
def build_session(retailer_session):
def load_costco_browser_headers(profile_dir, client_id, client_identifier):
    """Assemble the three Costco auth headers from Firefox storage.

    The auth token is read from the exact ``COSTCO_AUTH_STORAGE_KEY`` entry
    (local storage first, then the webapps store). Anything still missing
    afterwards, token or client ids, is looked up in the JSON header blob
    stored under ``COSTCO_HEADERS_BLOB_KEY``. Caller-supplied client ids
    always take precedence over blob values.

    Args:
        profile_dir: Firefox profile directory to read from.
        client_id: Value for ``costco-x-wcs-clientId``; may be empty.
        client_identifier: Value for ``client-identifier``; may be empty.

    Returns:
        A dict with ``costco-x-authorization``, ``costco-x-wcs-clientId``
        and ``client-identifier``.

    Raises:
        click.ClickException: If the auth token or either client id cannot
            be determined.
    """
    local_storage = read_firefox_local_storage(profile_dir, COSTCO_STORAGE_ORIGIN)
    webapps_store = read_firefox_webapps_store(profile_dir, COSTCO_STORAGE_ORIGIN)

    def stored(key):
        # Local storage wins; the webapps store is only a fallback.
        return (
            local_storage.get(key, "").strip()
            or webapps_store.get(key, "").strip()
        )

    def blob_field(blob, name):
        # A JSON null must count as missing, not become the string "None".
        value = blob.get(name)
        return "" if value is None else str(value).strip()

    auth_token = stored(COSTCO_AUTH_STORAGE_KEY)

    if not (auth_token and client_id and client_identifier):
        blob_data = {}
        header_blob = stored(COSTCO_HEADERS_BLOB_KEY)
        if header_blob:
            try:
                parsed = json.loads(header_blob)
            except json.JSONDecodeError:
                parsed = None
            # Ignore valid JSON that is not an object (list, string, ...).
            if isinstance(parsed, dict):
                blob_data = parsed

        auth_token = auth_token or blob_field(blob_data, COSTCO_AUTH_STORAGE_KEY)
        client_id = client_id or blob_field(blob_data, "costco-x-wcs-clientId")
        client_identifier = client_identifier or blob_field(
            blob_data, "client-identifier"
        )

    if not auth_token:
        raise click.ClickException(
            "could not find Costco auth token in Firefox session storage"
        )

    if not client_id or not client_identifier:
        raise click.ClickException(
            "missing Costco client ids; set COSTCO_X_WCS_CLIENTID and COSTCO_CLIENT_IDENTIFIER"
        )

    return {
        "costco-x-authorization": auth_token,
        "costco-x-wcs-clientId": client_id,
        "client-identifier": client_identifier,
    }
def build_session(profile_dir, auth_headers):
    """Create an HTTP session carrying Costco cookies and request headers.

    Args:
        profile_dir: Firefox profile directory the cookies are loaded from.
        auth_headers: Auth header dict merged into the session headers.

    Returns:
        A ``requests.Session`` with cookies for ``.costco.com`` and the
        headers produced by ``build_headers``.
    """
    session = requests.Session()
    session.cookies.update(load_firefox_cookies(".costco.com", profile_dir))
    # build_headers takes the auth headers as a required argument; calling it
    # with no argument raises TypeError.
    session.headers.update(build_headers(auth_headers))
    # Applied again last so the auth headers take precedence over anything
    # build_headers sets.
    session.headers.update(auth_headers)
    return session
@@ -594,27 +652,24 @@ def main(
):
outdir = Path(outdir)
raw_dir = outdir / "raw"
if firefox_profile_dir is None:
firefox_profile_dir = next(
(Path(os.getenv("APPDATA")) / "Mozilla" / "Firefox" / "Profiles").iterdir()
)
try:
retailer_session = load_costco_session(
browser="firefox",
profile_dir=firefox_profile_dir,
)
click.echo(
"session bootstrap: "
f"cookies={bool(retailer_session.cookies)}, "
f"authorization={'costco-x-authorization' in retailer_session.headers}, "
f"client_id={'costco-x-wcs-clientId' in retailer_session.headers}, "
f"client_identifier={'client-identifier' in retailer_session.headers}"
)
session = build_session(retailer_session)
except Exception as exc:
raise click.ClickException(
f"failed to load Costco browser session: {exc}"
) from exc
config = load_config()
profile_dir = Path(firefox_profile_dir) if firefox_profile_dir else None
if profile_dir is None:
try:
profile_dir = find_firefox_profile_dir()
except Exception:
profile_dir = click.prompt(
"Firefox profile dir",
type=click.Path(exists=True, file_okay=False, path_type=Path),
)
auth_headers = load_costco_browser_headers(
profile_dir,
client_id=config["client_id"],
client_identifier=config["client_identifier"],
)
session = build_session(profile_dir, auth_headers)
start_date, end_date = resolve_date_range(months_back)

View File

@@ -8,7 +8,7 @@ import click
from dotenv import load_dotenv
from curl_cffi import requests
from retailer_sessions import load_giant_session
from browser_session import find_firefox_profile_dir, load_firefox_cookies
BASE = "https://giantfood.com"
@@ -67,9 +67,9 @@ def load_config():
def build_session():
browser_session = load_giant_session()
profile_dir = find_firefox_profile_dir()
session = requests.Session()
session.cookies.update(browser_session.cookies)
session.cookies.update(load_firefox_cookies("giantfood.com", profile_dir))
session.headers.update(
{
"user-agent": (

View File

@@ -5,12 +5,11 @@ from pathlib import Path
from unittest import mock
import browser_session
import retailer_sessions
import scrape_costco
class BrowserSessionTests(unittest.TestCase):
def test_read_firefox_ls_entries_reads_storage_from_copied_sqlite(self):
def test_read_firefox_local_storage_reads_copied_sqlite(self):
with tempfile.TemporaryDirectory() as tmpdir:
profile_dir = Path(tmpdir) / "abcd.default-release"
ls_dir = profile_dir / "storage" / "default" / "https+++www.costco.com" / "ls"
@@ -24,38 +23,35 @@ class BrowserSessionTests(unittest.TestCase):
("costco-x-wcs-clientId", "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf"),
)
entries = browser_session.read_firefox_storage_entries(
values = browser_session.read_firefox_local_storage(
profile_dir,
origin_filters=["costco.com"],
origin_filter="costco.com",
)
self.assertEqual(1, len(entries))
self.assertEqual("https://www.costco.com", entries[0].origin)
self.assertEqual("costco-x-wcs-clientId", entries[0].key)
self.assertEqual(
"4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
values["costco-x-wcs-clientId"],
)
def test_extract_costco_headers_uses_exact_keys(self):
entries = [
browser_session.StorageEntry(
origin="https://www.costco.com",
key="costco-x-authorization",
value="Bearer header.payload.signature",
source="memory",
),
browser_session.StorageEntry(
origin="https://www.costco.com",
key="costco-x-wcs-clientId",
value="4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
source="memory",
),
browser_session.StorageEntry(
origin="https://www.costco.com",
key="client-identifier",
value="481b1aec-aa3b-454b-b81b-48187e28f205",
source="memory",
),
]
def test_load_costco_browser_headers_reads_exact_auth_key(self):
with tempfile.TemporaryDirectory() as tmpdir:
profile_dir = Path(tmpdir)
storage_dir = profile_dir / "storage" / "default" / "https+++www.costco.com" / "ls"
storage_dir.mkdir(parents=True)
db_path = storage_dir / "data.sqlite"
headers = retailer_sessions.extract_costco_headers(entries)
with sqlite3.connect(db_path) as connection:
connection.execute("CREATE TABLE data (key TEXT, value TEXT)")
connection.execute(
"INSERT INTO data (key, value) VALUES (?, ?)",
("costco-x-authorization", "Bearer header.payload.signature"),
)
headers = scrape_costco.load_costco_browser_headers(
profile_dir,
client_id="4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
client_identifier="481b1aec-aa3b-454b-b81b-48187e28f205",
)
self.assertEqual("Bearer header.payload.signature", headers["costco-x-authorization"])
self.assertEqual(
@@ -67,42 +63,60 @@ class BrowserSessionTests(unittest.TestCase):
headers["client-identifier"],
)
def test_extract_costco_headers_uses_exact_json_header_blob(self):
entries = [
browser_session.StorageEntry(
origin="https://www.costco.com",
key="headers",
value=(
'{"costco-x-authorization":"Bearer header.payload.signature",'
'"costco-x-wcs-clientId":"4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",'
'"client-identifier":"481b1aec-aa3b-454b-b81b-48187e28f205"}'
),
source="memory",
)
]
def test_load_costco_browser_headers_falls_back_to_exact_header_blob(self):
with tempfile.TemporaryDirectory() as tmpdir:
profile_dir = Path(tmpdir)
storage_dir = profile_dir / "storage" / "default" / "https+++www.costco.com" / "ls"
storage_dir.mkdir(parents=True)
db_path = storage_dir / "data.sqlite"
headers = retailer_sessions.extract_costco_headers(entries)
with sqlite3.connect(db_path) as connection:
connection.execute("CREATE TABLE data (key TEXT, value TEXT)")
connection.execute(
"INSERT INTO data (key, value) VALUES (?, ?)",
(
"headers",
'{"costco-x-authorization":"Bearer header.payload.signature"}',
),
)
headers = scrape_costco.load_costco_browser_headers(
profile_dir,
client_id="4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
client_identifier="481b1aec-aa3b-454b-b81b-48187e28f205",
)
self.assertEqual("Bearer header.payload.signature", headers["costco-x-authorization"])
self.assertEqual(
"4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
headers["costco-x-wcs-clientId"],
)
self.assertEqual(
"481b1aec-aa3b-454b-b81b-48187e28f205",
headers["client-identifier"],
)
def test_scrape_costco_prompts_for_profile_dir_when_autodiscovery_fails(self):
with mock.patch.object(
scrape_costco,
"build_session",
side_effect=[FileNotFoundError("no default profile"), object()],
"find_firefox_profile_dir",
side_effect=FileNotFoundError("no default profile"),
), mock.patch.object(
scrape_costco.click,
"prompt",
return_value=Path("/tmp/profile"),
) as mocked_prompt, mock.patch.object(
scrape_costco,
"load_config",
return_value={
"client_id": "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
"client_identifier": "481b1aec-aa3b-454b-b81b-48187e28f205",
},
), mock.patch.object(
scrape_costco,
"load_costco_browser_headers",
return_value={
"costco-x-authorization": "Bearer header.payload.signature",
"costco-x-wcs-clientId": "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
"client-identifier": "481b1aec-aa3b-454b-b81b-48187e28f205",
},
), mock.patch.object(
scrape_costco,
"build_session",
return_value=object(),
), mock.patch.object(
scrape_costco,
"fetch_summary_windows",
return_value=(

View File

@@ -411,6 +411,25 @@ class CostcoPipelineTests(unittest.TestCase):
]
with mock.patch.object(
scrape_costco,
"load_config",
return_value={
"client_id": "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
"client_identifier": "481b1aec-aa3b-454b-b81b-48187e28f205",
},
), mock.patch.object(
scrape_costco,
"find_firefox_profile_dir",
return_value=Path("/tmp/profile"),
), mock.patch.object(
scrape_costco,
"load_costco_browser_headers",
return_value={
"costco-x-authorization": "Bearer header.payload.signature",
"costco-x-wcs-clientId": "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
"client-identifier": "481b1aec-aa3b-454b-b81b-48187e28f205",
},
), mock.patch.object(
scrape_costco, "build_session", return_value=object()
), mock.patch.object(
scrape_costco,