Simplify browser session bootstrap

This commit is contained in:
ben
2026-03-16 17:08:44 -04:00
parent e48dd6c4c2
commit d7a0329332
6 changed files with 184 additions and 324 deletions

View File

@@ -1,47 +1,13 @@
import configparser import configparser
import json
import os import os
import shutil import shutil
import sqlite3 import sqlite3
import tempfile import tempfile
from dataclasses import dataclass
from pathlib import Path from pathlib import Path
import browser_cookie3 import browser_cookie3
@dataclass
class StorageEntry:
    """One key/value pair read from a browser storage database."""

    # Origin the pair belongs to, as produced by the reader that built the entry.
    origin: str
    # Storage key, as text.
    key: str
    # Storage value, as text.
    value: str
    # Where the pair came from; the readers pass the database path here.
    source: str
@dataclass
class BrowserContext:
    """Cookies plus storage entries loaded from one browser profile."""

    # Cookie container returned by the cookie loader; its type is not fixed here.
    cookies: object
    # Storage entries collected for the requested origins.
    storage_entries: list[StorageEntry]
def load_browser_context(
    browser,
    domain_name,
    storage_origins=None,
    profile_dir=None,
):
    """Load cookies and storage entries for one site from a browser profile.

    Only Firefox is supported. When ``profile_dir`` is falsy the profile is
    located with ``find_firefox_profile_dir()``; a missing ``storage_origins``
    is passed on as an empty filter list.

    Raises:
        ValueError: if ``browser`` is anything other than ``"firefox"``.
    """
    if browser != "firefox":
        raise ValueError(f"unsupported browser: {browser}")

    if profile_dir:
        profile = Path(profile_dir)
    else:
        profile = find_firefox_profile_dir()

    origin_filters = storage_origins or []
    # Cookies are read before storage, matching the original call order.
    return BrowserContext(
        cookies=load_firefox_cookies(domain_name, profile),
        storage_entries=read_firefox_storage_entries(
            profile,
            origin_filters=origin_filters,
        ),
    )
def find_firefox_profile_dir(): def find_firefox_profile_dir():
profiles_ini = firefox_profiles_root() / "profiles.ini" profiles_ini = firefox_profiles_root() / "profiles.ini"
parser = configparser.RawConfigParser() parser = configparser.RawConfigParser()
@@ -88,106 +54,37 @@ def load_firefox_cookies(domain_name, profile_dir):
return browser_cookie3.firefox(cookie_file=str(cookie_file), domain_name=domain_name) return browser_cookie3.firefox(cookie_file=str(cookie_file), domain_name=domain_name)
def read_firefox_storage_entries(profile_dir, origin_filters): def read_firefox_local_storage(profile_dir, origin_filter):
profile_dir = Path(profile_dir)
entries = []
entries.extend(read_firefox_ls_entries(profile_dir, origin_filters))
entries.extend(read_firefox_webapps_entries(profile_dir, origin_filters))
deduped = []
seen = set()
for entry in entries:
key = (entry.origin, entry.key, entry.value, entry.source)
if key in seen:
continue
seen.add(key)
deduped.append(entry)
return deduped
def storage_entries_for_origin(storage_entries, origin_filters):
    """Return the entries whose origin passes ``origin_matches``, in input order."""
    matching = []
    for candidate in storage_entries:
        if origin_matches(candidate.origin, origin_filters):
            matching.append(candidate)
    return matching
def find_storage_value(storage_entries, origin_filters, key):
    """Return the value of the first matching-origin entry named ``key``, or ``""``."""
    candidates = storage_entries_for_origin(storage_entries, origin_filters)
    return next(
        (candidate.value for candidate in candidates if candidate.key == key),
        "",
    )
def find_json_storage_value(storage_entries, origin_filters, key, field):
    """Return one field of a JSON object stored under ``key``, as text.

    The raw value is looked up with ``find_storage_value``. An empty string
    is returned when the key is absent or empty, when the value is not valid
    JSON, when the decoded document is not a JSON object, or when the field
    is missing or null.
    """
    raw_value = find_storage_value(storage_entries, origin_filters, key)
    if not raw_value:
        return ""
    try:
        payload = json.loads(raw_value)
    except json.JSONDecodeError:
        return ""
    # Valid JSON is not necessarily an object: a stored list, string or
    # number has no .get(), so treat it like any other unusable blob.
    if not isinstance(payload, dict):
        return ""
    value = payload.get(field)
    return "" if value is None else str(value)
def list_storage_keys(storage_entries, origin_filters):
    """Return the distinct non-empty keys of matching-origin entries, sorted."""
    distinct_keys = set()
    for candidate in storage_entries_for_origin(storage_entries, origin_filters):
        if candidate.key:
            distinct_keys.add(candidate.key)
    return sorted(distinct_keys)
def read_firefox_ls_entries(profile_dir, origin_filters):
entries = []
storage_root = profile_dir / "storage" / "default" storage_root = profile_dir / "storage" / "default"
if not storage_root.exists(): if not storage_root.exists():
return entries return {}
for ls_path in storage_root.glob("*/ls/data.sqlite"): for ls_path in storage_root.glob("*/ls/data.sqlite"):
origin = decode_firefox_origin(ls_path.parents[1].name) origin = decode_firefox_origin(ls_path.parents[1].name)
if not origin_matches(origin, origin_filters): if origin_filter.lower() not in origin.lower():
continue continue
for row in query_sqlite(ls_path, "SELECT key, value FROM data"): return {
entries.append( stringify_sql_value(row[0]): stringify_sql_value(row[1])
StorageEntry( for row in query_sqlite(ls_path, "SELECT key, value FROM data")
origin=origin, }
key=stringify_sql_value(row[0]), return {}
value=stringify_sql_value(row[1]),
source=ls_path.as_posix(),
)
)
return entries
def read_firefox_webapps_entries(profile_dir, origin_filters): def read_firefox_webapps_store(profile_dir, origin_filter):
webapps_path = profile_dir / "webappsstore.sqlite" webapps_path = profile_dir / "webappsstore.sqlite"
if not webapps_path.exists(): if not webapps_path.exists():
return [] return {}
entries = [] values = {}
for row in query_sqlite( for row in query_sqlite(
webapps_path, webapps_path,
"SELECT originKey, key, value FROM webappsstore2", "SELECT originKey, key, value FROM webappsstore2",
): ):
origin = stringify_sql_value(row[0]) origin = stringify_sql_value(row[0])
if not origin_matches(origin, origin_filters): if origin_filter.lower() not in origin.lower():
continue continue
entries.append( values[stringify_sql_value(row[1])] = stringify_sql_value(row[2])
StorageEntry( return values
origin=origin,
key=stringify_sql_value(row[1]),
value=stringify_sql_value(row[2]),
source=webapps_path.as_posix(),
)
)
return entries
def query_sqlite(path, query): def query_sqlite(path, query):
copied_path = copy_sqlite_to_temp(path) copied_path = copy_sqlite_to_temp(path)
@@ -210,7 +107,6 @@ def query_sqlite(path, query):
def copy_sqlite_to_temp(path): def copy_sqlite_to_temp(path):
import os, shutil, tempfile
fd, tmp = tempfile.mkstemp(suffix=".sqlite") fd, tmp = tempfile.mkstemp(suffix=".sqlite")
os.close(fd) os.close(fd)
shutil.copyfile(path, tmp) shutil.copyfile(path, tmp)
@@ -220,14 +116,6 @@ def decode_firefox_origin(raw_origin):
origin = raw_origin.split("^", 1)[0] origin = raw_origin.split("^", 1)[0]
return origin.replace("+++", "://") return origin.replace("+++", "://")
def origin_matches(origin, origin_filters):
    """Report whether ``origin`` contains any of the filters, ignoring case.

    ``origin_filters`` is normally a list of substrings; a single string is
    accepted as a one-element filter list. An empty or missing filter
    matches every origin.
    """
    if not origin_filters:
        return True
    # A bare string would otherwise be iterated character by character, so
    # almost any origin would "match" on a single shared letter.
    if isinstance(origin_filters, str):
        origin_filters = [origin_filters]
    normalized_origin = origin.lower()
    return any(
        filter_value.lower() in normalized_origin
        for filter_value in origin_filters
    )
def stringify_sql_value(value): def stringify_sql_value(value):
if value is None: if value is None:
return "" return ""

View File

@@ -1,116 +0,0 @@
import os
from dataclasses import dataclass
from dotenv import load_dotenv
from browser_session import (
find_json_storage_value,
find_storage_value,
list_storage_keys,
load_browser_context,
)
# Origin filters passed to the storage lookups for Costco entries.
COSTCO_STORAGE_ORIGINS = ["costco.com"]
# (header_name, storage_key) pairs: the header to populate and the storage
# key whose value is looked up for it.
COSTCO_HEADER_FIELDS = [
("costco-x-authorization", "costco-x-authorization"),
("costco-x-wcs-clientId", "costco-x-wcs-clientId"),
("client-identifier", "client-identifier"),
]
# Storage keys whose value is parsed as JSON and searched for a header field
# when nothing is stored directly under the header's own key.
COSTCO_JSON_HEADER_KEYS = ["headers", "costco.headers"]
@dataclass
class RetailerSession:
    """Cookies and extra request headers for one retailer."""

    # Cookie container taken from the loaded browser context.
    cookies: object
    # Header name -> header value.
    headers: dict[str, str]
def load_giant_session(browser="firefox", profile_dir=None):
    """Build a Giant Food session carrying browser cookies and no extra headers."""
    giant_domain = "giantfood.com"
    context = load_browser_context(
        browser=browser,
        domain_name=giant_domain,
        storage_origins=[giant_domain],
        profile_dir=profile_dir,
    )
    return RetailerSession(cookies=context.cookies, headers={})
def load_costco_session(browser="firefox", profile_dir=None):
    """Build a Costco session from browser cookies, browser storage and env vars.

    Header values start from the ``COSTCO_X_AUTHORIZATION``,
    ``COSTCO_WCS_CLIENT_ID`` and ``COSTCO_CLIENT_IDENTIFIER`` environment
    variables (read after ``load_dotenv()``). A non-empty ``idToken`` or
    ``clientID`` found in Costco browser storage overrides the corresponding
    environment value. Headers that end up empty are left out of the result.
    """
    load_dotenv()

    def env_value(name):
        return os.getenv(name, "").strip()

    authorization = env_value("COSTCO_X_AUTHORIZATION")
    wcs_client_id = env_value("COSTCO_WCS_CLIENT_ID")
    client_identifier = env_value("COSTCO_CLIENT_IDENTIFIER")

    context = load_browser_context(
        browser=browser,
        domain_name=".costco.com",
        storage_origins=["costco.com"],
        profile_dir=profile_dir,
    )

    # Later entries win when the same key appears more than once.
    stored = {}
    for entry in context.storage_entries:
        stored[entry.key] = entry.value

    id_token = stored.get("idToken", "").strip()
    stored_client_id = stored.get("clientID", "").strip()

    if id_token:
        # Add the "Bearer " prefix only when the stored token lacks it.
        if id_token.startswith("Bearer "):
            authorization = id_token
        else:
            authorization = f"Bearer {id_token}"
    if stored_client_id:
        wcs_client_id = stored_client_id

    candidates = (
        ("costco-x-authorization", authorization),
        ("costco-x-wcs-clientId", wcs_client_id),
        ("client-identifier", client_identifier),
    )
    headers = {name: value for name, value in candidates if value}
    return RetailerSession(cookies=context.cookies, headers=headers)
#def load_costco_session(browser="firefox", profile_dir=None):
# context = load_browser_context(
# browser=browser,
# domain_name=".costco.com",
# storage_origins=COSTCO_STORAGE_ORIGINS,
# profile_dir=profile_dir,
# )
# headers = extract_costco_headers(context.storage_entries)
# missing = [
# header_name for header_name, value in headers.items() if not value
# ]
# if missing:
# available_keys = ", ".join(
# list_storage_keys(context.storage_entries, COSTCO_STORAGE_ORIGINS)
# )
# raise ValueError(
# "missing Costco browser session headers: "
# f"{', '.join(missing)}. "
# f"Available Costco storage keys: {available_keys or '(none)'}"
# )
# return RetailerSession(cookies=context.cookies, headers=headers)
def extract_costco_headers(storage_entries):
    """Resolve every Costco header listed in ``COSTCO_HEADER_FIELDS``.

    Each header is first looked up under its own storage key; when that
    yields nothing, the JSON header blobs are consulted instead. Every header
    name is present in the result, mapped to ``""`` when nothing was found.
    """
    def resolve(header_name, storage_key):
        direct = find_storage_value(
            storage_entries,
            COSTCO_STORAGE_ORIGINS,
            storage_key,
        )
        # Fall back to the JSON blobs only when the direct lookup is empty.
        return direct or find_costco_header_in_json(storage_entries, header_name)

    return {
        header_name: resolve(header_name, storage_key)
        for header_name, storage_key in COSTCO_HEADER_FIELDS
    }
def find_costco_header_in_json(storage_entries, header_name):
    """Return ``header_name`` from the first JSON header blob that has it, else ``""``."""
    # Lazy generator: blobs are consulted in order and only until one yields
    # a non-empty value.
    lookups = (
        find_json_storage_value(
            storage_entries,
            COSTCO_STORAGE_ORIGINS,
            blob_key,
            header_name,
        )
        for blob_key in COSTCO_JSON_HEADER_KEYS
    )
    return next((found for found in lookups if found), "")

View File

@@ -3,14 +3,19 @@ import csv
import json import json
import time import time
import re import re
from pathlib import Path
from calendar import monthrange from calendar import monthrange
from datetime import datetime, timedelta from datetime import datetime, timedelta
from pathlib import Path
from dotenv import load_dotenv from dotenv import load_dotenv
import click import click
from curl_cffi import requests from curl_cffi import requests
from retailer_sessions import load_costco_session from browser_session import (
find_firefox_profile_dir,
load_firefox_cookies,
read_firefox_local_storage,
read_firefox_webapps_store,
)
BASE_URL = "https://ecom-api.costco.com/ebusiness/order/v1/orders/graphql" BASE_URL = "https://ecom-api.costco.com/ebusiness/order/v1/orders/graphql"
RETAILER = "costco" RETAILER = "costco"
@@ -210,6 +215,18 @@ ITEM_FIELDS = [
"is_coupon_line", "is_coupon_line",
] ]
# Origin filter passed to the Firefox storage readers for Costco data.
COSTCO_STORAGE_ORIGIN = "costco.com"
# Storage key looked up first for the authorization header value.
COSTCO_AUTH_STORAGE_KEY = "costco-x-authorization"
# Storage key of the JSON blob consulted when the direct key yields nothing.
COSTCO_HEADERS_BLOB_KEY = "headers"
def load_config():
    """Read the Costco client ids from the environment after ``load_dotenv()``.

    Returns a dict with ``client_id`` and ``client_identifier``; a variable
    that is unset yields ``""``, and surrounding whitespace is stripped.
    """
    load_dotenv()
    env_names = {
        "client_id": "COSTCO_X_WCS_CLIENTID",
        "client_identifier": "COSTCO_CLIENT_IDENTIFIER",
    }
    return {
        field: os.getenv(env_name, "").strip()
        for field, env_name in env_names.items()
    }
def build_headers(auth_headers): def build_headers(auth_headers):
headers = { headers = {
"accept": "*/*", "accept": "*/*",
@@ -226,11 +243,52 @@ def build_headers(auth_headers):
headers.update(auth_headers) headers.update(auth_headers)
return headers return headers
def build_session(retailer_session):
def load_costco_browser_headers(profile_dir, client_id, client_identifier):
    """Assemble the Costco auth headers from Firefox storage and configured ids.

    The auth token is taken from the ``costco-x-authorization`` storage key,
    preferring local storage over the webapps store. When neither has it, the
    JSON blob stored under ``headers`` is consulted; that blob may also supply
    ``client_id`` / ``client_identifier`` when the caller passed empty values.

    Args:
        profile_dir: Firefox profile directory handed to the storage readers.
        client_id: value for ``costco-x-wcs-clientId``; may be empty.
        client_identifier: value for ``client-identifier``; may be empty.

    Returns:
        Dict with the three Costco request headers.

    Raises:
        click.ClickException: if no auth token is found, or if either client
            id is still empty after the blob fallback.
    """
    local_storage = read_firefox_local_storage(profile_dir, COSTCO_STORAGE_ORIGIN)
    webapps_store = read_firefox_webapps_store(profile_dir, COSTCO_STORAGE_ORIGIN)

    def stored_value(storage_key):
        # Local storage wins; the webapps store is only a fallback.
        return (
            local_storage.get(storage_key, "").strip()
            or webapps_store.get(storage_key, "").strip()
        )

    def blob_text(blob, field):
        # A JSON null (or a missing field) must read as "absent", not as the
        # literal text "None", which would slip past the emptiness checks.
        raw = blob.get(field)
        return "" if raw is None else str(raw).strip()

    auth_token = stored_value(COSTCO_AUTH_STORAGE_KEY)

    if not auth_token:
        header_blob = stored_value(COSTCO_HEADERS_BLOB_KEY)
        if header_blob:
            try:
                blob_data = json.loads(header_blob)
            except json.JSONDecodeError:
                blob_data = {}
            # Valid JSON need not be an object; a list or scalar has no
            # fields to read, so treat it as an empty blob.
            if not isinstance(blob_data, dict):
                blob_data = {}
            auth_token = blob_text(blob_data, COSTCO_AUTH_STORAGE_KEY)
            # Caller-supplied ids take precedence over the blob's.
            client_id = client_id or blob_text(blob_data, "costco-x-wcs-clientId")
            client_identifier = client_identifier or blob_text(
                blob_data, "client-identifier"
            )

    if not auth_token:
        raise click.ClickException(
            "could not find Costco auth token in Firefox session storage"
        )
    if not client_id or not client_identifier:
        raise click.ClickException(
            "missing Costco client ids; set COSTCO_X_WCS_CLIENTID and COSTCO_CLIENT_IDENTIFIER"
        )

    return {
        "costco-x-authorization": auth_token,
        "costco-x-wcs-clientId": client_id,
        "client-identifier": client_identifier,
    }
def build_session(profile_dir, auth_headers):
session = requests.Session() session = requests.Session()
session.cookies.update(retailer_session.cookies) session.cookies.update(load_firefox_cookies(".costco.com", profile_dir))
session.headers.update(build_headers()) session.headers.update(build_headers())
session.headers.update(retailer_session.headers) session.headers.update(auth_headers)
return session return session
@@ -594,27 +652,24 @@ def main(
): ):
outdir = Path(outdir) outdir = Path(outdir)
raw_dir = outdir / "raw" raw_dir = outdir / "raw"
if firefox_profile_dir is None: config = load_config()
firefox_profile_dir = next(
(Path(os.getenv("APPDATA")) / "Mozilla" / "Firefox" / "Profiles").iterdir() profile_dir = Path(firefox_profile_dir) if firefox_profile_dir else None
) if profile_dir is None:
try: try:
retailer_session = load_costco_session( profile_dir = find_firefox_profile_dir()
browser="firefox", except Exception:
profile_dir=firefox_profile_dir, profile_dir = click.prompt(
"Firefox profile dir",
type=click.Path(exists=True, file_okay=False, path_type=Path),
) )
click.echo(
"session bootstrap: " auth_headers = load_costco_browser_headers(
f"cookies={bool(retailer_session.cookies)}, " profile_dir,
f"authorization={'costco-x-authorization' in retailer_session.headers}, " client_id=config["client_id"],
f"client_id={'costco-x-wcs-clientId' in retailer_session.headers}, " client_identifier=config["client_identifier"],
f"client_identifier={'client-identifier' in retailer_session.headers}"
) )
session = build_session(retailer_session) session = build_session(profile_dir, auth_headers)
except Exception as exc:
raise click.ClickException(
f"failed to load Costco browser session: {exc}"
) from exc
start_date, end_date = resolve_date_range(months_back) start_date, end_date = resolve_date_range(months_back)

View File

@@ -8,7 +8,7 @@ import click
from dotenv import load_dotenv from dotenv import load_dotenv
from curl_cffi import requests from curl_cffi import requests
from retailer_sessions import load_giant_session from browser_session import find_firefox_profile_dir, load_firefox_cookies
BASE = "https://giantfood.com" BASE = "https://giantfood.com"
@@ -67,9 +67,9 @@ def load_config():
def build_session(): def build_session():
browser_session = load_giant_session() profile_dir = find_firefox_profile_dir()
session = requests.Session() session = requests.Session()
session.cookies.update(browser_session.cookies) session.cookies.update(load_firefox_cookies("giantfood.com", profile_dir))
session.headers.update( session.headers.update(
{ {
"user-agent": ( "user-agent": (

View File

@@ -5,12 +5,11 @@ from pathlib import Path
from unittest import mock from unittest import mock
import browser_session import browser_session
import retailer_sessions
import scrape_costco import scrape_costco
class BrowserSessionTests(unittest.TestCase): class BrowserSessionTests(unittest.TestCase):
def test_read_firefox_ls_entries_reads_storage_from_copied_sqlite(self): def test_read_firefox_local_storage_reads_copied_sqlite(self):
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
profile_dir = Path(tmpdir) / "abcd.default-release" profile_dir = Path(tmpdir) / "abcd.default-release"
ls_dir = profile_dir / "storage" / "default" / "https+++www.costco.com" / "ls" ls_dir = profile_dir / "storage" / "default" / "https+++www.costco.com" / "ls"
@@ -24,38 +23,35 @@ class BrowserSessionTests(unittest.TestCase):
("costco-x-wcs-clientId", "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf"), ("costco-x-wcs-clientId", "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf"),
) )
entries = browser_session.read_firefox_storage_entries( values = browser_session.read_firefox_local_storage(
profile_dir, profile_dir,
origin_filters=["costco.com"], origin_filter="costco.com",
) )
self.assertEqual(1, len(entries)) self.assertEqual(
self.assertEqual("https://www.costco.com", entries[0].origin) "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
self.assertEqual("costco-x-wcs-clientId", entries[0].key) values["costco-x-wcs-clientId"],
)
def test_extract_costco_headers_uses_exact_keys(self): def test_load_costco_browser_headers_reads_exact_auth_key(self):
entries = [ with tempfile.TemporaryDirectory() as tmpdir:
browser_session.StorageEntry( profile_dir = Path(tmpdir)
origin="https://www.costco.com", storage_dir = profile_dir / "storage" / "default" / "https+++www.costco.com" / "ls"
key="costco-x-authorization", storage_dir.mkdir(parents=True)
value="Bearer header.payload.signature", db_path = storage_dir / "data.sqlite"
source="memory",
),
browser_session.StorageEntry(
origin="https://www.costco.com",
key="costco-x-wcs-clientId",
value="4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
source="memory",
),
browser_session.StorageEntry(
origin="https://www.costco.com",
key="client-identifier",
value="481b1aec-aa3b-454b-b81b-48187e28f205",
source="memory",
),
]
headers = retailer_sessions.extract_costco_headers(entries) with sqlite3.connect(db_path) as connection:
connection.execute("CREATE TABLE data (key TEXT, value TEXT)")
connection.execute(
"INSERT INTO data (key, value) VALUES (?, ?)",
("costco-x-authorization", "Bearer header.payload.signature"),
)
headers = scrape_costco.load_costco_browser_headers(
profile_dir,
client_id="4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
client_identifier="481b1aec-aa3b-454b-b81b-48187e28f205",
)
self.assertEqual("Bearer header.payload.signature", headers["costco-x-authorization"]) self.assertEqual("Bearer header.payload.signature", headers["costco-x-authorization"])
self.assertEqual( self.assertEqual(
@@ -67,42 +63,60 @@ class BrowserSessionTests(unittest.TestCase):
headers["client-identifier"], headers["client-identifier"],
) )
def test_extract_costco_headers_uses_exact_json_header_blob(self): def test_load_costco_browser_headers_falls_back_to_exact_header_blob(self):
entries = [ with tempfile.TemporaryDirectory() as tmpdir:
browser_session.StorageEntry( profile_dir = Path(tmpdir)
origin="https://www.costco.com", storage_dir = profile_dir / "storage" / "default" / "https+++www.costco.com" / "ls"
key="headers", storage_dir.mkdir(parents=True)
value=( db_path = storage_dir / "data.sqlite"
'{"costco-x-authorization":"Bearer header.payload.signature",'
'"costco-x-wcs-clientId":"4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",'
'"client-identifier":"481b1aec-aa3b-454b-b81b-48187e28f205"}'
),
source="memory",
)
]
headers = retailer_sessions.extract_costco_headers(entries) with sqlite3.connect(db_path) as connection:
connection.execute("CREATE TABLE data (key TEXT, value TEXT)")
connection.execute(
"INSERT INTO data (key, value) VALUES (?, ?)",
(
"headers",
'{"costco-x-authorization":"Bearer header.payload.signature"}',
),
)
headers = scrape_costco.load_costco_browser_headers(
profile_dir,
client_id="4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
client_identifier="481b1aec-aa3b-454b-b81b-48187e28f205",
)
self.assertEqual("Bearer header.payload.signature", headers["costco-x-authorization"]) self.assertEqual("Bearer header.payload.signature", headers["costco-x-authorization"])
self.assertEqual(
"4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
headers["costco-x-wcs-clientId"],
)
self.assertEqual(
"481b1aec-aa3b-454b-b81b-48187e28f205",
headers["client-identifier"],
)
def test_scrape_costco_prompts_for_profile_dir_when_autodiscovery_fails(self): def test_scrape_costco_prompts_for_profile_dir_when_autodiscovery_fails(self):
with mock.patch.object( with mock.patch.object(
scrape_costco, scrape_costco,
"build_session", "find_firefox_profile_dir",
side_effect=[FileNotFoundError("no default profile"), object()], side_effect=FileNotFoundError("no default profile"),
), mock.patch.object( ), mock.patch.object(
scrape_costco.click, scrape_costco.click,
"prompt", "prompt",
return_value=Path("/tmp/profile"), return_value=Path("/tmp/profile"),
) as mocked_prompt, mock.patch.object( ) as mocked_prompt, mock.patch.object(
scrape_costco,
"load_config",
return_value={
"client_id": "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
"client_identifier": "481b1aec-aa3b-454b-b81b-48187e28f205",
},
), mock.patch.object(
scrape_costco,
"load_costco_browser_headers",
return_value={
"costco-x-authorization": "Bearer header.payload.signature",
"costco-x-wcs-clientId": "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
"client-identifier": "481b1aec-aa3b-454b-b81b-48187e28f205",
},
), mock.patch.object(
scrape_costco,
"build_session",
return_value=object(),
), mock.patch.object(
scrape_costco, scrape_costco,
"fetch_summary_windows", "fetch_summary_windows",
return_value=( return_value=(

View File

@@ -411,6 +411,25 @@ class CostcoPipelineTests(unittest.TestCase):
] ]
with mock.patch.object( with mock.patch.object(
scrape_costco,
"load_config",
return_value={
"client_id": "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
"client_identifier": "481b1aec-aa3b-454b-b81b-48187e28f205",
},
), mock.patch.object(
scrape_costco,
"find_firefox_profile_dir",
return_value=Path("/tmp/profile"),
), mock.patch.object(
scrape_costco,
"load_costco_browser_headers",
return_value={
"costco-x-authorization": "Bearer header.payload.signature",
"costco-x-wcs-clientId": "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
"client-identifier": "481b1aec-aa3b-454b-b81b-48187e28f205",
},
), mock.patch.object(
scrape_costco, "build_session", return_value=object() scrape_costco, "build_session", return_value=object()
), mock.patch.object( ), mock.patch.object(
scrape_costco, scrape_costco,