Simplify Costco browser header extraction
This commit is contained in:
@@ -1,4 +1,5 @@
|
|||||||
import configparser
|
import configparser
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import sqlite3
|
import sqlite3
|
||||||
@@ -104,6 +105,45 @@ def read_firefox_storage_entries(profile_dir, origin_filters):
|
|||||||
return deduped
|
return deduped
|
||||||
|
|
||||||
|
|
||||||
|
def storage_entries_for_origin(storage_entries, origin_filters):
    """Return the storage entries whose origin matches ``origin_filters``.

    Args:
        storage_entries: Iterable of entries exposing an ``origin`` attribute.
        origin_filters: Filters passed unchanged to ``origin_matches``.

    Returns:
        A new list holding the matching entries in their original order.
    """
    return [
        entry
        for entry in storage_entries
        if origin_matches(entry.origin, origin_filters)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def find_storage_value(storage_entries, origin_filters, key):
    """Return the value stored under exactly ``key`` for a matching origin.

    Args:
        storage_entries: Iterable of entries exposing ``origin``, ``key`` and
            ``value`` attributes.
        origin_filters: Filters used to restrict entries by origin.
        key: Storage key to look up; compared with ``==`` (no normalization).

    Returns:
        The ``value`` of the first matching entry, or ``""`` when no entry
        for the filtered origins carries that key.
    """
    for entry in storage_entries_for_origin(storage_entries, origin_filters):
        if entry.key == key:
            return entry.value
    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def find_json_storage_value(storage_entries, origin_filters, key, field):
    """Return one field of a JSON object stored under ``key``, as a string.

    Args:
        storage_entries: Iterable of entries exposing ``origin``, ``key`` and
            ``value`` attributes.
        origin_filters: Filters used to restrict entries by origin.
        key: Storage key whose value is expected to hold JSON text.
        field: Top-level member of the decoded JSON object to read.

    Returns:
        ``str(payload[field])`` when the stored value decodes to a JSON
        object containing a non-null ``field``; ``""`` when the key is
        missing or empty, the text is not valid JSON, the decoded JSON is
        not an object, or the field is absent or null.
    """
    raw_value = find_storage_value(storage_entries, origin_filters, key)
    if not raw_value:
        return ""
    try:
        payload = json.loads(raw_value)
    except json.JSONDecodeError:
        return ""
    # Valid JSON is not necessarily an object: arrays, strings and numbers
    # have no ``.get`` and would otherwise raise AttributeError here.
    if not isinstance(payload, dict):
        return ""
    value = payload.get(field, "")
    if value is None:
        return ""
    return str(value)
|
||||||
|
|
||||||
|
|
||||||
|
def list_storage_keys(storage_entries, origin_filters):
    """Return the sorted, de-duplicated storage keys for matching origins.

    Args:
        storage_entries: Iterable of entries exposing ``origin`` and ``key``
            attributes.
        origin_filters: Filters used to restrict entries by origin.

    Returns:
        A sorted list of the distinct truthy keys; entries with an empty
        key are left out.
    """
    return sorted(
        {
            entry.key
            for entry in storage_entries_for_origin(storage_entries, origin_filters)
            if entry.key
        }
    )
|
||||||
|
|
||||||
|
|
||||||
def read_firefox_ls_entries(profile_dir, origin_filters):
|
def read_firefox_ls_entries(profile_dir, origin_filters):
|
||||||
entries = []
|
entries = []
|
||||||
storage_root = profile_dir / "storage" / "default"
|
storage_root = profile_dir / "storage" / "default"
|
||||||
|
|||||||
@@ -1,18 +1,20 @@
|
|||||||
import json
|
|
||||||
import re
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
from browser_session import load_browser_context
|
from browser_session import (
|
||||||
|
find_json_storage_value,
|
||||||
|
find_storage_value,
|
||||||
UUID_RE = re.compile(
|
list_storage_keys,
|
||||||
r"^[0-9a-fA-F]{8}-"
|
load_browser_context,
|
||||||
r"[0-9a-fA-F]{4}-"
|
|
||||||
r"[0-9a-fA-F]{4}-"
|
|
||||||
r"[0-9a-fA-F]{4}-"
|
|
||||||
r"[0-9a-fA-F]{12}$"
|
|
||||||
)
|
)
|
||||||
JWT_RE = re.compile(r"^[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+$")
|
|
||||||
|
|
||||||
|
# Origin filters applied to every Costco browser-storage lookup.
COSTCO_STORAGE_ORIGINS = ["costco.com"]

# (header name, storage key) pairs; each header is read from the storage
# entry whose key is exactly the second element.
COSTCO_HEADER_FIELDS = [
    ("costco-x-authorization", "costco-x-authorization"),
    ("costco-x-wcs-clientId", "costco-x-wcs-clientId"),
    ("client-identifier", "client-identifier"),
]

# Storage keys whose JSON value is consulted, in this order, for a header
# that has no dedicated storage entry.
COSTCO_JSON_HEADER_KEYS = ["headers", "costco.headers"]
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -35,102 +37,47 @@ def load_costco_session(browser="firefox", profile_dir=None):
|
|||||||
context = load_browser_context(
|
context = load_browser_context(
|
||||||
browser=browser,
|
browser=browser,
|
||||||
domain_name=".costco.com",
|
domain_name=".costco.com",
|
||||||
storage_origins=["costco.com"],
|
storage_origins=COSTCO_STORAGE_ORIGINS,
|
||||||
profile_dir=profile_dir,
|
profile_dir=profile_dir,
|
||||||
)
|
)
|
||||||
return RetailerSession(
|
headers = extract_costco_headers(context.storage_entries)
|
||||||
cookies=context.cookies,
|
missing = [
|
||||||
headers=extract_costco_headers(context.storage_entries),
|
header_name for header_name, value in headers.items() if not value
|
||||||
)
|
]
|
||||||
|
if missing:
|
||||||
|
available_keys = ", ".join(
|
||||||
|
list_storage_keys(context.storage_entries, COSTCO_STORAGE_ORIGINS)
|
||||||
|
)
|
||||||
|
raise ValueError(
|
||||||
|
"missing Costco browser session headers: "
|
||||||
|
f"{', '.join(missing)}. "
|
||||||
|
f"Available Costco storage keys: {available_keys or '(none)'}"
|
||||||
|
)
|
||||||
|
return RetailerSession(cookies=context.cookies, headers=headers)
|
||||||
|
|
||||||
|
|
||||||
def extract_costco_headers(storage_entries):
|
def extract_costco_headers(storage_entries):
|
||||||
authorization = ""
|
|
||||||
client_id = ""
|
|
||||||
client_identifier = ""
|
|
||||||
|
|
||||||
for key_path, value in iter_storage_candidates(storage_entries):
|
|
||||||
normalized_key = normalize_key(key_path)
|
|
||||||
normalized_value = str(value).strip()
|
|
||||||
if not normalized_value:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if not authorization and looks_like_authorization(normalized_key, normalized_value):
|
|
||||||
authorization = normalize_authorization(normalized_value)
|
|
||||||
continue
|
|
||||||
if not client_identifier and looks_like_client_identifier(
|
|
||||||
normalized_key, normalized_value
|
|
||||||
):
|
|
||||||
client_identifier = normalized_value
|
|
||||||
continue
|
|
||||||
if not client_id and looks_like_client_id(normalized_key, normalized_value):
|
|
||||||
client_id = normalized_value
|
|
||||||
|
|
||||||
headers = {}
|
headers = {}
|
||||||
if authorization:
|
for header_name, storage_key in COSTCO_HEADER_FIELDS:
|
||||||
headers["costco-x-authorization"] = authorization
|
value = find_storage_value(
|
||||||
if client_id:
|
storage_entries,
|
||||||
headers["costco-x-wcs-clientId"] = client_id
|
COSTCO_STORAGE_ORIGINS,
|
||||||
if client_identifier:
|
storage_key,
|
||||||
headers["client-identifier"] = client_identifier
|
)
|
||||||
|
if not value:
|
||||||
|
value = find_costco_header_in_json(storage_entries, header_name)
|
||||||
|
headers[header_name] = value
|
||||||
return headers
|
return headers
|
||||||
|
|
||||||
|
|
||||||
def iter_storage_candidates(storage_entries):
|
def find_costco_header_in_json(storage_entries, header_name):
|
||||||
for entry in storage_entries:
|
for json_key in COSTCO_JSON_HEADER_KEYS:
|
||||||
yield entry.key, entry.value
|
value = find_json_storage_value(
|
||||||
yield from walk_candidate_value(entry.key, parse_json_value(entry.value))
|
storage_entries,
|
||||||
|
COSTCO_STORAGE_ORIGINS,
|
||||||
|
json_key,
|
||||||
def walk_candidate_value(prefix, value):
|
header_name,
|
||||||
if isinstance(value, dict):
|
)
|
||||||
for key, nested in value.items():
|
if value:
|
||||||
nested_prefix = f"{prefix}.{key}"
|
return value
|
||||||
yield nested_prefix, nested
|
|
||||||
yield from walk_candidate_value(nested_prefix, nested)
|
|
||||||
elif isinstance(value, list):
|
|
||||||
for index, nested in enumerate(value):
|
|
||||||
nested_prefix = f"{prefix}[{index}]"
|
|
||||||
yield nested_prefix, nested
|
|
||||||
yield from walk_candidate_value(nested_prefix, nested)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_json_value(value):
    """Decode ``value`` as JSON when it looks like a JSON object or array.

    Args:
        value: Any object; only strings are considered for decoding.

    Returns:
        The decoded JSON when ``value`` is a string whose stripped text
        starts with ``{`` or ``[`` and parses successfully; otherwise
        ``value`` itself, unchanged (including its original whitespace).
    """
    if not isinstance(value, str):
        return value
    text = value.strip()
    # An empty string never starts with a bracket, so this also covers "".
    if not text.startswith(("{", "[")):
        return value
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        return value
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_key(value):
    """Return ``value`` lower-cased with every non ``[a-z0-9]`` run removed.

    Args:
        value: Key or key path string to normalize.

    Returns:
        The lower-cased string reduced to ASCII letters and digits, so
        differently punctuated or cased spellings of a key compare equal.
    """
    return re.sub(r"[^a-z0-9]+", "", value.lower())
|
|
||||||
|
|
||||||
|
|
||||||
def looks_like_authorization(key, value):
    """Return whether a storage candidate looks like an authorization token.

    Args:
        key: Key text; matched by substring, so callers are expected to
            pass an already normalized key.
        value: Candidate value, validated via ``normalize_authorization``.

    Returns:
        ``True`` when ``key`` contains ``"authorization"`` or ``"token"``
        and ``normalize_authorization(value)`` yields a non-empty string.
    """
    return (
        ("authorization" in key or "token" in key)
        and bool(normalize_authorization(value))
    )
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_authorization(value):
    """Return ``value`` as a ``"Bearer <token>"`` header value, or ``""``.

    Args:
        value: Candidate token; converted with ``str`` and stripped.

    Returns:
        ``"Bearer <token>"`` when the candidate is either a bare token
        matching ``JWT_RE`` or such a token prefixed with ``bearer ``
        (case-insensitive); ``""`` for anything else.
    """
    candidate = str(value).strip()
    if candidate.lower().startswith("bearer "):
        # The prefix check guarantees whitespace followed by more text, so
        # the split always produces a second element.
        token = candidate.split(None, 1)[1].strip()
        return f"Bearer {token}" if JWT_RE.match(token) else ""
    if JWT_RE.match(candidate):
        return f"Bearer {candidate}"
    return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
def looks_like_client_id(key, value):
    """Return whether a storage candidate looks like a client id.

    Args:
        key: Key text; matched by substring, so callers are expected to
            pass an already normalized key.
        value: Candidate value string.

    Returns:
        ``True`` when ``key`` contains ``"clientid"`` but not
        ``"identifier"`` (which would also match ``clientidentifier``
        keys) and ``value`` matches ``UUID_RE``.
    """
    return "clientid" in key and "identifier" not in key and bool(UUID_RE.match(value))
|
|
||||||
|
|
||||||
|
|
||||||
def looks_like_client_identifier(key, value):
    """Return whether a storage candidate looks like a client identifier.

    Args:
        key: Key text; matched by substring, so callers are expected to
            pass an already normalized key.
        value: Candidate value string.

    Returns:
        ``True`` when ``key`` contains ``"clientidentifier"`` and ``value``
        matches ``UUID_RE``.
    """
    return "clientidentifier" in key and bool(UUID_RE.match(value))
|
|
||||||
|
|||||||
@@ -225,27 +225,8 @@ def build_headers(auth_headers):
|
|||||||
headers.update(auth_headers)
|
headers.update(auth_headers)
|
||||||
return headers
|
return headers
|
||||||
|
|
||||||
def build_session():
|
def build_session(profile_dir=None):
|
||||||
retailer_session = load_costco_session()
|
retailer_session = load_costco_session(profile_dir=profile_dir)
|
||||||
click.echo(
|
|
||||||
"session bootstrap: "
|
|
||||||
f"cookies={bool(retailer_session.cookies)}, "
|
|
||||||
f"authorization={'costco-x-authorization' in retailer_session.headers}, "
|
|
||||||
f"client_id={'costco-x-wcs-clientId' in retailer_session.headers}, "
|
|
||||||
f"client_identifier={'client-identifier' in retailer_session.headers}"
|
|
||||||
)
|
|
||||||
|
|
||||||
auth = retailer_session.headers.get("costco-x-authorization", "")
|
|
||||||
if auth:
|
|
||||||
click.echo(
|
|
||||||
f"auth prefix ok={auth.startswith('Bearer ')} len={len(auth)} token_prefix={auth[:24]}"
|
|
||||||
)
|
|
||||||
|
|
||||||
click.echo(
|
|
||||||
"header values: "
|
|
||||||
f"client_id={retailer_session.headers.get('costco-x-wcs-clientId', '')} "
|
|
||||||
f"client_identifier={retailer_session.headers.get('client-identifier', '')}"
|
|
||||||
)
|
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
session.cookies.update(retailer_session.cookies)
|
session.cookies.update(retailer_session.cookies)
|
||||||
session.headers.update(build_headers(retailer_session.headers))
|
session.headers.update(build_headers(retailer_session.headers))
|
||||||
@@ -597,15 +578,38 @@ def write_csv(path, rows, fieldnames):
|
|||||||
type=int,
|
type=int,
|
||||||
help="How many months of receipts to enumerate back from today.",
|
help="How many months of receipts to enumerate back from today.",
|
||||||
)
|
)
|
||||||
def main(outdir, document_type, document_sub_type, window_days, months_back):
|
@click.option(
|
||||||
|
"--firefox-profile-dir",
|
||||||
|
default=None,
|
||||||
|
help="Firefox profile directory to use for cookies and session storage.",
|
||||||
|
)
|
||||||
|
def main(
|
||||||
|
outdir,
|
||||||
|
document_type,
|
||||||
|
document_sub_type,
|
||||||
|
window_days,
|
||||||
|
months_back,
|
||||||
|
firefox_profile_dir,
|
||||||
|
):
|
||||||
outdir = Path(outdir)
|
outdir = Path(outdir)
|
||||||
raw_dir = outdir / "raw"
|
raw_dir = outdir / "raw"
|
||||||
try:
|
try:
|
||||||
session = build_session()
|
session = build_session(profile_dir=firefox_profile_dir)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
raise click.ClickException(
|
if firefox_profile_dir:
|
||||||
f"failed to load Costco browser session: {exc}"
|
raise click.ClickException(
|
||||||
) from exc
|
f"failed to load Costco browser session: {exc}"
|
||||||
|
) from exc
|
||||||
|
prompted_profile = click.prompt(
|
||||||
|
"Firefox profile dir",
|
||||||
|
type=click.Path(exists=True, file_okay=False, path_type=Path),
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
session = build_session(profile_dir=prompted_profile)
|
||||||
|
except Exception as prompt_exc:
|
||||||
|
raise click.ClickException(
|
||||||
|
f"failed to load Costco browser session: {prompt_exc}"
|
||||||
|
) from prompt_exc
|
||||||
start_date, end_date = resolve_date_range(months_back)
|
start_date, end_date = resolve_date_range(months_back)
|
||||||
|
|
||||||
summary_payload, request_metadata = fetch_summary_windows(
|
summary_payload, request_metadata = fetch_summary_windows(
|
||||||
|
|||||||
@@ -2,9 +2,11 @@ import sqlite3
|
|||||||
import tempfile
|
import tempfile
|
||||||
import unittest
|
import unittest
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from unittest import mock
|
||||||
|
|
||||||
import browser_session
|
import browser_session
|
||||||
import retailer_sessions
|
import retailer_sessions
|
||||||
|
import scrape_costco
|
||||||
|
|
||||||
|
|
||||||
class BrowserSessionTests(unittest.TestCase):
|
class BrowserSessionTests(unittest.TestCase):
|
||||||
@@ -15,16 +17,12 @@ class BrowserSessionTests(unittest.TestCase):
|
|||||||
ls_dir.mkdir(parents=True)
|
ls_dir.mkdir(parents=True)
|
||||||
db_path = ls_dir / "data.sqlite"
|
db_path = ls_dir / "data.sqlite"
|
||||||
|
|
||||||
connection = sqlite3.connect(db_path)
|
with sqlite3.connect(db_path) as connection:
|
||||||
try:
|
|
||||||
connection.execute("CREATE TABLE data (key TEXT, value TEXT)")
|
connection.execute("CREATE TABLE data (key TEXT, value TEXT)")
|
||||||
connection.execute(
|
connection.execute(
|
||||||
"INSERT INTO data (key, value) VALUES (?, ?)",
|
"INSERT INTO data (key, value) VALUES (?, ?)",
|
||||||
("session", '{"costco":{"clientIdentifier":"481b1aec-aa3b-454b-b81b-48187e28f205"}}'),
|
("costco-x-wcs-clientId", "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf"),
|
||||||
)
|
)
|
||||||
connection.commit()
|
|
||||||
finally:
|
|
||||||
connection.close()
|
|
||||||
|
|
||||||
entries = browser_session.read_firefox_storage_entries(
|
entries = browser_session.read_firefox_storage_entries(
|
||||||
profile_dir,
|
profile_dir,
|
||||||
@@ -33,17 +31,51 @@ class BrowserSessionTests(unittest.TestCase):
|
|||||||
|
|
||||||
self.assertEqual(1, len(entries))
|
self.assertEqual(1, len(entries))
|
||||||
self.assertEqual("https://www.costco.com", entries[0].origin)
|
self.assertEqual("https://www.costco.com", entries[0].origin)
|
||||||
self.assertEqual("session", entries[0].key)
|
self.assertEqual("costco-x-wcs-clientId", entries[0].key)
|
||||||
|
|
||||||
def test_extract_costco_headers_from_storage_json(self):
|
def test_extract_costco_headers_uses_exact_keys(self):
|
||||||
entries = [
|
entries = [
|
||||||
browser_session.StorageEntry(
|
browser_session.StorageEntry(
|
||||||
origin="https://www.costco.com",
|
origin="https://www.costco.com",
|
||||||
key="authState",
|
key="costco-x-authorization",
|
||||||
|
value="Bearer header.payload.signature",
|
||||||
|
source="memory",
|
||||||
|
),
|
||||||
|
browser_session.StorageEntry(
|
||||||
|
origin="https://www.costco.com",
|
||||||
|
key="costco-x-wcs-clientId",
|
||||||
|
value="4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
|
||||||
|
source="memory",
|
||||||
|
),
|
||||||
|
browser_session.StorageEntry(
|
||||||
|
origin="https://www.costco.com",
|
||||||
|
key="client-identifier",
|
||||||
|
value="481b1aec-aa3b-454b-b81b-48187e28f205",
|
||||||
|
source="memory",
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
headers = retailer_sessions.extract_costco_headers(entries)
|
||||||
|
|
||||||
|
self.assertEqual("Bearer header.payload.signature", headers["costco-x-authorization"])
|
||||||
|
self.assertEqual(
|
||||||
|
"4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
|
||||||
|
headers["costco-x-wcs-clientId"],
|
||||||
|
)
|
||||||
|
self.assertEqual(
|
||||||
|
"481b1aec-aa3b-454b-b81b-48187e28f205",
|
||||||
|
headers["client-identifier"],
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_extract_costco_headers_uses_exact_json_header_blob(self):
|
||||||
|
entries = [
|
||||||
|
browser_session.StorageEntry(
|
||||||
|
origin="https://www.costco.com",
|
||||||
|
key="headers",
|
||||||
value=(
|
value=(
|
||||||
'{"authorization":"Bearer header.payload.signature",'
|
'{"costco-x-authorization":"Bearer header.payload.signature",'
|
||||||
'"wcsClientId":"4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",'
|
'"costco-x-wcs-clientId":"4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",'
|
||||||
'"clientIdentifier":"481b1aec-aa3b-454b-b81b-48187e28f205"}'
|
'"client-identifier":"481b1aec-aa3b-454b-b81b-48187e28f205"}'
|
||||||
),
|
),
|
||||||
source="memory",
|
source="memory",
|
||||||
)
|
)
|
||||||
@@ -61,6 +93,40 @@ class BrowserSessionTests(unittest.TestCase):
|
|||||||
headers["client-identifier"],
|
headers["client-identifier"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_scrape_costco_prompts_for_profile_dir_when_autodiscovery_fails(self):
|
||||||
|
with mock.patch.object(
|
||||||
|
scrape_costco,
|
||||||
|
"build_session",
|
||||||
|
side_effect=[FileNotFoundError("no default profile"), object()],
|
||||||
|
), mock.patch.object(
|
||||||
|
scrape_costco.click,
|
||||||
|
"prompt",
|
||||||
|
return_value=Path("/tmp/profile"),
|
||||||
|
) as mocked_prompt, mock.patch.object(
|
||||||
|
scrape_costco,
|
||||||
|
"fetch_summary_windows",
|
||||||
|
return_value=(
|
||||||
|
{"data": {"receiptsWithCounts": {"receipts": []}}},
|
||||||
|
[],
|
||||||
|
),
|
||||||
|
), mock.patch.object(
|
||||||
|
scrape_costco,
|
||||||
|
"write_json",
|
||||||
|
), mock.patch.object(
|
||||||
|
scrape_costco,
|
||||||
|
"write_csv",
|
||||||
|
):
|
||||||
|
scrape_costco.main.callback(
|
||||||
|
outdir="/tmp/costco_output",
|
||||||
|
document_type="all",
|
||||||
|
document_sub_type="all",
|
||||||
|
window_days=92,
|
||||||
|
months_back=3,
|
||||||
|
firefox_profile_dir=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
mocked_prompt.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -427,6 +427,7 @@ class CostcoPipelineTests(unittest.TestCase):
|
|||||||
document_sub_type="all",
|
document_sub_type="all",
|
||||||
window_days=92,
|
window_days=92,
|
||||||
months_back=3,
|
months_back=3,
|
||||||
|
firefox_profile_dir=None,
|
||||||
)
|
)
|
||||||
|
|
||||||
metadata_path = outdir / "raw" / "summary_requests.json"
|
metadata_path = outdir / "raw" / "summary_requests.json"
|
||||||
|
|||||||
Reference in New Issue
Block a user