Align Costco scraper with browser session flow
This commit is contained in:
129
scrape_costco.py
129
scrape_costco.py
@@ -1,12 +1,13 @@
|
|||||||
import csv
|
import csv
|
||||||
import json
|
import json
|
||||||
import os
|
import time
|
||||||
|
from calendar import monthrange
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import click
|
import click
|
||||||
from dotenv import load_dotenv
|
import browser_cookie3
|
||||||
|
from curl_cffi import requests
|
||||||
|
|
||||||
BASE_URL = "https://ecom-api.costco.com/ebusiness/order/v1/orders/graphql"
|
BASE_URL = "https://ecom-api.costco.com/ebusiness/order/v1/orders/graphql"
|
||||||
RETAILER = "costco"
|
RETAILER = "costco"
|
||||||
@@ -207,24 +208,12 @@ ITEM_FIELDS = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def load_config():
|
def build_headers():
|
||||||
load_dotenv()
|
|
||||||
return {
|
|
||||||
"authorization": os.getenv("COSTCO_X_AUTHORIZATION", "").strip(),
|
|
||||||
"client_id": os.getenv("COSTCO_WCS_CLIENT_ID", "").strip(),
|
|
||||||
"client_identifier": os.getenv("COSTCO_CLIENT_IDENTIFIER", "").strip(),
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def build_headers(config):
|
|
||||||
return {
|
return {
|
||||||
"accept": "*/*",
|
"accept": "*/*",
|
||||||
"content-type": "application/json-patch+json",
|
"content-type": "application/json-patch+json",
|
||||||
"costco.service": "restOrders",
|
"costco.service": "restOrders",
|
||||||
"costco.env": "ecom",
|
"costco.env": "ecom",
|
||||||
"costco-x-authorization": config["authorization"],
|
|
||||||
"costco-x-wcs-clientId": config["client_id"],
|
|
||||||
"client-identifier": config["client_identifier"],
|
|
||||||
"origin": "https://www.costco.com",
|
"origin": "https://www.costco.com",
|
||||||
"referer": "https://www.costco.com/",
|
"referer": "https://www.costco.com/",
|
||||||
"user-agent": (
|
"user-agent": (
|
||||||
@@ -234,23 +223,36 @@ def build_headers(config):
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def build_session(config):
|
def build_session():
|
||||||
from curl_cffi import requests
|
|
||||||
|
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
session.headers.update(build_headers(config))
|
session.cookies.update(browser_cookie3.firefox(domain_name="costco.com"))
|
||||||
|
session.headers.update(build_headers())
|
||||||
return session
|
return session
|
||||||
|
|
||||||
|
|
||||||
def graphql_post(session, query, variables):
    """POST a GraphQL query to ``BASE_URL``, retrying up to three times.

    Args:
        session: HTTP session object exposing ``post``; it is called with
            the ``json``, ``impersonate`` and ``timeout`` keywords.
        query: GraphQL query text.
        variables: Mapping sent as the GraphQL ``variables``.

    Returns:
        The decoded JSON body of the first response whose status is 200.

    Raises:
        RuntimeError: If no attempt succeeded and the last response received
            (if any) did not raise from ``raise_for_status()``. The most
            recent exception caught while retrying is attached as the cause.
        Exception: Whatever ``raise_for_status()`` raises for the last
            response received when that response carries an error status.
    """
    max_attempts = 3
    retry_delay_seconds = 3
    last_response = None
    last_error = None

    for attempt in range(max_attempts):
        try:
            response = session.post(
                BASE_URL,
                json={"query": query, "variables": variables},
                impersonate="firefox",
                timeout=30,
            )
            last_response = response
            if response.status_code == 200:
                return response.json()
            click.echo(
                f"retry {attempt + 1}/{max_attempts} "
                f"status={response.status_code}"
            )
        except Exception as exc:  # pragma: no cover - network error path
            # Remember the failure so the final error can carry it as cause.
            last_error = exc
            click.echo(f"retry {attempt + 1}/{max_attempts} error={exc}")

        # Only pause when another attempt follows; sleeping after the final
        # attempt would just delay the error reaching the caller.
        if attempt + 1 < max_attempts:
            time.sleep(retry_delay_seconds)

    # Surface the HTTP error of the last response we actually received.
    if last_response is not None:
        last_response.raise_for_status()

    raise RuntimeError("failed to fetch Costco GraphQL payload") from last_error
|
||||||
|
|
||||||
|
|
||||||
def summary_receipts(payload):
|
def summary_receipts(payload):
|
||||||
@@ -279,6 +281,25 @@ def format_cli_date(value):
|
|||||||
return f"{value.month}/{value.day:02d}/{value.year}"
|
return f"{value.month}/{value.day:02d}/{value.year}"
|
||||||
|
|
||||||
|
|
||||||
|
def subtract_months(value, months):
    """Return ``value`` moved back by ``months`` calendar months.

    The day of month is clamped to the last day of the target month, so
    going back one month from March 31 lands on the last day of February.

    Args:
        value: A ``date`` or ``datetime`` (anything exposing ``year``,
            ``month``, ``day`` and ``replace``).
        months: Whole number of months to go back. Zero returns an equal
            value; a negative number moves forward in time.

    Returns:
        A new object produced by ``value.replace(...)`` with the adjusted
        year, month and day.
    """
    # Count months from year 0 with a zero-based month so divmod handles the
    # year rollover in both directions. The previous loop only corrected
    # month <= 0, so a negative ``months`` left month > 12 and failed inside
    # monthrange().
    total_months = value.year * 12 + (value.month - 1) - months
    year, month_index = divmod(total_months, 12)
    month = month_index + 1
    day = min(value.day, monthrange(year, month)[1])
    return value.replace(year=year, month=month, day=day)
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_date_range(months_back, today=None):
    """Compute the CLI-formatted start and end dates for a look-back window.

    Args:
        months_back: Number of months to look back; must be at least 1.
        today: Optional end date to use in place of the current date.

    Returns:
        A ``(start, end)`` tuple of strings produced by ``format_cli_date``.

    Raises:
        click.ClickException: If ``months_back`` is less than 1.
    """
    if months_back < 1:
        raise click.ClickException("months-back must be at least 1")

    # Fall back to the current local date when no explicit end is supplied.
    range_end = today if today else datetime.now().date()
    range_start = subtract_months(range_end, months_back)
    return tuple(format_cli_date(day) for day in (range_start, range_end))
|
||||||
|
|
||||||
|
|
||||||
def build_date_windows(start_date, end_date, window_days):
|
def build_date_windows(start_date, end_date, window_days):
|
||||||
start = parse_cli_date(start_date)
|
start = parse_cli_date(start_date)
|
||||||
end = parse_cli_date(end_date)
|
end = parse_cli_date(end_date)
|
||||||
@@ -304,12 +325,20 @@ def build_date_windows(start_date, end_date, window_days):
|
|||||||
def unique_receipts(receipts):
    """Drop duplicate receipts, identifying each one by ``receipt_key``.

    Receipts whose key is empty are discarded. When several receipts share a
    key, the last one seen is kept, in the position where that key first
    appeared.

    Args:
        receipts: Iterable of receipt mappings.

    Returns:
        A list with one receipt per distinct non-empty key.
    """
    keyed = ((receipt_key(entry), entry) for entry in receipts)
    # Later entries overwrite earlier ones with the same key.
    deduped = {key: entry for key, entry in keyed if key}
    return list(deduped.values())
|
||||||
|
|
||||||
|
|
||||||
|
def receipt_key(receipt):
    """Build the composite identity string for a receipt.

    The key joins the transaction barcode and the transaction timestamp with
    ``::`` so that receipts sharing a barcode stay distinguishable.

    Args:
        receipt: Mapping that may contain ``transactionBarcode`` and
            ``transactionDateTime``.

    Returns:
        ``"<barcode>::<transactionDateTime>"``, or ``""`` when the receipt
        has no barcode. A missing or null timestamp contributes an empty
        string.
    """
    # ``get(key, "")`` still yields None when the key is present with a null
    # value, which would leak the literal text "None" into the key; ``or ""``
    # treats null the same as a missing field.
    barcode = receipt.get("transactionBarcode") or ""
    if not barcode:
        return ""
    transaction_date_time = receipt.get("transactionDateTime") or ""
    return f"{barcode}::{transaction_date_time}"
|
||||||
|
|
||||||
|
|
||||||
def fetch_summary_windows(
|
def fetch_summary_windows(
|
||||||
session,
|
session,
|
||||||
start_date,
|
start_date,
|
||||||
@@ -377,8 +406,9 @@ def fetch_summary_windows(
|
|||||||
|
|
||||||
def flatten_costco_data(summary_payload, detail_payloads, raw_dir):
|
def flatten_costco_data(summary_payload, detail_payloads, raw_dir):
|
||||||
summary_lookup = {
|
summary_lookup = {
|
||||||
receipt["transactionBarcode"]: receipt
|
receipt_key(receipt): receipt
|
||||||
for receipt in summary_receipts(summary_payload)
|
for receipt in summary_receipts(summary_payload)
|
||||||
|
if receipt_key(receipt)
|
||||||
}
|
}
|
||||||
orders = []
|
orders = []
|
||||||
items = []
|
items = []
|
||||||
@@ -386,13 +416,14 @@ def flatten_costco_data(summary_payload, detail_payloads, raw_dir):
|
|||||||
for detail_payload in detail_payloads:
|
for detail_payload in detail_payloads:
|
||||||
for receipt in detail_receipts(detail_payload):
|
for receipt in detail_receipts(detail_payload):
|
||||||
order_id = receipt["transactionBarcode"]
|
order_id = receipt["transactionBarcode"]
|
||||||
summary_row = summary_lookup.get(order_id, {})
|
receipt_id = receipt_key(receipt)
|
||||||
|
summary_row = summary_lookup.get(receipt_id, {})
|
||||||
coupon_numbers = {
|
coupon_numbers = {
|
||||||
row.get("upcnumberCoupon", "")
|
row.get("upcnumberCoupon", "")
|
||||||
for row in summary_row.get("couponArray", []) or []
|
for row in summary_row.get("couponArray", []) or []
|
||||||
if row.get("upcnumberCoupon")
|
if row.get("upcnumberCoupon")
|
||||||
}
|
}
|
||||||
raw_order_path = raw_dir / f"{order_id}.json"
|
raw_order_path = raw_dir / f"{receipt_id or order_id}.json"
|
||||||
|
|
||||||
orders.append(
|
orders.append(
|
||||||
{
|
{
|
||||||
@@ -510,8 +541,6 @@ def write_csv(path, rows, fieldnames):
|
|||||||
|
|
||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.option("--start-date", required=True, help="Start date like 1/01/2026.")
|
|
||||||
@click.option("--end-date", required=True, help="End date like 3/31/2026.")
|
|
||||||
@click.option(
|
@click.option(
|
||||||
"--outdir",
|
"--outdir",
|
||||||
default="costco_output",
|
default="costco_output",
|
||||||
@@ -537,18 +566,23 @@ def write_csv(path, rows, fieldnames):
|
|||||||
type=int,
|
type=int,
|
||||||
help="Maximum number of days to request per summary window.",
|
help="Maximum number of days to request per summary window.",
|
||||||
)
|
)
|
||||||
def main(start_date, end_date, outdir, document_type, document_sub_type, window_days):
|
@click.option(
|
||||||
config = load_config()
|
"--months-back",
|
||||||
required = ["authorization", "client_id", "client_identifier"]
|
default=3,
|
||||||
missing = [key for key in required if not config[key]]
|
show_default=True,
|
||||||
if missing:
|
type=int,
|
||||||
raise click.ClickException(
|
help="How many months of receipts to enumerate back from today.",
|
||||||
f"missing Costco auth config: {', '.join(missing)}"
|
)
|
||||||
)
|
def main(outdir, document_type, document_sub_type, window_days, months_back):
|
||||||
|
|
||||||
outdir = Path(outdir)
|
outdir = Path(outdir)
|
||||||
raw_dir = outdir / "raw"
|
raw_dir = outdir / "raw"
|
||||||
session = build_session(config)
|
try:
|
||||||
|
session = build_session()
|
||||||
|
except Exception as exc:
|
||||||
|
raise click.ClickException(
|
||||||
|
f"failed to load Costco Firefox cookies: {exc}"
|
||||||
|
) from exc
|
||||||
|
start_date, end_date = resolve_date_range(months_back)
|
||||||
|
|
||||||
summary_payload, request_metadata = fetch_summary_windows(
|
summary_payload, request_metadata = fetch_summary_windows(
|
||||||
session,
|
session,
|
||||||
@@ -565,6 +599,7 @@ def main(start_date, end_date, outdir, document_type, document_sub_type, window_
|
|||||||
detail_payloads = []
|
detail_payloads = []
|
||||||
for receipt in receipts:
|
for receipt in receipts:
|
||||||
barcode = receipt["transactionBarcode"]
|
barcode = receipt["transactionBarcode"]
|
||||||
|
receipt_id = receipt_key(receipt) or barcode
|
||||||
click.echo(f"fetching {barcode}")
|
click.echo(f"fetching {barcode}")
|
||||||
detail_payload = graphql_post(
|
detail_payload = graphql_post(
|
||||||
session,
|
session,
|
||||||
@@ -572,7 +607,7 @@ def main(start_date, end_date, outdir, document_type, document_sub_type, window_
|
|||||||
{"barcode": barcode, "documentType": "warehouse"},
|
{"barcode": barcode, "documentType": "warehouse"},
|
||||||
)
|
)
|
||||||
detail_payloads.append(detail_payload)
|
detail_payloads.append(detail_payload)
|
||||||
write_json(raw_dir / f"{barcode}.json", detail_payload)
|
write_json(raw_dir / f"{receipt_id}.json", detail_payload)
|
||||||
|
|
||||||
orders, items = flatten_costco_data(summary_payload, detail_payloads, raw_dir)
|
orders, items = flatten_costco_data(summary_payload, detail_payloads, raw_dir)
|
||||||
write_csv(outdir / "orders.csv", orders, ORDER_FIELDS)
|
write_csv(outdir / "orders.csv", orders, ORDER_FIELDS)
|
||||||
|
|||||||
@@ -11,6 +11,14 @@ import validate_cross_retailer_flow
|
|||||||
|
|
||||||
|
|
||||||
class CostcoPipelineTests(unittest.TestCase):
|
class CostcoPipelineTests(unittest.TestCase):
|
||||||
|
def test_resolve_date_range_uses_months_back(self):
|
||||||
|
start_date, end_date = scrape_costco.resolve_date_range(
|
||||||
|
3, today=scrape_costco.parse_cli_date("3/16/2026")
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual("12/16/2025", start_date)
|
||||||
|
self.assertEqual("3/16/2026", end_date)
|
||||||
|
|
||||||
def test_build_date_windows_splits_long_ranges(self):
|
def test_build_date_windows_splits_long_ranges(self):
|
||||||
windows = scrape_costco.build_date_windows("1/01/2026", "6/30/2026", 92)
|
windows = scrape_costco.build_date_windows("1/01/2026", "6/30/2026", 92)
|
||||||
|
|
||||||
@@ -160,6 +168,74 @@ class CostcoPipelineTests(unittest.TestCase):
|
|||||||
self.assertEqual("true", items[1]["is_discount_line"])
|
self.assertEqual("true", items[1]["is_discount_line"])
|
||||||
self.assertEqual("true", items[1]["is_coupon_line"])
|
self.assertEqual("true", items[1]["is_coupon_line"])
|
||||||
|
|
||||||
|
def test_flatten_costco_data_uses_composite_summary_lookup_key(self):
|
||||||
|
summary_payload = {
|
||||||
|
"data": {
|
||||||
|
"receiptsWithCounts": {
|
||||||
|
"receipts": [
|
||||||
|
{
|
||||||
|
"transactionBarcode": "dup",
|
||||||
|
"transactionDateTime": "2026-03-12T16:16:00",
|
||||||
|
"tenderArray": [{"tenderDescription": "VISA"}],
|
||||||
|
"couponArray": [{"upcnumberCoupon": "111"}],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"transactionBarcode": "dup",
|
||||||
|
"transactionDateTime": "2026-02-14T16:25:00",
|
||||||
|
"tenderArray": [{"tenderDescription": "MASTERCARD"}],
|
||||||
|
"couponArray": [],
|
||||||
|
},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
detail_payloads = [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"receiptsWithCounts": {
|
||||||
|
"receipts": [
|
||||||
|
{
|
||||||
|
"transactionBarcode": "dup",
|
||||||
|
"transactionDateTime": "2026-03-12T16:16:00",
|
||||||
|
"transactionDate": "2026-03-12",
|
||||||
|
"receiptType": "In-Warehouse",
|
||||||
|
"total": 10.0,
|
||||||
|
"totalItemCount": 1,
|
||||||
|
"instantSavings": 5.0,
|
||||||
|
"warehouseName": "MT VERNON",
|
||||||
|
"warehouseNumber": 1115,
|
||||||
|
"warehouseAddress1": "7940 RICHMOND HWY",
|
||||||
|
"warehouseCity": "ALEXANDRIA",
|
||||||
|
"warehouseState": "VA",
|
||||||
|
"warehousePostalCode": "22306",
|
||||||
|
"itemArray": [
|
||||||
|
{
|
||||||
|
"itemNumber": "111",
|
||||||
|
"itemDescription01": "/ 111",
|
||||||
|
"itemDescription02": None,
|
||||||
|
"itemDepartmentNumber": 14,
|
||||||
|
"transDepartmentNumber": 14,
|
||||||
|
"unit": -1,
|
||||||
|
"itemIdentifier": None,
|
||||||
|
"amount": -5,
|
||||||
|
"itemUnitPriceAmount": 0,
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
orders, items = scrape_costco.flatten_costco_data(
|
||||||
|
summary_payload, detail_payloads, Path("costco_output/raw")
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual("VISA", orders[0]["payment_method"])
|
||||||
|
self.assertEqual("true", items[0]["is_coupon_line"])
|
||||||
|
self.assertIn("dup::2026-03-12T16:16:00.json", items[0]["raw_order_path"])
|
||||||
|
|
||||||
def test_costco_enricher_parses_size_pack_and_discount(self):
|
def test_costco_enricher_parses_size_pack_and_discount(self):
|
||||||
row = enrich_costco.parse_costco_item(
|
row = enrich_costco.parse_costco_item(
|
||||||
order_id="abc",
|
order_id="abc",
|
||||||
@@ -335,13 +411,6 @@ class CostcoPipelineTests(unittest.TestCase):
|
|||||||
]
|
]
|
||||||
|
|
||||||
with mock.patch.object(
|
with mock.patch.object(
|
||||||
scrape_costco, "load_config",
|
|
||||||
return_value={
|
|
||||||
"authorization": "token",
|
|
||||||
"client_id": "client",
|
|
||||||
"client_identifier": "identifier",
|
|
||||||
},
|
|
||||||
), mock.patch.object(
|
|
||||||
scrape_costco, "build_session", return_value=object()
|
scrape_costco, "build_session", return_value=object()
|
||||||
), mock.patch.object(
|
), mock.patch.object(
|
||||||
scrape_costco,
|
scrape_costco,
|
||||||
@@ -353,12 +422,11 @@ class CostcoPipelineTests(unittest.TestCase):
|
|||||||
return_value=detail_payload,
|
return_value=detail_payload,
|
||||||
):
|
):
|
||||||
scrape_costco.main.callback(
|
scrape_costco.main.callback(
|
||||||
start_date="1/01/2026",
|
|
||||||
end_date="3/31/2026",
|
|
||||||
outdir=str(outdir),
|
outdir=str(outdir),
|
||||||
document_type="all",
|
document_type="all",
|
||||||
document_sub_type="all",
|
document_sub_type="all",
|
||||||
window_days=92,
|
window_days=92,
|
||||||
|
months_back=3,
|
||||||
)
|
)
|
||||||
|
|
||||||
metadata_path = outdir / "raw" / "summary_requests.json"
|
metadata_path = outdir / "raw" / "summary_requests.json"
|
||||||
|
|||||||
Reference in New Issue
Block a user