Added token and dotenv loading so the Costco scrape succeeds; default months-back raised to 36 mo
This commit is contained in:
File diff suppressed because one or more lines are too long
@@ -1,6 +1,9 @@
|
|||||||
|
import os
|
||||||
import csv
|
import csv
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
|
import re
|
||||||
|
from dotenv import load_dotenv
|
||||||
from calendar import monthrange
|
from calendar import monthrange
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -208,8 +211,16 @@ ITEM_FIELDS = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def build_headers():
|
def load_config():
|
||||||
|
load_dotenv()
|
||||||
return {
|
return {
|
||||||
|
"authorization": os.getenv("COSTCO_X_AUTHORIZATION", "").strip(),
|
||||||
|
"client_id": os.getenv("COSTCO_X_WCS_CLIENTID", "").strip(),
|
||||||
|
"client_identifier": os.getenv("COSTCO_CLIENT_IDENTIFIER", "").strip(),
|
||||||
|
}
|
||||||
|
|
||||||
|
def build_headers(config):
|
||||||
|
headers = {
|
||||||
"accept": "*/*",
|
"accept": "*/*",
|
||||||
"content-type": "application/json-patch+json",
|
"content-type": "application/json-patch+json",
|
||||||
"costco.service": "restOrders",
|
"costco.service": "restOrders",
|
||||||
@@ -221,12 +232,18 @@ def build_headers():
|
|||||||
"Gecko/20100101 Firefox/148.0"
|
"Gecko/20100101 Firefox/148.0"
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
|
if config["authorization"]:
|
||||||
|
headers["costco-x-authorization"] = config["authorization"]
|
||||||
|
if config["client_id"]:
|
||||||
|
headers["costco-x-wcs-clientId"] = config["client_id"]
|
||||||
|
if config["client_identifier"]:
|
||||||
|
headers["client-identifier"] = config["client_identifier"]
|
||||||
|
return headers
|
||||||
|
|
||||||
|
def build_session(config):
|
||||||
def build_session():
|
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
session.cookies.update(browser_cookie3.firefox(domain_name="costco.com"))
|
session.cookies.update(browser_cookie3.firefox(domain_name=".costco.com"))
|
||||||
session.headers.update(build_headers())
|
session.headers.update(build_headers(config))
|
||||||
return session
|
return session
|
||||||
|
|
||||||
|
|
||||||
@@ -254,6 +271,8 @@ def graphql_post(session, query, variables):
|
|||||||
|
|
||||||
raise RuntimeError("failed to fetch Costco GraphQL payload")
|
raise RuntimeError("failed to fetch Costco GraphQL payload")
|
||||||
|
|
||||||
|
# Characters that cannot appear in a filename on common filesystems: the
# Windows-reserved punctuation plus ASCII control characters (NUL is invalid
# on POSIX as well). Compiled once because the helper is called per receipt.
_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]+')


def safe_filename(value):
    """Return ``str(value)`` with filesystem-unsafe characters replaced.

    Each run of one or more unsafe characters (``< > : " / \\ | ? *`` or an
    ASCII control character) collapses into a single ``-``. Every other
    character is kept unchanged.

    Args:
        value: Any object; it is converted with ``str()`` before sanitizing.

    Returns:
        The sanitized string. An empty input yields an empty string.
    """
    return _UNSAFE_FILENAME_CHARS.sub("-", str(value))
|
||||||
|
|
||||||
def summary_receipts(payload):
|
def summary_receipts(payload):
|
||||||
return payload.get("data", {}).get("receiptsWithCounts", {}).get("receipts", [])
|
return payload.get("data", {}).get("receiptsWithCounts", {}).get("receipts", [])
|
||||||
@@ -423,7 +442,7 @@ def flatten_costco_data(summary_payload, detail_payloads, raw_dir):
|
|||||||
for row in summary_row.get("couponArray", []) or []
|
for row in summary_row.get("couponArray", []) or []
|
||||||
if row.get("upcnumberCoupon")
|
if row.get("upcnumberCoupon")
|
||||||
}
|
}
|
||||||
raw_order_path = raw_dir / f"{receipt_id or order_id}.json"
|
raw_order_path = raw_dir / f"{safe_filename(receipt_id or order_id)}.json"
|
||||||
|
|
||||||
orders.append(
|
orders.append(
|
||||||
{
|
{
|
||||||
@@ -568,16 +587,23 @@ def write_csv(path, rows, fieldnames):
|
|||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
"--months-back",
|
"--months-back",
|
||||||
default=3,
|
default=36,
|
||||||
show_default=True,
|
show_default=True,
|
||||||
type=int,
|
type=int,
|
||||||
help="How many months of receipts to enumerate back from today.",
|
help="How many months of receipts to enumerate back from today.",
|
||||||
)
|
)
|
||||||
def main(outdir, document_type, document_sub_type, window_days, months_back):
|
def main(outdir, document_type, document_sub_type, window_days, months_back):
|
||||||
outdir = Path(outdir)
|
outdir = Path(outdir)
|
||||||
raw_dir = outdir / "raw"
|
raw_dir = outdir / "raw"
|
||||||
try:
|
try:
|
||||||
session = build_session()
|
config = load_config()
|
||||||
|
click.echo(
|
||||||
|
"auth headers present: "
|
||||||
|
f"authorization={bool(config['authorization'])}, "
|
||||||
|
f"client_id={bool(config['client_id'])}, "
|
||||||
|
f"client_identifier={bool(config['client_identifier'])}"
|
||||||
|
)
|
||||||
|
session = build_session(config)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
raise click.ClickException(
|
raise click.ClickException(
|
||||||
f"failed to load Costco Firefox cookies: {exc}"
|
f"failed to load Costco Firefox cookies: {exc}"
|
||||||
@@ -607,7 +633,7 @@ def main(outdir, document_type, document_sub_type, window_days, months_back):
|
|||||||
{"barcode": barcode, "documentType": "warehouse"},
|
{"barcode": barcode, "documentType": "warehouse"},
|
||||||
)
|
)
|
||||||
detail_payloads.append(detail_payload)
|
detail_payloads.append(detail_payload)
|
||||||
write_json(raw_dir / f"{receipt_id}.json", detail_payload)
|
write_json(raw_dir / f"{safe_filename(receipt_id)}.json", detail_payload)
|
||||||
|
|
||||||
orders, items = flatten_costco_data(summary_payload, detail_payloads, raw_dir)
|
orders, items = flatten_costco_data(summary_payload, detail_payloads, raw_dir)
|
||||||
write_csv(outdir / "orders.csv", orders, ORDER_FIELDS)
|
write_csv(outdir / "orders.csv", orders, ORDER_FIELDS)
|
||||||
@@ -617,3 +643,4 @@ def main(outdir, document_type, document_sub_type, window_days, months_back):
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user