Compare commits

..

2 Commits

Author SHA1 Message Date
ben
ec1f36a140 Record t1.14 task evidence 2026-03-18 15:18:54 -04:00
ben
48c6eaf753 Refactor retailer collection entrypoints 2026-03-18 15:18:47 -04:00
6 changed files with 180 additions and 12 deletions

65
collect_costco_web.py Normal file
View File

@@ -0,0 +1,65 @@
import click
import scrape_costco
# CLI entrypoint for Costco web collection in the new data model.
# Thin wrapper: every option is forwarded verbatim to
# scrape_costco.run_collection; only the output filenames are pinned
# here so this entrypoint emits the "collected_*" schema files.
# NOTE: intentionally no docstring on main() — click would render it
# into the --help output, and the command help must stay unchanged.
@click.command()
@click.option(
    "--outdir",
    help="Directory for Costco raw and collected outputs.",
    default="data/costco-web",
    show_default=True,
)
@click.option(
    "--document-type",
    help="Summary document type.",
    default="all",
    show_default=True,
)
@click.option(
    "--document-sub-type",
    help="Summary document sub type.",
    default="all",
    show_default=True,
)
@click.option(
    "--window-days",
    help="Maximum number of days to request per summary window.",
    type=int,
    default=92,
    show_default=True,
)
@click.option(
    "--months-back",
    help="How many months of receipts to enumerate back from today.",
    type=int,
    default=36,
    show_default=True,
)
@click.option(
    "--firefox-profile-dir",
    help="Firefox profile directory to use for cookies and session storage.",
    default=None,
)
def main(
    outdir,
    document_type,
    document_sub_type,
    window_days,
    months_back,
    firefox_profile_dir,
):
    # Collect CLI options into a dict so the pass-through is explicit
    # and easy to diff against run_collection's signature.
    forwarded = {
        "outdir": outdir,
        "document_type": document_type,
        "document_sub_type": document_sub_type,
        "window_days": window_days,
        "months_back": months_back,
        "firefox_profile_dir": firefox_profile_dir,
    }
    scrape_costco.run_collection(
        orders_filename="collected_orders.csv",
        items_filename="collected_items.csv",
        **forwarded,
    )


if __name__ == "__main__":
    main()

34
collect_giant_web.py Normal file
View File

@@ -0,0 +1,34 @@
import click
import scrape_giant
# CLI entrypoint for Giant web collection in the new data model.
# Delegates to scrape_giant.run_collection, pinning the "collected_*"
# output filenames that the new collect structure expects.
# NOTE: no docstring on main() on purpose — click would surface it as
# --help text and the command help must stay byte-identical.
@click.command()
@click.option("--user-id", help="Giant user id.", default=None)
@click.option("--loyalty", help="Giant loyalty number.", default=None)
@click.option(
    "--outdir",
    help="Directory for raw json and collected csv outputs.",
    default="data/giant-web",
    show_default=True,
)
@click.option(
    "--sleep-seconds",
    help="Delay between order detail requests.",
    type=float,
    default=1.5,
    show_default=True,
)
def main(user_id, loyalty, outdir, sleep_seconds):
    # Pass everything by keyword (names match run_collection's
    # signature) so the mapping from CLI option to parameter is explicit.
    scrape_giant.run_collection(
        user_id=user_id,
        loyalty=loyalty,
        outdir=outdir,
        sleep_seconds=sleep_seconds,
        orders_filename="collected_orders.csv",
        items_filename="collected_items.csv",
    )


if __name__ == "__main__":
    main()

View File

@@ -472,7 +472,7 @@ refactor canonical generation so product identity is cleaner, duplicate canonica
** notes ** notes
- Removed weak exact-name auto-canonical creation so ambiguous products stay in review instead of generating junk canonicals. - Removed weak exact-name auto-canonical creation so ambiguous products stay in review instead of generating junk canonicals.
- Canonical display names are now cleaned of obvious punctuation and packaging noise, but I kept the cleanup conservative rather than adding a broad fuzzy merge layer. - Canonical display names are now cleaned of obvious punctuation and packaging noise, but I kept the cleanup conservative rather than adding a broad fuzzy merge layer.
* [ ] t1.14: refactor retailer collection into the new data model (2-4 commits) * [X] t1.14: refactor retailer collection into the new data model (2-4 commits)
move Giant and Costco collection into the new collect structure and make both retailers emit the same collected schemas move Giant and Costco collection into the new collect structure and make both retailers emit the same collected schemas
** Acceptance Criteria ** Acceptance Criteria
@@ -493,11 +493,14 @@ move Giant and Costco collection into the new collect structure and make both re
- pm note: this is a path/schema refactor, not a parsing rewrite - pm note: this is a path/schema refactor, not a parsing rewrite
** evidence ** evidence
- commit: - commit: `48c6eaf`
- tests: - tests: `./venv/bin/python -m unittest tests.test_scraper tests.test_costco_pipeline tests.test_browser_session`; `./venv/bin/python collect_giant_web.py --help`; `./venv/bin/python collect_costco_web.py --help`; `./venv/bin/python scrape_giant.py --help`; `./venv/bin/python scrape_costco.py --help`
- datetime: - datetime: 2026-03-18
** notes ** notes
- Kept this as a path/schema move, not a parsing rewrite: the existing Giant and Costco collection behavior remains in place behind new `collect_*` entry points.
- Added lightweight deprecation nudges on the legacy `scrape_*` commands rather than removing them immediately, so the move is inspectable and low-risk.
- The main schema fix was on Giant collection, which was missing retailer/provenance/audit fields that Costco collection already carried.
* [ ] t1.14.1: refactor retailer normalization into the new normalized_items schema (3-5 commits) * [ ] t1.14.1: refactor retailer normalization into the new normalized_items schema (3-5 commits)
make Giant and Costco emit the shared normalized line-item schema without introducing cross-retailer identity logic make Giant and Costco emit the shared normalized line-item schema without introducing cross-retailer identity logic

View File

@@ -648,6 +648,27 @@ def main(
window_days, window_days,
months_back, months_back,
firefox_profile_dir, firefox_profile_dir,
):
click.echo("legacy entrypoint: prefer collect_costco_web.py for data-model outputs")
run_collection(
outdir=outdir,
document_type=document_type,
document_sub_type=document_sub_type,
window_days=window_days,
months_back=months_back,
firefox_profile_dir=firefox_profile_dir,
)
def run_collection(
outdir,
document_type,
document_sub_type,
window_days,
months_back,
firefox_profile_dir,
orders_filename="orders.csv",
items_filename="items.csv",
): ):
outdir = Path(outdir) outdir = Path(outdir)
raw_dir = outdir / "raw" raw_dir = outdir / "raw"
@@ -706,8 +727,8 @@ def main(
write_json(raw_dir / f"{safe_filename(receipt_id)}.json", detail_payload) write_json(raw_dir / f"{safe_filename(receipt_id)}.json", detail_payload)
orders, items = flatten_costco_data(summary_payload, detail_payloads, raw_dir) orders, items = flatten_costco_data(summary_payload, detail_payloads, raw_dir)
write_csv(outdir / "orders.csv", orders, ORDER_FIELDS) write_csv(outdir / orders_filename, orders, ORDER_FIELDS)
write_csv(outdir / "items.csv", items, ITEM_FIELDS) write_csv(outdir / items_filename, items, ITEM_FIELDS)
click.echo(f"wrote {len(orders)} orders and {len(items)} item rows to {outdir}") click.echo(f"wrote {len(orders)} orders and {len(items)} item rows to {outdir}")

View File

@@ -13,8 +13,10 @@ from browser_session import find_firefox_profile_dir, load_firefox_cookies
BASE = "https://giantfood.com" BASE = "https://giantfood.com"
ACCOUNT_PAGE = f"{BASE}/account/history/invoice/in-store" ACCOUNT_PAGE = f"{BASE}/account/history/invoice/in-store"
RETAILER = "giant"
ORDER_FIELDS = [ ORDER_FIELDS = [
"retailer",
"order_id", "order_id",
"order_date", "order_date",
"delivery_date", "delivery_date",
@@ -33,12 +35,16 @@ ORDER_FIELDS = [
"store_zipcode", "store_zipcode",
"refund_order", "refund_order",
"ebt_order", "ebt_order",
"raw_history_path",
"raw_order_path",
] ]
ITEM_FIELDS = [ ITEM_FIELDS = [
"retailer",
"order_id", "order_id",
"order_date", "order_date",
"line_no", "line_no",
"retailer_item_id",
"pod_id", "pod_id",
"item_name", "item_name",
"upc", "upc",
@@ -53,6 +59,10 @@ ITEM_FIELDS = [
"reward_savings", "reward_savings",
"coupon_savings", "coupon_savings",
"coupon_price", "coupon_price",
"image_url",
"raw_order_path",
"is_discount_line",
"is_coupon_line",
] ]
@@ -130,18 +140,21 @@ def get_order_detail(session, user_id, order_id):
return response.json() return response.json()
def flatten_orders(history, details): def flatten_orders(history, details, history_path=None, raw_dir=None):
orders = [] orders = []
items = [] items = []
history_lookup = {record["orderId"]: record for record in history.get("records", [])} history_lookup = {record["orderId"]: record for record in history.get("records", [])}
history_path_value = history_path.as_posix() if history_path else ""
for detail in details: for detail in details:
order_id = str(detail["orderId"]) order_id = str(detail["orderId"])
history_row = history_lookup.get(detail["orderId"], {}) history_row = history_lookup.get(detail["orderId"], {})
pickup = detail.get("pup", {}) pickup = detail.get("pup", {})
raw_order_path = (raw_dir / f"{order_id}.json").as_posix() if raw_dir else ""
orders.append( orders.append(
{ {
"retailer": RETAILER,
"order_id": order_id, "order_id": order_id,
"order_date": detail.get("orderDate"), "order_date": detail.get("orderDate"),
"delivery_date": detail.get("deliveryDate"), "delivery_date": detail.get("deliveryDate"),
@@ -160,15 +173,19 @@ def flatten_orders(history, details):
"store_zipcode": pickup.get("storeZipcode"), "store_zipcode": pickup.get("storeZipcode"),
"refund_order": detail.get("refundOrder"), "refund_order": detail.get("refundOrder"),
"ebt_order": detail.get("ebtOrder"), "ebt_order": detail.get("ebtOrder"),
"raw_history_path": history_path_value,
"raw_order_path": raw_order_path,
} }
) )
for line_no, item in enumerate(detail.get("items", []), start=1): for line_no, item in enumerate(detail.get("items", []), start=1):
items.append( items.append(
{ {
"retailer": RETAILER,
"order_id": order_id, "order_id": order_id,
"order_date": detail.get("orderDate"), "order_date": detail.get("orderDate"),
"line_no": str(line_no), "line_no": str(line_no),
"retailer_item_id": "",
"pod_id": item.get("podId"), "pod_id": item.get("podId"),
"item_name": item.get("itemName"), "item_name": item.get("itemName"),
"upc": item.get("primUpcCd"), "upc": item.get("primUpcCd"),
@@ -183,6 +200,10 @@ def flatten_orders(history, details):
"reward_savings": item.get("rewardSavings"), "reward_savings": item.get("rewardSavings"),
"coupon_savings": item.get("couponSavings"), "coupon_savings": item.get("couponSavings"),
"coupon_price": item.get("couponPrice"), "coupon_price": item.get("couponPrice"),
"image_url": "",
"raw_order_path": raw_order_path,
"is_discount_line": "false",
"is_coupon_line": "false",
} }
) )
@@ -269,6 +290,18 @@ def write_json(path, payload):
help="Delay between order detail requests.", help="Delay between order detail requests.",
) )
def main(user_id, loyalty, outdir, sleep_seconds): def main(user_id, loyalty, outdir, sleep_seconds):
click.echo("legacy entrypoint: prefer collect_giant_web.py for data-model outputs")
run_collection(user_id, loyalty, outdir, sleep_seconds)
def run_collection(
user_id,
loyalty,
outdir,
sleep_seconds,
orders_filename="orders.csv",
items_filename="items.csv",
):
config = load_config() config = load_config()
user_id = user_id or config["user_id"] or click.prompt("Giant user id", type=str) user_id = user_id or config["user_id"] or click.prompt("Giant user id", type=str)
loyalty = loyalty or config["loyalty"] or click.prompt( loyalty = loyalty or config["loyalty"] or click.prompt(
@@ -279,13 +312,14 @@ def main(user_id, loyalty, outdir, sleep_seconds):
rawdir = outdir / "raw" rawdir = outdir / "raw"
rawdir.mkdir(parents=True, exist_ok=True) rawdir.mkdir(parents=True, exist_ok=True)
orders_csv = outdir / "orders.csv" orders_csv = outdir / orders_filename
items_csv = outdir / "items.csv" items_csv = outdir / items_filename
existing_order_ids = read_existing_order_ids(orders_csv) existing_order_ids = read_existing_order_ids(orders_csv)
session = build_session() session = build_session()
history = get_history(session, user_id, loyalty) history = get_history(session, user_id, loyalty)
write_json(rawdir / "history.json", history) history_path = rawdir / "history.json"
write_json(history_path, history)
records = history.get("records", []) records = history.get("records", [])
click.echo(f"history returned {len(records)} visits; Giant exposes only the most recent 50") click.echo(f"history returned {len(records)} visits; Giant exposes only the most recent 50")
@@ -310,7 +344,7 @@ def main(user_id, loyalty, outdir, sleep_seconds):
if index < len(unseen_records): if index < len(unseen_records):
time.sleep(sleep_seconds) time.sleep(sleep_seconds)
orders, items = flatten_orders(history, details) orders, items = flatten_orders(history, details, history_path=history_path, raw_dir=rawdir)
merged_orders = append_dedup( merged_orders = append_dedup(
orders_csv, orders_csv,
orders, orders,

View File

@@ -58,14 +58,25 @@ class ScraperTests(unittest.TestCase):
} }
] ]
orders, items = scraper.flatten_orders(history, details) orders, items = scraper.flatten_orders(
history,
details,
history_path=Path("data/giant-web/raw/history.json"),
raw_dir=Path("data/giant-web/raw"),
)
self.assertEqual(1, len(orders)) self.assertEqual(1, len(orders))
self.assertEqual("abc123", orders[0]["order_id"]) self.assertEqual("abc123", orders[0]["order_id"])
self.assertEqual("giant", orders[0]["retailer"])
self.assertEqual("PICKUP", orders[0]["service_type"]) self.assertEqual("PICKUP", orders[0]["service_type"])
self.assertEqual("data/giant-web/raw/history.json", orders[0]["raw_history_path"])
self.assertEqual("data/giant-web/raw/abc123.json", orders[0]["raw_order_path"])
self.assertEqual(1, len(items)) self.assertEqual(1, len(items))
self.assertEqual("1", items[0]["line_no"]) self.assertEqual("1", items[0]["line_no"])
self.assertEqual("Bananas", items[0]["item_name"]) self.assertEqual("Bananas", items[0]["item_name"])
self.assertEqual("giant", items[0]["retailer"])
self.assertEqual("data/giant-web/raw/abc123.json", items[0]["raw_order_path"])
self.assertEqual("false", items[0]["is_discount_line"])
def test_append_dedup_replaces_duplicate_rows_and_preserves_new_values(self): def test_append_dedup_replaces_duplicate_rows_and_preserves_new_values(self):
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir: