# Source: scrape-giant/tests/test_costco_pipeline.py
# Listing metadata: 2026-03-21 21:50:10 -04:00 · 579 lines · 24 KiB · Python

import csv
import json
import tempfile
import unittest
from pathlib import Path
from unittest import mock
import enrich_costco
import scrape_costco
import validate_cross_retailer_flow
class CostcoPipelineTests(unittest.TestCase):
    """Tests for the Costco scrape, enrich, and cross-retailer validation code.

    Each test hands hand-built payloads to ``scrape_costco``,
    ``enrich_costco`` or ``validate_cross_retailer_flow``; helpers such as
    ``graphql_post`` are patched with mocks wherever a test drives them.
    """

    # ------------------------------------------------------------------
    # Fixture builders. Their names do not start with "test", so the
    # unittest loader never collects them as test cases.
    # ------------------------------------------------------------------

    @staticmethod
    def _receipts_payload(receipts, **counts):
        """Wrap *receipts* in the ``data.receiptsWithCounts`` envelope.

        Keyword arguments become sibling keys placed ahead of ``receipts``,
        in the order they were passed.
        """
        envelope = dict(counts)
        envelope["receipts"] = receipts
        return {"data": {"receiptsWithCounts": envelope}}

    @staticmethod
    def _line_item(
        number, primary, secondary, *, department, unit, identifier, amount, unit_price
    ):
        """Build one ``itemArray`` entry.

        *department* is written to both ``itemDepartmentNumber`` and
        ``transDepartmentNumber``.
        """
        return {
            "itemNumber": number,
            "itemDescription01": primary,
            "itemDescription02": secondary,
            "itemDepartmentNumber": department,
            "transDepartmentNumber": department,
            "unit": unit,
            "itemIdentifier": identifier,
            "amount": amount,
            "itemUnitPriceAmount": unit_price,
        }

    @classmethod
    def _detergent_item(cls):
        """Return a fresh purchase line for item 4873222."""
        return cls._line_item(
            "4873222",
            "ALL F&C",
            "200OZ 160LOADS P104",
            department=14,
            unit=1,
            identifier="E",
            amount=19.99,
            unit_price=19.99,
        )

    @classmethod
    def _detergent_coupon(cls):
        """Return a fresh negative-amount line whose description is "/ 4873222"."""
        return cls._line_item(
            "374664",
            "/ 4873222",
            None,
            department=14,
            unit=-1,
            identifier=None,
            amount=-5,
            unit_price=0,
        )

    @staticmethod
    def _warehouse_receipt(barcode, items, *, item_count, instant_savings, date_time=None):
        """Build a detail receipt dated 2026-03-12 with the shared warehouse fields.

        ``transactionDateTime`` is only emitted when *date_time* is given, and
        then sits directly after ``transactionBarcode``.
        """
        receipt = {"transactionBarcode": barcode}
        if date_time is not None:
            receipt["transactionDateTime"] = date_time
        receipt.update(
            {
                "transactionDate": "2026-03-12",
                "receiptType": "In-Warehouse",
                "total": 10.0,
                "totalItemCount": item_count,
                "instantSavings": instant_savings,
                "warehouseName": "MT VERNON",
                "warehouseNumber": 1115,
                "warehouseAddress1": "7940 RICHMOND HWY",
                "warehouseCity": "ALEXANDRIA",
                "warehouseState": "VA",
                "warehousePostalCode": "22306",
                "itemArray": items,
            }
        )
        return receipt

    @staticmethod
    def _parse_line(line_no, item):
        """Run ``enrich_costco.parse_costco_item`` for order "abc" on 2026-03-12."""
        return enrich_costco.parse_costco_item(
            order_id="abc",
            order_date="2026-03-12",
            raw_path=Path("costco_output/raw/abc.json"),
            line_no=line_no,
            item=item,
        )

    @staticmethod
    def _write_single_row_csv(path, columns, row):
        """Write a header plus exactly one data row to *path*."""
        with path.open("w", newline="", encoding="utf-8") as handle:
            writer = csv.DictWriter(handle, fieldnames=columns)
            writer.writeheader()
            writer.writerow(row)

    # ------------------------------------------------------------------
    # scrape_costco: date handling
    # ------------------------------------------------------------------

    def test_resolve_date_range_uses_months_back(self):
        """Three months back from 3/16/2026 gives 12/16/2025 through 3/16/2026."""
        anchor = scrape_costco.parse_cli_date("3/16/2026")

        first_day, last_day = scrape_costco.resolve_date_range(3, today=anchor)

        self.assertEqual("12/16/2025", first_day)
        self.assertEqual("3/16/2026", last_day)

    def test_build_date_windows_splits_long_ranges(self):
        """A 1/01-6/30 range with a 92-day window yields two windows."""
        expected_windows = [
            {"startDate": "1/01/2026", "endDate": "4/02/2026"},
            {"startDate": "4/03/2026", "endDate": "6/30/2026"},
        ]

        actual_windows = scrape_costco.build_date_windows("1/01/2026", "6/30/2026", 92)

        self.assertEqual(expected_windows, actual_windows)

    # ------------------------------------------------------------------
    # scrape_costco: summary fetching
    # ------------------------------------------------------------------

    def test_fetch_summary_windows_records_metadata_and_warns_on_mismatch(self):
        """One request per window; a short window is flagged and echoed once."""
        # The first response claims two in-warehouse receipts but carries one.
        responses = [
            self._receipts_payload(
                [{"transactionBarcode": "abc", "receiptType": "In-Warehouse"}],
                inWarehouse=2,
                gasStation=0,
                carWash=0,
                gasAndCarWash=0,
            ),
            self._receipts_payload(
                [{"transactionBarcode": "def", "receiptType": "In-Warehouse"}],
                inWarehouse=1,
                gasStation=0,
                carWash=0,
                gasAndCarWash=0,
            ),
        ]
        post_patch = mock.patch.object(
            scrape_costco, "graphql_post", side_effect=responses
        )
        echo_patch = mock.patch.object(scrape_costco.click, "echo")

        with post_patch as fake_post, echo_patch as fake_echo:
            combined_payload, window_log = scrape_costco.fetch_summary_windows(
                session=object(),
                start_date="1/01/2026",
                end_date="6/30/2026",
                document_type="all",
                document_sub_type="all",
                window_days=92,
            )

        self.assertEqual(2, fake_post.call_count)
        self.assertEqual(2, len(window_log))
        self.assertTrue(window_log[0]["countMismatch"])
        self.assertFalse(window_log[1]["countMismatch"])
        self.assertEqual("1/01/2026", window_log[0]["startDate"])
        self.assertEqual("4/03/2026", window_log[1]["startDate"])

        barcodes = [
            receipt["transactionBarcode"]
            for receipt in scrape_costco.summary_receipts(combined_payload)
        ]
        self.assertEqual(["abc", "def"], barcodes)

        fake_echo.assert_called_once()
        echoed_text = fake_echo.call_args.args[0]
        self.assertIn("warning: summary count mismatch", echoed_text)

    # ------------------------------------------------------------------
    # scrape_costco: flattening
    # ------------------------------------------------------------------

    def test_flatten_costco_data_preserves_discount_rows(self):
        """The negative coupon line survives as its own flagged item row."""
        summary = self._receipts_payload(
            [
                {
                    "transactionBarcode": "abc",
                    "tenderArray": [{"tenderDescription": "VISA"}],
                    "couponArray": [{"upcnumberCoupon": "2100003746641"}],
                }
            ]
        )
        details = [
            self._receipts_payload(
                [
                    self._warehouse_receipt(
                        "abc",
                        [self._detergent_item(), self._detergent_coupon()],
                        item_count=2,
                        instant_savings=5.0,
                    )
                ]
            )
        ]

        orders, items = scrape_costco.flatten_costco_data(
            summary, details, Path("costco_output/raw")
        )

        self.assertEqual(1, len(orders))
        self.assertEqual(2, len(items))
        purchase_line, coupon_line = items[0], items[1]
        self.assertEqual("false", purchase_line["is_discount_line"])
        self.assertEqual("true", coupon_line["is_discount_line"])
        self.assertEqual("true", coupon_line["is_coupon_line"])

    def test_flatten_costco_data_uses_composite_summary_lookup_key(self):
        """Two summaries share barcode "dup"; the matching date-time one is used."""
        summary = self._receipts_payload(
            [
                {
                    "transactionBarcode": "dup",
                    "transactionDateTime": "2026-03-12T16:16:00",
                    "tenderArray": [{"tenderDescription": "VISA"}],
                    "couponArray": [{"upcnumberCoupon": "111"}],
                },
                {
                    "transactionBarcode": "dup",
                    "transactionDateTime": "2026-02-14T16:25:00",
                    "tenderArray": [{"tenderDescription": "MASTERCARD"}],
                    "couponArray": [],
                },
            ]
        )
        coupon_only_line = self._line_item(
            "111",
            "/ 111",
            None,
            department=14,
            unit=-1,
            identifier=None,
            amount=-5,
            unit_price=0,
        )
        details = [
            self._receipts_payload(
                [
                    self._warehouse_receipt(
                        "dup",
                        [coupon_only_line],
                        item_count=1,
                        instant_savings=5.0,
                        date_time="2026-03-12T16:16:00",
                    )
                ]
            )
        ]

        orders, items = scrape_costco.flatten_costco_data(
            summary, details, Path("costco_output/raw")
        )

        self.assertEqual("VISA", orders[0]["payment_method"])
        self.assertEqual("true", items[0]["is_coupon_line"])
        self.assertIn("dup-2026-03-12T16-16-00.json", items[0]["raw_order_path"])

    # ------------------------------------------------------------------
    # enrich_costco: per-line parsing
    # ------------------------------------------------------------------

    def test_costco_enricher_parses_size_pack_and_discount(self):
        """Check a pack-count line, a volume line, and a coupon line in turn."""
        pack_row = self._parse_line(
            1,
            self._line_item(
                "60357",
                "MIXED PEPPER",
                "6-PACK",
                department=65,
                unit=1,
                identifier="E",
                amount=7.49,
                unit_price=7.49,
            ),
        )
        self.assertEqual("60357", pack_row["retailer_item_id"])
        self.assertEqual("MIXED PEPPER", pack_row["item_name_norm"])
        self.assertEqual("6", pack_row["pack_qty"])
        self.assertEqual("count", pack_row["measure_type"])
        self.assertEqual("costco:abc:1", pack_row["normalized_row_id"])
        self.assertEqual("exact_retailer_item_id", pack_row["normalization_basis"])
        self.assertTrue(pack_row["normalized_item_id"])
        self.assertEqual("6", pack_row["normalized_quantity"])
        self.assertEqual("count", pack_row["normalized_quantity_unit"])

        # Two units of a 1.74 qt product are expected to normalise to 3.48 qt.
        quart_row = self._parse_line(
            3,
            self._line_item(
                "1185912",
                "KS ALMND BAR US 1.74QTS CN",
                None,
                department=18,
                unit=2,
                identifier="E",
                amount=21.98,
                unit_price=10.99,
            ),
        )
        self.assertEqual("3.48", quart_row["normalized_quantity"])
        self.assertEqual("qt", quart_row["normalized_quantity_unit"])

        coupon_row = self._parse_line(2, self._detergent_coupon())
        self.assertEqual("true", coupon_row["is_discount_line"])
        self.assertEqual("true", coupon_row["is_coupon_line"])
        self.assertEqual("false", coupon_row["is_item"])

    def test_costco_name_cleanup_removes_dual_weight_and_logistics_artifacts(self):
        """Dual kg/lb sizes and the "#80873U - T12/H3/P36" tail leave the name."""
        dual_weight_row = self._parse_line(
            1,
            self._line_item(
                "18600",
                "MANDARINS 2.27 KG / 5 LBS",
                None,
                department=65,
                unit=1,
                identifier="E",
                amount=7.49,
                unit_price=7.49,
            ),
        )
        self.assertEqual("MANDARIN", dual_weight_row["item_name_norm"])
        self.assertEqual("5", dual_weight_row["size_value"])
        self.assertEqual("lb", dual_weight_row["size_unit"])

        table_row = self._parse_line(
            2,
            self._line_item(
                "1375005",
                "LIFE 6'TABLE MDL #80873U - T12/H3/P36",
                None,
                department=18,
                unit=1,
                identifier="E",
                amount=119.98,
                unit_price=119.98,
            ),
        )
        self.assertEqual("LIFE 6'TABLE MDL", table_row["item_name_norm"])

    # ------------------------------------------------------------------
    # enrich_costco: whole-directory enrichment
    # ------------------------------------------------------------------

    def test_build_items_enriched_matches_discount_to_item(self):
        """The -5 coupon is attached to item 4873222, netting 19.99 to 14.99."""
        with tempfile.TemporaryDirectory() as scratch:
            raw_dir = Path(scratch) / "raw"
            raw_dir.mkdir()
            receipt_payload = self._receipts_payload(
                [
                    {
                        "transactionBarcode": "abc",
                        "transactionDate": "2026-03-12",
                        "itemArray": [
                            self._detergent_item(),
                            self._detergent_coupon(),
                        ],
                    }
                ]
            )
            receipt_file = raw_dir / "abc.json"
            receipt_file.write_text(json.dumps(receipt_payload), encoding="utf-8")

            enriched = enrich_costco.build_items_enriched(raw_dir)

            bought = next(r for r in enriched if r["is_discount_line"] == "false")
            coupon = next(r for r in enriched if r["is_discount_line"] == "true")
            self.assertEqual("-5", bought["matched_discount_amount"])
            self.assertEqual("14.99", bought["net_line_total"])
            self.assertIn("matched_discount=4873222", bought["parse_notes"])
            self.assertIn("matched_to_item=4873222", coupon["parse_notes"])

    # ------------------------------------------------------------------
    # validate_cross_retailer_flow
    # ------------------------------------------------------------------

    def test_cross_retailer_validation_writes_proof_example(self):
        """One Giant and one Costco BANANA row produce one proof example row."""
        with tempfile.TemporaryDirectory() as scratch:
            workdir = Path(scratch)
            giant_csv = workdir / "giant_items_enriched.csv"
            costco_csv = workdir / "costco_items_enriched.csv"
            outdir = workdir / "combined"
            columns = enrich_costco.OUTPUT_FIELDS

            # Start every column blank, then fill in only what the test sets.
            giant_row = {
                **dict.fromkeys(columns, ""),
                "retailer": "giant",
                "order_id": "g1",
                "line_no": "1",
                "order_date": "2026-03-01",
                "retailer_item_id": "100",
                "item_name": "FRESH BANANA",
                "item_name_norm": "BANANA",
                "upc": "4011",
                "measure_type": "weight",
                "is_store_brand": "false",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
                "line_total": "1.29",
            }
            costco_row = {
                **dict.fromkeys(columns, ""),
                "retailer": "costco",
                "order_id": "c1",
                "line_no": "1",
                "order_date": "2026-03-12",
                "retailer_item_id": "30669",
                "item_name": "BANANAS 3 LB / 1.36 KG",
                "item_name_norm": "BANANA",
                "upc": "",
                "size_value": "3",
                "size_unit": "lb",
                "measure_type": "weight",
                "is_store_brand": "false",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
                "line_total": "2.98",
            }
            self._write_single_row_csv(giant_csv, columns, giant_row)
            self._write_single_row_csv(costco_csv, columns, costco_row)

            validate_cross_retailer_flow.main.callback(
                giant_items_enriched_csv=str(giant_csv),
                costco_items_enriched_csv=str(costco_csv),
                outdir=str(outdir),
            )

            proof_path = outdir / "proof_examples.csv"
            self.assertTrue(proof_path.exists())
            with proof_path.open(newline="", encoding="utf-8") as handle:
                proof_rows = list(csv.DictReader(handle))
            self.assertEqual(1, len(proof_rows))
            self.assertEqual("banana", proof_rows[0]["proof_name"])

    # ------------------------------------------------------------------
    # scrape_costco: command entry point
    # ------------------------------------------------------------------

    def test_main_writes_summary_request_metadata(self):
        """``main`` saves the per-window metadata to raw/summary_requests.json."""
        with tempfile.TemporaryDirectory() as scratch:
            outdir = Path(scratch) / "costco_output"
            summary = self._receipts_payload(
                [
                    {
                        "transactionBarcode": "abc",
                        "receiptType": "In-Warehouse",
                        "tenderArray": [],
                        "couponArray": [],
                    }
                ],
                inWarehouse=1,
                gasStation=0,
                carWash=0,
                gasAndCarWash=0,
            )
            detail = self._receipts_payload(
                [
                    self._warehouse_receipt(
                        "abc", [], item_count=1, instant_savings=0
                    )
                ]
            )
            window_log = [
                {
                    "startDate": "1/01/2026",
                    "endDate": "3/31/2026",
                    "text": "custom",
                    "documentType": "all",
                    "documentSubType": "all",
                    "returnedReceipts": 1,
                    "returnedInWarehouseReceipts": 1,
                    "inWarehouse": 1,
                    "gasStation": 0,
                    "carWash": 0,
                    "gasAndCarWash": 0,
                    "countMismatch": False,
                }
            ]

            # Patchers are entered in the order listed in the ``with`` below.
            config_patch = mock.patch.object(
                scrape_costco,
                "load_config",
                return_value={
                    "authorization": "",
                    "client_id": "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
                    "client_identifier": "481b1aec-aa3b-454b-b81b-48187e28f205",
                },
            )
            profile_patch = mock.patch.object(
                scrape_costco,
                "find_firefox_profile_dir",
                return_value=Path("/tmp/profile"),
            )
            headers_patch = mock.patch.object(
                scrape_costco,
                "load_costco_browser_headers",
                return_value={
                    "costco-x-authorization": "Bearer header.payload.signature",
                    "costco-x-wcs-clientId": "4900eb1f-0c10-4bd9-99c3-c59e6c1ecebf",
                    "client-identifier": "481b1aec-aa3b-454b-b81b-48187e28f205",
                },
            )
            session_patch = mock.patch.object(
                scrape_costco, "build_session", return_value=object()
            )
            summary_patch = mock.patch.object(
                scrape_costco,
                "fetch_summary_windows",
                return_value=(summary, window_log),
            )
            detail_patch = mock.patch.object(
                scrape_costco, "graphql_post", return_value=detail
            )

            with config_patch, profile_patch, headers_patch, \
                    session_patch, summary_patch, detail_patch:
                scrape_costco.main.callback(
                    outdir=str(outdir),
                    document_type="all",
                    document_sub_type="all",
                    window_days=92,
                    months_back=3,
                    firefox_profile_dir=None,
                )

            metadata_path = outdir / "raw" / "summary_requests.json"
            self.assertTrue(metadata_path.exists())
            saved_log = json.loads(metadata_path.read_text(encoding="utf-8"))
            self.assertEqual(window_log, saved_log)
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()