Finalize post-refactor layout and remove old pipeline files

This commit is contained in:
ben
2026-03-24 17:09:57 -04:00
parent cdb7a15739
commit 09829b2b9d
17 changed files with 59 additions and 1154 deletions

View File

@@ -1,119 +0,0 @@
import unittest
import build_canonical_layer
class CanonicalLayerTests(unittest.TestCase):
    """Exercise ``build_canonical_layer`` linking and canonical-name cleanup."""

    @staticmethod
    def _observed(
        product_id,
        upc,
        retailer_item_id,
        name_norm,
        brand,
        size_value,
        size_unit,
        pack_qty,
        measure_type,
        is_fee="false",
    ):
        """Build one observed-product fixture row.

        The variant is always empty and the discount/coupon flags are always
        ``"false"`` in these fixtures, so only the varying fields are taken
        as arguments.
        """
        return {
            "observed_product_id": product_id,
            "representative_upc": upc,
            "representative_retailer_item_id": retailer_item_id,
            "representative_name_norm": name_norm,
            "representative_brand": brand,
            "representative_variant": "",
            "representative_size_value": size_value,
            "representative_size_unit": size_unit,
            "representative_pack_qty": pack_qty,
            "representative_measure_type": measure_type,
            "is_fee": is_fee,
            "is_discount_line": "false",
            "is_coupon_line": "false",
        }

    def test_build_canonical_layer_auto_links_exact_upc_and_name_size_only(self):
        """Expect two canonicals and four links from six observed rows.

        The two rows sharing UPC ``111`` must be linked by ``exact_upc``, the
        two ROTINI 16 oz rows by ``exact_name_size``, and the fee row and the
        size-less LIME row must not be linked at all.
        """
        row = self._observed
        # Argument order: id, upc, retailer item id, name, brand,
        # size value, size unit, pack qty, measure type.
        observed_rows = [
            row("gobs_1", "111", "11", "GALA APPLE", "SB", "5", "lb", "", "weight"),
            row("gobs_2", "111", "12", "LARGE WHITE EGGS", "SB", "", "", "18", "count"),
            row("gobs_3", "", "21", "ROTINI", "", "16", "oz", "", "weight"),
            row("gobs_4", "", "22", "ROTINI", "SB", "16", "oz", "", "weight"),
            row("gobs_5", "", "99", "GL BAG CHARGE", "", "", "", "", "each", is_fee="true"),
            row("gobs_6", "", "", "LIME", "", "", "", "", "each"),
        ]

        canonicals, links = build_canonical_layer.build_canonical_layer(observed_rows)

        self.assertEqual(2, len(canonicals))
        self.assertEqual(4, len(links))

        methods = {}
        for link in links:
            methods[link["observed_product_id"]] = link["link_method"]

        expected_methods = (
            ("gobs_1", "exact_upc"),
            ("gobs_2", "exact_upc"),
            ("gobs_3", "exact_name_size"),
            ("gobs_4", "exact_name_size"),
        )
        for product_id, method in expected_methods:
            self.assertEqual(method, methods[product_id])
        for product_id in ("gobs_5", "gobs_6"):
            self.assertNotIn(product_id, methods)

    def test_clean_canonical_name_removes_packaging_noise(self):
        """Check that stray punctuation and a pack-count prefix are stripped."""
        cases = (
            ("LIME", "LIME . / ."),
            ("EGG", "5DZ EGG / /"),
        )
        for expected, raw_name in cases:
            self.assertEqual(
                expected, build_canonical_layer.clean_canonical_name(raw_name)
            )
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
unittest.main()

View File

@@ -7,7 +7,6 @@ from unittest import mock
import enrich_costco
import scrape_costco
import validate_cross_retailer_flow
class CostcoPipelineTests(unittest.TestCase):
@@ -423,76 +422,6 @@ class CostcoPipelineTests(unittest.TestCase):
self.assertIn("matched_discount=4873222", purchase_row["parse_notes"])
self.assertIn("matched_to_item=4873222", discount_row["parse_notes"])
def test_cross_retailer_validation_writes_proof_example(self):
    """Validate one Giant and one Costco banana row end to end.

    Each row is written to its own enriched-items CSV, the cross-retailer
    validation entry point is invoked on them, and the resulting
    ``proof_examples.csv`` must contain exactly one row whose
    ``proof_name`` is ``banana``.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        giant_csv = workdir / "giant_items_enriched.csv"
        costco_csv = workdir / "costco_items_enriched.csv"
        outdir = workdir / "combined"
        columns = enrich_costco.OUTPUT_FIELDS

        # Start every row with all output columns blank, then fill in the
        # fields this scenario cares about.
        giant_row = {
            **dict.fromkeys(columns, ""),
            "retailer": "giant",
            "order_id": "g1",
            "line_no": "1",
            "order_date": "2026-03-01",
            "retailer_item_id": "100",
            "item_name": "FRESH BANANA",
            "item_name_norm": "BANANA",
            "upc": "4011",
            "measure_type": "weight",
            "is_store_brand": "false",
            "is_fee": "false",
            "is_discount_line": "false",
            "is_coupon_line": "false",
            "line_total": "1.29",
        }
        costco_row = {
            **dict.fromkeys(columns, ""),
            "retailer": "costco",
            "order_id": "c1",
            "line_no": "1",
            "order_date": "2026-03-12",
            "retailer_item_id": "30669",
            "item_name": "BANANAS 3 LB / 1.36 KG",
            "item_name_norm": "BANANA",
            "upc": "",
            "size_value": "3",
            "size_unit": "lb",
            "measure_type": "weight",
            "is_store_brand": "false",
            "is_fee": "false",
            "is_discount_line": "false",
            "is_coupon_line": "false",
            "line_total": "2.98",
        }

        for csv_path, item_row in ((giant_csv, giant_row), (costco_csv, costco_row)):
            with csv_path.open("w", newline="", encoding="utf-8") as handle:
                writer = csv.DictWriter(handle, fieldnames=columns)
                writer.writeheader()
                writer.writerow(item_row)

        validate_cross_retailer_flow.main.callback(
            giant_items_enriched_csv=str(giant_csv),
            costco_items_enriched_csv=str(costco_csv),
            outdir=str(outdir),
        )

        proof_path = outdir / "proof_examples.csv"
        self.assertTrue(proof_path.exists())
        with proof_path.open(newline="", encoding="utf-8") as handle:
            proof_rows = list(csv.DictReader(handle))

        self.assertEqual(1, len(proof_rows))
        self.assertEqual("banana", proof_rows[0]["proof_name"])
def test_main_writes_summary_request_metadata(self):
with tempfile.TemporaryDirectory() as tmpdir:
outdir = Path(tmpdir) / "costco_output"

View File

@@ -1,67 +0,0 @@
import unittest
import build_observed_products
class ObservedProductTests(unittest.TestCase):
    """Exercise ``build_observed_products`` aggregation."""

    def test_build_observed_products_aggregates_rows_with_same_key(self):
        """Two purchases of the same item must collapse into one observed product.

        The aggregate must report two sightings, the earliest and latest order
        dates, the shared retailer item id and UPC, and include the first raw
        item name among its examples.
        """
        first_purchase = {
            "retailer": "giant",
            "order_id": "1",
            "line_no": "1",
            "order_date": "2026-01-01",
            "item_name": "SB GALA APPLE 5LB",
            "item_name_norm": "GALA APPLE",
            "retailer_item_id": "11",
            "upc": "111",
            "brand_guess": "SB",
            "variant": "",
            "size_value": "5",
            "size_unit": "lb",
            "pack_qty": "",
            "measure_type": "weight",
            "image_url": "https://example.test/a.jpg",
            "is_store_brand": "true",
            "is_fee": "false",
            "is_discount_line": "false",
            "is_coupon_line": "false",
            "line_total": "7.99",
        }
        # The repeat purchase differs only in order, date, raw-name spacing,
        # image availability and price; overriding keeps the key order intact.
        second_purchase = {
            **first_purchase,
            "order_id": "2",
            "order_date": "2026-01-10",
            "item_name": "SB GALA APPLE 5 LB",
            "image_url": "",
            "line_total": "8.49",
        }

        observed = build_observed_products.build_observed_products(
            [first_purchase, second_purchase]
        )

        self.assertEqual(1, len(observed))
        product = observed[0]
        expected_fields = (
            ("times_seen", "2"),
            ("first_seen_date", "2026-01-01"),
            ("last_seen_date", "2026-01-10"),
            ("representative_retailer_item_id", "11"),
            ("representative_upc", "111"),
        )
        for field, value in expected_fields:
            self.assertEqual(value, product[field])
        self.assertIn("SB GALA APPLE 5LB", product["raw_name_examples"])
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
unittest.main()

View File

@@ -65,6 +65,21 @@ class PipelineStatusTests(unittest.TestCase):
},
],
resolutions=[],
links=[
{
"normalized_item_id": "gnorm_banana",
"catalog_id": "cat_banana",
"review_status": "approved",
}
],
catalog=[
{
"catalog_id": "cat_banana",
"catalog_name": "BANANA",
"product_type": "banana",
"category": "produce",
}
],
)
counts = {row["stage"]: row["count"] for row in summary}

View File

@@ -1,133 +0,0 @@
import tempfile
import unittest
from pathlib import Path
import build_observed_products
import build_review_queue
from layer_helpers import write_csv_rows
class ReviewQueueTests(unittest.TestCase):
    """Exercise ``build_review_queue`` and its command entry point."""

    def test_build_review_queue_preserves_existing_status(self):
        """A previously reviewed entry must keep its status and notes.

        One observed product with no image and two distinct raw names yields
        two queue rows; the ``missing_image`` row must carry over the
        ``approved`` status and notes from the existing review.
        """
        observed_rows = [
            {
                "observed_product_id": "gobs_1",
                "retailer": "giant",
                "representative_upc": "111",
                "representative_image_url": "",
                "representative_name_norm": "GALA APPLE",
                "times_seen": "2",
                "distinct_item_names_count": "2",
                "distinct_upcs_count": "1",
                "is_fee": "false",
                "is_discount_line": "false",
                "is_coupon_line": "false",
            }
        ]
        item_rows = [
            {
                "observed_product_id": "gobs_1",
                "item_name": raw_name,
                "item_name_norm": "GALA APPLE",
                "line_total": price,
            }
            for raw_name, price in (
                ("SB GALA APPLE 5LB", "7.99"),
                ("SB GALA APPLE 5 LB", "8.49"),
            )
        ]
        reviewed_id = build_review_queue.stable_id("rvw", "gobs_1|missing_image")
        existing = {
            reviewed_id: {
                "status": "approved",
                "resolution_notes": "looked fine",
                "created_at": "2026-03-15",
            }
        }

        queue = build_review_queue.build_review_queue(
            observed_rows, item_rows, existing, "2026-03-16"
        )

        self.assertEqual(2, len(queue))
        missing_image_rows = [
            entry for entry in queue if entry["reason_code"] == "missing_image"
        ]
        missing_image = missing_image_rows[0]
        self.assertEqual("approved", missing_image["status"])
        self.assertEqual("looked fine", missing_image["resolution_notes"])

    def test_review_queue_main_writes_output(self):
        """The command callback must create the output CSV from its two inputs."""
        with tempfile.TemporaryDirectory() as tmpdir:
            workdir = Path(tmpdir)
            observed_path = workdir / "products_observed.csv"
            items_path = workdir / "items_enriched.csv"
            output_path = workdir / "review_queue.csv"

            observed_rows = [
                {
                    "observed_product_id": "gobs_1",
                    "retailer": "giant",
                    "observed_key": "giant|upc=111|name=GALA APPLE",
                    "representative_retailer_item_id": "11",
                    "representative_upc": "111",
                    "representative_item_name": "SB GALA APPLE 5LB",
                    "representative_name_norm": "GALA APPLE",
                    "representative_brand": "SB",
                    "representative_variant": "",
                    "representative_size_value": "5",
                    "representative_size_unit": "lb",
                    "representative_pack_qty": "",
                    "representative_measure_type": "weight",
                    "representative_image_url": "",
                    "is_store_brand": "true",
                    "is_fee": "false",
                    "is_discount_line": "false",
                    "is_coupon_line": "false",
                    "first_seen_date": "2026-01-01",
                    "last_seen_date": "2026-01-10",
                    "times_seen": "2",
                    "example_order_id": "1",
                    "example_item_name": "SB GALA APPLE 5LB",
                    "raw_name_examples": "SB GALA APPLE 5LB | SB GALA APPLE 5 LB",
                    "normalized_name_examples": "GALA APPLE",
                    "example_prices": "7.99 | 8.49",
                    "distinct_item_names_count": "2",
                    "distinct_retailer_item_ids_count": "1",
                    "distinct_upcs_count": "1",
                }
            ]
            item_rows = [
                {
                    "retailer": "giant",
                    "order_id": "1",
                    "line_no": "1",
                    "item_name": "SB GALA APPLE 5LB",
                    "item_name_norm": "GALA APPLE",
                    "retailer_item_id": "11",
                    "upc": "111",
                    "size_value": "5",
                    "size_unit": "lb",
                    "pack_qty": "",
                    "measure_type": "weight",
                    "is_store_brand": "true",
                    "is_fee": "false",
                    "is_discount_line": "false",
                    "is_coupon_line": "false",
                    "line_total": "7.99",
                }
            ]

            write_csv_rows(
                observed_path, observed_rows, build_observed_products.OUTPUT_FIELDS
            )
            # The items file uses exactly the columns present in the fixture row.
            item_columns = list(item_rows[0])
            write_csv_rows(items_path, item_rows, item_columns)

            build_review_queue.main.callback(
                observed_csv=str(observed_path),
                items_enriched_csv=str(items_path),
                output_csv=str(output_path),
            )

            self.assertTrue(output_path.exists())
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
unittest.main()

View File

@@ -3,7 +3,7 @@ import tempfile
import unittest
from pathlib import Path
import scraper
import scrape_giant as scraper
class ScraperTests(unittest.TestCase):