data-model refactor and prep scope

Fix t1.13 evidence hashes
Record t1.13 task evidence
2026-03-18 13:08:28 -04:00 · 2026-03-17 15:08:09 -04:00 · 2026-03-17 15:07:51 -04:00 · 2026-03-17 15:07:48 -04:00 · 2026-03-17 15:07:45 -04:00 · 2026-03-17 15:07:42 -04:00
12 changed files with 952 additions and 288 deletions
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@ Run each script step-by-step from the terminal.
 4. `enrich_costco.py`: normalize Costco line items
 5. `build_purchases.py`: combine retailer outputs into one purchase table
 6. `review_products.py`: review unresolved product matches in the terminal
+7. `report_pipeline_status.py`: show how many rows survive each stage

 ## Requirements

@@ -31,6 +32,7 @@ pip install -r requirements.txt
 Current version works best with `.env` in the project root.  The scraper will prompt for these values if they are not found in the current browser session.  
 - `scrape_giant` prompts if `GIANT_USER_ID` or `GIANT_LOYALTY_NUMBER` is missing.
 - `scrape_costco` tries `.env` first, then Firefox local storage for session-backed values; `COSTCO_CLIENT_IDENTIFIER` should still be set explicitly.
+- Costco discount matching happens later in `enrich_costco.py`; you do not need to pre-clean discount lines by hand.

 ```env
 GIANT_USER_ID=...
@@ -53,6 +55,8 @@ python enrich_costco.py
 python build_purchases.py
 python review_products.py
 python build_purchases.py
+python review_products.py --refresh-only
+python report_pipeline_status.py
 ```

 Why run `build_purchases.py` twice:
@@ -66,6 +70,12 @@ If you only want to refresh the queue without reviewing interactively:
 python review_products.py --refresh-only
 ```

+If you want a quick stage-by-stage accountability check:
+
+```bash
+python report_pipeline_status.py
+```
+
 ## Key Outputs

 Giant:
@@ -77,6 +87,7 @@ Costco:
 - `costco_output/orders.csv`
 - `costco_output/items.csv`
 - `costco_output/items_enriched.csv`
+- `costco_output/items_enriched.csv` now preserves raw totals and matched net discount fields

 Combined:
 - `combined_output/purchases.csv`
@@ -85,6 +96,8 @@ Combined:
 - `combined_output/canonical_catalog.csv`
 - `combined_output/product_links.csv`
 - `combined_output/comparison_examples.csv`
+- `combined_output/pipeline_status.csv`
+- `combined_output/pipeline_status.json`

 ## Review Workflow

@@ -95,9 +108,14 @@ Run `review_products.py` to cleanup unresolved or weakly unified items:
 - skip it for later
 Decisions are saved and reused on later runs.

+The review step is intentionally conservative:
+- weak exact-name matches stay in the queue instead of auto-creating canonical products
+- canonical names should describe stable product identity, not retailer packaging text
+
 ## Notes
 - This project is designed around fragile retailer scraping flows, so the code favors explicit retailer-specific steps over heavy abstraction.
 - `scrape_giant.py` and `scrape_costco.py` are meant to work as standalone acquisition scripts.
+- Costco discount rows are preserved for auditability and also matched back to purchased items during enrichment.
 - `validate_cross_retailer_flow.py` is a proof/check script, not a required production step.

 ## Test
--- a/build_canonical_layer.py
+++ b/build_canonical_layer.py
@@ -1,4 +1,5 @@
 import click
+import re

 from layer_helpers import read_csv_rows, representative_value, stable_id, write_csv_rows

@@ -20,6 +21,8 @@ CANONICAL_FIELDS = [
    "updated_at",
 ]

+CANONICAL_DROP_TOKENS = {"CT", "COUNT", "COUNTS", "DOZ", "DOZEN", "DOZ.", "PACK"}
+
 LINK_FIELDS = [
    "observed_product_id",
    "canonical_product_id",
@@ -91,26 +94,24 @@ def auto_link_rule(observed_row):
            "high",
        )

-    if (
-        observed_row.get("representative_name_norm")
-        and not observed_row.get("representative_size_value")
-        and not observed_row.get("representative_size_unit")
-        and not observed_row.get("representative_pack_qty")
-    ):
-        return (
-            "exact_name",
-            "|".join(
-                [
-                    f"name={observed_row['representative_name_norm']}",
-                    f"measure={observed_row['representative_measure_type']}",
-                ]
-            ),
-            "medium",
-        )
-
    return "", "", ""


+def clean_canonical_name(name):
+    tokens = []
+    for token in re.sub(r"[^A-Z0-9\s]", " ", (name or "").upper()).split():
+        if token.isdigit():
+            continue
+        if token in CANONICAL_DROP_TOKENS:
+            continue
+        if re.fullmatch(r"\d+(?:PK|PACK)", token):
+            continue
+        if re.fullmatch(r"\d+DZ", token):
+            continue
+        tokens.append(token)
+    return " ".join(tokens).strip()
+
+
 def canonical_row_for_group(canonical_product_id, group_rows, link_method):
    quantity_value, quantity_unit = normalized_quantity(
        {
@@ -130,7 +131,10 @@ def canonical_row_for_group(canonical_product_id, group_rows, link_method):
    )
    return {
        "canonical_product_id": canonical_product_id,
-        "canonical_name": representative_value(group_rows, "representative_name_norm"),
+        "canonical_name": clean_canonical_name(
+            representative_value(group_rows, "representative_name_norm")
+        )
+        or representative_value(group_rows, "representative_name_norm"),
        "product_type": "",
        "brand": representative_value(group_rows, "representative_brand"),
        "variant": representative_value(group_rows, "representative_variant"),
--- a/build_purchases.py
+++ b/build_purchases.py
@@ -33,6 +33,8 @@ PURCHASE_FIELDS = [
    "measure_type",
    "line_total",
    "unit_price",
+    "matched_discount_amount",
+    "net_line_total",
    "store_name",
    "store_number",
    "store_city",
@@ -94,7 +96,7 @@ def decimal_or_zero(value):


 def derive_metrics(row):
-    line_total = to_decimal(row.get("line_total"))
+    line_total = to_decimal(row.get("net_line_total") or row.get("line_total"))
    qty = to_decimal(row.get("qty"))
    pack_qty = to_decimal(row.get("pack_qty"))
    size_value = to_decimal(row.get("size_value"))
@@ -292,6 +294,8 @@ def build_purchase_rows(
                "measure_type": row["measure_type"],
                "line_total": row["line_total"],
                "unit_price": row["unit_price"],
+                "matched_discount_amount": row.get("matched_discount_amount", ""),
+                "net_line_total": row.get("net_line_total", ""),
                "store_name": order_row.get("store_name", ""),
                "store_number": order_row.get("store_number", ""),
                "store_city": order_row.get("store_city", ""),
--- a/enrich_costco.py
+++ b/enrich_costco.py
@@ -1,6 +1,7 @@
 import csv
 import json
 import re
+from collections import defaultdict
 from pathlib import Path

 import click
@@ -29,6 +30,7 @@ HASH_SIZE_RE = re.compile(r"(?<![A-Z0-9])(\d+(?:\.\d+)?)#\b")
 PACK_DASH_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*-\s*PACK\b")
 PACK_WORD_RE = re.compile(r"(?<![A-Z0-9])(\d+)\s*PACK\b")
 SIZE_RE = re.compile(r"(?<![A-Z0-9])(\d+(?:\.\d+)?)\s*(OZ|LB|LBS|CT|KG|G)\b")
+DISCOUNT_TARGET_RE = re.compile(r"^/\s*(\d+)\b")


 def clean_costco_name(name):
@@ -156,6 +158,13 @@ def is_discount_item(item):
    return amount < 0 or unit < 0 or description.startswith("/")


+def discount_target_id(raw_name):
+    match = DISCOUNT_TARGET_RE.match(normalize_whitespace(raw_name))
+    if not match:
+        return ""
+    return match.group(1)
+
+
 def parse_costco_item(order_id, order_date, raw_path, line_no, item):
    raw_name = combine_description(item)
    cleaned_name = clean_costco_name(raw_name)
@@ -190,6 +199,8 @@ def parse_costco_item(order_id, order_date, raw_path, line_no, item):
        "reward_savings": "",
        "coupon_savings": str(item.get("amount", "")) if is_discount_line else "",
        "coupon_price": "",
+        "matched_discount_amount": "",
+        "net_line_total": str(item.get("amount", "")) if not is_discount_line else "",
        "image_url": "",
        "raw_order_path": raw_path.as_posix(),
        "item_name_norm": item_name_norm,
@@ -211,6 +222,51 @@ def parse_costco_item(order_id, order_date, raw_path, line_no, item):
    }


+def match_costco_discounts(rows):
+    rows_by_order = defaultdict(list)
+    for row in rows:
+        rows_by_order[row["order_id"]].append(row)
+
+    for order_rows in rows_by_order.values():
+        purchase_rows_by_item_id = defaultdict(list)
+        for row in order_rows:
+            if row.get("is_discount_line") == "true":
+                continue
+            retailer_item_id = row.get("retailer_item_id", "")
+            if retailer_item_id:
+                purchase_rows_by_item_id[retailer_item_id].append(row)
+
+        for row in order_rows:
+            if row.get("is_discount_line") != "true":
+                continue
+            target_id = discount_target_id(row.get("item_name", ""))
+            if not target_id:
+                continue
+            matches = purchase_rows_by_item_id.get(target_id, [])
+            if len(matches) != 1:
+                row["parse_notes"] = normalize_whitespace(
+                    f"{row.get('parse_notes', '')};discount_target_unmatched={target_id}"
+                ).strip(";")
+                continue
+
+            purchase_row = matches[0]
+            matched_discount = to_decimal(row.get("line_total"))
+            gross_total = to_decimal(purchase_row.get("line_total"))
+            existing_discount = to_decimal(purchase_row.get("matched_discount_amount")) or 0
+            if matched_discount is None or gross_total is None:
+                continue
+
+            total_discount = existing_discount + matched_discount
+            purchase_row["matched_discount_amount"] = format_decimal(total_discount)
+            purchase_row["net_line_total"] = format_decimal(gross_total + total_discount)
+            purchase_row["parse_notes"] = normalize_whitespace(
+                f"{purchase_row.get('parse_notes', '')};matched_discount={target_id}"
+            ).strip(";")
+            row["parse_notes"] = normalize_whitespace(
+                f"{row.get('parse_notes', '')};matched_to_item={target_id}"
+            ).strip(";")
+
+
 def iter_costco_rows(raw_dir):
    for path in discover_json_files(raw_dir):
        if path.name in {"summary.json", "summary_requests.json"}:
@@ -238,6 +294,7 @@ def discover_json_files(raw_dir):

 def build_items_enriched(raw_dir):
    rows = list(iter_costco_rows(raw_dir))
+    match_costco_discounts(rows)
    rows.sort(key=lambda row: (row["order_date"], row["order_id"], int(row["line_no"])))
    return rows

--- a/enrich_giant.py
+++ b/enrich_giant.py
@@ -33,6 +33,8 @@ OUTPUT_FIELDS = [
    "reward_savings",
    "coupon_savings",
    "coupon_price",
+    "matched_discount_amount",
+    "net_line_total",
    "image_url",
    "raw_order_path",
    "item_name_norm",
@@ -371,6 +373,8 @@ def parse_item(order_id, order_date, raw_path, line_no, item):
        "reward_savings": stringify(item.get("rewardSavings")),
        "coupon_savings": stringify(item.get("couponSavings")),
        "coupon_price": stringify(item.get("couponPrice")),
+        "matched_discount_amount": "",
+        "net_line_total": stringify(item.get("totalPrice")),
        "image_url": extract_image_url(item),
        "raw_order_path": raw_path.as_posix(),
        "item_name_norm": normalized_name,
--- a/pm/data-model.org
+++ b/pm/data-model.org
@@ -1,12 +1,13 @@
-* grocery data model and file layout
+* Grocery data model and file layout

 This document defines the shared file layout and stable CSV schemas for the
-grocery pipeline. The goal is to keep retailer-specific ingest separate from
-cross-retailer product modeling so Giant-specific quirks do not become the
-system of record.
-
-** design rules
+grocery pipeline.
+Goals:
+- Ensure data gathering is separate from analysis
+- Enable multiple data gathering methods
+- One layer for review and analysis  

+** Design Rules
 - Raw retailer exports remain the source of truth.
 - Retailer parsing is isolated to retailer-specific files and ids.
 - Cross-retailer product layers begin only after retailer-specific enrichment.
@@ -14,296 +15,313 @@ system of record.
  existing columns should not be repurposed.
 - Unknown values should be left blank rather than guessed.

-** directory layout
-
-Use one top-level data root:
-
-#+begin_example
-data/
-  giant/
-    raw/
-      history.json
-      orders/
-        <order_id>.json
-    orders.csv
-    items_raw.csv
-    items_enriched.csv
-    products_observed.csv
-  costco/
-    raw/
-      ...
-    orders.csv
-    items_raw.csv
-    items_enriched.csv
-    products_observed.csv
-  shared/
-    products_canonical.csv
-    product_links.csv
-    review_queue.csv
-#+end_example
-
-** layer responsibilities
-
- `data/<retailer>/raw/`
-  Stores unmodified retailer payloads exactly as fetched.
- `data/<retailer>/orders.csv`
-  One row per retailer order or visit, flattened from raw order data.
- `data/<retailer>/items_raw.csv`
-  One row per retailer line item, preserving retailer-native values needed for
-  reruns and debugging.
- `data/<retailer>/items_enriched.csv`
-  Parsed retailer line items with normalized fields and derived guesses, still
-  retailer-specific.
- `data/<retailer>/products_observed.csv`
-  Distinct retailer-facing observed products aggregated from enriched items.
- `data/shared/products_canonical.csv`
-  Cross-retailer canonical product entities used for comparison.
- `data/shared/product_links.csv`
-  Links from retailer observed products to canonical products.
- `data/shared/review_queue.csv`
-  Human review queue for unresolved or low-confidence matching/parsing cases.
-
-** retailer-specific versus shared
-
-Retailer-specific:
-
+*** Retailer-specific data:
 - raw json payloads
 - retailer order ids
 - retailer line numbers
 - retailer category ids and names
 - retailer item names
 - retailer image urls
- parsed guesses derived from one retailer feed
 - observed products scoped to one retailer

-Shared:
-
+*** Review/Combined data:
 - canonical products
 - observed-to-canonical links
 - human review state for unresolved cases
 - comparison-ready normalized quantity basis fields

+// I don't like this terminology - what is "observed" doing for us?
+// output should be normalized_items, not observed
+// unless this is the way we're matching multiple upc's?
 Observed products are the boundary between retailer-specific parsing and
 cross-retailer canonicalization. Nothing upstream of `products_observed.csv`
 should require knowledge of another retailer.

-** schema: `data/<retailer>/orders.csv`
+* Pipeline
+Key: 
+- (1) input
+- [2] output

-One row per order or visit.
+Each step can be run alone if its dependencies exist.

-| column | meaning |
-|-
-| `retailer` | retailer slug such as `giant` |
-| `order_id` | retailer order or visit id |
-| `order_date` | order date in `YYYY-MM-DD` when available |
-| `delivery_date` | fulfillment date in `YYYY-MM-DD` when available |
-| `service_type` | retailer service type such as `INSTORE` |
-| `order_total` | order total as provided by retailer |
-| `payment_method` | retailer payment label |
-| `total_item_count` | total line count or item count from retailer |
-| `total_savings` | total savings as provided by retailer |
-| `your_savings_total` | savings field from retailer when present |
-| `coupons_discounts_total` | coupon/discount total from retailer |
-| `store_name` | retailer store name |
-| `store_number` | retailer store number |
-| `store_address1` | street address |
-| `store_city` | city |
-| `store_state` | state or province |
-| `store_zipcode` | postal code |
-| `refund_order` | retailer refund flag |
-| `ebt_order` | retailer EBT flag |
-| `raw_history_path` | relative path to source history payload |
-| `raw_order_path` | relative path to source order payload |
+** 1. Collect
+Get raw receipt/visit and item data from a retailer.  Scraping is unique to a Retailer and method (e.g., Giant-Web and Giant-Scan).  Preserve complete raw data and preserve fidelity.  Avoid interpretation beyond basic data flattening.
+ - (1) Source access (varies, e.g., header data, auth for API access)
+ - [1] collected visits from each retailer
+ - [2] collected items from each retailer
+ - [3] any other raw data that supports [1] and [2]; explicit source (eventual receipt scan?)
+   
+** 2. Normalize
+Parse and extract structured facts from retailer-specific raw data to create a standardized item format for that retailer.  Strictly dependent on Collect method and output.
+ - Extract quantity, size, pack, pricing, variant
+ - Attach discount line items to product line items using upc/retailer_item_id and co-occurrence within the same order
+ - Cleanup naming to facilitate later matching
+ - (1) collected items from each retailer
+ - (2) collected visits from each retailer
+ - [1] normalized items from each retailer

-Primary key:
+** 3. Review/Combine (Canonicalization)
+Decide whether two normalized retailer items are "the same product"; match items across retailers using algo/logic and human review.  Create catalog linked to normalized items.
+ - Grouping the same item from retailer
+ - Asking human to create a canonical/catalog item with:
+   - friendly/canonical_name: "bell pepper"; "milk"
+   - category: "produce"; "dairy"
+   - product_type: "pepper"; "milk"
+   - ? variant? "whole", "skim", "2pct"
+ - (1) normalized items from each retailer
+ - [1] review queue of items to be reviewed
+ - [2] catalog (lookup table) of confirmed retailer_item and canonical_name
+ - [3] canonical purchase list, pivot-ready
+   
+** Unresolved Issues
+1. need central script to orchestrate; metadata belongs there and nowhere else

- (`retailer`, `order_id`)
+** Symptoms
+- `LIME` and `LIME . / .` appearing in canonical_catalog:
+  - names must come from review-approved names, not raw strings

-** schema: `data/<retailer>/items_raw.csv`

+* Directory Layout
+Use one top-level data root:
+#+begin_example
+main.py
+collect_<retailer>_<method>.py
+normalize_<retailer>_<method>.py
+review.py
+data/
+  <retailer-method>/
+    raw/  # unmodified retailer payloads exactly as fetched
+      <order_id>.json
+    collected_items.csv # one row per retailer line item w/ retailer-native values
+    collected_orders.csv # one row per receipt/visit, flattened from raw order data
+    normalized_items.csv # parsed retailer-specific line items with normalized fields
+  costco-web/ # sample
+    raw/
+      orders/
+        history.json
+        <order_id>.json
+    collected_items.csv
+    collected_orders.csv
+    normalized_items.csv
+  review/
+    review_queue.csv #  Human review queue for unresolved matching/parsing cases.
+    product_links.csv # Links from retailer-observed products to canonical products.
+  catalog.csv  # Cross-retailer canonical product entities used for comparison.
+  purchases.csv
+#+end_example
+
+* Schemas
+** `data/<retailer-method>/collected_items.csv`
 One row per retailer line item.
+| key                | definition                                 |
+|--------------------+--------------------------------------------|
+| `retailer` PK      | retailer slug                              |
+| `order_id` PK      | retailer order id                          |
+| `line_no`  PK      | stable line number within order export     |
+| `order_date`       | copied from order when available           |
+| `retailer_item_id` | retailer-native item id when available     |
+| `pod_id`           | retailer pod/item id                       |
+| `item_name`        | raw retailer item name                     |
+| `upc`              | retailer UPC or PLU value                  |
+| `category_id`      | retailer category id                       |
+| `category`         | retailer category description              |
+| `qty`              | retailer quantity field                    |
+| `unit`             | retailer unit code such as `EA` or `LB`    |
+| `unit_price`       | retailer unit price field                  |
+| `line_total`       | retailer extended price field              |
+| `picked_weight`    | retailer picked weight field               |
+| `mvp_savings`      | retailer savings field                     |
+| `reward_savings`   | retailer rewards savings field             |
+| `coupon_savings`   | retailer coupon savings field              |
+| `coupon_price`     | retailer coupon price field                |
+| `image_url`        | raw retailer image url when present        |
+| `raw_order_path`   | relative path to source order payload      |
+| `is_discount_line` | retailer adjustment or discount-line flag  |
+| `is_coupon_line`   | coupon-like line flag when distinguishable |

-| column           | meaning                                 |
-|------------------+-----------------------------------------|
-| `retailer`       | retailer slug                           |
-| `order_id`       | retailer order id                       |
-| `line_no`        | stable line number within order export  |
-| `order_date`     | copied from order when available        |
-| `retailer_item_id` | retailer-native item id when available |
-| `pod_id`         | retailer pod/item id                    |
-| `item_name`      | raw retailer item name                  |
-| `upc`            | retailer UPC or PLU value               |
-| `category_id`    | retailer category id                    |
-| `category`       | retailer category description           |
-| `qty`            | retailer quantity field                 |
-| `unit`           | retailer unit code such as `EA` or `LB` |
-| `unit_price`     | retailer unit price field               |
-| `line_total`     | retailer extended price field           |
-| `picked_weight`  | retailer picked weight field            |
-| `mvp_savings`    | retailer savings field                  |
-| `reward_savings` | retailer rewards savings field          |
-| `coupon_savings` | retailer coupon savings field           |
-| `coupon_price`   | retailer coupon price field             |
-| `image_url`      | raw retailer image url when present     |
-| `raw_order_path` | relative path to source order payload   |
-| `is_discount_line` | retailer adjustment or discount-line flag |
-| `is_coupon_line` | coupon-like line flag when distinguishable |
+** `data/<retailer-method>/collected_orders.csv`
+One row per order or visit.
+| key                       | definition                                      |
+|---------------------------+-------------------------------------------------|
+| `retailer` PK             | retailer slug such as `giant`                   |
+| `order_id` PK             | retailer order or visit id                      |
+| `order_date`              | order date in `YYYY-MM-DD` when available       |
+| `delivery_date`           | fulfillment date in `YYYY-MM-DD` when available |
+| `service_type`            | retailer service type such as `INSTORE`         |
+| `order_total`             | order total as provided by retailer             |
+| `payment_method`          | retailer payment label                          |
+| `total_item_count`        | total line count or item count from retailer    |
+| `total_savings`           | total savings as provided by retailer           |
+| `your_savings_total`      | savings field from retailer when present        |
+| `coupons_discounts_total` | coupon/discount total from retailer             |
+| `store_name`              | retailer store name                             |
+| `store_number`            | retailer store number                           |
+| `store_address1`          | street address                                  |
+| `store_city`              | city                                            |
+| `store_state`             | state or province                               |
+| `store_zipcode`           | postal code                                     |
+| `refund_order`            | retailer refund flag                            |
+| `ebt_order`               | retailer EBT flag                               |
+| `raw_history_path`        | relative path to source history payload         |
+| `raw_order_path`          | relative path to source order payload           |

-Primary key:
+** `data/<retailer-method>/normalized_items.csv`
+One row per retailer line item after deterministic parsing. Preserve raw
+fields from `collected_items.csv` and add parsed fields plus retailer-level
+identity needed before cross-retailer review.

- (`retailer`, `order_id`, `line_no`)
+| key                        | definition                                                       |
+|----------------------------+------------------------------------------------------------------|
+| `retailer` PK              | retailer slug                                                    |
+| `order_id` PK              | retailer order id                                                |
+| `line_no` PK               | line number within order                                         |
+| `normalized_row_id`        | stable row key, typically `<retailer>:<order_id>:<line_no>`      |
+| `normalized_item_id`       | stable retailer-level item identity after deterministic grouping |
+| `normalization_basis`      | basis used to assign `normalized_item_id`                        |
+| `retailer_item_id`         | retailer-native item id                                          |
+| `item_name`                | raw retailer item name                                           |
+| `item_name_norm`           | normalized retailer item name                                    |
+| `brand_guess`              | parsed brand guess                                               |
+| `variant`                  | parsed variant text                                              |
+| `size_value`               | parsed numeric size value                                        |
+| `size_unit`                | parsed size unit such as `oz`, `lb`, `fl_oz`                     |
+| `pack_qty`                 | parsed pack or count guess                                       |
+| `measure_type`             | `each`, `weight`, `volume`, `count`, or blank                    |
+| `normalized_quantity`      | numeric comparison basis derived during normalization            |
+| `normalized_quantity_unit` | basis unit such as `oz`, `lb`, `count`, or blank                 |
+| `is_store_brand`           | store-brand guess                                                |
+| `is_fee`                   | fee or non-product flag                                          |
+| `is_discount_line`         | discount or adjustment-line flag                                 |
+| `is_coupon_line`           | coupon-like line flag                                            |
+| `matched_discount_amount`  | matched discount value carried onto purchased row when supported |
+| `net_line_total`           | line total after matched discount when supported                 |
+| `price_per_each`           | derived per-each price when supported                            |
+| `price_per_each_basis`     | source basis for `price_per_each`                                |
+| `price_per_count`          | derived per-count price when supported                           |
+| `price_per_count_basis`    | source basis for `price_per_count`                               |
+| `price_per_lb`             | derived per-pound price when supported                           |
+| `price_per_lb_basis`       | source basis for `price_per_lb`                                  |
+| `price_per_oz`             | derived per-ounce price when supported                           |
+| `price_per_oz_basis`       | source basis for `price_per_oz`                                  |
+| `image_url`                | best available retailer image url                                |
+| `raw_order_path`           | relative path to source order payload                            |
+| `parse_version`            | parser version string for reruns                                 |
+| `parse_notes`              | optional non-fatal parser notes                                  |

-** schema: `data/<retailer>/items_enriched.csv`
-
-One row per retailer line item after deterministic parsing. Preserve the raw
-fields from `items_raw.csv` and add parsed fields.
-
-| column              | meaning                                                     |
-|---------------------+-------------------------------------------------------------|
-| `retailer`          | retailer slug                                               |
-| `order_id`          | retailer order id                                           |
-| `line_no`           | line number within order                                    |
-| `observed_item_key` | stable row key, typically `<retailer>:<order_id>:<line_no>` |
-| `retailer_item_id`  | retailer-native item id                                     |
-| `item_name`         | raw retailer item name                                      |
-| `item_name_norm`    | normalized item name                                        |
-| `brand_guess`       | parsed brand guess                                          |
-| `variant`           | parsed variant text                                         |
-| `size_value`        | parsed numeric size value                                   |
-| `size_unit`         | parsed size unit such as `oz`, `lb`, `fl_oz`                |
-| `pack_qty`          | parsed pack or count guess                                  |
-| `measure_type`      | `each`, `weight`, `volume`, `count`, or blank               |
-| `is_store_brand`    | store-brand guess                                           |
-| `is_fee`            | fee or non-product flag                                     |
-| `is_discount_line`  | discount or adjustment-line flag                            |
-| `is_coupon_line`    | coupon-like line flag                                       |
-| `price_per_each`    | derived per-each price when supported                       |
-| `price_per_lb`      | derived per-pound price when supported                      |
-| `price_per_oz`      | derived per-ounce price when supported                      |
-| `image_url`         | best available retailer image url                           |
-| `parse_version`     | parser version string for reruns                            |
-| `parse_notes`       | optional non-fatal parser notes                             |
-
-Primary key:
-
- (`retailer`, `order_id`, `line_no`)
-
-** schema: `data/<retailer>/products_observed.csv`
-
-One row per distinct retailer-facing observed product.
-
-| column                        | meaning                                                        |
-|-------------------------------+----------------------------------------------------------------|
-| `observed_product_id`         | stable observed product id                                     |
-| `retailer`                    | retailer slug                                                  |
-| `observed_key`                | deterministic grouping key used to create the observed product |
-| `representative_retailer_item_id` | best representative retailer-native item id               |
-| `representative_upc`          | best representative UPC/PLU                                    |
-| `representative_item_name`    | representative raw retailer name                               |
-| `representative_name_norm`    | representative normalized name                                 |
-| `representative_brand`        | representative brand guess                                     |
-| `representative_variant`      | representative variant                                         |
-| `representative_size_value`   | representative size value                                      |
-| `representative_size_unit`    | representative size unit                                       |
-| `representative_pack_qty`     | representative pack/count                                      |
-| `representative_measure_type` | representative measure type                                    |
-| `representative_image_url`    | representative image url                                       |
-| `is_store_brand`              | representative store-brand flag                                |
-| `is_fee`                      | representative fee flag                                        |
-| `is_discount_line`            | representative discount-line flag                              |
-| `is_coupon_line`              | representative coupon-line flag                                |
-| `first_seen_date`             | first order date seen                                          |
-| `last_seen_date`              | last order date seen                                           |
-| `times_seen`                  | number of enriched item rows grouped here                      |
-| `example_order_id`            | one example retailer order id                                  |
-| `example_item_name`           | one example raw item name                                      |
-| `distinct_retailer_item_ids_count` | count of distinct retailer-native item ids               |
-
-Primary key:
-
- (`observed_product_id`)
-
-** schema: `data/shared/products_canonical.csv`
-
-One row per cross-retailer canonical product.
-
-| column                     | meaning                                          |
-|----------------------------+--------------------------------------------------|
-| `canonical_product_id`     | stable canonical product id                      |
-| `canonical_name`           | canonical human-readable name                    |
-| `product_type`             | broad class such as `apple`, `milk`, `trash_bag` |
-| `brand`                    | canonical brand when applicable                  |
-| `variant`                  | canonical variant                                |
-| `size_value`               | normalized size value                            |
-| `size_unit`                | normalized size unit                             |
-| `pack_qty`                 | normalized pack/count                            |
-| `measure_type`             | normalized measure type                          |
-| `normalized_quantity`      | numeric comparison basis value                   |
-| `normalized_quantity_unit` | basis unit such as `oz`, `lb`, `count`           |
-| `notes`                    | optional human notes                             |
-| `created_at`               | creation timestamp or date                       |
-| `updated_at`               | last update timestamp or date                    |
-
-Primary key:
-
- (`canonical_product_id`)
-
-** schema: `data/shared/product_links.csv`
+Notes:
+- `normalized_item_id` replaces the need for a core `observed_products.csv` layer.
+- `normalization_basis` should be explicit values like `exact_upc`, `retailer_item_id`, `name_size_pack`, or `manual_retailer_alias`.
+- Cross-retailer identity is still handled later in review/combine via `catalog.csv` and `product_links.csv`.

+** `data/review/product_links.csv`
 One row per observed-to-canonical relationship.
+1 (catalog_item) to many (normalized_items)

-| column | meaning |
-|-
-| `observed_product_id` | retailer observed product id |
-| `canonical_product_id` | linked canonical product id |
-| `link_method` | `manual`, `exact_upc`, `exact_name`, etc. |
-| `link_confidence` | optional confidence label |
-| `review_status` | `pending`, `approved`, `rejected`, or blank |
-| `reviewed_by` | reviewer id or initials |
-| `reviewed_at` | review timestamp or date |
-| `link_notes` | optional notes |
-
-Primary key:
-
- (`observed_product_id`, `canonical_product_id`)
-
-** schema: `data/shared/review_queue.csv`
+| key               | definition                                  |
+|-------------------+---------------------------------------------|
+| `observed_id` PK  | retailer observed product id                |
+| `catalog_id` PK   | linked canonical product id                 |
+| `link_method`     | `manual`, `exact_upc`, `exact_name`, etc.   |
+| `link_confidence` | optional confidence label                   |
+| `review_status`   | `pending`, `approved`, `rejected`, or blank |
+| `reviewed_by`     | reviewer id or initials                     |
+| `reviewed_at`     | review timestamp or date                    |
+| `link_notes`      | optional notes                              |

+** `data/review/review_queue.csv`
 One row per issue needing human review.

-| column | meaning |
-|-
-| `review_id` | stable review row id |
-| `queue_type` | `observed_product`, `link_candidate`, `parse_issue` |
-| `retailer` | retailer slug when applicable |
-| `observed_product_id` | observed product id when applicable |
-| `canonical_product_id` | candidate canonical id when applicable |
-| `reason_code` | machine-readable review reason |
-| `priority` | optional priority label |
-| `raw_item_names` | compact list of example raw names |
-| `normalized_names` | compact list of example normalized names |
-| `upc` | example UPC/PLU |
-| `image_url` | example image url |
-| `example_prices` | compact list of example prices |
-| `seen_count` | count of related rows |
-| `status` | `pending`, `approved`, `rejected`, `deferred` |
-| `resolution_notes` | reviewer notes |
-| `created_at` | creation timestamp or date |
-| `updated_at` | last update timestamp or date |
+| key                   | definition                                          |
+|-----------------------+-----------------------------------------------------|
+| `review_id` PK        | stable review row id                                |
+| `queue_type`          | `observed_product`, `link_candidate`, `parse_issue` |
+| `retailer`            | retailer slug when applicable                       |
+| `observed_product_id` | observed product id when applicable                 |
+| `catalog_id`          | candidate canonical id when applicable              |
+| `reason_code`         | machine-readable review reason                      |
+| `priority`            | optional priority label                             |
+| `raw_item_names`      | compact list of example raw names                   |
+| `normalized_names`    | compact list of example normalized names            |
+| `upc`                 | example UPC/PLU                                     |
+| `image_url`           | example image url                                   |
+| `example_prices`      | compact list of example prices                      |
+| `seen_count`          | count of related rows                               |
+| `status`              | `pending`, `approved`, `rejected`, `deferred`       |
+| `resolution_notes`    | reviewer notes                                      |
+| `created_at`          | creation timestamp or date                          |
+| `updated_at`          | last update timestamp or date                       |
+** `data/catalog.csv`
+One row per cross-retailer canonical product.
+| key                        | definition                             |
+|----------------------------+----------------------------------------|
+| `catalog_id` PK            | stable canonical product id            |
+| `catalog_name`             | canonical human-readable name          |
+| `product_type`             | generic product eg `apple`, `milk`     |
+| `category`                 | broad section eg `produce`, `dairy`    |
+| `brand`                    | canonical brand when applicable        |
+| `variant`                  | canonical variant                      |
+| `size_value`               | normalized size value                  |
+| `size_unit`                | normalized size unit                   |
+| `pack_qty`                 | normalized pack/count                  |
+| `measure_type`             | normalized measure type                |
+| `normalized_quantity`      | numeric comparison basis value         |
+| `normalized_quantity_unit` | basis unit such as `oz`, `lb`, `count` |
+| `notes`                    | optional human notes                   |
+| `created_at`               | creation timestamp or date             |
+| `updated_at`               | last update timestamp or date          |

-Primary key:
+** `data/purchases.csv`
+One row per purchased item (i.e., `row_type=item` from normalized layer), with
+catalog attributes denormalized in and discounts already applied.

- (`review_id`)
+| key                        | definition                                                     |
+|----------------------------+----------------------------------------------------------------|
+| `purchase_date`            | date of purchase (from order)                                  |
+| `retailer`                 | retailer slug                                                  |
+| `order_id`                 | retailer order id                                              |
+| `line_no`                  | line number within order                                       |
+| `normalized_row_id`        | `<retailer>:<order_id>:<line_no>`                              |
+| `normalized_item_id`       | retailer-level normalized item identity                        |
+| `catalog_id`               | linked canonical product id                                    |
+| `catalog_name`             | canonical product name for analysis                            |
+| `catalog_product_type`     | broader product family (e.g., `egg`, `milk`)                   |
+| `catalog_category`         | category such as `produce`, `dairy`                            |
+| `catalog_brand`            | canonical brand when applicable                                |
+| `catalog_variant`          | canonical variant when applicable                              |
+| `raw_item_name`            | original retailer item name                                    |
+| `normalized_item_name`     | cleaned/normalized retailer item name                          |
+| `retailer_item_id`         | retailer-native item id                                        |
+| `upc`                      | UPC/PLU when available                                         |
+| `qty`                      | retailer quantity field                                        |
+| `unit`                     | retailer unit (e.g., `EA`, `LB`)                               |
+| `pack_qty`                 | parsed pack/count                                              |
+| `size_value`               | parsed size value                                              |
+| `size_unit`                | parsed size unit                                               |
+| `measure_type`             | `each`, `weight`, `volume`, `count`                            |
+| `normalized_quantity`      | normalized comparison quantity                                 |
+| `normalized_quantity_unit` | unit for normalized quantity                                   |
+| `unit_price`               | retailer unit price                                            |
+| `line_total`               | original retailer extended price (pre-discount)                |
+| `matched_discount_amount`  | discount amount matched from discount lines                    |
+| `net_line_total`           | effective price after discount (`line_total` + discounts)      |
+| `store_name`               | retailer store name                                            |
+| `store_city`               | store city                                                     |
+| `store_state`              | store state                                                    |
+| `price_per_each`           | derived per-each price                                         |
+| `price_per_each_basis`     | source basis for per-each calc                                 |
+| `price_per_count`          | derived per-count price                                        |
+| `price_per_count_basis`    | source basis for per-count calc                                |
+| `price_per_lb`             | derived per-pound price                                        |
+| `price_per_lb_basis`       | source basis for per-pound calc                                |
+| `price_per_oz`             | derived per-ounce price                                        |
+| `price_per_oz_basis`       | source basis for per-ounce calc                                |
+| `is_fee`                   | true if row represents non-product fee                         |
+| `raw_order_path`           | relative path to original order payload                        |

-** current giant mapping
+Notes:
+- Only rows with `row_type=item` from normalization should appear here.
+- `line_total` preserves retailer truth; `net_line_total` is what you actually paid.
+- catalog fields are denormalized in to make pivoting trivial.
+- no discount/coupon rows exist here; their effects are carried via `matched_discount_amount`.

-Current scraper outputs map to the new layout as follows:
-
- `giant_output/raw/history.json` -> `data/giant/raw/history.json`
- `giant_output/raw/<order_id>.json` -> `data/giant/raw/orders/<order_id>.json`
- `giant_output/orders.csv` -> `data/giant/orders.csv`
- `giant_output/items.csv` -> `data/giant/items_raw.csv`
-
-Current Giant raw order payloads already expose fields needed for future
-enrichment, including `image`, `itemName`, `primUpcCd`, `lbEachCd`,
-`unitPrice`, `groceryAmount`, and `totalPickedWeight`.
+* /
--- a/pm/scrape-giant.org
+++ b/pm/scrape-giant.org
@@ -70,7 +70,13 @@ b l : switch to local branch (cx)
 l l : open local reflog
 put point on the commit; highlighted remote gitea/cx
 X   : reset branch; prompts you, selected cx
-       
+
+
+
+** merge branch
+b b : switch to branch to be merged into (cx)
+m m : pick branch to merge into current branch
+
 * giant requests
 ** item:
 get:
@@ -250,18 +256,247 @@ python build_observed_products.py
 python build_review_queue.py
 python build_canonical_layer.py
 python validate_cross_retailer_flow.py
-* t1.11 tasks [2026-03-17 Tue 13:49]
+* t1.13 tasks [2026-03-17 Tue 13:49]
 ok i ran a few. time to run some cleanups here - i'm wondering if we shouldn't be less aggressive with canonical names and encourage a better manual process to start. 
-1. auto-created canonical_names lack category, product_type - ok with filling these in manually in the catalog once the queue is empty
-2. canonical_names feel too specific, e.g., "5DZ egg"
-3. some canonical_names need consolidation, eg "LIME" and "LIME  . / ." ; poss cleanup issue. there are 5 entries for ergg but but they are all regular large grade A white eggs, just different amounts in dozens.
+** TODO fill in auto-created canonical category, product-type
+auto-created canonical_names lack category, product_type - ok with filling these in manually in the catalog once the queue is empty
+
+** TODO consolidation cleanup
+1. canonical_names feel too specific, e.g., "5DZ egg" - probably a problem with the enrich_* steps not adding appropriate normalizing data /and/ removing from observed product title?
+2. some canonical_names need consolidation, eg "LIME" and "LIME  . / ." ; poss cleanup issue. there are 5 entries for egg but they are all regular large grade A white eggs, just different amounts in dozens.
  Eggs are actually a great candidate for the kind of analysis we want to do - the pipeline should have caught and properly sorted these into size/qty:
+  #+begin_example
  ```canonical_product_id	canonical_name	category	product_type	brand	variant	size_value	size_unit	pack_qty	measure_type	notes	created_at	updated_at
  gcan_0e350505fd22	5DZ EGG / /			KS					each	auto-linked via exact_name		
  gcan_47279a80f5f3	EGG 5 DOZ. BBS								each	auto-linked via exact_name		
  gcan_7d099130c1bf	LRG WHITE EGG			SB				30	count	auto-linked via exact_upc		
  gcan_849c2817e667	GDA LRG WHITE EGG			SB				18	count	auto-linked via exact_upc		
  gcan_cb0c6c8cf480	LG EGG CONVENTIONAL					18	count		count	auto-linked via exact_name_size		  ```
-4. Build costco mechanism for matching discount to line item.
+  #+end_example
+** TODO costco discount matching
+Build costco mechanism for matching discount to line item.
   1. Discounts appear as their own line items with a number like /123456, this matches the UPC of the discounted item
   2. must be date-matched to the UPC
+
+Data model might be missing shape:
+1. match discount rows like `item_name:/2303476` to `retailer_item_id:2303476`
+2. display this value on the item somehow? maybe update line_total? otherwise we lose fidelity. should be stored in items_enriched somehow
+#+begin_example
+```retailer	order_id	line_no	observed_item_key	order_date	retailer_item_id	pod_id	item_name	upc	category_id	category	qty	unit	unit_price	line_total	picked_weight	mvp_savings	reward_savings	coupon_savings	coupon_price	image_url	raw_order_path	item_name_norm	brand_guess	variant	size_value	size_unit	pack_qty	measure_type	is_store_brand	is_fee	is_discount_line	is_coupon_line	price_per_each	price_per_lb	price_per_oz	parse_version	parse_notes
+costco	2.11115E+22	3	costco:21111520101942404241753:3	4/24/2024	2303476		KA 6QT MIXER P16 KSM60SECXER/CU FY23		33	33	1	None	399.99	399.99							costco_output/raw/21111520101942404241753-2024-04-24T17-53-00.json	KA 6QT MIXER KSM60SECXER/CU						each	FALSE	FALSE	FALSE	FALSE	399.99			costco-enrich-v1	
+costco	2.11115E+22	4	costco:21111520101942404241753:4	4/24/2024	325173		/2303476		33	33	-1	None	0	-100				-100			costco_output/raw/21111520101942404241753-2024-04-24T17-53-00.json	/2303476						each	FALSE	FALSE	TRUE	TRUE	100			costco-enrich-v1	```
+#+end_example
+** TODO giant discount matching
+
+* prompt
+ do not add new abstractions unless they remove real duplication. prefer explicit retailer-specific logic over generic heuristics. do not auto-create new canonical products from weak normalized names.
+ and propose the smallest set of edits needed.
+* 1.13 fixes
+** 15x Costco discounts not caught
+- 15x, some with slash-space: `/ 1768123` and some without: `/2303476`
+** canonical names suck - tempted to force manual config from scratch?
+- maybe first-pass should be naming groups, starting with largest groups and going on down.
+- unfortunately not seeing many cross-retailer items? looks like costco-only; just taking Giant as gospel
+- could be as simple as changing canonical name in canonical_catalog.csv  
+- tough to figure out where the data is, leading to below:  
+** need to refactor whole flow and where data is stored
+group by browser or by site, or both? currently mixed. 
+1. Scrape
+   - Script:
+   - Output: /output/raw/orderN.json, history.json, orders.csv, history.csv
+2. Enrich
+   - Scripts:
+   - Output: /output/enrich/items.json
+3. Combined - /output/?
+   - Review step?
+
+** proposed fixes
+* 1.14 prep - OBE
+** [ ] t1.14.1 define and document the filesystem/data-layer layout (2-3 commits)
+make stage ownership and retailer ownership explicit so every artifact has one obvious home
+
+** AC
+1. define and document the canonical directory layout for the pipeline, separating retailer-specific artifacts from shared combined artifacts
+2. adopt an explicit layout of the form:
+   - `data/<retailer>/raw/`
+   - `data/<retailer>/orders.csv`
+   - `data/<retailer>/items.csv`
+   - `data/<retailer>/items_enriched.csv`
+   - `data/combined/products_observed.csv`
+   - `data/combined/review_queue.csv`
+   - `data/combined/item_aliases.csv`
+   - `data/combined/canonical_catalog.csv`
+   - `data/combined/product_links.csv`
+   - `data/combined/purchases.csv`
+   - `data/combined/pipeline_status.csv`
+   - `data/combined/pipeline_status.json`
+3. update docs/readme and pipeline docs so each script’s inputs and outputs point to the new layout
+4. remove or deprecate ambiguous stage outputs living under a retailer-specific output directory when they are actually shared artifacts
+- pm note: goal is “where does this file live?” should have one answer, not three
+
+** evidence
+- commit:
+- tests:
+- date:
+
+** notes
+
+** [ ] t1.14.2 define the row-level data model for raw, enriched, observed, canonical, and purchases layers (2-4 commits)
+lock the item model before further refactors so each stage has a clear grain and purpose
+
+** AC
+1. document the row grain for each layer:
+   - raw item row = one receipt line from one retailer order
+   - enriched item row = one retailer line with retailer-specific parsed fields
+   - observed product row = one grouped retailer-facing product concept
+   - canonical catalog row = one review-controlled product identity
+   - purchase row = one final pivot-ready purchased item line
+2. define the required fields for each layer, including stable ids and provenance fields
+3. explicitly document which fields are allowed to be blank at each layer (e.g. `upc`, `canonical_item_id`, category)
+4. document the relationship between:
+   - `raw_item_name`
+   - `normalized_item_name`
+   - `observed_product_id`
+   - `canonical_item_id`
+5. document how retailer-native ids (e.g. Costco `retailer_item_id`) fit into the shared model without being forced into `upc`
+- pm note: this is the schema contract task; code should follow it, not invent it ad hoc
+
+** evidence
+- commit:
+- tests:
+- date:
+
+** notes
+** [ ] t1.14.3 refactor pipeline outputs to the new layout without changing semantics (2-4 commits)
+move files and script defaults to the new structure while preserving current behavior
+
+** AC
+1. update scraper and enrich scripts to write retailer-specific outputs under `data/<retailer>/...`
+2. update combined/shared scripts to read from retailer-specific enriched outputs and write to `data/combined/...`
+3. preserve current content/meaning of outputs during the move; this is a location/structure refactor, not a behavior rewrite
+4. update tests, docs, and script defaults to use the new paths
+- pm note: do not mix data-layout cleanup with canonical/review logic changes in this task
+
+** evidence
+- commit:
+- tests:
+- date:
+
+** notes
+** [ ] t1.14.4 make the review and catalog layer explicit and authoritative (2-4 commits)
+treat review and canonical resolution as first-class data, not incidental byproducts
+
+** AC
+1. define `review_queue.csv`, `item_aliases.csv`, and `canonical_catalog.csv` as the authoritative review/catalog files in `data/combined/`
+2. document the intended purpose of each:
+   - `review_queue.csv` = unresolved observed items needing action
+   - `item_aliases.csv` = approved mapping from observed/normalized names to canonical ids
+   - `canonical_catalog.csv` = review-controlled canonical product definitions and display names
+3. ensure final purchase generation reads from these files as the source of truth for resolution
+4. stop relying on weak implicit canonical creation as a substitute for the explicit review/catalog layer
+- pm note: this is the control-plane task; observed products may be automatic, canonical products are review-controlled
+
+** evidence
+- commit:
+- tests:
+- date:
+
+** notes
+** [ ] t1.14.5 define and document the final pivot-ready purchases output (2-3 commits)
+make the final analysis artifact explicit so excel/pivot/chart use is a first-class target
+
+** AC
+1. define `data/combined/purchases.csv` as the final normalized purchase log
+2. ensure each purchase row retains:
+   - purchase date
+   - retailer
+   - order id
+   - raw item name
+   - normalized item name
+   - canonical item id when resolved
+   - quantity and unit
+   - original line total
+   - discount-adjusted fields when applicable
+   - store/location fields where available
+3. document that `purchases.csv` is the primary excel/pivot input and that earlier files are staging layers
+4. document expected pivot uses such as purchase frequency and cost over time by canonical item
+- pm note: this task is about making the final artifact explicit and stable, not about adding new metrics
+
+** evidence
+- commit:
+- tests:
+- date:
+
+** notes
+
+* pipeline prep [2026-03-17 Tue]
+
+data saved to /data
+1. "scrape_<retailer>" gathers data from a retailer and outputs:
+   1. raw list of items per visit          ./<retailer>/scraped/raw/order-<uid>.json
+   2. raw list of visits                   ./<retailer>/scraped_visits.csv
+   3. raw list of items from all visits    ./<retailer>/scraped_items.csv
+2. "enrich <retailer>" takes /scraped/ data and outputs:
+   1. normalized list of items             ./<retailer>/enriched_items.csv
+3. "combine" takes retailer 
+ input:
+   1. all enriched items                   ./<retailer>/enriched_items.csv
+   2. all retailer visits                  ./<retailer>/scraped_visits.csv
+ outputs:
+   1. observed product groups              ./combined/observed/products_observed.csv
+   2. unresolved products for review       ./combined/review/review_queue.csv
+   3. pipeline accounting/status           ./combined/status/pipeline_status.csv
+   4. pipeline accounting/status           ./combined/status/pipeline_status.json
+4. review resolves unknown or weakly identified products and maintains:
+   1. canonical product catalog            ./combined/review/canonical_catalog.csv
+   2. approved alias mappings              ./combined/review/item_aliases.csv
+   3. optional observed→canonical links    ./combined/review/product_links.csv
+5. build purchases takes combined observed data plus review/catalog data and outputs:
+   [1]. final normalized purchase log        ./combined/purchases/purchases.csv
+
+lets get this pipeline right before more refactoring.
+
+* Pipeline - moved to data-model.org [2026-03-18 Wed]
+Key: 
+- (1) input
+- [2] output
+
+Each step can be run alone if its dependencies exist.
+
+** 1. Collect
+Get raw receipt/visit and item data from a retailer.  Scraping is unique to a Retailer and method (e.g., Giant-Web and Giant-Scan).  Preserve complete raw data and preserve fidelity.  Avoid interpretation beyond basic data flattening.
+ - (1) Source access (Varies, eg header data, auth for API access)
+ - [1] collected visits from each retailer
+ - [2] collected items from each retailer
+ - [3] any other raw data that supports [1] and [2]; explicit source (eventual receipt scan?)
+   
+** 2. Normalize
+Parse and extract structured facts from retailer-specific raw data to create a standardized item format.  Strictly dependent on Collect method and output.
+ - Extract quantity, size, pack, pricing, variant
+ - Consolidate discount with item using upc/retailer_item_id and concurrence (same order)
+ - Cleanup naming to facilitate later matching
+ - (1) collected items from each retailer
+ - (2) collected visits from each retailer
+ - [1] normalized items from each retailer
+
+** 3. Review/Combine (Canonicalization)
+Decide whether two normalized retailer items are "the same product"; match items across retailers using algo/logic and human review.  Create catalog linked to normalized items.
+ - Grouping the same item from retailer
+ - Asking human to create a canonical/catalog item with:
+   - friendly/canonical_name: "bell pepper"; "milk"
+   - category: "produce"; "dairy"
+   - product_type: "pepper"; "milk"
+   - ? variant? "whole", "skim", "2pct"
+ - (1) normalized items from each retailer
+ - [1] review queue of items to be reviewed
+ - [2] catalog (lookup table) of confirmed retailer_item and canonical_name
+ - [3] canonical purchase list, pivot-ready
+   
+** Unresolved Issues
+2. Create tags: canonical_name (need better label), category, product_type is missing data like Variant, shouldn't this be part of the normalization step?
+3. need central script to orchestrate; metadata belongs here and nowhere else
+
+** Symptoms
+- `LIME` and `LIME . / .` appearing in canonical_catalog:
+  - names must come from review-approved names, not raw strings
+* 
--- a/pm/tasks.org
+++ b/pm/tasks.org
@@ -416,7 +416,61 @@ Clearly show current state separate from proposed future state.
 - Numbered canonical selection plus confirmation worked better than free-text id entry and should reduce accidental links.
 - Deterministic suggestions remain intentionally conservative; they speed up common cases, but unresolved items still depend on human review by design.

-* [ ] t1.10: add optional llm-assisted suggestion workflow for unresolved products (2-4 commits)
+* [X] t1.13.1 pipeline accountability and stage visibility (1-2 commits)
+add simple accounting so we can see what survives or drops at each pipeline stage
+
+** AC
+1. emit counts for raw, enriched, combined/observed, review-queued, canonical-linked, and final purchase-log rows
+2. report unresolved and dropped item counts explicitly
+3. make it easy to verify that missing items were intentionally left in review rather than silently lost
+- pm note: simple text/json/csv summary is sufficient; trust and visibility matter more than presentation
+
+** evidence
+- commit: `967e19e`
+- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python report_pipeline_status.py --help`; `./venv/bin/python report_pipeline_status.py`; verified `combined_output/pipeline_status.csv` and `combined_output/pipeline_status.json`
+- date: 2026-03-17
+
+** notes
+- Added a single explicit status script instead of threading counters through every pipeline step; this keeps the pipeline simple while still making row survival visible.
+- The most useful check here is `unresolved_not_in_review_rows`; when it is non-zero, we know we have a real accounting bug rather than normal unresolved work.
+
+* [X] t1.13.2 costco discount matching and net pricing in enrich_costco (2-3 commits)
+refactor costco enrichment so discount lines are matched to purchased items and net pricing is preserved
+
+** AC
+1. detect costco discount/coupon rows like `/<retailer_item_id>` and match them to purchased items within the same order
+2. preserve raw discount rows for auditability while also carrying matched discount values onto the purchased item row
+3. add explicit fields for discount-adjusted pricing, e.g. `matched_discount_amount` and `net_line_total` (or equivalent)
+4. preserve original raw receipt amounts (`line_total`) without overwriting them
+- pm note: keep this retailer-specific and explicit; do not introduce generic discount heuristics
+
+** evidence
+- commit: `56a03bc`
+- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python enrich_costco.py`; verified matched Costco discount rows now populate `matched_discount_amount` and `net_line_total` while preserving raw `line_total`
+- date: 2026-03-17
+
+** notes
+- Kept this retailer-specific and literal: only discount rows with `/<retailer_item_id>` are matched, and only within the same order.
+- Raw discount rows are still preserved for auditability; the purchased row now carries the matched adjustment separately rather than overwriting the original amount.
+* [X] t1.13.3 canonical cleanup and review-first product identity (3-4 commits)
+refactor canonical generation so product identity is cleaner, duplicate canonicals are reduced, and unresolved items stay in review instead of spawning junk canonicals
+
+** AC
+1. stop auto-creating new canonical products from weak normalized names alone; unresolved items remain in `review_queue.csv`
+2. canonical names are based on stable product identity rather than noisy observed titles
+3. packaging/count/size tokens are removed from canonical names when they belong in structured fields (`pack_qty`, `size_value`, `size_unit`)
+4. consolidate obvious duplicate canonicals (e.g. egg/lime cases) and ensure final outputs retain raw item name, normalized item name, and canonical item id
+- pm note: prefer conservative canonical creation and a better manual review loop over aggressive auto-unification
+
+** evidence
+- commit: `08e2a86`
+- tests: `./venv/bin/python -m unittest discover -s tests`; `./venv/bin/python build_purchases.py`; `./venv/bin/python review_products.py --refresh-only`; verified weaker exact-name cases now remain unresolved in `combined_output/review_queue.csv` and canonical names are cleaned before auto-catalog creation
+- date: 2026-03-17
+
+** notes
+- Removed weak exact-name auto-canonical creation so ambiguous products stay in review instead of generating junk canonicals.
+- Canonical display names are now cleaned of obvious punctuation and packaging noise, but I kept the cleanup conservative rather than adding a broad fuzzy merge layer.
+* [ ] t1.10: add optional llm-assisted suggestion workflow for unresolved products (2-4 commits)

 ** acceptance criteria
 - llm suggestions are generated only for unresolved observed products
--- a/report_pipeline_status.py
+++ b/report_pipeline_status.py
@@ -0,0 +1,119 @@
+import json
+from pathlib import Path
+
+import click
+
+import build_observed_products
+import build_purchases
+import review_products
+from layer_helpers import read_csv_rows, write_csv_rows
+
+
+SUMMARY_FIELDS = ["stage", "count"]
+
+
+def read_rows_if_exists(path):
+    """Return ``read_csv_rows(path)``, or an empty list when ``path`` does not exist."""
+    csv_path = Path(path)
+    return read_csv_rows(csv_path) if csv_path.exists() else []
+
+
+def build_status_summary(
+    giant_orders,
+    giant_items,
+    giant_enriched,
+    costco_orders,
+    costco_items,
+    costco_enriched,
+    purchases,
+    resolutions,
+):
+    """Count the rows present at each pipeline stage.
+
+    Args:
+        giant_orders, giant_items: Giant row lists; only their lengths are used.
+        giant_enriched: Giant enriched rows; concatenated with the Costco
+            enriched rows and fed to observed-product building.
+        costco_orders, costco_items, costco_enriched: Same roles, for Costco.
+        purchases: Purchase rows (dict-like). Flag columns are compared
+            against the string ``"true"``.
+        resolutions: Passed through to ``review_products.build_review_queue``.
+
+    Returns:
+        A list of ``{"stage": ..., "count": ...}`` dicts in a fixed order.
+    """
+    enriched_rows = giant_enriched + costco_enriched
+    observed_rows = build_observed_products.build_observed_products(enriched_rows)
+    queue_rows = review_products.build_review_queue(purchases, resolutions)
+
+    unresolved_purchase_rows = [
+        row
+        for row in purchases
+        if row.get("observed_product_id")
+        and not row.get("canonical_product_id")
+        and row.get("is_fee") != "true"
+        and row.get("is_discount_line") != "true"
+        and row.get("is_coupon_line") != "true"
+    ]
+    excluded_rows = [
+        row
+        for row in purchases
+        if row.get("resolution_action") == "exclude"
+    ]
+    linked_purchase_rows = [row for row in purchases if row.get("canonical_product_id")]
+
+    # Build the lookup set once; constructing it inside the filter below
+    # would rebuild it for every unresolved row.
+    queued_product_ids = {queue_row["observed_product_id"] for queue_row in queue_rows}
+    # Unresolved rows whose observed product never made it into the review
+    # queue, i.e. rows nobody will be prompted to resolve.
+    unresolved_not_in_review_rows = [
+        row
+        for row in unresolved_purchase_rows
+        if row.get("observed_product_id")
+        not in queued_product_ids
+    ]
+
+    summary = [
+        {"stage": "raw_orders", "count": len(giant_orders) + len(costco_orders)},
+        {"stage": "raw_items", "count": len(giant_items) + len(costco_items)},
+        {"stage": "enriched_items", "count": len(enriched_rows)},
+        {"stage": "observed_products", "count": len(observed_rows)},
+        {"stage": "review_queue_observed_products", "count": len(queue_rows)},
+        {"stage": "canonical_linked_purchase_rows", "count": len(linked_purchase_rows)},
+        {"stage": "final_purchase_rows", "count": len(purchases)},
+        {"stage": "unresolved_purchase_rows", "count": len(unresolved_purchase_rows)},
+        {"stage": "excluded_purchase_rows", "count": len(excluded_rows)},
+        {
+            "stage": "unresolved_not_in_review_rows",
+            "count": len(unresolved_not_in_review_rows),
+        },
+    ]
+    return summary
+
+
+@click.command()
+@click.option("--giant-orders-csv", default="giant_output/orders.csv", show_default=True)
+@click.option("--giant-items-csv", default="giant_output/items.csv", show_default=True)
+@click.option("--giant-enriched-csv", default="giant_output/items_enriched.csv", show_default=True)
+@click.option("--costco-orders-csv", default="costco_output/orders.csv", show_default=True)
+@click.option("--costco-items-csv", default="costco_output/items.csv", show_default=True)
+@click.option("--costco-enriched-csv", default="costco_output/items_enriched.csv", show_default=True)
+@click.option("--purchases-csv", default="combined_output/purchases.csv", show_default=True)
+@click.option("--resolutions-csv", default="combined_output/review_resolutions.csv", show_default=True)
+@click.option("--summary-csv", default="combined_output/pipeline_status.csv", show_default=True)
+@click.option("--summary-json", default="combined_output/pipeline_status.json", show_default=True)
+def main(
+    giant_orders_csv,
+    giant_items_csv,
+    giant_enriched_csv,
+    costco_orders_csv,
+    costco_items_csv,
+    costco_enriched_csv,
+    purchases_csv,
+    resolutions_csv,
+    summary_csv,
+    summary_json,
+):
+    # Load every stage input (a missing file counts as zero rows), build the
+    # per-stage summary, then emit it as CSV, JSON, and terminal output.
+    # Listed in the positional order build_status_summary expects.
+    stage_inputs = [
+        giant_orders_csv,
+        giant_items_csv,
+        giant_enriched_csv,
+        costco_orders_csv,
+        costco_items_csv,
+        costco_enriched_csv,
+        purchases_csv,
+        resolutions_csv,
+    ]
+    loaded_rows = [read_rows_if_exists(csv_path) for csv_path in stage_inputs]
+    status_rows = build_status_summary(*loaded_rows)
+
+    write_csv_rows(summary_csv, status_rows, SUMMARY_FIELDS)
+
+    json_target = Path(summary_json)
+    json_target.parent.mkdir(parents=True, exist_ok=True)
+    json_payload = json.dumps(status_rows, indent=2)
+    json_target.write_text(json_payload, encoding="utf-8")
+
+    for row in status_rows:
+        click.echo(f"{row['stage']}: {row['count']}")
+
+
+if __name__ == "__main__":
+    main()
--- a/tests/test_canonical_layer.py
+++ b/tests/test_canonical_layer.py
@@ -4,7 +4,7 @@ import build_canonical_layer


 class CanonicalLayerTests(unittest.TestCase):
-    def test_build_canonical_layer_auto_links_exact_upc_and_name_size(self):
+    def test_build_canonical_layer_auto_links_exact_upc_and_name_size_only(self):
        observed_rows = [
            {
                "observed_product_id": "gobs_1",
@@ -81,6 +81,21 @@ class CanonicalLayerTests(unittest.TestCase):
                "is_discount_line": "false",
                "is_coupon_line": "false",
            },
+            {
+                "observed_product_id": "gobs_6",
+                "representative_upc": "",
+                "representative_retailer_item_id": "",
+                "representative_name_norm": "LIME",
+                "representative_brand": "",
+                "representative_variant": "",
+                "representative_size_value": "",
+                "representative_size_unit": "",
+                "representative_pack_qty": "",
+                "representative_measure_type": "each",
+                "is_fee": "false",
+                "is_discount_line": "false",
+                "is_coupon_line": "false",
+            },
        ]

        canonicals, links = build_canonical_layer.build_canonical_layer(observed_rows)
@@ -93,6 +108,11 @@ class CanonicalLayerTests(unittest.TestCase):
        self.assertEqual("exact_name_size", methods["gobs_3"])
        self.assertEqual("exact_name_size", methods["gobs_4"])
        self.assertNotIn("gobs_5", methods)
+        self.assertNotIn("gobs_6", methods)
+
+    def test_clean_canonical_name_removes_packaging_noise(self):
+        # (noisy observed name, expected cleaned canonical name)
+        cases = [
+            ("LIME  . / .", "LIME"),
+            ("5DZ EGG / /", "EGG"),
+        ]
+        for noisy_name, expected_name in cases:
+            cleaned = build_canonical_layer.clean_canonical_name(noisy_name)
+            self.assertEqual(expected_name, cleaned)


 if __name__ == "__main__":
--- a/tests/test_costco_pipeline.py
+++ b/tests/test_costco_pipeline.py
@@ -279,6 +279,57 @@ class CostcoPipelineTests(unittest.TestCase):
        self.assertEqual("true", discount["is_discount_line"])
        self.assertEqual("true", discount["is_coupon_line"])

+    def test_build_items_enriched_matches_discount_to_item(self):
+        # One receipt holding a purchased item and a "/ <item number>" discount
+        # line that points back at it.
+        purchased_item = {
+            "itemNumber": "4873222",
+            "itemDescription01": "ALL F&C",
+            "itemDescription02": "200OZ 160LOADS P104",
+            "itemDepartmentNumber": 14,
+            "transDepartmentNumber": 14,
+            "unit": 1,
+            "itemIdentifier": "E",
+            "amount": 19.99,
+            "itemUnitPriceAmount": 19.99,
+        }
+        discount_item = {
+            "itemNumber": "374664",
+            "itemDescription01": "/ 4873222",
+            "itemDescription02": None,
+            "itemDepartmentNumber": 14,
+            "transDepartmentNumber": 14,
+            "unit": -1,
+            "itemIdentifier": None,
+            "amount": -5,
+            "itemUnitPriceAmount": 0,
+        }
+        receipt = {
+            "transactionBarcode": "abc",
+            "transactionDate": "2026-03-12",
+            "itemArray": [purchased_item, discount_item],
+        }
+        payload = {"data": {"receiptsWithCounts": {"receipts": [receipt]}}}
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            raw_dir = Path(tmpdir) / "raw"
+            raw_dir.mkdir()
+            receipt_file = raw_dir / "abc.json"
+            receipt_file.write_text(json.dumps(payload), encoding="utf-8")
+
+            rows = enrich_costco.build_items_enriched(raw_dir)
+
+            purchase_row = next(row for row in rows if row["is_discount_line"] == "false")
+            discount_row = next(row for row in rows if row["is_discount_line"] == "true")
+
+            # The purchased row carries the matched adjustment and the net total.
+            self.assertEqual("-5", purchase_row["matched_discount_amount"])
+            self.assertEqual("14.99", purchase_row["net_line_total"])
+            # Both rows record the match in their parse notes.
+            self.assertIn("matched_discount=4873222", purchase_row["parse_notes"])
+            self.assertIn("matched_to_item=4873222", discount_row["parse_notes"])
+
    def test_cross_retailer_validation_writes_proof_example(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            giant_csv = Path(tmpdir) / "giant_items_enriched.csv"
--- a/tests/test_pipeline_status.py
+++ b/tests/test_pipeline_status.py
@@ -0,0 +1,80 @@
+import unittest
+
+import report_pipeline_status
+
+
+class PipelineStatusTests(unittest.TestCase):
+    def test_build_status_summary_reports_unresolved_and_reviewed_counts(self):
+        # One enriched Giant row, plus two purchase rows: one already linked
+        # to a canonical product and one still unresolved.
+        banana_enriched = {
+            "retailer": "giant",
+            "order_id": "g1",
+            "line_no": "1",
+            "item_name_norm": "BANANA",
+            "item_name": "FRESH BANANA",
+            "retailer_item_id": "1",
+            "upc": "4011",
+            "brand_guess": "",
+            "variant": "",
+            "size_value": "",
+            "size_unit": "",
+            "pack_qty": "",
+            "measure_type": "weight",
+            "image_url": "",
+            "is_store_brand": "false",
+            "is_fee": "false",
+            "is_discount_line": "false",
+            "is_coupon_line": "false",
+            "order_date": "2026-03-01",
+            "line_total": "1.29",
+        }
+        linked_purchase = {
+            "observed_product_id": "gobs_banana",
+            "canonical_product_id": "gcan_banana",
+            "resolution_action": "",
+            "is_fee": "false",
+            "is_discount_line": "false",
+            "is_coupon_line": "false",
+            "retailer": "giant",
+            "raw_item_name": "FRESH BANANA",
+            "normalized_item_name": "BANANA",
+            "upc": "4011",
+            "line_total": "1.29",
+        }
+        unresolved_purchase = {
+            "observed_product_id": "gobs_lime",
+            "canonical_product_id": "",
+            "resolution_action": "",
+            "is_fee": "false",
+            "is_discount_line": "false",
+            "is_coupon_line": "false",
+            "retailer": "costco",
+            "raw_item_name": "LIME 5LB",
+            "normalized_item_name": "LIME",
+            "upc": "",
+            "line_total": "4.99",
+        }
+
+        summary = report_pipeline_status.build_status_summary(
+            giant_orders=[{"order_id": "g1"}],
+            giant_items=[{"order_id": "g1", "line_no": "1"}],
+            giant_enriched=[banana_enriched],
+            costco_orders=[],
+            costco_items=[],
+            costco_enriched=[],
+            purchases=[linked_purchase, unresolved_purchase],
+            resolutions=[],
+        )
+
+        counts = {row["stage"]: row["count"] for row in summary}
+        expected_counts = {
+            "raw_orders": 1,
+            "raw_items": 1,
+            "enriched_items": 1,
+            "canonical_linked_purchase_rows": 1,
+            "unresolved_purchase_rows": 1,
+            "review_queue_observed_products": 1,
+            "unresolved_not_in_review_rows": 0,
+        }
+        for stage, expected in expected_counts.items():
+            self.assertEqual(expected, counts[stage])
+
+
+if __name__ == "__main__":
+    unittest.main()
Author	SHA1	Message	Date
ben	10aad05808	data-model refactor and prep scope	2026-03-18 13:08:28 -04:00
ben	9122821db1	Fix t1.13 evidence hashes	2026-03-17 15:08:09 -04:00
ben	7743421918	Record t1.13 task evidence	2026-03-17 15:07:51 -04:00
ben	08e2a86cbd	Make canonical auto-linking more conservative	2026-03-17 15:07:48 -04:00
ben	56a03bcb1d	Attach Costco discounts to purchase rows	2026-03-17 15:07:45 -04:00
ben	967e19e561	Add pipeline status accounting	2026-03-17 15:07:42 -04:00