diff --git a/.gitignore b/.gitignore index 9b52fef..36ce2cb 100644 --- a/.gitignore +++ b/.gitignore @@ -21,7 +21,6 @@ env/ # --- project private data --- /private/ -/pm/ # --- django --- db.sqlite3 diff --git a/pm/scrape-giant.org b/pm/scrape-giant.org new file mode 100644 index 0000000..2c3cf99 --- /dev/null +++ b/pm/scrape-giant.org @@ -0,0 +1,107 @@ +* python setup +venv install playwright, pandas +playwright install +1. scrape - raw giant json +2. enrich - + cols: +item_name_norm +brand_guess +size_value +size_unit +pack_qty +variant +is_store_brand +is_fee +measure_type +price_per_lb +price_per_oz +price_per_each +image_url + +normalize abbreviations +extract size like 12z, 10ct, 5lb +detect fees like bag charges +infer whether something is sold by each vs weight +carry forward image url + +3. build observed-product table from enriched items + + +* item: +get: + /api/v6.0/user/369513017/order/history/detail/69a2e44a16be1142e74ad3cc + +headers: + request: +GET /api/v6.0/user/369513017/order/history/detail/69a2e44a16be1142e74ad3cc?isInStore=true HTTP/2 +Host: giantfood.com +User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:148.0) Gecko/20100101 Firefox/148.0 +Accept: application/json, text/plain, */* +Accept-Language: en-US,en;q=0.9 +Accept-Encoding: gzip, deflate, br, zstd +DNT: 1 +Sec-GPC: 1 +Connection: keep-alive +Referer: https://giantfood.com/account/history/invoice/in-store +Cookie: datadome=rDtvd3J2hO5AeghJMSFRRxGc6ifKCQYgMLcqPNr9rWiz2rdcXb032AY6GIZn8tUmYB96BKKbzh3_jSjEzYWLj8hDjl3oGYYAiu4jwdaxpf3vh2v4f7KH7kbqgsMWpkjt; cf_clearance=WEPyQokx9f0qoyS4Svsw4EkZ1TYOxjOwcUHspT3.rXw-1773348940-1.2.1.1-fPvERGxBlFUaBW83sUppbUWpwvFG7mZivag5vBvZb3kxUQv2WSVIV1tON0HV2n8bkVY0U8_BBl62a00Np.oJylYQcGME540gZlYEoL.gMs4WynLqApFe5BOXAEwOm01_6h6b62H90bl4ypRehVb_TXEi4qHaPLVSZhjZK_h.fv6RBqjgYch2j_8XnHe5HXvLziVjl1k2aJskozqy04KOyeHyc3OyIPTZd5On_KAzFIM; dvrctk=MnjKJVShVraEtbrBkkxWxLaZrXnIGNQlwB7QtZVPFeA=; __cflb=0H28vXMLFyydRmDMNgcPHijM6auXkCspCkuh58tVuJ3; 
__cf_bm=C6QbqiEvbbwdrYBpoJOkcWcedf60vcOfPfTPPbZzKbM-1773348202-1.0.1.1-cSHoYwi8ZjIHTdBItXQP_iXJdRJS6FYjFsGdl1eGHvS5pgfbcT4Lg19P6UStX.bZz1u0OXiS5ykdipPBtwP6OvZr68k4XSmjYpir05jNLhw; _dd_s=rum=0&expire=1773349846445; ppdtk=Uog72CR22mD85C7U4iZHlgOQeRmvHEYp0OdQc+0lEes1c5/LeqGT+ZUlXpSC6FpW; cartId=3820547 +Sec-Fetch-Dest: empty +Sec-Fetch-Mode: cors +Sec-Fetch-Site: same-origin +Priority: u=0 +TE: trailers + + response: +HTTP/2 200 +date: Thu, 12 Mar 2026 20:55:47 GMT +content-type: application/json +server: cloudflare +cf-ray: 9db5b3a5d84aff28-IAD +cf-cache-status: DYNAMIC +content-encoding: gzip +set-cookie: datadome=MXMri0hss6PlQ0_oS7gG2iMdOKnNkbDmGvOxelgN~nCcupgkJQOqjcjcgdprIaI7hSlt_w8E9Ri_RAzPFrGqtUfqAJ_szB_aNZ2FdC26qmI3870Nn4~T0vtx8Gj3dEZR; Max-Age=31536000; Domain=.giantfood.com; Path=/; Secure; SameSite=Lax +strict-transport-security: max-age=31536000; includeSubDomains +vary: Origin, Access-Control-Request-Method, Access-Control-Request-Headers, accept-encoding +accept-ch: Sec-CH-UA,Sec-CH-UA-Mobile,Sec-CH-UA-Platform,Sec-CH-UA-Arch,Sec-CH-UA-Full-Version-List,Sec-CH-UA-Model,Sec-CH-Device-Memory +x-datadome: protected +request-context: appId=cid-v1:75750625-0c81-4f08-9f5d-ce4f73198e54 +X-Firefox-Spdy: h2 + +* history: +GET + https://giantfood.com/api/v6.0/user/369513017/order/history?filter=instore&loyaltyNumber=440155630880 + +headers: + request: +GET /api/v6.0/user/369513017/order/history?filter=instore&loyaltyNumber=440155630880 HTTP/2 +Host: giantfood.com +User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:148.0) Gecko/20100101 Firefox/148.0 +Accept: application/json, text/plain, */* +Accept-Language: en-US,en;q=0.9 +Accept-Encoding: gzip, deflate, br, zstd +DNT: 1 +Sec-GPC: 1 +Connection: keep-alive +Referer: https://giantfood.com/account/history/invoice/in-store +Cookie: datadome=OH2XjtCoI6XjE3Qsz_b0F1YULKLatAC0Ea~VMeDGBP0N9Z~CeI3RqEbvkGmNW_VCOU~vRb6p0kqibvF2tLbWnzyAGIdO7jsC41KiYbp7USpJDnefZhIg0e1ypAugvDSw; 
cf_clearance=WEPyQokx9f0qoyS4Svsw4EkZ1TYOxjOwcUHspT3.rXw-1773348940-1.2.1.1-fPvERGxBlFUaBW83sUppbUWpwvFG7mZivag5vBvZb3kxUQv2WSVIV1tON0HV2n8bkVY0U8_BBl62a00Np.oJylYQcGME540gZlYEoL.gMs4WynLqApFe5BOXAEwOm01_6h6b62H90bl4ypRehVb_TXEi4qHaPLVSZhjZK_h.fv6RBqjgYch2j_8XnHe5HXvLziVjl1k2aJskozqy04KOyeHyc3OyIPTZd5On_KAzFIM; dvrctk=MnjKJVShVraEtbrBkkxWxLaZrXnIGNQlwB7QtZVPFeA=; __cflb=0H28vXMLFyydRmDMNgcPHijM6auXkCspCkuh58tVuJ3; __cf_bm=C6QbqiEvbbwdrYBpoJOkcWcedf60vcOfPfTPPbZzKbM-1773348202-1.0.1.1-cSHoYwi8ZjIHTdBItXQP_iXJdRJS6FYjFsGdl1eGHvS5pgfbcT4Lg19P6UStX.bZz1u0OXiS5ykdipPBtwP6OvZr68k4XSmjYpir05jNLhw; _dd_s=rum=0&expire=1773349842848; ppdtk=Uog72CR22mD85C7U4iZHlgOQeRmvHEYp0OdQc+0lEes1c5/LeqGT+ZUlXpSC6FpW; cartId=3820547 +Sec-Fetch-Dest: empty +Sec-Fetch-Mode: cors +Sec-Fetch-Site: same-origin +Priority: u=0 +TE: trailers + + + response: + HTTP/2 200 +date: Thu, 12 Mar 2026 20:55:43 GMT +content-type: application/json +server: cloudflare +cf-ray: 9db5b38f7eebff28-IAD +cf-cache-status: DYNAMIC +content-encoding: gzip +set-cookie: datadome=rDtvd3J2hO5AeghJMSFRRxGc6ifKCQYgMLcqPNr9rWiz2rdcXb032AY6GIZn8tUmYB96BKKbzh3_jSjEzYWLj8hDjl3oGYYAiu4jwdaxpf3vh2v4f7KH7kbqgsMWpkjt; Max-Age=31536000; Domain=.giantfood.com; Path=/; Secure; SameSite=Lax +strict-transport-security: max-age=31536000; includeSubDomains +vary: Origin, Access-Control-Request-Method, Access-Control-Request-Headers, accept-encoding +accept-ch: Sec-CH-UA,Sec-CH-UA-Mobile,Sec-CH-UA-Platform,Sec-CH-UA-Arch,Sec-CH-UA-Full-Version-List,Sec-CH-UA-Model,Sec-CH-Device-Memory +x-datadome: protected +request-context: appId=cid-v1:75750625-0c81-4f08-9f5d-ce4f73198e54 +X-Firefox-Spdy: h2 diff --git a/pm/tasks.org b/pm/tasks.org new file mode 100644 index 0000000..63723e5 --- /dev/null +++ b/pm/tasks.org @@ -0,0 +1,200 @@ +* [ ] t1.1: harden giant receipt fetch cli (2-4 commits) +** acceptance criteria +- giant scraper runs from cli with prompts or env-backed defaults for `user_id` and `loyalty` +- script reuses current browser 
session via firefox cookies + `curl_cffi` +- script only fetches unseen orders +- script appends to `orders.csv` and `items.csv` without duplicating prior visits +- script prints a note that giant only exposes the most recent 50 visits + +** notes +- keep this giant-specific +- no canonical product logic here +- raw json archive remains source of truth + +** evidence +- commit: +- tests: +- date: + +* [ ] t1.2: define grocery data model and file layout (1-2 commits) +** acceptance criteria +- decide and document the files/directories for: + - retailer raw exports + - enriched line items + - observed products + - canonical products + - product links +- define stable column schemas for each file +- explicitly separate retailer-specific parsing from cross-retailer canonicalization + +** notes +- this is the guardrail task so we don’t make giant-specific hacks the system of record +- keep schema minimal but extensible + +** evidence +- commit: +- tests: +- date: + +* [ ] t1.3: build giant parser/enricher from raw json (2-4 commits) +** acceptance criteria +- parser reads giant raw order json files +- outputs `items_enriched.csv` +- preserves core raw values plus parsed fields such as: + - normalized item name + - image url + - size value/unit guesses + - pack/count guesses + - fee/store-brand flags + - per-unit/per-weight derived price where possible +- parser is deterministic and rerunnable + +** notes +- do not attempt canonical cross-store matching yet +- parser should preserve ambiguity rather than hallucinating precision + +** evidence +- commit: +- tests: +- date: + +* [ ] t1.4: generate observed-product layer from enriched items (2-3 commits) + +** acceptance criteria +- distinct observed products are generated from enriched giant items +- each observed product has a stable `observed_product_id` +- observed products aggregate: + - first seen / last seen + - times seen + - representative upc + - representative image url + - representative normalized name +- 
outputs `products_observed.csv` + +** notes +- observed product is retailer-facing, not yet canonical +- likely key is some combo of retailer + upc + normalized name + +** evidence +- commit: +- tests: +- date: + +* [ ] t1.5: build review queue for unresolved or low-confidence products (1-3 commits) + +** acceptance criteria +- produce a review file containing observed products needing manual review +- include enough context to review quickly: + - raw names + - parsed names + - upc + - image url + - example prices + - seen count +- reviewed status can be stored and reused + +** notes +- this is where human-in-the-loop starts +- optimize for “approve once, remember forever” + +** evidence +- commit: +- tests: +- date: + +* [ ] t1.6: create canonical product layer and observed→canonical links (2-4 commits) + +** acceptance criteria +- define and create `products_canonical.csv` +- define and create `product_links.csv` +- support linking one or more observed products to one canonical product +- canonical product schema supports food-cost comparison fields such as: + - product type + - variant + - size + - measure type + - normalized quantity basis + +** notes +- this is the first cross-retailer abstraction layer +- do not require llm assistance for v1 + +** evidence +- commit: +- tests: +- date: + +* [ ] t1.7: implement auto-link rules for easy matches (2-3 commits) + +** acceptance criteria +- auto-link can match observed products to canonical products using deterministic rules +- rules include at least: + - exact upc + - exact normalized name + - exact size/unit match where available +- low-confidence cases remain unlinked for review + +** notes +- keep the rules conservative +- false positives are worse than unresolved items + +** evidence +- commit: +- tests: +- date: + +* [ ] t1.8: support costco raw ingest path (2-5 commits) + +** acceptance criteria +- add a costco-specific raw ingest/export path +- output costco line items into the same shared raw/enriched 
schema family +- confirm at least one product class can exist as: + - giant observed product + - costco observed product + - one shared canonical product + +** notes +- this is the proof that the architecture generalizes +- don’t chase perfection before the second retailer lands + +** evidence +- commit: +- tests: +- date: + +* [ ] t1.9: compute normalized comparison metrics (2-3 commits) + +** acceptance criteria +- derive normalized comparison fields where possible: + - price per lb + - price per oz + - price per each + - price per count +- metrics are attached at canonical or linked-observed level as appropriate +- emit obvious nulls when basis is unknown rather than inventing values + +** notes +- this is where “gala apples 5 lb bag vs other gala apples” becomes possible +- units discipline matters a lot here + +** evidence +- commit: +- tests: +- date: + +* [ ] t1.10: add optional llm-assisted suggestion workflow for unresolved products (2-4 commits) + +** acceptance criteria +- llm suggestions are generated only for unresolved observed products +- llm outputs are stored as suggestions, not auto-applied truth +- reviewer can approve/edit/reject suggestions +- approved decisions are persisted into canonical/link files + +** notes +- bounded assistant, not autonomous goblin +- image urls may become useful here + +** evidence +- commit: +- tests: +- date: