From cd3543bd0f8e468fb8e290f37085fca277d16949 Mon Sep 17 00:00:00 2001 From: eulaly Date: Tue, 5 May 2026 11:35:19 -0400 Subject: [PATCH] initial commit --- .gitignore | 26 +++++++ README.md | 133 ++++++++++++++++++++++++++++++++++++ agents.md | 40 +++++++++++ docs/tasks.org | 28 ++++++++ docs/vatownhall.org | 53 ++++++++++++++ scraper/__init__.py | 0 scraper/items.py | 12 ++++ scraper/middlewares.py | 100 +++++++++++++++++++++++++++ scraper/pipelines.py | 13 ++++ scraper/settings.py | 87 +++++++++++++++++++++++ scraper/spiders/__init__.py | 4 ++ scrapy.cfg | 11 +++ 12 files changed, 507 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 agents.md create mode 100644 docs/tasks.org create mode 100644 docs/vatownhall.org create mode 100644 scraper/__init__.py create mode 100644 scraper/items.py create mode 100644 scraper/middlewares.py create mode 100644 scraper/pipelines.py create mode 100644 scraper/settings.py create mode 100644 scraper/spiders/__init__.py create mode 100644 scrapy.cfg diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..993262f --- /dev/null +++ b/.gitignore @@ -0,0 +1,26 @@ +# --- python bytecode --- +__pycache__/ +*.py[cod] +*$py.class + +# --- environment files --- +.env +.env.* +*.local +.venv/ +venv/ +env/ + +# --- emacs --- +*~ +\#*\# +.\#* +*.elc + +# --- project private data --- +/private/ +archive/ + + +# --- misc --- +.DS_Store \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..c5b07fa --- /dev/null +++ b/README.md @@ -0,0 +1,133 @@ + +# Table of Contents + +1. [Project Goals](#org863a759) +2. [Architecture](#orgcd91fd0) + 1. [Scraper](#org3256ad3) + 2. [Storage](#org7a9a92c) + 3. [Analysis](#org6ed72dc) +3. [Roadmap](#org416f14d) + + + + + +# Project Goals + +1. Document and analyze sentiment of public comments on Virginia law, to determine: + 1. the utility of this forum as a mechanism for public comment, and + 2. 
the impact of this forum on Virginia regulation. +2. Make data and insights broadly available. +3. Generalize to other public comment tools. + + + + +# Architecture + +1. Scrape/Parse: **Scrapy** for downloading comments +2. Storage: json +3. Sentiment analysis: Claude haiku +4. Display: TBD + + + + +## Scraper + +Scrapy provides a simple mechanism for browsing and parsing the site's three page types: + +1. Forums listing page: `Forums.cfm` - lists all open forums with agency, reg title, action type, brief description, closing date, comment count +2. Comment listing page: `comments.cfm?GDocForumID=X` or `comments.cfm?stageid=X` or `comments.cfm?petitionid=X` - lists comments with title, author, date +3. Individual comment page: `viewcomments.cfm?commentid=X` - shows regulation title + brief description at the top, plus the comment + + + + +## Storage + +One JSONL file per forum/bill. + + + + +## Analysis + +Google and Amazon both return generic sentiment (tone of writing: positive/negative), not stance (for/against the regulation): "I strongly believe the government should NOT interfere" is negative tone but "against" the regulation. We will run the forum/bill title and cache the entirety of the proposed change, perhaps as a fallback. + + + + +++ ++ ++ ++ ++ ++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Tool | Output | Context | Sarcasm | Context window | Cost/1k comments |
|---|---|---|---|---|---|
| Google NL API | -1→+1, magnitude | No/generic | Poorly | No | ~$1–2 |
| Amazon Comprehend | Pos/Neg/Neutral/Mixed | No/generic | Poorly | No | ~$0.10 |
| Claude Haiku | Prompted → for/against/neutral | Yes | Yes, with prompt | Yes | ~$0.10–0.30 |
| GPT-4o-mini | Prompted → same | Yes | Yes | Yes | ~$0.05–0.15 |
+ + + + +# Roadmap + +1. Scrape one forum +2. Compare sentiment models +3. Display +4. Scrape all data +5. Scale? + diff --git a/agents.md b/agents.md new file mode 100644 index 0000000..96531e9 --- /dev/null +++ b/agents.md @@ -0,0 +1,40 @@ +# agent rules + +## priorities +- optimize for simplicity, boringness, and long-term maintainability +- prefer minimal diffs; avoid refactors unless required for the active task + +## tech stack +- python; scrapy +- file storage: json or csv +- assume local virtual env is available and accessible +- do not add new dependencies unless explicitly approved; if unavoidable, document justification in the active task notes + +## workflow +- prefer direct argv commands (no bash -lc / compound shell chains) unless necessary +- work on ONE task at a time unless explicitly instructed otherwise +- at the start of work, state the task id you are executing +- do not start work unless a task id is specified; if missing, choose the earliest unchecked task and say so +- propose incremental steps +- always include basic tests for core logic +- when you complete a task: + - mark it [X] in docs/tasks.org + - fill in evidence with commit hash + commands run + - never mark complete unless acceptance criteria are met + - include date and time (HH:MM) + +``` +* [ ] t1.1 Task Title (1) +Description and PM notes +** acceptance criteria +1. AC 1 +2. AC 2 + +** notes +- document thoughts, decisions, reasoning + +** evidence +- commit: +- tests: +- datetime: +``` diff --git a/docs/tasks.org b/docs/tasks.org new file mode 100644 index 0000000..e6c0922 --- /dev/null +++ b/docs/tasks.org @@ -0,0 +1,28 @@ +* [ ] t1.1: scrape one forum (1) +Use https://www.townhall.virginia.gov/L/comments.cfm?GDocForumID=452 as the first forum. Scraper should be run manually at this step. +** acceptance criteria +1. run manual scraper + 1. store proposal title and description + 2. store comment title, commenter, date + 3. store relevant metadata +2. 
friendly/polite scraping + +** notes + +** evidence +- commit: +- tests: +- datetime: + +* [ ] t1.2: initial analysis pipeline +Write a simple pipeline for both models (haiku and gpt-4o) - prefer non-concurrent/async from scraping run. Should be run manually, separate from scraper. You may use scrapy, but are not required to. +** acceptance criteria +1. run manual sentiment analysis of selected file against haiku +2. run manual sentiment analysis of selected file against gpt-4o + +** notes + +** evidence +- commit: +- tests: +- datetime: diff --git a/docs/vatownhall.org b/docs/vatownhall.org new file mode 100644 index 0000000..128b222 --- /dev/null +++ b/docs/vatownhall.org @@ -0,0 +1,53 @@ +#+title: VA Townhall +#+date: [2026-05-05 Tue] +#+version: 1 + +* Project Goals +1. Document and analyze sentiment of public comments on Virginia law, to determine: + 1. the utility of this forum as a mechanism for public comment, and + 2. the impact of this forum on Virginia regulation. +2. Make data and insights broadly available. +3. Generalize to other public comment tools. + +** Document and analyze sentiment +- Scrape the data, parse, clean, and store. Clearly separate scraper from sentiment analyzer for maximum auditability. +- Build tests for identifying abuse, such as spam and account fraud +- Identify any patterns connecting measured sentiment to VA decisions + +** Make data available +- Pick a good visualization tool + +** Generalize +- Identify scalable ways to apply this toolset to similar problems + +* Architecture +1. Scrape/Parse: *Scrapy* for downloading comments +2. Storage: json +3. Sentiment analysis: Claude haiku +4. Display: TBD + +** Scraper +Scrapy provides a simple mechanism for browsing and parsing the site's three page types: +1. Forums listing page: `Forums.cfm` - lists all open forums with agency, reg title, action type, brief description, closing date, comment count +2. 
Comment listing page: `comments.cfm?GDocForumID=X` or `comments.cfm?stageid=X` or `comments.cfm?petitionid=X` - lists comments with title, author, date +3. Individual comment page: `viewcomments.cfm?commentid=X` - shows regulation title + brief description at the top, plus the comment + +** Storage +One JSONL file per forum/bill. + +** Analysis +Google and Amazon both return generic sentiment (tone of writing: positive/negative), not stance (for/against the regulation): "I strongly believe the government should NOT interfere" is negative tone but "against" the regulation. We will run the forum/bill title and cache the entirety of the proposed change, perhaps as a fallback. + +| Tool | Output | Context | Sarcasm | Context window | Cost/1k comments | +|-------------------+--------------------------------+------------+------------------+----------------+------------------| +| Google NL API | -1→+1, magnitude | No/generic | Poorly | No | ~$1–2 | +| Amazon Comprehend | Pos/Neg/Neutral/Mixed | No/generic | Poorly | No | ~$0.10 | +| Claude Haiku | Prompted → for/against/neutral | Yes | Yes, with prompt | Yes | ~$0.10–0.30 | +| GPT-4o-mini | Prompted → same | Yes | Yes | Yes | ~$0.05–0.15 | + +* Roadmap +1. Scrape one forum +2. Compare sentiment models +3. Display +4. Scrape all data +5. Scale? 
diff --git a/scraper/__init__.py b/scraper/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scraper/items.py b/scraper/items.py new file mode 100644 index 0000000..97bbc0a --- /dev/null +++ b/scraper/items.py @@ -0,0 +1,12 @@ +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class ScraperItem(scrapy.Item): + # define the fields for your item here like: + # name = scrapy.Field() + pass diff --git a/scraper/middlewares.py b/scraper/middlewares.py new file mode 100644 index 0000000..6e58b73 --- /dev/null +++ b/scraper/middlewares.py @@ -0,0 +1,100 @@ +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + +# useful for handling different item types with a single interface +from itemadapter import ItemAdapter + + +class ScraperSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create the middleware instance. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. 
+ + # Should return either None or an iterable of Request or item objects. + pass + + async def process_start(self, start): + # Called with an async iterator over the spider start() method or the + # matching method of an earlier spider middleware. + async for item_or_request in start: + yield item_or_request + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) + + +class ScraperDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create the middleware instance. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either: + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) diff --git a/scraper/pipelines.py b/scraper/pipelines.py new file mode 100644 index 0000000..6db8721 --- /dev/null +++ b/scraper/pipelines.py @@ -0,0 +1,13 @@ +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + + +# useful for handling different item types with a single interface +from itemadapter import ItemAdapter + + +class ScraperPipeline: + def process_item(self, item, spider): + return item diff --git a/scraper/settings.py b/scraper/settings.py new file mode 100644 index 0000000..048bdb3 --- /dev/null +++ b/scraper/settings.py @@ -0,0 +1,87 @@ +# Scrapy settings for scraper project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = "scraper" + +SPIDER_MODULES = ["scraper.spiders"] +NEWSPIDER_MODULE = "scraper.spiders" + +ADDONS = {} + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = "scraper (+http://www.yourdomain.com)" + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Concurrency and throttling settings +#CONCURRENT_REQUESTS = 16 +CONCURRENT_REQUESTS_PER_DOMAIN = 1 +DOWNLOAD_DELAY = 1 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", +# "Accept-Language": "en", +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# "scraper.middlewares.ScraperSpiderMiddleware": 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# "scraper.middlewares.ScraperDownloaderMiddleware": 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# "scrapy.extensions.telnet.TelnetConsole": None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +#ITEM_PIPELINES = { +# "scraper.pipelines.ScraperPipeline": 300, +#} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 
+# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = "httpcache" +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" + +# Set settings whose default value is deprecated to a future-proof value +FEED_EXPORT_ENCODING = "utf-8" diff --git a/scraper/spiders/__init__.py b/scraper/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/scraper/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/scrapy.cfg b/scrapy.cfg new file mode 100644 index 0000000..e490281 --- /dev/null +++ b/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = scraper.settings + +[deploy] +#url = http://localhost:6800/ +project = scraper