Compare commits

..

8 Commits

Author SHA1 Message Date
8f1d9e7723 added forum metadata for later use 2026-05-09 00:36:30 -04:00
181477bce7 streamlit > local docker 2026-05-09 00:25:27 -04:00
771f11fd3c updated readme 2026-05-09 00:02:24 -04:00
f42183eeda added streamlit link 2026-05-09 00:00:59 -04:00
92706bafb5 updated tasks and deps 2026-05-08 23:57:46 -04:00
723b353db8 lol 2026-05-08 23:33:55 -04:00
67cd96a523 updated readme.md 2026-05-08 23:32:44 -04:00
cc16acbb12 added argparse for job dir, added tone filter 2026-05-08 23:28:13 -04:00
6 changed files with 41 additions and 43 deletions

View File

@@ -1,17 +1,3 @@
# Table of Contents
1. [Project Goals](#org2da6874)
1. [Research questions](#org1a2b8b3)
2. [Architecture](#orgfabfcd9)
1. [Scraper](#org2c5c7a2)
2. [Analysis](#org72990f4)
3. [Storage](#org58a5b72)
3. [Instructions](#org24fe465)
1. [Roadmap](#org5739d49)
<a id="org2da6874"></a>
## Project Goals ## Project Goals
@@ -21,8 +7,9 @@
2. Make data and insights broadly available. 2. Make data and insights broadly available.
3. Generalize to other public comment tools. 3. Generalize to other public comment tools.
Take a look at https://vatownhall.streamlit.app
![img](./docs/streamlit-snapshot.png)
<a id="org1a2b8b3"></a>
### Research questions ### Research questions
@@ -66,9 +53,9 @@ Scrapy provides a simple mechanism for retrieving, parsing, and saving content f
Google and Amazon both return generic sentiment (tone of writing: positive/negative), not stance (for/against the regulation): "I strongly believe the government should NOT interfere" is negative tone but "against" the regulation. We add the proposed change as context to the model. Google and Amazon both return generic sentiment (tone of writing: positive/negative), not stance (for/against the regulation): "I strongly believe the government should NOT interfere" is negative tone but "against" the regulation. We add the proposed change as context to the model.
Before sending the comments for sentiment analysis, \`tokenizer.py\` receives the forum to be processed and prompt as inputs, then generates a \`report.json\` estimating tokens (tiktoken), cost, and time to run for multiple models. Before sending the comments for sentiment analysis, `tokenizer.py` receives the forum to be processed and prompt as inputs, then generates a `report.json` estimating tokens (tiktoken), cost, and time to run for multiple models.
Then, the batch processing script uses the \`report.json\` to create multiple jobs, with subcommands to download and check their status. Then, the batch processing script uses the `report.json` to create multiple jobs, with subcommands to download and check their status.
We selected gpt-5.4-mini for a good balance of quality, cost, and time. We selected gpt-5.4-mini for a good balance of quality, cost, and time.

View File

@@ -335,7 +335,7 @@ create ./viz/prototype_charts.py generating individual plotly charts for explori
- tests: see viz/proto and viz/chart_tests - tests: see viz/proto and viz/chart_tests
- datetime: [2026-05-08 Fri 08:38] - datetime: [2026-05-08 Fri 08:38]
* [ ] t1.5: streamlit * [X] t1.5: streamlit
create organized webpage displaying useful information from completed job and analysis create organized webpage displaying useful information from completed job and analysis
** acceptance criteria ** acceptance criteria
@@ -350,16 +350,20 @@ create organized webpage displaying useful information from completed job and an
data pulls entirely from the job; goal is to point viz/streamlit.py at any job/ folder and have everything it needs data pulls entirely from the job; goal is to point viz/streamlit.py at any job/ folder and have everything it needs
** evidence ** evidence
- commit: - commit: cc16acb
- tests: from root dir, `streamlit run viz/streamlit.py` - tests: from root dir, `streamlit run viz/streamlit.py -- --jobs-dir <job-dir>`
- datetime: [2026-05-08 Fri 23:44]
7. add forum_url, forum_collected_date to scraper * +[ ] t1.6 host streamlit via dockerfile+
planning to deploy manually, get a cert, etc. probably don't care about https?
+using streamlit.app instead+
** acceptance criteria
1. write dockerfile with slim image
* [ ] t1.6 host streamlit ** notes
figure out how to host this, locally or via streamlit servers
* === Backlog === * === Backlog ===
- add forum_url, forum_collected_date to scraper (to add to viz)
* [ ] X: complete proposal information * [ ] X: complete proposal information
Ensure we capture as much useful information as possible about the actual proposal - contact information, etc. what the state actually says about what was posted. Ensure we capture as much useful information as possible about the actual proposal - contact information, etc. what the state actually says about what was posted.
** acceptance criteria ** acceptance criteria

Binary file not shown.

View File

@@ -5,6 +5,8 @@ class ForumItem(scrapy.Item):
forum_id = scrapy.Field() forum_id = scrapy.Field()
reg_title = scrapy.Field() reg_title = scrapy.Field()
reg_desc = scrapy.Field() reg_desc = scrapy.Field()
scraped_at = scrapy.Field()
forum_url = scrapy.Field()
class CommentItem(scrapy.Item): class CommentItem(scrapy.Item):

View File

@@ -63,6 +63,8 @@ class ForumSpider(scrapy.Spider):
forum_id=self.forum_id, forum_id=self.forum_id,
reg_title=reg_title, reg_title=reg_title,
reg_desc=reg_desc, reg_desc=reg_desc,
scraped_at=datetime.utcnow().isoformat(),
forum_url=_view_url(self.forum_id),
) )
for page in range(2, last_page + 1): for page in range(2, last_page + 1):
yield scrapy.FormRequest( yield scrapy.FormRequest(

View File

@@ -1,14 +1,17 @@
# streamlit run analysis/viz/streamlit.py # streamlit run viz/streamlit.py -- --jobs-dir analysis/jobs/f452-1
import argparse import argparse
from pathlib import Path from pathlib import Path
from datetime import datetime as dt from datetime import datetime as dt
import pandas as pd import pandas as pd
import plotly.graph_objects as go import plotly.graph_objects as go
import plotly.express as px import plotly.express as px
import plotly.subplots as ps
import streamlit as st import streamlit as st
workdir = Path("analysis/jobs/f452-1") parser = argparse.ArgumentParser()
parser.add_argument("--jobs-dir", default="analysis/jobs/f452-1", type=Path,
help="Job directory containing review.csv, forum.jsonl, and prompt.txt")
args, _ = parser.parse_known_args() # parse_known_args: ignore Streamlit's own argv entries
workdir = args.jobs_dir
df = pd.read_csv(workdir/"review.csv") df = pd.read_csv(workdir/"review.csv")
df['date_dt'] = pd.to_datetime(df.date) df['date_dt'] = pd.to_datetime(df.date)
df["date_day"] = df["date_dt"].dt.date df["date_day"] = df["date_dt"].dt.date
@@ -128,14 +131,14 @@ st.subheader("Comment Explorer",anchor=False,divider="gray")
# comment explorer # comment explorer
cex_left, cex_right = st.columns([1,1]) cex_left, cex_right = st.columns([1,1])
with cex_left: with cex_left:
stance = st.multiselect("Filter stance", sorted(df["stance"].dropna().unique()), default=sorted(df["stance"].dropna().unique())) filter_stance = st.multiselect("Filter stance", sorted(df["stance"].dropna().unique()), default=sorted(df["stance"].dropna().unique()))
q = st.text_input("Search comment title and text") filter_tone = st.multiselect("Filter tone", sorted(df["tone"].dropna().unique()), default=sorted(df["tone"].dropna().unique()))
dff = df[df["stance"].isin(stance)] dff = df[df["stance"].isin(filter_stance) & df["tone"].isin(filter_tone)]
if q:
dff = dff[dff["text"].fillna("").str.contains(q, case=False, regex=False)]
with cex_right: with cex_right:
filter_tone = st.multiselect("Filter tone", sorted(df["tone"].dropna().unique()), default=sorted(df["tone"].dropna().unique())) q = st.text_input("Search comment title and text")
if q:
dff = dff[dff["text"].fillna("").str.contains(q, case=False, regex=False)]
st.text(""); st.text("") st.text(""); st.text("")
st.text("Showing " + str(len(dff))+ " comments",text_alignment="right", width="stretch") st.text("Showing " + str(len(dff))+ " comments",text_alignment="right", width="stretch")