full local streamlit support

2026-05-08 21:57:04 -04:00
parent 3fb424da3c
commit afd5b8c60e
3 changed files with 119 additions and 94 deletions
--- a/docs/streamlit-snapshot.png
+++ b/docs/streamlit-snapshot.png
--- a/docs/tasks.org
+++ b/docs/tasks.org
@@ -314,20 +314,10 @@ fix mojibake in scraped text before analysis/reporting, especially curly quotes
 - before/after sample: N/A — f452.jsonl is clean; tests cover synthetic mojibake patterns
 - datetime: [2026-05-07 Thu 17:00]
-* [ ] t1.4: graph data prep
+* [X] t1.4: graph data prototype
-create a script ./viz/prototype_charts.py generating individual plotly charts for exploring graphs to embed into streamlit or dash later
+create ./viz/prototype_charts.py, which generates individual plotly charts for exploring graphs to embed into streamlit or dash later
 1. in create_csv.py, create helper columns:
   - stance_signed = {"support":1, "oppose":-1, "neutral":0, "unknown":0}
   - stance_weighted = stance_signed * stance_confidence
   - is_support_oppose = stance in ["support", "oppose"]
   - date_day
   - date_hour
   - text_norm
   - text_hash
   - confidence_bucket = 'low' <.7 | 'med' .7-.89 | 'high' >=.9
 2. add forum_url, forum_collected_date to scraper
 ** acceptance criteria
 2. create graph for Stance/Share
   - stacked h-bar with % support/oppose/neutral/unknown + raw totals, eg  63% (5720) / 37% (3320) / 0.09% (8) / 0.37% (34)
   - later, consider centered diverging h-bar: oppose ← | neutral/unknown | → support
@@ -336,21 +326,37 @@ create a script ./viz/prototype_charts.py generating individual plotly charts fo
 4. create graph for Stance/Tone (heatmap count)
 5. create graph for Confidence/Stance (boxplot or histogram)
 ** notes
 - prototyped in plotly
 - initial streamlit  
 ** evidence
 - commit: 3fb424d
 - tests: see viz/proto and viz/chart_tests
 - datetime: [2026-05-08 Fri 08:38]
 * [ ] t1.5: streamlit
 create organized webpage displaying useful information from completed job and analysis
 ** acceptance criteria
-1. load parquet/csv review dataset
+1. display total stance breakdown
-2. show stance counts, tone counts, tag counts, and confidence histogram
+2. display centered horizontal bar with absolute stance counts
-3. provide filters for stance, tone, confidence, tag, and text search
+3. show daily and cumulative comment stances
-4. show filtered comment table
+4. show comment table with filters for stance (filter tone?)
 5. clicking/selecting a comment shows full text and model rationale
 6. app runs locally with one command
 ** notes
 data pulls entirely from the job; goal is to point viz/streamlit.py at any job/ folder and have everything it needs
 ** evidence
 - commit: 
- tests:
+- tests: from root dir, `streamlit run viz/streamlit.py`
- datetime:
+
 7. add forum_url, forum_collected_date to scraper
 * [ ] t1.6 host streamlit
 figure out how to host this, locally or via streamlit servers
 * === Backlog ===
@@ -360,3 +366,13 @@ Ensure we capture as much useful information as possible about the actual propos
 1. Item: `Forum` stores id, url, proposal title, description, open/close date, number of comments, agency, board, guidance document id
   - add details for guidanceDoc, publication date, comments, guidance docs - eg: https://www.townhall.virginia.gov/L/GDocForum.cfm?GDocForumID=452
 2. Item: `Comment` stores forum_id, comment_id, author, title, text, date, url
 * [ ] X: add helper data to create_csv
 1. in create_csv.py, create helper columns:
   - stance_signed = {"support":1, "oppose":-1, "neutral":0, "unknown":0}
   - stance_weighted = stance_signed * stance_confidence
   - is_support_oppose = stance in ["support", "oppose"]
   - date_day
   - date_hour
   - text_norm
   - text_hash
   - confidence_bucket = 'low' <.7 | 'med' .7-.89 | 'high' >=.9
--- a/viz/streamlit.py
+++ b/viz/streamlit.py
@@ -1,6 +1,7 @@
-# streamlit run analysis/viz/comment_streamlit2.py
+# streamlit run viz/streamlit.py
-from datetime import datetime as dt
+import argparse
 from pathlib import Path
 from datetime import datetime as dt
 import pandas as pd
 import plotly.graph_objects as go
 import plotly.express as px
@@ -15,25 +16,21 @@ forum = pd.read_json(workdir/"forum.jsonl", lines=True).iloc[0].to_dict()
 prompt = (workdir/"prompt.txt").read_text(encoding="utf-8")
 stance_colors = {'oppose':'#ffa15a', 'neutral':'#e377c2','support':'#19d3f3','unknown':'#000000'}
-#stance_colors = {'oppose':'orange', 'neutral':'green','support':'blue','unknown':'gray','mixed':'violet'}
+stance_order = ["oppose", "mixed", "unknown", "neutral", "support"]
 stance_order = ["oppose", "neutral", "unknown", "support"]
 st.set_page_config(layout="wide")
-st.title("Virginia Townhall Explorer")
+st.title("Virginia Townhall Explorer",anchor=None)
-st.divider()
+st.caption("Explore data collected from Virginia's public comment system. Source code at https://github.com/eulaly/vath")
-st.subheader(forum.get('reg_title'))
+
 st.subheader("Proposal",anchor=None,divider="gray")
 st.markdown(f"**{forum.get('reg_title')}**")
 st.text(forum.get('reg_desc'))
-st.caption(f"Link: https://www.townhall.virginia.gov/L/Comments.cfm?GDocForumID={forum.get('forum_id')}")
+st.caption(f'Comments posted from {dt.strftime(min(df.date_dt),"%D")}—{dt.strftime(max(df.date_dt),"%D")} at https://www.townhall.virginia.gov/L/Comments.cfm?GDocForumID={forum.get("forum_id")}')
-st.write(f'Comments posted from {dt.strftime(min(df.date_dt),"%D")}—{dt.strftime(max(df.date_dt),"%D")}')
+st.subheader("Comment Summary",anchor=False,divider="gray")
 st.write('Data collected on _')
 st.subheader("Comment Summary")
 # summary
 summary_left, summary_right = st.columns([1,2])
 with summary_left:
-    # summary table
+# Summary Table
    #summary_stats = df.groupby("stance").size().reindex(stance_order,fill_value=0).reset_index(name="count")
    summary_stats = (
    df.groupby("stance").size()
      .reindex(stance_order, fill_value=0)
@@ -43,7 +40,7 @@ with summary_left:
    st.dataframe(summary_stats, hide_index=True, width="stretch")
 with summary_right:
-# stance div-h
+# Stance diverging horizontal bar
    counts = df["stance"].value_counts()
    stance_divh = go.Figure()
    stance_divh.add_bar(y=["stance"], x=[-counts.get("oppose",0)], name="oppose", orientation="h", marker_color=stance_colors.get('oppose'), text=[counts.get("oppose",0)], textposition="inside")
@@ -52,15 +49,8 @@ with summary_right:
    stance_divh.add_bar(y=["stance"], x=[counts.get("support",0)], name="support", orientation="h", marker_color=stance_colors.get('support'), text=[counts.get("support",0)], textposition="inside")
    stance_divh.update_yaxes(title_text="",showticklabels=False)
    stance_divh.update_layout(barmode="relative", title="", height=180, margin=dict(l=0,r=0,t=0,b=0),xaxis_title="", yaxis_title="",legend=dict(orientation="v",y=0.12))
                              #legend_orientation="v")
    st.plotly_chart(stance_divh,width='stretch')
 # stance_time
 #stance_order = ["oppose", "neutral","unknown","support"]
 #daily = df.groupby(["date_day", "stance"]).size().reset_index(name="count")
 #stance_time = px.bar(daily, x="date_day", y="count", color="stance", category_orders={"stance": stance_order},color_discrete_map=stance_colors,title="")
 #st.plotly_chart(stance_time, width='stretch')
 # Daily Comments Breakdown, 3 Tabs
 daily_wide = (
    df.groupby(["date_day", "stance"])
@@ -90,6 +80,7 @@ cum_share_long = (
      .melt(id_vars="date_day", var_name="stance", value_name="cumulative_share")
 )
 tab_daily, tab_area, tab_share = st.tabs([
    "Daily",
    "Cumulative",
@@ -133,45 +124,63 @@ with tab_share:
    fig.update_layout(height=420, legend_orientation="v")
    st.plotly_chart(fig, width="stretch")
-st.subheader("Comment Explorer")    
+st.subheader("Comment Explorer",anchor=False,divider="gray") 
 # comment explorer
 cex_left, cex_right = st.columns([1,1])
 with cex_left:
    stance = st.multiselect("Filter stance", sorted(df["stance"].dropna().unique()), default=sorted(df["stance"].dropna().unique()))
    q = st.text_input("Search comment title and text")
    dff = df[df["stance"].isin(stance)]
    if q:
        dff = dff[dff["text"].fillna("").str.contains(q, case=False, regex=False)]
-# stance/tone heatmap
+with cex_right:
-# TODO add raw values
+    filter_tone = st.multiselect("Filter tone", sorted(df["tone"].dropna().unique()), default=sorted(df["tone"].dropna().unique()))
-# TODO OPT add button to swap between pct/tone <> pct/stance
+    st.text(""); st.text("")
-x_order = ["unknown","oppose","mixed","neutral","support"]  # includes mixed even if absent; harmless zero column
+    st.text("Showing " + str(len(dff))+ " comments",text_alignment="right", width="stretch")
-y_order = ["positive","neutral","mixed","negative","unclear"]
+
-tab = pd.crosstab(df["tone"], df["stance"]).reindex(index=y_order, columns=x_order, fill_value=0)
+st.dataframe(dff[["comment_id", "title", "text", "stance", "stance_confidence", "tone"]], width="stretch")
-pct = tab.div(tab.sum(axis=1).replace(0, pd.NA), axis=0).fillna(0)
+
-fig = px.imshow(
+cid = st.selectbox("Select comment to view:", dff["comment_id"].astype(str))
 row = dff[dff["comment_id"].astype(str) == cid].iloc[0]
 st.markdown(f'**{row["title"]}**')
 st.text(row["text"])
 st.write(row["author"] + ", " + row["date_dt"].strftime("%D"))
 st.divider()
 st.subheader('Analysis')
 cexs_left, cexs_right = st.columns([1,1])
 with cexs_left:
    st.write(f"**stance:** {row['stance']}")
    st.write(f"**stance_confidence:** {row['stance_confidence']:.2f}")
    st.write(f"**tone:** {row['tone']}")
    st.write("**analysis:** "+ row["stance_rationale"])
 with cexs_right:
    x_order = ["unknown","oppose","mixed","neutral","support"]  # includes mixed even if absent; harmless zero column
    y_order = ["positive","neutral","mixed","negative","unclear"]
    tab = pd.crosstab(df["tone"], df["stance"]).reindex(index=y_order, columns=x_order, fill_value=0)
    pct = tab.div(tab.sum(axis=1).replace(0, pd.NA), axis=0).fillna(0)
    tone_stance = px.imshow(
        pct,
        x=x_order, y=y_order,
        text_auto=".0%",
        aspect="auto",
        color_continuous_scale="Greens",
-    title="tone by stance, percent within tone",
+    )
-)
+    tone_stance.update_traces(text=tab.astype(str) + " / " + (pct*100).round(0).astype(int).astype(str) + "%")
-fig.update_traces(text=tab.astype(str) + " / " + (pct*100).round(0).astype(int).astype(str) + "%")
+    tone_stance.add_scatter(x=[row["stance"]],y=[row["tone"]],mode="markers",marker=dict(size=15,color="yellow",symbol="cross",line=dict(width=1, color="red")),showlegend=False)
-fig.update_layout(height=420, xaxis_title="stance", yaxis_title="tone")
+    tone_stance.update_layout(height=420, xaxis_title="stance", yaxis_title="tone")
-st.plotly_chart(fig, width='stretch')
+    st.plotly_chart(tone_stance, width='stretch')
    st.caption("Tone by stance, % within tone", text_alignment="right",width="stretch")
-# comment explorer
+st.divider()
 stance = st.multiselect("Filter stance", sorted(df["stance"].dropna().unique()), default=sorted(df["stance"].dropna().unique()))
 q = st.text_input("Search comment text")
 dff = df[df["stance"].isin(stance)]
 if q:
    dff = dff[dff["text"].fillna("").str.contains(q, case=False, regex=False)]
 st.dataframe(dff[["comment_id", "title", "text", "stance", "stance_confidence", "tone"]], width="stretch")
 st.write("Showing " + str(len(dff))+ " comments")
 cid = st.selectbox("comment", dff["comment_id"].astype(str))
 row = dff[dff["comment_id"].astype(str) == cid].iloc[0]
 st.subheader(row["title"])
 st.write(row["text"])
 st.write(row["author"] + ", " + row["date"][:10])
 st.markdown(f"**stance:** {row['stance']} \t|\t **confidence:** {row['stance_confidence']:.2f} \t|\t **tone:** {row['tone']}")
 st.write("**analysis:** "+ row["stance_rationale"])
 st.write("**model:** " + str(row["model"]))
 with st.expander("Prompt", expanded=False):
    st.code(prompt, language="text")
 tone_conf = px.box(df,x="stance",y="stance_confidence",color="stance",category_orders={"stance":stance_order},color_discrete_map=stance_colors,points="outliers",title="Comment Stance Classification Confidence")
 tone_conf.update_yaxes(range=[0,1.02])
 tone_conf.update_layout(height=430, legend_orientation="v")
 st.plotly_chart(tone_conf,width="stretch")