added streamlit v1

2026-05-08 17:22:33 -04:00
parent c3f2911563
commit 3fb424da3c
19 changed files with 50922 additions and 13 deletions
--- a/viz/chart_tests/confidence_by_stance.html
+++ b/viz/chart_tests/confidence_by_stance.html
--- a/viz/chart_tests/cumulative_stance_area.html
+++ b/viz/chart_tests/cumulative_stance_area.html
--- a/viz/chart_tests/cumulative_stance_share.html
+++ b/viz/chart_tests/cumulative_stance_share.html
--- a/viz/chart_tests/stance_diverging_bar.html
+++ b/viz/chart_tests/stance_diverging_bar.html
--- a/viz/chart_tests/stance_over_time.html
+++ b/viz/chart_tests/stance_over_time.html
--- a/viz/chart_tests/stance_share.html
+++ b/viz/chart_tests/stance_share.html
--- a/viz/chart_tests/stance_tone_counts.html
+++ b/viz/chart_tests/stance_tone_counts.html
--- a/viz/chart_tests/stance_tone_heatmap.html
+++ b/viz/chart_tests/stance_tone_heatmap.html
--- a/viz/chart_tests/stance_tone_rowpct.html
+++ b/viz/chart_tests/stance_tone_rowpct.html
--- a/viz/proto/confidence_by_stance.html
+++ b/viz/proto/confidence_by_stance.html
--- a/viz/proto/stance_over_time.html
+++ b/viz/proto/stance_over_time.html
--- a/viz/proto/stance_share.html
+++ b/viz/proto/stance_share.html
--- a/viz/proto/stance_tone_heatmap.html
+++ b/viz/proto/stance_tone_heatmap.html
--- a/viz/prototype_charts.py
+++ b/viz/prototype_charts.py
@@ -0,0 +1,134 @@
+'''
+    prototype_charts.py
+    generate test charts for later addition to streamlit
+'''
+   
+
+from pathlib import Path
+import pandas as pd
+import plotly.express as px
+import numpy as np
+
+# Input: the reviewed-comments CSV. Output: folder that receives one HTML file per chart.
+inp = Path(r"c:/users/moses/projects/vath/analysis/jobs/f452-1/review.csv")
+out = Path("viz/")
+out.mkdir(parents=True, exist_ok=True)
+
+# Category order passed to plotly wherever stance is an axis or a legend.
+stance_order = ["support", "oppose", "neutral", "unknown"]
+
+# tone_order = ["positive", "negative", "neutral", "mixed", "unknown", "unclear"]
+# Author's note: plotly's default ordering read better than the list above
+# (unclear/negative/neutral/mixed/positive vs unknown/oppose/neutral/support);
+# same for the percent-within-stance view.
+
+# Load and clean in one pass:
+#   date     -> parsed to datetime; unparseable values become NaT (errors="coerce")
+#   date_day -> calendar date of `date`, used for per-day grouping
+#   stance / tone -> missing values labelled "unknown"
+df = pd.read_csv(inp).assign(
+    date=lambda d: pd.to_datetime(d["date"], errors="coerce"),
+    date_day=lambda d: d["date"].dt.date,
+    stance=lambda d: d["stance"].fillna("unknown"),
+    tone=lambda d: d["tone"].fillna("unknown"),
+)
+
+# Category ordering shared by the stance-keyed charts in this section.
+stance_axis = {"stance": stance_order}
+
+# 1. stance share: horizontal bar of comment counts per stance (zero-filled for absent stances)
+counts = (
+    df["stance"]
+    .value_counts()
+    .reindex(stance_order, fill_value=0)
+    .rename_axis("stance")
+    .reset_index(name="count")
+)
+fig = px.bar(counts, x="count", y="stance", orientation="h", text="count")
+fig.write_html(out / "stance_share.html")
+
+# 2. stance over time: per-day comment counts, one colored bar segment per stance
+daily = df.groupby(["date_day", "stance"]).size().rename("count").reset_index()
+fig = px.bar(
+    daily,
+    x="date_day",
+    y="count",
+    color="stance",
+    category_orders=stance_axis,
+)
+fig.write_html(out / "stance_over_time.html")
+
+# 3. stance x tone: heatmap of how many comments fall in each (stance, tone) pair
+heat = df.groupby(["stance", "tone"]).size().rename("count").reset_index()
+fig = px.density_heatmap(
+    heat,
+    x="tone",
+    y="stance",
+    z="count",
+    category_orders=stance_axis,
+)
+fig.write_html(out / "stance_tone_heatmap.html")
+
+# 4. confidence by stance: box plot of stance_confidence, only outlier points drawn
+fig = px.box(
+    df,
+    x="stance",
+    y="stance_confidence",
+    category_orders=stance_axis,
+    points="outliers",
+)
+fig.write_html(out / "confidence_by_stance.html")
+
+# 5. cumulative stance counts and cumulative stance share over time
+# Wide table: one row per day (sorted), one column per stance in stance_order,
+# values are that day's comment counts (0 where a stance has none).
+per_day = df.groupby(["date_day", "stance"]).size().unstack(fill_value=0)
+daily = per_day.reindex(columns=stance_order, fill_value=0).sort_index()
+
+# Running totals per stance, then reshaped to long form for plotly express.
+cum = daily.cumsum()
+cum_long = pd.melt(
+    cum.reset_index(),
+    id_vars="date_day",
+    var_name="stance",
+    value_name="cumulative_count",
+)
+
+fig = px.area(
+    cum_long,
+    x="date_day",
+    y="cumulative_count",
+    color="stance",
+    category_orders={"stance": stance_order},
+    title="cumulative comments by stance over time",
+)
+fig.write_html(out / "cumulative_stance_area.html")
+
+# Each stance's fraction of all comments received up to and including each day.
+running_total = cum.sum(axis=1)
+cum_share = cum.div(running_total, axis=0)
+cum_pct = pd.melt(
+    cum_share.reset_index(),
+    id_vars="date_day",
+    var_name="stance",
+    value_name="cumulative_share",
+)
+
+fig = px.line(
+    cum_pct,
+    x="date_day",
+    y="cumulative_share",
+    color="stance",
+    category_orders={"stance": stance_order},
+    title="cumulative stance share over time",
+)
+fig.update_yaxes(tickformat=".0%")  # show fractions as whole percentages
+fig.write_html(out / "cumulative_stance_share.html")
+
+# 7. diverging h-bar: "oppose" is drawn to the left of zero, every other stance to the right
+stance_counts = df["stance"].value_counts().reindex(stance_order, fill_value=0)
+
+bar_stances = ["oppose", "support", "neutral", "unknown"]
+signed_counts = [
+    -stance_counts.get(name, 0) if name == "oppose" else stance_counts.get(name, 0)
+    for name in bar_stances
+]
+div = pd.DataFrame({"stance": bar_stances, "count": signed_counts})
+
+# Bar labels use the absolute value so the "oppose" bar is labelled with its real count.
+bar_labels = div["count"].abs()
+fig = px.bar(
+    div,
+    x="count",
+    y="stance",
+    orientation="h",
+    text=bar_labels,
+    title="support vs oppose",
+)
+fig.update_traces(textposition="outside")
+fig.update_xaxes(title="comments", zeroline=True)
+fig.write_html(out / "stance_diverging_bar.html")
+
+# 8. Stance x Tone labels: annotated count heatmap plus percent-within-stance heatmap
+# `tone_order` has no live definition elsewhere in this script, so referencing it
+# raised NameError. Define the preferred column order here.
+tone_order = ["positive", "negative", "neutral", "mixed", "unknown", "unclear"]
+
+# Compute the set of tones actually present once instead of once per list element.
+tones_present = set(df["tone"].unique())
+tone_columns = [tone for tone in tone_order if tone in tones_present]
+# Tones missing from tone_order are appended (sorted) rather than silently
+# dropped, so the table always accounts for every comment.
+tone_columns += sorted(tones_present.difference(tone_columns), key=str)
+
+# Rows: stance (in stance_order). Columns: tone. Values: comment counts.
+heat = pd.crosstab(df["stance"], df["tone"]).reindex(
+    index=stance_order,
+    columns=tone_columns,
+    fill_value=0,
+)
+
+fig = px.imshow(
+    heat,
+    text_auto=True,
+    aspect="auto",
+    title="stance x tone, count",
+)
+fig.write_html(out / "stance_tone_counts.html")
+
+# Share of each stance's comments per tone. A stance with no comments has a row
+# total of 0; swapping that for NaN yields NaN cells instead of a division by zero.
+rowpct = heat.div(heat.sum(axis=1).replace(0, np.nan), axis=0)
+
+fig = px.imshow(
+    rowpct,
+    text_auto=".0%",
+    aspect="auto",
+    title="stance x tone, percent within stance",
+)
+fig.write_html(out / "stance_tone_rowpct.html")
+
+
--- a/viz/prototype_streamlit.py
+++ b/viz/prototype_streamlit.py
@@ -0,0 +1,28 @@
+# streamlit run analysis/viz/prototype_streamlit.py
+from datetime import datetime
+import pandas as pd
+import plotly.graph_objects as go
+import plotly.express as px
+import streamlit as st
+
+def _show(value):
+    """Return *value* as display text; a missing value (NaN/None) becomes ''."""
+    return "" if pd.isna(value) else str(value)
+
+
+df = pd.read_csv(r"analysis/jobs/f452-1/review.csv")
+st.set_page_config(layout="wide")
+
+# Filters: stance multiselect (every stance selected initially) and a text search.
+stance_options = sorted(df["stance"].dropna().unique())
+stance = st.multiselect("Filter stance", stance_options, default=stance_options)
+q = st.text_input("Search comment text")
+dff = df[df["stance"].isin(stance)]
+if q:
+    # regex=False: the query is matched literally, ignoring case.
+    dff = dff[dff["text"].fillna("").str.contains(q, case=False, regex=False)]
+
+st.dataframe(dff[["comment_id", "title", "stance", "stance_confidence", "tone"]], width="stretch")
+st.write("Showing " + str(len(dff)) + " comments")
+
+# When the filters match nothing there is no row to display; going on would
+# raise IndexError at .iloc[0], so end this script run here instead.
+if dff.empty:
+    st.info("No comments match the current filters.")
+    st.stop()
+
+# Detail view for one selected comment.
+cid = st.selectbox("comment", dff["comment_id"].astype(str))
+row = dff[dff["comment_id"].astype(str) == cid].iloc[0]
+
+# Fields are passed through _show() because an empty CSV cell loads as NaN
+# (a float), which cannot be concatenated with or sliced like a string.
+st.subheader(_show(row["title"]))
+st.write(_show(row["text"]))
+st.write(_show(row["author"]) + ", " + _show(row["date"])[:10])
+st.write("**model:** " + str(row["model"]))
+st.markdown("**stance:** " + str(row["stance"]) + "  \n**confidence:** " + str(row["stance_confidence"]) + "  \n**tone:** " + str(row["tone"]))
+st.write("**analysis:** " + _show(row["stance_rationale"]))
--- a/viz/streamlit.py
+++ b/viz/streamlit.py
@@ -0,0 +1,177 @@
+# streamlit run analysis/viz/streamlit.py
+from datetime import datetime as dt
+from pathlib import Path
+import pandas as pd
+import plotly.graph_objects as go
+import plotly.express as px
+import plotly.subplots as ps
+import streamlit as st
+
+# Job folder holding the reviewed comments, the forum record and the prompt text.
+workdir = Path("analysis/jobs/f452-1")
+
+# Comments table with two derived columns:
+#   date_dt  -> `date` parsed to datetime
+#   date_day -> calendar date of date_dt, used for per-day grouping
+df = pd.read_csv(workdir / "review.csv").assign(
+    date_dt=lambda d: pd.to_datetime(d["date"]),
+    date_day=lambda d: d["date_dt"].dt.date,
+)
+
+# First record of forum.jsonl as a plain dict; prompt.txt as one string.
+forum_rows = pd.read_json(workdir / "forum.jsonl", lines=True)
+forum = forum_rows.iloc[0].to_dict()
+prompt = (workdir / "prompt.txt").read_text(encoding="utf-8")
+
+# Fixed color per stance so every chart uses the same palette.
+stance_colors = {
+    'oppose': '#ffa15a',
+    'neutral': '#e377c2',
+    'support': '#19d3f3',
+    'unknown': '#000000',
+}
+# alternative palette kept for reference:
+#stance_colors = {'oppose':'orange', 'neutral':'green','support':'blue','unknown':'gray','mixed':'violet'}
+
+# Stance order used for table rows, legend entries and stacking.
+stance_order = ["oppose", "neutral", "unknown", "support"]
+
+# Page header: app title, the regulation's title/description and a link to the forum.
+st.set_page_config(layout="wide")
+st.title("Virginia Townhall Explorer")
+st.divider()
+st.subheader(forum.get('reg_title'))
+st.text(forum.get('reg_desc'))
+st.caption(f"Link: https://www.townhall.virginia.gov/L/Comments.cfm?GDocForumID={forum.get('forum_id')}")
+
+# Series.min()/max() skip NaT; the builtin min()/max() compare element by element
+# and give order-dependent results when a date is missing.
+first_day = df.date_dt.min()
+last_day = df.date_dt.max()
+if pd.isna(first_day) or pd.isna(last_day):
+    # No parseable dates at all: NaT cannot be formatted, so say so instead.
+    st.write('Comments posted from unknown dates')
+else:
+    # "%m/%d/%y" is the portable spelling of "%D", which not every platform's
+    # strftime accepts.
+    st.write(f'Comments posted from {first_day.strftime("%m/%d/%y")}—{last_day.strftime("%m/%d/%y")}')
+st.write('Data collected on _')
+
+st.subheader("Comment Summary")
+# Two columns: a count/percent table on the left (1/3 width) and a single
+# diverging horizontal bar on the right (2/3 width).
+summary_left, summary_right = st.columns([1, 2])
+
+with summary_left:
+    # One row per stance in stance_order; stances with no comments show 0.
+    per_stance = df.groupby("stance").size().reindex(stance_order, fill_value=0)
+    summary_stats = per_stance.reset_index(name="count")
+    share = summary_stats["count"] / summary_stats["count"].sum()
+    summary_stats["percent"] = share.map("{:.1%}".format)
+    st.dataframe(summary_stats, hide_index=True, width="stretch")
+
+with summary_right:
+    # One bar trace per stance on a shared row. "oppose" gets a negative length
+    # so it extends left of zero; the label always shows the real count.
+    counts = df["stance"].value_counts()
+    stance_divh = go.Figure()
+    for stance_name in ("oppose", "neutral", "unknown", "support"):
+        n_comments = counts.get(stance_name, 0)
+        bar_length = -n_comments if stance_name == "oppose" else n_comments
+        stance_divh.add_bar(
+            y=["stance"],
+            x=[bar_length],
+            name=stance_name,
+            orientation="h",
+            marker_color=stance_colors.get(stance_name),
+            text=[n_comments],
+            textposition="inside",
+        )
+    stance_divh.update_yaxes(title_text="", showticklabels=False)
+    # barmode="relative" stacks negative and positive bars on opposite sides of zero.
+    stance_divh.update_layout(
+        barmode="relative",
+        title="",
+        height=180,
+        margin=dict(l=0, r=0, t=0, b=0),
+        xaxis_title="",
+        yaxis_title="",
+        legend=dict(orientation="v", y=0.12),
+    )
+    st.plotly_chart(stance_divh, width='stretch')
+    
+# stance_time
+#stance_order = ["oppose", "neutral","unknown","support"]
+#daily = df.groupby(["date_day", "stance"]).size().reset_index(name="count")
+#stance_time = px.bar(daily, x="date_day", y="count", color="stance", category_orders={"stance": stance_order},color_discrete_map=stance_colors,title="")
+#st.plotly_chart(stance_time, width='stretch')
+
+# Daily Comments Breakdown: tables behind the three tabs
+# Wide table: one row per day (sorted), one column per stance in stance_order,
+# values are that day's comment counts (0 where a stance has none).
+stance_by_day = df.groupby(["date_day", "stance"]).size().unstack(fill_value=0)
+daily_wide = stance_by_day.reindex(columns=stance_order, fill_value=0).sort_index()
+
+# Long form (date_day, stance, count) for plotly express.
+daily_long = pd.melt(
+    daily_wide.reset_index(),
+    id_vars="date_day",
+    var_name="stance",
+    value_name="count",
+)
+
+# Running totals per stance.
+cum_wide = daily_wide.cumsum()
+cum_long = pd.melt(
+    cum_wide.reset_index(),
+    id_vars="date_day",
+    var_name="stance",
+    value_name="cumulative_count",
+)
+
+# Each stance's fraction of all comments so far. Totals of 0 are masked to NaN
+# first, so such rows produce NaN shares rather than a division by zero.
+cum_total = cum_wide.sum(axis=1)
+safe_total = cum_total.where(cum_total > 0)
+cum_share = cum_wide.div(safe_total, axis=0)
+cum_share_long = pd.melt(
+    cum_share.reset_index(),
+    id_vars="date_day",
+    var_name="stance",
+    value_name="cumulative_share",
+)
+
+# The same per-day data shown three ways, one view per tab.
+tab_daily, tab_area, tab_share = st.tabs(["Daily", "Cumulative", "Cumulative Share"])
+
+# Arguments common to all three figures: date on x, one colored series per stance.
+by_stance = dict(
+    x="date_day",
+    color="stance",
+    category_orders={"stance": stance_order},
+    color_discrete_map=stance_colors,
+)
+# Layout common to all three figures.
+chart_layout = dict(height=420, legend_orientation="v")
+
+with tab_daily:
+    # stacked bars: comments per day
+    fig = px.bar(daily_long, y="count", **by_stance)
+    fig.update_layout(barmode="stack", **chart_layout)
+    st.plotly_chart(fig, width="stretch")
+
+with tab_area:
+    # stacked areas: running comment totals
+    fig = px.area(cum_long, y="cumulative_count", **by_stance)
+    fig.update_layout(**chart_layout)
+    st.plotly_chart(fig, width="stretch")
+
+with tab_share:
+    # lines: each stance's share of all comments so far, y axis fixed to 0-100%
+    fig = px.line(cum_share_long, y="cumulative_share", **by_stance)
+    fig.update_yaxes(tickformat=".0%", range=[0, 1])
+    fig.update_layout(**chart_layout)
+    st.plotly_chart(fig, width="stretch")
+
+st.subheader("Comment Explorer")
+
+# stance/tone heatmap: cell color = percent within tone, cell label = "count / percent"
+# TODO OPT add button to swap between pct/tone <> pct/stance
+x_order = ["unknown","oppose","mixed","neutral","support"]  # includes mixed even if absent; harmless zero column
+y_order = ["positive","neutral","mixed","negative","unclear"]
+
+# Rows: tone. Columns: stance. Values: comment counts (0 for absent combinations).
+tab = pd.crosstab(df["tone"], df["stance"]).reindex(index=y_order, columns=x_order, fill_value=0)
+
+# Row percentages. Masking zero totals with .where() keeps the divisor numeric
+# (NaN); replacing 0 with pd.NA would turn it into an object column, which
+# breaks the rounding and integer conversion below. Tones with no comments
+# end up as 0%.
+row_total = tab.sum(axis=1)
+pct = tab.div(row_total.where(row_total > 0), axis=0).fillna(0)
+
+fig = px.imshow(
+    pct,
+    x=x_order, y=y_order,
+    text_auto=".0%",
+    aspect="auto",
+    color_continuous_scale="Greens",
+    title="tone by stance, percent within tone",
+)
+
+# text_auto installs a texttemplate that prints z, so custom text is ignored
+# unless the template is pointed at the text array as well.
+cell_labels = tab.astype(str) + " / " + (pct * 100).round(0).astype(int).astype(str) + "%"
+fig.update_traces(text=cell_labels.to_numpy(), texttemplate="%{text}")
+fig.update_layout(height=420, xaxis_title="stance", yaxis_title="tone")
+st.plotly_chart(fig, width='stretch')
+
+# comment explorer: filter/search the comments table, then show one comment in detail
+def _show(value):
+    """Return *value* as display text; a missing value (NaN/None) becomes ''."""
+    return "" if pd.isna(value) else str(value)
+
+
+stance_options = sorted(df["stance"].dropna().unique())
+stance = st.multiselect("Filter stance", stance_options, default=stance_options)
+q = st.text_input("Search comment text")
+dff = df[df["stance"].isin(stance)]
+if q:
+    # regex=False: the query is matched literally, ignoring case.
+    dff = dff[dff["text"].fillna("").str.contains(q, case=False, regex=False)]
+
+st.dataframe(dff[["comment_id", "title", "text", "stance", "stance_confidence", "tone"]], width="stretch")
+st.write("Showing " + str(len(dff)) + " comments")
+
+if dff.empty:
+    # Nothing matches the filters, so there is no row to pick; .iloc[0] below
+    # would raise IndexError. Only the prompt remains to be shown.
+    st.info("No comments match the current filters.")
+else:
+    cid = st.selectbox("comment", dff["comment_id"].astype(str))
+    row = dff[dff["comment_id"].astype(str) == cid].iloc[0]
+
+    # Fields are passed through _show() because an empty CSV cell loads as NaN
+    # (a float), which cannot be concatenated with or sliced like a string.
+    st.subheader(_show(row["title"]))
+    st.write(_show(row["text"]))
+    st.write(_show(row["author"]) + ", " + _show(row["date"])[:10])
+    st.markdown(f"**stance:** {row['stance']} \t|\t **confidence:** {row['stance_confidence']:.2f} \t|\t **tone:** {row['tone']}")
+    st.write("**analysis:** " + _show(row["stance_rationale"]))
+    st.write("**model:** " + str(row["model"]))
+
+with st.expander("Prompt", expanded=False):
+    st.code(prompt, language="text")