updated instructions and added artifacts

2026-05-05 13:50:27 -04:00
parent e7df0b24a1
commit 314f8d2621
4 changed files with 225 additions and 13 deletions
--- a/agents.md
+++ b/agents.md
@@ -5,14 +5,14 @@
 - prefer minimal diffs; avoid refactors unless required for the active task

 ## tech stack
- python; scrapy
+- python; scrapy, pytest
 - file storage: json or csv
 - assume local virtual env is available and accessible
 - do not add new dependencies unless explicitly approved; if unavoidable, document justification in the active task notes

 ## workflow
- prefer direct argv commands (no bash -lc / compound shell chains) unless necessary
- work on ONE task at a time unless explicitly instructed otherwise
+- prefer direct commands
+- work on ONE task at a time unless explicitly instructed otherwise:
  - at the start of work, state the task id you are executing
  - do not start work unless a task id is specified; if missing, choose the earliest unchecked task and say so
  - propose incremental steps
@@ -22,7 +22,7 @@
    - fill in evidence with commit hash + commands run
 	- never mark complete unless acceptance criteria are met
    - include date and time (HH:MM)
-
+	- follow this format:
 ```
 * [ ] t1.1 Task Title (1)
 Description and PM notes
--- a/docs/tb.py
+++ b/docs/tb.py
@@ -0,0 +1,105 @@
+import jsonlines
+import re
+from textblob import TextBlob
+from collections import Counter
+
+def tprint(obj):
+    """Print ``obj`` preceded by its type, in the form ``<type> : <value>``."""
+    kind = type(obj)
+    print(f"{kind} : {obj}")
+
+
+def sort_file(file, limit=25):
+    """Count positive and negative comments among the first records of a file.
+
+    Reads at most ``limit`` records from the JSON Lines file ``file`` and
+    tallies those whose ``'sentiment'`` field is ``'pos'`` or ``'neg'``.
+    Records with any other value, or without the field, count as neither.
+    The reader and every record are echoed with ``tprint`` and a summary
+    line is printed at the end.
+
+    Args:
+        file: Name of the JSON Lines file, as accepted by ``jsonlines.open``.
+        limit: Maximum number of records to inspect (default 25).
+
+    Returns:
+        A ``(pos, neg)`` tuple with the two counts.
+    """
+    from itertools import islice
+
+    pos = 0
+    neg = 0
+    with jsonlines.open(file, mode='r') as reader:
+        # Confirm type
+        tprint(reader)
+
+        # islice ends quietly when the file holds fewer than ``limit``
+        # records, where a bare next() call would raise StopIteration.
+        for record in islice(reader, limit):
+            tprint(record)
+            sentiment = record.get('sentiment')
+            if sentiment == 'pos':
+                pos += 1
+            elif sentiment == 'neg':
+                neg += 1
+
+    print(f'{pos} positive and {neg} negative comments')
+    return pos, neg
+
+def process_file(file, author='Smythers'):
+    """Collect the post contents written by one author.
+
+    Each record of the JSON Lines file ``file`` is expected to hold
+    ``'author'`` and ``'content'`` as sequences; only their first elements
+    are used. Records that lack either field, or where the field is empty
+    or not indexable, are skipped.
+
+    Args:
+        file: Name of the JSON Lines file, as accepted by ``jsonlines.open``.
+        author: Author name to match against ``item['author'][0]``
+            (default ``'Smythers'``).
+
+    Returns:
+        A list with ``item['content'][0]`` of every matching record, in
+        file order.
+    """
+    posts = []
+    with jsonlines.open(file, mode='r') as reader:
+        for item in reader:
+            try:
+                if item['author'][0] == author:
+                    posts.append(item['content'][0])
+            except (KeyError, IndexError, TypeError):
+                # Field missing, empty, or not a sequence: skip the record.
+                continue
+    return posts
+
+def write_file(file, data:object):
+    """Write each element of ``data`` to ``file`` as one JSON Lines record.
+
+    The file is opened in write mode, so existing content is replaced.
+    Prints ``write successful`` once all records have been written.
+    """
+    with jsonlines.open(file, mode='w') as sink:
+        sink.write_all(data)
+    print('write successful')
+
+def clean_text(text:str):
+    """Return ``text`` with HTML tags removed first and URLs removed second."""
+    return remove_http(remove_html(text))
+
+def remove_html(text:str):
+    """Remove HTML tags from a string.
+
+    Everything from a ``<`` up to and including the next ``>`` is deleted,
+    including tags whose attributes are spread over several lines.
+
+    Args:
+        text: The string to clean.
+
+    Returns:
+        ``text`` with every ``<...>`` span removed.
+    """
+    # ``[^>]`` (unlike ``.``) also matches newlines, so a tag that is broken
+    # across lines is still removed as a whole.
+    return re.sub(r'<[^>]*>', '', text)
+
+def remove_http(text:str):
+    """Return ``text`` with every URL-like token deleted.
+
+    A token is the literal ``http`` followed by one or more non-whitespace
+    characters; the surrounding whitespace is left untouched.
+    """
+    url_pattern = re.compile(r'http\S+')
+    return url_pattern.sub('', text)
+
+def get_nouns(text:str):
+    """Return the ``tags`` that TextBlob produces for ``text``.
+
+    Despite the name, nothing is filtered here: the complete ``tags``
+    result is handed back unchanged.
+    """
+    # TODO: decide whether only noun tags should be kept.
+    return TextBlob(text).tags
+
+# Input exports and output locations used by this script.
+vadoe = '/vadoe/vadoe/vadoe/townhall_2021-01-14T02-05-51.json'
+vadoe_p = '/vadoe/vadoe/vadoe/townhall_2021-01-14T05-11-55.json'
+dlr = '/vadoe/vadoe/vadoe/dlr.json'
+
+smythers_pc = '/vadoe/vadoe/vadoe/smythers.json'
+write_to = '/vadoe/vadoe/vadoe/nouns.json'
+
+# Load the posts selected by process_file and strip markup and URLs.
+smythers_posts = process_file(dlr)
+cleaned = list(map(clean_text, smythers_posts))
+
+# Flatten the per-post results of get_nouns into one list and count them.
+nouns = [tag for post in cleaned for tag in get_nouns(post)]
+sortedNouns = Counter(nouns)
+
+# Keep only entries seen more than twice, as (entry, count) pairs.
+nouns = [(entry, count) for entry, count in sortedNouns.items() if count > 2]
+print(nouns)
+write_file(write_to, nouns)
--- a/docs/townhall.py
+++ b/docs/townhall.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from items import CommentItem
+import textblob
+from textblob import TextBlob
+from textblob.sentiments import NaiveBayesAnalyzer
+
+class TownhallSpider(scrapy.Spider):
+    """Scrape the comments of one Town Hall forum page and score their sentiment.
+
+    ``parse`` walks the comment table on the start page and follows the link
+    in each row; ``parse_comment`` yields one ``CommentItem`` per comment
+    page holding the text and its TextBlob Naive Bayes sentiment.
+    """
+    name = 'townhall'
+    allowed_domains = ['townhall.virginia.gov']
+    start_urls = ['https://www.townhall.virginia.gov/L/comments.cfm?GDocForumID=452']
+    custom_settings = {
+        'FEED_EXPORTERS' : {
+            "jsonlines": "scrapy.exporters.JsonLinesItemExporter",
+        },
+        'FEED_URI' : '%(name)s_%(time)s.json',
+        'FEED_FORMAT': 'jsonlines'
+    }
+
+    # One analyzer shared by every comment: NaiveBayesAnalyzer trains its
+    # classifier on first use, so building a new one per page repeats that
+    # work for every comment.
+    _analyzer = NaiveBayesAnalyzer()
+
+    def parse(self, response):
+        """Request the comment page linked from every row of the comment table."""
+        rows = response.css('#contentwide>table>tr')
+        # cut out the header row
+        for each in rows[1:]:
+            cols = each.xpath('.//td')
+            if len(cols) < 2:
+                # Row without title and commenter cells: nothing to follow.
+                continue
+            linkfollow = cols[0].css('a::attr(href)').get()
+            if not linkfollow:
+                # No link in the first cell, so there is no page to request.
+                continue
+            comment_title = cols[0].xpath('a/text()').get()
+            commenter = cols[1].xpath('text()').get()
+            print(f'{comment_title}  |  {commenter}')
+            yield response.follow(linkfollow, callback = self.parse_comment)
+
+    def parse_comment(self, response):
+        """Yield a ``CommentItem`` with the comment text and its sentiment scores.
+
+        Nothing is yielded when the page has no ``.divComment>p`` text.
+        """
+        text = response.css('.divComment>p::text').get()
+        if text is None:
+            # get() returns None when the selector matches nothing.
+            self.logger.warning('no comment text found at %s', response.url)
+            return
+        # Replace non-breaking spaces with plain spaces.
+        text = text.replace(u'\u00a0',' ')
+        sentiment = TextBlob(text, analyzer=self._analyzer).sentiment
+        entry = CommentItem()
+        entry['comment'] = text
+        entry['sentiment'] = sentiment.classification
+        entry['sentiment_pos'] = sentiment.p_pos
+        entry['sentiment_neg'] = sentiment.p_neg
+        yield entry
--- a/docs/townhall2.py
+++ b/docs/townhall2.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from items import CommentItem
+import textblob
+from textblob import TextBlob
+from textblob.sentiments import NaiveBayesAnalyzer
+
+class TownhallSpider(scrapy.Spider):
+    """Scrape the comments of every forum linked from the Town Hall forum index.
+
+    ``parse`` follows the forum links on the index page, ``parse_forum``
+    follows the comment link in each row of a forum's comment table, and
+    ``parse_comment`` yields one ``CommentItem`` per comment page holding
+    the text and its TextBlob Naive Bayes sentiment.
+    """
+    name = 'townhall'
+    allowed_domains = ['townhall.virginia.gov']
+    start_urls = ['https://www.townhall.virginia.gov/L/Forums.cfm']
+    custom_settings = {
+        'FEED_EXPORTERS' : {
+            "jsonlines": "scrapy.exporters.JsonLinesItemExporter",
+        },
+        'FEED_URI' : '%(name)s_%(time)s.json',
+        'FEED_FORMAT': 'jsonlines'
+    }
+
+    # One analyzer shared by every comment: NaiveBayesAnalyzer trains its
+    # classifier on first use, so building a new one per page repeats that
+    # work for every comment.
+    _analyzer = NaiveBayesAnalyzer()
+
+    def parse(self, response):
+        """Request every forum page whose link contains ``comments``."""
+        for cell in response.css('table>tr>td'):
+            # First href in the cell; get() returns None for cells that
+            # contain no link, which are simply skipped.
+            linkfollow = cell.css('a::attr(href)').get()
+            if linkfollow and 'comments' in linkfollow:
+                yield response.follow(linkfollow, callback = self.parse_forum)
+
+    def parse_forum(self, response):
+        """Request the comment page linked from every row of the comment table."""
+        rows = response.css('#contentwide>table>tr')
+        # cut out the header row
+        for each in rows[1:]:
+            cols = each.xpath('.//td')
+            if len(cols) < 2:
+                # Row without title and commenter cells: nothing to follow.
+                continue
+            linkfollow = cols[0].css('a::attr(href)').get()
+            if not linkfollow:
+                # No link in the first cell, so there is no page to request.
+                continue
+            comment_title = cols[0].xpath('a/text()').get()
+            commenter = cols[1].xpath('text()').get()
+            print(f'{comment_title}  |  {commenter}')
+            yield response.follow(linkfollow, callback = self.parse_comment)
+
+    def parse_comment(self, response):
+        """Yield a ``CommentItem`` with the comment text and its sentiment scores.
+
+        Nothing is yielded when the page has no ``.divComment>p`` text.
+        """
+        text = response.css('.divComment>p::text').get()
+        if text is None:
+            # get() returns None when the selector matches nothing.
+            self.logger.warning('no comment text found at %s', response.url)
+            return
+        # Replace non-breaking spaces with plain spaces.
+        text = text.replace(u'\u00a0',' ')
+        sentiment = TextBlob(text, analyzer=self._analyzer).sentiment
+        entry = CommentItem()
+        entry['comment'] = text
+        entry['sentiment'] = sentiment.classification
+        entry['sentiment_pos'] = sentiment.p_pos
+        entry['sentiment_neg'] = sentiment.p_neg
+        yield entry