diff --git a/agents.md b/agents.md
index 96531e9..0356fd7 100644
--- a/agents.md
+++ b/agents.md
@@ -5,24 +5,24 @@
 - prefer minimal diffs; avoid refactors unless required for the active task
 
 ## tech stack
-- python; scrapy
+- python; scrapy, pytest
 - file storage: json or csv
 - assume local virtual env is available and accessible
 - do not add new dependencies unless explicitly approved; if unavoidable, document justification in the active task notes
 
 ## workflow
-- prefer direct argv commands (no bash -lc / compound shell chains) unless necessary
-- work on ONE task at a time unless explicitly instructed otherwise
-- at the start of work, state the task id you are executing
-- do not start work unless a task id is specified; if missing, choose the earliest unchecked task and say so
-- propose incremental steps
-- always include basic tests for core logic
-- when you complete a task:
-  - mark it [X] in docs/tasks.md
-  - fill in evidence with commit hash + commands run
-  - never mark complete unless acceptance criteria are met
-  - include date and time (HH:MM)
-
+- prefer direct commands
+- work on ONE task at a time unless explicitly instructed otherwise:
+  - at the start of work, state the task id you are executing
+  - do not start work unless a task id is specified; if missing, choose the earliest unchecked task and say so
+  - propose incremental steps
+  - always include basic tests for core logic
+  - when you complete a task:
+    - mark it [X] in docs/tasks.md
+    - fill in evidence with commit hash + commands run
+    - never mark complete unless acceptance criteria are met
+    - include date and time (HH:MM)
+  - follow this format:
 ```
 * [ ] t1.1 Task Title (1)
   Description and PM notes
diff --git a/docs/tb.py b/docs/tb.py
new file mode 100644
index 0000000..58e35d3
--- /dev/null
+++ b/docs/tb.py
@@ -0,0 +1,128 @@
+'''Ad-hoc analysis helpers for scraped comment dumps stored as JSON Lines.'''
+import jsonlines
+import re
+from itertools import islice
+from textblob import TextBlob
+from collections import Counter
+
+# Compiled once at import: non-greedy match of anything shaped like a tag.
+_HTML_TAG = re.compile('<.*?>')
+
+
+def tprint(obj):
+    '''Print the type of obj followed by obj itself (debugging aid).'''
+    print(f"{type(obj)} : {obj}")
+
+
+def sort_file(file, limit=25):
+    '''Count positive and negative comments in the first records of a file.
+
+    Reads at most `limit` records from the JSON Lines file `file`, echoes
+    each one with tprint, and tallies its 'sentiment' field ('pos' or
+    'neg'; any other value is ignored).  Pass limit=None to read the
+    whole file.  A summary line is printed before returning.
+
+    Returns a (pos, neg) tuple of counts.
+    '''
+    pos = 0
+    neg = 0
+    with jsonlines.open(file, mode='r') as reader:
+        # Confirm type
+        tprint(reader)
+
+        # islice stops cleanly at end of file.  The earlier fixed 25-step
+        # loop called next() unconditionally and raised StopIteration on
+        # files holding fewer than 25 records.
+        for _line in islice(reader, limit):
+            tprint(_line)
+            if _line['sentiment'] == 'pos':
+                pos += 1
+            elif _line['sentiment'] == 'neg':
+                neg += 1
+
+    print(f'{pos} positive and {neg} negative comments')
+    return pos, neg
+
+
+def process_file(file):
+    '''Collect the posts written by Smythers.
+
+    For every record in the JSON Lines file `file`, the first element of
+    'author' is compared with 'Smythers'; on a match the first element of
+    'content' is kept.  Records missing either key, or holding an empty
+    sequence there, are skipped.
+
+    Returns the list of kept 'content' entries, in file order.
+    '''
+    posts = []
+    with jsonlines.open(file, mode='r') as reader:
+        for item in reader:
+            try:
+                if item['author'][0] == 'Smythers':
+                    posts.append(item['content'][0])
+            except (KeyError, IndexError):
+                # Incomplete record; previously an empty 'author' or
+                # 'content' list aborted the whole run with IndexError.
+                continue
+    return posts
+
+
+def write_file(file, data:object):
+    '''Write each element of data to file as one JSON Lines record.'''
+    with jsonlines.open(file, mode='w') as writer:
+        writer.write_all(data)
+    print('write successful')
+
+
+def clean_text(text:str):
+    '''Return text with HTML tags, then URLs, stripped out.'''
+    return remove_http(remove_html(text))
+
+
+def remove_html(text:str):
+    '''Remove html tags from string'''
+    return _HTML_TAG.sub('', text)
+
+
+def remove_http(text:str):
+    '''Remove URLs from string'''
+    return re.sub(r'http\S+','', text)
+
+
+def get_nouns(text:str):
+    '''Return the part-of-speech tags TextBlob assigns to text.
+
+    NOTE: despite the name, every (word, tag) pair is returned, not only
+    the nouns, and main() below counts all of them.
+    '''
+    blob = TextBlob(text)
+    # check nouns? or no
+    return blob.tags
+
+
+vadoe = '/vadoe/vadoe/vadoe/townhall_2021-01-14T02-05-51.json'
+vadoe_p = '/vadoe/vadoe/vadoe/townhall_2021-01-14T05-11-55.json'
+dlr = '/vadoe/vadoe/vadoe/dlr.json'
+
+smythers_pc = '/vadoe/vadoe/vadoe/smythers.json'
+write_to = '/vadoe/vadoe/vadoe/nouns.json'
+
+
+def main():
+    '''Tally tagged words in the Smythers posts and save the frequent ones.
+
+    Reads posts from `dlr`, strips tags and URLs, counts every
+    (word, tag) pair from get_nouns, then prints and writes to `write_to`
+    the (pair, count) entries seen more than twice.
+    '''
+    smythers_posts = process_file(dlr)
+    cleaned = [clean_text(each) for each in smythers_posts]
+    counts = Counter(tag for post in cleaned for tag in get_nouns(post))
+    nouns = [(k, v) for k, v in counts.items() if v > 2]
+    print(nouns)
+    write_file(write_to, nouns)
+
+
+# Guarded so that importing these helpers does not touch the data files.
+if __name__ == '__main__':
+    main()
diff --git a/docs/townhall.py b/docs/townhall.py
new file mode 100644
index 0000000..a7b92eb
--- /dev/null
+++ b/docs/townhall.py
@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from items import CommentItem
+import textblob
+from textblob import TextBlob
+from textblob.sentiments import NaiveBayesAnalyzer
+
+# One shared analyzer: each NaiveBayesAnalyzer instance trains itself on
+# first use, so building a new one per comment repeats that work every time.
+_ANALYZER = NaiveBayesAnalyzer()
+
+
+class TownhallSpider(scrapy.Spider):
+    """Scrape the comments of one Town Hall forum and score their sentiment.
+
+    parse() walks the comment table of the start URL and follows each
+    comment link; parse_comment() yields one CommentItem per comment page.
+    """
+
+    name = 'townhall'
+    allowed_domains = ['townhall.virginia.gov']
+    start_urls = ['https://www.townhall.virginia.gov/L/comments.cfm?GDocForumID=452']
+    custom_settings = {
+        'FEED_EXPORTERS' : {
+            "jsonlines": "scrapy.exporters.JsonLinesItemExporter",
+        },
+        'FEED_URI' : '%(name)s_%(time)s.json',
+        'FEED_FORMAT': 'jsonlines'
+    }
+
+    def parse(self, response):
+        """Follow the link in the first cell of every comment-table row.
+
+        Rows with fewer than two cells, or without a link in the first
+        cell, are skipped instead of raising.
+        """
+        rows = response.css('#contentwide>table>tr')
+        # cut out the header row
+        for each in rows[1:]:
+            cols = each.xpath('.//td')
+            if len(cols) < 2:
+                continue
+            linkfollow = cols[0].css('a::attr(href)').get()
+            if linkfollow is None:
+                # response.follow() cannot build a request without a URL.
+                continue
+            comment_title = cols[0].xpath('a/text()').get()
+            commenter = cols[1].xpath('text()').get()
+            print(f'{comment_title} | {commenter}')
+            yield response.follow(linkfollow, callback = self.parse_comment)
+
+    def parse_comment(self, response):
+        """Yield a CommentItem with the comment text and its sentiment.
+
+        Only the first text node matched by '.divComment>p::text' is used.
+        Pages where that selector matches nothing are logged and skipped.
+        """
+        text = response.css('.divComment>p::text').get()
+        if text is None:
+            # .get() returns None on no match; calling .replace on it
+            # used to raise AttributeError.
+            self.logger.warning('no comment text found at %s', response.url)
+            return
+        # Replace non-breaking spaces with plain spaces.
+        text = text.replace(u'\u00a0',' ')
+        sentiment = TextBlob(text, analyzer=_ANALYZER).sentiment
+        entry = CommentItem()
+        entry['comment'] = text
+        entry['sentiment'] = sentiment.classification
+        entry['sentiment_pos'] = sentiment.p_pos
+        entry['sentiment_neg'] = sentiment.p_neg
+        yield entry
diff --git a/docs/townhall2.py b/docs/townhall2.py
new file mode 100644
index 0000000..638068c
--- /dev/null
+++ b/docs/townhall2.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from items import CommentItem
+import textblob
+from textblob import TextBlob
+from textblob.sentiments import NaiveBayesAnalyzer
+
+# One shared analyzer: each NaiveBayesAnalyzer instance trains itself on
+# first use, so building a new one per comment repeats that work every time.
+_ANALYZER = NaiveBayesAnalyzer()
+
+
+class TownhallSpider(scrapy.Spider):
+    """Scrape the comments of every Town Hall forum and score their sentiment.
+
+    parse() finds forum links on the index page, parse_forum() follows the
+    comment links of one forum, and parse_comment() yields one CommentItem
+    per comment page.
+
+    NOTE(review): uses the same spider name, 'townhall', as
+    docs/townhall.py -- confirm both are never loaded by one project.
+    """
+
+    name = 'townhall'
+    allowed_domains = ['townhall.virginia.gov']
+    start_urls = ['https://www.townhall.virginia.gov/L/Forums.cfm']
+    custom_settings = {
+        'FEED_EXPORTERS' : {
+            "jsonlines": "scrapy.exporters.JsonLinesItemExporter",
+        },
+        'FEED_URI' : '%(name)s_%(time)s.json',
+        'FEED_FORMAT': 'jsonlines'
+    }
+
+    def parse(self, response):
+        """Follow every link in a table cell whose href contains 'comments'."""
+        # The earlier version read .attrib['href'] of the first anchor in
+        # each cell, which raises KeyError for cells without a link, and
+        # then ran row-parsing code copied from parse_forum against a
+        # single cell.  Only the forum-link discovery is kept.
+        for linkfollow in response.css('table>tr>td a::attr(href)').getall():
+            if 'comments' in linkfollow:
+                yield response.follow(linkfollow, callback = self.parse_forum)
+
+    def parse_forum(self, response):
+        """Follow the link in the first cell of every comment-table row.
+
+        Rows with fewer than two cells, or without a link in the first
+        cell, are skipped instead of raising.
+        """
+        rows = response.css('#contentwide>table>tr')
+        # cut out the header row
+        for each in rows[1:]:
+            cols = each.xpath('.//td')
+            if len(cols) < 2:
+                continue
+            linkfollow = cols[0].css('a::attr(href)').get()
+            if linkfollow is None:
+                # response.follow() cannot build a request without a URL.
+                continue
+            comment_title = cols[0].xpath('a/text()').get()
+            commenter = cols[1].xpath('text()').get()
+            print(f'{comment_title} | {commenter}')
+            yield response.follow(linkfollow, callback = self.parse_comment)
+
+    def parse_comment(self, response):
+        """Yield a CommentItem with the comment text and its sentiment.
+
+        Only the first text node matched by '.divComment>p::text' is used.
+        Pages where that selector matches nothing are logged and skipped.
+        """
+        text = response.css('.divComment>p::text').get()
+        if text is None:
+            # .get() returns None on no match; calling .replace on it
+            # used to raise AttributeError.
+            self.logger.warning('no comment text found at %s', response.url)
+            return
+        # Replace non-breaking spaces with plain spaces.
+        text = text.replace(u'\u00a0',' ')
+        sentiment = TextBlob(text, analyzer=_ANALYZER).sentiment
+        entry = CommentItem()
+        entry['comment'] = text
+        entry['sentiment'] = sentiment.classification
+        entry['sentiment_pos'] = sentiment.p_pos
+        entry['sentiment_neg'] = sentiment.p_neg
+        yield entry