wikicredibility / talk_page_analysis.py
doantrang982@uni.minerva.edu
change the text split in talk page model
96f2e3d
raw
history blame
2.32 kB
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from bs4.element import Comment
from transformers import pipeline
def tag_visible(element):
    """Tell whether a parsed text node counts as visible page text.

    A node is rejected when its parent tag is one of the containers listed
    in ``hidden_parents`` or when the node itself is an HTML comment; every
    other node is accepted.

    Args:
        element: A text node exposing ``.parent.name``.

    Returns:
        ``False`` for nodes under a listed parent tag or for comments,
        ``True`` otherwise.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    under_hidden_parent = element.parent.name in hidden_parents
    # The parent check comes first, so the comment test only runs for nodes
    # that passed it.
    return not (under_hidden_parent or isinstance(element, Comment))
def getTalkPage(wiki_page, timeout=10):
    """Fetch a Wikipedia article and return the visible text of its talk page.

    The article HTML is searched for an ``<a rel="discussion">`` link; that
    link is resolved against the article's own URL, the talk page is
    downloaded, and every text node accepted by ``tag_visible`` is stripped
    and joined with single spaces.

    Args:
        wiki_page: URL of the article. Anything that does not contain
            ``"wikipedia.org"`` is ignored.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The talk page's visible text as one string, or ``None`` when the URL
        is not a Wikipedia address, the article has no discussion link, or
        fetching/parsing the talk page fails (the error is printed).

    Raises:
        Errors raised while fetching the article page itself are not caught
        here and propagate to the caller.
    """
    if "wikipedia.org" not in wiki_page:
        return None

    response = requests.get(wiki_page, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    talk_links = soup.find_all("a", {"rel": "discussion"})
    if not talk_links:
        return None

    talk_href = talk_links[0]["href"]
    try:
        # Resolve against the article URL rather than a fixed English host,
        # so a site-relative href stays on the same language edition as the
        # article that was requested.
        talk_url = urljoin(wiki_page, talk_href)
        talk_response = requests.get(talk_url, timeout=timeout)
        talk_soup = BeautifulSoup(talk_response.content, 'html.parser')
        talk_texts = talk_soup.find_all(string=True)
        visible_texts = filter(tag_visible, talk_texts)
        return u" ".join(t.strip() for t in visible_texts)
    except Exception as error:
        # Best effort: report the problem and signal "no text" to the caller.
        print('Error occured: {}'.format(error))
        return None
# Tone model shared by tone_talkpage. It is built by this top-level statement,
# i.e. once when the module is executed, not on every call.
# NOTE(review): top_k=None presumably makes each call return a score for every
# label rather than only the best one (tone_talkpage iterates over per-label
# results) - confirm against the transformers pipeline documentation.
classifier = pipeline(model="amitkayal/bert-finetuned-sem_eval-english", top_k=None)
def tone_talkpage(url):
    """Estimate the dominant tones of the talk page behind a Wikipedia article.

    The text returned by ``getTalkPage(url)`` is split on whitespace into
    chunks of at most 150 words. Each chunk is passed to the module-level
    ``classifier`` and the per-label scores are averaged over all chunks.

    Args:
        url: Address of the article page; handed unchanged to ``getTalkPage``.

    Returns:
        A list with the three ``(label, mean_score)`` tuples that have the
        highest mean score, best first, or ``None`` when no talk-page text
        (or only whitespace) was obtained.

    NOTE(review): the indentation of the original file was lost, so it is
    assumed here that the final ``return`` belonged to the
    ``if talk_content:`` branch (hence ``None`` when there is no text).
    Confirm against the callers.
    """
    talk_content = getTalkPage(url)
    words = talk_content.split() if talk_content else []
    if not words:
        # Nothing to classify; also avoids dividing by zero chunks below.
        return None

    tone_labels = {'anger': 0, 'anticipation': 0, 'disgust': 0, 'fear': 0, 'joy': 0, 'love': 0, 'optimism': 0, 'pessimism': 0, 'sadness': 0, 'surprise': 0, 'trust': 0}

    n = 150  # words per chunk, because the max amount of sequence length is 512
    # Step over the word list, not the raw string: using the character count
    # here would produce extra empty chunks that are classified and counted
    # in the average.
    chunks = [' '.join(words[i:i + n]) for i in range(0, len(words), n)]

    for chunk in chunks:
        # The first element of the classifier output is iterated as a list
        # of {"label": ..., "score": ...} dicts.
        for tone_res in classifier(chunk)[0]:
            tone_labels[tone_res["label"]] += tone_res["score"]

    chunk_count = len(chunks)
    mean_scores = {label: total / chunk_count for label, total in tone_labels.items()}
    sorted_tones = sorted(mean_scores.items(), key=lambda x: x[1], reverse=True)
    return sorted_tones[:3]