import streamlit as st
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import requests
from bs4 import BeautifulSoup
import pandas as pd
import altair as alt
from collections import OrderedDict
import nltk
from nltk.tokenize import sent_tokenize

# Ensure the Punkt models needed by sent_tokenize are available.
nltk.download('punkt')

# Load the fine-tuned ALBERT checkpoint and its tokenizer from a local path.
model_name = 'C:/projects/sentiment/albert_sentiment_model/checkpoint-3000'
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.eval()  # inference only: disables dropout
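
# Note: the checkpoint path above is local. To reproduce without it, any
# sequence-classification checkpoint with seven labels in the same order
# could be substituted as model_name.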

sentiment_labels = {
    0: "very positive",
    1: "positive",
    2: "somewhat positive",
    3: "neutral",
    4: "somewhat negative",
    5: "negative",
    6: "very negative"
}

background_colors = {
    "very positive": "rgba(0, 255, 0, 0.5)",
    "positive": "rgba(0, 255, 0, 0.3)",
    "somewhat positive": "rgba(0, 255, 0, 0.1)",
    "neutral": "rgba(128, 128, 128, 0.1)",
    "somewhat negative": "rgba(255, 0, 0, 0.1)",
    "negative": "rgba(255, 0, 0, 0.3)",
    "very negative": "rgba(255, 0, 0, 0.5)"
}

def get_text_from_url(url):
    """Fetch a page and return the concatenated text of its <p> tags."""
    response = requests.get(url, timeout=10)  # avoid hanging the app on a dead URL
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        paragraphs = soup.find_all('p')
        return ' '.join(p.get_text() for p in paragraphs)
    return ""

def classify_text(text, max_length):
    """Run one forward pass and return the softmax scores as a list."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()
    return scores
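
# The result is a 7-element probability distribution indexed per
# sentiment_labels, e.g. (illustrative numbers only)
# [0.62, 0.21, 0.08, 0.05, 0.02, 0.01, 0.01] for a clearly positive sentence.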

def classify_long_text(text):
    """Split long text into chunks, classify each, and average the scores."""
    max_length = tokenizer.model_max_length
    # Chunking is by characters, not tokens; a token covers at least one
    # character, so each chunk stays near or under the model's token window,
    # and truncation=True in classify_text is the backstop.
    chunks = [text[i:i + max_length] for i in range(0, len(text), max_length)]
    aggregate_scores = [0] * len(sentiment_labels)
    chunk_scores_list = []
    for chunk in chunks:
        chunk_scores = classify_text(chunk, max_length)
        chunk_scores_list.append(chunk_scores)
        aggregate_scores = [x + y for x, y in zip(aggregate_scores, chunk_scores)]
    # Average the per-chunk distributions into one score for the whole text.
    aggregate_scores = [x / len(chunks) for x in aggregate_scores]
    return aggregate_scores, chunk_scores_list, chunks
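
# A token-accurate alternative would slice the encoded ids instead of raw
# characters. A minimal sketch, not wired into the app (the function name is
# hypothetical, and decode/re-encode round-trips are only approximate):
def classify_long_text_by_tokens(text):
    window = tokenizer.model_max_length - 2  # leave room for [CLS]/[SEP]
    ids = tokenizer.encode(text, add_special_tokens=False)
    pieces = [tokenizer.decode(ids[i:i + window]) for i in range(0, len(ids), window)]
    per_piece = [classify_text(piece, tokenizer.model_max_length) for piece in pieces]
    mean = [sum(col) / len(per_piece) for col in zip(*per_piece)]
    return mean, per_piece, pieces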

def classify_sentences(text):
    """Classify each sentence and return (sentence, predicted label) pairs."""
    sentences = sent_tokenize(text)
    sentence_scores = []
    for sentence in sentences:
        scores = classify_text(sentence, tokenizer.model_max_length)
        sentiment_idx = scores.index(max(scores))  # argmax over the 7 classes
        sentiment = sentiment_labels[sentiment_idx]
        sentence_scores.append((sentence, sentiment))
    return sentence_scores
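
# This runs one forward pass per sentence. For long pages it would be faster
# to batch, e.g. (sketch) one tokenizer(sentences, return_tensors="pt",
# truncation=True, padding=True) call, one model(**inputs) call, then an
# argmax over each row of the softmaxed logits.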

# --- Streamlit UI ---
st.title("Sentiment Classification from URL")

url = st.text_input("Enter URL:")
if url:
    text = get_text_from_url(url)
    if text:
        # Overall sentiment distribution for the whole page.
        scores, chunk_scores_list, chunks = classify_long_text(text)
        scores_dict = {sentiment_labels[i]: scores[i] for i in range(len(sentiment_labels))}

        sentiment_order = [
            "very positive", "positive", "somewhat positive",
            "neutral",
            "somewhat negative", "negative", "very negative"
        ]
        ordered_scores_dict = OrderedDict((label, scores_dict[label]) for label in sentiment_order)

        df = pd.DataFrame.from_dict(ordered_scores_dict, orient='index', columns=['Likelihood']).reindex(sentiment_order)

        chart = alt.Chart(df.reset_index()).mark_bar().encode(
            x=alt.X('index', sort=sentiment_order, title='Sentiment'),
            y='Likelihood'
        ).properties(
            width=600,
            height=400
        )

        st.altair_chart(chart, use_container_width=True)

        # Per-chunk breakdown: show each chunk alongside its own distribution.
        for i, (chunk_scores, chunk) in enumerate(zip(chunk_scores_list, chunks)):
            chunk_scores_dict = {sentiment_labels[j]: chunk_scores[j] for j in range(len(sentiment_labels))}
            ordered_chunk_scores_dict = OrderedDict((label, chunk_scores_dict[label]) for label in sentiment_order)
            df_chunk = pd.DataFrame.from_dict(ordered_chunk_scores_dict, orient='index', columns=['Likelihood']).reindex(sentiment_order)

            chunk_chart = alt.Chart(df_chunk.reset_index()).mark_bar().encode(
                x=alt.X('index', sort=sentiment_order, title='Sentiment'),
                y='Likelihood'
            ).properties(
                width=600,
                height=400
            )

            st.write(f"Chunk {i + 1}:")
            st.write(chunk)
            st.altair_chart(chunk_chart, use_container_width=True)

        # Sentence-level view: color each sentence by its predicted sentiment.
        st.write("Extracted Text with Sentiment Highlights:")
        sentence_scores = classify_sentences(text)
        for sentence, sentiment in sentence_scores:
            bg_color = background_colors[sentiment]
            st.markdown(f'<span style="background-color: {bg_color}">{sentence}</span>', unsafe_allow_html=True)

    else:
        st.write("Could not extract text from the provided URL.")