import html
from collections import OrderedDict

import altair as alt
import nltk
import pandas as pd
import requests
import streamlit as st
import torch
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Checkpoint identifier passed to both `from_pretrained` loaders below.
model_name = 'dejanseo/sentiment'
# Module-level model and tokenizer, used by classify_text() and friends.
# They are created when this script is executed, before any UI code runs.
# NOTE(review): Streamlit presumably re-executes the script on every
# interaction, which would repeat this load each time -- consider caching
# the pair; confirm against the Streamlit version in use.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Maps a class index (position in the score list returned by the classifier)
# to its human-readable sentiment label, ordered from most positive (0) to
# most negative (6).
sentiment_labels = {
    0: "very positive",
    1: "positive",
    2: "somewhat positive",
    3: "neutral",
    4: "somewhat negative",
    5: "negative",
    6: "very negative",
}
# CSS background colour used to highlight a sentence, keyed by sentiment
# label: green shades for positive labels, grey for neutral, red shades for
# negative labels, with opacity growing with intensity.
background_colors = {
    "very positive": "rgba(0, 255, 0, 0.5)",
    "positive": "rgba(0, 255, 0, 0.3)",
    "somewhat positive": "rgba(0, 255, 0, 0.1)",
    "neutral": "rgba(128, 128, 128, 0.1)",
    "somewhat negative": "rgba(255, 0, 0, 0.1)",
    "negative": "rgba(255, 0, 0, 0.3)",
    "very negative": "rgba(255, 0, 0, 0.5)",
}
def get_text_from_url(url, timeout=10):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled host cannot block the app indefinitely.

    Returns:
        The text of every ``<p>`` element joined with single spaces, or an
        empty string when the request fails or the response status is not
        200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Malformed URL, DNS/connection failure, timeout, ...: report
        # "no text" the same way a non-200 response does, so the caller's
        # empty-string check handles every failure mode.
        return ""
    if response.status_code != 200:
        return ""
    soup = BeautifulSoup(response.content, 'html.parser')
    return ' '.join(p.get_text() for p in soup.find_all('p'))
def classify_text(text, max_length):
    """Score one piece of text against every sentiment class.

    Args:
        text: The text to classify.
        max_length: Token limit handed to the tokenizer; longer input is
            truncated.

    Returns:
        A list of floats: the softmax of the model's logits for the first
        (only) sequence in the batch.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        logits = model(**encoded).logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    return probabilities[0].tolist()
def classify_long_text(text):
    """Classify text of any length by averaging per-chunk scores.

    The text is cut into consecutive slices of ``tokenizer.model_max_length``
    characters; each slice is scored with ``classify_text`` and the scores
    are averaged label by label.

    Args:
        text: The text to classify.

    Returns:
        A tuple ``(aggregate_scores, chunk_scores_list, chunks)`` where
        ``aggregate_scores`` is the mean score per label,
        ``chunk_scores_list`` holds the score list of each chunk, and
        ``chunks`` holds the chunk strings in the same order. For empty
        text the aggregate is all zeros and both lists are empty.
    """
    max_length = tokenizer.model_max_length
    # NOTE(review): chunks are sliced by characters, while max_length is then
    # passed to the tokenizer as a token limit -- confirm this is intended.
    chunks = [text[i:i + max_length] for i in range(0, len(text), max_length)]
    aggregate_scores = [0.0] * len(sentiment_labels)
    chunk_scores_list = []
    for chunk in chunks:
        chunk_scores = classify_text(chunk, max_length)
        # Keep the per-chunk result; callers zip this list with `chunks`.
        chunk_scores_list.append(chunk_scores)
        aggregate_scores = [x + y for x, y in zip(aggregate_scores, chunk_scores)]
    if chunks:
        # Guarded so that empty text does not divide by zero.
        aggregate_scores = [x / len(chunks) for x in aggregate_scores]
    return aggregate_scores, chunk_scores_list, chunks
def classify_sentences(text):
    """Label every sentence of ``text`` with its highest-scoring sentiment.

    Args:
        text: The text to split into sentences and classify.

    Returns:
        A list of ``(sentence, label)`` tuples in sentence order, where
        ``label`` is the ``sentiment_labels`` entry of the top score.
    """
    labelled = []
    for piece in sent_tokenize(text):
        probabilities = classify_text(piece, tokenizer.model_max_length)
        # First index holding the maximum score.
        top_index = probabilities.index(max(probabilities))
        labelled.append((piece, sentiment_labels[top_index]))
    return labelled
# Page heading followed by the single input widget. `url` is tested for
# truthiness further down before any fetching or classification is done.
st.title("Sentiment Classification from URL")
url = st.text_input("Enter URL:")
Multi-label sentiment classification model developed by [Dejan Marketing](https://dejanmarketing.com/).
The model is designed to be deployed in an automated pipeline capable of classifying text sentiment for thousands (or even millions) of text chunks or as a part of a scraping pipeline.
This is a demo model which may occasionally misclassify some texts. In a typical commercial project, a larger model is deployed for the task, and in special cases, a domain-specific model is developed for the client.
# Engage Our Team
Interested in using this in an automated pipeline for bulk query processing?
Please [book an appointment](https://dejanmarketing.com/conference/) to discuss your needs.
# Left-to-right order of the bars in every chart: most positive to most
# negative. Covers all seven labels of `sentiment_labels`, including
# "neutral", so the plotted likelihoods account for every class.
sentiment_order = [
    "very positive", "positive", "somewhat positive",
    "neutral",
    "somewhat negative", "negative", "very negative",
]


def _likelihood_chart(label_scores):
    """Build a bar chart of likelihood per sentiment label.

    Args:
        label_scores: Sequence of scores indexed like ``sentiment_labels``.

    Returns:
        An Altair bar chart with one bar per entry of ``sentiment_order``,
        in that order.
    """
    by_label = {
        sentiment_labels[i]: label_scores[i]
        for i in range(len(sentiment_labels))
    }
    df = pd.DataFrame({
        'index': sentiment_order,
        'Likelihood': [by_label[label] for label in sentiment_order],
    })
    return alt.Chart(df).mark_bar().encode(
        x=alt.X('index', sort=sentiment_order, title='Sentiment'),
        y=alt.Y('Likelihood'),
    )


if url:
    text = get_text_from_url(url)
    if text:
        scores, chunk_scores_list, chunks = classify_long_text(text)

        # Overall sentiment distribution for the whole page.
        st.altair_chart(_likelihood_chart(scores), use_container_width=True)

        # One chart per chunk that classify_long_text scored.
        for number, chunk_scores in enumerate(chunk_scores_list, start=1):
            st.write(f"Chunk {number}:")
            st.altair_chart(_likelihood_chart(chunk_scores), use_container_width=True)

        # Sentence-level view: each sentence highlighted by its sentiment.
        st.write("Extracted Text with Sentiment Highlights:")
        for sentence, sentiment in classify_sentences(text):
            bg_color = background_colors[sentiment]
            # The sentence comes from an arbitrary web page and is rendered
            # with unsafe_allow_html=True, so escape it to keep scraped
            # markup from being injected into the app's page.
            safe_sentence = html.escape(sentence)
            st.markdown(
                f'<span style="background-color: {bg_color}">{safe_sentence}</span>',
                unsafe_allow_html=True,
            )
    else:
        st.write("Could not extract text from the provided URL.")
