# app.py — Streamlit ESG report classifier (Hugging Face Space by shrut27)
import streamlit as st
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import spacy
from tika import parser
import requests
import pandas as pd
# Load the spaCy English pipeline once at module import (outside any
# Streamlit cache) — used below for sentence segmentation of the PDF text.
nlp = spacy.load("en_core_web_sm")
@st.cache_resource()
def load_environmental_model():
    """Build and cache a text-classification pipeline for the
    ESGBERT environmental model.

    Returns:
        A transformers ``pipeline`` that labels sentences as
        environmental / non-environmental.
    """
    checkpoint = "ESGBERT/EnvironmentalBERT-environmental"
    classifier = pipeline(
        "text-classification",
        model=AutoModelForSequenceClassification.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
    )
    return classifier
@st.cache_resource()
def load_social_model():
    """Build and cache a text-classification pipeline for the
    ESGBERT social model.

    Returns:
        A transformers ``pipeline`` that labels sentences as
        social / non-social.
    """
    checkpoint = "ESGBERT/SocialBERT-social"
    classifier = pipeline(
        "text-classification",
        model=AutoModelForSequenceClassification.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
    )
    return classifier
@st.cache_resource()
def load_governance_model():
    """Build and cache a text-classification pipeline for the
    ESGBERT governance model.

    Returns:
        A transformers ``pipeline`` that labels sentences as
        governance / non-governance.
    """
    checkpoint = "ESGBERT/GovernanceBERT-governance"
    classifier = pipeline(
        "text-classification",
        model=AutoModelForSequenceClassification.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
    )
    return classifier
@st.cache_resource()
def load_sentiment_model():
    """Build and cache a text-classification pipeline for the
    ClimateBERT climate-sentiment model (risk / neutral / opportunity).

    Returns:
        A transformers ``pipeline`` for climate-sentiment classification.
    """
    model_name = "climatebert/distilroberta-base-climate-sentiment"
    # FIX: the original passed max_len=512, a long-deprecated kwarg that
    # modern tokenizers ignore; model_max_length is the supported name and
    # actually caps sequences at 512 tokens when truncation is enabled.
    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return pipeline("text-classification", model=model, tokenizer=tokenizer)
# --- Streamlit App ----------------------------------------------------------
st.title("ESG Report Classification using Natural Language Processing")

# Get report URL from user input
url = st.text_input("Enter the URL of the report (PDF):")

# Explain what each model reports, then let the user pick one.
st.write("Environmental Model, Social Model, Governance Model would give the percentage denoting the parameter chosen.")
st.write("Sentiment Model shows if the company is a risk or opportunity based on all 3 parameters.")
selected_model = st.selectbox("Select Model", ["Environmental Model", "Social Model", "Governance Model", "Sentiment Model"])

if url:
    # Download PDF content from the URL. FIX: add a timeout so a dead host
    # cannot hang the Streamlit session indefinitely.
    response = requests.get(url, stream=True, timeout=30)
    if response.status_code == 200:
        # Parse PDF and extract text. FIX: tika returns None for 'content'
        # on image-only/empty PDFs, which would crash nlp(None).
        raw_text = parser.from_buffer(response.content)['content'] or ""

        # Sentence segmentation via spaCy.
        doc = nlp(raw_text)
        sentences = [sent.text for sent in doc.sents]

        # Preprocess: strip newlines, drop empties, and keep only sentences
        # starting with an uppercase letter (heuristic that filters out
        # PDF-extraction fragments such as headers and list remnants).
        sentences = [s.replace("\n", "") for s in sentences]
        sentences = [s for s in sentences if s]
        sentences = [s for s in sentences if s[0].isupper()]
        # Cap inference cost: classify at most the first 100 sentences.
        sub_sentences = sentences[:100]

        # Pick the classification pipeline for the user's selection;
        # the sentiment model is the fallback choice.
        model_loaders = {
            "Environmental Model": load_environmental_model,
            "Social Model": load_social_model,
            "Governance Model": load_governance_model,
        }
        pipe_model = model_loaders.get(selected_model, load_sentiment_model)()

        if sub_sentences:
            # Get predictions for the selected model.
            model_results = pipe_model(sub_sentences, padding=True, truncation=True)
            model_labels = [x["label"] for x in model_results]

            # Display count of sentences per predicted label.
            st.subheader(f"{selected_model} Sentences Count")
            st.write(pd.DataFrame({"sentence": sub_sentences, selected_model: model_labels}).groupby(selected_model).count())
        else:
            # FIX: avoid calling the pipeline on an empty list.
            st.warning("No usable sentences were extracted from the PDF.")
    else:
        # FIX: this error belongs to the HTTP-status check, not the empty-URL
        # case — otherwise it would display before the user enters anything.
        st.error("Error fetching PDF content from the provided URL. Please check the URL and try again.")