shrut27's picture
Update app.py
2083211 verified
raw
history blame
No virus
3.79 kB
import streamlit as st
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import spacy
from tika import parser
import requests
import pandas as pd
# Loading spaCy model outside the streamlit cache
# (module-level so it is loaded once per process; used only for sentence splitting)
nlp = spacy.load("en_core_web_sm")
@st.cache_resource()
def load_environmental_model():
    """Build and cache a text-classification pipeline for ESGBERT's environmental model."""
    checkpoint = "ESGBERT/EnvironmentalBERT-environmental"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    mdl = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return pipeline("text-classification", model=mdl, tokenizer=tok)
@st.cache_resource()
def load_social_model():
    """Build and cache a text-classification pipeline for ESGBERT's social model."""
    checkpoint = "ESGBERT/SocialBERT-social"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    mdl = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return pipeline("text-classification", model=mdl, tokenizer=tok)
@st.cache_resource()
def load_governance_model():
    """Build and cache a text-classification pipeline for ESGBERT's governance model."""
    checkpoint = "ESGBERT/GovernanceBERT-governance"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    mdl = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return pipeline("text-classification", model=mdl, tokenizer=tok)
@st.cache_resource()
def load_sentiment_model():
    """Build and cache the ClimateBERT sentiment pipeline (risk vs. opportunity)."""
    model_name = "climatebert/distilroberta-base-climate-sentiment"
    # `model_max_length` is the supported kwarg for capping sequence length;
    # the original used the deprecated `max_len` alias, which recent
    # transformers releases ignore, so truncation relied on the model default.
    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return pipeline("text-classification", model=model, tokenizer=tokenizer)
# ---------------------------------------------------------------- Streamlit UI
st.title("ESG Report Classification using Natural Language Processing")

# Get report URL from user input
url = st.text_input("Enter the URL of the report (PDF):")

# Model selection dropdown
st.write("Environmental Model, Social Model, Governance Model would give the percentage denoting the parameter chosen.")
st.write("Sentiment Model shows if the company is a risk or opportunity based on all 3 parameters.")
selected_model = st.selectbox("Select Model", ["Environmental Model", "Social Model", "Governance Model", "Sentiment Model"])

if url:
    # Download the PDF. A timeout (and catching RequestException) keeps an
    # unreachable or slow host from crashing the app with an unhandled error.
    try:
        response = requests.get(url, stream=True, timeout=30)
    except requests.RequestException:
        response = None

    if response is not None and response.status_code == 200:
        # Extract the text layer from the PDF via Apache Tika. `content` is
        # None for image-only/empty PDFs, so guard before handing it to spaCy.
        raw_text = parser.from_buffer(response.content).get("content")
        if not raw_text:
            st.error("No extractable text found in the PDF. The document may be image-only.")
        else:
            # Sentence-split with spaCy, then clean: strip newlines, drop
            # empties, and keep only sentences starting with a capital letter
            # (filters headers/footers and mid-sentence fragments).
            doc = nlp(raw_text)
            cleaned = [sent.text.replace("\n", "") for sent in doc.sents]
            cleaned = [s for s in cleaned if s and s[0].isupper()]
            # Cap the workload so the app stays responsive on long reports.
            sub_sentences = cleaned[:100]

            # Dispatch to the cached loader for the chosen model.
            loaders = {
                "Environmental Model": load_environmental_model,
                "Social Model": load_social_model,
                "Governance Model": load_governance_model,
                "Sentiment Model": load_sentiment_model,
            }
            pipe_model = loaders[selected_model]()

            # Classify each sentence and tabulate label counts.
            model_results = pipe_model(sub_sentences, padding=True, truncation=True)
            model_labels = [result["label"] for result in model_results]

            st.subheader(f"{selected_model} Sentences Count")
            st.write(
                pd.DataFrame({"sentence": sub_sentences, selected_model: model_labels})
                .groupby(selected_model)
                .count()
            )
    else:
        st.error("Error fetching PDF content from the provided URL. Please check the URL and try again.")