shrut27's picture
Update app.py
2083211 verified
raw
history blame
No virus
3.79 kB
import streamlit as st
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import spacy
from tika import parser
import requests
import pandas as pd
# Loading spaCy model outside the streamlit cache
# (module-level so it is loaded once per process; used only for sentence splitting)
nlp = spacy.load("en_core_web_sm")
@st.cache_resource()
def load_environmental_model():
    """Build and cache a text-classification pipeline for ESGBERT's environmental model."""
    checkpoint = "ESGBERT/EnvironmentalBERT-environmental"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    mdl = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return pipeline("text-classification", model=mdl, tokenizer=tok)
@st.cache_resource()
def load_social_model():
    """Build and cache a text-classification pipeline for ESGBERT's social model."""
    checkpoint = "ESGBERT/SocialBERT-social"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    mdl = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return pipeline("text-classification", model=mdl, tokenizer=tok)
@st.cache_resource()
def load_governance_model():
    """Build and cache a text-classification pipeline for ESGBERT's governance model."""
    checkpoint = "ESGBERT/GovernanceBERT-governance"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    mdl = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return pipeline("text-classification", model=mdl, tokenizer=tok)
@st.cache_resource()
def load_sentiment_model():
    """Build and cache the ClimateBERT sentiment pipeline (risk vs. opportunity)."""
    model_name = "climatebert/distilroberta-base-climate-sentiment"
    # `model_max_length` is the supported kwarg for capping sequence length;
    # the original used the deprecated `max_len` alias, which recent
    # transformers releases ignore, so truncation relied on the model default.
    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return pipeline("text-classification", model=model, tokenizer=tokenizer)
# ---------------------------------------------------------------- Streamlit UI
st.title("ESG Report Classification using Natural Language Processing")

# Get report URL from user input
url = st.text_input("Enter the URL of the report (PDF):")

# Model selection dropdown
st.write("Environmental Model, Social Model, Governance Model would give the percentage denoting the parameter chosen.")
st.write("Sentiment Model shows if the company is a risk or opportunity based on all 3 parameters.")
selected_model = st.selectbox("Select Model", ["Environmental Model", "Social Model", "Governance Model", "Sentiment Model"])

if url:
    # Download the PDF. A timeout (and catching RequestException) keeps an
    # unreachable or slow host from crashing the app with an unhandled error.
    try:
        response = requests.get(url, stream=True, timeout=30)
    except requests.RequestException:
        response = None

    if response is not None and response.status_code == 200:
        # Extract the text layer from the PDF via Apache Tika. `content` is
        # None for image-only/empty PDFs, so guard before handing it to spaCy.
        raw_text = parser.from_buffer(response.content).get("content")
        if not raw_text:
            st.error("No extractable text found in the PDF. The document may be image-only.")
        else:
            # Sentence-split with spaCy, then clean: strip newlines, drop
            # empties, and keep only sentences starting with a capital letter
            # (filters headers/footers and mid-sentence fragments).
            doc = nlp(raw_text)
            cleaned = [sent.text.replace("\n", "") for sent in doc.sents]
            cleaned = [s for s in cleaned if s and s[0].isupper()]
            # Cap the workload so the app stays responsive on long reports.
            sub_sentences = cleaned[:100]

            # Dispatch to the cached loader for the chosen model.
            loaders = {
                "Environmental Model": load_environmental_model,
                "Social Model": load_social_model,
                "Governance Model": load_governance_model,
                "Sentiment Model": load_sentiment_model,
            }
            pipe_model = loaders[selected_model]()

            # Classify each sentence and tabulate label counts.
            model_results = pipe_model(sub_sentences, padding=True, truncation=True)
            model_labels = [result["label"] for result in model_results]

            st.subheader(f"{selected_model} Sentences Count")
            st.write(
                pd.DataFrame({"sentence": sub_sentences, selected_model: model_labels})
                .groupby(selected_model)
                .count()
            )
    else:
        st.error("Error fetching PDF content from the provided URL. Please check the URL and try again.")