"""Survey_Analysis_v_3_2_86.ipynb |
Automatically generated by Colaboratory. |
Original file is located at |
https://colab.research.google.com/drive/1VOlSQ6kva-BiGfJc7b3BwlKBegP13tdS |
""" |
# Notebook-only install steps. `!pip ...` and bare `pip ...` are IPython
# magics and are syntax errors in a plain .py module, so they are kept here
# as documentation only. Install once from a shell before running this file:
#   pip install streamlit pygal squarify
import streamlit
import numpy as np |
import pandas as pd |
import seaborn as sns |
import matplotlib.pyplot as plt |
import plotly.express as px |
import plotly.graph_objects as go |
import pygal as py |
import squarify as sq |
import matplotlib |
# Global matplotlib defaults: large canvas, small tick labels, small bold font.
plt.rcParams["figure.figsize"] = (20, 15)
matplotlib.rc('xtick', labelsize=7)
matplotlib.rc('ytick', labelsize=7)
# 'normal' is a font *style*, not a family; using it as the family makes
# matplotlib emit "findfont: Font family 'normal' not found" on every draw.
# 'sans-serif' is the generic family the renderer was falling back to.
font = {
    'family': 'sans-serif',
    'weight': 'bold',
    'size': 5,
}
matplotlib.rc('font', **font)
from sklearn.feature_extraction.text import CountVectorizer |
import warnings |
warnings.filterwarnings("ignore", category=FutureWarning) |
# Load the two-column CSV (sentiment label, news sentence).
df = pd.read_csv("/content/gen-data.csv", engine="python", encoding="ISO-8859-1")
df
# The file has no header row, so read_csv consumed the first record as the
# column names. Recover that record and put it back as a regular row.
col1 = df.keys()[0]
col2 = df.keys()[1]
col2
df2 = pd.DataFrame([[col1, col2]], columns=[col1, col2])
# DataFrame.append and set_axis(inplace=...) were removed in pandas 2.0;
# pd.concat plus a direct column assignment works on old and new versions.
df = pd.concat([df, df2], ignore_index=True)
df.columns = ['sentiment', 'news']
df
# Class balance and missing-value overview.
sns.countplot(y="sentiment", data=df)
df.isnull().sum()
from textblob import TextBlob |
def preprocess(ReviewText):
    """Strip HTML remnants from a pandas string Series.

    Removes ``<br/>`` tags, whole ``<a ...>...</a>`` anchors, any remaining
    ``&``, ``>`` and ``<`` characters, and turns non-breaking spaces into
    ordinary spaces.

    Args:
        ReviewText: pandas Series of strings.

    Returns:
        A new Series with the substitutions applied in the order listed.
    """
    # The patterns are regular expressions. pandas >= 2.0 defaults
    # ``str.replace`` to ``regex=False``, which would look for the literal
    # text "(<br/>)" etc. and silently clean nothing, so request regex
    # matching explicitly.
    substitutions = [
        (r"(<br/>)", ""),
        (r"(<a).*(>).*(</a>)", ""),
        (r"(&)", ""),
        (r"(>)", ""),
        (r"(<)", ""),
        ("(\xa0)", " "),
    ]
    for pattern, replacement in substitutions:
        ReviewText = ReviewText.str.replace(pattern, replacement, regex=True)
    return ReviewText
# Derived columns: cleaned text, TextBlob polarity of the raw sentence,
# character length and whitespace-separated word count.
df['Review Text'] = preprocess(df['news'])
df['polarity'] = df['news'].map(lambda sentence: TextBlob(sentence).sentiment.polarity)
df['news_len'] = df['news'].astype(str).map(len)
df['word_count'] = df['news'].map(lambda value: len(str(value).split()))
df
def _sample_reviews(frame, mask, n):
    """Return up to ``n`` randomly chosen 'Review Text' rows of ``frame`` where ``mask`` holds.

    ``DataFrame.sample`` raises ValueError when asked for more rows than are
    available, so the request is capped at the number of matching rows.
    The result is a 2-D array with one single-element row per review.
    """
    matches = frame.loc[mask, ['Review Text']]
    return matches.sample(min(n, len(matches))).values


print('top 4 random reviews with the highest positive sentiment polarity: \n')
# De-duplicate so the same sentence is not printed twice.
df1 = df.drop_duplicates(subset=['Review Text'])
cl = _sample_reviews(df1, df1.polarity == 1, 4)
for c in cl:
    print(c[0])
print('5 random reviews with the most neutral sentiment(zero) polarity: \n')
cl1 = _sample_reviews(df, df.polarity == 0, 5)
for c in cl1:
    print(c[0])
print('5 reviews with the most negative polarity having polarity lesser than -0.80: \n')
cl3 = _sample_reviews(df, df.polarity <= -0.80, 5)
for c in cl3:
    print(c[0])
# --- Polarity distribution -------------------------------------------------
# Current seaborn takes ``data`` as the first positional argument, so passing
# the Series positionally together with ``data=df`` raises TypeError. Name the
# column via ``x=`` (the role the positional Series used to play).
sns.boxplot(x="polarity", data=df, palette="rainbow")
# Render the boxplot as its own figure before the next plot is drawn.
plt.show()
df['polarity'].plot(
    kind='hist',
    bins=50,
    color="peru",
    title='Sentiment Polarity Distribution')
plt.show()

# --- Share of positive / neutral / negative polarity (donut chart) ----------
p_s = df[df["polarity"] > 0].count()["sentiment"]
neu_s = df[df["polarity"] == 0].count()["sentiment"]
neg_s = df[df["polarity"] < 0].count()["sentiment"]
sentiment = ['positive_sentiment', "neutral_sentiment", "negative_sentiment"]
values = [p_s, neu_s, neg_s]
colors = ['#FF0000', 'olive', '#FFFF00']
explode = (0.05, 0.05, 0.05)
plt.pie(values, colors=colors, labels=sentiment,
        autopct='%1.1f%%', pctdistance=0.85,
        explode=explode)
# A white disc over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
plt.title('count of polarity as per sentiment')
plt.show()

# --- Word count distribution -------------------------------------------------
df.plot.box(y=["word_count"], color="hotpink")
df['word_count'].plot(
    kind='hist',
    bins=100,
    color="orange",
    title='Review Text Word Count Distribution')
plt.show()

# --- Character length distribution -------------------------------------------
sns.boxenplot(x="news_len", data=df)
plt.show()
# This histogram shows news_len (characters); the original title was a
# copy-paste of the word-count title above.
df['news_len'].plot(
    kind='hist',
    bins=50,
    color="lightblue",
    title='Review Text Length Distribution')
plt.show()

# --- Length vs. word count, coloured by labelled sentiment -------------------
fig = px.scatter(df, x="news_len", y="word_count", color="sentiment",
                 marginal_x="box", marginal_y="violin",
                 title="Click on the legend items!")
fig.show()
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent single words in ``corpus``.

    Args:
        corpus: iterable of text documents.
        n: number of (word, count) pairs to keep; ``None`` keeps all.

    Returns:
        List of ``(word, count)`` tuples sorted by descending count.
    """
    vectorizer = CountVectorizer()
    vectorizer.fit(corpus)
    # Column-wise sum of the document-term matrix gives corpus-wide counts.
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = [(term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
# Twenty most frequent words, stop words included: print them and chart them.
common_words = get_top_n_words(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df1 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
word_totals = df1.groupby('ReviewText')['count'].sum()
word_totals.sort_values(ascending=False).plot(
    kind='bar', title='Top 20 words in review before removing stop words')
df1
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent single words in ``corpus``, ignoring English stop words.

    Args:
        corpus: iterable of text documents.
        n: number of (word, count) pairs to keep; ``None`` keeps all.

    Returns:
        List of ``(word, count)`` tuples sorted by descending count.
    """
    vectorizer = CountVectorizer(stop_words='english')
    vectorizer.fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = [(term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
# Twenty most frequent words once English stop words are dropped.
common_words = get_top_n_words(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df2 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
word_totals = df2.groupby('ReviewText')['count'].sum()
word_totals.sort_values(ascending=False).plot(
    kind='bar', title='Top 20 words in review after removing stop words')
def get_top_n_bigram(corpus, n=None):
    """Return the ``n`` most frequent two-word sequences in ``corpus``.

    Args:
        corpus: iterable of text documents.
        n: number of (bigram, count) pairs to keep; ``None`` keeps all.

    Returns:
        List of ``(bigram, count)`` tuples sorted by descending count.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    vectorizer.fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = [(gram, totals[0, column]) for gram, column in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
# Twenty most frequent bigrams, stop words included.
common_words = get_top_n_bigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df3 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
bigram_totals = df3.groupby('ReviewText')['count'].sum()
bigram_totals.sort_values(ascending=False).plot(
    kind='bar', title='Top 20 bigrams in review before removing stop words')
def get_top_n_bigram(corpus, n=None):
    """Return the ``n`` most frequent two-word sequences in ``corpus``, ignoring English stop words.

    Args:
        corpus: iterable of text documents.
        n: number of (bigram, count) pairs to keep; ``None`` keeps all.

    Returns:
        List of ``(bigram, count)`` tuples sorted by descending count.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')
    vectorizer.fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = [(gram, totals[0, column]) for gram, column in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
# Twenty most frequent bigrams once English stop words are dropped.
common_words = get_top_n_bigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df4 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
bigram_totals = df4.groupby('ReviewText')['count'].sum()
bigram_totals.sort_values(ascending=False).plot(
    kind='bar', title='Top 20 bigrams in review after removing stop words')
def get_top_n_trigram(corpus, n=None):
    """Return the ``n`` most frequent three-word sequences in ``corpus``.

    Args:
        corpus: iterable of text documents.
        n: number of (trigram, count) pairs to keep; ``None`` keeps all.

    Returns:
        List of ``(trigram, count)`` tuples sorted by descending count.
    """
    vectorizer = CountVectorizer(ngram_range=(3, 3))
    vectorizer.fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = [(gram, totals[0, column]) for gram, column in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
# Twenty most frequent trigrams, stop words included.
common_words = get_top_n_trigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df5 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
trigram_totals = df5.groupby('ReviewText')['count'].sum()
trigram_totals.sort_values(ascending=False).plot(
    kind='bar', title='Top 20 trigrams in review before removing stop words')
def get_top_n_trigram(corpus, n=None):
    """Return the ``n`` most frequent three-word sequences in ``corpus``, ignoring English stop words.

    Args:
        corpus: iterable of text documents.
        n: number of (trigram, count) pairs to keep; ``None`` keeps all.

    Returns:
        List of ``(trigram, count)`` tuples sorted by descending count.
    """
    vectorizer = CountVectorizer(ngram_range=(3, 3), stop_words='english')
    vectorizer.fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = [(gram, totals[0, column]) for gram, column in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
# Twenty most frequent trigrams once English stop words are dropped.
common_words = get_top_n_trigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df6 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
trigram_totals = df6.groupby('ReviewText')['count'].sum()
trigram_totals.sort_values(ascending=False).plot(
    kind='bar', title='Top 20 trigrams in review after removing stop words')
import nltk |
# Resources TextBlob's tokenizer and tagger need.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
# str(Series) is pandas' *display* repr: it is truncated to the first/last few
# rows and includes index numbers, "..." and the "Name/Length/dtype" footer,
# so tagging it analyses mostly noise. Join the actual texts instead.
review_corpus_text = " ".join(df['Review Text'].astype(str))
blob = TextBlob(review_corpus_text)
pos_df = pd.DataFrame(blob.tags, columns=['word', 'pos'])
# Keep the 20 most common part-of-speech tags.
pos_df = pos_df.pos.value_counts()[:20]
pos_df.plot(
    kind='bar',
    title='Top 20 Part-of-speech tagging for review corpus')
# TextBlob polarity split by the labelled sentiment, one box per class.
y0 = df.loc[df['sentiment'] == 'positive', 'polarity']
y1 = df.loc[df['sentiment'] == 'negative', 'polarity']
y2 = df.loc[df['sentiment'] == 'neutral', 'polarity']
trace0 = go.Box(y=y0, name='positive', marker={'color': 'rgb(214, 12, 140)'})
trace1 = go.Box(y=y1, name='negative', marker={'color': 'rgb(0, 128, 128)'})
trace2 = go.Box(y=y2, name='neutral', marker={'color': 'rgb(10, 140, 208)'})
data = [trace0, trace1, trace2]
layout = go.Layout(title="Polarity Boxplot according to sentiment")
go.Figure(data=data, layout=layout)
# Character length of each sentence split by the labelled sentiment.
y0 = df.loc[df['sentiment'] == 'positive', 'news_len']
y1 = df.loc[df['sentiment'] == 'negative', 'news_len']
y2 = df.loc[df['sentiment'] == 'neutral', 'news_len']
trace0 = go.Box(y=y0, name='positive', marker={'color': 'rgb(214, 12, 140)'})
trace1 = go.Box(y=y1, name='negative', marker={'color': 'rgb(0, 128, 128)'})
trace2 = go.Box(y=y2, name='neutral', marker={'color': 'rgb(10, 140, 208)'})
data = [trace0, trace1, trace2]
layout = go.Layout(title="news length Boxplot by sentiment")
go.Figure(data=data, layout=layout)
# Overlaid, semi-transparent polarity histograms, one per labelled sentiment.
is_positive = df['sentiment'] == "positive"
is_neutral = df['sentiment'] == "neutral"
is_negative = df['sentiment'] == "negative"
xp = df.loc[is_positive, 'polarity']
xneu = df.loc[is_neutral, 'polarity']
xneg = df.loc[is_negative, 'polarity']
trace1 = go.Histogram(x=xp, name='positive', opacity=0.75)
trace2 = go.Histogram(x=xneu, name='neutral', opacity=0.75)
trace3 = go.Histogram(x=xneg, name='negative', opacity=0.75)
data = [trace1, trace2, trace3]
layout = go.Layout(barmode='overlay', title='Distribution of Sentiment polarity')
go.Figure(data=data, layout=layout)
# Polarity vs. sentence length: scatter + 2-D density contours in the main
# panel, with marginal histograms on the secondary axes (x2 / y2).
trace1 = go.Scatter(
    x=df['polarity'],
    y=df['news_len'],
    mode='markers',
    name='points',
    marker={'color': 'rgb(102,0,0)', 'size': 2, 'opacity': 0.4},
)
trace2 = go.Histogram2dContour(
    x=df['polarity'],
    y=df['news_len'],
    name='density',
    ncontours=50,
    colorscale='Hot',
    reversescale=True,
    showscale=False,
)
trace3 = go.Histogram(
    x=df['polarity'],
    name='Sentiment polarity density',
    marker={'color': 'rgb(102,0,0)'},
    yaxis='y2',
)
trace4 = go.Histogram(
    y=df['news_len'],
    name='news length density',
    marker={'color': 'rgb(102,0,0)'},
    xaxis='x2',
)
data = [trace1, trace2, trace3, trace4]
# The main panel takes the lower-left 85% of the canvas; the marginals take
# the remaining strips.
layout = go.Layout(
    showlegend=False,
    autosize=False,
    width=600,
    height=550,
    xaxis={'domain': [0, 0.85], 'showgrid': False, 'zeroline': False},
    yaxis={'domain': [0, 0.85], 'showgrid': False, 'zeroline': False},
    margin={'t': 50},
    hovermode='x unified',
    bargap=0,
    xaxis2={'domain': [0.85, 1], 'showgrid': False, 'zeroline': False},
    yaxis2={'domain': [0.85, 1], 'showgrid': False, 'zeroline': False},
)
go.Figure(data=data, layout=layout)
# Polarity vs. word count: scatter + 2-D density contours in the main panel,
# with marginal histograms on the secondary axes (x2 / y2).
trace1 = go.Scatter(
    x=df['polarity'],
    y=df['word_count'],
    mode='markers',
    name='points',
    marker={'color': 'rgb(102,0,0)', 'size': 2, 'opacity': 0.4},
)
trace2 = go.Histogram2dContour(
    x=df['polarity'],
    y=df['word_count'],
    name='density',
    ncontours=20,
    colorscale='Hot',
    reversescale=True,
    showscale=False,
)
trace3 = go.Histogram(
    x=df['polarity'],
    name='Sentiment polarity density',
    marker={'color': 'rgb(102,0,0)'},
    yaxis='y2',
)
trace4 = go.Histogram(
    y=df['word_count'],
    name='word count density',
    marker={'color': 'rgb(112,0,0)'},
    xaxis='x2',
)
data = [trace1, trace2, trace3, trace4]
# The main panel takes the lower-left 85% of the canvas; the marginals take
# the remaining strips.
layout = go.Layout(
    showlegend=False,
    autosize=False,
    width=600,
    height=550,
    xaxis={'domain': [0, 0.85], 'showgrid': False, 'zeroline': False},
    yaxis={'domain': [0, 0.85], 'showgrid': False, 'zeroline': False},
    margin={'t': 50},
    hovermode='closest',
    bargap=0,
    xaxis2={'domain': [0.85, 1], 'showgrid': False, 'zeroline': False},
    yaxis2={'domain': [0.85, 1], 'showgrid': False, 'zeroline': False},
)
go.Figure(data=data, layout=layout)
# Notebook-only install steps. A bare `pip install ...` line is an IPython
# magic and a syntax error in a plain .py module, so it is kept as
# documentation only. Install once from a shell:
#   pip install scattertext spacy
import scattertext as st
import spacy
# A blank English pipeline with only sentence splitting is passed to
# scattertext as its tokenizer.
nlp = spacy.blank("en")
nlp.add_pipe('sentencizer')
corpus = st.CorpusFromPandas(df, category_col='sentiment', text_col='Review Text', nlp=nlp).build()
# Terms most characteristic of this corpus relative to scattertext's background corpus.
print(list(corpus.get_scaled_f_scores_vs_background().index[:20]))
term_freq_df = corpus.get_term_freq_df()
# Per-class scaled F-scores; each list is the 20 highest-scoring terms for that class.
term_freq_df['positive_sentiment'] = corpus.get_scaled_f_scores('positive')
list(term_freq_df.sort_values(by='positive_sentiment', ascending=False).index[:20])
term_freq_df['neutral_sentiment'] = corpus.get_scaled_f_scores('neutral')
list(term_freq_df.sort_values(by='neutral_sentiment', ascending=False).index[:20])
term_freq_df['negative_sentiment'] = corpus.get_scaled_f_scores('negative')
list(term_freq_df.sort_values(by='negative_sentiment', ascending=False).index[:20])
from sklearn.feature_extraction.text import TfidfVectorizer |
from sklearn.decomposition import TruncatedSVD |
from collections import Counter |
# Latent semantic analysis: TF-IDF document-term matrix reduced to
# ``n_topics`` components with truncated SVD.
n_topics = 10
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    use_idf=True,
    smooth_idf=True,
)
reindexed_data = df['Review Text'].values
document_term_matrix = tfidf_vectorizer.fit_transform(reindexed_data)
lsa_model = TruncatedSVD(n_components=n_topics)
lsa_topic_matrix = lsa_model.fit_transform(document_term_matrix)
def get_keys(topic_matrix):
    """Return the predicted topic for every document.

    Args:
        topic_matrix: 2-D array with one row per document and one column per
            topic.

    Returns:
        List with, for each row, the index of the column holding that row's
        largest value.
    """
    strongest_topic = topic_matrix.argmax(axis=1)
    return strongest_topic.tolist()
def keys_to_counts(keys):
    """Tally how often each topic key occurs.

    Args:
        keys: iterable of hashable topic identifiers.

    Returns:
        Tuple ``(categories, counts)`` of two parallel lists: the distinct
        keys in first-seen order and the number of occurrences of each.
    """
    tally = Counter(keys)
    return (list(tally.keys()), list(tally.values()))
# Dominant LSA topic per document, then how many documents fall in each topic.
lsa_keys = get_keys(lsa_topic_matrix)
lsa_category_counts = keys_to_counts(lsa_keys)
lsa_categories = lsa_category_counts[0]
lsa_counts = lsa_category_counts[1]
def get_top_n_words(n, keys, document_term_matrix, tfidf_vectorizer, num_topics=None):
    """Return, for every topic, its ``n`` highest-weighted words as one string.

    Args:
        n: number of words to report per topic.
        keys: predicted topic index for each document (row of the matrix).
        document_term_matrix: document x term weight matrix (sparse or dense)
            whose columns follow ``tfidf_vectorizer.vocabulary_``.
        tfidf_vectorizer: fitted vectorizer exposing ``vocabulary_``
            (word -> column index).
        num_topics: number of topics to report. Defaults to the module-level
            ``n_topics``, which is what this function always used.

    Returns:
        List of ``num_topics`` strings; entry ``t`` holds the top words of
        topic ``t`` separated by spaces, highest weight first. A topic with
        no documents gets an empty string, so the list can always be indexed
        by topic number.
    """
    if num_topics is None:
        num_topics = n_topics
    # Reverse lookup column -> word. This replaces the one-hot
    # inverse_transform round trip and the ascii encode, which raised
    # UnicodeEncodeError on any non-ASCII vocabulary entry.
    index_to_word = {column: word for word, column in tfidf_vectorizer.vocabulary_.items()}
    keys = np.asarray(keys)
    top_words = []
    for topic in range(num_topics):
        rows = np.flatnonzero(keys == topic)
        if rows.size == 0:
            # Previously the accumulator stayed the int 0 and ``.toarray()``
            # raised AttributeError for a topic without documents.
            top_words.append("")
            continue
        topic_weights = np.asarray(document_term_matrix[rows].sum(axis=0)).ravel()
        # argsort is ascending: take the last n columns and reverse them.
        top_columns = np.flip(np.argsort(topic_weights)[-n:], 0)
        top_words.append(" ".join(index_to_word[column] for column in top_columns))
    return top_words
# Print the three top words of every LSA topic.
top_lsa = get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)
for topic_number, topic_words in enumerate(top_lsa, start=1):
    print("Topic {}: ".format(topic_number), topic_words)
# Bar chart of documents per topic, each bar labelled with its top words.
top_3_words = get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)
labels = []
for category in lsa_categories:
    labels.append('Topic {}: \n'.format(category + 1) + top_3_words[category])
fig, ax = plt.subplots(figsize=(16, 8))
ax.bar(lsa_categories, lsa_counts, color="skyblue")
ax.set_xticks(lsa_categories)
ax.set_xticklabels(labels, rotation=45, rotation_mode='default', color="olive")
ax.set_ylabel('Number of review text on topics')
ax.set_title('Count of LSA topics')
plt.show()
"""#---2----"""
# Class balance, then an 80/20 split for fine-tuning a BERT classifier.
df['sentiment'].value_counts()
from sklearn.model_selection import train_test_split
train, eva = train_test_split(df, test_size=0.2)
# Notebook-only install step. `!pip ...` is an IPython magic and a syntax
# error in a plain .py module. Install once from a shell:
#   pip install simpletransformers
from simpletransformers.classification import ClassificationModel
# Three output classes (see making_label); use_cuda=False runs on the CPU.
model = ClassificationModel(
    'bert',
    'bert-base-cased',
    num_labels=3,
    args={'reprocess_input_data': True, 'overwrite_output_dir': True},
    use_cuda=False,
)
def making_label(st):
    """Map a sentiment name to its integer class id.

    ``'positive'`` -> 0, ``'neutral'`` -> 2, anything else -> 1 (the
    negative class).
    """
    if st == 'positive':
        return 0
    if st == 'neutral':
        return 2
    return 1
# Integer class ids as produced by making_label: 0=positive, 1=negative, 2=neutral.
train['label'] = train['sentiment'].apply(making_label)
eva['label'] = eva['sentiment'].apply(making_label)
print(train.shape)
# Fine-tune on the first 1500 training rows and evaluate on the last 400
# held-out rows; literal "\n" sequences are flattened to spaces.
train_df = pd.DataFrame({
    'text': train['news'][:1500].replace(r'\n', ' ', regex=True),
    'label': train['label'][:1500]
})
eval_df = pd.DataFrame({
    'text': eva['news'][-400:].replace(r'\n', ' ', regex=True),
    'label': eva['label'][-400:]
})
model.train_model(train_df)
result, model_outputs, wrong_predictions = model.eval_model(eval_df)
result
model_outputs
len(wrong_predictions)
# Predicted class = index of the largest raw output per sample.
lst = [np.argmax(arr) for arr in model_outputs]
true = eval_df['label'].tolist()
predicted = lst
import sklearn.metrics
# Pin the label set so the matrix is always 3x3, even if a class is missing
# from the evaluation slice or from the predictions.
class_ids = [0, 1, 2]
# Names must follow making_label's ids; the previous order
# ['positive', 'neutral', 'negative'] swapped the neutral and negative rows
# of the report.
class_names = ['positive', 'negative', 'neutral']
mat = sklearn.metrics.confusion_matrix(true, predicted, labels=class_ids)
mat
df_cm = pd.DataFrame(mat, range(3), range(3))
sns.heatmap(df_cm, annot=True)
plt.show()
print(sklearn.metrics.classification_report(true, predicted, labels=class_ids, target_names=class_names))
sklearn.metrics.accuracy_score(true, predicted)
def get_result(statement):
    """Print the sentiment the fine-tuned classifier predicts for ``statement``.

    Args:
        statement: a single text to classify.

    Returns:
        None. The label ('positive', 'negative' or 'neutral') is printed.
    """
    result = model.predict([statement])
    # result[1][0] holds the raw per-class outputs for the only input.
    # argmax always yields exactly one index; the previous
    # ``int(np.where(x == x.max())[0])`` failed when two classes tied and
    # relied on the deprecated conversion of a 1-element array to int.
    pos = int(np.argmax(result[1][0]))
    sentiment_dict = {0: 'positive', 1: 'negative', 2: 'neutral'}
    print(sentiment_dict[pos])
    return
# Spot checks of the fine-tuned classifier on hand-written statements,
# printed in this order.
demo_statements = [
    "According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",
    "According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .",
    'Sales in Finland decreased by 2.0 % , and international sales decreased by 9.3 % in terms of euros , and by 15.1 % in terms of local currencies .',
    "This company is growing like anything with 23% profit every year",
    "This company is not able to make any profit but make very less profit in last quarter",
    "The doctor treated well and the patient was very healthy",
    "the act of politicians is to serve and help needy and not to create ruck suck",
    "American burger is too good. Can't resisit to go and have one",
    "GDP per capita increased to double in India from 2013",
    "Indian economy is doing very good and will become super power one day.",
    "Indian economy is doing very good and will create millions of jobs in coming years",
    "Indian economy is not doing very good and need urgent reforms but we are pretty sure it will be very good in coming years",
    "Indian economy is doing very good.Indian economy is not doing very good ",
    "Indian economy is not doing very good. Indian economy will bounce back to become leading economy",
    "Indian economy is not doing very good. Urgent reforms is required to create new jobs and improve export",
    "The stock market of Indian economy is dangling too much",
]
for demo_statement in demo_statements:
    get_result(demo_statement)
"""#VADER"""
# Notebook-only install step. `!pip ...` is an IPython magic and a syntax
# error in a plain .py module. Install once from a shell:
#   pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
obj = SentimentIntensityAnalyzer()
# Score a few sentences and print the score dictionaries.
for sentence in ("Ram is really good ", "Ram is better ", "Rahul is really bad"):
    sentiment_dict = obj.polarity_scores(sentence)
    print(sentiment_dict)
# Probes for punctuation, capitalisation, comparatives, contrastive "but",
# slang and emoticons.
vader_probes = [
    'Ram is good boy',
    'Ram is good boy!',
    'Ram is good boy!!',
    'Ram is good',
    'Ram is GOOD',
    'Ram is good',
    'Ram is better',
    'Ram is best',
    'Ram is bad',
    'Ram is worse',
    'Ram is worst',
    'Ram is good',
    'Ram is good, but he is also naughty sometimes',
    "That Hotel",
    "That Hotel SUX",
    "That Hotel SUCKS",
    "Your :) is the most beautiful thing I have ever seen",
    "Your smile is the most beautiful thing I have ever seen",
    "Your :( is the worst thing I have ever seen",
    "Your smile is the worst thing I have ever seen",
]
for probe in vader_probes:
    print(obj.polarity_scores(probe))
"""#3.a Using FINBERT Model"""
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import transformers
transformers.__version__
# Tone checkpoint (3 labels) on three sample sentences.
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
results = nlp([
    'growth is strong and we have plenty of liquidity.',
    'there is a shortage of capital, and we need extra financing.',
    'formulation patents might protect Vasotec to a limited extent.',
])
results
"""#FINBERT ESG"""
# ESG checkpoint (4 labels).
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg', num_labels=4)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg')
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
results = nlp([
    'Managing and working to mitigate the impact our operations have on the environment is a core element of our business.',
    'Rhonda has been volunteering for several years for a variety of charitable community programs.',
    "Cabot's annual statements are audited annually by an independent registered public accounting firm.",
    'As of December 31, 2012, the 2011 Term Loan had a principal balance of $492.5 million.',
])
results
"""#FINBERT Classification"""
# FLS checkpoint (3 labels).
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-fls', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-fls')
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
results = nlp([
    'we expect the age of our fleet to enhance availability and reliability due to reduced downtime for repairs.',
    'on an equivalent unit of production basis, general and administrative expenses declined 24 percent from 1994 to $.67 per boe.',
    'we will continue to assess the need for a valuation allowance against deferred tax assets considering all available evidence obtained in',
])
results
# Score every cleaned sentence with FinBERT-tone and compare against the labels.
X = df['Review Text'].to_list()
y = df['sentiment'].to_list()
import torch
from transformers import BertTokenizer, BertForSequenceClassification
finbert_whole = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
tokenizer_whole = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
# Output index -> label name used for this checkpoint.
labels = {0: 'neutral', 1: 'positive', 2: 'negative'}
sent_val = list()
# Inference only: no_grad avoids building an autograd graph per sentence.
with torch.no_grad():
    for x in X:
        # BERT accepts at most 512 tokens; truncate instead of crashing on a
        # longer input.
        inputs = tokenizer_whole(x, return_tensors="pt", padding=True,
                                 truncation=True, max_length=512)
        outputs = finbert_whole(**inputs)[0]
        val = labels[int(np.argmax(outputs.detach().numpy()))]
        print(x, '---->', val)
        print('#######################################################')
        sent_val.append(val)
from sklearn.metrics import accuracy_score
print(accuracy_score(y, sent_val))
"""#Using DISTILBERT"""
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
tokenizer_distilbert = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# num_labels=3 matches the 3-entry label map below; the default head has only
# two outputs, so 'negative' could never be predicted.
# NOTE(review): this is the base checkpoint, not one fine-tuned for this task,
# so presumably the accuracy below is only an untrained baseline -- confirm.
model_distilbert = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3)
labels = {0: 'neutral', 1: 'positive', 2: 'negative'}
sent_val_bert = list()
# Inference only: no_grad avoids building an autograd graph per sentence.
with torch.no_grad():
    for x in X:
        # Truncate to the 512-token limit instead of crashing on a longer input.
        inputs = tokenizer_distilbert(x, return_tensors="pt", padding=True,
                                      truncation=True, max_length=512)
        outputs = model_distilbert(**inputs)[0]
        val = labels[int(np.argmax(outputs.detach().numpy()))]
        print(x, '---->', val)
        print('#######################################################')
        sent_val_bert.append(val)
from sklearn.metrics import accuracy_score
# Score this model's own predictions; the original passed ``sent_val`` (the
# FinBERT predictions) and so re-printed FinBERT's accuracy.
print(accuracy_score(y, sent_val_bert))
"""#Bert"""
import torch
from transformers import BertTokenizer, BertForSequenceClassification
# "bert-base-uncased" is a BERT checkpoint and must be loaded with the BERT
# classes; the original used the DistilBert classes, whose architecture does
# not match this checkpoint.
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-uncased")
# num_labels=3 matches the 3-entry label map below.
# NOTE(review): this is the base checkpoint, not one fine-tuned for this task,
# so presumably the accuracy below is only an untrained baseline -- confirm.
model_bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
labels = {0: 'neutral', 1: 'positive', 2: 'negative'}
sent_val_bert1 = list()
# Inference only: no_grad avoids building an autograd graph per sentence.
with torch.no_grad():
    for x in X:
        # Truncate to the 512-token limit instead of crashing on a longer input.
        inputs = tokenizer_bert(x, return_tensors="pt", padding=True,
                                truncation=True, max_length=512)
        outputs = model_bert(**inputs)[0]
        val = labels[int(np.argmax(outputs.detach().numpy()))]
        print(x, '---->', val)
        print('#######################################################')
        sent_val_bert1.append(val)
from sklearn.metrics import accuracy_score
# Score this model's own predictions; the original passed ``sent_val`` (the
# FinBERT predictions).
print(accuracy_score(y, sent_val_bert1))