Spaces:

Pushpa
/

APP

Runtime error

App Files Files Community

APP / app.py

Pushpa

Update app.py

9f86812 over 2 years ago

raw

history blame contribute delete

26.4 kB

	# -*- coding: utf-8 -*-
	"""Survey_Analysis_v_3_2_86.ipynb

	Automatically generated by Colaboratory.

	Original file is located at
	https://colab.research.google.com/drive/1VOlSQ6kva-BiGfJc7b3BwlKBegP13tdS
	"""

	#1 - https://www.kaggle.com/code/ramjasmaurya/financial-sentiment-analysis
	#2 - https://www.kaggle.com/code/adarshbiradar/sentiment-analysis-using-bert

	import streamlit



	# Commented out IPython magic to ensure Python compatibility.
	import numpy as np
	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt
	import plotly.express as px
	import plotly.graph_objects as go


	import pygal as py
	import squarify as sq

	# Global matplotlib defaults used by every chart below.
	plt.rcParams["figure.figsize"] = (20, 15)
	plt.rc('xtick', labelsize=7)
	plt.rc('ytick', labelsize=7)

	# 'normal' is a font weight/style keyword, not a font family; matplotlib
	# cannot resolve it and emits "findfont: Font family 'normal' not found"
	# for every text element. Use the generic 'sans-serif' family instead.
	font = {
	    'family': 'sans-serif',
	    'weight': 'bold',
	    'size': 5,
	}

	plt.rc('font', **font)
	from sklearn.feature_extraction.text import CountVectorizer
	import warnings
	warnings.filterwarnings("ignore", category=FutureWarning)
	# %matplotlib inline

	# Load the labelled sentences. read_csv takes the first record as the
	# header (presumably the file has no header row -- TODO confirm), so that
	# record is recovered from the column names and added back as data.
	df = pd.read_csv("gen-data.csv", engine="python", encoding="ISO-8859-1")

	col1 = df.keys()[0]
	col2 = df.keys()[1]

	df2 = pd.DataFrame([[col1, col2]], columns=[col1, col2])

	# DataFrame.append and set_axis(inplace=...) were removed in pandas 2.0;
	# pd.concat plus the (always non-inplace) set_axis gives the same frame.
	df = pd.concat([df, df2], ignore_index=True).set_axis(['sentiment', 'news'], axis=1)

	# The recovered first label carries the byte-order mark decoded as ISO-8859-1.
	df = df.replace("ï»¿neutral", "neutral")

	# Class balance of the labelled sentiment column.
	sns.countplot(y="sentiment", data=df)

	# Notebook-style echo of the per-column null counts (no output in a script).
	df.isnull().sum()

	from textblob import TextBlob

	def preprocess(ReviewText):
	    """Remove HTML remnants from a pandas Series of text.

	    Parameters
	    ----------
	    ReviewText : pandas.Series of str
	        Raw text values.

	    Returns
	    -------
	    pandas.Series
	        The text with ``<br/>`` tags, anchor elements and the ``&amp`` /
	        ``&gt`` / ``&lt`` entities removed, and non-breaking spaces turned
	        into ordinary spaces.
	    """
	    # The patterns are regular expressions, so regex=True must be explicit:
	    # pandas 2.x defaults Series.str.replace to literal matching, which
	    # would look for the parentheses themselves and replace nothing.
	    # NOTE(review): the anchor pattern read '(<a).(>).(</a>)' in the scraped
	    # file; the '*' quantifiers appear to have been stripped (the coding
	    # cookie lost its asterisks the same way) -- restored here.
	    substitutions = (
	        (r"(<br/>)", ""),
	        (r"(<a).*(>).*(</a>)", ""),
	        (r"(&amp)", ""),
	        (r"(&gt)", ""),
	        (r"(&lt)", ""),
	        ("(\xa0)", " "),
	    )
	    for pattern, replacement in substitutions:
	        ReviewText = ReviewText.str.replace(pattern, replacement, regex=True)
	    return ReviewText
	# Cleaned copy of the text, used by the n-gram and topic analyses below.
	df['Review Text'] = preprocess(df['news'])

	# Per-row features computed from the raw 'news' text:
	#   polarity   - TextBlob sentiment polarity
	#   news_len   - number of characters
	#   word_count - number of whitespace-separated tokens
	df['polarity'] = [TextBlob(text).sentiment.polarity for text in df['news']]
	df['news_len'] = df['news'].astype(str).str.len()
	df['word_count'] = [len(str(text).split()) for text in df['news']]

	# Notebook-style echo of the frame (no output when run as a script).
	df

	def _print_random_reviews(frame, mask, n):
	    """Print up to *n* randomly chosen 'Review Text' values from the rows of *frame* selected by *mask*."""
	    candidates = frame.loc[mask, 'Review Text']
	    # sample() raises ValueError when asked for more rows than exist, so
	    # cap the request at the number of matching rows.
	    for text in candidates.sample(min(n, len(candidates))):
	        print(text)

	print('top 4 random reviews with the highest positive sentiment polarity: \n')

	# De-duplicate so the same sentence cannot be printed twice.
	df1 = df.drop_duplicates(subset=['Review Text'])
	_print_random_reviews(df1, df1.polarity == 1, 4)

	print('5 random reviews with the most neutral sentiment(zero) polarity: \n')
	_print_random_reviews(df, df.polarity == 0, 5)

	print('5 reviews with the most negative polarity having polarity lesser than -0.80: \n')
	_print_random_reviews(df, df.polarity <= -0.80, 5)

	# Box plot of the polarity scores. The column is passed by keyword:
	# recent seaborn releases accept only `data` positionally, so a positional
	# series combined with data=df fails with "multiple values for 'data'".
	sns.boxplot(x="polarity", data=df, palette="rainbow")

	# Series.plot draws on the current axes when one exists; open a fresh
	# figure so the histogram is not painted over the box plot.
	plt.figure()
	df['polarity'].plot(
	    kind='hist',
	    bins=50,
	    color="peru",
	    title='Sentiment Polarity Distribution',
	)
	plt.show()

	# Number of rows whose TextBlob polarity is positive, zero and negative
	# (counted over non-null 'sentiment' values).
	p_s = df.loc[df["polarity"] > 0, "sentiment"].count()
	neu_s = df.loc[df["polarity"] == 0, "sentiment"].count()
	neg_s = df.loc[df["polarity"] < 0, "sentiment"].count()

	# Wedge labels, sizes, colours and offsets, in matching order.
	sentiment = ['positive_sentiment', "neutral_sentiment", "negative_sentiment"]
	values = [p_s, neu_s, neg_s]
	colors = ['#FF0000', 'olive', '#FFFF00']
	explode = (0.05,) * 3

	plt.pie(
	    values,
	    labels=sentiment,
	    colors=colors,
	    explode=explode,
	    autopct='%1.1f%%',
	    pctdistance=0.85,
	)

	# A white disc over the centre turns the pie into a donut.
	centre_circle = plt.Circle((0, 0), 0.70, fc='white')
	fig = plt.gcf()
	fig.gca().add_artist(centre_circle)

	plt.title('count of polarity as per sentiment')
	plt.show()

	# Distribution of the number of words per text.
	df.plot.box(y=["word_count"], color="hotpink")

	# Series.plot reuses the current axes; start a new figure so the
	# histogram does not overwrite the box plot above.
	plt.figure()
	df['word_count'].plot(
	    kind='hist',
	    bins=100,
	    color="orange",
	    title='Review Text Word Count Distribution',
	)
	plt.show()

	# Distribution of the number of characters per text.
	sns.boxenplot(x="news_len", data=df)
	plt.show()

	plt.figure()
	df['news_len'].plot(
	    kind='hist',
	    bins=50,
	    color="lightblue",
	    # This histogram shows character length, not word count; the title was
	    # a copy of the word-count chart's.
	    title='Review Text Length Distribution',
	)
	plt.show()

	# Characters vs. words per text, coloured by labelled sentiment.
	fig = px.scatter(df, x="news_len", y="word_count", color="sentiment",
	                 marginal_x="box", marginal_y="violin",
	                 title="Click on the legend items!")
	fig.show()

	def get_top_n_words(corpus, n=None):
	    """Return the *n* most frequent terms in *corpus* as (term, count) pairs.

	    A CountVectorizer with default settings is fitted on *corpus*; each
	    vocabulary term's occurrences are totalled over all documents and the
	    pairs are sorted by count, largest first. n=None returns every term.
	    """
	    vectorizer = CountVectorizer().fit(corpus)
	    column_totals = vectorizer.transform(corpus).sum(axis=0)
	    term_counts = []
	    for term, column in vectorizer.vocabulary_.items():
	        term_counts.append((term, column_totals[0, column]))
	    term_counts.sort(key=lambda pair: pair[1], reverse=True)
	    return term_counts[:n]
	# Twenty most frequent terms, stop words included: print them, then chart them.
	common_words = get_top_n_words(df['Review Text'], 20)
	for word, freq in common_words:
	    print(word, freq)

	df1 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
	(
	    df1.groupby('ReviewText')['count']
	    .sum()
	    .sort_values(ascending=False)
	    .plot(kind='bar', title='Top 20 words in review before removing stop words')
	)
	# Notebook-style echo of the table (no output when run as a script).
	df1

	def get_top_n_words(corpus, n=None):
	    """Return the *n* most frequent non-stop-word terms in *corpus*.

	    Same as the stop-word-inclusive counter, but the CountVectorizer is
	    built with stop_words='english'. The result is a list of
	    (term, count) pairs sorted by count, largest first; n=None returns all.
	    """
	    vectorizer = CountVectorizer(stop_words='english').fit(corpus)
	    column_totals = vectorizer.transform(corpus).sum(axis=0)
	    term_counts = []
	    for term, column in vectorizer.vocabulary_.items():
	        term_counts.append((term, column_totals[0, column]))
	    term_counts.sort(key=lambda pair: pair[1], reverse=True)
	    return term_counts[:n]
	# Twenty most frequent terms with English stop words removed.
	common_words = get_top_n_words(df['Review Text'], 20)
	for word, freq in common_words:
	    print(word, freq)

	df2 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
	(
	    df2.groupby('ReviewText')['count']
	    .sum()
	    .sort_values(ascending=False)
	    .plot(kind='bar', title='Top 20 words in review after removing stop words')
	)

	def get_top_n_bigram(corpus, n=None):
	    """Return the *n* most frequent two-word sequences in *corpus*.

	    The CountVectorizer is restricted to ngram_range=(2, 2). The result is
	    a list of (bigram, count) pairs sorted by count, largest first;
	    n=None returns every bigram.
	    """
	    vectorizer = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
	    column_totals = vectorizer.transform(corpus).sum(axis=0)
	    term_counts = []
	    for term, column in vectorizer.vocabulary_.items():
	        term_counts.append((term, column_totals[0, column]))
	    term_counts.sort(key=lambda pair: pair[1], reverse=True)
	    return term_counts[:n]
	# Twenty most frequent bigrams, stop words included.
	common_words = get_top_n_bigram(df['Review Text'], 20)
	for word, freq in common_words:
	    print(word, freq)

	df3 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
	(
	    df3.groupby('ReviewText')['count']
	    .sum()
	    .sort_values(ascending=False)
	    .plot(kind='bar', title='Top 20 bigrams in review before removing stop words')
	)

	def get_top_n_bigram(corpus, n=None):
	    """Return the *n* most frequent two-word sequences, stop words excluded.

	    The CountVectorizer uses ngram_range=(2, 2) and stop_words='english'.
	    The result is a list of (bigram, count) pairs sorted by count, largest
	    first; n=None returns every bigram.
	    """
	    vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
	    column_totals = vectorizer.transform(corpus).sum(axis=0)
	    term_counts = []
	    for term, column in vectorizer.vocabulary_.items():
	        term_counts.append((term, column_totals[0, column]))
	    term_counts.sort(key=lambda pair: pair[1], reverse=True)
	    return term_counts[:n]
	# Twenty most frequent bigrams with English stop words removed.
	common_words = get_top_n_bigram(df['Review Text'], 20)
	for word, freq in common_words:
	    print(word, freq)

	df4 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
	(
	    df4.groupby('ReviewText')['count']
	    .sum()
	    .sort_values(ascending=False)
	    .plot(kind='bar', title='Top 20 bigrams in review after removing stop words')
	)

	def get_top_n_trigram(corpus, n=None):
	    """Return the *n* most frequent three-word sequences in *corpus*.

	    The CountVectorizer is restricted to ngram_range=(3, 3). The result is
	    a list of (trigram, count) pairs sorted by count, largest first;
	    n=None returns every trigram.
	    """
	    vectorizer = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
	    column_totals = vectorizer.transform(corpus).sum(axis=0)
	    term_counts = []
	    for term, column in vectorizer.vocabulary_.items():
	        term_counts.append((term, column_totals[0, column]))
	    term_counts.sort(key=lambda pair: pair[1], reverse=True)
	    return term_counts[:n]
	# Twenty most frequent trigrams, stop words included.
	common_words = get_top_n_trigram(df['Review Text'], 20)
	for word, freq in common_words:
	    print(word, freq)

	df5 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
	(
	    df5.groupby('ReviewText')['count']
	    .sum()
	    .sort_values(ascending=False)
	    .plot(kind='bar', title='Top 20 trigrams in review before removing stop words')
	)

	def get_top_n_trigram(corpus, n=None):
	    """Return the *n* most frequent three-word sequences, stop words excluded.

	    The CountVectorizer uses ngram_range=(3, 3) and stop_words='english'.
	    The result is a list of (trigram, count) pairs sorted by count, largest
	    first; n=None returns every trigram.
	    """
	    vectorizer = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
	    column_totals = vectorizer.transform(corpus).sum(axis=0)
	    term_counts = []
	    for term, column in vectorizer.vocabulary_.items():
	        term_counts.append((term, column_totals[0, column]))
	    term_counts.sort(key=lambda pair: pair[1], reverse=True)
	    return term_counts[:n]
	# Twenty most frequent trigrams with English stop words removed.
	common_words = get_top_n_trigram(df['Review Text'], 20)
	for word, freq in common_words:
	    print(word, freq)

	df6 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
	(
	    df6.groupby('ReviewText')['count']
	    .sum()
	    .sort_values(ascending=False)
	    .plot(kind='bar', title='Top 20 trigrams in review after removing stop words')
	)

	import nltk
	# NLTK resources fetched before the TextBlob tagging below.
	for resource in ('punkt', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger'):
	    nltk.download(resource)

	# str(Series) yields the Series' printed representation -- index labels,
	# an elided middle and a "Name/Length/dtype" footer -- so tagging it
	# analyses display text rather than the corpus. Join the actual cleaned
	# texts instead.
	blob = TextBlob(" ".join(df['Review Text'].astype(str)))
	pos_df = pd.DataFrame(blob.tags, columns=['word', 'pos'])
	# Keep the 20 most frequent part-of-speech tags.
	pos_df = pos_df.pos.value_counts()[:20]
	pos_df.plot(
	    kind='bar',
	    title='Top 20 Part-of-speech tagging for review corpus')

	# TextBlob polarity split by labelled sentiment, one box trace per class.
	y0, y1, y2 = (
	    df.loc[df['sentiment'] == label, 'polarity']
	    for label in ('positive', 'negative', 'neutral')
	)

	trace0, trace1, trace2 = (
	    go.Box(y=series, name=label, marker={'color': rgb})
	    for series, label, rgb in (
	        (y0, 'positive', 'rgb(214, 12, 140)'),
	        (y1, 'negative', 'rgb(0, 128, 128)'),
	        (y2, 'neutral', 'rgb(10, 140, 208)'),
	    )
	)
	data = [trace0, trace1, trace2]
	layout = go.Layout(title="Polarity Boxplot according to sentiment")

	# Bare expression: the figure is built but not shown when run as a script.
	go.Figure(data=data, layout=layout)

	# Character length split by labelled sentiment, one box trace per class.
	y0, y1, y2 = (
	    df.loc[df['sentiment'] == label, 'news_len']
	    for label in ('positive', 'negative', 'neutral')
	)

	trace0, trace1, trace2 = (
	    go.Box(y=series, name=label, marker={'color': rgb})
	    for series, label, rgb in (
	        (y0, 'positive', 'rgb(214, 12, 140)'),
	        (y1, 'negative', 'rgb(0, 128, 128)'),
	        (y2, 'neutral', 'rgb(10, 140, 208)'),
	    )
	)
	data = [trace0, trace1, trace2]
	layout = go.Layout(title="news length Boxplot by sentiment")

	# Bare expression: the figure is built but not shown when run as a script.
	go.Figure(data=data, layout=layout)

	# Polarity values of each labelled class, drawn as overlaid histograms.
	xp, xneu, xneg = (
	    df.loc[df['sentiment'] == label, 'polarity']
	    for label in ("positive", "neutral", "negative")
	)

	trace1, trace2, trace3 = (
	    go.Histogram(x=series, name=label, opacity=0.75)
	    for series, label in (
	        (xp, 'positive'),
	        (xneu, 'neutral'),
	        (xneg, 'negative'),
	    )
	)
	data = [trace1, trace2, trace3]
	layout = go.Layout(barmode='overlay', title='Distribution of Sentiment polarity')

	# Bare expression: the figure is built but not shown when run as a script.
	go.Figure(data=data, layout=layout)

	# Joint view of polarity (x) against character length (y): raw points, a
	# 2-D density contour, and a marginal histogram on each secondary axis.
	trace1 = go.Scatter(
	    x=df['polarity'], y=df['news_len'],
	    mode='markers', name='points',
	    marker={'color': 'rgb(102,0,0)', 'size': 2, 'opacity': 0.4},
	)
	trace2 = go.Histogram2dContour(
	    x=df['polarity'], y=df['news_len'],
	    name='density', ncontours=50,
	    colorscale='Hot', reversescale=True, showscale=False,
	)
	trace3 = go.Histogram(
	    x=df['polarity'],
	    name='Sentiment polarity density',
	    marker={'color': 'rgb(102,0,0)'},
	    yaxis='y2',
	)
	trace4 = go.Histogram(
	    y=df['news_len'],
	    name='news length density',
	    marker={'color': 'rgb(102,0,0)'},
	    xaxis='x2',
	)
	data = [trace1, trace2, trace3, trace4]

	# Main axes take 85% of each dimension; the secondary axes hosting the
	# marginal histograms take the remaining strip.
	layout = go.Layout(
	    showlegend=False,
	    autosize=False,
	    width=600,
	    height=550,
	    xaxis={'domain': [0, 0.85], 'showgrid': False, 'zeroline': False},
	    yaxis={'domain': [0, 0.85], 'showgrid': False, 'zeroline': False},
	    margin={'t': 50},
	    hovermode='x unified',
	    bargap=0,
	    xaxis2={'domain': [0.85, 1], 'showgrid': False, 'zeroline': False},
	    yaxis2={'domain': [0.85, 1], 'showgrid': False, 'zeroline': False},
	)

	# Bare expression: the figure is built but not shown when run as a script.
	go.Figure(data=data, layout=layout)

	# Joint view of polarity (x) against word count (y): raw points, a 2-D
	# density contour, and a marginal histogram on each secondary axis.
	trace1 = go.Scatter(
	    x=df['polarity'], y=df['word_count'],
	    mode='markers', name='points',
	    marker={'color': 'rgb(102,0,0)', 'size': 2, 'opacity': 0.4},
	)
	trace2 = go.Histogram2dContour(
	    x=df['polarity'], y=df['word_count'],
	    name='density', ncontours=20,
	    colorscale='Hot', reversescale=True, showscale=False,
	)
	trace3 = go.Histogram(
	    x=df['polarity'],
	    name='Sentiment polarity density',
	    marker={'color': 'rgb(102,0,0)'},
	    yaxis='y2',
	)
	trace4 = go.Histogram(
	    y=df['word_count'],
	    name='word count density',
	    marker={'color': 'rgb(112,0,0)'},
	    xaxis='x2',
	)
	data = [trace1, trace2, trace3, trace4]

	# Main axes take 85% of each dimension; the secondary axes hosting the
	# marginal histograms take the remaining strip.
	layout = go.Layout(
	    showlegend=False,
	    autosize=False,
	    width=600,
	    height=550,
	    xaxis={'domain': [0, 0.85], 'showgrid': False, 'zeroline': False},
	    yaxis={'domain': [0, 0.85], 'showgrid': False, 'zeroline': False},
	    margin={'t': 50},
	    hovermode='closest',
	    bargap=0,
	    xaxis2={'domain': [0.85, 1], 'showgrid': False, 'zeroline': False},
	    yaxis2={'domain': [0.85, 1], 'showgrid': False, 'zeroline': False},
	)

	# Bare expression: the figure is built but not shown when run as a script.
	go.Figure(data=data, layout=layout)


	import scattertext as st
	import spacy
	# Blank English pipeline with only a sentence splitter attached.
	nlp = spacy.blank("en")
	nlp.add_pipe('sentencizer')

	# Scattertext corpus keyed by labelled sentiment, built from the cleaned text.
	corpus = st.CorpusFromPandas(
	    df,
	    category_col='sentiment',
	    text_col='Review Text',
	    nlp=nlp,
	).build()
	print(list(corpus.get_scaled_f_scores_vs_background().index[:20]))

	# Add one scaled-F-score column per sentiment class. The list(...) lines
	# are notebook-style echoes of the 20 top-ranked terms for that class and
	# produce no output when run as a script.
	term_freq_df = corpus.get_term_freq_df()
	for category, column in (
	    ('positive', 'positive_sentiment'),
	    ('neutral', 'neutral_sentiment'),
	    ('negative', 'negative_sentiment'),
	):
	    term_freq_df[column] = corpus.get_scaled_f_scores(category)
	    list(term_freq_df.sort_values(by=column, ascending=False).index[:20])

	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.decomposition import TruncatedSVD
	from collections import Counter

	# Number of latent topics; also read as a global by the topic helper below.
	n_topics = 10

	# TF-IDF document-term matrix over the cleaned text, English stop words removed.
	reindexed_data = df['Review Text'].values
	tfidf_vectorizer = TfidfVectorizer(
	    stop_words='english',
	    use_idf=True,
	    smooth_idf=True,
	)
	document_term_matrix = tfidf_vectorizer.fit_transform(reindexed_data)

	# Truncated SVD of the TF-IDF matrix: one column of weights per topic.
	lsa_model = TruncatedSVD(n_components=n_topics)
	lsa_topic_matrix = lsa_model.fit_transform(document_term_matrix)

	def get_keys(topic_matrix):
	    """Return, for each row of *topic_matrix*, the column index of its largest entry.

	    The result is a plain Python list with one integer topic id per row.
	    """
	    return topic_matrix.argmax(axis=1).tolist()

	def keys_to_counts(keys):
	    """Tally *keys* and return ``(categories, counts)``.

	    ``categories`` lists each distinct key in order of first appearance and
	    ``counts`` holds the matching number of occurrences.
	    """
	    tally = Counter(keys)
	    return (list(tally.keys()), list(tally.values()))

	# Dominant topic id for every document (arg-max over the LSA weights) ...
	lsa_keys = get_keys(lsa_topic_matrix)
	# ... and how many documents fall into each topic id.
	lsa_categories, lsa_counts = keys_to_counts(lsa_keys)

	def get_top_n_words(n, keys, document_term_matrix, tfidf_vectorizer, num_topics=None):
	    """Return one string per topic holding its *n* highest-weighted terms.

	    Parameters
	    ----------
	    n : int
	        Number of terms to report per topic.
	    keys : sequence of int
	        Topic id assigned to each document (row) of *document_term_matrix*.
	    document_term_matrix : sparse matrix
	        Document-term weights, one row per document.
	    tfidf_vectorizer : object
	        Fitted vectorizer; only its ``inverse_transform`` is used, to map a
	        column index back to its term.
	    num_topics : int, optional
	        Number of topics to report. Defaults to the module-level
	        ``n_topics``, which the original implementation read implicitly.

	    Returns
	    -------
	    list of str
	        ``num_topics`` strings of space-joined terms, heaviest term first.
	        A topic with no documents yields an empty string.
	    """
	    if num_topics is None:
	        num_topics = n_topics
	    keys = np.asarray(keys)
	    n_features = document_term_matrix.shape[1]

	    top_words = []
	    for topic in range(num_topics):
	        rows = np.flatnonzero(keys == topic)
	        if rows.size == 0:
	            # No document has this topic as its arg-max; the original summed
	            # into the int 0 and then crashed on ``.toarray()``.
	            top_words.append("")
	            continue
	        # Total weight of every term over the topic's documents.
	        weights = np.asarray(document_term_matrix[rows].sum(axis=0)).ravel()
	        heaviest_first = np.argsort(weights)[-n:][::-1]

	        topic_words = []
	        for index in heaviest_first:
	            # A one-hot row makes inverse_transform return just this term.
	            one_hot = np.zeros((1, n_features))
	            one_hot[0, index] = 1
	            the_word = tfidf_vectorizer.inverse_transform(one_hot)[0][0]
	            # Keep the term as-is: forcing it through ASCII raised
	            # UnicodeEncodeError on any accented token.
	            topic_words.append(str(the_word))
	        top_words.append(" ".join(topic_words))
	    return top_words

	# Three heaviest terms of every LSA topic.
	top_lsa = get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)

	for topic_number, topic_terms in enumerate(top_lsa, start=1):
	    print("Topic {}: ".format(topic_number), topic_terms)

	# The chart labels need exactly the list computed above; reuse it rather
	# than repeating the full pass over the document-term matrix.
	top_3_words = top_lsa
	labels = ['Topic {}: \n'.format(i+1) + top_3_words[i] for i in lsa_categories]

	# Bar chart: number of documents whose dominant topic is each topic id.
	fig, ax = plt.subplots(figsize=(16,8))
	ax.bar(lsa_categories, lsa_counts, color="skyblue")
	ax.set_xticks(lsa_categories)
	ax.set_xticklabels(labels, rotation=45, rotation_mode='default', color="olive")
	ax.set_ylabel('Number of review text on topics')
	ax.set_title('Count of LSA topics')
	plt.show()

	"""#---2----"""

	# Notebook-style echo of the class balance (no output when run as a script).
	df['sentiment'].value_counts()

	from sklearn.model_selection import train_test_split

	# Hold out 20% of the rows for evaluation.
	train, eva = train_test_split(df, test_size=0.2)


	from simpletransformers.classification import ClassificationModel

	# Three-class BERT classifier from the 'bert-base-cased' checkpoint, CPU only.
	model_args = {'reprocess_input_data': True, 'overwrite_output_dir': True}
	model = ClassificationModel(
	    'bert',
	    'bert-base-cased',
	    num_labels=3,
	    args=model_args,
	    use_cuda=False,
	)

	# Label ids: 0 = positive, 1 = negative, 2 = neutral
	def making_label(st):
	    """Map a sentiment string to its integer class id.

	    'positive' -> 0, 'neutral' -> 2, anything else -> 1.
	    """
	    if st == 'positive':
	        return 0
	    if st == 'neutral':
	        return 2
	    return 1

	# Attach the integer class id to both splits.
	for split in (train, eva):
	    split['label'] = split['sentiment'].apply(making_label)
	print(train.shape)

	# Two-column text/label frames for the model: the first 1500 training rows
	# and the last 400 evaluation rows, with newlines in the text replaced by
	# spaces.
	train_df = pd.DataFrame({
	    'text': train['news'].iloc[:1500].replace(r'\n', ' ', regex=True),
	    'label': train['label'].iloc[:1500],
	})

	eval_df = pd.DataFrame({
	    'text': eva['news'].iloc[-400:].replace(r'\n', ' ', regex=True),
	    'label': eva['label'].iloc[-400:],
	})

	# Fine-tune the classifier on the training frame built above.
	model.train_model(train_df)

	# Evaluate on the held-out frame; the three returned values are kept for
	# the metrics computed below.
	result, model_outputs, wrong_predictions = model.eval_model(eval_df)

	# The bare expressions below are notebook-style echoes; they produce no
	# output when the file is run as a script.
	result

	model_outputs

	len(wrong_predictions)

	# Predicted class id per example = index of the largest model output.
	lst = [np.argmax(arr) for arr in model_outputs]

	true = eval_df['label'].tolist()
	predicted = lst

	# Import the submodule explicitly; a bare `import sklearn` does not
	# guarantee that `sklearn.metrics` is loaded.
	import sklearn.metrics

	# Class ids as assigned by making_label: 0 = positive, 1 = negative,
	# 2 = neutral. Passing them explicitly keeps the matrix 3x3 even when a
	# class is missing from the evaluation sample.
	class_ids = [0, 1, 2]
	class_names = ['positive', 'negative', 'neutral']

	mat = sklearn.metrics.confusion_matrix(true, predicted, labels=class_ids)
	mat

	df_cm = pd.DataFrame(mat, range(3), range(3))

	# fmt='d' prints the counts as integers instead of scientific notation.
	sns.heatmap(df_cm, annot=True, fmt='d')
	plt.show()

	# target_names must follow the id order; the previous
	# ['positive', 'neutral', 'negative'] swapped the neutral and negative rows.
	print(sklearn.metrics.classification_report(
	    true, predicted, labels=class_ids, target_names=class_names))

	sklearn.metrics.accuracy_score(true, predicted)

	#Give your statement
	def get_result(statement):
	    """Classify *statement* with the fine-tuned model and print its sentiment.

	    Parameters
	    ----------
	    statement : str
	        Text to classify.

	    Returns
	    -------
	    str
	        'positive', 'negative' or 'neutral'. The label is also printed, as
	        before; callers that ignored the former ``None`` are unaffected.
	    """
	    result = model.predict([statement])
	    # result[1][0] holds the per-class outputs for the single input.
	    # argmax always yields one index, whereas the former
	    # int(np.where(scores == max)[0]) raised on tied scores and relies on
	    # the deprecated array-to-int conversion.
	    pos = int(np.argmax(result[1][0]))
	    sentiment_dict = {0:'positive',1:'negative',2:'neutral'}
	    label = sentiment_dict[pos]
	    print(label)
	    return label

	# Spot checks of the fine-tuned classifier; each statement is classified
	# and its predicted label printed, in this order.
	demo_statements = [
	    ## neutral statement
	    "According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",
	    ## positive statement
	    "According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .",
	    ## negative statement
	    "Sales in Finland decreased by 2.0 % , and international sales decreased by 9.3 % in terms of euros , and by 15.1 % in terms of local currencies .",
	    "This company is growing like anything with 23% profit every year",
	    "This company is not able to make any profit but make very less profit in last quarter",
	    "The doctor treated well and the patient was very healthy",
	    "the act of politicians is to serve and help needy and not to create ruck suck",
	    "American burger is too good. Can't resisit to go and have one",
	    "GDP per capita increased to double in India from 2013",
	    "Indian economy is doing very good and will become super power one day.",
	    "Indian economy is doing very good and will create millions of jobs in coming years",
	    "Indian economy is not doing very good and need urgent reforms but we are pretty sure it will be very good in coming years",
	    "Indian economy is doing very good.Indian economy is not doing very good ",
	    "Indian economy is not doing very good. Indian economy will bounce back to become leading economy",
	    "Indian economy is not doing very good. Urgent reforms is required to create new jobs and improve export",
	    "The stock market of Indian economy is dangling too much",
	]
	for demo_statement in demo_statements:
	    get_result(demo_statement)

	"""#VADER"""

	from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

	obj = SentimentIntensityAnalyzer()

	# Three introductory sentences: score each one and print the score dict.
	for sentence in ("Ram is really good ", "Ram is better ", "Rahul is really bad"):
	    sentiment_dict = obj.polarity_scores(sentence)
	    print(sentiment_dict)

	# Probe sentences grouped by the feature they vary; scores are printed in
	# this order.
	vader_probes = (
	    # punctuation
	    'Ram is good boy',
	    'Ram is good boy!',
	    'Ram is good boy!!',
	    # capitalization
	    'Ram is good',
	    'Ram is GOOD',
	    # degree
	    'Ram is good',
	    'Ram is better',
	    'Ram is best',
	    'Ram is bad',
	    'Ram is worse',
	    'Ram is worst',
	    # conjunction
	    'Ram is good',
	    'Ram is good, but he is also naughty sometimes',
	    # slang
	    "That Hotel",
	    "That Hotel SUX",
	    "That Hotel SUCKS",
	    # emoticons
	    "Your :) is the most beautiful thing I have ever seen",
	    "Your smile is the most beautiful thing I have ever seen",
	    "Your :( is the worst thing I have ever seen",
	    "Your smile is the worst thing I have ever seen",
	)
	for probe in vader_probes:
	    print(obj.polarity_scores(probe))

	#https://360digitmg.com/blog/bert-variants-and-their-differences
	#https://simpletransformers.ai/docs/classification-specifics/#supported-model-types Official reference

	"""#3.a Using FINBERT Model"""

	#PPT
	#https://medium.com/@benjamin_joesy/finbert-financial-sentiment-analysis-with-bert-acf695b64ac6

	from transformers import BertTokenizer, BertForSequenceClassification, pipeline

	# tested in transformers==4.18.0
	import transformers
	transformers.__version__

	# Three-label 'finbert-tone' checkpoint wrapped in a text-classification pipeline.
	tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
	finbert = BertForSequenceClassification.from_pretrained(
	    'yiyanghkust/finbert-tone',
	    num_labels=3,
	)
	nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)

	tone_examples = [
	    'growth is strong and we have plenty of liquidity.',
	    'there is a shortage of capital, and we need extra financing.',
	    'formulation patents might protect Vasotec to a limited extent.',
	]
	results = nlp(tone_examples)

	# Notebook-style echo (no output when run as a script).
	results

	"""#FINBERT ESG"""

	# Four-label 'finbert-esg' checkpoint wrapped in a text-classification pipeline.
	tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg')
	finbert = BertForSequenceClassification.from_pretrained(
	    'yiyanghkust/finbert-esg',
	    num_labels=4,
	)
	nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)

	esg_examples = [
	    'Managing and working to mitigate the impact our operations have on the environment is a core element of our business.',
	    'Rhonda has been volunteering for several years for a variety of charitable community programs.',
	    "Cabot's annual statements are audited annually by an independent registered public accounting firm.",
	    'As of December 31, 2012, the 2011 Term Loan had a principal balance of $492.5 million.',
	]
	results = nlp(esg_examples)

	# Notebook-style echo (no output when run as a script).
	results

	"""#FINBERT Classification"""

	# Three-label 'finbert-fls' checkpoint wrapped in a text-classification pipeline.
	tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-fls')
	finbert = BertForSequenceClassification.from_pretrained(
	    'yiyanghkust/finbert-fls',
	    num_labels=3,
	)
	nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)

	fls_examples = [
	    'we expect the age of our fleet to enhance availability and reliability due to reduced downtime for repairs.',
	    'on an equivalent unit of production basis, general and administrative expenses declined 24 percent from 1994 to $.67 per boe.',
	    'we will continue to assess the need for a valuation allowance against deferred tax assets considering all available evidence obtained in',
	]
	results = nlp(fls_examples)

	# Notebook-style echo (no output when run as a script).
	results

	# Cleaned texts and their labelled sentiment, reused by every model below.
	X = df['Review Text'].to_list()
	y = df['sentiment'].to_list()

	import torch
	from transformers import BertTokenizer, BertForSequenceClassification

	finbert_whole = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
	tokenizer_whole = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

	# Output index -> sentiment name used for this checkpoint.
	labels = {0:'neutral', 1:'positive',2:'negative'}

	sent_val = list()
	# Inference only: no_grad avoids building an autograd graph for each of
	# the forward passes.
	with torch.no_grad():
	    for x in X:
	        # Truncate to 512 tokens so an unusually long text cannot exceed
	        # the model's position limit and abort the whole run.
	        inputs = tokenizer_whole(x, return_tensors="pt", padding=True,
	                                 truncation=True, max_length=512)
	        outputs = finbert_whole(**inputs)[0]

	        val = labels[np.argmax(outputs.detach().numpy())]
	        print(x, '---->', val)
	        print('#######################################################')
	        sent_val.append(val)

	from sklearn.metrics import accuracy_score
	print(accuracy_score(y, sent_val))

	"""#Using DISTILBERT"""

	from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

	tokenizer_distilbert = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
	# num_labels=3 matches the three-entry label map below; with the default
	# two outputs, index 2 ('negative') could never be predicted.
	# NOTE(review): this is a base checkpoint, so the classification head is
	# presumably untrained -- treat the score as a baseline (TODO confirm).
	model_distilbert = DistilBertForSequenceClassification.from_pretrained(
	    "distilbert-base-uncased", num_labels=3)

	labels = {0:'neutral', 1:'positive',2:'negative'}

	sent_val_bert = list()
	for x in X:
	    inputs = tokenizer_distilbert(x, return_tensors="pt", padding=True,
	                                  truncation=True, max_length=512)
	    outputs = model_distilbert(**inputs)[0]

	    val = labels[np.argmax(outputs.detach().numpy())]
	    print(x, '---->', val)
	    print('#######################################################')
	    sent_val_bert.append(val)

	from sklearn.metrics import accuracy_score
	# Score this model's own predictions; the previous call passed `sent_val`,
	# i.e. it re-printed the FinBERT accuracy.
	print(accuracy_score(y, sent_val_bert))

	"""#Bert"""

	# "bert-base-uncased" is a BERT checkpoint and must be loaded with the
	# BERT classes; the DistilBert classes used before do not match its
	# architecture.
	# NOTE(review): as a base checkpoint its classification head is presumably
	# untrained -- treat the score as a baseline (TODO confirm).
	tokenizer_bert = BertTokenizer.from_pretrained("bert-base-uncased")
	# num_labels=3 matches the three-entry label map below.
	model_bert = BertForSequenceClassification.from_pretrained(
	    "bert-base-uncased", num_labels=3)

	labels = {0:'neutral', 1:'positive',2:'negative'}

	sent_val_bert1 = list()
	for x in X:
	    inputs = tokenizer_bert(x, return_tensors="pt", padding=True,
	                            truncation=True, max_length=512)
	    outputs = model_bert(**inputs)[0]

	    val = labels[np.argmax(outputs.detach().numpy())]
	    print(x, '---->', val)
	    print('#######################################################')
	    sent_val_bert1.append(val)

	from sklearn.metrics import accuracy_score
	# Score this model's own predictions; the previous call passed `sent_val`,
	# i.e. it re-printed the FinBERT accuracy.
	print(accuracy_score(y, sent_val_bert1))