# -*- coding: utf-8 -*-
"""Survey_Analysis_v_3.2.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1UtAdINgLRkpdKGCzhEIPR8ZgK1u_dMtD
"""

#1 - https://www.kaggle.com/code/ramjasmaurya/financial-sentiment-analysis
#2 - https://www.kaggle.com/code/adarshbiradar/sentiment-analysis-using-bert

# Notebook shell commands, commented out so this file parses as plain Python.
# Run them in a terminal (or a notebook cell) before executing the script.
# !pip install streamlit
# !pip install pygal
# !pip install squarify

import warnings

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import pygal as py
import seaborn as sns
import squarify as sq
import streamlit
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob

warnings.filterwarnings("ignore", category=FutureWarning)

plt.rcParams["figure.figsize"] = (20, 15)
matplotlib.rc('xtick', labelsize=7)
matplotlib.rc('ytick', labelsize=7)
# NOTE(review): the notebook asked for family 'normal', which is not one of
# matplotlib's generic families and is unlikely to be an installed font name;
# 'sans-serif' requests the default family explicitly - confirm the intent.
font = {'family': 'sans-serif', 'weight': 'bold', 'size': 5}
matplotlib.rc('font', **font)

# %matplotlib inline


def preprocess(ReviewText):
    """Strip leftover HTML markup and entities from a Series of texts.

    Args:
        ReviewText: pandas Series of strings.

    Returns:
        A new Series with line-break tags, anchor elements and the
        ``&amp`` / ``&gt`` / ``&lt`` entities removed, and non-breaking
        spaces replaced by ordinary spaces.
    """
    # NOTE(review): the exported file had these patterns with their HTML
    # stripped out ("(\n)", '().*()', '(&)', '(>)', '(<)'); as a regex '().*()'
    # erases the whole text. The patterns below are a reconstruction - confirm
    # against the original notebook.
    markup_patterns = (
        r'(<br/>)',
        r'(<a).*(>).*(</a>)',
        r'(&amp)',
        r'(&gt)',
        r'(&lt)',
    )
    for pattern in markup_patterns:
        # regex=True is spelled out because pandas 2 changed the default to False.
        ReviewText = ReviewText.str.replace(pattern, '', regex=True)
    return ReviewText.str.replace('\xa0', ' ', regex=False)


def _print_random_reviews(frame, mask, count):
    """Print up to *count* randomly chosen 'Review Text' values selected by *mask*."""
    matches = frame.loc[mask, 'Review Text']
    # Series.sample raises ValueError when asked for more rows than exist.
    for review in matches.sample(min(count, len(matches))):
        print(review)


# The notebook let pandas consume the first CSV row as the header and then
# appended that row back as data, i.e. the file has no header row. Reading it
# with header=None avoids DataFrame.append / set_axis(inplace=...), which were
# removed in pandas 2, and the hard-coded row index.
df = pd.read_csv("/content/gen-data.csv", engine="python", encoding="ISO-8859-1",
                 header=None, names=['sentiment', 'news'])
df

# A UTF-8 byte-order mark decoded as ISO-8859-1 is glued to the first label.
df = df.replace("ï»¿neutral", "neutral")

sns.countplot(y="sentiment", data=df)
plt.show()

df.isnull().sum()

df['Review Text'] = preprocess(df['news'])
df['polarity'] = df['news'].map(lambda text: TextBlob(text).sentiment.polarity)
df['news_len'] = df['news'].astype(str).apply(len)
df['word_count'] = df['news'].apply(lambda x: len(str(x).split()))
df

print('top 4 random reviews with the highest positive sentiment polarity: \n')
df1 = df.drop_duplicates(subset=['Review Text'])
_print_random_reviews(df1, df1.polarity == 1, 4)

print('5 random reviews with the most neutral sentiment(zero) polarity: \n')
_print_random_reviews(df, df.polarity == 0, 5)

print('5 reviews with the most negative polarity having polarity lesser than -0.80: \n')
_print_random_reviews(df, df.polarity <= -0.80, 5)

# The series is passed as x= explicitly: the first positional argument of
# seaborn.boxplot is `data` in current releases.
sns.boxplot(x="polarity", palette="rainbow", data=df)
plt.show()

df['polarity'].plot(
    kind='hist',
    bins=50,
    color="peru",
    title='Sentiment Polarity Distribution')
plt.show()

p_s = df[df["polarity"] > 0].count()["sentiment"]
neu_s = df[df["polarity"] == 0].count()["sentiment"]
neg_s = df[df["polarity"] < 0].count()["sentiment"]

# Setting labels for items in Chart
sentiment = ['positive_sentiment', "neutral_sentiment", "negative_sentiment"]

# Setting size in Chart based on given values
values = [p_s, neu_s, neg_s]

# colors
colors = ['#FF0000', 'olive', '#FFFF00']

# explosion
explode = (0.05, 0.05, 0.05)

# Pie Chart
plt.pie(values, colors=colors, labels=sentiment,
        autopct='%1.1f%%', pctdistance=0.85,
        explode=explode)

# draw circle (turns the pie into a donut)
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()

# Adding Circle in Pie chart
fig.gca().add_artist(centre_circle)

# Adding Title of chart
plt.title('count of polarity as per sentiment')
# Displaying Chart
plt.show()

df.plot.box(y=["word_count"], color="hotpink")

df['word_count'].plot(
    kind='hist',
    bins=100,
    color="orange",
    title='Review Text Word Count Distribution')
plt.show()

sns.boxenplot(x="news_len", data=df)
plt.show()

# news_len is the character length of each text, not a word count, so the
# title no longer repeats the word-count heading used above.
df['news_len'].plot(
    kind='hist',
    bins=50,
    color="lightblue",
    title='Review Text Length Distribution')
plt.show()

fig = px.scatter(df, x="news_len", y="word_count", color="sentiment",
                 marginal_x="box", marginal_y="violin",
                 title="Click on the legend items!")
fig.show()


def _top_ngrams(corpus, n, ngram_range, stop_words):
    """Return the *n* most frequent n-grams of *corpus* as (term, count) pairs."""
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words)
    totals = vec.fit_transform(corpus).sum(axis=0)
    freqs = [(term, totals[0, idx]) for term, idx in vec.vocabulary_.items()]
    freqs.sort(key=lambda pair: pair[1], reverse=True)
    return freqs[:n]


def get_top_n_words(corpus, n=None, stop_words=None):
    """Return the *n* most frequent single words in *corpus*.

    Args:
        corpus: iterable of text documents.
        n: number of (word, count) pairs to return; None returns all.
        stop_words: forwarded to CountVectorizer (e.g. 'english'); None keeps
            every word.

    Returns:
        List of (word, count) tuples sorted by descending count.
    """
    return _top_ngrams(corpus, n, (1, 1), stop_words)


def get_top_n_bigram(corpus, n=None, stop_words=None):
    """Return the *n* most frequent two-word sequences in *corpus*.

    Arguments and return value are as for get_top_n_words.
    """
    return _top_ngrams(corpus, n, (2, 2), stop_words)


def get_top_n_trigram(corpus, n=None, stop_words=None):
    """Return the *n* most frequent three-word sequences in *corpus*.

    Arguments and return value are as for get_top_n_words.
    """
    return _top_ngrams(corpus, n, (3, 3), stop_words)


def _report_top_terms(common_words, title):
    """Print each (term, count) pair, bar-plot the counts and return them as a DataFrame."""
    for word, freq in common_words:
        print(word, freq)
    frame = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
    frame.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
        kind='bar', title=title)
    plt.show()
    return frame


# The notebook defined every n-gram function twice (without and with English
# stop words) under the same name; the stop-word variant is now a keyword.
common_words = get_top_n_words(df['Review Text'], 20)
df1 = _report_top_terms(common_words, 'Top 20 words in review before removing stop words')
df1

common_words = get_top_n_words(df['Review Text'], 20, stop_words='english')
df2 = _report_top_terms(common_words, 'Top 20 words in review after removing stop words')

common_words = get_top_n_bigram(df['Review Text'], 20)
df3 = _report_top_terms(common_words, 'Top 20 bigrams in review before removing stop words')

common_words = get_top_n_bigram(df['Review Text'], 20, stop_words='english')
df4 = _report_top_terms(common_words, 'Top 20 bigrams in review after removing stop words')

common_words = get_top_n_trigram(df['Review Text'], 20)
df5 = _report_top_terms(common_words, 'Top 20 trigrams in review before removing stop words')

common_words = get_top_n_trigram(df['Review Text'], 20, stop_words='english')
df6 = _report_top_terms(common_words, 'Top 20 trigrams in review after removing stop words')

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

# str(Series) is the truncated console repr (index numbers, "...", dtype
# footer), so the texts are joined explicitly to tag the actual corpus.
blob = TextBlob(" ".join(df['Review Text'].astype(str)))
pos_df = pd.DataFrame(blob.tags, columns=['word', 'pos'])
pos_df = pos_df.pos.value_counts()[:20]
pos_df.plot(
    kind='bar',
    title='Top 20 Part-of-speech tagging for review corpus')
plt.show()


def _sentiment_box_traces(column):
    """Build one plotly Box trace of df[column] per sentiment class."""
    class_colors = (
        ('positive', 'rgb(214, 12, 140)'),
        ('negative', 'rgb(0, 128, 128)'),
        ('neutral', 'rgb(10, 140, 208)'),
    )
    return [
        go.Box(
            y=df.loc[df['sentiment'] == name][column],
            name=name,
            marker=dict(color=color),
        )
        for name, color in class_colors
    ]


data = _sentiment_box_traces('polarity')
layout = go.Layout(
    title="Polarity Boxplot according to sentiment"
)
go.Figure(data=data, layout=layout).show()

# `data` and `layout` are rendered by the go.Figure call that follows.
data = _sentiment_box_traces('news_len')
layout = go.Layout(
    title="news length Boxplot by sentiment"
)
go.Figure(data=data, layout=layout).show()

xp = df.loc[df['sentiment'] == "positive", 'polarity']
xneu = df.loc[df['sentiment'] == "neutral", 'polarity']
xneg = df.loc[df['sentiment'] == "negative", 'polarity']

trace1 = go.Histogram(x=xp, name='positive', opacity=0.75)
trace2 = go.Histogram(x=xneu, name='neutral', opacity=0.75)
trace3 = go.Histogram(x=xneg, name='negative', opacity=0.75)

data = [trace1, trace2, trace3]
layout = go.Layout(barmode='overlay', title='Distribution of Sentiment polarity')
go.Figure(data=data, layout=layout).show()


def _polarity_joint_figure(y_column, y_density_name, ncontours, hovermode, y_hist_color):
    """Build a scatter + density-contour figure of polarity against df[y_column].

    Marginal histograms of both variables are placed on secondary axes along
    the top and right edges.
    """
    dark_red = 'rgb(102,0,0)'
    traces = [
        go.Scatter(
            x=df['polarity'], y=df[y_column], mode='markers', name='points',
            marker=dict(color=dark_red, size=2, opacity=0.4)
        ),
        go.Histogram2dContour(
            x=df['polarity'], y=df[y_column], name='density', ncontours=ncontours,
            colorscale='Hot', reversescale=True, showscale=False
        ),
        go.Histogram(
            x=df['polarity'], name='Sentiment polarity density',
            marker=dict(color=dark_red),
            yaxis='y2'
        ),
        go.Histogram(
            y=df[y_column], name=y_density_name,
            marker=dict(color=y_hist_color),
            xaxis='x2'
        ),
    ]
    main_axis = dict(domain=[0, 0.85], showgrid=False, zeroline=False)
    margin_axis = dict(domain=[0.85, 1], showgrid=False, zeroline=False)
    figure_layout = go.Layout(
        showlegend=False,
        autosize=False,
        width=600,
        height=550,
        xaxis=dict(main_axis),
        yaxis=dict(main_axis),
        margin=dict(t=50),
        hovermode=hovermode,
        bargap=0,
        xaxis2=dict(margin_axis),
        yaxis2=dict(margin_axis),
    )
    return go.Figure(data=traces, layout=figure_layout)


_polarity_joint_figure('news_len', 'news length density', 50, 'x unified',
                       'rgb(102,0,0)').show()
# NOTE(review): the notebook used rgb(112,0,0) for this one marginal
# histogram; kept as written, though it may be a typo for rgb(102,0,0).
_polarity_joint_figure('word_count', 'word count density', 20, 'closest',
                       'rgb(112,0,0)').show()

# Notebook shell commands, commented out so this file parses as plain Python.
# !pip install scattertext
# !pip install spacy

import scattertext as st
import spacy

nlp = spacy.blank("en")
nlp.add_pipe('sentencizer')
corpus = st.CorpusFromPandas(df, category_col='sentiment', text_col='Review Text', nlp=nlp).build()

print(list(corpus.get_scaled_f_scores_vs_background().index[:20]))

term_freq_df = corpus.get_term_freq_df()
for category in ('positive', 'neutral', 'negative'):
    score_column = category + '_sentiment'
    term_freq_df[score_column] = corpus.get_scaled_f_scores(category)
    print(list(term_freq_df.sort_values(by=score_column, ascending=False).index[:20]))

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from collections import Counter

tfidf_vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True)
reindexed_data = df['Review Text'].values
document_term_matrix = tfidf_vectorizer.fit_transform(reindexed_data)

n_topics = 10
lsa_model = TruncatedSVD(n_components=n_topics)
lsa_topic_matrix = lsa_model.fit_transform(document_term_matrix)


def get_keys(topic_matrix):
    '''
    returns an integer list of predicted topic
    categories for a given topic matrix
    '''
    return topic_matrix.argmax(axis=1).tolist()


def keys_to_counts(keys):
    '''
    returns a tuple of topic categories and their
    accompanying magnitudes for a given list of keys
    '''
    tally = Counter(keys)
    return (list(tally.keys()), list(tally.values()))


lsa_keys = get_keys(lsa_topic_matrix)
lsa_categories, lsa_counts = keys_to_counts(lsa_keys)


def get_top_n_words(n, keys, document_term_matrix, tfidf_vectorizer, num_topics=None):
    '''
    returns a list of num_topics strings, where each string contains the n
    highest-weighted words of one predicted category, in descending order

    n: number of words per topic
    keys: predicted topic id for every row of document_term_matrix
    document_term_matrix: document-by-term weights (sparse or dense)
    tfidf_vectorizer: fitted vectorizer whose vocabulary_ maps terms to columns
    num_topics: number of topic ids to report; defaults to the module-level
        n_topics. A topic with no documents yields an empty string.
    '''
    if num_topics is None:
        num_topics = n_topics
    # Column index -> term. Looking the term up directly keeps non-ASCII
    # words, which the former encode('ascii') round-trip rejected.
    index_to_word = {idx: word for word, idx in tfidf_vectorizer.vocabulary_.items()}
    topic_of_doc = np.asarray(keys)

    top_words = []
    for topic in range(num_topics):
        member_rows = np.flatnonzero(topic_of_doc == topic)
        if member_rows.size == 0:
            # No document was assigned to this topic; keep the slot so the
            # list can still be indexed by topic id.
            top_words.append("")
            continue
        weights = np.asarray(document_term_matrix[member_rows].sum(axis=0)).ravel()
        best_columns = np.argsort(weights)[-n:][::-1]
        top_words.append(" ".join(index_to_word[int(col)] for col in best_columns))
    return top_words


top_lsa = get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)

for topic_number, topic_words in enumerate(top_lsa, start=1):
    print("Topic {}: ".format(topic_number), topic_words)

top_3_words = top_lsa
labels = ['Topic {}: \n'.format(i+1) + top_3_words[i] for i in lsa_categories]

fig, ax = plt.subplots(figsize=(16, 8))
ax.bar(lsa_categories, lsa_counts, color="skyblue")
ax.set_xticks(lsa_categories)
ax.set_xticklabels(labels, rotation=45, rotation_mode='default', color="olive")
ax.set_ylabel('Number of review text on topics')
ax.set_title('Count of LSA topics')
plt.show()

"""#---2----"""

df['sentiment'].value_counts()

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
train, eva = train_test_split(df, test_size=0.2)

# Notebook shell command, commented out so this file parses as plain Python.
# !pip install simpletransformers

from simpletransformers.classification import ClassificationModel

# Create a Transformer Model BERT
model = ClassificationModel('bert', 'bert-base-cased', num_labels=3,
                            args={'reprocess_input_data': True, 'overwrite_output_dir': True},
                            use_cuda=False)

# 0,1,2 : positive,negative,neutral
_LABEL_IDS = {'positive': 0, 'negative': 1, 'neutral': 2}
_LABEL_NAMES = ['positive', 'negative', 'neutral']


def making_label(st):
    """Map a sentiment string to its class id.

    'positive' -> 0, 'neutral' -> 2, anything else -> 1 (negative).
    """
    return _LABEL_IDS.get(st, 1)


# assign() builds new frames, so the label column is not written into a slice
# of df returned by train_test_split.
train = train.assign(label=train['sentiment'].apply(making_label))
eva = eva.assign(label=eva['sentiment'].apply(making_label))

print(train.shape)

train_df = pd.DataFrame({
    'text': train['news'][:1500].replace(r'\n', ' ', regex=True),
    'label': train['label'][:1500]
})

eval_df = pd.DataFrame({
    'text': eva['news'][-400:].replace(r'\n', ' ', regex=True),
    'label': eva['label'][-400:]
})

model.train_model(train_df)

result, model_outputs, wrong_predictions = model.eval_model(eval_df)

result

model_outputs

len(wrong_predictions)

lst = [int(np.argmax(arr)) for arr in model_outputs]

true = eval_df['label'].tolist()
predicted = lst

# labels= pins the class order so the matrix stays 3x3 even when a class is
# missing from this evaluation slice.
mat = confusion_matrix(true, predicted, labels=[0, 1, 2])
mat

df_cm = pd.DataFrame(mat, index=_LABEL_NAMES, columns=_LABEL_NAMES)

sns.heatmap(df_cm, annot=True)
plt.show()

# target_names follow the class ids (0, 1, 2); the notebook listed 'neutral'
# before 'negative', which swapped the two rows of the report.
print(classification_report(true, predicted, labels=[0, 1, 2], target_names=_LABEL_NAMES))

print(accuracy_score(true, predicted))


#Give your statement
def get_result(statement):
    """Print the sentiment the fine-tuned model predicts for one statement.

    Args:
        statement: text to classify.

    Returns:
        None; the label ('positive', 'negative' or 'neutral') is printed.
    """
    result = model.predict([statement])
    # result[1] holds the raw model outputs; argmax returns a single index
    # even when two scores tie, where int(np.where(...)[0]) would fail.
    pos = int(np.argmax(result[1][0]))
    print(_LABEL_NAMES[pos])
    return


_demo_statements = (
    ## neutral statement
    "According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",
    ## positive statement
    "According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .",
    ## negative statement
    'Sales in Finland decreased by 2.0 % , and international sales decreased by 9.3 % in terms of euros , and by 15.1 % in terms of local currencies .',
    "This company is growing like anything with 23% profit every year",
    "This company is not able to make any profit but make very less profit in last quarter",
    "The doctor treated well and the patient was very healthy",
    "the act of politicians is to serve and help needy and not to create ruck suck",
    "American burger is too good. Can't resisit to go and have one",
    "GDP per capita increased to double in India from 2013",
    "Indian economy is doing very good and will become super power one day.",
    "Indian economy is doing very good and will create millions of jobs in coming years",
    "Indian economy is not doing very good and need urgent reforms but we are pretty sure it will be very good in coming years",
    "Indian economy is doing very good.Indian economy is not doing very good ",
    "Indian economy is not doing very good. Indian economy will bounce back to become leading economy",
    "Indian economy is not doing very good. Urgent reforms is required to create new jobs and improve export",
    "The stock market of Indian economy is dangling too much",
)
for statement in _demo_statements:
    get_result(statement)

"""#VADER"""

# Notebook shell command, commented out so this file parses as plain Python.
# !pip install vaderSentiment

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

obj = SentimentIntensityAnalyzer()

# The second sentence was marked "check this" in the notebook.
for sentence in ("Ram is really good ", "Ram is better ", "Rahul is really bad"):
    sentiment_dict = obj.polarity_scores(sentence)
    print(sentiment_dict)

_vader_examples = (
    #punctuation
    'Ram is good boy',
    'Ram is good boy!',
    'Ram is good boy!!',
    #capitalization
    'Ram is good',
    'Ram is GOOD',
    #degree
    'Ram is good',
    'Ram is better',
    'Ram is best',
    'Ram is bad',
    'Ram is worse',
    'Ram is worst',
    #conjunction
    'Ram is good',
    'Ram is good, but he is also naughty sometimes',
    #slang
    "That Hotel",
    "That Hotel SUX",
    "That Hotel SUCKS",
    #emoticons
    "Your :) is the most beautiful thing I have ever seen",
    "Your smile is the most beautiful thing I have ever seen",
    "Your :( is the worst thing I have ever seen",
    "Your smile is the worst thing I have ever seen",
)
for example in _vader_examples:
    print(obj.polarity_scores(example))

#https://360digitmg.com/blog/bert-variants-and-their-differences

#https://simpletransformers.ai/docs/classification-specifics/#supported-model-types  Official reference

"""#3.a Using FINBERT Model"""

#PPT
#https://medium.com/@benjamin_joesy/finbert-financial-sentiment-analysis-with-bert-acf695b64ac6

import torch
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# tested in transformers==4.18.0
import transformers
transformers.__version__

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
results = nlp(['growth is strong and we have plenty of liquidity.',
               'there is a shortage of capital, and we need extra financing.',
               'formulation patents might protect Vasotec to a limited extent.'])

print(results)

"""#FINBERT ESG"""

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg', num_labels=4)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg')

nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
results = nlp(['Managing and working to mitigate the impact our operations have on the environment is a core element of our business.',
               'Rhonda has been volunteering for several years for a variety of charitable community programs.',
               'Cabot\'s annual statements are audited annually by an independent registered public accounting firm.',
               'As of December 31, 2012, the 2011 Term Loan had a principal balance of $492.5 million.'])

print(results)

"""#FINBERT Classification"""

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-fls', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-fls')

nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
results = nlp(['we expect the age of our fleet to enhance availability and reliability due to reduced downtime for repairs.',
               'on an equivalent unit of production basis, general and administrative expenses declined 24 percent from 1994 to $.67 per boe.',
               'we will continue to assess the need for a valuation allowance against deferred tax assets considering all available evidence obtained in'])

print(results)


def _predict_labels(texts, text_tokenizer, classifier, id_to_label):
    """Classify every text and return the predicted label names in order.

    Each text and its prediction are printed as they are produced.

    Args:
        texts: iterable of strings to classify.
        text_tokenizer: transformers tokenizer matching *classifier*.
        classifier: sequence-classification model returning logits first.
        id_to_label: mapping from output index to label name.
    """
    predictions = []
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        for text in texts:
            # truncation=True keeps texts longer than the model's maximum
            # input length from raising inside the forward pass.
            inputs = text_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
            logits = classifier(**inputs)[0]
            val = id_to_label[int(np.argmax(logits.numpy()))]
            print(text, '---->', val)
            print('#######################################################')
            predictions.append(val)
    return predictions


X = df['Review Text'].to_list()
y = df['sentiment'].to_list()

finbert_whole = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
tokenizer_whole = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

labels = {0: 'neutral', 1: 'positive', 2: 'negative'}

sent_val = _predict_labels(X, tokenizer_whole, finbert_whole, labels)
print(accuracy_score(y, sent_val))

"""#Using DISTILBERT"""

# NOTE(review): this is a base checkpoint with no sentiment fine-tuning, so
# its predictions are only an untrained baseline - confirm that is intended.
# num_labels=3 matches the three entries of `labels`; with the default
# two-output head the 'negative' label could never be predicted.
tokenizer_distilbert = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model_distilbert = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

labels = {0: 'neutral', 1: 'positive', 2: 'negative'}

sent_val_bert = _predict_labels(X, tokenizer_distilbert, model_distilbert, labels)
# Score this model's own predictions; the notebook re-scored the FinBERT list.
print(accuracy_score(y, sent_val_bert))

"""#Bert"""

# bert-base-uncased is a BERT checkpoint, so it is loaded with the Bert
# classes rather than the DistilBert ones. Same untrained-baseline caveat and
# num_labels=3 reasoning as in the DistilBERT section.
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-uncased")
model_bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

labels = {0: 'neutral', 1: 'positive', 2: 'negative'}

sent_val_bert1 = _predict_labels(X, tokenizer_bert, model_bert, labels)
# Score this model's own predictions; the notebook re-scored the FinBERT list.
print(accuracy_score(y, sent_val_bert1))