# utilities
import re
import pickle
import numpy as np
import pandas as pd
# plotting (imported but not used in this app)
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# nltk
from nltk.stem import WordNetLemmatizer
# sklearn (imported but not used in this app)
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
# Load the Sentiment140 training CSV (the file has no header row).
DATASET_COLUMNS = ["sentiment", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
dataset = pd.read_csv('training.1600000.processed.noemoticon.csv',
                      encoding=DATASET_ENCODING, names=DATASET_COLUMNS)
# Keep only the columns we need.
dataset = dataset[['sentiment', 'text']]
# Map the positive label from 4 to 1, so sentiment is 0 (negative) or 1 (positive).
dataset['sentiment'] = dataset['sentiment'].replace(4, 1)
# Store the columns as plain Python lists.
text, sentiment = list(dataset['text']), list(dataset['sentiment'])
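# NOTE: preprocess() below references an `emojis` mapping that this script
# never defines. The emoticon-to-meaning dictionary here is an assumed,
# minimal stand-in so the code runs; the original project likely used a
# larger mapping.
emojis = {':)': 'smile', ':-)': 'smile', ':D': 'laugh', ':P': 'playful',
          ';)': 'wink', ':(': 'sad', ':-(': 'sad', ":'(": 'cry',
          ':O': 'surprised', '<3': 'love'}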
def preprocess(textdata):
    processedText = []
    # Create the lemmatizer.
    wordLemm = WordNetLemmatizer()
    # Define regex patterns.
    urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    userPattern = r"@[^\s]+"
    alphaPattern = r"[^a-zA-Z0-9]"
    sequencePattern = r"(.)\1\1+"
    seqReplacePattern = r"\1\1"
    for tweet in textdata:
        tweet = tweet.lower()
        # Replace all URLs with ' URL'.
        tweet = re.sub(urlPattern, ' URL', tweet)
        # Replace all emoticons with 'EMOJI' plus their meaning.
        for emoji in emojis.keys():
            tweet = tweet.replace(emoji, "EMOJI" + emojis[emoji])
        # Replace @USERNAME mentions with ' USER'.
        tweet = re.sub(userPattern, ' USER', tweet)
        # Replace all non-alphanumeric characters with a space.
        tweet = re.sub(alphaPattern, " ", tweet)
        # Collapse 3 or more consecutive repeats of a character down to 2.
        tweet = re.sub(sequencePattern, seqReplacePattern, tweet)
        tweetwords = ''
        for word in tweet.split():
            # Stopword filtering was disabled; keep words longer than one character.
            if len(word) > 1:
                # Lemmatize the word.
                word = wordLemm.lemmatize(word)
                tweetwords += (word + ' ')
        processedText.append(tweetwords)
    return processedText
import gradio as gr
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Download the required NLTK resources (wordnet and omw-1.4 are needed by
# the WordNetLemmatizer used in preprocess() above).
nltk.download('vader_lexicon')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Load the pre-trained VADER sentiment intensity analyzer.
sia = SentimentIntensityAnalyzer()
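# Illustrative spot check (not part of the original app): with the assumed
# emojis mapping above, the cleaner maps URLs, mentions, emoticons and
# repeated letters as described.
print(preprocess(["Looooove this!!! https://example.com @friend :)"]))
# -> roughly ['loove this URL USER EMOJIsmile ']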
def get_sentiment(tweet):
    # Preprocess the tweet.
    processed_tweet = preprocess([tweet])
    # Get the sentiment score using the VADER sentiment analyzer.
    sentiment_score = sia.polarity_scores(processed_tweet[0])
    # Determine the sentiment label based on the compound score,
    # using VADER's conventional +/-0.05 thresholds.
    compound_score = sentiment_score['compound']
    if compound_score >= 0.05:
        sentiment = 'Positive'
    elif compound_score <= -0.05:
        sentiment = 'Negative'
    else:
        sentiment = 'Neutral'
    return sentiment
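# Quick illustrative examples (not in the original app): VADER should score
# these clearly positive and clearly negative, respectively.
print(get_sentiment('I love this movie! :)'))      # expected: 'Positive'
print(get_sentiment('This weather is terrible.'))  # expected: 'Negative'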
# Create a Gradio interface.
iface = gr.Interface(
    fn=get_sentiment,
    inputs='text',
    outputs='text',
    title='Tweet Sentiment Analyzer',
    description='Enter a tweet with text or emoticon or both, and get the sentiment prediction.',
    # Each example must be its own list, since the interface has one input.
    examples=[['I love this movie!'], ['This weather is terrible.']],
    theme='soft'
)
# Launch the interface.
iface.launch(share=True)