Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Importing the required packages
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
import seaborn as sns
|
6 |
+
import nltk
|
7 |
+
# Set the style sheet for plots
|
8 |
+
# Applied once, up front, so that every figure created later in this script
# is drawn with the 'ggplot' style.
plt.style.use('ggplot')
|
9 |
+
|
10 |
+
# Load the hotel-review dataset straight from the Hugging Face Hub.
# NOTE(review): reading an "hf://" path presumably requires the
# huggingface_hub fsspec integration to be installed -- confirm it is listed
# in the Space's requirements.
df = pd.read_csv("hf://datasets/patrickbdevaney/tripadvisor_hotel_reviews/data/tripadvisor_hotel_reviews.csv")

# Turn the row index into an explicit 'Id' column so that each review has a
# key the per-review sentiment scores can be joined back on.
df = df.reset_index().rename(columns={'index': 'Id'})

# A bare `df.head()` only renders inside a notebook; in a script it is a
# no-op (and it was written twice). Print the preview once instead.
print(df.head())

# Check the shape of the DataFrame: (number of rows, number of columns).
print(df.shape)
|
21 |
+
|
22 |
+
# Count the number of reviews for each rating (ordered by rating value) and
# plot the counts as a bar chart.
rating_counts = df['Rating'].value_counts().sort_index()
ax = rating_counts.plot(
    kind='bar',
    title='Count of Reviews by Stars',
    figsize=(10, 5),
)
ax.set_xlabel('Review Stars')
# Bar height is the number of reviews that received each rating, so the axis
# is labelled as a review count rather than a star count.
ax.set_ylabel('No. of Reviews')
plt.show()
|
30 |
+
|
31 |
+
# Pick a single review to walk through the NLP steps by hand.
# NOTE(review): the variable is called rev250 but the row selected is the one
# labelled 200 -- confirm which of the two was intended.
rev250 = df.loc[200, 'Review']
print(rev250)
|
34 |
+
|
35 |
+
# The tokenizer, POS tagger and named-entity chunker all need NLTK data
# packages that are not present on a fresh environment; without them the
# calls below raise LookupError. Resource identifiers differ between NLTK
# releases, so both the older and the newer names are requested; an
# identifier the installed release does not know is simply not downloaded.
# NOTE(review): confirm the identifier list against the NLTK version pinned
# for this Space.
for _resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
):
    nltk.download(_resource, quiet=True)

# Preprocess the review text
tokens = nltk.word_tokenize(rev250)      # split the text into tokens
tagged = nltk.pos_tag(tokens)            # pair every token with a POS tag
entities = nltk.chunk.ne_chunk(tagged)   # group tagged tokens into entity chunks

# Print the resulting chunk tree.
entities.pprint()
|
41 |
+
|
42 |
+
# Perform sentiment analysis using VADER
from nltk.sentiment import SentimentIntensityAnalyzer

# The analyzer needs the 'vader_lexicon' data package, which is not present
# on a fresh environment; fetch it first so construction does not fail with
# LookupError.
nltk.download('vader_lexicon', quiet=True)
sia = SentimentIntensityAnalyzer()
|
45 |
+
|
46 |
+
# Score a positive sentence, a negative sentence and then the selected
# review, printing the VADER score dict for each in that order.
# Outputs recorded in the original script's comments:
#>> 'I am so happy!'      {'neg': 0.0, 'neu': 0.318, 'pos': 0.682, 'compound': 0.6468}
#>> 'I hate sweet aroma!' {'neg': 0.499, 'neu': 0.125, 'pos': 0.375, 'compound': -0.2481}
#>> rev250                {'neg': 0.1, 'neu': 0.612, 'pos': 0.288, 'compound': 0.9556}
for sample_text in ('I am so happy!', 'I hate sweet aroma!', rev250):
    print(sia.polarity_scores(sample_text))
|
57 |
+
|
58 |
+
# Perform sentiment analysis on the entire dataset
from tqdm import tqdm

# Map every review's Id to its VADER score dict.
# Only the 'Id' and 'Review' columns are needed, so they are iterated
# directly instead of going through iterrows(), which builds a full Series
# object for each row. tqdm is given `total` because zip() has no length.
res = {
    review_id: sia.polarity_scores(review_text)
    for review_id, review_text in tqdm(zip(df['Id'], df['Review']), total=len(df))
}
|
67 |
+
|
68 |
+
# Create a DataFrame from the sentiment scores and merge it with the original
# DataFrame.
# `res` maps Id -> score dict, so orient='index' yields one row per Id with
# one column per score, without building a wide frame and transposing it.
vaders = pd.DataFrame.from_dict(res, orient='index')
vaders = vaders.rename_axis('Id').reset_index()
# Name the join key explicitly rather than relying on whichever column names
# the two frames happen to share.
vaders = vaders.merge(df, on='Id', how='left')

# A bare `vaders.head()` is a no-op outside a notebook; print the preview.
print(vaders.head())
|
74 |
+
|
75 |
+
# Visualize the sentiment scores: three side-by-side bar plots, one per
# VADER component, each plotted against the review rating.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))

# (score column in `vaders`, subplot title), in left-to-right panel order.
panels = (
    ('pos', 'Positive'),
    ('neu', 'Neutral'),
    ('neg', 'Negative'),
)
for panel_ax, (score_column, panel_title) in zip(axs, panels):
    sns.barplot(data=vaders, x='Rating', y=score_column, ax=panel_ax)
    panel_ax.set_title(panel_title)

# Add spacing between the subplots so labels and titles do not overlap.
plt.tight_layout()
plt.show()
|