# app.py — Smart Farming Sentiment Analysis (Streamlit app).
# NOTE(review): removed non-Python residue from the Hugging Face file-viewer
# page (author avatar line, commit message "Update app.py", hash 35651d8,
# "raw / history blame", size 5.03 kB) that had been pasted above the source.
import pandas as pd
import streamlit as st
import re
import matplotlib.pyplot as plt
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from collections import Counter
from wordcloud import WordCloud
# Download the NLTK stopword corpus used later by delete_stopwords().
nltk.download('stopwords')
#Huggingface
from datasets import load_dataset
from huggingface_hub import login
import os

# Page config must be the first Streamlit call in the script.
st.set_page_config(
    page_title="Smart Farming Sentiment Analysis",
    page_icon="🌱",
    layout="wide"
)
# Authenticate against the Hugging Face Hub; raises KeyError if the
# 'hf_token' environment variable / Space secret is not set.
login(token = os.environ['hf_token'])
# Load the raw sentence corpus from the Hub and show it as-is.
dataset = load_dataset("irfantea/collections", data_files='smartfarmingsentences.csv', split='train')
df = dataset.to_pandas()
st.dataframe(df)
def set_cleantext(dataframe, url_pattern=re.compile(r'https?://\S+')):
    """Clean the 'sentences' column of *dataframe*.

    Keeps only sentences of at least 10 words, strips URLs, replaces
    newlines with spaces, trims surrounding whitespace, and drops
    duplicate sentences.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a string column named 'sentences'.
    url_pattern : compiled regex, optional
        Pattern removed from every sentence. Defaults to a simple
        http(s) URL matcher. (Previously this was read from a module
        global defined *after* this function — a NameError if the
        function was called before that line; the default parameter
        removes that ordering dependency while staying call-compatible.)

    Returns
    -------
    pandas.DataFrame with a fresh 0..n-1 index.
    """
    # Keep sentences with at least 10 words. .copy() makes the result an
    # independent frame so the .loc writes below cannot trigger
    # SettingWithCopyWarning or silently write to a view.
    dataframe = dataframe[dataframe['sentences'].apply(lambda x: len(x.split()) >= 10)].copy()
    # Delete web addresses
    dataframe.loc[:, 'sentences'] = dataframe['sentences'].str.replace(url_pattern, '', regex=True)
    # Normalize whitespace: newlines -> spaces, then trim the ends.
    dataframe.loc[:, 'sentences'] = dataframe['sentences'].str.replace('\n', ' ')
    dataframe.loc[:, 'sentences'] = dataframe['sentences'].str.strip()
    # Delete duplicate sentences (keeps the first occurrence).
    dataframe = dataframe.drop_duplicates(subset=['sentences'])
    dataframe.reset_index(drop=True, inplace=True)
    return dataframe
def set_textblob(dataframe):
    """Attach TextBlob sentiment scores to a single row.

    Designed for ``df.apply(set_textblob, axis=1)``: *dataframe* is one
    row (a Series) carrying a 'sentences' string. Writes two new fields,
    'polarity' and 'subjectivity', taken from TextBlob's sentiment, and
    returns the modified row.
    """
    sentiment = TextBlob(dataframe['sentences']).sentiment
    dataframe['polarity'] = sentiment.polarity
    dataframe['subjectivity'] = sentiment.subjectivity
    return dataframe
def delete_stopwords(dataframe):
    """Strip English stopwords from every entry of the 'sentences' column.

    Mutates *dataframe* in place and also returns it. Words are compared
    case-sensitively against NLTK's English stopword list, so the column
    should already be lower-cased by the caller.
    """
    # Build the stopword set once: set membership is O(1) per word,
    # where the original list lookup scanned ~180 entries per word.
    stop = set(stopwords.words('english'))
    dataframe['sentences'] = dataframe['sentences'].apply(
        lambda x: ' '.join(word for word in x.split() if word not in stop)
    )
    return dataframe
#df = pd.read_csv("smartfarmingsentences.csv")
# Row count before cleaning, so the deleted-row count can be reported below.
num_ori = df.shape[0]
st.title("Smart Farming Sentiment Analysis")
st.subheader("Sentiment Analysis of Smart Farming Knowledge Graph")
# Matches http(s) URLs; read as a module-level global by set_cleantext().
url_pattern = re.compile(r'https?://\S+')
df = set_cleantext(df)
num_clean = df.shape[0]
# Three side-by-side metrics: original, kept, and removed sentence counts.
kolom_num1, kolom_num2, kolom_num3 = st.columns(3)
with kolom_num1:
    st.text("Original Sentences: " + str(num_ori))
with kolom_num2:
    st.text("Sentences Count: " + str(num_clean))
with kolom_num3:
    st.text("Deleted Sentences: " + str(num_ori - num_clean))
#Sentiment Analysis
# Row-wise apply: set_textblob receives each row and adds the
# 'polarity' and 'subjectivity' columns.
df = df.apply(set_textblob, axis=1)
st.dataframe(df, use_container_width=True)
#Separate polarity by Positive, Neutral, Negative
df_pos = df[df['polarity'] > 0]
df_neu = df[df['polarity'] == 0]
df_neg = df[df['polarity'] < 0]
#Separate subjectivity by Objective, Subjective (threshold 0.3)
df_obj = df[df['subjectivity'] <= 0.3]
df_sub = df[df['subjectivity'] > 0.3]
# Bar chart of sentiment counts (rendered later in the left column).
figp, ax = plt.subplots()
bars = ax.bar(['Positive', 'Neutral', 'Negative'], [len(df_pos), len(df_neu), len(df_neg)], color=['green', 'gray', 'red'])
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
ax.set_title('Sentiment Analysis')
# Bar chart of subjectivity counts; note `ax` is rebound to the new figure.
figs, ax = plt.subplots()
bars = ax.bar(['Objective', 'Subjective'], [len(df_obj), len(df_sub)], color=['green', 'red'])
ax.set_xlabel('Subjectivity')
ax.set_ylabel('Count')
ax.set_title('Subjectivity Analysis')
# Two-column layout: polarity results on the left, subjectivity on the right.
kolom_polar, kolom_subject = st.columns(2)
with kolom_polar:
    #Show Sentiment Analysis
    st.subheader("Sentiment Analysis")
    st.text("Positive: " + str(df_pos.shape[0]))
    st.text("Neutral: " + str(df_neu.shape[0]))
    st.text("Negative: " + str(df_neg.shape[0]))
    # Create a bar chart
    st.pyplot(figp)
with kolom_subject:
    #Show Subjectivity Analysis
    st.subheader("Subjectivity Analysis")
    st.text("Objective: " + str(df_obj.shape[0]))
    st.text("Subjective: " + str(df_sub.shape[0]))
    st.text("---")
    # Create a bar chart
    st.pyplot(figs)
# --- Normalize text for the word-frequency analysis below ---
#Make Lowercase
df['sentences'] = df['sentences'].str.lower()
# Remove punctuation . , ! ? : ; " ' ( ) [ ] { } < > / \ | ` ~ @ # $ % ^ & * - _ = +
# BUG FIX: pandas >= 2.0 defaults str.replace to regex=False, so the
# character class was matched as a literal string and no punctuation was
# ever removed. regex=True restores the intended behavior.
df['sentences'] = df['sentences'].str.replace(r'[.,!?;:"\'()\[\]{}<>\\/|`~@#$%^&*\-_+=]', '', regex=True)
#Stopwords
df = delete_stopwords(df)
# Drop one- and two-character words left over after punctuation removal.
df['sentences'] = df['sentences'].apply(lambda x: ' '.join([word for word in x.split() if len(word) > 2]))
# Remove citation/boilerplate tokens specific to the scraped corpus.
custom_words = ["s", "al", 'view', 'article', 'google', 'scholar', "scopus", "crossref"]
df['sentences'] = df['sentences'].apply(lambda x: ' '.join([word for word in x.split() if word not in (custom_words)]))
#Make Wordcloud
# Concatenate every cleaned sentence into one string for WordCloud/Counter.
all_words = ' '.join([text for text in df['sentences']])
wordcloud = WordCloud(width=1024, height=1024, random_state=21, max_font_size=110).generate(all_words)
st.subheader("Wordcloud")
# BUG FIX: len(all_words) was the *character* count; split() gives the
# word count the "Total Words" label promises.
st.text("Total Words: " + str(len(all_words.split())))
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
st.pyplot(plt)
#Show Top 100 Words
# Counter.most_common(100) already limits the result, so .head(100) below
# is redundant but harmless.
word_freq = Counter(all_words.split()).most_common(100)
df_word_freq = pd.DataFrame(word_freq, columns=['Word', 'Frequency'])
st.subheader("Top 100 Words")
st.dataframe(df_word_freq.head(100), use_container_width=True)