# Smart Farming Sentiment Analysis — Streamlit app (Hugging Face Space)
import pandas as pd | |
import streamlit as st | |
import re | |
import matplotlib.pyplot as plt | |
from textblob import TextBlob | |
import nltk | |
from nltk.corpus import stopwords | |
from collections import Counter | |
from wordcloud import WordCloud | |
nltk.download('stopwords') | |
#Huggingface | |
from datasets import load_dataset | |
from huggingface_hub import login | |
import os | |
# --- Streamlit page setup and data loading ---
st.set_page_config(
    page_title="Smart Farming Sentiment Analysis",
    page_icon="🌱",
    layout="wide"
)
# Authenticate to the Hugging Face Hub. Requires the 'hf_token' environment
# variable to be set (raises KeyError otherwise).
login(token = os.environ['hf_token'])
# Pull the sentence CSV from the Hub dataset and convert it to pandas.
dataset = load_dataset("irfantea/collections", data_files='smartfarmingsentences.csv', split='train')
df = dataset.to_pandas()
# Show the raw data before any cleaning.
st.dataframe(df)
def set_cleantext(dataframe, min_words=10):
    """Clean the 'sentences' column of a DataFrame.

    Steps: drop sentences with fewer than `min_words` words, strip web
    addresses, collapse newlines and trim surrounding whitespace, then drop
    duplicate sentences.

    Parameters
    ----------
    dataframe : DataFrame with a 'sentences' column of strings.
    min_words : minimum word count a sentence must have to be kept
        (default 10, the original hard-coded threshold).

    Returns the cleaned DataFrame with a reset index.
    """
    # Keep only sentences with at least `min_words` whitespace-separated words.
    dataframe = dataframe[dataframe['sentences'].apply(lambda x: len(x.split()) >= min_words)]
    # Delete web addresses. The pattern is defined here (rather than relying
    # on a module-level global declared after this function) so the function
    # is self-contained.
    dataframe.loc[:, 'sentences'] = dataframe['sentences'].str.replace(r'https?://\S+', '', regex=True)
    # Normalize whitespace: newlines become spaces, then trim the ends.
    dataframe.loc[:, 'sentences'] = dataframe['sentences'].str.replace('\n', ' ')
    dataframe.loc[:, 'sentences'] = dataframe['sentences'].str.strip()
    # Delete duplicate sentences.
    dataframe = dataframe.drop_duplicates(subset=['sentences'])
    dataframe.reset_index(drop=True, inplace=True)
    return dataframe
def set_textblob(dataframe):
    """Attach TextBlob sentiment scores to a row.

    Intended for DataFrame.apply(axis=1): `dataframe` is a single row
    (Series) holding a 'sentences' field. Adds 'polarity' and
    'subjectivity' entries and returns the row.
    """
    # Score the sentence once and unpack both sentiment components.
    sentiment = TextBlob(dataframe['sentences']).sentiment
    dataframe['polarity'] = sentiment.polarity
    dataframe['subjectivity'] = sentiment.subjectivity
    return dataframe
def delete_stopwords(dataframe):
    """Remove English stopwords from every entry in the 'sentences' column.

    Mutates the given DataFrame in place and returns it. Assumes the text
    has already been lowercased (NLTK stopwords are lowercase).
    """
    # Use a set: O(1) membership per word instead of the original O(n)
    # list scan. Same elements, so the result is identical.
    stop = set(stopwords.words('english'))
    dataframe['sentences'] = dataframe['sentences'].apply(
        lambda x: ' '.join(word for word in x.split() if word not in stop)
    )
    return dataframe
#df = pd.read_csv("smartfarmingsentences.csv")
# Sentence count before cleaning, for the summary metrics below.
num_ori = df.shape[0]
st.title("Smart Farming Sentiment Analysis")
st.subheader("Sentiment Analysis of Smart Farming Knowledge Graph")
# Regex matching http(s) URLs that are stripped during cleaning.
url_pattern = re.compile(r'https?://\S+')
df = set_cleantext(df)
num_clean = df.shape[0]
# Three-column summary: original vs. cleaned vs. removed sentence counts.
kolom_num1, kolom_num2, kolom_num3 = st.columns(3)
with kolom_num1:
    st.text("Original Sentences: " + str(num_ori))
with kolom_num2:
    st.text("Sentences Count: " + str(num_clean))
with kolom_num3:
    st.text("Deleted Sentences: " + str(num_ori - num_clean))
#Sentiment Analysis
# Row-wise TextBlob scoring: adds 'polarity' and 'subjectivity' columns.
df = df.apply(set_textblob, axis=1)
st.dataframe(df, use_container_width=True)
#Separate polarity by Positive, Neutral, Negative
df_pos = df[df['polarity'] > 0]
df_neu = df[df['polarity'] == 0]
df_neg = df[df['polarity'] < 0]
#Separate subjectivity by Objective, Subjective
# Scores <= 0.3 are treated as objective; the 0.3 cut-off is a project
# choice, not a TextBlob convention.
df_obj = df[df['subjectivity'] <= 0.3]
df_sub = df[df['subjectivity'] > 0.3]
# Bar chart of sentiment class counts (distinct axis names instead of
# reusing a single `ax`; neither axis is referenced after this section).
figp, ax_polarity = plt.subplots()
ax_polarity.bar(
    ['Positive', 'Neutral', 'Negative'],
    [len(df_pos), len(df_neu), len(df_neg)],
    color=['green', 'gray', 'red'],
)
ax_polarity.set_xlabel('Sentiment')
ax_polarity.set_ylabel('Count')
ax_polarity.set_title('Sentiment Analysis')
# Bar chart of objective vs. subjective counts.
figs, ax_subjectivity = plt.subplots()
ax_subjectivity.bar(
    ['Objective', 'Subjective'],
    [len(df_obj), len(df_sub)],
    color=['green', 'red'],
)
ax_subjectivity.set_xlabel('Subjectivity')
ax_subjectivity.set_ylabel('Count')
ax_subjectivity.set_title('Subjectivity Analysis')
# Side-by-side display: sentiment counts + chart, subjectivity counts + chart.
kolom_polar, kolom_subject = st.columns(2)
with kolom_polar:
    #Show Sentiment Analysis
    st.subheader("Sentiment Analysis")
    st.text("Positive: " + str(df_pos.shape[0]))
    st.text("Neutral: " + str(df_neu.shape[0]))
    st.text("Negative: " + str(df_neg.shape[0]))
    # Create a bar chart
    st.pyplot(figp)
with kolom_subject:
    #Show Subjectivity Analysis
    st.subheader("Subjectivity Analysis")
    st.text("Objective: " + str(df_obj.shape[0]))
    st.text("Subjective: " + str(df_sub.shape[0]))
    st.text("---")
    # Create a bar chart
    st.pyplot(figs)
# --- Text normalization for word-frequency analysis ---
#Make Lowercase (so counting and stopword matching are case-insensitive)
df['sentences'] = df['sentences'].str.lower()
#remove punctuation . , ! ? : ; " ' ( ) [ ] { } < > / \ | ` ~ @ # $ % ^ & * - _ = +
# BUG FIX: the original call omitted regex=True; pandas >= 2.0 defaults to
# regex=False and treated the character class as a literal string, so no
# punctuation was ever removed.
df['sentences'] = df['sentences'].str.replace(r'[.,!?;:"\'()\[\]{}<>\\/|`~@#$%^&*\-_+=]', '', regex=True)
#Stopwords
df = delete_stopwords(df)
#Delete one or two character tokens (mostly cleaning artifacts)
df['sentences'] = df['sentences'].apply(lambda x: ' '.join(word for word in x.split() if len(word) > 2))
#Remove custom boilerplate words from scraped reference lists
custom_words = ["s", "al", 'view', 'article', 'google', 'scholar', "scopus", "crossref"]
df['sentences'] = df['sentences'].apply(lambda x: ' '.join(word for word in x.split() if word not in custom_words))
#Make Wordcloud from all cleaned sentences joined into one string
all_words = ' '.join(df['sentences'])
wordcloud = WordCloud(width=1024, height=1024, random_state=21, max_font_size=110).generate(all_words)
st.subheader("Wordcloud")
# BUG FIX: the original used len(all_words), which counts characters;
# count whitespace-separated tokens instead so the label is accurate.
st.text("Total Words: " + str(len(all_words.split())))
# Render on an explicit figure; passing the pyplot module to st.pyplot is
# deprecated and relies on global state.
fig_wc, ax_wc = plt.subplots(figsize=(10, 7))
ax_wc.imshow(wordcloud, interpolation="bilinear")
ax_wc.axis('off')
st.pyplot(fig_wc)
#Show the 100 most frequent words (the original comment said 10)
word_freq = Counter(all_words.split()).most_common(100)
df_word_freq = pd.DataFrame(word_freq, columns=['Word', 'Frequency'])
st.subheader("Top 100 Words")
# most_common(100) already limits the frame to 100 rows; no .head() needed.
st.dataframe(df_word_freq, use_container_width=True)