# nlp-dataset / app.py
# Zaman, Shaheer Shaheer
# first commit
# 414b5fe
import streamlit as st
import pandas as pd
import re
import nltk
from PIL import Image
import os
import numpy as np
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
import datasets
from datasets import load_dataset
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import LabelEncoder
# Global seaborn palette for every chart drawn below.
sns.set_palette('RdBu')

# Load the poetry dataset and materialise its train split as a DataFrame.
dataset = load_dataset('merve/poetry', streaming=True)
df = pd.DataFrame.from_dict(dataset['train'])

# Directory of this script, falling back to the working directory when
# ``__file__`` is not defined.
d = os.path.dirname(__file__) if '__file__' in locals() else os.getcwd()

nltk.download('stopwords')
# Stored as a set: the stop words are only used for membership tests, once per
# word of every poem, so O(1) lookups beat scanning a list each time.
stop = set(stopwords.words('english'))
def standardize(text, remove_digits=True):
    """Strip special characters from ``text`` and lower-case it.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed; the remainder is returned in lower case.

    Args:
        text: The string to clean.
        remove_digits: Accepted for backward compatibility but currently
            ignored -- digits are always kept.

    Returns:
        The cleaned, lower-cased string.
    """
    # Raw string: ``\d`` and ``\s`` are regex classes, not string escapes.
    return re.sub(r'[^a-zA-Z\d\s]', '', text).lower()
# Turn off the 'deprecation.showPyplotGlobalUse' notice; the script calls
# st.pyplot() without passing a figure.
st.set_option('deprecation.showPyplotGlobalUse', False)
st.write('Poetry dataset, content character cleaned from special characters and lower cased')

# Drop stop words first. The comparison is done on the lower-cased word because
# the text has not been lower-cased yet; otherwise capitalised stop words such
# as "The" or "And" would slip through.
df.content = df.content.apply(
    lambda text: ' '.join(
        word for word in text.split() if word.lower() not in stop
    )
)
# Then strip special characters and lower-case what is left.
df.content = df.content.apply(standardize)
st.dataframe(df)
st.subheader('Visualization on dataset statistics')

# One count plot per categorical column:
# (caption, column, x tick rotation, extra catplot keyword arguments).
count_plots = (
    ('Number of poems written in each type', 'type', 0, {}),
    ('Number of poems for each age', 'age', 0, {}),
    ('Number of poems for each author', 'author', 90, {'aspect': 4}),
)
for caption, column, rotation, extra in count_plots:
    st.write(caption)
    sns.catplot(x=column, data=df, kind='count', **extra)
    plt.xticks(rotation=rotation)
    # No figure is passed, so Streamlit renders the current pyplot figure.
    st.pyplot()

st.write('Distributions of poem types according to ages and authors, seems that folks in renaissance loved the love themed poems and nature themed poems became popular later')
# Replace author names with integer codes so they can be used on the y axis.
# NOTE: this overwrites the ``author`` column of ``df`` in place.
le = LabelEncoder()
df.author = le.fit_transform(df.author)
sns.catplot(x='age', y='author', hue='type', data=df)
st.pyplot()
def _word_counts(texts):
    """Return how often each whitespace-separated word occurs in ``texts``.

    Args:
        texts: A pandas Series of strings.

    Returns:
        A Series indexed by word, sorted by descending count.
    """
    return texts.str.split(expand=True).unstack().value_counts()


# Compare ages case-insensitively: the two filters below were written with
# different casing, so a case-sensitive match could leave one of them empty.
age_lower = df.age.str.lower()

words = _word_counts(df.content)
renaissance = _word_counts(df.content.loc[age_lower == 'renaissance'])
modern = _word_counts(df.content.loc[age_lower == 'modern'])

st.subheader('Visualizing content')
# Load the image next to the script as an array; the context manager closes the
# underlying file once the pixel data has been copied.
with Image.open(os.path.join(d, 'poet.png')) as mask_image:
    mask = np.array(mask_image)
import matplotlib.pyplot as plt
def word_cloud(content, title):
    """Render a word cloud of ``content`` in the Streamlit app.

    Args:
        content: Word counts as produced by ``Series.value_counts()``: the
            index holds the words, the values hold how often each occurs.
        title: Heading drawn above the cloud.
    """
    # Size the words by their real counts. Joining the (unique) index values
    # and re-tokenising them would give every word the same frequency.
    # Stop words and purely numeric tokens are skipped here because
    # ``generate_from_frequencies`` does not filter them itself.
    frequencies = {
        str(word): int(count)
        for word, count in content.items()
        if str(word) not in STOPWORDS and not str(word).isdigit()
    }
    if not frequencies:
        # WordCloud raises on empty input; report it instead of crashing.
        st.warning(f'No words available for: {title}')
        return

    wc = WordCloud(
        background_color='white',
        max_words=200,
        contour_width=3,
        stopwords=STOPWORDS,
        max_font_size=50,
    )
    wc.generate_from_frequencies(frequencies)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_title(title, fontsize=20)
    ax.imshow(
        wc.recolor(colormap='magma', random_state=42),
        cmap=plt.cm.gray,
        interpolation='bilinear',
        alpha=0.98,
    )
    ax.axis('off')
    # Hand the figure to Streamlit explicitly rather than relying on pyplot's
    # global "current figure", then close it so figures do not pile up.
    st.pyplot(fig)
    plt.close(fig)
st.subheader('Most appearing words excluding stopwords in poems according to ages')
# One cloud per age, each titled after the data it actually shows (previously
# the modern counts were drawn under a "Renaissance" title).
for age_counts, cloud_title in (
    (renaissance, 'word cloud Renaissance poems'),
    (modern, 'word cloud Modern poems'),
):
    word_cloud(age_counts, cloud_title)

st.write('Most appearing words including stopwords')
# ``words`` is sorted by descending count, so these are the 50 most frequent.
st.bar_chart(words.head(50))