"""Streamlit app exploring the ``merve/poetry`` dataset.

The script loads the dataset, cleans the ``content`` column (stopword
removal, special-character stripping, lowercasing), and renders count
plots, per-age word clouds and a bar chart of the most frequent words.
"""
import os
import re
import string

import datasets
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import streamlit as st
from datasets import load_dataset
from nltk.corpus import stopwords
from PIL import Image
from sklearn.preprocessing import LabelEncoder
from wordcloud import STOPWORDS, WordCloud

sns.set_palette("RdBu")

# loading dataset
dataset = load_dataset("merve/poetry", streaming=True)
df = pd.DataFrame.from_dict(dataset["train"])

# Directory used to resolve files shipped next to the script.
d = os.path.dirname(__file__) if "__file__" in locals() else os.getcwd()

nltk.download("stopwords")
stop = stopwords.words('english')
# Set copy of the stopword list so each membership test is O(1).
_STOP_SET = frozenset(stop)

# Raw string: "\d" and "\s" are regex classes, not Python string escapes.
_SPECIAL_CHARS = re.compile(r"[^a-zA-Z\d\s]")


def standardize(text, remove_digits=True):
    """Return *text* lowercased with special characters removed.

    Every character that is not an ASCII letter, a digit or whitespace is
    dropped.

    Args:
        text: The string to clean.
        remove_digits: Currently ignored; digits are always kept. The
            parameter is retained so existing callers keep working.

    Returns:
        The cleaned, lowercased string.
    """
    return _SPECIAL_CHARS.sub("", text).lower()


def remove_stopwords(text):
    """Return *text* without English stopwords.

    Tokens are compared case-insensitively and with surrounding
    punctuation stripped, so "The" and "and," are recognised as stopwords
    while inner apostrophes ("don't") are preserved for the lookup.
    """
    kept = (
        token
        for token in text.split()
        if token.lower().strip(string.punctuation) not in _STOP_SET
    )
    return " ".join(kept)


def count_words(texts):
    """Return word frequencies, most frequent first, for a Series of texts."""
    # explode() yields one row per word; value_counts() drops the NaN rows
    # produced by texts that contain no words.
    return texts.str.split().explode().value_counts()


def show_figure(fig):
    """Render *fig* in the app and close it.

    Closing keeps pyplot from accumulating figures across script reruns.
    """
    st.pyplot(fig)
    plt.close(fig)


def show_count_plot(column, caption, rotation=0, **catplot_kwargs):
    """Write *caption* and render a count plot of ``df[column]``."""
    st.write(caption)
    grid = sns.catplot(x=column, data=df, kind="count", **catplot_kwargs)
    plt.xticks(rotation=rotation)
    show_figure(grid.figure)


st.write("Poetry dataset, content column cleaned from special characters and lowercased")

# Word counts that still include stopwords must be taken before the
# stopwords are removed from the content column.
words = count_words(df["content"].apply(standardize))

df["content"] = df["content"].apply(remove_stopwords).apply(standardize)
st.dataframe(df)

st.subheader("Visualization on dataset statistics")
show_count_plot("type", "Number of poems written in each type")
show_count_plot("age", "Number of poems for each age")
show_count_plot("author", "Number of poems for each author", rotation=90, aspect=4)

# distributions of poem types according to ages and authors
st.write("Distributions of poem types according to ages and authors, seems that folks in renaissance loved the love themed poems and nature themed poems became popular later")
le = LabelEncoder()
# Authors are encoded as integers so they can be placed on a numeric axis.
df["author"] = le.fit_transform(df["author"])
grid = sns.catplot(x="age", y="author", hue="type", data=df)
show_figure(grid.figure)

# most appearing words other than stop words, per age
renaissance = count_words(df.loc[df["age"] == "Renaissance", "content"])
modern = count_words(df.loc[df["age"] == "Modern", "content"])

st.subheader("Visualizing content")
# NOTE(review): the mask is loaded but never passed to WordCloud, so it has
# no effect on the rendered clouds -- confirm whether mask=mask was intended.
with Image.open(os.path.join(d, "poet.png")) as mask_image:
    mask = np.array(mask_image)


def word_cloud(content, title):
    """Render a word cloud sized by word frequency.

    Args:
        content: Series mapping each word (index) to its count (values),
            as returned by ``value_counts()``.
        title: Title drawn above the cloud.
    """
    # generate_from_frequencies() does not apply the stopword list itself,
    # so filter the wordcloud STOPWORDS here.
    frequencies = {
        word: int(count)
        for word, count in content.items()
        if word not in STOPWORDS
    }
    if not frequencies:
        # WordCloud raises on empty input; report instead of crashing.
        st.write(f"No words to display for: {title}")
        return

    wc = WordCloud(
        background_color="white",
        max_words=200,
        contour_width=3,
        stopwords=STOPWORDS,
        max_font_size=50,
    )
    wc.generate_from_frequencies(frequencies)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_title(title, fontsize=20)
    ax.imshow(
        wc.recolor(colormap='magma', random_state=42),
        cmap=plt.cm.gray,
        interpolation="bilinear",
        alpha=0.98,
    )
    ax.axis('off')
    show_figure(fig)


st.subheader("Most appearing words excluding stopwords in poems according to ages")
word_cloud(modern, "Word Cloud of Modern Poems")
word_cloud(renaissance, "Word Cloud Renaissance Poems")

# most appearing words including stopwords
st.write("Most appearing words including stopwords")
st.bar_chart(words.head(50))