"""Streamlit app that explores the ``merve/poetry`` dataset.

The script loads the dataset, cleans the ``content`` column, and renders:
a table of the cleaned data, a bar chart of the 50 most frequent words,
a categorical plot of poem type by age and author, and a word cloud.
"""
import os
import re

import datasets
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import streamlit as st
from datasets import load_dataset
from nltk.corpus import stopwords
from PIL import Image
from sklearn.preprocessing import LabelEncoder
from wordcloud import STOPWORDS, WordCloud

# Matches every character that is not an ASCII letter, a digit or whitespace.
# Raw string so that ``\d`` / ``\s`` are not treated as (invalid) str escapes.
_SPECIAL_CHARS_RE = re.compile(r"[^a-zA-Z\d\s]")


# standardizing dataset by removing special characters and lowercasing
def standardize(text, remove_digits=True):
    """Return *text* lowercased with special characters removed.

    Args:
        text: The string to clean.
        remove_digits: Accepted for backward compatibility; it is not used,
            and digits are kept by the pattern.

    Returns:
        The cleaned, lowercased string.
    """
    return _SPECIAL_CHARS_RE.sub("", text).lower()


# most appearing words other than stop words
def word_cloud(content, title):
    """Render a word cloud of *content* in the Streamlit app.

    Args:
        content: A pandas Series mapping word (index) to its count (value).
        title: Title drawn above the word cloud.

    Reads the module-level ``mask`` image array to shape the cloud.
    """
    wc = WordCloud(
        background_color="white",
        max_words=200,
        contour_width=3,
        stopwords=STOPWORDS,
        mask=mask,
        max_font_size=50,
    )
    # Feed the real counts so that word size reflects frequency; joining the
    # index into a text would make every word appear exactly once.
    # generate_from_frequencies() does not apply ``stopwords``, so the
    # stopwords are filtered out here.
    frequencies = {
        word: int(count)
        for word, count in content.items()
        if word.lower() not in STOPWORDS
    }
    wc.generate_from_frequencies(frequencies)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_title(title, fontsize=20)
    ax.imshow(
        wc.recolor(colormap="magma", random_state=42),
        cmap=plt.cm.gray,
        interpolation="bilinear",
        alpha=0.98,
    )
    ax.axis("off")
    # Pass the figure explicitly instead of relying on pyplot's global state.
    st.pyplot(fig)
    plt.close(fig)


# loading dataset
dataset = load_dataset("merve/poetry", streaming=True)
df = pd.DataFrame.from_dict(dataset["train"])

d = os.path.dirname(__file__) if "__file__" in locals() else os.getcwd()
nltk.download("stopwords")
stop = stopwords.words("english")
# Stopwords are standardized the same way as the text (e.g. "don't" ->
# "dont") so they still match after cleaning; a frozenset gives O(1) lookups.
stop_set = frozenset(standardize(word) for word in stop)

st.write("Poetry dataset, content column cleaned from special characters and lowercased")
# Standardize first, then drop stopwords: filtering the raw text would miss
# capitalized or punctuated stopwords such as "The" or "and,".
df.content = df.content.apply(standardize)
df.content = df.content.apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_set)
)
st.dataframe(df)

# most appearing words including stopwords
# NOTE(review): NLTK stopwords have already been removed from ``df.content``
# at this point, so the label below does not match the data - confirm intent.
st.write("Most appearing words including stopwords")
words = df.content.str.split(expand=True).unstack().value_counts()
st.bar_chart(words[0:50])

mask = np.array(Image.open(os.path.join(d, "poet.png")))

# distributions of poem types according to ages and authors
st.write("Distributions of poem types according to ages and authors")
le = LabelEncoder()
df.author = le.fit_transform(df.author)
grid = sns.catplot(x="age", y="author", hue="type", data=df)
# catplot returns a FacetGrid; hand its figure to Streamlit explicitly rather
# than calling st.pyplot() with no arguments.
st.pyplot(grid.figure)
plt.close(grid.figure)

st.write("Most appearing words excluding stopwords")
word_cloud(words, "Word Cloud")