File size: 3,211 Bytes
cdf6823
79ffe5b
 
 
143d008
 
 
 
79ffe5b
 
143d008
 
5683823
143d008
 
5683823
143d008
 
79ffe5b
 
143d008
 
 
 
 
 
a4b69f2
79ffe5b
 
 
7dc792f
79ffe5b
05b5df8
143d008
 
 
9774795
79ffe5b
5683823
79ffe5b
5683823
 
 
 
143d008
5683823
 
 
 
143d008
5683823
 
 
 
143d008
 
dd02d3b
143d008
 
 
 
 
 
5683823
 
143d008
5683823
 
 
 
 
 
79ffe5b
 
143d008
5683823
79ffe5b
143d008
79ffe5b
143d008
79ffe5b
 
 
5683823
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import streamlit as st
import pandas as pd
import re
import nltk
from PIL import Image
import os
import numpy as np
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
import datasets 
from datasets import load_dataset
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import LabelEncoder
# Apply the "RdBu" seaborn palette to the charts drawn below.
sns.set_palette("RdBu")

# Open the "merve/poetry" dataset in streaming mode and load its "train"
# split into a DataFrame.
dataset = load_dataset("merve/poetry", streaming=True)
train_split = dataset["train"]
df = pd.DataFrame.from_dict(train_split)

# Directory used to locate bundled assets: the script's own folder when
# ``__file__`` is defined, otherwise the current working directory.
if "__file__" in locals():
    d = os.path.dirname(__file__)
else:
    d = os.getcwd()

# Fetch NLTK's "stopwords" corpus, then read its English word list.
nltk.download("stopwords")
stop = stopwords.words('english')

# standardizing dataset by removing special characters and lowercasing

def standardize(text, remove_digits=True):
    """Strip special characters from *text* and lowercase it.

    Every character that is not an ASCII letter, a digit, or whitespace is
    removed; what remains is lowercased.

    Args:
        text: The string to clean.
        remove_digits: Accepted for backward compatibility only. It is not
            consulted, so digits are always kept.

    Returns:
        The cleaned, lowercased string.
    """
    # Raw string: in a plain literal the regex escapes \d and \s are invalid
    # string escape sequences, which newer Python versions warn about.
    text = re.sub(r'[^a-zA-Z\d\s]', '', text)
    return text.lower()
# Silences the warning Streamlit shows when st.pyplot() is called without an
# explicit figure.
st.set_option('deprecation.showPyplotGlobalUse', False)
st.write("Poetry dataset, content column cleaned from special characters and lowercased")

# Clean first, then drop stop words: the text is only lowercased and stripped
# of punctuation by ``standardize``, so filtering the raw text lets
# capitalised or punctuated stop words ("The", "and,") slip through.
# The stop words are passed through ``standardize`` too, so contractions such
# as "don't" still match once their apostrophe has been removed.
# A set gives constant-time membership tests inside the per-word filter.
stop_words = set(stop) | {standardize(word) for word in stop}
df.content = df.content.apply(standardize)
df.content = df.content.apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words)
)
st.dataframe(df)


def _render_current_figure():
    """Hand the active matplotlib figure to Streamlit, then close it."""
    fig = plt.gcf()
    # Pass the figure explicitly instead of relying on pyplot's global state.
    st.pyplot(fig)
    # Release the figure so repeated script runs do not accumulate open figures.
    plt.close(fig)


st.subheader("Visualization on dataset statistics")

st.write("Number of poems written in each type")
sns.catplot(x="type", data=df, kind="count")
plt.xticks(rotation=0)
_render_current_figure()

st.write("Number of poems for each age")
sns.catplot(x="age", data=df, kind="count")
plt.xticks(rotation=0)
_render_current_figure()

st.write("Number of poems for each author")
# Wide aspect and vertical tick labels leave room for the author names.
sns.catplot(x="author", data=df, kind="count", aspect=4)
plt.xticks(rotation=90)
_render_current_figure()

# distributions of poem types according to ages and authors
st.write("Distributions of poem types according to ages and authors, seems that folks in renaissance loved the love themed poems  and nature themed poems became popular later")
le = LabelEncoder()

# Replace author names with integer codes; the codes are plotted on the y axis.
df.author = le.fit_transform(df.author)
sns.catplot(x="age", y="author", hue="type", data=df)
_render_current_figure()


# Word frequency tables built from the cleaned poem text: one for the whole
# corpus and one for each of the "Renaissance" and "Modern" ages.
def _token_counts(texts):
    """Count whitespace-separated tokens across a Series of strings."""
    token_table = texts.str.split(expand=True)
    return token_table.unstack().value_counts()


is_renaissance = df.age == "Renaissance"
is_modern = df.age == "Modern"

words = _token_counts(df.content)
renaissance = _token_counts(df.content.loc[is_renaissance])
modern = _token_counts(df.content.loc[is_modern])
st.subheader("Visualizing content")

# Load the "poet.png" image stored next to the script as a NumPy array.
poet_image = Image.open(os.path.join(d, "poet.png"))
mask = np.array(poet_image)
import matplotlib.pyplot as plt
def word_cloud(content, title):
    """Draw a word cloud sized by word frequency and show it in Streamlit.

    Args:
        content: pandas Series whose index holds the words and whose values
            hold their occurrence counts (e.g. a ``value_counts()`` result).
        title: Heading drawn above the cloud.
    """
    # ``generate_from_frequencies`` ignores the ``stopwords`` option, so drop
    # stop words here, along with purely numeric tokens, which ``generate``
    # would also have discarded by default.
    frequencies = {
        word: count
        for word, count in content.items()
        if word not in STOPWORDS and not word.isdigit()
    }
    wc = WordCloud(background_color="white", max_words=200, contour_width=3,
                   max_font_size=50)
    # Size words by their counts. Joining the unique index labels into one
    # text made every word occur exactly once, so the counts were never used.
    wc.generate_from_frequencies(frequencies)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_title(title, fontsize=20)
    ax.imshow(wc.recolor(colormap='magma', random_state=42), cmap=plt.cm.gray,
              interpolation="bilinear", alpha=0.98)
    ax.axis('off')
    # Pass the figure explicitly instead of relying on pyplot's global state,
    # then close it so repeated script runs do not accumulate open figures.
    st.pyplot(fig)
    plt.close(fig)

st.subheader("Most appearing words excluding stopwords in poems according to ages")
# Draw one cloud per age, Modern first and Renaissance second.
for age_counts, cloud_title in (
    (modern, "Word Cloud of Modern Poems"),
    (renaissance, "Word Cloud Renaissance Poems"),
):
    word_cloud(age_counts, cloud_title)

# Bar chart of the first 50 entries of the corpus-wide word counts.
# NOTE(review): ``words`` is computed from df.content after the stop-word
# filtering pass, so the label below may not match what is shown - confirm
# the intended wording.
st.write("Most appearing words including stopwords")
top_words = words[:50]
st.bar_chart(top_words)