HamidBekam committed on
Commit
c27aa03
1 Parent(s): 8ff3479

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -0
app.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import re
4
+ import nltk
5
+ from PIL import Image
6
+ import os
7
+ import numpy as np
8
+ import seaborn as sns
9
+ from wordcloud import WordCloud, STOPWORDS
10
+ from nltk.corpus import stopwords
11
+ import datasets
12
+ from datasets import load_dataset
13
+ import matplotlib.pyplot as plt
14
+ import sklearn
15
+ from sklearn.preprocessing import LabelEncoder
16
# Colour palette shared by every seaborn chart drawn below.
sns.set_palette("RdBu")

# Load the "merve/poetry" dataset in streaming mode and build a DataFrame
# from its "train" split.
dataset = load_dataset("merve/poetry", streaming=True)
df = pd.DataFrame.from_dict(dataset["train"])

# Directory containing this script; falls back to the working directory
# when __file__ is not defined in the current namespace.
if "__file__" in locals():
    d = os.path.dirname(__file__)
else:
    d = os.getcwd()

# Fetch the NLTK stopword corpus, then read the English word list from it.
nltk.download("stopwords")
stop = stopwords.words('english')
25
+
26
+ # standardizing dataset by removing special characters and lowercasing
27
+
28
def standardize(text, remove_digits=True):
    """Strip special characters from *text* and lowercase it.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed, then the remaining text is lowercased.

    Args:
        text: The string to clean.
        remove_digits: Accepted for backward compatibility only. It is not
            consulted, so digits are always kept.

    Returns:
        The cleaned, lowercased string.
    """
    # Raw string: "\d" and "\s" are invalid escape sequences in a normal
    # string literal and trigger a warning on recent Python versions.
    cleaned = re.sub(r'[^a-zA-Z\d\s]', '', text)
    return cleaned.lower()
33
st.set_option('deprecation.showPyplotGlobalUse', False)
st.write("Poetry dataset, content column cleaned from special characters and lowercased")

# The stopword list is lowercase, so compare each word case-insensitively;
# otherwise capitalised stopwords ("The", "And") slip through the filter and
# are only lowercased afterwards. A set keeps every membership test O(1).
stop_words = set(stop)
df.content = df.content.apply(
    lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_words)
)
# Remove special characters and lowercase what is left.
df.content = df.content.apply(standardize)
st.dataframe(df)
38
+
39
st.subheader("Visualization on dataset statistics")

# Each chart below passes the current matplotlib figure to st.pyplot
# explicitly instead of relying on the deprecated global-figure behaviour.
# clear_figure=True keeps the "clear after rendering" behaviour that the
# argument-less call had.

st.write("Number of poems written in each type")
sns.catplot(x="type", data=df, kind="count")
plt.xticks(rotation=0)
st.pyplot(plt.gcf(), clear_figure=True)

st.write("Number of poems for each age")
sns.catplot(x="age", data=df, kind="count")
plt.xticks(rotation=0)
st.pyplot(plt.gcf(), clear_figure=True)

st.write("Number of poems for each author")
sns.catplot(x="author", data=df, kind="count", aspect=4)
# Author names are long, so rotate the tick labels to keep them readable.
plt.xticks(rotation=90)
st.pyplot(plt.gcf(), clear_figure=True)

# Distributions of poem types according to ages and authors.
st.write("Distributions of poem types according to ages and authors, seems that folks in renaissance loved the love themed poems and nature themed poems became popular later")
le = LabelEncoder()

# Replace author names with integer codes so they can be used as the y axis.
# This overwrites the author column in place.
df.author = le.fit_transform(df.author)
sns.catplot(x="age", y="author", hue="type", data=df)
st.pyplot(plt.gcf(), clear_figure=True)
63
+
64
+
65
+ #words = df.content.str.split(expand=True).unstack().value_counts()
66
def _word_frequencies(poems):
    """Count how often each whitespace-separated token occurs in *poems*."""
    tokens = poems.str.split(expand=True)
    return tokens.unstack().value_counts()


# Word counts over all poems, then separately for each age.
words = _word_frequencies(df.content)
renaissance = _word_frequencies(df.content.loc[df.age == "Renaissance"])
modern = _word_frequencies(df.content.loc[df.age == "Modern"])

st.subheader("Visualizing content")
# Load the image stored next to this script as a numpy array.
mask_path = os.path.join(d, "poet.png")
mask = np.array(Image.open(mask_path))
72
+
73
+ import matplotlib.pyplot as plt
74
def word_cloud(content, title):
    """Draw a frequency-weighted word cloud in the Streamlit app.

    Args:
        content: Word counts whose ``items()`` yields ``(word, count)``
            pairs, e.g. the result of ``Series.value_counts()``.
        title: Text shown above the image.
    """
    wc = WordCloud(background_color="white", max_words=200, contour_width=3,
                   stopwords=STOPWORDS, max_font_size=50)
    # Weight each word by its count. Joining the unique words into one text
    # made every word occur once, so the counts never influenced the cloud.
    # generate_from_frequencies does not apply the stopword list, hence the
    # explicit filter here.
    frequencies = {
        word: int(count)
        for word, count in content.items()
        if word not in STOPWORDS
    }
    wc.generate_from_frequencies(frequencies)
    fig = plt.figure(figsize=(10, 10))
    plt.title(title, fontsize=20)
    plt.imshow(wc.recolor(colormap='magma', random_state=42), cmap=plt.cm.gray, interpolation="bilinear", alpha=0.98)
    plt.axis('off')
    # Pass the figure explicitly instead of relying on the deprecated
    # global-figure behaviour; clear it afterwards as the bare call did.
    st.pyplot(fig, clear_figure=True)
83
+
84
st.subheader("Most appearing words excluding stopwords in poems according to ages")
# One word cloud per age, drawn in the same order as before.
for counts, heading in (
    (modern, "Word Cloud of Modern Poems"),
    (renaissance, "Word Cloud Renaissance Poems"),
):
    word_cloud(counts, heading)

# Bar chart of the 50 most frequent words overall.
# NOTE(review): `words` is computed after stopwords were filtered out of
# df.content, so "including stopwords" may not describe this data; confirm.
st.write("Most appearing words including stopwords")
st.bar_chart(words.head(50))