dongho204 committed on
Commit
a8d2c6d
1 Parent(s): 4c687be

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +133 -0
app.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Hugging Face's logo
2
+ Hugging Face
3
+ Search models, datasets, users...
4
+ Models
5
+ Datasets
6
+ Spaces
7
+ Docs
8
+ Solutions
9
+ Pricing
10
+
11
+
12
+
13
+ Spaces:
14
+
15
+ merve
16
+ /
17
+ streamlit-dataset-demo
18
+
19
+
20
+ like
21
+ 6
22
+ App
23
+ Files
24
+ Community
25
+ 2
26
+ streamlit-dataset-demo
27
+ /
28
+ app.py
29
+ merve's picture
30
+ merve
31
+ HF STAFF
32
+ Update app.py
33
+ dd02d3b
34
+ about 2 years ago
35
+ raw
36
+ history
37
+ blame
38
+ contribute
39
+ delete
40
+ No virus
41
+ 3.21 kB
42
+ import streamlit as st
43
+ import pandas as pd
44
+ import re
45
+ import nltk
46
+ from PIL import Image
47
+ import os
48
+ import numpy as np
49
+ import seaborn as sns
50
+ from wordcloud import WordCloud, STOPWORDS
51
+ from nltk.corpus import stopwords
52
+ import datasets
53
+ from datasets import load_dataset
54
+ import matplotlib.pyplot as plt
55
+ import sklearn
56
+ from sklearn.preprocessing import LabelEncoder
57
sns.set_palette("RdBu")

# Load the poetry dataset in streaming mode and build a DataFrame from its
# "train" split.
dataset = load_dataset("merve/poetry", streaming=True)
df = pd.DataFrame.from_dict(dataset["train"])

# Base directory for bundled assets: the script's own folder when __file__
# is defined, otherwise the current working directory.
if "__file__" in locals():
    d = os.path.dirname(__file__)
else:
    d = os.getcwd()

# Download the NLTK stop-word corpus, then read the English stop-word list.
nltk.download("stopwords")
stop = stopwords.words('english')
66
+
67
+ # standardizing dataset by removing special characters and lowercasing
68
+
69
def standardize(text, remove_digits=True):
    """Strip special characters from *text* and lowercase it.

    Every character that is not an ASCII letter, a digit, or whitespace is
    removed, and the remaining text is lowercased.

    Args:
        text: The string to clean.
        remove_digits: Accepted for backward compatibility only; it is not
            consulted, and digits are always kept.

    Returns:
        The cleaned, lowercased string.
    """
    # Raw string literal: in a plain literal "\d" and "\s" are invalid
    # escape sequences, which newer Python versions warn about.
    cleaned = re.sub(r'[^a-zA-Z\d\s]', '', text)
    return cleaned.lower()
74
st.set_option('deprecation.showPyplotGlobalUse', False)
st.write("Poetry dataset, content column cleaned from special characters and lowercased")

# Remove stop words first, while punctuation is still present, so that
# contractions such as "don't" can still match their stop-word entries.
# The comparison lowercases each word: the text is only lowercased later by
# standardize(), so a case-sensitive test would let capitalised stop words
# ("The", "And") slip through into the cleaned content.
# A set gives O(1) membership tests instead of scanning the list per word.
_stop_words = set(stop)
df.content = df.content.apply(
    lambda poem: ' '.join(
        word for word in poem.split() if word.lower() not in _stop_words
    )
)

# Strip special characters and lowercase what is left.
df.content = df.content.apply(standardize)
st.dataframe(df)
79
+
80
+ st.subheader("Visualization on dataset statistics")
81
+
82
# Each chart below is drawn by seaborn on a new current figure. The figure is
# handed to st.pyplot() explicitly, because calling st.pyplot() with no
# argument relies on the deprecated global-figure behaviour.
# clear_figure=True keeps the original "clear after rendering" behaviour.

st.write("Number of poems written in each type")
sns.catplot(x="type", data=df, kind="count")
plt.xticks(rotation=0)
st.pyplot(plt.gcf(), clear_figure=True)

st.write("Number of poems for each age")
sns.catplot(x="age", data=df, kind="count")
plt.xticks(rotation=0)
st.pyplot(plt.gcf(), clear_figure=True)

st.write("Number of poems for each author")
sns.catplot(x="author", data=df, kind="count", aspect=4)
# Author names are long; rotate the tick labels so they do not overlap.
plt.xticks(rotation=90)
st.pyplot(plt.gcf(), clear_figure=True)

# Distributions of poem types according to ages and authors.
st.write("Distributions of poem types according to ages and authors, seems that folks in renaissance loved the love themed poems and nature themed poems became popular later")
le = LabelEncoder()

# Replace author names with integer codes so they can be used on the y axis.
# NOTE: this overwrites df.author in place.
df.author = le.fit_transform(df.author)
sns.catplot(x="age", y="author", hue="type", data=df)
st.pyplot(plt.gcf(), clear_figure=True)
104
+
105
+
106
# Word frequencies over the cleaned content column: every poem is split on
# whitespace, the words are flattened into one Series, and occurrences are
# counted. `words` covers all poems; `renaissance` and `modern` cover only
# the poems whose age matches.
words = df.content.str.split(expand=True).unstack().value_counts()
renaissance = df.content.loc[df.age == "Renaissance"].str.split(expand=True).unstack().value_counts()
modern = df.content.loc[df.age == "Modern"].str.split(expand=True).unstack().value_counts()
st.subheader("Visualizing content")

# Load the bundled image as an array. The context manager closes the file
# handle once the pixel data has been copied into the array.
# NOTE(review): `mask` is not referenced anywhere else in the visible code.
with Image.open(os.path.join(d, "poet.png")) as _mask_image:
    mask = np.array(_mask_image)
113
+
114
+ import matplotlib.pyplot as plt
115
def word_cloud(content, title):
    """Render a word cloud for *content* on the Streamlit page.

    Args:
        content: Object whose ``index.values`` holds the words to draw (this
            script passes the result of ``value_counts()``). Only the index
            is read, so each distinct word is fed to the cloud once and the
            counts themselves are not used.
        title: Title shown above the plot.
    """
    # NOTE(review): contour_width normally only has an effect together with
    # a mask, and no mask is passed here -- confirm whether the module-level
    # `mask` was meant to be supplied.
    wc = WordCloud(background_color="white", max_words=200, contour_width=3,
                   stopwords=STOPWORDS, max_font_size=50)
    wc.generate(" ".join(content.index.values))

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_title(title, fontsize=20)
    ax.imshow(wc.recolor(colormap='magma', random_state=42), cmap=plt.cm.gray,
              interpolation="bilinear", alpha=0.98)
    ax.axis('off')

    # Pass the figure explicitly instead of relying on the deprecated
    # global-figure form st.pyplot(); clear_figure=True keeps the previous
    # behaviour of clearing the figure after it is rendered.
    st.pyplot(fig, clear_figure=True)
124
+
125
st.subheader("Most appearing words excluding stopwords in poems according to ages")

# One word cloud per age, modern first.
for counts, heading in (
    (modern, "Word Cloud of Modern Poems"),
    (renaissance, "Word Cloud Renaissance Poems"),
):
    word_cloud(counts, heading)

# Bar chart of the first 50 entries of the overall word counts.
# NOTE(review): `words` is derived from df.content after the stop-word
# filter has been applied, so the caption below may not match the data --
# confirm the intended wording.
st.write("Most appearing words including stopwords")
st.bar_chart(words.iloc[:50])
133
+