merve HF staff committed on
Commit
5683823
•
1 Parent(s): 47b32ef

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -12
app.py CHANGED
@@ -11,9 +11,10 @@ from wordcloud import WordCloud, STOPWORDS
11
  from nltk.corpus import stopwords
12
  import datasets
13
  from datasets import load_dataset
 
14
  import sklearn
15
  from sklearn.preprocessing import LabelEncoder
16
-
17
  # loading dataset
18
  dataset = load_dataset("merve/poetry", streaming=True)
19
  df = pd.DataFrame.from_dict(dataset["train"])
@@ -36,30 +37,46 @@ df.content = df.content.apply(lambda x: ' '.join([word for word in x.split() if
36
  df.content=df.content.apply(standardize)
37
  st.dataframe(df)
38
 
39
- #most appearing words including stopwords
40
- st.write("Most appearing words including stopwords")
41
- words = df.content.str.split(expand=True).unstack().value_counts()
42
- st.bar_chart(words[0:50])
43
- st.set_option('deprecation.showPyplotGlobalUse', False)
44
 
 
 
 
 
45
 
 
 
 
 
46
 
47
- mask = np.array(Image.open(os.path.join(d, "poet.png")))
 
 
 
48
 
49
  # distributions of poem types according to ages and authors
50
- st.write("Distributions of poem types according to ages and authors")
 
 
51
  le = LabelEncoder()
52
 
53
  df.author = le.fit_transform(df.author)
54
  sns.catplot(x="age", y="author",hue="type", data=df)
55
  st.pyplot()
56
 
 
 
57
  # most appearing words other than stop words
58
-
 
 
 
 
 
59
  import matplotlib.pyplot as plt
60
  def word_cloud(content, title):
61
  wc = WordCloud(background_color="white", max_words=200,contour_width=3,
62
- stopwords=STOPWORDS, mask = mask, max_font_size=50)
63
  wc.generate(" ".join(content.index.values))
64
  fig = plt.figure(figsize=(10, 10))
65
  plt.title(title, fontsize=20)
@@ -67,5 +84,12 @@ def word_cloud(content, title):
67
  plt.axis('off')
68
  st.pyplot()
69
 
70
- st.write("Most appearing words excluding stopwords")
71
- word_cloud(words, "Word Cloud")
 
 
 
 
 
 
 
 
11
  from nltk.corpus import stopwords
12
  import datasets
13
  from datasets import load_dataset
14
+ import matplotlib.pyplot as plt
15
  import sklearn
16
  from sklearn.preprocessing import LabelEncoder
17
+ sns.set_palette("RdBu")
18
  # loading dataset
19
  dataset = load_dataset("merve/poetry", streaming=True)
20
  df = pd.DataFrame.from_dict(dataset["train"])
 
37
  df.content=df.content.apply(standardize)
38
  st.dataframe(df)
39
 
40
+ st.subheader("Visualization on dataset statistics")
 
 
 
 
41
 
42
+ st.write("Number of poems written in each type")
43
+ sns.catplot(x="type", data=df, kind="count")
44
+ plt.xticks(rotation=0)
45
+ st.pyplot()
46
 
47
+ st.write("Number of poems for each age")
48
+ sns.catplot(x="age", data=df, kind="count")
49
+ plt.xticks(rotation=0)
50
+ st.pyplot()
51
 
52
+ st.write("Number of poems for each author")
53
+ sns.catplot(x="author", data=df, kind="count", aspect = 4)
54
+ plt.xticks(rotation=90)
55
+ st.pyplot()
56
 
57
  # distributions of poem types according to ages and authors
58
+ st.write("Distributions of poem types according to ages and authors, \
59
+ seems that folks in renaissance loved the love themed poems \
60
+ and nature themed poems became popular later")
61
  le = LabelEncoder()
62
 
63
  df.author = le.fit_transform(df.author)
64
  sns.catplot(x="age", y="author",hue="type", data=df)
65
  st.pyplot()
66
 
67
+
68
+ #words = df.content.str.split(expand=True).unstack().value_counts()
69
  # most appearing words other than stop words
70
+ words = df.content.str.split(expand=True).unstack().value_counts()
71
+ renaissance = df.content.loc[df.age == "Renaissance"].str.split(expand=True).unstack().value_counts()
72
+ modern = df.content.loc[df.age == "Modern"].str.split(expand=True).unstack().value_counts()
73
+ st.subheader("Visualizing content")
74
+ mask = np.array(Image.open(os.path.join(d, "poet.png")))
75
+
76
  import matplotlib.pyplot as plt
77
  def word_cloud(content, title):
78
  wc = WordCloud(background_color="white", max_words=200,contour_width=3,
79
+ stopwords=STOPWORDS, max_font_size=50)
80
  wc.generate(" ".join(content.index.values))
81
  fig = plt.figure(figsize=(10, 10))
82
  plt.title(title, fontsize=20)
 
84
  plt.axis('off')
85
  st.pyplot()
86
 
87
+ st.subheader("Most appearing words excluding stopwords in poems according to ages")
88
+ word_cloud(modern, "Word Cloud of Modern Poems")
89
+
90
+ word_cloud(renaissance, "Word Cloud Renaissance Poems")
91
+
92
+ # most appearing words including stopwords
93
+ st.write("Most appearing words including stopwords")
94
+ st.bar_chart(words[0:50])
95
+ st.set_option('deprecation.showPyplotGlobalUse', False)