Spaces:

merve
/

streamlit-dataset-demo

Build error

App Files Community

merve HF staff committed on Sep 27, 2021

Commit

5683823

•

1 Parent(s): 47b32ef

Upload app.py

Browse files

Files changed (1) hide show

app.py +36 -12

app.py CHANGED Viewed

@@ -11,9 +11,10 @@ from wordcloud import WordCloud, STOPWORDS
 from nltk.corpus import stopwords
 import datasets
 from datasets import load_dataset
 import sklearn
 from sklearn.preprocessing import LabelEncoder
 # loading dataset
 dataset = load_dataset("merve/poetry", streaming=True)
 df = pd.DataFrame.from_dict(dataset["train"])
@@ -36,30 +37,46 @@ df.content = df.content.apply(lambda x: ' '.join([word for word in x.split() if
 df.content=df.content.apply(standardize)
 st.dataframe(df)
-#most appearing words including stopwords
-st.write("Most appearing words including stopwords")
-words = df.content.str.split(expand=True).unstack().value_counts()
-st.bar_chart(words[0:50])
-st.set_option('deprecation.showPyplotGlobalUse', False)
-mask = np.array(Image.open(os.path.join(d, "poet.png")))
 # distributions of poem types according to ages and authors
-st.write("Distributions of poem types according to ages and authors")
 le = LabelEncoder()
 df.author = le.fit_transform(df.author)
 sns.catplot(x="age", y="author",hue="type", data=df)
 st.pyplot()
 # most appearing words other than stop words
 import matplotlib.pyplot as plt
 def word_cloud(content, title):
     wc = WordCloud(background_color="white", max_words=200,contour_width=3,
-                  stopwords=STOPWORDS, mask = mask, max_font_size=50)
     wc.generate(" ".join(content.index.values))
     fig = plt.figure(figsize=(10, 10))
     plt.title(title, fontsize=20)
@@ -67,5 +84,12 @@ def word_cloud(content, title):
     plt.axis('off')
     st.pyplot()
-st.write("Most appearing words excluding stopwords")
-word_cloud(words, "Word Cloud")

 from nltk.corpus import stopwords
 import datasets
 from datasets import load_dataset
+import matplotlib.pyplot as plt
 import sklearn
 from sklearn.preprocessing import LabelEncoder
+sns.set_palette("RdBu")
 # loading dataset
 dataset = load_dataset("merve/poetry", streaming=True)
 df = pd.DataFrame.from_dict(dataset["train"])
 df.content=df.content.apply(standardize)
 st.dataframe(df)
+st.subheader("Visualization on dataset statistics")
+st.write("Number of poems written in each type")
+sns.catplot(x="type", data=df, kind="count")
+plt.xticks(rotation=0)
+st.pyplot()
+st.write("Number of poems for each age")
+sns.catplot(x="age", data=df, kind="count")
+plt.xticks(rotation=0)
+st.pyplot()
+st.write("Number of poems for each author")
+sns.catplot(x="author", data=df, kind="count", aspect = 4)
+plt.xticks(rotation=90)
+st.pyplot()
 # distributions of poem types according to ages and authors
+st.write("Distributions of poem types according to ages and authors, \
+seems that folks in renaissance loved the love themed poems \
+and nature themed poems became popular later")
 le = LabelEncoder()
 df.author = le.fit_transform(df.author)
 sns.catplot(x="age", y="author",hue="type", data=df)
 st.pyplot()
+#words = df.content.str.split(expand=True).unstack().value_counts()
 # most appearing words other than stop words
+words = df.content.str.split(expand=True).unstack().value_counts()
+renaissance = df.content.loc[df.age == "Renaissance"].str.split(expand=True).unstack().value_counts()
+modern = df.content.loc[df.age == "Modern"].str.split(expand=True).unstack().value_counts()
+st.subheader("Visualizing content")
+mask = np.array(Image.open(os.path.join(d, "poet.png")))
 import matplotlib.pyplot as plt
 def word_cloud(content, title):
     wc = WordCloud(background_color="white", max_words=200,contour_width=3,
+                  stopwords=STOPWORDS, max_font_size=50)
     wc.generate(" ".join(content.index.values))
     fig = plt.figure(figsize=(10, 10))
     plt.title(title, fontsize=20)
     plt.axis('off')
     st.pyplot()
+st.subheader("Most appearing words excluding stopwords in poems according to ages")
+word_cloud(modern, "Word Cloud of Modern Poems")
+word_cloud(renaissance, "Word Cloud Renaissance Poems")
+# most appearing words including stopwords
+st.write("Most appearing words including stopwords")
+st.bar_chart(words[0:50])
+st.set_option('deprecation.showPyplotGlobalUse', False)