Spaces:

dongho204
/

space-demo

Runtime error

App Files Files Community

space-demo / app.py

dongho204

Create app.py

a8d2c6d about 1 year ago

raw

history blame

No virus

3.56 kB

	Hugging Face's logo
	Hugging Face
	Search models, datasets, users...
	Models
	Datasets
	Spaces
	Docs
	Solutions
	Pricing



	Spaces:

	merve
	/
	streamlit-dataset-demo


	like
	6
	App
	Files
	Community
	2
	streamlit-dataset-demo
	/
	app.py
	merve's picture
	merve
	HF STAFF
	Update app.py
	dd02d3b
	about 2 years ago
	raw
	history
	blame
	contribute
	delete
	No virus
	3.21 kB
	import streamlit as st
	import pandas as pd
	import re
	import nltk
	from PIL import Image
	import os
	import numpy as np
	import seaborn as sns
	from wordcloud import WordCloud, STOPWORDS
	from nltk.corpus import stopwords
	import datasets
	from datasets import load_dataset
	import matplotlib.pyplot as plt
	import sklearn
	from sklearn.preprocessing import LabelEncoder
	# Make "RdBu" the default seaborn colour palette for the plots drawn below.
	sns.set_palette("RdBu")
	# loading dataset
	# The "merve/poetry" dataset is opened in streaming mode and its "train"
	# split is turned into a pandas DataFrame.
	dataset = load_dataset("merve/poetry", streaming=True)
	df = pd.DataFrame.from_dict(dataset["train"])


	# Directory used to locate files shipped with the app: the script's own
	# folder when __file__ is defined, otherwise the current working directory.
	d = os.path.dirname(__file__) if "__file__" in locals() else os.getcwd()
	# Download the NLTK stop-word corpus, then keep the English list; it is
	# used further down to filter words out of the poem text.
	nltk.download("stopwords")
	stop = stopwords.words('english')

	# standardizing dataset by removing special characters and lowercasing

	def standardize(text, remove_digits=True):
	    """Strip special characters from ``text`` and lowercase it.

	    Every character that is not an ASCII letter, a digit, or whitespace is
	    removed; what remains is lowercased.

	    Args:
	        text: The string to clean.
	        remove_digits: Accepted for backward compatibility only. The
	            function never reads it, so digits are always kept.

	    Returns:
	        The cleaned, lowercased string.
	    """
	    # Raw string: in a plain literal "\d" and "\s" are invalid escape
	    # sequences, which recent Python versions warn about.
	    text = re.sub(r'[^a-zA-Z\d\s]', '', text)
	    return text.lower()
	# Turn off Streamlit's warning about calling st.pyplot() without a figure.
	st.set_option('deprecation.showPyplotGlobalUse', False)
	st.write("Poetry dataset, content column cleaned from special characters and lowercased")
	# Build the lookup once as a set: membership is then O(1) instead of a
	# linear scan of the stop-word list for every word of every poem.
	stop_words = set(stop)
	# NOTE(review): stop words are filtered before the text is lowercased, so a
	# stop word that appears capitalised in a poem is presumably kept — confirm
	# that this is intended.
	df.content = df.content.apply(
	    lambda x: ' '.join([word for word in x.split() if word not in stop_words])
	)
	df.content = df.content.apply(standardize)
	st.dataframe(df)

	st.subheader("Visualization on dataset statistics")

	# One count plot per column, described as
	# (caption, column, x-tick rotation, extra catplot keyword arguments).
	count_plots = (
	    ("Number of poems written in each type", "type", 0, {}),
	    ("Number of poems for each age", "age", 0, {}),
	    ("Number of poems for each author", "author", 90, {"aspect": 4}),
	)
	for caption, column, tick_rotation, extra in count_plots:
	    st.write(caption)
	    sns.catplot(x=column, data=df, kind="count", **extra)
	    plt.xticks(rotation=tick_rotation)
	    st.pyplot()

	# distributions of poem types according to ages and authors
	st.write("Distributions of poem types according to ages and authors, seems that folks in renaissance loved the love themed poems and nature themed poems became popular later")
	# Replace author names with integer codes before plotting them on the y axis.
	le = LabelEncoder()
	encoded_authors = le.fit_transform(df.author)
	df.author = encoded_authors
	sns.catplot(x="age", y="author", hue="type", data=df)
	st.pyplot()


	def _word_counts(texts):
	    """Return how often each whitespace-separated word occurs in ``texts``."""
	    return texts.str.split(expand=True).unstack().value_counts()

	# most appearing words other than stop words:
	# overall, and separately for the "Renaissance" and "Modern" ages
	words = _word_counts(df.content)
	renaissance = _word_counts(df.content.loc[df.age == "Renaissance"])
	modern = _word_counts(df.content.loc[df.age == "Modern"])
	st.subheader("Visualizing content")
	# "poet.png" is read from the directory held in `d` and converted to an array.
	poet_image_path = os.path.join(d, "poet.png")
	mask = np.array(Image.open(poet_image_path))

	import matplotlib.pyplot as plt
	def word_cloud(content, title):
	    """Draw a word cloud from the index labels of ``content`` in the Streamlit app.

	    Args:
	        content: Object whose ``index.values`` are the words to draw; the
	            callers in this file pass the result of ``value_counts()``.
	        title: Text shown as the plot title.
	    """
	    wc = WordCloud(background_color="white", max_words=200, contour_width=3,
	                   stopwords=STOPWORDS, max_font_size=50)
	    # NOTE(review): only the index labels are joined, so the counts stored in
	    # `content` are not handed to WordCloud — confirm that this is intended.
	    wc.generate(" ".join(content.index.values))
	    fig = plt.figure(figsize=(10, 10))
	    plt.title(title, fontsize=20)
	    plt.imshow(wc.recolor(colormap='magma', random_state=42), cmap=plt.cm.gray,
	               interpolation="bilinear", alpha=0.98)
	    plt.axis('off')
	    # Pass the figure explicitly instead of relying on global pyplot state.
	    # clear_figure=True keeps the clearing that the argument-less call did
	    # by default.
	    st.pyplot(fig, clear_figure=True)

	st.subheader("Most appearing words excluding stopwords in poems according to ages")
	# Draw the modern cloud first, then the renaissance one.
	for counts, heading in (
	    (modern, "Word Cloud of Modern Poems"),
	    (renaissance, "Word Cloud Renaissance Poems"),
	):
	    word_cloud(counts, heading)

	# most appearing words including stopwords
	st.write("Most appearing words including stopwords")
	# Chart the first 50 entries of the overall word counts.
	top_words = words[:50]
	st.bar_chart(top_words)