Spaces:

HamidBekam
/

Test_LLM

Runtime error

App Files Files Community

HamidBekam committed on Mar 6, 2023

Commit

c27aa03

•

1 Parent(s): 8ff3479

Create app.py

Browse files

Files changed (1) hide show

app.py +91 -0

app.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import streamlit as st
+import pandas as pd
+import re
+import nltk
+from PIL import Image
+import os
+import numpy as np
+import seaborn as sns
+from wordcloud import WordCloud, STOPWORDS
+from nltk.corpus import stopwords
+import datasets
+from datasets import load_dataset
+import matplotlib.pyplot as plt
+import sklearn
+from sklearn.preprocessing import LabelEncoder
+# Use seaborn's "RdBu" palette for the plots drawn below.
+sns.set_palette("RdBu")
+# Load the "merve/poetry" dataset and build a DataFrame from its "train" split.
+# NOTE(review): streaming=True presumably returns an iterable (lazy) dataset;
+# confirm that DataFrame.from_dict consumes it as intended.
+dataset = load_dataset("merve/poetry", streaming=True)
+df = pd.DataFrame.from_dict(dataset["train"])
+# Base directory for bundled assets: the script's own folder when __file__ is
+# defined, otherwise the current working directory.
+d = os.path.dirname(__file__) if "__file__" in locals() else os.getcwd()
+# Fetch the NLTK stopword corpus, then read the English stopword list.
+nltk.download("stopwords")
+stop = stopwords.words('english')
+# Matches every character that is not an ASCII letter, a digit or whitespace.
+# Raw string: "\d" and "\s" are regex classes, not Python string escapes.
+_SPECIAL_CHARS_RE = re.compile(r"[^a-zA-Z\d\s]")
+
+
+def standardize(text, remove_digits=True):
+    """Remove special characters from *text* and lowercase the result.
+
+    Args:
+        text: The string to clean.
+        remove_digits: Accepted for backward compatibility only. It has never
+            influenced the result: digits are always kept.
+
+    Returns:
+        The lowercased text containing only ASCII letters, digits and
+        whitespace.
+    """
+    return _SPECIAL_CHARS_RE.sub("", text).lower()
+st.set_option('deprecation.showPyplotGlobalUse', False)
+st.write("Poetry dataset, content column cleaned from special characters and lowercased")
+# Remove stopwords before standardizing so contractions such as "don't" still
+# match the NLTK list. The comparison is case-insensitive because the list is
+# lowercase while the raw poems are not; a set gives O(1) membership tests.
+_stop_words = set(stop)
+df["content"] = df["content"].apply(
+    lambda poem: " ".join(
+        word for word in poem.split() if word.lower() not in _stop_words
+    )
+)
+# Strip special characters and lowercase what is left.
+df["content"] = df["content"].apply(standardize)
+st.dataframe(df)
+st.subheader("Visualization on dataset statistics")
+# One count plot per column: (caption, column, extra catplot options, x tick rotation).
+_count_plots = (
+    ("Number of poems written in each type", "type", {}, 0),
+    ("Number of poems for each age", "age", {}, 0),
+    ("Number of poems for each author", "author", {"aspect": 4}, 90),
+)
+for _caption, _column, _options, _rotation in _count_plots:
+    st.write(_caption)
+    sns.catplot(x=_column, data=df, kind="count", **_options)
+    plt.xticks(rotation=_rotation)
+    # Hand the figure to Streamlit explicitly instead of relying on the
+    # deprecated global-figure behaviour of a bare st.pyplot().
+    # clear_figure=True keeps the original "clear after rendering" semantics.
+    st.pyplot(plt.gcf(), clear_figure=True)
+# Distribution of poem types across ages and authors.
+st.write("Distributions of poem types according to ages and authors, seems that folks in renaissance loved the love themed poems  and nature themed poems became popular later")
+# Replace author names with integer codes so they can be used on the y axis.
+le = LabelEncoder()
+df.author = le.fit_transform(df.author)
+sns.catplot(x="age", y="author", hue="type", data=df)
+# Pass the figure explicitly rather than using the deprecated bare st.pyplot();
+# clear_figure=True keeps the original "clear after rendering" semantics.
+st.pyplot(plt.gcf(), clear_figure=True)
+def _word_counts(poems):
+    """Return how often each whitespace-separated word occurs in *poems*.
+
+    Args:
+        poems: A pandas Series of strings.
+
+    Returns:
+        A Series indexed by word, sorted by descending frequency.
+    """
+    return poems.str.split(expand=True).unstack().value_counts()
+
+
+# Word frequencies of the cleaned content: overall and per age.
+words = _word_counts(df.content)
+renaissance = _word_counts(df.content.loc[df.age == "Renaissance"])
+modern = _word_counts(df.content.loc[df.age == "Modern"])
+st.subheader("Visualizing content")
+# Read the mask image into an array and close the file handle deterministically.
+with Image.open(os.path.join(d, "poet.png")) as _mask_image:
+    mask = np.array(_mask_image)
+import matplotlib.pyplot as plt
+def word_cloud(content, title, mask=None):
+    """Render a word cloud for *content* in the Streamlit app.
+
+    Args:
+        content: A pandas Series whose index holds the words to draw.
+        title: Title shown above the word cloud.
+        mask: Optional image array passed to WordCloud as its shape mask.
+            contour_width only has a visible effect when a mask is given.
+            Defaults to None, which matches the previous behaviour.
+    """
+    wc = WordCloud(background_color="white", max_words=200, contour_width=3,
+                   stopwords=STOPWORDS, max_font_size=50, mask=mask)
+    wc.generate(" ".join(content.index.values))
+    fig, ax = plt.subplots(figsize=(10, 10))
+    ax.set_title(title, fontsize=20)
+    # Fixed random_state keeps the recolouring reproducible between reruns.
+    ax.imshow(wc.recolor(colormap='magma', random_state=42), cmap=plt.cm.gray,
+              interpolation="bilinear", alpha=0.98)
+    ax.axis('off')
+    # Render this specific figure rather than the deprecated implicit global
+    # one; clear_figure=True keeps the original "clear after rendering" semantics.
+    st.pyplot(fig, clear_figure=True)
+st.subheader("Most appearing words excluding stopwords in poems according to ages")
+# One word cloud per age, in the order they should appear on the page.
+for _counts, _cloud_title in (
+    (modern, "Word Cloud of Modern Poems"),
+    (renaissance, "Word Cloud Renaissance Poems"),
+):
+    word_cloud(_counts, _cloud_title)
+# Bar chart of the first 50 entries of the overall word-frequency series.
+st.write("Most appearing words including stopwords")
+st.bar_chart(words.head(50))