Zaman, Shaheer Shaheer committed on
Commit
414b5fe
1 Parent(s): 90fc317

first commit

Browse files
Files changed (3) hide show
  1. .gitignore +1 -0
  2. app.py +87 -0
  3. requirements.txt +10 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ venv/
app.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import re
4
+ import nltk
5
+ from PIL import Image
6
+ import os
7
+ import numpy as np
8
+ import seaborn as sns
9
+ from wordcloud import WordCloud, STOPWORDS
10
+ from nltk.corpus import stopwords
11
+ import datasets
12
+ from datasets import load_dataset
13
+ import matplotlib.pyplot as plt
14
+ import sklearn
15
+ from sklearn.preprocessing import LabelEncoder
16
+ sns.set_palette('RdBu')
17
+
18
# Load the poetry dataset in streaming mode and build a DataFrame from its
# 'train' split.
dataset = load_dataset('merve/poetry', streaming=True)
df = pd.DataFrame.from_dict(dataset['train'])

# Base directory for bundled assets: the script's own folder when __file__ is
# defined, otherwise the current working directory.
if '__file__' in locals():
    d = os.path.dirname(__file__)
else:
    d = os.getcwd()

# Fetch the NLTK stopword corpus, then keep the English stopword list.
nltk.download('stopwords')
stop = stopwords.words('english')
25
+
26
def standardize(text, remove_digits=True):
    """Strip special characters from *text* and lower-case the result.

    Every character that is not a letter in ``a-z``/``A-Z``, a digit or
    whitespace is removed; what remains is lower-cased.

    Args:
        text: The string to clean.
        remove_digits: Accepted for backward compatibility. It is not read
            by this function: digits are always kept.

    Returns:
        The cleaned, lower-cased string.
    """
    # Raw string: in a plain literal '\d' and '\s' are invalid escape
    # sequences, which newer Python versions warn about.
    cleaned = re.sub(r'[^a-zA-Z\d\s]', '', text)
    return cleaned.lower()
31
+
32
+
33
# Switch off the 'deprecation.showPyplotGlobalUse' option; the plots in this
# script call st.pyplot() without passing a figure.
st.set_option('deprecation.showPyplotGlobalUse', False)
st.write('Poetry dataset, content character cleaned from special characters and lower cased')


def _drop_stopwords(poem):
    """Return *poem* re-joined with single spaces, minus tokens found in ``stop``."""
    kept = [token for token in poem.split() if token not in stop]
    return ' '.join(kept)


# Stopwords are filtered first (exact, case-sensitive token match), then the
# text is stripped of special characters and lower-cased.
df.content = df.content.apply(_drop_stopwords)
df.content = df.content.apply(standardize)
st.dataframe(df)
38
+
39
st.subheader('Visualization on dataset statistics')

# Count plot: poems per type.
st.write('Number of poems written in each type')
sns.catplot(x='type', data=df, kind='count')
plt.xticks(rotation=0)
st.pyplot()

# Count plot: poems per age.
st.write('Number of poems for each age')
sns.catplot(x='age', data=df, kind='count')
plt.xticks(rotation=0)
st.pyplot()

# Count plot: poems per author; wide aspect and vertical tick labels.
st.write("Number of poems for each author")
sns.catplot(x="author", data=df, kind="count", aspect=4)
plt.xticks(rotation=90)
st.pyplot()

st.write('Distributions of poem types according to ages and authors, seems that folks in renaissance loved the love themed poems and nature themed poems became popular later')
le = LabelEncoder()

# Replace author names with integer codes so they can be used as the y axis.
# This overwrites df.author in place.
df.author = le.fit_transform(df.author)
sns.catplot(x='age', y='author', hue='type', data=df)
st.pyplot()
62
+
63
def _word_counts(poems):
    """Count whitespace-separated tokens across every string in *poems*."""
    tokens = poems.str.split(expand=True).unstack()
    return tokens.value_counts()


# Token frequencies over the whole corpus and per age.
words = _word_counts(df.content)
renaissance = _word_counts(df.content.loc[df.age == 'Renaissance'])
# NOTE(review): 'modern' is lower-case while 'Renaissance' is capitalised;
# confirm this matches the age labels actually present in the dataset.
modern = _word_counts(df.content.loc[df.age == 'modern'])
st.subheader('Visualizing content')
# Image bundled next to the script, loaded as an array.
mask = np.array(Image.open(os.path.join(d, 'poet.png')))
68
+
69
+ import matplotlib.pyplot as plt
70
def word_cloud(content, title):
    """Draw a word cloud of *content*'s index labels and show it in Streamlit.

    Args:
        content: Object whose ``index.values`` are the words to draw. Only
            the index labels are joined into the text given to WordCloud.
        title: Title placed above the rendered cloud.
    """
    wc = WordCloud(background_color='white',
                   max_words=200,
                   contour_width=3,
                   stopwords=STOPWORDS,
                   max_font_size=50)
    # NOTE(review): only the index labels are passed on, so any counts held
    # in the values of `content` do not reach WordCloud; confirm that this
    # is intended.
    wc.generate(' '.join(content.index.values))
    fig = plt.figure(figsize=(10, 10))
    plt.title(title, fontsize=20)
    plt.imshow(wc.recolor(colormap='magma', random_state=42),
               cmap=plt.cm.gray, interpolation="bilinear", alpha=0.98)
    plt.axis('off')
    # Hand the figure to Streamlit explicitly rather than relying on
    # pyplot's global state, then close it so figures do not pile up
    # across calls.
    st.pyplot(fig)
    plt.close(fig)
82
+
83
st.subheader('Most appearing words excluding stopwords in poems according to ages')
# One cloud per age, each titled after the counts it actually displays.
word_cloud(renaissance, 'word cloud Renaissance poems')
word_cloud(modern, 'word cloud Modern poems')

# NOTE(review): `words` is computed from content that already had stopwords
# filtered out earlier in the script, so "including stopwords" may not
# describe this chart; confirm the intended wording.
st.write('Most appearing words including stopwords')
st.bar_chart(words[:50])
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ nltk
2
+ spacy
3
+ datasets==1.12.1
4
+ wordcloud
5
+ streamlit==0.84.2
6
+ numpy
7
+ pandas
8
+ scikit-learn
9
+ pillow
10
+ seaborn