import os
import datetime as dt

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import praw
from wordcloud import WordCloud, STOPWORDS

# Read the Reddit API credentials from environment variables rather than
# hardcoding them in the source (the original script committed them, which
# leaks secrets). The variable names below are placeholders; set them to the
# values from your Reddit app settings.
reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                     user_agent='Profile extractor',
                     username=os.environ['REDDIT_USERNAME'],
                     password=os.environ['REDDIT_PASSWORD'])

st.title('Just Reddit as it is 👀')

st.write('This is a simple web app to extract data from Reddit and analyze it.')

DATA_URL = 'subreddit_data_v1.csv'
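
# The authenticated client above is otherwise unused in this script, since the
# app reads a pre-scraped CSV. Below is a minimal sketch of how such a CSV
# could be built with PRAW; the subreddit list, the 'hot' listing, and the
# date format are assumptions, not taken from the original app. The column
# names match what the rest of the script expects.
def build_dataset(subreddits, limit=500, out_path=DATA_URL):
    rows = []
    for name in subreddits:
        for post in reddit.subreddit(name).hot(limit=limit):
            rows.append({
                'subreddit': name,
                'title': post.title,
                'score': post.score,
                'num_comments': post.num_comments,
                # created_utc is a Unix timestamp; keep only the calendar date
                # so the per-day groupbys below have something to group on.
                'created': dt.date.fromtimestamp(post.created_utc).isoformat(),
            })
    pd.DataFrame(rows).to_csv(out_path, index=False)

# Example (commented out so the app does not hit the Reddit API on rerun):
# build_dataset(['python', 'datascience'])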



@st.cache  # memoize the CSV load across reruns (newer Streamlit uses st.cache_data)
def load_data():
    data = pd.read_csv(DATA_URL)
    # Normalize column names to lowercase so later lookups are consistent.
    data.rename(lambda c: str(c).lower(), axis='columns', inplace=True)
    return data

data_load_state = st.text('Loading data...')
data = load_data()
data_load_state.text("Done! (using st.cache)")


if st.checkbox('Show raw data'):
    st.subheader('Raw data')
    st.write(data)

subreddit = st.selectbox('Select a subreddit', data['subreddit'].unique())

st.subheader("Word cloud of the most common words in the subreddit's post titles")


stopwords = set(STOPWORDS)

# Concatenate every post title from the selected subreddit into one lowercase
# string; WordCloud tokenizes on word boundaries and drops stopwords itself.
comment_words = ' '.join(
    str(title).lower()
    for title in data[data['subreddit'] == subreddit]['title']
)

wordcloud = WordCloud(width=800, height=800,
                      background_color='white',
                      stopwords=stopwords,
                      min_font_size=10).generate(comment_words)

# Draw the word cloud on an explicit figure and hand that figure to st.pyplot;
# this avoids the deprecated global-pyplot usage that the original silenced
# with st.set_option('deprecation.showPyplotGlobalUse', False).
fig = plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
st.pyplot(fig)


# Show summary statistics for the selected subreddit
st.subheader('Statistics of the subreddit')
st.write(data[data['subreddit'] == subreddit].describe())

# Number of posts per day in the selected subreddit
st.subheader('Number of posts per day')
st.write(data[data['subreddit'] == subreddit].groupby('created')['title'].count())

# Number of comments per day in the selected subreddit
st.subheader('Number of comments per day')
st.write(data[data['subreddit'] == subreddit].groupby('created')['num_comments'].sum())
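
# Optional: the same daily counts as a line chart. This is a sketch, not part
# of the original app; it assumes the 'created' column holds sortable date
# strings so the index orders chronologically.
st.line_chart(data[data['subreddit'] == subreddit].groupby('created')['num_comments'].sum())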

# Bar chart of the scores of the selected subreddit's posts
st.subheader('Score of the posts')
st.bar_chart(data[data['subreddit'] == subreddit]['score'])
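
# Optional sketch (not in the original app): a histogram-style view of the
# score distribution, assuming 'score' is numeric. The 20-bin choice is
# arbitrary; interval labels are stringified so Streamlit can chart them.
score_bins = pd.cut(data[data['subreddit'] == subreddit]['score'], bins=20)
score_counts = score_bins.value_counts().sort_index()
score_counts.index = score_counts.index.astype(str)
st.bar_chart(score_counts)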




