MarMont committed
Commit e2bd7bd
1 Parent(s): 997565c

Transfer code

Files changed (4)
  1. app.py +339 -1
  2. katip-december.csv +0 -0
  3. requirements.txt +13 -0
  4. stopwords-tl.json +1 -0
app.py CHANGED
@@ -1,7 +1,345 @@
  import gradio as gr

  def greet(name):
      return "Hello " + name + "!!"

- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
  iface.launch()

+ # Required Libraries
+
+ # Base and Cleaning
+ import json
+ import requests
+ import pandas as pd
+ import numpy as np
+ import emoji
+ import regex
+ import re
+ import string
+ from collections import Counter
+ import tqdm
+ from operator import itemgetter
+
+ # Visualizations
+ import plotly.express as px
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ import pyLDAvis.gensim
+ import chart_studio
+ import chart_studio.plotly as py
+ import chart_studio.tools as tls
+
+ # Natural Language Processing (NLP)
+ import spacy
+ import gensim
+ import json
+ from spacy.tokenizer import Tokenizer
+ from gensim.corpora import Dictionary
+ from gensim.models.ldamulticore import LdaMulticore
+ from gensim.models.coherencemodel import CoherenceModel
+ from gensim.parsing.preprocessing import STOPWORDS as SW
+ from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+ from sklearn.model_selection import GridSearchCV
+ from pprint import pprint
+ from wordcloud import STOPWORDS
+ from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric
+
  import gradio as gr

+ def give_emoji_free_text(text):
+     """
+     Removes emojis from tweets
+     Accepts:
+         Text (tweets)
+     Returns:
+         Text (emoji free tweets)
+     """
+     emoji_list = [c for c in text if c in emoji.EMOJI_DATA]
+     clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])
+     return clean_text
+
+ def url_free_text(text):
+     '''
+     Cleans text from urls
+     '''
+     text = re.sub(r'http\S+', '', text)
+     return text
+
+ # Tokenizer function
+ def tokenize(text):
+     """
+     Parses a string into a list of semantic units (words)
+     Args:
+         text (str): The string that the function will tokenize.
+     Returns:
+         list: tokens parsed out
+     """
+     # Removing urls
+     pattern = r"http\S+"
+
+     tokens = re.sub(pattern, "", text)  # https://www.youtube.com/watch?v=O2onA4r5UaY
+     tokens = re.sub('[^a-zA-Z 0-9]', '', tokens)
+     tokens = re.sub('[%s]' % re.escape(string.punctuation), '', tokens)  # Remove punctuation
+     tokens = re.sub(r'\w*\d\w*', '', tokens)  # Remove words containing numbers
+     # tokens = re.sub('@*!*$*', '', text)  # Remove @ ! $
+     tokens = tokens.strip(',')  # TESTING THIS LINE
+     tokens = tokens.strip('?')  # TESTING THIS LINE
+     tokens = tokens.strip('!')  # TESTING THIS LINE
+     tokens = tokens.strip("'")  # TESTING THIS LINE
+     tokens = tokens.strip(".")  # TESTING THIS LINE
+
+     tokens = tokens.lower().split()  # Make text lowercase and split it
+
+     return tokens
+
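+ # Train one LDA model per candidate topic count and record its c_v coherence score.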
+ def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1):
+     coherence_values = []
+     model_list = []
+     for num_topics in range(start, limit, step):
+         model = gensim.models.ldamodel.LdaModel(corpus=corpus,
+                                                 num_topics=num_topics,
+                                                 random_state=100,
+                                                 chunksize=200,
+                                                 passes=10,
+                                                 per_word_topics=True,
+                                                 id2word=dictionary)
+         model_list.append(model)
+         coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
+         coherence_values.append(coherencemodel.get_coherence())
+
+     return model_list, coherence_values
+
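+ # Score a single combination of topic count (k), alpha (a) and eta (b) with c_v coherence.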
+ def compute_coherence_values2(corpus, dictionary, texts, k, a, b):
+     lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
+                                                 id2word=dictionary,
+                                                 num_topics=k,
+                                                 random_state=100,
+                                                 chunksize=200,
+                                                 passes=10,
+                                                 alpha=a,
+                                                 eta=b,
+                                                 per_word_topics=True)
+     coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
+
+     return coherence_model_lda.get_coherence()
+
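+ # Helpers: pick the highest-probability topic id from a list of (topic id, probability)
+ # pairs, and read a single topic's probability out of such a row.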
+ def assignTopic(l):
+     maxTopic = max(l, key=itemgetter(1))[0]
+     return maxTopic
+
+ def get_topic_value(row, i):
+     if len(row) == 1:
+         return row[0][1]
+     else:
+         return row[i][1]
+
+
+ df = pd.DataFrame()
+
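+ # Main pipeline for the Gradio app: load the tweet CSV, clean and lemmatize the text,
+ # fit and tune an LDA topic model, and return the topic-labelled DataFrame.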
+ def dataframeProcessing(dataset=None):  # dataset is unused; the interface passes no inputs
+     # Opening JSON file of Tagalog stopwords
+     with open('stopwords-tl.json') as f:
+         tlStopwords = json.loads(f.read())
+     stopwords = set(STOPWORDS)
+     stopwords.update(tlStopwords)
+     stopwords.update(['na', 'sa', 'ko', 'ako', 'ng', 'mga', 'ba', 'ka', 'yung', 'lang', 'di', 'mo', 'kasi'])
+
+     df = pd.read_csv('katip-december.csv')
+     df.rename(columns={'tweet': 'original_tweets'}, inplace=True)
+     df = df[df['language'].isin(['en'])]
+     df.reset_index(inplace=True)
+
+     # Apply the function above and get tweets free of emojis
+     call_emoji_free = lambda x: give_emoji_free_text(x)
+
+     # Apply `call_emoji_free` which calls the function to remove all emojis
+     df['emoji_free_tweets'] = df['original_tweets'].apply(call_emoji_free)
+
+     # Create a new column with url free tweets
+     df['url_free_tweets'] = df['emoji_free_tweets'].apply(url_free_text)
+
+     # Load spacy
+     # Make sure to restart the runtime after running installations and libraries tab
+     nlp = spacy.load('en_core_web_lg')
+
+     # Tokenizer
+     tokenizer = Tokenizer(nlp.vocab)
+
+
+     # Custom stopwords
+     custom_stopwords = ['hi', '\n', '\n\n', '&', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']
+
+
+     # Customize stop words by adding to the default list
+     STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)
+
+     # ALL_STOP_WORDS = spacy + gensim + wordcloud
+     ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)
+
+
+     tokens = []
+     STOP_WORDS.update(stopwords)
+
+     for doc in tokenizer.pipe(df['url_free_tweets'], batch_size=500):
+         doc_tokens = []
+         for token in doc:
+             if token.text.lower() not in STOP_WORDS:
+                 doc_tokens.append(token.text.lower())
+         tokens.append(doc_tokens)
+
+     # Makes tokens column
+     df['tokens'] = tokens
+
+     # Make tokens a string again
+     df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]
+
+     def get_lemmas(text):
+         '''Used to lemmatize the processed tweets'''
+         lemmas = []
+
+         doc = nlp(text)
+
+         # Keep lemmas of tokens that are not stopwords, punctuation, or pronouns
+         for token in doc:
+             if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
+                 lemmas.append(token.lemma_)
+
+         return lemmas
+
+     df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)
+
+     # Make lemmas a string again
+     df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]
+
+     # Apply tokenizer
+     df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)
+
+     # Create a id2word dictionary
+     id2word = Dictionary(df['lemma_tokens'])
+
+     # Filtering Extremes
+     id2word.filter_extremes(no_below=2, no_above=.99)
+     print(len(id2word))
+
+     # Creating a corpus object
+     corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]
+
+     lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
+                                                 id2word=id2word,
+                                                 num_topics=5,
+                                                 random_state=100,
+                                                 chunksize=200,
+                                                 passes=10,
+                                                 per_word_topics=True)
+
+     pprint(lda_model.print_topics())
+     doc_lda = lda_model[corpus]
+
+     coherence_model_lda = CoherenceModel(model=lda_model, texts=df['lemma_tokens'], dictionary=id2word, coherence='c_v')
+     coherence_lda = coherence_model_lda.get_coherence()
+
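+     # Sweep candidate topic counts (2-9) and keep the count with the highest c_v coherence.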
+     model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus,
+                                                             texts=df['lemma_tokens'],
+                                                             start=2,
+                                                             limit=10,
+                                                             step=1)
+
+     k_max = max(coherence_values)
+     num_topics = coherence_values.index(k_max) + 2
+
+     lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
+                                                 id2word=id2word,
+                                                 num_topics=num_topics,
+                                                 random_state=100,
+                                                 chunksize=200,
+                                                 passes=10,
+                                                 per_word_topics=True)
+
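+     # Hyperparameter tuning: grid-search the alpha and eta priors over a 75% and a 100%
+     # corpus, scoring each combination with c_v coherence.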
+     grid = {}
+     grid['Validation_Set'] = {}
+
+     alpha = [0.05, 0.1, 0.5, 1, 5, 10]
+
+     beta = [0.05, 0.1, 0.5, 1, 5, 10]
+
+     num_of_docs = len(corpus)
+     corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
+                    corpus]
+     corpus_title = ['75% Corpus', '100% Corpus']
+     model_results = {'Validation_Set': [],
+                      'Alpha': [],
+                      'Beta': [],
+                      'Coherence': []
+                      }
+     if 1 == 1:
+         pbar = tqdm.tqdm(total=len(corpus_sets) * len(alpha) * len(beta))
+
+         for i in range(len(corpus_sets)):
+             for a in alpha:
+                 for b in beta:
+                     cv = compute_coherence_values2(corpus=corpus_sets[i], dictionary=id2word, texts=df['lemma_tokens'], k=num_topics, a=a, b=b)
+                     model_results['Validation_Set'].append(corpus_title[i])
+                     model_results['Alpha'].append(a)
+                     model_results['Beta'].append(b)
+                     model_results['Coherence'].append(cv)
+
+                     pbar.update(1)
+         pd.DataFrame(model_results).to_csv('lda_tuning_results_new.csv', index=False)
+         pbar.close()
+
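+     # Load the tuning results and pick the alpha/eta pair with the highest coherence on the full corpus.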
+     params_df = pd.read_csv('lda_tuning_results_new.csv')
+     params_df = params_df[params_df.Validation_Set == '100% Corpus']
+     params_df.reset_index(inplace=True)
+
+     max_params = params_df.loc[params_df['Coherence'].idxmax()]
+     max_coherence = max_params['Coherence']
+     max_alpha = max_params['Alpha']
+     max_beta = max_params['Beta']
+
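+     # Fit the final LDA model with the selected alpha/eta priors (topic count fixed at 7 here).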
+     lda_model_final = gensim.models.ldamodel.LdaModel(corpus=corpus,
+                                                       id2word=id2word,
+                                                       num_topics=7,
+                                                       random_state=100,
+                                                       chunksize=200,
+                                                       passes=10,
+                                                       alpha=max_alpha,
+                                                       eta=max_beta,
+                                                       per_word_topics=True)
+
+     coherence_model_lda = CoherenceModel(model=lda_model_final, texts=df['lemma_tokens'], dictionary=id2word,
+                                          coherence='c_v')
+     coherence_lda = coherence_model_lda.get_coherence()
+
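+     # Collect the top 10 words of each topic from the final model.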
+     lda_topics = lda_model_final.show_topics(num_words=10)
+
+     topics = []
+     filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]
+
+     for topic in lda_topics:
+         print(topic)
+         topics.append(preprocess_string(topic[1], filters))
+
+     df['topic'] = [sorted(lda_model_final[corpus][text][0]) for text in range(len(df['original_tweets']))]
+
+     df = df[df['topic'].map(lambda d: len(d)) > 0]
+
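+     # Label each tweet with its dominant topic, then print the most representative tweets per topic.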
+     df['max_topic'] = df['topic'].map(lambda row: assignTopic(row))
+
+     topic_clusters = []
+     for i in range(num_topics):
+         topic_clusters.append(df[df['max_topic'].isin(([i]))])
+         topic_clusters[i] = topic_clusters[i]['original_tweets'].tolist()
+
+     for i in range(len(topic_clusters)):
+         tweets = df.loc[df['max_topic'] == i].copy()
+         tweets['topic'] = tweets['topic'].apply(lambda x: get_topic_value(x, i))
+         # tweets['topic'] = [row[i][1] for row in tweets['topic']]
+         tweets_sorted = tweets.sort_values('topic', ascending=False)
+         tweets_sorted = tweets_sorted.drop_duplicates(subset=['original_tweets'])
+         rep_tweets = tweets_sorted['original_tweets']
+         rep_tweets = [*set(rep_tweets)]
+         print('Topic ', i)
+         print(rep_tweets[:5])
+
+     return df
+
  def greet(name):
      return "Hello " + name + "!!"

+ iface = gr.Interface(fn=dataframeProcessing, inputs=[], outputs=gr.Dataframe(headers=['original_tweets', 'max_topic']))
  iface.launch()
katip-december.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ emoji==1.7.0
+ pandas-profiling==2.*
+ plotly==4.*
+ spacy>=3.0.0,<4.0.0
+ en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl
+ pyldavis
+ gensim
+ chart_studio
+ autopep8
+ transformers
+ sentencepiece
+ bert-extractive-summarizer
+ tqdm
stopwords-tl.json ADDED
@@ -0,0 +1 @@
+ ["akin","aking","ako","alin","am","amin","aming","ang","ano","anumang","apat","at","atin","ating","ay","bababa","bago","bakit","bawat","bilang","dahil","dalawa","dapat","din","dito","doon","gagawin","gayunman","ginagawa","ginawa","ginawang","gumawa","gusto","habang","hanggang","hindi","huwag","iba","ibaba","ibabaw","ibig","ikaw","ilagay","ilalim","ilan","inyong","isa","isang","itaas","ito","iyo","iyon","iyong","ka","kahit","kailangan","kailanman","kami","kanila","kanilang","kanino","kanya","kanyang","kapag","kapwa","karamihan","katiyakan","katulad","kaya","kaysa","ko","kong","kulang","kumuha","kung","laban","lahat","lamang","likod","lima","maaari","maaaring","maging","mahusay","makita","marami","marapat","masyado","may","mayroon","mga","minsan","mismo","mula","muli","na","nabanggit","naging","nagkaroon","nais","nakita","namin","napaka","narito","nasaan","ng","ngayon","ni","nila","nilang","nito","niya","niyang","noon","o","pa","paano","pababa","paggawa","pagitan","pagkakaroon","pagkatapos","palabas","pamamagitan","panahon","pangalawa","para","paraan","pareho","pataas","pero","pumunta","pumupunta","sa","saan","sabi","sabihin","sarili","sila","sino","siya","tatlo","tayo","tulad","tungkol","una","walang"]