MarMont committed
Commit 6dba7a5 (1 parent: 397900b)

compile all lda

Files changed (2)
  1. app.py +62 -71
  2. appv1.py +559 -0
app.py CHANGED
@@ -125,7 +125,64 @@ def tokenize(text):
 
      return tokens
 
- def cleaning(df):
+ def split_corpus(corpus, n):
+     for i in range(0, len(corpus), n):
+         corpus_split = corpus
+         yield corpus_split[i:i + n]
+
+ def compute_coherence_values_base_lda(dictionary, corpus, texts, limit, coherence, start=2, step=1):
+     coherence_values = []
+     model_list = []
+     for num_topics in range(start, limit, step):
+         model = gensim.models.ldamodel.LdaModel(corpus=corpus,
+                                                 num_topics=num_topics,
+                                                 random_state=100,
+                                                 chunksize=200,
+                                                 passes=10,
+                                                 per_word_topics=True,
+                                                 id2word=id2word)
+         model_list.append(model)
+         coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence=coherence)
+         coherence_values.append(coherencemodel.get_coherence())
+
+     return model_list, coherence_values
+
+ def compute_coherence_values2(corpus, dictionary, k, a, b):
+     lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
+                                                 id2word=id2word,
+                                                 num_topics=num_topics,
+                                                 random_state=100,
+                                                 chunksize=200,
+                                                 passes=10,
+                                                 alpha=a,
+                                                 eta=b,
+                                                 per_word_topics=True)
+     coherence_model_lda = CoherenceModel(model=lda_model,
+                                          texts=df['lemma_tokens'],
+                                          dictionary=id2word,
+                                          coherence='c_v')
+
+     return coherence_model_lda.get_coherence()
+
+ def assignMaxTopic(l):
+     maxTopic = max(l,key=itemgetter(1))[0]
+     return maxTopic
+
+ def assignTopic(l):
+     topics = []
+     for x in l:
+         topics.append(x[0])
+
+ def get_topic_value(row, i):
+     if len(row) == 1:
+         return row[0][1]
+     else:
+         try:
+             return row[i][1]
+         except Exception as e:
+             print(e)
+
+ def full_lda():
      df.rename(columns = {'tweet':'original_tweets'}, inplace = True)
 
      # Apply the function above and get tweets free of emoji's
@@ -184,29 +241,6 @@ def cleaning(df):
      # Apply tokenizer
      df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)
 
- def split_corpus(corpus, n):
-     for i in range(0, len(corpus), n):
-         corpus_split = corpus
-         yield corpus_split[i:i + n]
-
- def compute_coherence_values_base_lda(dictionary, corpus, texts, limit, coherence, start=2, step=1):
-     coherence_values = []
-     model_list = []
-     for num_topics in range(start, limit, step):
-         model = gensim.models.ldamodel.LdaModel(corpus=corpus,
-                                                 num_topics=num_topics,
-                                                 random_state=100,
-                                                 chunksize=200,
-                                                 passes=10,
-                                                 per_word_topics=True,
-                                                 id2word=id2word)
-         model_list.append(model)
-         coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence=coherence)
-         coherence_values.append(coherencemodel.get_coherence())
-
-     return model_list, coherence_values
-
- def base_lda():
      # Create a id2word dictionary
      global id2word
      id2word = Dictionary(df['lemma_tokens'])
@@ -253,24 +287,6 @@ def base_lda():
      global num_topics
      num_topics = coherence_averages.index(k_max) + 2
 
- def compute_coherence_values2(corpus, dictionary, k, a, b):
-     lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
-                                                 id2word=id2word,
-                                                 num_topics=num_topics,
-                                                 random_state=100,
-                                                 chunksize=200,
-                                                 passes=10,
-                                                 alpha=a,
-                                                 eta=b,
-                                                 per_word_topics=True)
-     coherence_model_lda = CoherenceModel(model=lda_model,
-                                          texts=df['lemma_tokens'],
-                                          dictionary=id2word,
-                                          coherence='c_v')
-
-     return coherence_model_lda.get_coherence()
-
- def hyperparameter_optimization():
      grid = {}
      grid['Validation_Set'] = {}
 
@@ -337,21 +353,9 @@ def hyperparameter_optimization():
                                                         per_word_topics=True)
 
      coherence_model_lda = CoherenceModel(model=lda_model_final, texts=df['lemma_tokens'], dictionary=id2word,
-                                          coherence='c_v')
+                                          coherence='c_v')
      coherence_lda = coherence_model_lda.get_coherence()
-
-     return coherence_lda
-
- def assignMaxTopic(l):
-     maxTopic = max(l,key=itemgetter(1))[0]
-     return maxTopic
-
- def assignTopic(l):
-     topics = []
-     for x in l:
-         topics.append(x[0])
-
- def topic_assignment(df):
+
      lda_topics = lda_model_final.show_topics(num_words=10)
 
      topics = []
@@ -371,16 +375,6 @@ def topic_assignment(df):
          topic_clusters.append(df[df['max_topic'].isin(([i]))])
          topic_clusters[i] = topic_clusters[i]['original_tweets'].tolist()
 
- def get_topic_value(row, i):
-     if len(row) == 1:
-         return row[0][1]
-     else:
-         try:
-             return row[i][1]
-         except Exception as e:
-             print(e)
-
- def reprsentative_tweets():
      global top_tweets
      top_tweets = []
      for i in range(len(topic_clusters)):
@@ -394,6 +388,7 @@ def reprsentative_tweets():
          top_tweets.append(rep_tweets[:5])
          # print('Topic ', i)
          # print(rep_tweets[:5])
+
      return top_tweets
 
  def topic_summarization(topic_groups):
@@ -521,14 +516,10 @@ def main(dataset, model):
          print(dataset)
          place_data = str(scrape(keyword_list))
      print(df)
-     cleaning(df)
 
      print(df)
      if model == 'LDA':
-         base_lda()
-         coherence = hyperparameter_optimization()
-         topic_assignment(df)
-         top_tweets = reprsentative_tweets()
+         top_tweets = full_lda()
      else:
          base_bertopic()
          optimized_bertopic()
appv1.py ADDED
@@ -0,0 +1,559 @@
+ import pandas as pd
+ import tweepy
+ import re
+ import emoji
+ import spacy
+ import gensim
+ import json
+ import string
+
+ from spacy.tokenizer import Tokenizer
+ from gensim.parsing.preprocessing import STOPWORDS as SW
+ from wordcloud import STOPWORDS
+
+ from gensim.corpora import Dictionary
+ from gensim.models.coherencemodel import CoherenceModel
+ from pprint import pprint
+
+ import numpy as np
+ import tqdm
+
+ from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric
+
+ import torch
+ from transformers import T5ForConditionalGeneration,T5Tokenizer
+ from googletrans import Translator
+
+ from bertopic import BERTopic
+ from umap import UMAP
+ from sklearn.feature_extraction.text import CountVectorizer
+
+ from operator import itemgetter
+
+ import gradio as gr
+
+ global df
+ bearer_token = 'AAAAAAAAAAAAAAAAAAAAACEigwEAAAAACoP8KHJYLOKCL4OyB9LEPV00VB0%3DmyeDROUvw4uipHwvbPPfnTuY0M9ORrLuXrMvcByqZhwo3SUc4F'
+ client = tweepy.Client(bearer_token=bearer_token)
+ nlp = spacy.load('en_core_web_lg')
+ print('hi')
+
+ def scrape(keywords):
+     query = keywords + ' (lang:en OR lang:tl) -is:retweet'
+     max_results = 100
+     tweet_fields=['geo', 'id', 'lang', 'created_at']
+     expansions=['geo.place_id']
+     place_fields = ['contained_within', 'country', 'country_code', 'full_name', 'geo', 'id', 'name', 'place_type']
+
+     response = client.search_recent_tweets(
+         query=query,
+         max_results=max_results,
+         tweet_fields=tweet_fields,
+         expansions=expansions,
+         place_fields=place_fields
+     )
+
+     tweets = []
+     for x in response[0]:
+         tweets.append(str(x))
+
+     place_data = response[1]
+
+     df = pd.DataFrame(tweets, columns=['tweet'])
+
+     return place_data
+
+ def get_example(dataset):
+     df = pd.read_csv(dataset + '.csv')
+     return df
+
+ def give_emoji_free_text(text):
+     """
+     Removes emoji's from tweets
+     Accepts:
+         Text (tweets)
+     Returns:
+         Text (emoji free tweets)
+     """
+     emoji_list = [c for c in text if c in emoji.EMOJI_DATA]
+     clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])
+     return clean_text
+
+ def url_free_text(text):
+     '''
+     Cleans text from urls
+     '''
+     text = re.sub(r'http\S+', '', text)
+     return text
+
+ def get_lemmas(text):
+     '''Used to lemmatize the processed tweets'''
+     lemmas = []
+
+     doc = nlp(text)
+
+     for token in doc:
+         if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
+             lemmas.append(token.lemma_)
+
+     return lemmas
+
+ # Tokenizer function
+ def tokenize(text):
+     """
+     Parses a string into a list of semantic units (words)
+     Args:
+         text (str): The string that the function will tokenize.
+     Returns:
+         list: tokens parsed out
+     """
+     # Removing url's
+     pattern = r"http\S+"
+
+     tokens = re.sub(pattern, "", text) # https://www.youtube.com/watch?v=O2onA4r5UaY
+     tokens = re.sub('[^a-zA-Z 0-9]', '', text)
+     tokens = re.sub('[%s]' % re.escape(string.punctuation), '', text) # Remove punctuation
+     tokens = re.sub('\w*\d\w*', '', text) # Remove words containing numbers
+     # tokens = re.sub('@*!*$*', '', text) # Remove @ ! $
+     tokens = tokens.strip(',') # TESTING THIS LINE
+     tokens = tokens.strip('?') # TESTING THIS LINE
+     tokens = tokens.strip('!') # TESTING THIS LINE
+     tokens = tokens.strip("'") # TESTING THIS LINE
+     tokens = tokens.strip(".") # TESTING THIS LINE
+
+     tokens = tokens.lower().split() # Make text lowercase and split it
+
+     return tokens
+
+
+ def cleaning(df):
+     df.rename(columns = {'tweet':'original_tweets'}, inplace = True)
+
+     # Apply the function above and get tweets free of emoji's
+     call_emoji_free = lambda x: give_emoji_free_text(x)
+
+     # Apply `call_emoji_free` which calls the function to remove all emoji's
+     df['emoji_free_tweets'] = df['original_tweets'].apply(call_emoji_free)
+
+     #Create a new column with url free tweets
+     df['url_free_tweets'] = df['emoji_free_tweets'].apply(url_free_text)
+
+
+
+     f = open('stopwords-tl.json')
+     tlStopwords = json.loads(f.read())
+     stopwords = set(STOPWORDS)
+     stopwords.update(tlStopwords)
+     stopwords.update(['na', 'sa', 'ko', 'ako', 'ng', 'mga', 'ba', 'ka', 'yung', 'lang', 'di', 'mo', 'kasi'])
+
+     # Tokenizer
+     tokenizer = Tokenizer(nlp.vocab)
+
+
+     # Custom stopwords
+     custom_stopwords = ['hi','\n','\n\n', '&', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']
+
+
+     # Customize stop words by adding to the default list
+     STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)
+
+     # ALL_STOP_WORDS = spacy + gensim + wordcloud
+     ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)
+
+
+     tokens = []
+     STOP_WORDS.update(stopwords)
+
+     for doc in tokenizer.pipe(df['url_free_tweets'], batch_size=500):
+         doc_tokens = []
+         for token in doc:
+             if token.text.lower() not in STOP_WORDS:
+                 doc_tokens.append(token.text.lower())
+         tokens.append(doc_tokens)
+
+     # Makes tokens column
+     df['tokens'] = tokens
+
+     # Make tokens a string again
+     df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]
+
+     df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)
+
+     # Make lemmas a string again
+     df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]
+
+     # Apply tokenizer
+     df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)
+
+ def split_corpus(corpus, n):
+     for i in range(0, len(corpus), n):
+         corpus_split = corpus
+         yield corpus_split[i:i + n]
+
+ def compute_coherence_values_base_lda(dictionary, corpus, texts, limit, coherence, start=2, step=1):
+     coherence_values = []
+     model_list = []
+     for num_topics in range(start, limit, step):
+         model = gensim.models.ldamodel.LdaModel(corpus=corpus,
+                                                 num_topics=num_topics,
+                                                 random_state=100,
+                                                 chunksize=200,
+                                                 passes=10,
+                                                 per_word_topics=True,
+                                                 id2word=id2word)
+         model_list.append(model)
+         coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence=coherence)
+         coherence_values.append(coherencemodel.get_coherence())
+
+     return model_list, coherence_values
+
+ def base_lda():
+     # Create a id2word dictionary
+     global id2word
+     id2word = Dictionary(df['lemma_tokens'])
+
+     # Filtering Extremes
+     id2word.filter_extremes(no_below=2, no_above=.99)
+
+     # Creating a corpus object
+     global corpus
+     corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]
+     global corpus_og
+     corpus_og = [id2word.doc2bow(d) for d in df['lemma_tokens']]
+
+     corpus_split = corpus
+     split_corpus(corpus_split, 5)
+
+     global coherence
+     coherence = 'c_v'
+
+     coherence_averages = [0] * 8
+     for i in range(5):
+         training_corpus = corpus_split
+         training_corpus.remove(training_corpus[i])
+         print(training_corpus[i])
+         model_list, coherence_values = compute_coherence_values_base_lda(dictionary=id2word, corpus=training_corpus,
+                                                                          texts=df['lemma_tokens'],
+                                                                          start=2,
+                                                                          limit=10,
+                                                                          step=1,
+                                                                          coherence=coherence)
+         for j in range(len(coherence_values)):
+             coherence_averages[j] += coherence_values[j]
+
+     limit = 10; start = 2; step = 1;
+     x = range(start, limit, step)
+
+     coherence_averages = [x / 5 for x in coherence_averages]
+
+     if coherence == 'c_v':
+         k_max = max(coherence_averages)
+     else:
+         k_max = min(coherence_averages, key=abs)
+
+     global num_topics
+     num_topics = coherence_averages.index(k_max) + 2
+
+ def compute_coherence_values2(corpus, dictionary, k, a, b):
+     lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
+                                                 id2word=id2word,
+                                                 num_topics=num_topics,
+                                                 random_state=100,
+                                                 chunksize=200,
+                                                 passes=10,
+                                                 alpha=a,
+                                                 eta=b,
+                                                 per_word_topics=True)
+     coherence_model_lda = CoherenceModel(model=lda_model,
+                                          texts=df['lemma_tokens'],
+                                          dictionary=id2word,
+                                          coherence='c_v')
+
+     return coherence_model_lda.get_coherence()
+
+ def hyperparameter_optimization():
+     grid = {}
+     grid['Validation_Set'] = {}
+
+     min_topics = 1
+     max_topics = 10
+     step_size = 1
+     topics_range = range(min_topics, max_topics, step_size)
+
+     alpha = [0.05, 0.1, 0.5, 1, 5, 10]
+     # alpha.append('symmetric')
+     # alpha.append('asymmetric')
+
+     beta = [0.05, 0.1, 0.5, 1, 5, 10]
+     # beta.append('symmetric')
+
+     num_of_docs = len(corpus_og)
+     corpus_sets = [gensim.utils.ClippedCorpus(corpus_og, int(num_of_docs*0.75)),
+                    corpus_og]
+     corpus_title = ['75% Corpus', '100% Corpus']
+     model_results = {'Validation_Set': [],
+                      'Alpha': [],
+                      'Beta': [],
+                      'Coherence': []
+                      }
+     if 1 == 1:
+         pbar = tqdm.tqdm(total=540)
+
+         for i in range(len(corpus_sets)):
+             for a in alpha:
+                 for b in beta:
+                     cv = compute_coherence_values2(corpus=corpus_sets[i],
+                                                    dictionary=id2word,
+                                                    k=num_topics,
+                                                    a=a,
+                                                    b=b)
+                     model_results['Validation_Set'].append(corpus_title[i])
+                     model_results['Alpha'].append(a)
+                     model_results['Beta'].append(b)
+                     model_results['Coherence'].append(cv)
+
+                     pbar.update(1)
+         pd.DataFrame(model_results).to_csv('lda_tuning_results_new.csv', index=False)
+         pbar.close()
+
+     params_df = pd.read_csv('lda_tuning_results_new.csv')
+     params_df = params_df[params_df.Validation_Set == '75% Corpus']
+     params_df.reset_index(inplace=True)
+     params_df = params_df.replace(np.inf, -np.inf)
+     max_params = params_df.loc[params_df['Coherence'].idxmax()]
+     max_coherence = max_params['Coherence']
+     max_alpha = max_params['Alpha']
+     max_beta = max_params['Beta']
+     max_validation_set = max_params['Validation_Set']
+
+     global lda_model_final
+     lda_model_final = gensim.models.ldamodel.LdaModel(corpus=corpus_og,
+                                                       id2word=id2word,
+                                                       num_topics=num_topics,
+                                                       random_state=100,
+                                                       chunksize=200,
+                                                       passes=10,
+                                                       alpha=max_alpha,
+                                                       eta=max_beta,
+                                                       per_word_topics=True)
+
+     coherence_model_lda = CoherenceModel(model=lda_model_final, texts=df['lemma_tokens'], dictionary=id2word,
+                                          coherence='c_v')
+     coherence_lda = coherence_model_lda.get_coherence()
+
+     return coherence_lda
+
+ def assignMaxTopic(l):
+     maxTopic = max(l,key=itemgetter(1))[0]
+     return maxTopic
+
+ def assignTopic(l):
+     topics = []
+     for x in l:
+         topics.append(x[0])
+
+ def topic_assignment(df):
+     lda_topics = lda_model_final.show_topics(num_words=10)
+
+     topics = []
+     filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]
+
+     for topic in lda_topics:
+         topics.append(preprocess_string(topic[1], filters))
+
+     df['topic'] = [sorted(lda_model_final[corpus_og][text][0]) for text in range(len(df['original_tweets']))]
+
+     df = df[df['topic'].map(lambda d: len(d)) > 0]
+     df['max_topic'] = df['topic'].map(lambda row: assignMaxTopic(row))
+
+     global topic_clusters
+     topic_clusters = []
+     for i in range(num_topics):
+         topic_clusters.append(df[df['max_topic'].isin(([i]))])
+         topic_clusters[i] = topic_clusters[i]['original_tweets'].tolist()
+
+ def get_topic_value(row, i):
+     if len(row) == 1:
+         return row[0][1]
+     else:
+         try:
+             return row[i][1]
+         except Exception as e:
+             print(e)
+
+ def reprsentative_tweets():
+     global top_tweets
+     top_tweets = []
+     for i in range(len(topic_clusters)):
+         tweets = df.loc[df['max_topic'] == i]
+         tweets['topic'] = tweets['topic'].apply(lambda x: get_topic_value(x, i))
+         # tweets['topic'] = [row[i][1] for row in tweets['topic']]
+         tweets_sorted = tweets.sort_values('topic', ascending=False)
+         tweets_sorted.drop_duplicates(subset=['original_tweets'])
+         rep_tweets = tweets_sorted['original_tweets']
+         rep_tweets = [*set(rep_tweets)]
+         top_tweets.append(rep_tweets[:5])
+         # print('Topic ', i)
+         # print(rep_tweets[:5])
+     return top_tweets
+
+ def topic_summarization(topic_groups):
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     model = T5ForConditionalGeneration.from_pretrained("Michau/t5-base-en-generate-headline")
+     tokenizer = T5Tokenizer.from_pretrained("Michau/t5-base-en-generate-headline")
+     model = model.to(device)
+     translator = Translator()
+
+     headlines = []
+     for i in range(len(topic_groups)):
+         tweets = " ".join(topic_groups[i])
+         # print(tweets)
+         out = translator.translate(tweets, dest='en')
+         text = out.text
+         # print(tweets)
+
+         max_len = 256
+
+         encoding = tokenizer.encode_plus(text, return_tensors = "pt")
+         input_ids = encoding["input_ids"].to(device)
+         attention_masks = encoding["attention_mask"].to(device)
+
+         beam_outputs = model.generate(
+             input_ids = input_ids,
+             attention_mask = attention_masks,
+             max_length = 64,
+             num_beams = 3,
+             early_stopping = True,
+         )
+
+         result = tokenizer.decode(beam_outputs[0])
+         headlines += "Topic " + str(i) + " " + result
+
+     return headlines
+
+ def compute_coherence_value_bertopic(topic_model):
+     topic_words = [[words for words, _ in topic_model.get_topic(topic)] for topic in range(len(set(topics))-1)]
+     coherence_model = CoherenceModel(topics=topic_words,
+                                      texts=df['lemma_tokens'],
+                                      corpus=corpus,
+                                      dictionary=id2word,
+                                      coherence=coherence)
+     coherence_score = coherence_model.get_coherence()
+
+     return coherence_score
+
+ def base_bertopic():
+     df['lemma_tokens_string'] = df['lemma_tokens'].apply(lambda x: ' '.join(x))
+     global id2word
+     id2word = Dictionary(df['lemma_tokens'])
+     global corpus
+     corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]
+
+     global umap_model
+     umap_model = UMAP(n_neighbors=15,
+                       n_components=5,
+                       min_dist=0.0,
+                       metric='cosine',
+                       random_state=100)
+
+     base_topic_model = BERTopic(umap_model=umap_model, language="english", calculate_probabilities=True)
+
+     topics, probabilities = base_topic_model.fit_transform(df['lemma_tokens_string'])
+
+     try:
+         print(compute_coherence_value_bertopic(base_topic_model))
+     except:
+         print('Unable to generate meaningful topics (Base BERTopic model)')
+
+ def optimized_bertopic():
+     vectorizer_model = CountVectorizer(max_features=1_000, stop_words="english")
+     optimized_topic_model = BERTopic(umap_model=umap_model,
+                                      language="multilingual",
+                                      n_gram_range=(1, 3),
+                                      vectorizer_model=vectorizer_model,
+                                      calculate_probabilities=True)
+
+     topics, probabilities = optimized_topic_model.fit_transform(df['lemma_tokens_string'])
+
+     try:
+         print(compute_coherence_value_bertopic(optimized_topic_model))
+     except:
+         print('Unable to generate meaningful topics, base BERTopic model if possible')
+
+     rep_docs = optimized_topic_model.representative_docs_
+
+     global top_tweets
+     top_tweets = []
+
+     for topic in rep_docs:
+         if topic == -1:
+             print('test')
+             continue
+         topic_docs = rep_docs.get(topic)
+
+         tweets = []
+         for doc in topic_docs:
+             index = df.isin([doc]).any(axis=1).idxmax()
+             # print(index)
+             tweets.append(df.loc[index, 'original_tweets'])
+         print(tweets)
+         top_tweets.append(tweets)
+
+ global examples
+
+ def main(dataset, model):
+     global df
+     examples = [ "katip,katipunan",
+                  "bgc,bonifacio global city",
+                  "pobla,poblacion",
+                  "cubao",
+                  "taft"
+                  ]
+     keyword_list = dataset.split(',')
+     if len(keyword_list) > 1:
+         keywords = '(' + ' OR '.join(keyword_list) + ')'
+     else:
+         keywords = keyword_list[0]
+     if dataset in examples:
+         df = get_example(keywords)
+         place_data = 'test'
+     else:
+         print(dataset)
+         place_data = str(scrape(keyword_list))
+     print(df)
+     cleaning(df)
+
+     print(df)
+     if model == 'LDA':
+         base_lda()
+         coherence = hyperparameter_optimization()
+         topic_assignment(df)
+         top_tweets = reprsentative_tweets()
+     else:
+         base_bertopic()
+         optimized_bertopic()
+
+     headlines = topic_summarization(top_tweets)
+     headlines = '\n'.join(str(h) for h in headlines)
+
+
+
+     return place_data, headlines
+
+
+ iface = gr.Interface(fn=main,
+                      inputs=[gr.Dropdown(["katip,katipunan",
+                                           "bgc,bonifacio global city",
+                                           "cubao",
+                                           "taft",
+                                           "pobla,poblacion"],
+                                          label="Dataset"),
+                              gr.Dropdown(["LDA",
+                                           "BERTopic"],
+                                          label="Model")
+                              ],
+                      # examples=examples,
+                      outputs=["text",
+                               "text"])
+ iface.launch()