andufkova committed on
Commit 48032f9
1 Parent(s): 4ce4650

topic discovery added

app.py CHANGED
@@ -1,21 +1,33 @@
 import gradio as gr
 import numpy as np
+import pandas as pd
 import pickle
+import sklearn
+import plotly.express as px
 from sentence_transformers import SentenceTransformer
+from sklearn.cluster import MiniBatchKMeans
+from learn_multi_doc_model import Model


-#css_code='body {background-image:url("https://picsum.photos/seed/picsum/200/300");} div.gradio-container {background: white;}'
+#css_code='body {background-image:url("https://picsum.photos/seed/picsum/200/300");} div.gradio-container {background: white;}, button#component-8{background-color: rgb(158,202,225);}'
+css_code='button#component-8{background-color: rgb(158,202,225);}'

+import __main__
+setattr(__main__, "Model", Model)

-categories = ["Censorship","Development","Digital Activism","Disaster","Economics & Business","Education","Environment","Governance","Health","History","Humanitarian Response","International Relations","Law","Media & Journalism","Migration & Immigration","Politics","Protest","Religion","Sport","Travel","War & Conflict","Technology_Science","Women&Gender_LGBTQ+_Youth","Freedom_of_Speech_Human_Rights","Literature_Arts&Culture"]
-model = SentenceTransformer('sentence-transformers/LaBSE')
+categories = ["Censorship","Development","Digital Activism","Disaster","Economics & Business","Education","Environment","Governance","Health","History","Humanitarian Response","International Relations","Law","Media & Journalism","Migration & Immigration","Politics","Protest","Religion","Sport","Travel","War & Conflict","Technology + Science","Women & Gender + LGBTQ + Youth","Freedom of Speech + Human Rights","Literature + Arts & Culture"]
+input_cvect_key_file = 'topic_discovery/cvects.key'
+model_labse = SentenceTransformer('sentence-transformers/LaBSE')
 with open('models/MLP_classifier_average_en.pkl', 'rb') as f:
     classifier = pickle.load(f)
+mul_model = None
+with open('models/model_0.0001_100.pkl', 'rb') as f:
+    mul_model = pickle.load(f)

 def get_embedding(text):
     if text is None:
         text = ""
-    return model.encode(text)
+    return model_labse.encode(text)

 def get_categories(y_pred):
     indices = []
@@ -25,6 +37,53 @@ def get_categories(y_pred):
     cats = [categories[i] for i in indices]
     return cats

+def get_words(doc_emb):
+    # load countvectorizers
+    cvects = {}
+    vocab = {} # load vocabulary of words for each lang
+    with open(input_cvect_key_file, "r") as fpr:
+        for line in fpr:
+            #print(line)
+            lang, fpath = line.strip().split()
+            with open(fpath, "rb") as fpr:
+                #print(f"loading {fpath}")
+                cvects[lang] = pickle.load(fpr)
+                vocab[lang] = cvects[lang].get_feature_names()
+
+    #print(
+    #    "Loaded CountVectorizer for lang",
+    #    lang,
+    #    "with vocab size:",
+    #    len(vocab[lang]),
+    #)
+
+    topn = 10 # top N words per cluster
+
+    #print(vocab["en"])
+    #print("MODEL KEYS")
+    #print(mul_model.E.keys())
+
+    doc_emb = doc_emb.flatten()
+
+    words_dict = {}
+
+    for lang in mul_model.E.keys():
+
+        #print(lang, end=": ")
+
+        scores = mul_model.E[lang] @ (doc_emb).T
+        k_ixs = np.argsort(scores)[::-1][:topn].squeeze() # sort them in descending order and pick topn
+        tmp = []
+        for i in k_ixs:
+            #print(vocab[lang][i], end=", ")
+            tmp.append(vocab[lang][i])
+
+        words_dict[lang] = tmp
+        #print()
+
+    return words_dict
+
+
 def generate_output(article):
     paragraphs = article.split("\n")
     embdds = []
@@ -33,32 +92,74 @@ def generate_output(article):
     embedding = np.average(embdds, axis=0)

     #y_pred = classifier.predict_proba(embedding.reshape(1, 768))
-    y_pred = classifier.predict(embedding.reshape(1, 768))
-    y_pred = y_pred.flatten()
+    reshaped = embedding.reshape(1, 768)
+    #y_pred = classifier.predict(reshaped)
+    #y_pred = y_pred.flatten()
+
+    y_prob = classifier.predict_proba(reshaped)
+    y_prob = y_prob.reshape(len(categories),1)
+
+    y_pred = [1 if x >= 0.5 else 0 for x in y_prob]
+
     classes = get_categories(y_pred)
+    if len(classes) > 1:
+        classes_string = ', '.join(classes)
+    elif len(classes) == 1:
+        classes_string = classes[0]
+    else:
+        classes_string = 'No category was found.'
+
+
+
+    data = pd.DataFrame()
+    data['Category'] = categories
+    data['Probability'] = y_prob
+    fig = px.bar(data, x='Probability', y='Category', orientation='h', height=600)#, title="Category probability")
+    fig.update_xaxes(range=[0, 1])
+    fig.update_layout(margin=dict(l=5, r=5, t=20, b=5)) #paper_bgcolor="LightSteelBlue")
+    fig.update_traces(marker_color='rgb(158,202,225)')
+
+    #print(f"LEN Y_PROB {len(y_prob)}")
+    #print(f"LEN CAT {len(categories)}")

-    return (classes, "clustering tbd")
-
-# with gr.Blocks() as demo:
-#     with gr.Row():
-#         # column for input
-#         with gr.Column():
-#             input_text = gr.Textbox(lines=6, placeholder="Insert text of the article here...", label="Article"),
-#             submit_button = gr.Button("Submit")
-#             clear_button = gr.Button("Clear")
-
-#         # column for output
-#         with gr.Column():
-#             output_classification = gr.Textbox(lines=1, label="Article category")
-#             output_topic_discovery = gr.Textbox(lines=5, label="Topic discovery")
-
-#submit_button.click(generate_output, inputs=input_text, outputs=[output_classification, output_topic_discovery])
-demo = gr.Interface(fn=generate_output,
-                    inputs=gr.Textbox(lines=6, placeholder="Insert text of the article here...", label="Article"),
-                    outputs=[gr.Textbox(lines=1, label="Category"), gr.Textbox(lines=5, label="Topic discovery")],
-                    title="Article classification & topic discovery demo",
-                    flagging_options=["Incorrect"],
-                    theme=gr.themes.Base())
+    words_dict = get_words(reshaped)
+    words_string = ""
+
+    for lang, w in words_dict.items():
+        words_string += f"{lang}: "
+        words_string += ', '.join(w)
+        words_string += "\n"
+
+    return (classes_string, fig, words_string)
+
+# demo = gr.Interface(fn=generate_output,
+#                     inputs=gr.Textbox(lines=6, placeholder="Insert text of the article here...", label="Article"),
+#                     outputs=[gr.Textbox(lines=1, label="Category"), gr.Plot(label="Category probability"), gr.Textbox(lines=5, label="Topic discovery")],
+#                     title="Article classification & topic discovery demo",
+#                     flagging_options=["Incorrect"],
+#                     theme=gr.themes.Base())
                     #css=css_code)

+demo = gr.Blocks(css=css_code, theme=gr.themes.Base(), title="Article classification & topic discovery demo")
+
+with demo:
+    with gr.Row():
+        my_title = gr.HTML("<h1 align='center'>Article classification & topic discovery demo</h1>")
+    with gr.Row():
+        with gr.Column():
+            input_text = gr.Textbox(lines=22, placeholder="Insert text of the article here...", label="Article")
+            with gr.Row():
+                clear_button = gr.Button("Clear")
+                submit_button = gr.Button("Submit")
+        with gr.Column():
+            with gr.Tabs():
+                with gr.TabItem("Classification"):
+                    category_text = gr.Textbox(lines=1, label="Category")
+                    category_plot = gr.Plot()
+                with gr.TabItem("Topic discovery"):
+                    topic_text = gr.Textbox(lines=22, label="The most representative words")
+
+    submit_button.click(generate_output, inputs=input_text, outputs=[category_text, category_plot, topic_text])
+    clear_button.click(lambda: None, None, input_text, queue=False)
+
 demo.launch()
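
The new classification path in generate_output thresholds classifier.predict_proba at 0.5 instead of calling predict. A minimal illustrative sketch of that step (not part of the commit; the probabilities and the truncated category list are invented for the example):

# Illustration only: how generate_output derives category labels from probabilities.
import numpy as np

categories = ["Censorship", "Development", "Digital Activism"]      # truncated for the example
y_prob = np.array([0.82, 0.10, 0.55]).reshape(len(categories), 1)   # shape (n_categories, 1), as in app.py
y_pred = [1 if x >= 0.5 else 0 for x in y_prob]                     # -> [1, 0, 1]
labels = [categories[i] for i, keep in enumerate(y_pred) if keep]
print(", ".join(labels))                                            # Censorship, Digital Activism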
learn_multi_doc_model.py ADDED
@@ -0,0 +1,352 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import numpy as np
+import scipy
+import pickle
+from scipy.special import log_softmax
+from time import time
+from packaging import version
+
+assert version.parse(scipy.__version__) >= version.parse(
+    "1.7.0"
+), f"Requires scipy >= 1.7.0. Found {scipy.__version__}"
+
+
+class Model:
+    """Model definition, parameters and helper functions to compute log-likelihood"""
+
+    def __init__(self, vocab: dict, emb_dim: int):
+        """Initialize our model
+
+        Args:
+            vocab: vocab size for each language {'en': 25000, 'de': 25000}
+            emb_dim: embedding dimension, will be same across languages
+        """
+
+        self.L = len(vocab)
+        self.vocab = vocab
+        self.emb_dim = emb_dim
+
+        # word embeddings matrix / subspace for each language
+        self.E = {}
+
+        # bias vector for each language
+        self.b = {}
+
+        n1 = 1.0 / np.sqrt(emb_dim)
+
+        # initialize word embeddings and bias vectors randomly
+        for lang, vocab_size in vocab.items():
+            n2 = 1.0 / np.sqrt(vocab_size)
+            self.E[lang] = np.random.uniform(-n2, n1, size=(vocab_size, emb_dim))
+            self.b[lang] = np.random.randn(vocab_size, 1) * 0.0001
+
+    def init_bias_with_log_unigram_dist(self, X, lang):
+        """We will initialize the bias vector with log of unigram distribution over vocabulary.
+        This should help us with better initialization.
+
+        b_i = \log (\sum_d x_{di}) / (\sum_d \sum_i x_{di})
+        """
+
+        # if X is sparse matrix, X.A gives the dense version of it in numpy array format
+        if isinstance(X, np.ndarray):
+            X = X + 1e-08  # to avoid zeros
+        else:
+            X = X.A + 1e-08  # to avoid any zeros
+
+        self.b[lang][:, 0] = np.log(
+            X.sum(axis=0) / X.sum()
+        )  # we would like b to be of size (W, 1)
+
+    def compute_log_thetas(self, lang: str, DE_lang: np.ndarray, sanity_check=False):
+        """Compute log of thetas, where theta_d is the unigram distribution over document `d`
+        estimated from the current params (word-embedding matrix, bias vector) and document embedding a_d.
+
+        Args:
+        ----
+            lang (str): Language ID (eg: en, de, es ...)
+            DE_lang (np.ndarray): Document embeddings of language
+        """
+
+        mat = self.b[lang] + (self.E[lang] @ DE_lang)  # shape is vocab_size x n_docs
+        mat = mat.T  # shape is D x W
+
+        # log_norm = logsumexp(mat, axis=1)
+        # log_thetas = mat - log_norm
+
+        # the following single step is the same as the two steps above combined
+        log_thetas = log_softmax(mat, axis=1)  # shape is n_docs x vocab_size
+
+        if sanity_check:
+            n_docs = DE_lang.shape[0]
+            # sanity-check
+            # since each document is a proper distribution, it should sum up to 1
+            # sum of the matrix should be equal to number of documents
+            print(
+                "Sanity check for log-thetas:",
+                np.allclose(np.exp(log_thetas).sum(), n_docs),
+            )
+
+        return log_thetas
+
+    def compute_log_likelihood(self, lang, DE_lang, X):
+        """Compute log-likelihood of the data, given the current parameters / embeddings
+
+        Each summation could be implemented using a for-loop but that would be very slow;
+        since we have everything stored in matrices and a sparse matrix, we will do it via
+        matrix multiplications and additions.
+
+        Args:
+            lang: language ID (eg: en, es, fr)
+            DE_lang: document embeddings for the given language
+            X: doc-by-word counts in scipy.sparse format for a specific language
+
+        Returns:
+            float: log-likelihood of the data
+        """
+
+        log_thetas = self.compute_log_thetas(lang, DE_lang)
+
+        # log-likelihood is the sum of counts times their respective log-probability values.
+        if isinstance(X, np.ndarray):
+            llh = (X * log_thetas).sum()
+        else:
+            # X is a scipy sparse matrix
+            llh = (X.multiply(log_thetas)).sum()
+
+        return llh
+
+
+def gradients_WE(model, lang, DE_lang, X, alpha):
+    """Gradient of the log-likelihood with-respect-to language-specific word embedding matrix `E`
+
+    Args:
+        model (Model): The object of the model
+        lang (str): Language ID
+        DE_lang: document embeddings for the given language
+        X (scipy.sparse_matrix): The doc-by-word counts
+        alpha (float): L2 reg. weight
+
+    Returns:
+        np.ndarray: Gradient of log-likelihood w.r.t. word embeddings, i.e., grad of llh w.r.t. model.E
+    """
+
+    # grads = np.zeros_like(model.E)  # initialize empty gradients to be the same shape as word embeddings (W, K)
+
+    # compute log_thetas as they are needed in gradient
+    log_thetas = model.compute_log_thetas(lang, DE_lang)
+
+    # the gradient computation can be done using for-loops to reflect the equation
+    # or it can be done efficiently using matrix multiplications
+
+    # 1. simple way using for-loop
+    # iterate over all documents
+    # for d in range(model.D):
+
+    #     iterate over every word,
+    #     for k in range(model.W):
+    #         x_dk = X[d, k]  # count of word k in doc d
+    #         rel_x_dk = X[d, :].sum() * np.exp(log_thetas)[d, k]  # relative / estimated count of word k in doc d
+    #         grads[k, :] += ((x_dk - rel_x_dk) * model.A[:, d])  # doc embeddings are column wise in model.A
+
+    # 2. Efficient way of obtaining gradients using matrix operations
+
+    ef_grads = np.zeros_like(model.E)
+
+    tmp = (
+        X - np.multiply(X.sum(axis=1).reshape(-1, 1), np.exp(log_thetas))
+    ).A  # .A will convert matrix to np ndarray
+    ef_grads = (DE_lang @ tmp).T - (alpha * 0.5 * model.E[lang]).sum()
+
+    # Sanity check to see if gradients computed in both ways are numerically identical
+    # print('- All close grad_E:', np.allclose(ef_grads, grads))
+
+    return ef_grads
+
+
+def update_parameters(params, gradient, learning_rate):
+    """Update the parameters
+
+    Args:
+        params (np.ndarray): Word embedding matrix or the document embedding matrix
+        gradient (np.ndarray): Gradients of all word embeddings or document embeddings. Should be the same size as params
+        learning_rate (float): The learning_rate can also be seen as step size, i.e., the size of the step to be taken
+            along the direction of gradient. Too big steps can overshoot our estimate, whereas too small steps
+            can take longer for the model to reach optimum.
+
+    Returns:
+        np.ndarray: the updated params
+    """

+    assert (
+        params.shape == gradient.shape
+    ), "The params and gradient must have same shape, \
+        ({:d}, {:d}) != ({:d} {:d})".format(
+        *params.shape, *gradient.shape
+    )
+
+    new_params = params + (
+        learning_rate * gradient
+    )  # since we are doing gradient ascent
+    return new_params
+
+
+def train(model, bow, DE, args):
+    """Training scheme for the model"""
+
+    print("\nTraining started ..")
+    learning_rate = args.lr
+    llh_0 = 0.0
+    for lang, X in bow.items():
+        llh_0 += model.compute_log_likelihood(lang, DE[lang].T, X)
+    print(" Initial log-likelihood: {:16.2f}".format(llh_0))
+
+    llhs = [llh_0]
+
+    for i in range(1, args.epochs + 1):
+
+        llh_ei = 0.0
+        for lang, X in bow.items():
+
+            # update word embeddings E for lang, by keeping doc-embeddings A fixed
+            grad_E = gradients_WE(model, lang, DE[lang].T, X, args.alpha)
+
+            model.E[lang] = update_parameters(model.E[lang], grad_E, learning_rate)
+
+            llh_ei += model.compute_log_likelihood(lang, DE[lang].T, X)
+
+        print(
+            "Epoch {:4d} / {:4d} | Log-likelihood: {:16.2f} | Learning rate: {:f}".format(
+                i, args.epochs, llh_ei, learning_rate
+            )
+        )
+
+        if llh_ei < llhs[-1]:
+            print(
+                "The log-likelihood should improve after every epoch.",
+                "Instead it decreased, which means the updates have overshot.",
+                "Halving the learning_rate.",
+            )
+            learning_rate = learning_rate * 0.5
+
+        llhs.append(llh_ei)
+
+        # learning_rate scheduler
+        # we reduce the learning_rate by 10 % after every 10 epochs
+        # if i % 10 == 0:
+        #     print("Reducing the learning by a factor of 0.1 every 10 epochs")
+        #     learning_rate -= learning_rate * 0.1
+        if i % 100 == 0:
+            with open(
+                os.path.join(args.out_dir, f"model_{args.alpha}_{i}.pkl"), "wb"
+            ) as fpw:
+                pickle.dump(model, fpw)
+            np.savetxt(
+                os.path.join(args.out_dir, f"llh_{args.alpha}_{args.epochs}.txt"),
+                np.asarray(llhs),
+            )
+
+    return model, llhs
+
+
+def main():
+    """main"""
+
+    args = parse_arguments()
+
+    os.makedirs(args.out_dir, exist_ok=True)
+
+    emb_dim = 0
+    # load doc embeddings for each language
+    doc_embs = {}  # {lang_1: np.ndarray, lang_2: np.ndarray, ...}
+    with open(args.input_embedding_key_file, "r") as fpr:
+        for line in fpr:
+            lang, fpath = line.strip().split()
+            doc_embs[lang] = np.load(fpath)
+            print("Loaded embeddings:", lang, doc_embs[lang].shape)
+
+            if emb_dim == 0:
+                emb_dim = doc_embs[lang].shape[1]
+
+    # load bag of words for each language
+    bows = {}  # {lang_1: scipy.sparse, lang_2: scipy.sparse, ...}
+    vocab = {}  # {lang_1: vocab_size}
+    with open(args.input_bag_of_words_key_file, "r") as fpr:
+        for line in fpr:
+            lang, fpath = line.strip().split()
+            bows[lang] = scipy.sparse.load_npz(fpath)
+            print("Loaded bag-of-words:", lang, bows[lang].shape)
+
+            vocab[lang] = bows[lang].shape[1]
+
+            # assert the number of docs per language is the same in embeddings and bag-of-words
+            assert (
+                bows[lang].shape[0] == doc_embs[lang].shape[0]
+            ), "Number of docs in BoW ({:d}) != number of docs in embeddings ({:d}) for language: {:s}".format(
+                bows[lang].shape[0], doc_embs[lang].shape[0], lang
+            )
+
+    model = Model(vocab, emb_dim)
+    for lang, bow in bows.items():
+        model.init_bias_with_log_unigram_dist(bow, lang)
+
+    print("Model params:")
+    for lang in model.vocab:
+        print(" ", lang, model.E[lang].shape, model.b[lang].shape)
+
+    if args.resume:
+        with open(args.resume, "rb") as fpr:
+            model = pickle.load(fpr)
+
+    # start the training
+    model, llhs = train(model, bows, doc_embs, args)
+
+    with open(
+        os.path.join(args.out_dir, f"model_{args.alpha}_{args.epochs}.pkl"), "wb"
+    ) as fpw:
+        pickle.dump(model, fpw)
+
+    np.savetxt(
+        os.path.join(args.out_dir, f"llh_{args.alpha}_{args.epochs}.txt"),
+        np.asarray(llhs),
+    )
+
+    print("Saved in", args.out_dir)
+
+
+def parse_arguments():
+
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "input_embedding_key_file",
+        help="path to file that has paths to embeddings for each language",
+    )
+
+    parser.add_argument(
+        "input_bag_of_words_key_file", help="path to input bag of words dictionary file"
+    )
+
+    parser.add_argument("out_dir", help="out dir to save the model/word embeddings")
+
+    parser.add_argument("--epochs", type=int, default=100, help="number of epochs")
+    parser.add_argument("--lr", type=float, default=0.0001, help="learning rate")
+    parser.add_argument(
+        "--alpha", type=float, default=1e-4, help="L2 reg. weight / weight decay"
+    )
+
+    parser.add_argument(
+        "--resume", default="", help="path to trained model to resume training"
+    )
+
+    args = parser.parse_args()
+
+    return args
+
+
+if __name__ == "__main__":
+    main()
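
At inference time, get_words in app.py reuses the same scoring that Model.compute_log_thetas applies during training: a document's word distribution per language is softmax(b + E @ doc_emb), and the highest-scoring vocabulary entries become the topic words. A toy sketch (not part of the commit; the real matrices are 25000-word vocabularies against 768-dimensional LaBSE embeddings):

# Illustration only: document-to-word scoring with toy sizes.
import numpy as np
from scipy.special import log_softmax

rng = np.random.default_rng(0)
vocab_size, emb_dim = 6, 4
E = rng.normal(size=(vocab_size, emb_dim))      # word-embedding matrix for one language
b = np.zeros((vocab_size, 1))                   # bias; log-unigram distribution in the real model
doc_emb = rng.normal(size=emb_dim)              # one document embedding

log_theta = log_softmax(b[:, 0] + E @ doc_emb)  # per-document unigram distribution over the vocabulary
top = np.argsort(E @ doc_emb)[::-1][:3]         # get_words keeps the 10 highest-scoring words per language
print(top, np.exp(log_theta).sum())             # top word indices; probabilities sum to 1.0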
models/model_0.0001_100.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d48ed6671bf0990a14476301a7845362092852c8e6bb624271f3943252e954c1
+size 2166342600
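
app.py registers Model on __main__ (import __main__; setattr(__main__, "Model", Model)) before unpickling this file. That pattern is usually needed when a pickle was written by a script run directly, so the class was recorded as __main__.Model. A hedged sketch of the load path, mirroring app.py:

# Illustration only: why Model must be visible on __main__ before pickle.load.
import __main__
import pickle
from learn_multi_doc_model import Model

setattr(__main__, "Model", Model)   # same workaround as in app.py
with open("models/model_0.0001_100.pkl", "rb") as f:
    mul_model = pickle.load(f)
print(type(mul_model), sorted(mul_model.E.keys()))   # per-language word-embedding matrices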
requirements.txt CHANGED
@@ -1,2 +1,5 @@
 numpy==1.24.2
 sentence-transformers==2.2.2
+pandas==1.5.2
+plotly
+sklearn==0.24.2
topic_discovery/.DS_Store ADDED
Binary file (8.2 kB)
topic_discovery/cvect_25000_ar.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b37e9e016646662718993e2368f9e88c4c21141f8944f23449f27c6d59e03221
+size 3047285

topic_discovery/cvect_25000_bn.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05b3adf720d522a38762fda2bb6da2c948389a437b2138004698d326181d971d
+size 157149

topic_discovery/cvect_25000_de.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e551d8934e6a8e23c841437805bbed1b0e17eb2f3ab3e260b9104c1e30f452ad
+size 2037400

topic_discovery/cvect_25000_el.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d5419f509f5666ae55a7f5cdfb1cf7ea41f3fa102ec639c19c4aeea8b2dffe32
+size 3681045

topic_discovery/cvect_25000_en.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb0ee36e4ef6738d408e30132c5d970be2e05728c305fccce06dc67b3941bea2
+size 4143980

topic_discovery/cvect_25000_es.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d28eb842e6f4717a791de9c8c61014131dbea8d26f84f90c62cd54b05595a1c9
+size 4235561

topic_discovery/cvect_25000_fr.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:74ff26b2269c2033f78ecb1e5870c449423d42d668975e5e98e899b6d2489f64
+size 2967490

topic_discovery/cvect_25000_it.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e8892d88fd88e0d9e121e57e1b77810e47d34909944b2e65e2094d426f17daa
+size 2477565

topic_discovery/cvect_25000_jp.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c075e83209a4a23afe290aef6a301717f4eadfd118a278114ea142fdf882c20
+size 3082086

topic_discovery/cvect_25000_mg.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:958dd98498097b8463b1fbc6f068b512650d40397b9e53659dc2238032126181
+size 3643714

topic_discovery/cvect_25000_mk.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6758e48f3626b7c91b7359097d27aedb6beaeb36c6a6632901c3fae3f6da5ea3
+size 2152452

topic_discovery/cvect_25000_nl.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f81d4942757d07cde33715cd00fe150c377b19070f57cc992230b8c6eeacb06
+size 1466263

topic_discovery/cvect_25000_pl.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad1d1d8853aa424ba47c81d52ab6fdd708d1a440901652d680482d092a88a44a
+size 2063425

topic_discovery/cvect_25000_pt.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:baef6e3fe017ed4feb3ac2e08701b77b4425ade9f39d700ab3d1b4a2d89059d6
+size 2001188

topic_discovery/cvect_25000_ru.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89bfa381364b0df772b0a181df8740bf597733e328410c464e6690d58e8e212f
+size 5482015

topic_discovery/cvect_25000_zhs.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1369c082d071340da56006eef8ffc380625c39fef4a7034b7d1e2927b1f54717
+size 9390903

topic_discovery/cvect_25000_zht.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:030a1c4b66cfecf4645de14f77d90d56886e8927225581c94e45a93006c0c633
+size 9965443
topic_discovery/cvects.key ADDED
@@ -0,0 +1,17 @@
+en topic_discovery/cvect_25000_en.pkl
+es topic_discovery/cvect_25000_es.pkl
+fr topic_discovery/cvect_25000_fr.pkl
+mg topic_discovery/cvect_25000_mg.pkl
+it topic_discovery/cvect_25000_it.pkl
+el topic_discovery/cvect_25000_el.pkl
+zhs topic_discovery/cvect_25000_zhs.pkl
+zht topic_discovery/cvect_25000_zht.pkl
+bn topic_discovery/cvect_25000_bn.pkl
+ru topic_discovery/cvect_25000_ru.pkl
+pt topic_discovery/cvect_25000_pt.pkl
+ar topic_discovery/cvect_25000_ar.pkl
+de topic_discovery/cvect_25000_de.pkl
+jp topic_discovery/cvect_25000_jp.pkl
+mk topic_discovery/cvect_25000_mk.pkl
+pl topic_discovery/cvect_25000_pl.pkl
+nl topic_discovery/cvect_25000_nl.pkl