andufkova committed on
Commit 48032f9
1 Parent(s): 4ce4650

topic discovery added

app.py CHANGED
@@ -1,21 +1,33 @@
 import gradio as gr
 import numpy as np
+import pandas as pd
 import pickle
+import sklearn
+import plotly.express as px
 from sentence_transformers import SentenceTransformer
+from sklearn.cluster import MiniBatchKMeans
+from learn_multi_doc_model import Model


-#css_code='body {background-image:url("https://picsum.photos/seed/picsum/200/300");} div.gradio-container {background: white;}'
+#css_code='body {background-image:url("https://picsum.photos/seed/picsum/200/300");} div.gradio-container {background: white;}, button#component-8{background-color: rgb(158,202,225);}'
+css_code='button#component-8{background-color: rgb(158,202,225);}'

+import __main__
+setattr(__main__, "Model", Model)

-categories = ["Censorship","Development","Digital Activism","Disaster","Economics & Business","Education","Environment","Governance","Health","History","Humanitarian Response","International Relations","Law","Media & Journalism","Migration & Immigration","Politics","Protest","Religion","Sport","Travel","War & Conflict","Technology_Science","Women&Gender_LGBTQ+_Youth","Freedom_of_Speech_Human_Rights","Literature_Arts&Culture"]
-model = SentenceTransformer('sentence-transformers/LaBSE')
+categories = ["Censorship","Development","Digital Activism","Disaster","Economics & Business","Education","Environment","Governance","Health","History","Humanitarian Response","International Relations","Law","Media & Journalism","Migration & Immigration","Politics","Protest","Religion","Sport","Travel","War & Conflict","Technology + Science","Women & Gender + LGBTQ + Youth","Freedom of Speech + Human Rights","Literature + Arts & Culture"]
+input_cvect_key_file = 'topic_discovery/cvects.key'
+model_labse = SentenceTransformer('sentence-transformers/LaBSE')
 with open('models/MLP_classifier_average_en.pkl', 'rb') as f:
     classifier = pickle.load(f)
+mul_model = None
+with open('models/model_0.0001_100.pkl', 'rb') as f:
+    mul_model = pickle.load(f)

 def get_embedding(text):
     if text is None:
         text = ""
-    return model.encode(text)
+    return model_labse.encode(text)

 def get_categories(y_pred):
     indices = []
@@ -25,6 +37,53 @@ def get_categories(y_pred):
     cats = [categories[i] for i in indices]
     return cats

+def get_words(doc_emb):
+    # load countvectorizers
+    cvects = {}
+    vocab = {} # load vocabulary of words for each lang
+    with open(input_cvect_key_file, "r") as fpr:
+        for line in fpr:
+            #print(line)
+            lang, fpath = line.strip().split()
+            with open(fpath, "rb") as fpr:
+                #print(f"loading {fpath}")
+                cvects[lang] = pickle.load(fpr)
+                vocab[lang] = cvects[lang].get_feature_names()
+
+    #print(
+    #    "Loaded CountVectorizer for lang",
+    #    lang,
+    #    "with vocab size:",
+    #    len(vocab[lang]),
+    #)
+
+    topn = 10 # top N words per cluster
+
+    #print(vocab["en"])
+    #print("MODEL KEYS")
+    #print(mul_model.E.keys())
+
+    doc_emb = doc_emb.flatten()
+
+    words_dict = {}
+
+    for lang in mul_model.E.keys():
+
+        #print(lang, end=": ")
+
+        scores = mul_model.E[lang] @ (doc_emb).T
+        k_ixs = np.argsort(scores)[::-1][:topn].squeeze() # sort them in descending order and pick topn
+        tmp = []
+        for i in k_ixs:
+            #print(vocab[lang][i], end=", ")
+            tmp.append(vocab[lang][i])
+
+        words_dict[lang] = tmp
+        #print()
+
+    return words_dict
+
+
 def generate_output(article):
     paragraphs = article.split("\n")
     embdds = []
@@ -33,32 +92,74 @@ def generate_output(article):
     embedding = np.average(embdds, axis=0)

     #y_pred = classifier.predict_proba(embedding.reshape(1, 768))
-    y_pred = classifier.predict(embedding.reshape(1, 768))
-    y_pred = y_pred.flatten()
+    reshaped = embedding.reshape(1, 768)
+    #y_pred = classifier.predict(reshaped)
+    #y_pred = y_pred.flatten()
+
+    y_prob = classifier.predict_proba(reshaped)
+    y_prob = y_prob.reshape(len(categories),1)
+
+    y_pred = [1 if x >= 0.5 else 0 for x in y_prob]
+
     classes = get_categories(y_pred)
+    if len(classes) > 1:
+        classes_string = ', '.join(classes)
+    elif len(classes) == 1:
+        classes_string = classes[0]
+    else:
+        classes_string = 'No category was found.'
+
+
+
+    data = pd.DataFrame()
+    data['Category'] = categories
+    data['Probability'] = y_prob
+    fig = px.bar(data, x='Probability', y='Category', orientation='h', height=600)#, title="Category probability")
+    fig.update_xaxes(range=[0, 1])
+    fig.update_layout(margin=dict(l=5, r=5, t=20, b=5)) #paper_bgcolor="LightSteelBlue")
+    fig.update_traces(marker_color='rgb(158,202,225)')
+
+    #print(f"LEN Y_PROB {len(y_prob)}")
+    #print(f"LEN CAT {len(categories)}")

-    return (classes, "clustering tbd")
-
-# with gr.Blocks() as demo:
-#     with gr.Row():
-#         # column for input
-#         with gr.Column():
-#             input_text = gr.Textbox(lines=6, placeholder="Insert text of the article here...", label="Article"),
-#             submit_button = gr.Button("Submit")
-#             clear_button = gr.Button("Clear")
-
-#         # column for output
-#         with gr.Column():
-#             output_classification = gr.Textbox(lines=1, label="Article category")
-#             output_topic_discovery = gr.Textbox(lines=5, label="Topic discovery")
-
-#submit_button.click(generate_output, inputs=input_text, outputs=[output_classification, output_topic_discovery])
-demo = gr.Interface(fn=generate_output,
-                    inputs=gr.Textbox(lines=6, placeholder="Insert text of the article here...", label="Article"),
-                    outputs=[gr.Textbox(lines=1, label="Category"), gr.Textbox(lines=5, label="Topic discovery")],
-                    title="Article classification & topic discovery demo",
-                    flagging_options=["Incorrect"],
-                    theme=gr.themes.Base())
+    words_dict = get_words(reshaped)
+    words_string = ""
+
+    for lang, w in words_dict.items():
+        words_string += f"{lang}: "
+        words_string += ', '.join(w)
+        words_string += "\n"
+
+    return (classes_string, fig, words_string)
+
+# demo = gr.Interface(fn=generate_output,
+#                     inputs=gr.Textbox(lines=6, placeholder="Insert text of the article here...", label="Article"),
+#                     outputs=[gr.Textbox(lines=1, label="Category"), gr.Plot(label="Category probability"), gr.Textbox(lines=5, label="Topic discovery")],
+#                     title="Article classification & topic discovery demo",
+#                     flagging_options=["Incorrect"],
+#                     theme=gr.themes.Base())
                     #css=css_code)

+demo = gr.Blocks(css=css_code, theme=gr.themes.Base(), title="Article classification & topic discovery demo")
+
+with demo:
+    with gr.Row():
+        my_title = gr.HTML("<h1 align='center'>Article classification & topic discovery demo</h1>")
+    with gr.Row():
+        with gr.Column():
+            input_text = gr.Textbox(lines=22, placeholder="Insert text of the article here...", label="Article")
+            with gr.Row():
+                clear_button = gr.Button("Clear")
+                submit_button = gr.Button("Submit")
+        with gr.Column():
+            with gr.Tabs():
+                with gr.TabItem("Classification"):
+                    category_text = gr.Textbox(lines=1, label="Category")
+                    category_plot = gr.Plot()
+                with gr.TabItem("Topic discovery"):
+                    topic_text = gr.Textbox(lines=22, label="The most representative words")
+
+    submit_button.click(generate_output, inputs=input_text, outputs=[category_text, category_plot, topic_text])
+    clear_button.click(lambda: None, None, input_text, queue=False)
+
 demo.launch()
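
The new classification path in generate_output thresholds classifier.predict_proba at 0.5 instead of calling predict. A minimal illustrative sketch of that step (not part of the commit; the probabilities and the truncated category list are invented for the example):

# Illustration only: how generate_output derives category labels from probabilities.
import numpy as np

categories = ["Censorship", "Development", "Digital Activism"]      # truncated for the example
y_prob = np.array([0.82, 0.10, 0.55]).reshape(len(categories), 1)   # shape (n_categories, 1), as in app.py
y_pred = [1 if x >= 0.5 else 0 for x in y_prob]                     # -> [1, 0, 1]
labels = [categories[i] for i, keep in enumerate(y_pred) if keep]
print(", ".join(labels))                                            # Censorship, Digital Activism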
learn_multi_doc_model.py ADDED
@@ -0,0 +1,352 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import numpy as np
+import scipy
+import pickle
+from scipy.special import log_softmax
+from time import time
+from packaging import version
+
+assert version.parse(scipy.__version__) >= version.parse(
+    "1.7.0"
+), f"Requires scipy >= 1.7.0. Found {scipy.__version__}"
+
+
+class Model:
+    """Model definition, parameters and helper functions to compute log-likelihood"""
+
+    def __init__(self, vocab: dict, emb_dim: int):
+        """Initialize our model
+
+        Args:
+            vocab: vocab size for each language {'en': 25000, 'de': 25000}
+            emb_dim: embedding dimension, will be same across languages
+        """
+
+        self.L = len(vocab)
+        self.vocab = vocab
+        self.emb_dim = emb_dim
+
+        # word embeddings matrix / subspace for each language
+        self.E = {}
+
+        # bias vector for each language
+        self.b = {}
+
+        n1 = 1.0 / np.sqrt(emb_dim)
+
+        # initialize word embeddings and bias vectors randomly
+        for lang, vocab_size in vocab.items():
+            n2 = 1.0 / np.sqrt(vocab_size)
+            self.E[lang] = np.random.uniform(-n2, n1, size=(vocab_size, emb_dim))
+            self.b[lang] = np.random.randn(vocab_size, 1) * 0.0001
+
+    def init_bias_with_log_unigram_dist(self, X, lang):
+        """We will initialize the bias vector with log of unigram distribution over vocabulary.
+        This should help us with better initialization.
+
+        b_i = \log (\sum_d x_{di}) / (\sum_d \sum_i x_{di})
+        """
+
+        # if X is sparse matrix, X.A gives the dense version of it in numpy array format
+        if isinstance(X, np.ndarray):
+            X = X + 1e-08  # to avoid zeros
+        else:
+            X = X.A + 1e-08  # to avoid any zeros
+
+        self.b[lang][:, 0] = np.log(
+            X.sum(axis=0) / X.sum()
+        )  # we would like b to be of size (W, 1)
+
+    def compute_log_thetas(self, lang: str, DE_lang: np.ndarray, sanity_check=False):
+        """Compute log of thetas, where theta_d is the unigram distribution over document `d`
+        estimated from the current params (word-embedding matrix, bias vector) and document embedding a_d.
+
+        Args:
+        ----
+            lang (str): Language ID (eg: en, de, es ...)
+            DE_lang (np.ndarray): Document embeddings of language
+        """
+
+        mat = self.b[lang] + (self.E[lang] @ DE_lang)  # shape is vocab_size x n_docs
+        mat = mat.T  # shape is D x W
+
+        # log_norm = logsumexp(mat, axis=1)
+        # log_thetas = mat - log_norm
+
+        # the following single step is the same as the two steps above combined
+        log_thetas = log_softmax(mat, axis=1)  # shape is n_docs x vocab_size
+
+        if sanity_check:
+            n_docs = DE_lang.shape[0]
+            # sanity-check
+            # since each document is a proper distribution, it should sum up to 1
+            # sum of the matrix should be equal to number of documents
+            print(
+                "Sanity check for log-thetas:",
+                np.allclose(np.exp(log_thetas).sum(), n_docs),
+            )
+
+        return log_thetas
+
+    def compute_log_likelihood(self, lang, DE_lang, X):
+        """Compute log-likelihood of the data, given the current parameters / embeddings
+
+        Each summation could be implemented using a for-loop but that would be very slow;
+        since we have everything stored in matrices and a sparse matrix, we will do it via
+        matrix multiplications and additions.
+
+        Args:
+            lang: language ID (eg: en, es, fr)
+            DE_lang: document embeddings for the given language
+            X: doc-by-word counts in scipy.sparse format for a specific language
+
+        Returns:
+            float: log-likelihood of the data
+        """
+
+        log_thetas = self.compute_log_thetas(lang, DE_lang)
+
+        # log-likelihood is the sum of counts times their respective log-probability values.
+        if isinstance(X, np.ndarray):
+            llh = (X * log_thetas).sum()
+        else:
+            # X is a scipy sparse matrix
+            llh = (X.multiply(log_thetas)).sum()
+
+        return llh
+
+
+def gradients_WE(model, lang, DE_lang, X, alpha):
+    """Gradient of the log-likelihood with-respect-to language-specific word embedding matrix `E`
+
+    Args:
+        model (Model): The object of the model
+        lang (str): Language ID
+        DE_lang: document embeddings for the given language
+        X (scipy.sparse_matrix): The doc-by-word counts
+        alpha (float): L2 reg. weight
+
+    Returns:
+        np.ndarray: Gradient of log-likelihood w.r.t. word embeddings, i.e., grad of llh w.r.t. model.E
+    """
+
+    # grads = np.zeros_like(model.E)  # initialize empty gradients to be the same shape as word embeddings (W, K)
+
+    # compute log_thetas as they are needed in gradient
+    log_thetas = model.compute_log_thetas(lang, DE_lang)
+
+    # the gradient computation can be done using for-loops to reflect the equation
+    # or it can be done efficiently using matrix multiplications
+
+    # 1. simple way using for-loop
+    # iterate over all documents
+    # for d in range(model.D):
+
+    #     iterate over every word,
+    #     for k in range(model.W):
+    #         x_dk = X[d, k]  # count of word k in doc d
+    #         rel_x_dk = X[d, :].sum() * np.exp(log_thetas)[d, k]  # relative / estimated count of word k in doc d
+    #         grads[k, :] += ((x_dk - rel_x_dk) * model.A[:, d])  # doc embeddings are column wise in model.A
+
+    # 2. Efficient way of obtaining gradients using matrix operations
+
+    ef_grads = np.zeros_like(model.E)
+
+    tmp = (
+        X - np.multiply(X.sum(axis=1).reshape(-1, 1), np.exp(log_thetas))
+    ).A  # .A will convert matrix to np ndarray
+    ef_grads = (DE_lang @ tmp).T - (alpha * 0.5 * model.E[lang]).sum()
+
+    # Sanity check to see if gradients computed in both ways are numerically identical
+    # print('- All close grad_E:', np.allclose(ef_grads, grads))
+
+    return ef_grads
+
+
+def update_parameters(params, gradient, learning_rate):
+    """Update the parameters
+
+    Args:
+        params (np.ndarray): Word embedding matrix or the document embedding matrix
+        gradient (np.ndarray): Gradients of all word embeddings or document embeddings. Should be the same size as params
+        learning_rate (float): The learning_rate can also be seen as step size, i.e., the size of the step to be taken
+            along the direction of gradient. Too big steps can overshoot our estimate, whereas too small steps
+            can take longer for the model to reach optimum.
+
+    Returns:
+        np.ndarray: the updated params
+    """

+    assert (
+        params.shape == gradient.shape
+    ), "The params and gradient must have same shape, \
+        ({:d}, {:d}) != ({:d} {:d})".format(
+        *params.shape, *gradient.shape
+    )
+
+    new_params = params + (
+        learning_rate * gradient
+    )  # since we are doing gradient ascent
+    return new_params
+
+
+def train(model, bow, DE, args):
+    """Training scheme for the model"""
+
+    print("\nTraining started ..")
+    learning_rate = args.lr
+    llh_0 = 0.0
+    for lang, X in bow.items():
+        llh_0 += model.compute_log_likelihood(lang, DE[lang].T, X)
+    print(" Initial log-likelihood: {:16.2f}".format(llh_0))
+
+    llhs = [llh_0]
+
+    for i in range(1, args.epochs + 1):
+
+        llh_ei = 0.0
+        for lang, X in bow.items():
+
+            # update word embeddings E for lang, by keeping doc-embeddings A fixed
+            grad_E = gradients_WE(model, lang, DE[lang].T, X, args.alpha)
+
+            model.E[lang] = update_parameters(model.E[lang], grad_E, learning_rate)
+
+            llh_ei += model.compute_log_likelihood(lang, DE[lang].T, X)
+
+        print(
+            "Epoch {:4d} / {:4d} | Log-likelihood: {:16.2f} | Learning rate: {:f}".format(
+                i, args.epochs, llh_ei, learning_rate
+            )
+        )
+
+        if llh_ei < llhs[-1]:
+            print(
+                "The log-likelihood should improve after every epoch.",
+                "Instead it decreased, which means the updates have overshot.",
+                "Halving the learning_rate.",
+            )
+            learning_rate = learning_rate * 0.5
+
+        llhs.append(llh_ei)
+
+        # learning_rate scheduler
+        # we reduce the learning_rate by 10 % after every 10 epochs
+        # if i % 10 == 0:
+        #     print("Reducing the learning by a factor of 0.1 every 10 epochs")
+        #     learning_rate -= learning_rate * 0.1
+        if i % 100 == 0:
+            with open(
+                os.path.join(args.out_dir, f"model_{args.alpha}_{i}.pkl"), "wb"
+            ) as fpw:
+                pickle.dump(model, fpw)
+            np.savetxt(
+                os.path.join(args.out_dir, f"llh_{args.alpha}_{args.epochs}.txt"),
+                np.asarray(llhs),
+            )
+
+    return model, llhs
+
+
+def main():
+    """main"""
+
+    args = parse_arguments()
+
+    os.makedirs(args.out_dir, exist_ok=True)
+
+    emb_dim = 0
+    # load doc embeddings for each language
+    doc_embs = {}  # {lang_1: np.ndarray, lang_2: np.ndarray, ...}
+    with open(args.input_embedding_key_file, "r") as fpr:
+        for line in fpr:
+            lang, fpath = line.strip().split()
+            doc_embs[lang] = np.load(fpath)
+            print("Loaded embeddings:", lang, doc_embs[lang].shape)
+
+            if emb_dim == 0:
+                emb_dim = doc_embs[lang].shape[1]
+
+    # load bag of words for each language
+    bows = {}  # {lang_1: scipy.sparse, lang_2: scipy.sparse, ...}
+    vocab = {}  # {lang_1: vocab_size}
+    with open(args.input_bag_of_words_key_file, "r") as fpr:
+        for line in fpr:
+            lang, fpath = line.strip().split()
+            bows[lang] = scipy.sparse.load_npz(fpath)
+            print("Loaded bag-of-words:", lang, bows[lang].shape)
+
+            vocab[lang] = bows[lang].shape[1]
+
+            # assert the number of docs per language is the same in embeddings and bag-of-words
+            assert (
+                bows[lang].shape[0] == doc_embs[lang].shape[0]
+            ), "Number of docs in BoW ({:d}) != number of docs in embeddings ({:d}) for language: {:s}".format(
+                bows[lang].shape[0], doc_embs[lang].shape[0], lang
+            )
+
+    model = Model(vocab, emb_dim)
+    for lang, bow in bows.items():
+        model.init_bias_with_log_unigram_dist(bow, lang)
+
+    print("Model params:")
+    for lang in model.vocab:
+        print(" ", lang, model.E[lang].shape, model.b[lang].shape)
+
+    if args.resume:
+        with open(args.resume, "rb") as fpr:
+            model = pickle.load(fpr)
+
+    # start the training
+    model, llhs = train(model, bows, doc_embs, args)
+
+    with open(
+        os.path.join(args.out_dir, f"model_{args.alpha}_{args.epochs}.pkl"), "wb"
+    ) as fpw:
+        pickle.dump(model, fpw)
+
+    np.savetxt(
+        os.path.join(args.out_dir, f"llh_{args.alpha}_{args.epochs}.txt"),
+        np.asarray(llhs),
+    )
+
+    print("Saved in", args.out_dir)
+
+
+def parse_arguments():
+
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "input_embedding_key_file",
+        help="path to file that has paths to embeddings for each language",
+    )
+
+    parser.add_argument(
+        "input_bag_of_words_key_file", help="path to input bag of words dictionary file"
+    )
+
+    parser.add_argument("out_dir", help="out dir to save the model/word embeddings")
+
+    parser.add_argument("--epochs", type=int, default=100, help="number of epochs")
+    parser.add_argument("--lr", type=float, default=0.0001, help="learning rate")
+    parser.add_argument(
+        "--alpha", type=float, default=1e-4, help="L2 reg. weight / weight decay"
+    )
+
+    parser.add_argument(
+        "--resume", default="", help="path to trained model to resume training"
+    )
+
+    args = parser.parse_args()
+
+    return args
+
+
+if __name__ == "__main__":
+    main()
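
At inference time, get_words in app.py reuses the same scoring that Model.compute_log_thetas applies during training: a document's word distribution per language is softmax(b + E @ doc_emb), and the highest-scoring vocabulary entries become the topic words. A toy sketch (not part of the commit; the real matrices are 25000-word vocabularies against 768-dimensional LaBSE embeddings):

# Illustration only: document-to-word scoring with toy sizes.
import numpy as np
from scipy.special import log_softmax

rng = np.random.default_rng(0)
vocab_size, emb_dim = 6, 4
E = rng.normal(size=(vocab_size, emb_dim))      # word-embedding matrix for one language
b = np.zeros((vocab_size, 1))                   # bias; log-unigram distribution in the real model
doc_emb = rng.normal(size=emb_dim)              # one document embedding

log_theta = log_softmax(b[:, 0] + E @ doc_emb)  # per-document unigram distribution over the vocabulary
top = np.argsort(E @ doc_emb)[::-1][:3]         # get_words keeps the 10 highest-scoring words per language
print(top, np.exp(log_theta).sum())             # top word indices; probabilities sum to 1.0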
models/model_0.0001_100.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d48ed6671bf0990a14476301a7845362092852c8e6bb624271f3943252e954c1
+size 2166342600
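
app.py registers Model on __main__ (import __main__; setattr(__main__, "Model", Model)) before unpickling this file. That pattern is usually needed when a pickle was written by a script run directly, so the class was recorded as __main__.Model. A hedged sketch of the load path, mirroring app.py:

# Illustration only: why Model must be visible on __main__ before pickle.load.
import __main__
import pickle
from learn_multi_doc_model import Model

setattr(__main__, "Model", Model)   # same workaround as in app.py
with open("models/model_0.0001_100.pkl", "rb") as f:
    mul_model = pickle.load(f)
print(type(mul_model), sorted(mul_model.E.keys()))   # per-language word-embedding matrices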
requirements.txt CHANGED
@@ -1,2 +1,5 @@
 numpy==1.24.2
 sentence-transformers==2.2.2
+pandas==1.5.2
+plotly
+sklearn==0.24.2
topic_discovery/.DS_Store ADDED
Binary file (8.2 kB)
topic_discovery/cvect_25000_ar.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b37e9e016646662718993e2368f9e88c4c21141f8944f23449f27c6d59e03221
+size 3047285

topic_discovery/cvect_25000_bn.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05b3adf720d522a38762fda2bb6da2c948389a437b2138004698d326181d971d
+size 157149

topic_discovery/cvect_25000_de.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e551d8934e6a8e23c841437805bbed1b0e17eb2f3ab3e260b9104c1e30f452ad
+size 2037400

topic_discovery/cvect_25000_el.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d5419f509f5666ae55a7f5cdfb1cf7ea41f3fa102ec639c19c4aeea8b2dffe32
+size 3681045

topic_discovery/cvect_25000_en.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb0ee36e4ef6738d408e30132c5d970be2e05728c305fccce06dc67b3941bea2
+size 4143980

topic_discovery/cvect_25000_es.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d28eb842e6f4717a791de9c8c61014131dbea8d26f84f90c62cd54b05595a1c9
+size 4235561

topic_discovery/cvect_25000_fr.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:74ff26b2269c2033f78ecb1e5870c449423d42d668975e5e98e899b6d2489f64
+size 2967490

topic_discovery/cvect_25000_it.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e8892d88fd88e0d9e121e57e1b77810e47d34909944b2e65e2094d426f17daa
+size 2477565

topic_discovery/cvect_25000_jp.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c075e83209a4a23afe290aef6a301717f4eadfd118a278114ea142fdf882c20
+size 3082086

topic_discovery/cvect_25000_mg.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:958dd98498097b8463b1fbc6f068b512650d40397b9e53659dc2238032126181
+size 3643714

topic_discovery/cvect_25000_mk.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6758e48f3626b7c91b7359097d27aedb6beaeb36c6a6632901c3fae3f6da5ea3
+size 2152452

topic_discovery/cvect_25000_nl.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f81d4942757d07cde33715cd00fe150c377b19070f57cc992230b8c6eeacb06
+size 1466263

topic_discovery/cvect_25000_pl.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad1d1d8853aa424ba47c81d52ab6fdd708d1a440901652d680482d092a88a44a
+size 2063425

topic_discovery/cvect_25000_pt.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:baef6e3fe017ed4feb3ac2e08701b77b4425ade9f39d700ab3d1b4a2d89059d6
+size 2001188

topic_discovery/cvect_25000_ru.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89bfa381364b0df772b0a181df8740bf597733e328410c464e6690d58e8e212f
+size 5482015

topic_discovery/cvect_25000_zhs.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1369c082d071340da56006eef8ffc380625c39fef4a7034b7d1e2927b1f54717
+size 9390903

topic_discovery/cvect_25000_zht.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:030a1c4b66cfecf4645de14f77d90d56886e8927225581c94e45a93006c0c633
+size 9965443
topic_discovery/cvects.key ADDED
@@ -0,0 +1,17 @@
+en topic_discovery/cvect_25000_en.pkl
+es topic_discovery/cvect_25000_es.pkl
+fr topic_discovery/cvect_25000_fr.pkl
+mg topic_discovery/cvect_25000_mg.pkl
+it topic_discovery/cvect_25000_it.pkl
+el topic_discovery/cvect_25000_el.pkl
+zhs topic_discovery/cvect_25000_zhs.pkl
+zht topic_discovery/cvect_25000_zht.pkl
+bn topic_discovery/cvect_25000_bn.pkl
+ru topic_discovery/cvect_25000_ru.pkl
+pt topic_discovery/cvect_25000_pt.pkl
+ar topic_discovery/cvect_25000_ar.pkl
+de topic_discovery/cvect_25000_de.pkl
+jp topic_discovery/cvect_25000_jp.pkl
+mk topic_discovery/cvect_25000_mk.pkl
+pl topic_discovery/cvect_25000_pl.pkl
+nl topic_discovery/cvect_25000_nl.pkl