Commit 6bf4ad7
Alejandro Vaca committed
1 Parent(s): 92d518a

initial commit
Files changed:
- .gitattributes +2 -0
- app.py +254 -0
- article_app.py +173 -0
- audio_troll.flac +3 -0
- dpr_index_bio.faiss +3 -0
- dpr_index_bio_newdpr.faiss +3 -0
- dpr_index_bio_prueba.faiss +3 -0
- dpr_index_bio_splitted.faiss +3 -0
- general_utils.py +138 -0
- packages.txt +2 -0
- requirements.txt +10 -0
- save_faiss_index.py +65 -0
- tmptdsnrh_8.flac +3 -0
- vacio.flac +3 -0
.gitattributes
CHANGED
@@ -25,3 +25,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.flac filter=lfs diff=lfs merge=lfs -text
+*.faiss filter=lfs diff=lfs merge=lfs -text
app.py
ADDED
@@ -0,0 +1,254 @@
from datasets import load_dataset
from transformers import (
    DPRQuestionEncoder,
    DPRQuestionEncoderTokenizer,
    MT5ForConditionalGeneration,
    AutoTokenizer,
    AutoModelForCTC,
    Wav2Vec2Tokenizer,
)
from general_utils import (
    embed_questions,
    transcript,
    remove_chars_to_tts,
    parse_final_answer,
)
from typing import List
import gradio as gr
from article_app import article, description, examples
from haystack.nodes import DensePassageRetriever
from haystack.document_stores import InMemoryDocumentStore
import numpy as np
from sentence_transformers import SentenceTransformer, util, CrossEncoder

topk = 21
minchars = 200
min_snippet_length = 20
device = "cpu"
covidterms = ["covid19", "covid", "coronavirus", "covid-19", "sars-cov-2"]

# Speech-to-text models available in the UI dropdown.
models = {
    "wav2vec2-iic": {
        "processor": Wav2Vec2Tokenizer.from_pretrained(
            "IIC/wav2vec2-spanish-multilibrispeech"
        ),
        "model": AutoModelForCTC.from_pretrained(
            "IIC/wav2vec2-spanish-multilibrispeech"
        ),
    },
    # "wav2vec2-jonatangrosman": {
    #     "processor": Wav2Vec2Tokenizer.from_pretrained(
    #         "jonatasgrosman/wav2vec2-large-xlsr-53-spanish"
    #     ),
    #     "model": AutoModelForCTC.from_pretrained(
    #         "jonatasgrosman/wav2vec2-large-xlsr-53-spanish"
    #     ),
    # },
}

tts_es = gr.Interface.load("huggingface/facebook/tts_transformer-es-css10")

# Default generation parameters; the UI sliders override a subset of them.
params_generate = {
    "min_length": 50,
    "max_length": 250,
    "do_sample": False,
    "early_stopping": True,
    "num_beams": 8,
    "temperature": 1.0,
    "top_k": None,
    "top_p": None,
    "no_repeat_ngram_size": 3,
    "num_return_sequences": 1,
}

dpr = DensePassageRetriever(
    document_store=InMemoryDocumentStore(),
    query_embedding_model="IIC/dpr-spanish-question_encoder-allqa-base",
    passage_embedding_model="IIC/dpr-spanish-passage_encoder-allqa-base",
    max_seq_len_query=64,
    max_seq_len_passage=256,
    batch_size=512,
    use_gpu=False,
)

mt5_tokenizer = AutoTokenizer.from_pretrained("IIC/mt5-base-lfqa-es")
mt5_lfqa = MT5ForConditionalGeneration.from_pretrained("IIC/mt5-base-lfqa-es")

similarity_model = SentenceTransformer(
    "distiluse-base-multilingual-cased", device="cpu"
)

crossencoder = CrossEncoder("avacaondata/roberta-base-bne-ranker", device="cpu")

dataset = load_dataset("IIC/spanish_biomedical_crawled_corpus", split="train")
dataset = dataset.filter(lambda example: len(example["text"]) > minchars)
dataset.load_faiss_index(
    "embeddings",
    "dpr_index_bio_newdpr.faiss",
)


def query_index(question: str):
    """Retrieve the top-k passages for a question from the FAISS index."""
    question_embedding = dpr.embed_queries([question])[0]
    scores, closest_passages = dataset.get_nearest_examples(
        "embeddings", question_embedding, k=topk
    )
    contexts = [
        closest_passages["text"][i] for i in range(len(closest_passages["text"]))
    ]
    return [
        context for context in contexts if len(context.split()) > min_snippet_length
    ]


def sort_on_similarity(question, contexts, include_rank: int = 5):
    """Rerank retrieved contexts with a bi-encoder and keep the best include_rank."""
    # TODO: plug in our cross-encoder here
    question_encoded = similarity_model.encode([question])[0]
    ctxs_encoded = similarity_model.encode(contexts)
    similarity_scores = [
        float(util.cos_sim(question_encoded, ctx_encoded))
        for ctx_encoded in ctxs_encoded
    ]
    similarity_ranking_idx = np.flip(np.argsort(similarity_scores))
    return [contexts[idx] for idx in similarity_ranking_idx][:include_rank]


def create_context(contexts: List):
    return "<p>" + "<p>".join(contexts)


def create_model_input(question: str, context: str):
    return f"question: {question} context: {context}"


def generate_answer(model_input, update_params):
    model_input = mt5_tokenizer(
        model_input, truncation=True, padding=True, return_tensors="pt", max_length=1024
    )
    params_generate.update(update_params)
    answers_encoded = mt5_lfqa.generate(
        input_ids=model_input["input_ids"].to(device),
        attention_mask=model_input["attention_mask"].to(device),
        **params_generate,
    )
    answers = mt5_tokenizer.batch_decode(
        answers_encoded, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    return [{"generated_text": answer} for answer in answers]


def search_and_answer(
    question,
    audio_file,
    audio_array,
    min_length_answer,
    num_beams,
    no_repeat_ngram_size,
    temperature,
    max_answer_length,
    wav2vec2_name,
    do_tts,
):
    update_params = {
        "min_length": min_length_answer,
        "max_length": max_answer_length,
        "num_beams": int(num_beams),
        "temperature": temperature,
        "no_repeat_ngram_size": no_repeat_ngram_size,
    }
    if not question:
        # No typed question: transcribe the uploaded or recorded audio instead.
        s2t_model = models[wav2vec2_name]["model"]
        s2t_processor = models[wav2vec2_name]["processor"]
        question = transcript(
            audio_file, audio_array, processor=s2t_processor, model=s2t_model
        )
        print(f"Transcribed question: *** {question} ****")
    if any(
        any(term in word.lower() for term in covidterms)
        for word in question.split(" ")
    ):
        return (
            "Del COVID no queremos saber ya más nada, lo sentimos, pregúntame sobre otra cosa :P ",
            "tmptdsnrh_8.flac",
        )
    contexts = query_index(question)
    contexts = sort_on_similarity(question, contexts)
    context = create_context(contexts)
    model_input = create_model_input(question, context)
    answers = generate_answer(model_input, update_params)
    final_answer = answers[0]["generated_text"]
    if do_tts:
        audio_answer = tts_es(remove_chars_to_tts(final_answer))
    final_answer = parse_final_answer(final_answer, contexts)
    return final_answer, audio_answer if do_tts else "tmptdsnrh_8.flac"


if __name__ == "__main__":
    gr.Interface(
        search_and_answer,
        inputs=[
            gr.inputs.Textbox(
                lines=2,
                label="Question",
                placeholder="Type your question (in Spanish) to the system.",
                optional=True,
            ),
            gr.inputs.Audio(
                source="upload",
                type="filepath",
                label="Upload your audio asking a question here.",
                optional=True,
            ),
            gr.inputs.Audio(
                source="microphone",
                type="numpy",
                label="Record your audio asking a question.",
                optional=True,
            ),
            gr.inputs.Slider(
                minimum=10,
                maximum=200,
                default=50,
                label="Minimum size for the answer",
                step=1,
            ),
            gr.inputs.Slider(
                minimum=4, maximum=12, default=8, label="number of beams", step=1
            ),
            gr.inputs.Slider(
                minimum=2, maximum=5, default=3, label="no repeat n-gram size", step=1
            ),
            gr.inputs.Slider(
                minimum=0.8, maximum=2.0, default=1.0, label="temperature", step=0.1
            ),
            gr.inputs.Slider(
                minimum=220,
                maximum=360,
                default=250,
                label="maximum answer length",
                step=1,
            ),
            gr.inputs.Dropdown(
                ["wav2vec2-iic", "wav2vec2-jonatangrosman"],
                type="value",
                default=None,
                label="Select the speech recognition model.",
                optional=False,
            ),
            gr.inputs.Checkbox(default=False, label="Text to Speech", optional=True),
        ],
        outputs=[
            gr.outputs.HTML(label="Answer from the system."),
            gr.outputs.Audio(label="Answer in audio"),
        ],
        # title="Abstractive QA of BioMedical Domain in Spanish",
        description=description,
        examples=examples,
        theme="grass",
        article=article,
        thumbnail="IIC_logoP.png",
        css="https://cdn.jsdelivr.net/npm/bootstrap@3.3.7/dist/css/bootstrap.min.css",
    ).launch()
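
For reference, the retrieve-rerank-generate chain above can be exercised without the Gradio UI. A minimal sketch, assuming app.py is importable from the Space's working directory (importing it downloads the models and loads the FAISS index, which is slow); the question string is illustrative:

# Minimal sketch: run the app.py pipeline end to end without Gradio.
from app import (
    query_index,
    sort_on_similarity,
    create_context,
    create_model_input,
    generate_answer,
)

question = "¿Cómo funcionan las vacunas?"  # illustrative question

contexts = query_index(question)                   # DPR + FAISS, top-21 passages
contexts = sort_on_similarity(question, contexts)  # bi-encoder rerank, keep top 5
model_input = create_model_input(question, create_context(contexts))
answers = generate_answer(model_input, {"num_beams": 8, "max_length": 250})
print(answers[0]["generated_text"])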
article_app.py
ADDED
@@ -0,0 +1,173 @@
article = """
<img src="https://www.iic.uam.es/wp-content/uploads/2017/12/IIC_logoP.png">
<img src="https://drive.google.com/uc?export=view&id=1S8v94q39QRCfmVTMvjLCACmhMe9lJQdc">

<p style="text-align: justify;"> This app was developed by <a href="https://www.iic.uam.es/">IIC - Instituto de Ingeniería del Conocimiento</a> as part of the <a href="https://www.eventbrite.com/e/registro-hackathon-de-pln-en-espanol-273014111557">Somos PLN Hackathon 2022.</a>

The objective of this app is to expand the existing tools for long-form question answering in Spanish; multiple novel methods (in Spanish) were introduced to build it.
Audio is supported as an input, and always produced as an output, to make the app more accessible to people who cannot read or write.
Below you can find all the pieces that form the system.

1. <a href="https://huggingface.co/IIC/wav2vec2-spanish-multilibrispeech">Speech2Text</a>: For this we fine-tuned a multilingual Wav2Vec2, as explained in the attached link. We use this model to process audio questions.
2. <a href="https://huggingface.co/IIC/dpr-spanish-passage_encoder-allqa-base">Dense Passage Retrieval for Context</a>: Dense Passage Retrieval is a methodology <a href="https://arxiv.org/abs/2004.04906">developed by Facebook</a>, currently the state of the art for passage retrieval, that is, the task of retrieving the most relevant passages with which to answer a given question. You can find details about how it was trained in the link attached to the name.
3. <a href="https://huggingface.co/IIC/dpr-spanish-question_encoder-allqa-base">Dense Passage Retrieval for Question</a>: The question-encoder half of the same dual-encoder setup as above. For more details, see the attached link.
4. <a href="https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v1">Sentence Encoder Ranker</a>: Reranks the candidate contexts retrieved by DPR and selects the top 5 passages for the generative model to read; it is the final filter before generation.
5. <a href="https://huggingface.co/IIC/mt5-base-lfqa-es">Generative Long-Form Question Answering Model</a>: For this we used either mT5 (the one attached) or <a href="https://huggingface.co/IIC/mbart-large-lfqa-es">mBART</a>. This generative model receives the most relevant passages and uses them to generate an answer to the question. The attached link gives more details about how we trained it.

We also uploaded, and in some cases created, Spanish datasets to be able to build such a system.

1. <a href="https://huggingface.co/datasets/IIC/spanish_biomedical_crawled_corpus">Spanish Biomedical Crawled Corpus</a>. Used for finding answers to questions about biomedicine. (More info in the link.)
2. <a href="https://huggingface.co/datasets/IIC/lfqa_spanish">LFQA_Spanish</a>. Used for training the generative model. (More info in the link.)
3. <a href="https://huggingface.co/datasets/squad_es">SQUADES</a>. Used to train the DPR models. (More info in the link.)
4. <a href="https://huggingface.co/datasets/IIC/bioasq22_es">BioAsq22-Spanish</a>. Used to train the DPR models. (More info in the link.)
5. <a href="https://huggingface.co/datasets/PlanTL-GOB-ES/SQAC">SQAC (Spanish Question Answering Corpus)</a>. Used to train the DPR models. (More info in the link.)
</p>
"""

description = """
<a href="https://www.iic.uam.es/">
<img src="https://drive.google.com/uc?export=view&id=1xNz4EuafyzvMKSMTEfwzELln155uN6_H" style="max-width: 100%; max-height: 10%; height: 250px; object-fit: fill">
</a>
<h1> BioMedIA: Abstractive Question Answering of BioMedical Domain in Spanish </h1>
Esta aplicación consiste en sistemas de búsqueda del Estado del Arte en Español junto con un modelo generativo entrenado para componer una respuesta a preguntas a partir de una serie de contextos.
"""


# Each example matches the 10 Gradio inputs in app.py: question, uploaded audio,
# recorded audio, min length, beams, no-repeat n-gram size, temperature,
# max length, ASR model, text-to-speech flag.
examples = [
    [
        "¿Cuáles son los efectos secundarios más ampliamente reportados en el tratamiento de la enfermedad de Crohn?",
        "vacio.flac", "vacio.flac", 60, 8, 3, 1.0, 250, "wav2vec2-iic", False,
    ],
    [
        "¿Qué alternativas al Paracetamol existen para el dolor de cabeza?",
        "vacio.flac", "vacio.flac", 80, 8, 3, 1.0, 250, "wav2vec2-iic", False,
    ],
    [
        "¿Cuáles son los principales tipos de disartria del trastorno del habla motor?",
        "vacio.flac", "vacio.flac", 50, 8, 3, 1.0, 250, "wav2vec2-iic", False,
    ],
    [
        "¿Es la esclerosis tuberosa una enfermedad genética?",
        "vacio.flac", "vacio.flac", 50, 8, 3, 1.0, 250, "wav2vec2-iic", False,
    ],
    [
        "¿Cuál es la función de la proteína Mis18?",
        "vacio.flac", "vacio.flac", 50, 8, 3, 1.0, 250, "wav2vec2-iic", False,
    ],
    [
        "¿Qué deficiencia es la causa del síndrome de piernas inquietas?",
        "vacio.flac", "vacio.flac", 50, 8, 3, 1.0, 250, "wav2vec2-iic", False,
    ],
    [
        "¿Cuál es la función del 6SRNA en las bacterias?",
        "vacio.flac", "vacio.flac", 60, 8, 3, 1.0, 250, "wav2vec2-iic", False,
    ],
    [
        "¿Por qué los humanos desarrollamos diabetes?",
        "vacio.flac", "vacio.flac", 50, 10, 3, 1.0, 250, "wav2vec2-iic", False,
    ],
    [
        "¿Qué factores de riesgo aumentan la probabilidad de sufrir un ataque al corazón?",
        "vacio.flac", "vacio.flac", 80, 8, 3, 1.0, 250, "wav2vec2-iic", False,
    ],
    [
        "¿Cómo funcionan las vacunas?",
        "vacio.flac", "vacio.flac", 90, 8, 3, 1.0, 250, "wav2vec2-iic", False,
    ],
    [
        "¿Tienen conciencia los animales?",
        "vacio.flac", "vacio.flac", 70, 8, 3, 1.0, 250, "wav2vec2-iic", False,
    ],
]
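
The dual-encoder retrieval described in the article reduces to a dot product between a question vector and passage vectors. A sketch of that scoring step, assuming the IIC checkpoints load through the transformers DPR classes that app.py already imports (if they only ship in haystack format, the DensePassageRetriever route in app.py is the reliable path); the texts are illustrative:

import torch
from transformers import AutoTokenizer, DPRContextEncoder, DPRQuestionEncoder

q_name = "IIC/dpr-spanish-question_encoder-allqa-base"
p_name = "IIC/dpr-spanish-passage_encoder-allqa-base"
q_tokenizer = AutoTokenizer.from_pretrained(q_name)
q_encoder = DPRQuestionEncoder.from_pretrained(q_name)
p_tokenizer = AutoTokenizer.from_pretrained(p_name)
p_encoder = DPRContextEncoder.from_pretrained(p_name)

question = "¿Es la esclerosis tuberosa una enfermedad genética?"
passages = [
    "La esclerosis tuberosa es un trastorno genético poco frecuente.",
    "El paracetamol es un analgésico de uso común.",
]

with torch.no_grad():
    q_vec = q_encoder(**q_tokenizer(question, return_tensors="pt")).pooler_output
    p_vecs = p_encoder(
        **p_tokenizer(passages, padding=True, truncation=True, return_tensors="pt")
    ).pooler_output

# A higher dot product means a more relevant passage for the question.
scores = (q_vec @ p_vecs.T).squeeze(0)
print(scores)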
audio_troll.flac
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fd58f522978eb9ed242c9c9ff6b3e4dd0054e55f74ab125f4d8f1d821bacfb01
size 585520
dpr_index_bio.faiss
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:098cc186374dde469b419bb91cd71ca1e0ac2fab02adae13977689f6e249e0be
size 68619327
dpr_index_bio_newdpr.faiss
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fcedce3fa1c9049abe6f5325ee16f937147d8a5b22b526969dbf77182ebc4c5b
size 59494679
dpr_index_bio_prueba.faiss
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:00aa65abf514ebe54aec3b25486589c6302a223ea07b5fc7c9f644ed081c9c6d
size 301101
dpr_index_bio_splitted.faiss
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bcc178f8b1ec7795834dd209875847f3f6fc6e26bcebce1b08be38d6bdd27211
size 162144239
general_utils.py
ADDED
@@ -0,0 +1,138 @@
import torch
import nltk
from scipy.io.wavfile import write
import librosa
import hashlib
from typing import List


def embed_questions(
    question_model, question_tokenizer, questions, max_length=128, device="cpu"
):
    query = question_tokenizer(
        questions,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    with torch.no_grad():
        q_reps = question_model(
            query["input_ids"].to(device), query["attention_mask"].to(device)
        ).pooler_output
    return q_reps.cpu().numpy()


def embed_passages(ctx_model, ctx_tokenizer, passages, max_length=128, device="cpu"):
    p = ctx_tokenizer(
        passages["text"],
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    with torch.no_grad():
        a_reps = ctx_model(
            p["input_ids"].to(device), p["attention_mask"].to(device)
        ).pooler_output
    return {"embeddings": a_reps.cpu().numpy()}


class Document:
    """Lightweight stand-in for a haystack Document."""

    def __init__(self, meta=None, content: str = "", id_: str = ""):
        self.meta = meta if meta is not None else {}
        self.content = content
        self.id = id_


def _alter_docs_for_haystack(passages):
    return [Document(content=passage, id_=str(i)) for i, passage in enumerate(passages)]


def embed_passages_haystack(
    dpr_model,
    passages,
):
    passages = _alter_docs_for_haystack(passages["text"])
    embeddings = dpr_model.embed_documents(passages)
    return {"embeddings": embeddings}


def correct_casing(input_sentence):
    """Correct the casing of the generated transcribed text."""
    sentences = nltk.sent_tokenize(input_sentence)
    return " ".join([s.replace(s[0], s[0].capitalize(), 1) for s in sentences])


def clean_transcript(text):
    text = text.replace("[pad]".upper(), "")
    return text


def add_question_symbols(text):
    if text[0] != "¿":
        text = "¿" + text
    if text[-1] != "?":
        text = text + "?"
    return text


def remove_chars_to_tts(text):
    text = text.replace(",", " ")
    return text


def transcript(input_file, audio_array, processor, model):
    if audio_array:
        rate, sample = audio_array
        write("temp.wav", rate, sample)
        input_file = "temp.wav"
    transcript = ""
    # The model expects 16 kHz audio; check the file's native sample rate.
    sample_rate = librosa.get_samplerate(input_file)

    # Stream over ~20-second chunks rather than loading the full file.
    stream = librosa.stream(
        input_file,
        block_length=20,  # ~20 seconds per block with these frame/hop lengths
        frame_length=sample_rate,
        hop_length=sample_rate,
    )

    for speech in stream:
        if len(speech.shape) > 1:
            speech = speech[:, 0] + speech[:, 1]
        if sample_rate != 16000:
            speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
        input_values = processor(speech, return_tensors="pt").input_values
        logits = model(input_values).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(
            predicted_ids[0],
            clean_up_tokenization_spaces=True,
            skip_special_tokens=True,
        )
        transcription = clean_transcript(transcription)
        transcript += correct_casing(transcription.lower()) + ". "
    whole_text = transcript[:3800]
    whole_text = add_question_symbols(whole_text)
    return whole_text


def parse_final_answer(answer_text: str, contexts: List):
    """Parse the final answer into the HTML shown to the user."""
    s = (
        f"<b><em>Final Answer:</em> {answer_text}</b> \n\n\n"
        + "<p> Contexts Used: \n <p>"
        + "\n".join(
            [
                ("""<p style="text-align: justify;">""" + context)[:300]
                + "[...]</p>"
                for context in contexts[:5]
            ]
        )
    )
    return s
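
A couple of illustrative checks for the small text helpers above (correct_casing additionally needs nltk's punkt tokenizer downloaded):

from general_utils import add_question_symbols, remove_chars_to_tts

# A bare transcription gets wrapped in Spanish question marks.
assert add_question_symbols("cómo funcionan las vacunas") == "¿cómo funcionan las vacunas?"
# Commas are replaced with spaces before text-to-speech.
assert remove_chars_to_tts("uno, dos, tres") == "uno  dos  tres"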
packages.txt
ADDED
@@ -0,0 +1,2 @@
libsndfile1
ffmpeg
requirements.txt
ADDED
@@ -0,0 +1,10 @@
nltk==3.7
transformers==4.13.0
torch==1.10.2
librosa==0.9.1
numpy==1.21
gradio==2.8.13
jinja2==3.0.3
datasets==1.18.4
faiss-gpu==1.7.2
farm-haystack==1.3.0
save_faiss_index.py
ADDED
@@ -0,0 +1,65 @@
from datasets import load_dataset
from transformers import DPRContextEncoderTokenizer, DPRContextEncoder
from general_utils import embed_passages, embed_passages_haystack
import faiss
import argparse
import os
from haystack.nodes import DensePassageRetriever
from haystack.document_stores import InMemoryDocumentStore


os.environ["OMP_NUM_THREADS"] = "8"


def create_faiss_index(args):
    minchars = 200
    dims = 128

    dpr = DensePassageRetriever(
        document_store=InMemoryDocumentStore(),
        query_embedding_model="IIC/dpr-spanish-question_encoder-allqa-base",
        # Passage encoder paired with the question encoder above (same pair app.py loads).
        passage_embedding_model="IIC/dpr-spanish-passage_encoder-allqa-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=512,
    )

    dataset = load_dataset("IIC/spanish_biomedical_crawled_corpus", split="train")
    dataset = dataset.filter(lambda example: len(example["text"]) > minchars)

    def embed_passages_retrieval(examples):
        return embed_passages_haystack(dpr, examples)

    dataset = dataset.map(embed_passages_retrieval, batched=True, batch_size=8192)

    dataset.add_faiss_index(
        column="embeddings",
        string_factory="OPQ64_128,IVF4898,PQ64x4fsr",
        train_size=len(dataset),
    )
    dataset.save_faiss_index("embeddings", args.index_file_name)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Creates the Faiss index file for the biomedical corpus"
    )

    parser.add_argument(
        "--ctx_encoder_name",
        default="IIC/dpr-spanish-passage_encoder-squades-base",
        help="Encoding model to use for passage encoding",
    )
    parser.add_argument(
        "--index_file_name",
        default="dpr_index_bio_splitted.faiss",
        help="Faiss index file with passage embeddings",
    )
    parser.add_argument(
        "--device", default="cuda:0", help="The device to index data on."
    )

    main_args, _ = parser.parse_known_args()
    create_faiss_index(main_args)
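
The string_factory above is a standard FAISS index recipe: OPQ64_128 learns a rotation into 128 dimensions (64 blocks) to reduce quantization error, IVF4898 partitions the vectors into 4898 clusters for coarse search, and PQ64x4fsr compresses each vector into 64 4-bit sub-codes in FAISS's fast-scan layout, encoding residuals against the cluster centroids. A sketch of reading the saved index back, mirroring what app.py does (the filter must match the one used at build time so dataset rows align with index entries):

from datasets import load_dataset

dataset = load_dataset("IIC/spanish_biomedical_crawled_corpus", split="train")
dataset = dataset.filter(lambda example: len(example["text"]) > 200)  # same minchars as above
dataset.load_faiss_index("embeddings", "dpr_index_bio_splitted.faiss")
# scores, passages = dataset.get_nearest_examples("embeddings", query_vector, k=21)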
tmptdsnrh_8.flac
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:04f8d015f3597c6858e74d40c72fb70fe1caab7bf6b015ccca0eda5f53a49c71
size 389592
vacio.flac
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:04a9780650bebeb4e93ccdc6f9298e7338a53cc30b7c3a281cd1a01ff2bbb5c8
size 103880