avacaondata committed on
Commit
56a498b
1 Parent(s): 13beaa4

added article changes

Files changed (2)
  1. app.py +2 -12
  2. article_app.py +37 -13
app.py CHANGED
@@ -36,14 +36,6 @@ models = {
             "IIC/wav2vec2-spanish-multilibrispeech"
         ),
     },
-    # "wav2vec2-jonatangrosman": {
-    #     "processor": Wav2Vec2Tokenizer.from_pretrained(
-    #         "jonatasgrosman/wav2vec2-large-xlsr-53-spanish"
-    #     ),
-    #     "model": AutoModelForCTC.from_pretrained(
-    #         "jonatasgrosman/wav2vec2-large-xlsr-53-spanish"
-    #     ),
-    # },
 }


@@ -80,7 +72,7 @@ similarity_model = SentenceTransformer(
     "distiluse-base-multilingual-cased", device="cpu"
 )

-crossencoder = CrossEncoder("avacaondata/roberta-base-bne-ranker", device="cpu")
+crossencoder = CrossEncoder("IIC/roberta-base-bne-ranker", device="cpu")

 dataset = load_dataset("IIC/spanish_biomedical_crawled_corpus", split="train")

@@ -228,7 +220,7 @@ if __name__ == "__main__":
                 step=1,
             ),
             gr.inputs.Dropdown(
-                ["wav2vec2-iic", "wav2vec2-jonatangrosman"],
+                ["wav2vec2-iic"],
                 type="value",
                 default=None,
                 label="Select the speech recognition model.",
@@ -239,12 +231,10 @@ if __name__ == "__main__":
         ],
         outputs=[
             gr.outputs.HTML(
-                # type="str",
                 label="Answer from the system."
             ),
             gr.outputs.Audio(label="Answer in audio"),
         ],
-        # title="Abstractive QA of BioMedical Domain in Spanish",
         description=description,
         examples=examples,
         theme="grass",
article_app.py CHANGED
@@ -9,27 +9,27 @@ have been introduced to build this app.
 The reason for including audio as a possible input and always as an output is because we wanted to make the App much more accessible to people that cannot read or write.
 Below you can find all the pieces that form the system.

-1. <a href="https://huggingface.co/IIC/wav2vec2-spanish-multilibrispeech">Speech2Text</a>: For this we finedtuned a multilingual Wav2Vec2, as explained in the attached link. We use this model to process audio questions.
-2. <a href="https://huggingface.co/IIC/dpr-spanish-passage_encoder-allqa-base">Dense Passage Retrieval for Context</a>: Dense Passage Retrieval is a methodology <a href="https://arxiv.org/abs/2004.04906">developed by Facebook</a> which is currently the SoTA for Passage Retrieval,
+1. <a href="https://hf.co/IIC/wav2vec2-spanish-multilibrispeech">Speech2Text</a>: For this we finedtuned a multilingual Wav2Vec2, as explained in the attached link. We use this model to process audio questions.
+2. <a href="https://hf.co/IIC/dpr-spanish-passage_encoder-allqa-base">Dense Passage Retrieval for Context</a>: Dense Passage Retrieval is a methodology <a href="https://arxiv.org/abs/2004.04906">developed by Facebook</a> which is currently the SoTA for Passage Retrieval,
 that is, the task of getting the most relevant passages to answer a given question with. You can find details about how it was trained on the link attached to the name.
-3. <a href="https://huggingface.co/IIC/dpr-spanish-question_encoder-allqa-base">Dense Passage Retrieval for Question</a>: It is actually part of the same thing as the above. For more details, go to the attached link.
-4. <a href="https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v1">Sentence Encoder Ranker</a>: To rerank the candidate contexts retrieved by dpr for the generative model to see. This also selects the top 5 passages for the model to read, it is the final filter before the generative model.
-5. <a href="https://huggingface.co/IIC/mt5-base-lfqa-es">Generative Long-Form Question Answering Model</a>: For this we used either mT5 (the one attached) or <a href="https://huggingface.co/IIC/mbart-large-lfqa-es">mBART</a>. This generative model receives the most relevant
+3. <a href="https://hf.co/IIC/dpr-spanish-question_encoder-allqa-base">Dense Passage Retrieval for Question</a>: It is actually part of the same thing as the above. For more details, go to the attached link.
+4. <a href="https://hf.co/sentence-transformers/distiluse-base-multilingual-cased-v1">Sentence Encoder Ranker</a>: To rerank the candidate contexts retrieved by dpr for the generative model to see. This also selects the top 5 passages for the model to read, it is the final filter before the generative model.
+5. <a href="https://hf.co/IIC/mt5-base-lfqa-es">Generative Long-Form Question Answering Model</a>: For this we used either mT5 (the one attached) or <a href="https://hf.co/IIC/mbart-large-lfqa-es">mBART</a>. This generative model receives the most relevant
 passages and uses them to generate an answer to the question. In the attached link there are more details about how we trained it etc.

 On the other hand, we uploaded, and in some cases created, datasets in Spanish to be able to build such a system.

-1. <a href="https://huggingface.co/datasets/IIC/spanish_biomedical_crawled_corpus">Spanish Biomedical Crawled Corpus</a>. Used for finding answers to questions about biomedicine. (More info in the link.)
-2. <a href="https://huggingface.co/datasets/IIC/lfqa_spanish">LFQA_Spanish</a>. Used for training the generative model. (More info in the link.)
-3. <a href="https://huggingface.co/datasets/squad_es">SQUADES</a>. Used to train the DPR models. (More info in the link.)
-4. <a href="https://huggingface.co/datasets/IIC/bioasq22_es">BioAsq22-Spanish</a>. Used to train the DPR models. (More info in the link.)
-5. <a href="https://huggingface.co/datasets/PlanTL-GOB-ES/SQAC">SQAC (Spanish Question Answering Corpus)</a>. Used to train the DPR models. (More info in the link.)
+1. <a href="https://hf.co/datasets/IIC/spanish_biomedical_crawled_corpus">Spanish Biomedical Crawled Corpus</a>. Used for finding answers to questions about biomedicine. (More info in the link.)
+2. <a href="https://hf.co/datasets/IIC/lfqa_spanish">LFQA_Spanish</a>. Used for training the generative model. (More info in the link.)
+3. <a href="https://hf.co/datasets/squad_es">SQUADES</a>. Used to train the DPR models. (More info in the link.)
+4. <a href="https://hf.co/datasets/IIC/bioasq22_es">BioAsq22-Spanish</a>. Used to train the DPR models. (More info in the link.)
+5. <a href="https://hf.co/datasets/PlanTL-GOB-ES/SQAC">SQAC (Spanish Question Answering Corpus)</a>. Used to train the DPR models. (More info in the link.)
 </p>
 """
-# height="100", width="1000"
+
 description = """
 <a href="https://www.iic.uam.es/">
-<img src="https://drive.google.com/uc?export=view&id=1xNz4EuafyzvMKSMTEfwzELln155uN6_H" style="max-width: 100%; max-height: 10%; height: 250px; object-fit: fill">,
+<img src="https://drive.google.com/uc?export=view&id=1kvHDFUPPnf1kM5EKlv5Ife2KcZZvva_1" style="max-width: 100%; max-height: 10%; height: 250px; object-fit: fill">,
 </a>
 <h1> BioMedIA: Abstractive Question Answering of BioMedical Domain in Spanish </h1>
 Esta aplicación consiste en sistemas de búsqueda del Estado del Arte en Español junto con un modelo generativo entrenado para componer una respuesta a preguntas a partir de una serie de contextos.
@@ -49,6 +49,30 @@ examples = [
         "wav2vec2-iic",
         False,
     ],
+    [
+        "¿Por qué sentimos ansiedad?",
+        "vacio.flac",
+        "vacio.flac",
+        50,
+        8,
+        3,
+        1.0,
+        250,
+        "wav2vec2-iic",
+        False,
+    ],
+    [
+        "¿Qué es la mesoterapia?",
+        "vacio.flac",
+        "vacio.flac",
+        50,
+        8,
+        3,
+        1.0,
+        250,
+        "wav2vec2-iic",
+        False,
+    ],
     [
         "¿Qué alternativas al Paracetamol existen para el dolor de cabeza?",
         "vacio.flac",
@@ -98,7 +122,7 @@ examples = [
         False
     ],
     [
-        "¿Qué deficiencia es la causa del síndrome de piernas inquietas??",
+        "¿Qué deficiencia es la causa del síndrome de piernas inquietas?",
         "vacio.flac",
         "vacio.flac",
         50,
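
The article text above describes a pipeline of DPR retrieval followed by a generative long-form QA model. Below is a minimal sketch of how those two stages could be wired together, assuming the linked IIC checkpoints load with the standard transformers DPR and seq2seq classes; the placeholder passages, the prompt format, and the generation settings are assumptions rather than code from this repository.

import torch
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DPRContextEncoder,
    DPRContextEncoderTokenizer,
    DPRQuestionEncoder,
    DPRQuestionEncoderTokenizer,
)

# Two-tower DPR retriever (question and passage encoders) plus the generative LFQA model.
question_encoder = DPRQuestionEncoder.from_pretrained("IIC/dpr-spanish-question_encoder-allqa-base")
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("IIC/dpr-spanish-question_encoder-allqa-base")
passage_encoder = DPRContextEncoder.from_pretrained("IIC/dpr-spanish-passage_encoder-allqa-base")
passage_tokenizer = DPRContextEncoderTokenizer.from_pretrained("IIC/dpr-spanish-passage_encoder-allqa-base")
generator_tokenizer = AutoTokenizer.from_pretrained("IIC/mt5-base-lfqa-es")
generator = AutoModelForSeq2SeqLM.from_pretrained("IIC/mt5-base-lfqa-es")

question = "¿Qué es la mesoterapia?"
passages = ["passage one ...", "passage two ..."]  # placeholder candidates from the biomedical corpus

# Embed the question and the candidate passages and rank them by dot product, as in the DPR paper.
with torch.no_grad():
    q_emb = question_encoder(**question_tokenizer(question, return_tensors="pt")).pooler_output
    p_emb = passage_encoder(
        **passage_tokenizer(passages, padding=True, truncation=True, return_tensors="pt")
    ).pooler_output
scores = (q_emb @ p_emb.T).squeeze(0)
top_passages = [passages[i] for i in scores.topk(k=min(5, len(passages))).indices.tolist()]

# Concatenate the question with the best passages and let the seq2seq model write the answer.
# The "question: ... context: ..." prompt format is an assumption, not taken from this repo.
prompt = f"question: {question} context: {' '.join(top_passages)}"
inputs = generator_tokenizer(prompt, return_tensors="pt", truncation=True)
with torch.no_grad():
    answer_ids = generator.generate(**inputs, max_length=250, num_beams=8)
print(generator_tokenizer.decode(answer_ids[0], skip_special_tokens=True))
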