Pablo committed on
Commit
605d4d7
1 Parent(s): 8cb1dfe

Reduced the number of models available and extended the project description

Browse files
Files changed (2) hide show
  1. app.py +16 -17
  2. requirements.txt +1 -1
app.py CHANGED
@@ -7,24 +7,15 @@ from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
7
  LOGO = "https://raw.githubusercontent.com/nlp-en-es/assets/main/logo.png"
8
 
9
  MODELS = {
10
- "RoBERTa Base": {
11
- "url": "bertin-project/bertin-roberta-base-spanish"
12
  },
13
- "RoBERTa Base Gaussian": {
14
  "url": "bertin-project/bertin-base-gaussian"
15
  },
16
- "RoBERTa Base Random": {
17
  "url": "bertin-project/bertin-base-random"
18
  },
19
- "RoBERTa Base Stepwise": {
20
- "url": "bertin-project/bertin-base-stepwise"
21
- },
22
- "RoBERTa Base Gaussian Experiment": {
23
- "url": "bertin-project/bertin-base-gaussian-exp-512seqlen"
24
- },
25
- "RoBERTa Base Random Experiment": {
26
- "url": "bertin-project/bertin-base-random-exp-512seqlen"
27
- }
28
  }
29
 
30
  PROMPT_LIST = [
@@ -36,7 +27,9 @@ PROMPT_LIST = [
36
  "Mañana vienen mis amigos de <mask>.",
37
  "¿Te apetece venir a <mask> conmigo?",
38
  "En verano hace mucho <mask>.",
39
- "En el bosque había <mask>."
 
 
40
  ]
41
 
42
 
@@ -63,6 +56,12 @@ st.markdown(
63
  The models are trained with Flax and using TPUs sponsored by Google since this is part of the
64
  [Flax/Jax Community Week](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104)
65
  organised by HuggingFace.
 
 
 
 
 
 
66
  """
67
  )
68
 
@@ -88,12 +87,12 @@ if st.button("Fill the mask"):
88
  st.markdown(
89
  """
90
  ### Team members
91
- - Javier de la Rosa ([versae](https://huggingface.co/versae))
92
  - Eduardo González ([edugp](https://huggingface.co/edugp))
93
- - Paulo Villegas ([paulo](https://huggingface.co/paulo))
94
- - Pablo González de Prado ([Pablogps](https://huggingface.co/Pablogps))
95
  - Manu Romero ([mrm8488](https://huggingface.co/mrm8488))
96
  - María Grandury ([mariagrandury](https://huggingface.co/mariagrandury))
 
 
97
 
98
  ### More information
99
  You can find more information about these models
7
  LOGO = "https://raw.githubusercontent.com/nlp-en-es/assets/main/logo.png"
8
 
9
  MODELS = {
10
+ "RoBERTa Base Gaussian Seq Len 512": {
11
+ "url": "bertin-project/bertin-base-gaussian-exp-512seqlen"
12
  },
13
+ "RoBERTa Base Gaussian Seq Len 128": {
14
  "url": "bertin-project/bertin-base-gaussian"
15
  },
16
+ "RoBERTa Base Random Seq Len 128": {
17
  "url": "bertin-project/bertin-base-random"
18
  },
 
 
 
 
 
 
 
 
 
19
  }
20
 
21
  PROMPT_LIST = [
27
  "Mañana vienen mis amigos de <mask>.",
28
  "¿Te apetece venir a <mask> conmigo?",
29
  "En verano hace mucho <mask>.",
30
+ "En el bosque había <mask>.",
31
+ "El ministro dijo que <mask> los impuestos.",
32
+ "Si no estuviera afónica, <mask> esa canción.",
33
  ]
34
 
35
 
56
  The models are trained with Flax and using TPUs sponsored by Google since this is part of the
57
  [Flax/Jax Community Week](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104)
58
  organised by HuggingFace.
59
+
60
+ All models are variations of RoBERTa-base trained from scratch in Spanish.
61
+ We used the mc4 dataset. We reduced the dataset size to 50 million documents to keep training times shorter, and also to be able to bias training examples based on their perplexity.
62
+ The idea is to favour examples with perplexities that are neither too small (short, repetitive texts) nor too large (potentially poor quality).
63
+ "Random" sampling simply takes documents at random to reduce the dataset size. "Gaussian" rejects documents with a higher probability for lower and larger perplexities, based on a Gaussian function.
64
+ The first models have been trained (250.000 steps) on sequence length 128, and training for Gaussian changed to sequence length 512 for the last 25.000 training steps.
65
  """
66
  )
67
 
87
  st.markdown(
88
  """
89
  ### Team members
 
90
  - Eduardo González ([edugp](https://huggingface.co/edugp))
91
+ - Javier de la Rosa ([versae](https://huggingface.co/versae))
 
92
  - Manu Romero ([mrm8488](https://huggingface.co/mrm8488))
93
  - María Grandury ([mariagrandury](https://huggingface.co/mariagrandury))
94
+ - Pablo González de Prado ([Pablogps](https://huggingface.co/Pablogps))
95
+ - Paulo Villegas ([paulo](https://huggingface.co/paulo))
96
 
97
  ### More information
98
  You can find more information about these models
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
  streamlit
2
  mtranslate
3
  transformers
4
- torch
1
  streamlit
2
  mtranslate
3
  transformers
4
+ torch