versae commited on
Commit
888933e
1 Parent(s): b57da4e
Files changed (5) hide show
  1. README.md +4 -4
  2. app.py +108 -0
  3. gitattributes +16 -0
  4. gitignore +1 -0
  5. requirements.txt +4 -0
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: Bertin
3
- emoji: 🐢
4
- colorFrom: gray
5
- colorTo: purple
6
  sdk: streamlit
7
  app_file: app.py
8
  pinned: false
 
1
  ---
2
+ title: BERTIN
3
+ emoji: 🔥
4
+ colorFrom: yellow
5
+ colorTo: red
6
  sdk: streamlit
7
  app_file: app.py
8
  pinned: false
app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ from mtranslate import translate
3
+ import streamlit as st
4
+ from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
5
+
6
+
7
+ LOGO = "https://raw.githubusercontent.com/nlp-en-es/assets/main/logo.png"
8
+
9
+ MODELS = {
10
+ "RoBERTa Base Gaussian Seq Len 512": {
11
+ "url": "bertin-project/bertin-base-gaussian-exp-512seqlen"
12
+ },
13
+ "RoBERTa Base Gaussian Seq Len 128": {
14
+ "url": "bertin-project/bertin-base-gaussian"
15
+ },
16
+ "RoBERTa Base Random Seq Len 128": {
17
+ "url": "bertin-project/bertin-base-random"
18
+ },
19
+ "RoBERTa Base Stepwise Seq Len 128": {
20
+ "url": "bertin-project/bertin-base-stepwise"
21
+ },
22
+ }
23
+
24
+ PROMPT_LIST = [
25
+ "Fui a la librería a comprar un <mask>.",
26
+ "¡Qué buen <mask> hace hoy!",
27
+ "Hoy empiezan las vacaciones así que vamos a la <mask>.",
28
+ "Mi color favorito es el <mask>.",
29
+ "Voy a <mask> porque estoy muy cansada.",
30
+ "Mañana vienen mis amigos de <mask>.",
31
+ "¿Te apetece venir a <mask> conmigo?",
32
+ "En verano hace mucho <mask>.",
33
+ "En el bosque había <mask>.",
34
+ "El ministro dijo que <mask> los impuestos.",
35
+ "Si no estuviera afónica, <mask> esa canción.",
36
+ ]
37
+
38
+
39
@st.cache(show_spinner=False, allow_output_mutation=True)
def _load_pipeline(model_url):
    """Build (and cache) a fill-mask pipeline for the model at *model_url*.

    Cached per model only, so changing the input text does not re-download
    and re-instantiate the model and tokenizer — the original code cached on
    (text, model) and reloaded the model for every new prompt.
    allow_output_mutation=True skips hashing the unhashable pipeline object.
    """
    model = AutoModelForMaskedLM.from_pretrained(model_url)
    tokenizer = AutoTokenizer.from_pretrained(model_url)
    return pipeline("fill-mask", model=model, tokenizer=tokenizer)


@st.cache(show_spinner=False, persist=True)
def load_model(masked_text, model_url):
    """Fill the ``<mask>`` token in *masked_text* with the model at *model_url*.

    Returns the list of candidate fills produced by the ``fill-mask``
    pipeline (each entry carries ``sequence``, ``score``, ``token``, ...).
    Results are persisted to disk per (text, model) pair, as before.
    """
    nlp = _load_pipeline(model_url)
    return nlp(masked_text)
46
+
47
+
48
# ---- Page configuration ----
st.set_page_config(page_title="BERTIN Demo", page_icon=LOGO)
st.title("BERTIN")

# ---- Sidebar ----
st.sidebar.image(LOGO)
54
+
55
+ # Body
56
+ st.markdown(
57
+ """
58
+ BERTIN is a series of BERT-based models for Spanish.
59
+
60
+ The models are trained with Flax and using TPUs sponsored by Google since this is part of the
61
+ [Flax/Jax Community Week](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104)
62
+ organised by HuggingFace.
63
+
64
+ All models are variations of **RoBERTa-base** trained from scratch in **Spanish** using the **mc4 dataset**.
65
+ We reduced the dataset size to 50 million documents to keep training times shorter, and also to be able to bias training examples based on their perplexity.
66
+
67
+ The idea is to favour examples with perplexities that are neither too small (short, repetitive texts) nor too large (potentially poor quality).
68
+ * **Random** sampling simply takes documents at random to reduce the dataset size.
69
+ * **Gaussian** rejects documents with a higher probability for lower and higher perplexities, based on a Gaussian function.
70
+
71
+ The first models have been trained (250.000 steps) on sequence length 128, and training for Gaussian changed to sequence length 512 for the last 25.000 training steps.
72
+ """
73
+ )
74
+
75
# ---- Model selection ----
selected_model = st.selectbox("Model", list(MODELS))
selected_url = MODELS[selected_model]["url"]

# ---- Prompt: a random Spanish example, or free-form user input ----
prompt_mode = st.selectbox("Prompt", ["Random", "Custom"])
default_text = (
    "Enter your masked text here..."
    if prompt_mode == "Custom"
    else random.choice(PROMPT_LIST)
)
text = st.text_area("Enter text", default_text)

# ---- Inference ----
if st.button("Fill the mask"):
    with st.spinner(text="Filling the mask..."):
        st.subheader("Result")
        predictions = load_model(text, selected_url)
        top_sequence = predictions[0]["sequence"]
        st.write(top_sequence)
        # Spanish -> English translation of the best candidate sequence.
        st.write("_English_ _translation:_", translate(top_sequence, "en", "es"))
        st.write(predictions)
93
+
94
+ st.markdown(
95
+ """
96
+ ### Team members
97
+ - Eduardo González ([edugp](https://huggingface.co/edugp))
98
+ - Javier de la Rosa ([versae](https://huggingface.co/versae))
99
+ - Manu Romero ([mrm8488](https://huggingface.co/mrm8488))
100
+ - María Grandury ([mariagrandury](https://huggingface.co/mariagrandury))
101
+ - Pablo González de Prado ([Pablogps](https://huggingface.co/Pablogps))
102
+ - Paulo Villegas ([paulo](https://huggingface.co/paulo))
103
+
104
+ ### More information
105
+ You can find more information about these models
106
+ [here](https://huggingface.co/bertin-project/bertin-roberta-base-spanish).
107
+ """
108
+ )
gitattributes ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
2
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.h5 filter=lfs diff=lfs merge=lfs -text
5
+ *.tflite filter=lfs diff=lfs merge=lfs -text
6
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.ot filter=lfs diff=lfs merge=lfs -text
8
+ *.onnx filter=lfs diff=lfs merge=lfs -text
9
+ *.arrow filter=lfs diff=lfs merge=lfs -text
10
+ *.ftz filter=lfs diff=lfs merge=lfs -text
11
+ *.joblib filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.pb filter=lfs diff=lfs merge=lfs -text
15
+ *.pt filter=lfs diff=lfs merge=lfs -text
16
+ *.pth filter=lfs diff=lfs merge=lfs -text
gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ venv/
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit
2
+ mtranslate
3
+ transformers
4
+ torch