andersab committed
Commit: 165560d
1 Parent(s): 489d4bc
QuijoBERT/config.json ADDED
@@ -0,0 +1,27 @@
+{
+  "_name_or_path": "./QuijoBERT/backup",
+  "architectures": [
+    "RobertaForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.19.0.dev0",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 50000
+}
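
Note: the configuration above describes a 6-layer RoBERTa masked-LM checkpoint. A minimal sketch of loading it for inference, assuming the ./QuijoBERT directory from this commit (config.json, vocab.json, merges.txt, pytorch_model.bin) is available locally:

    # Minimal sketch (assumption: ./QuijoBERT from this commit exists locally).
    from transformers import RobertaForMaskedLM, RobertaTokenizer

    tokenizer = RobertaTokenizer.from_pretrained("./QuijoBERT")
    model = RobertaForMaskedLM.from_pretrained("./QuijoBERT")
    model.eval()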
QuijoBERT/merges.txt ADDED
The diff for this file is too large to render.
 
QuijoBERT/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65f7722ec4294cf9c4a092995573924d30d0a0a268b7e4db0c41a4ff2564b1c7
+size 327904939
QuijoBERT/training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c49120b72a0cb08b7a726d09864a5baba0e888193c56ff53895d356cc6cc501a
+size 3119
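
Note: both .bin entries above are Git LFS pointer files (an oid and a byte size), not the binary content itself. training_args.bin is the pickled TrainingArguments object written by the Trainer; a minimal sketch of inspecting it once the real file has been fetched:

    # Minimal sketch (assumption: the actual training_args.bin has been pulled via Git LFS).
    import torch

    # It is a pickled object, not a tensor file, so recent PyTorch versions
    # may require weights_only=False here.
    args = torch.load("./QuijoBERT/training_args.bin")
    print(args)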
QuijoBERT/vocab.json ADDED
The diff for this file is too large to render.
 
app.py CHANGED
@@ -1,7 +1,43 @@
+# import gradio as gr
+
+# def greet(name):
+#     return "Hello Mr." + name + "!!"
+
+# iface = gr.Interface(fn=greet, inputs="text", outputs="text")
+# iface.launch()
+
+
 import gradio as gr
-
-def greet(name):
-    return "Hello " + name + "!!"
-
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
+from numpy import kaiser
+
+from transformers import pipeline
+
+fill_mask = pipeline("fill-mask", model="./QuijoBERT", tokenizer="./QuijoBERT")
+
+def predict(text):
+
+    res_dict = {}
+    x = fill_mask(text)
+    print(x)
+    for i in range(len(x)):
+        k = x[i]['sequence']
+        e = x[i]['score']
+        print(k, e)
+        if e >= 0.05:
+            res_dict[k] = e
+    print(res_dict)
+    return res_dict
+    # return {x[0]["sequence"], x[0]["score"]}
+
+# texto = 'en un lugar de la <mask>'
+# print(predict(texto))
+
+iface = gr.Interface(
+    fn=predict,
+    inputs='text',
+    outputs='label',
+    examples=['En un lugar de la <mask>', 'En verdad, <mask> Sancho', 'Cómo has estado, bien mío, <mask> de mis ojos, compañero mío']
+)
+
+
 iface.launch()
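
Note on the predict() helper above: pipeline("fill-mask") returns a ranked list of candidate fills, each a dict with 'sequence', 'score', 'token' and 'token_str' keys, and the app keeps only candidates scoring at least 0.05. A minimal sketch of calling the pipeline directly, assuming the local ./QuijoBERT checkpoint from this commit:

    # Minimal sketch (assumption: the ./QuijoBERT files from this commit are present).
    from transformers import pipeline

    fill_mask = pipeline("fill-mask", model="./QuijoBERT", tokenizer="./QuijoBERT")
    for candidate in fill_mask("En un lugar de la <mask>"):
        print(candidate["sequence"], round(candidate["score"], 3))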
el_quijote.txt ADDED
The diff for this file is too large to render.
 
quijoBERT.py ADDED
@@ -0,0 +1,113 @@
+
+
+from transformers import AutoTokenizer, AutoModelForMaskedLM, RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, DataCollatorForLanguageModeling, LineByLineTextDataset, Trainer, TrainingArguments
+
+
+from pathlib import Path
+from tokenizers import ByteLevelBPETokenizer
+from tokenizers.implementations import ByteLevelBPETokenizer
+from tokenizers.processors import BertProcessing
+import torch
+from torchinfo import summary
+
+
+import os
+
+paths = [str(x) for x in Path(".").glob("**/el_*.txt")]
+print(paths)
+# Initialize a tokenizer
+tokenizer = ByteLevelBPETokenizer()
+# Customize training
+tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2,
+                special_tokens=[
+                    "<s>",
+                    "<pad>",
+                    "</s>",
+                    "<unk>",
+                    "<mask>",
+                ])
+
+
+dir_path = os.getcwd()
+token_dir = os.path.join(dir_path, 'QuijoBERT')
+
+if not os.path.exists(token_dir):
+    os.makedirs(token_dir)
+tokenizer.save_model('QuijoBERT')
+
+tokenizer = ByteLevelBPETokenizer(
+    "./QuijoBERT/vocab.json",
+    "./QuijoBERT/merges.txt",
+)
+
+tokenizer._tokenizer.post_processor = BertProcessing(
+    ("</s>", tokenizer.token_to_id("</s>")),
+    ("<s>", tokenizer.token_to_id("<s>")),
+)
+tokenizer.enable_truncation(max_length=512)
+
+
+
+config = RobertaConfig(
+    vocab_size=52_000,
+    max_position_embeddings=514,
+    num_attention_heads=12,
+    num_hidden_layers=6,
+    type_vocab_size=1,
+)
+
+"""# Step 8: Re-creating the Tokenizer in Transformers"""
+
+tokenizer = RobertaTokenizer.from_pretrained("./QuijoBERT", max_length=512)
+
+# Initializing a model
+
+model = RobertaForMaskedLM(config=config)
+# In case we want to recover the model after a crash
+# model = RobertaForMaskedLM.from_pretrained("./QuijoBERT/Checkpoint-xxxxx")
+
+
+# Print the module structure
+print(model)
+# Parameter summary via torchinfo
+summary(model)
+
+
+dataset = LineByLineTextDataset(
+    tokenizer=tokenizer,
+    file_path="./el_quijote.txt",
+    block_size=128,
+)
+
+
+# Defining a data collator
+
+data_collator = DataCollatorForLanguageModeling(
+    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
+)
+
+# Initializing the Trainer object
+training_args = TrainingArguments(
+    output_dir="./QuijoBERT",
+    overwrite_output_dir=True,
+    num_train_epochs=1,
+    per_device_train_batch_size=64,
+    save_steps=1000,
+    save_total_limit=2,
+)
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    data_collator=data_collator,
+    train_dataset=dataset,
+)
+
+
+# Training the model
+print('here')
+trainer.train()
+trainer.save_model("./QuijoBERT")
+
+# Saving the final model (+ tokenizer + config) to disk
+trainer.save_model("./QuijoBERT")
+
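
The commented-out recovery line in the script above hints at restarting training from a saved checkpoint. Continuing from that script, a minimal sketch of the two usual options, assuming a checkpoint folder such as ./QuijoBERT/checkpoint-1000 was produced by save_steps=1000 (the folder name here is illustrative, not taken from this commit):

    # Sketch only; the checkpoint folder name below is an assumption.
    # Option 1: rebuild the model from a checkpoint folder, then train as before.
    # model = RobertaForMaskedLM.from_pretrained("./QuijoBERT/checkpoint-1000")
    # Option 2: let the Trainer also restore optimizer/scheduler state.
    trainer.train(resume_from_checkpoint=True)  # resumes from the latest checkpoint in output_dir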