Roman Castagné committed on
Commit
4714d24
0 Parent(s):

Initial commit

Browse files
Files changed (3) hide show
  1. .gitignore +2 -0
  2. app.py +72 -0
  3. requirements.txt +3 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
1
+ __pycache__/
2
+ .vscode/
app.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datasets
2
+ import gradio as gr
3
+ from transformers import AutoModelForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling
4
+
5
+
6
# Build a single shared iterator over the streamed, shuffled English OSCAR
# training split. The UI pulls one example at a time from it with `next(ds)`.
ds = iter(
    datasets.load_dataset(
        "oscar-corpus/OSCAR-2109",
        "deduplicated_en",
        split="train",
        streaming=True,
        use_auth_token=True,
    ).shuffle(buffer_size=1000)
)

# Checkpoint location; both the tokenizer and the model are loaded from it.
model_name = "../../checkpoints/artificial_pretraining/mlm_en_100k"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

# Collator used at prediction time to produce masked inputs and their labels
# (constructed with the library's default settings).
collate_fn = DataCollatorForLanguageModeling(tokenizer)
18
+
19
+
20
with gr.Blocks() as demo:
    # Free-text input; can also be filled with a random OSCAR example.
    inputs_oscar = gr.TextArea(
        placeholder="Type a sentence or click the button below to get a random sentence from the English OSCAR corpus",
        label="Input",
        # The Textbox height parameter is `lines` (the original passed
        # `num_lines`, which is not a Textbox argument).
        lines=6,
        interactive=True,
    )
    next_button = gr.Button("Random OSCAR sentence")
    next_button.click(fn=lambda: next(ds)["text"], outputs=inputs_oscar)

    masked_text = gr.Textbox(label="Masked sentence")

    # Flat list of alternating (label textbox, prediction widget) components:
    # two rows of four columns, i.e. eight display slots in total.
    labels_and_outputs = []
    for _ in range(2):
        with gr.Row():
            for _ in range(4):
                with gr.Column():
                    labels_and_outputs.append(gr.Textbox(label="Label"))
                    labels_and_outputs.append(gr.Label(num_top_classes=5, show_label=False))

    def model_inputs_and_outputs(example):
        """Mask `example`, run the model and fill the display slots.

        Args:
            example: Text taken from the input text area.

        Returns:
            A dict mapping each output component to its new value: the decoded
            masked text for `masked_text`, and for every display slot the
            original token plus a token -> probability dict. Slots beyond the
            number of masked positions are cleared with "" and {}.
        """
        token_ids = tokenizer(example, return_tensors="pt", truncation=True, max_length=128)
        model_inputs = collate_fn((token_ids,))
        # Drop the extra leading dimension added by collating a single example.
        model_inputs = {key: tensor[0] for key, tensor in model_inputs.items()}
        masked_tokens = tokenizer.batch_decode(model_inputs["input_ids"])[0]

        # Positions whose label is not -100 are the ones selected for prediction.
        labels = model_inputs["labels"]
        is_masked = labels != -100
        original_labels = tokenizer.convert_ids_to_tokens(labels[is_masked].tolist())

        out = model(**model_inputs)

        # Only the first `n_slots` masked positions can be displayed, so only
        # those distributions are turned into dicts.
        n_slots = len(labels_and_outputs) // 2
        probs = out.logits[is_masked][:n_slots].softmax(-1)

        all_outputs = []
        if probs.shape[0] > 0:
            # Convert the vocabulary once instead of once per masked position.
            vocab_tokens = tokenizer.convert_ids_to_tokens(list(range(probs.shape[-1])))
            all_outputs = [dict(zip(vocab_tokens, row.tolist())) for row in probs]

        out_dict = {masked_text: masked_tokens}
        slots = zip(labels_and_outputs[0::2], labels_and_outputs[1::2])
        for slot, (label_box, prediction_box) in enumerate(slots):
            if slot < len(all_outputs):
                out_dict[label_box] = original_labels[slot]
                out_dict[prediction_box] = all_outputs[slot]
            else:
                # Fewer masked positions than slots: clear the leftover widgets.
                out_dict[label_box] = ""
                out_dict[prediction_box] = {}
        return out_dict

    button = gr.Button("Predict tokens")
    button.click(fn=model_inputs_and_outputs, inputs=inputs_oscar, outputs=[masked_text] + labels_and_outputs)


demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ datasets==2.4.0
2
+ gradio==3.19.1
3
+ transformers==4.22.0.dev0