Create code.py
Browse files
code.py
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Setup: dependencies, dataset and tokenizer.
#
# The original first line was the IPython shell escape `! pip install ...`,
# which is a SyntaxError in a plain .py file. Install the dependencies from a
# shell before running this script instead:
#
#     pip install transformers datasets
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the first 500 examples of the SQuAD training split, then carve out
# 20% of them as a held-out "test" split.
squad = load_dataset("squad", split="train[:500]")
squad = squad.train_test_split(test_size=0.2)

# Show one raw example. A bare expression is only echoed inside a notebook,
# so print it explicitly when running as a script.
print(squad["train"][0])

# Tokenizer used by preprocess_function and pushed alongside the model later.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
|
10 |
+
def _context_token_span(sequence_ids):
    """Return (first, last) indices of the tokens whose sequence id is 1."""
    first = 0
    while sequence_ids[first] != 1:
        first += 1
    last = first
    # Bounds check so a context that runs to the very last token cannot
    # index past the end of the list.
    while last + 1 < len(sequence_ids) and sequence_ids[last + 1] == 1:
        last += 1
    return first, last


def _answer_token_span(offsets, context_start, context_end, start_char, end_char):
    """Map a character span to (start_token, end_token), or (0, 0) if cut off."""
    # The answer must lie *fully* inside the visible context. Comparing the
    # context start against start_char and the context end against end_char
    # also rejects answers that are only partially kept after truncation;
    # otherwise the end label would point at the token after the context.
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Last context token that starts at or before the answer's first character.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= start_char:
        idx += 1
    start_token = idx - 1

    # First context token that ends at or after the answer's last character.
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= end_char:
        idx -= 1
    end_token = idx + 1

    return start_token, end_token


def preprocess_function(examples):
    """Tokenize a batch of question/context pairs and label answer spans.

    Uses the module-level ``tokenizer``. Questions are stripped, paired with
    their contexts, truncated on the context side only to 384 tokens and
    padded to that length.

    Args:
        examples: Batch mapping with ``"question"`` and ``"context"`` lists
            and an ``"answers"`` list whose items carry ``"answer_start"``
            and ``"text"`` lists. Only the first answer of each item is used.

    Returns:
        The tokenizer output (without ``offset_mapping``) extended with
        ``start_positions`` and ``end_positions`` token indices. An example
        whose answer is missing or not fully inside the kept context is
        labelled ``(0, 0)``.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Offsets are only needed to compute labels; drop them from the features.
    offset_mapping = inputs.pop("offset_mapping")
    start_positions = []
    end_positions = []

    for i, (offsets, answer) in enumerate(zip(offset_mapping, examples["answers"])):
        if not answer["answer_start"]:
            # No annotated answer for this example: use the (0, 0) label.
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        context_start, context_end = _context_token_span(inputs.sequence_ids(i))
        start_token, end_token = _answer_token_span(
            offsets, context_start, context_end, start_char, end_char
        )
        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs
|
60 |
+
|
61 |
+
# Training: tokenize the dataset, build the TF datasets, fine-tune and push.
import tensorflow as tf
from transformers import (
    DefaultDataCollator,
    TFAutoModelForQuestionAnswering,
    create_optimizer,
)
from transformers.keras_callbacks import PushToHubCallback

# `tokenized_squad` was used below but never created. Apply the preprocessing
# to both splits and drop the raw text columns so only model features remain.
tokenized_squad = squad.map(
    preprocess_function,
    batched=True,
    remove_columns=squad["train"].column_names,
)

data_collator = DefaultDataCollator(return_tensors="tf")

# Single source of truth for the hyper-parameters: the learning-rate schedule,
# the datasets and model.fit() must all agree on these values.
batch_size = 16
num_epochs = 2
total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

model = TFAutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

tf_train_set = model.prepare_tf_dataset(
    tokenized_squad["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_squad["test"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# No loss is passed to compile(); only the optimizer is configured here.
model.compile(optimizer=optimizer)

callback = PushToHubCallback(
    output_dir="my_awesome_qa_model",
    tokenizer=tokenizer,
)

# Train for the same number of epochs the schedule was sized for; the
# original passed a literal epochs=1 that ignored num_epochs.
model.fit(
    x=tf_train_set,
    validation_data=tf_validation_set,
    epochs=num_epochs,
    callbacks=[callback],
)
|
107 |
+
|
108 |
+
|
109 |
+
# Inference: answer one question with the fine-tuned model, first through the
# high-level pipeline and then manually from the start/end logits.
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering, pipeline

question = "How many programming languages does BLOOM support?"
context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."

# 1) Pipeline API. The result is printed because a bare expression is only
#    echoed inside a notebook and would be silently discarded in a script.
question_answerer = pipeline("question-answering", model="my_awesome_qa_model")
print(question_answerer(question=question, context=context))

# 2) Manual path: tokenize, run the model, pick the most likely span.
tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model")
inputs = tokenizer(question, context, return_tensors="tf")

model = TFAutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model")
outputs = model(**inputs)

# Highest-scoring start and end token indices for the first (only) example.
answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])

# The slice is end-inclusive, hence the +1. If the predicted end precedes the
# start, the slice is empty and an empty string is decoded.
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
print(tokenizer.decode(predict_answer_tokens))
|