ajlao
/

my_awesome_qa_model

+! pip install transformers datasets
+from datasets import load_dataset
+squad = load_dataset("squad", split="train[:500]")
+squad = squad.train_test_split(test_size=0.2)
+squad["train"][0]
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
+def preprocess_function(examples):
+    questions = [q.strip() for q in examples["question"]]
+    inputs = tokenizer(
+        questions,
+        examples["context"],
+        max_length=384,
+        truncation="only_second",
+        return_offsets_mapping=True,
+        padding="max_length",
+    )
+    offset_mapping = inputs.pop("offset_mapping")
+    answers = examples["answers"]
+    start_positions = []
+    end_positions = []
+    for i, offset in enumerate(offset_mapping):
+        answer = answers[i]
+        start_char = answer["answer_start"][0]
+        end_char = answer["answer_start"][0] + len(answer["text"][0])
+        sequence_ids = inputs.sequence_ids(i)
+        # Find the start and end of the context
+        idx = 0
+        while sequence_ids[idx] != 1:
+            idx += 1
+        context_start = idx
+        while sequence_ids[idx] == 1:
+            idx += 1
+        context_end = idx - 1
+        # If the answer is not fully inside the context, label it (0, 0)
+        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
+            start_positions.append(0)
+            end_positions.append(0)
+        else:
+            # Otherwise it's the start and end token positions
+            idx = context_start
+            while idx <= context_end and offset[idx][0] <= start_char:
+                idx += 1
+            start_positions.append(idx - 1)
+            idx = context_end
+            while idx >= context_start and offset[idx][1] >= end_char:
+                idx -= 1
+            end_positions.append(idx + 1)
+    inputs["start_positions"] = start_positions
+    inputs["end_positions"] = end_positions
+    return inputs
+from transformers import DefaultDataCollator
+data_collator = DefaultDataCollator(return_tensors="tf")
+from transformers import create_optimizer
+batch_size = 16
+num_epochs = 2
+total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs
+optimizer, schedule = create_optimizer(
+    init_lr=2e-5,
+    num_warmup_steps=0,
+    num_train_steps=total_train_steps,
+)
+from transformers import TFAutoModelForQuestionAnswering
+model = TFAutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+tf_train_set = model.prepare_tf_dataset(
+    tokenized_squad["train"],
+    shuffle=True,
+    batch_size=16,
+    collate_fn=data_collator,
+)
+tf_validation_set = model.prepare_tf_dataset(
+    tokenized_squad["test"],
+    shuffle=False,
+    batch_size=16,
+    collate_fn=data_collator,
+)
+import tensorflow as tf
+model.compile(optimizer=optimizer)
+from transformers.keras_callbacks import PushToHubCallback
+callback = PushToHubCallback(
+    output_dir="my_awesome_qa_model",
+    tokenizer=tokenizer,
+)
+model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=1, callbacks=[callback])
+question = "How many programming languages does BLOOM support?"
+context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."
+from transformers import pipeline
+question_answerer = pipeline("question-answering", model="my_awesome_qa_model")
+question_answerer(question=question, context=context)
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model")
+inputs = tokenizer(question, context, return_tensors="tf")
+from transformers import TFAutoModelForQuestionAnswering
+model = TFAutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model")
+outputs = model(**inputs)
+answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
+answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])
+predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
+tokenizer.decode(predict_answer_tokens)