ajlao committed on
Commit
7c36740
1 Parent(s): d628540

Create code.py

Files changed (1)
  1. code.py +131 -0
code.py ADDED
@@ -0,0 +1,131 @@
+ # Run once to install dependencies (shell/notebook): pip install transformers datasets
+ from datasets import load_dataset
+
+ squad = load_dataset("squad", split="train[:500]")
+ squad = squad.train_test_split(test_size=0.2)
+ squad["train"][0]
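+ # Each example is a dict with "id", "title", "context", "question", and a nested
+ # "answers" field holding parallel "text" and "answer_start" lists.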
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
+ def preprocess_function(examples):
+     questions = [q.strip() for q in examples["question"]]
+     inputs = tokenizer(
+         questions,
+         examples["context"],
+         max_length=384,
+         truncation="only_second",
+         return_offsets_mapping=True,
+         padding="max_length",
+     )
+
+     offset_mapping = inputs.pop("offset_mapping")
+     answers = examples["answers"]
+     start_positions = []
+     end_positions = []
+
+     for i, offset in enumerate(offset_mapping):
+         answer = answers[i]
+         start_char = answer["answer_start"][0]
+         end_char = answer["answer_start"][0] + len(answer["text"][0])
+         sequence_ids = inputs.sequence_ids(i)
+
+         # Find the start and end of the context
+         idx = 0
+         while sequence_ids[idx] != 1:
+             idx += 1
+         context_start = idx
+         while sequence_ids[idx] == 1:
+             idx += 1
+         context_end = idx - 1
+
+         # If the answer is not fully inside the context, label it (0, 0)
+         if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
+             start_positions.append(0)
+             end_positions.append(0)
+         else:
+             # Otherwise it's the start and end token positions
+             idx = context_start
+             while idx <= context_end and offset[idx][0] <= start_char:
+                 idx += 1
+             start_positions.append(idx - 1)
+
+             idx = context_end
+             while idx >= context_start and offset[idx][1] >= end_char:
+                 idx -= 1
+             end_positions.append(idx + 1)
+
+     inputs["start_positions"] = start_positions
+     inputs["end_positions"] = end_positions
+     return inputs
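+
+ # Optional sanity check (an illustrative sketch, not part of the original
+ # script): preprocess a handful of examples and decode the labeled token span
+ # to confirm it lines up with the gold answer text.
+ sample = squad["train"][:3]
+ encoded = preprocess_function(sample)
+ for i in range(3):
+     s, e = encoded["start_positions"][i], encoded["end_positions"][i]
+     decoded = tokenizer.decode(encoded["input_ids"][i][s : e + 1])
+     print(repr(decoded), "vs", repr(sample["answers"][i]["text"][0]))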
+
+ # Tokenize the full dataset with the preprocessing function; tokenized_squad is
+ # what the training pipeline below expects.
+ tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
+
+ from transformers import DefaultDataCollator
+
+ data_collator = DefaultDataCollator(return_tensors="tf")
+
+ from transformers import create_optimizer
+
+ batch_size = 16
+ num_epochs = 2
+ total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs
+ optimizer, schedule = create_optimizer(
+     init_lr=2e-5,
+     num_warmup_steps=0,
+     num_train_steps=total_train_steps,
+ )
+
+ from transformers import TFAutoModelForQuestionAnswering
+
+ model = TFAutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
+
+ tf_train_set = model.prepare_tf_dataset(
+     tokenized_squad["train"],
+     shuffle=True,
+     batch_size=16,
+     collate_fn=data_collator,
+ )
+
+ tf_validation_set = model.prepare_tf_dataset(
+     tokenized_squad["test"],
+     shuffle=False,
+     batch_size=16,
+     collate_fn=data_collator,
+ )
+
+ import tensorflow as tf
+
+ # No loss argument needed: Transformers TF models compute the loss internally
+ # when labels (start_positions/end_positions) are present in the batch.
+ model.compile(optimizer=optimizer)
+
+ from transformers.keras_callbacks import PushToHubCallback
+
+ callback = PushToHubCallback(
+     output_dir="my_awesome_qa_model",
+     tokenizer=tokenizer,
+ )
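+ # Note: PushToHubCallback uploads checkpoints to the Hub during training, which
+ # assumes you are already authenticated (e.g. via huggingface_hub.notebook_login()).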
+
+ # Train for num_epochs so the run matches the learning-rate schedule sized above.
+ model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs, callbacks=[callback])
+
+ question = "How many programming languages does BLOOM support?"
+ context = "BLOOM has 176 billion parameters and can generate text in 46 natural languages and 13 programming languages."
+
+ from transformers import pipeline
+
+ question_answerer = pipeline("question-answering", model="my_awesome_qa_model")
+ question_answerer(question=question, context=context)
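+ # The pipeline returns a dict along the lines of
+ # {"score": ..., "start": ..., "end": ..., "answer": "13"}; the exact score
+ # depends on the fine-tuning run.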
+
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model")
+ inputs = tokenizer(question, context, return_tensors="tf")
+
+ from transformers import TFAutoModelForQuestionAnswering
+
+ model = TFAutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model")
+ outputs = model(**inputs)
+
+ answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
+ answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])
+
+ predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
+ tokenizer.decode(predict_answer_tokens)
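+
+ # A rough confidence estimate (an illustrative sketch, not part of the original
+ # script): multiply the softmax probabilities of the chosen start and end tokens.
+ start_probs = tf.nn.softmax(outputs.start_logits, axis=-1)
+ end_probs = tf.nn.softmax(outputs.end_logits, axis=-1)
+ print(float(start_probs[0, answer_start_index] * end_probs[0, answer_end_index]))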