Spaces:
Runtime error
Runtime error
import torch | |
import numpy as np | |
# # from transformers import AutoTokenizer, AutoModelForQuestionAnswering | |
class QuestionAnswer: | |
def __init__(self, data, model, tokenizer, torch_device): | |
self.max_length = 384 | |
self.doc_stride = 128 | |
self.tokenizer = tokenizer | |
self.model = model | |
self.data = data | |
self.torch_device = torch_device | |
self.output = None | |
self.features = None | |
self.results = None | |
def get_output_from_model(self): | |
# data = {'question': question, 'context': context} | |
with torch.no_grad(): | |
tokenized_data = self.tokenizer( | |
self.data['question'], | |
self.data['context'], | |
truncation='only_second', | |
max_length=self.max_length, | |
stride=self.doc_stride, | |
return_overflowing_tokens=True, | |
return_offsets_mapping=True, | |
padding='max_length', | |
return_tensors='pt' | |
).to(self.torch_device) | |
output = self.model(tokenized_data['input_ids'], tokenized_data['attention_mask']) | |
return output | |
# print(output.keys()) | |
# print(output['start_logits'].shape) | |
# print(output['end_logits'].shape) | |
# print(tokenized_data.keys()) | |
def prepare_features(self, example): | |
tokenized_example = self.tokenizer( | |
example['question'], | |
example['context'], | |
truncation='only_second', | |
max_length=self.max_length, | |
stride=self.doc_stride, | |
return_overflowing_tokens=True, | |
return_offsets_mapping=True, | |
padding='max_length', | |
) | |
# sample_mapping = tokenized_example.pop("overflow_to_sample_mapping") | |
for i in range(len(tokenized_example['input_ids'])): | |
sequence_ids = tokenized_example.sequence_ids(i) | |
# print(sequence_ids) | |
context_index = 1 | |
# sample_index = sample_mapping[i] | |
tokenized_example["offset_mapping"][i] = [ | |
(o if sequence_ids[k] == context_index else None) | |
for k, o in enumerate(tokenized_example["offset_mapping"][i]) | |
] | |
return tokenized_example | |
def postprocess_qa_predictions(self, data, features, raw_predictions, top_n_answers=5, max_answer_length=30): | |
all_start_logits, all_end_logits = raw_predictions.start_logits, raw_predictions.end_logits | |
# print(all_start_logits) | |
results = [] | |
context = data['context'] | |
# print(len(features['input_ids'])) | |
for i in range(len(features['input_ids'])): | |
start_logits = all_start_logits[i].cpu().numpy() | |
end_logits = all_end_logits[i].cpu().numpy() | |
# print(start_logits) | |
offset_mapping = features['offset_mapping'][i] | |
start_indices = np.argsort(start_logits)[-1: -top_n_answers - 1: -1].tolist() | |
end_indices = np.argsort(end_logits)[-1: -top_n_answers - 1: -1].tolist() | |
for start_index in start_indices: | |
for end_index in end_indices: | |
if ( | |
start_index >= len(offset_mapping) | |
or end_index >= len(offset_mapping) | |
or offset_mapping[start_index] is None | |
or offset_mapping[end_index] is None | |
or end_index < start_index | |
or end_index - start_index + 1 > max_answer_length | |
): | |
continue | |
start_char = offset_mapping[start_index][0] | |
end_char = offset_mapping[end_index][1] | |
# print(start_logits[start_index]) | |
# print(end_logits[end_index]) | |
score = start_logits[start_index] + end_logits[end_index] | |
results.append( | |
{ | |
'score': float('%.*g' % (3, score)), | |
'text': context[start_char: end_char] | |
} | |
) | |
results = sorted(results, key=lambda x: x["score"], reverse=True)[:top_n_answers] | |
return results | |
def get_results(self): | |
self.output = self.get_output_from_model() | |
self.features = self.prepare_features(self.data) | |
self.results = self.postprocess_qa_predictions(self.data, self.features, self.output) | |
return self.results |