Upload 8 files
- lib/.DS_Store +0 -0
- lib/.ipynb_checkpoints/utils-checkpoint.py +188 -0
- lib/__init__.py +0 -0
- lib/__pycache__/__init__.cpython-310.pyc +0 -0
- lib/__pycache__/utils.cpython-310.pyc +0 -0
- lib/utils.py +188 -0
- requirements.txt +71 -0
- roberta_app.py +97 -0
lib/.DS_Store
ADDED
Binary file (6.15 kB).
lib/.ipynb_checkpoints/utils-checkpoint.py
ADDED
@@ -0,0 +1,188 @@
(Jupyter autosave copy; contents are identical to lib/utils.py below.)
lib/__init__.py
ADDED
File without changes
lib/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (166 Bytes).
lib/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (6.13 kB).
lib/utils.py
ADDED
@@ -0,0 +1,188 @@
+import numpy as np
+from scipy.special import softmax
+import collections
+import torch
+from torch.utils.data import DataLoader
+from transformers import default_data_collator
+
+def preprocess_examples(examples, tokenizer, max_length=384, stride=128):
+    """
+    Preprocesses and tokenizes examples in preparation for inference
+
+    Parameters:
+    -----------
+    examples : datasets.Dataset
+        The dataset of examples. Must have columns:
+        'id', 'question', 'context'
+    tokenizer : transformers.AutoTokenizer
+        The tokenizer for the model
+    max_length : int
+        The max length for context truncation
+    stride : int
+        The stride for context truncation
+
+    Returns:
+    --------
+    inputs : dict
+        The tokenized and processed data dictionary with
+        keys 'input_ids', 'attention_mask', 'offset_mapping', 'example_id'
+        All values are lists of length = # of features output by the tokenizer
+        inputs['input_ids'][k] : list
+            token ids corresponding to tokens in feature k
+        inputs['attention_mask'][k] : list
+            attention mask for feature k
+        inputs['offset_mapping'][k] : list
+            character offsets of the tokens in feature k
+        inputs['example_id'][k] : int
+            id of example from which feature k originated
+    """
+    questions = [q.strip() for q in examples["question"]]
+    inputs = tokenizer(
+        questions,
+        examples['context'],
+        max_length=max_length,
+        truncation="only_second",
+        stride=stride,
+        return_overflowing_tokens=True,
+        return_offsets_mapping=True,
+        padding="max_length",
+    )
+
+    sample_map = inputs.pop("overflow_to_sample_mapping")
+    example_ids = []
+
+    for i in range(len(inputs["input_ids"])):
+        sample_idx = sample_map[i]
+        example_ids.append(examples["id"][sample_idx])
+
+        # Keep offsets only for context tokens (sequence id 1); set the rest to None
+        sequence_ids = inputs.sequence_ids(i)
+        offset = inputs["offset_mapping"][i]
+        inputs["offset_mapping"][i] = [
+            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
+        ]
+
+    inputs["example_id"] = example_ids
+    return inputs
+
+
+def make_predictions(model, tokenizer, inputs, examples,
+                     n_best=20, max_answer_length=30):
+    """
+    Generates a list of prediction data based on logits
+
+    Parameters:
+    -----------
+    model : transformers.AutoModelForQuestionAnswering
+        The trained model
+    tokenizer : transformers.AutoTokenizer
+        The model's tokenizer
+    inputs : datasets.Dataset
+        The tokenized and processed dataset with
+        columns 'input_ids', 'attention_mask', 'offset_mapping', 'example_id'
+        All values are lists of length = # of features output by the tokenizer
+        inputs['input_ids'][k] : list
+            token ids corresponding to tokens in feature k
+        inputs['attention_mask'][k] : list
+            attention mask for feature k
+        inputs['offset_mapping'][k] : list
+            character offsets of the tokens in feature k
+        inputs['example_id'][k] : int
+            id of example from which feature k originated
+    examples : datasets.Dataset
+        The dataset of examples. Must have columns:
+        'id', 'question', 'context'
+    n_best : int
+        The number of top start/end (by logit) indices to consider
+    max_answer_length : int
+        The maximum length (in tokens) allowed for a candidate answer
+
+    Returns:
+    --------
+    predicted_answers : list(dict)
+        predicted_answers[k] has keys 'id', 'prediction_text', 'confidence'
+        predicted_answers[k]['id'] : int
+            The unique id of the example
+        predicted_answers[k]['prediction_text'] : str
+            The predicted answer as a string
+        predicted_answers[k]['confidence'] : float
+            The predicted probability corresponding to the answer, i.e. the
+            corresponding output of a softmax function on the candidate logit scores
+    """
+    assert n_best <= len(inputs['input_ids'][0]), 'n_best cannot be larger than max_length'
+
+    # Select the best available device
+    if torch.backends.mps.is_available():
+        device = "mps"
+    elif torch.cuda.is_available():
+        device = "cuda"
+    else:
+        device = "cpu"
+    data_for_model = inputs.remove_columns(["example_id", "offset_mapping"])
+    data_for_model.set_format("torch", device=device)
+    dl = DataLoader(
+        data_for_model,
+        collate_fn=default_data_collator,
+        batch_size=len(inputs)
+    )
+    model = model.to(device)
+    for batch in dl:
+        outputs = model(**batch)
+
+    start_logits = outputs.start_logits.cpu().detach().numpy()
+    end_logits = outputs.end_logits.cpu().detach().numpy()
+    # Map each example id to the indices of the features generated from it
+    example_to_inputs = collections.defaultdict(list)
+    for idx, feature in enumerate(inputs):
+        example_to_inputs[feature["example_id"]].append(idx)
+
+    predicted_answers = []
+    for example in examples:
+        example_id = example["id"]
+        context = example["context"]
+        answers = []
+
+        for feature_index in example_to_inputs[example_id]:
+            start_logit = start_logits[feature_index]
+            end_logit = end_logits[feature_index]
+            offsets = inputs[feature_index]['offset_mapping']
+
+            start_indices = np.argsort(start_logit)[-1:-n_best-1:-1].tolist()
+            end_indices = np.argsort(end_logit)[-1:-n_best-1:-1].tolist()
+
+            for start_index in start_indices:
+                for end_index in end_indices:
+                    # Skip answers with a length that is either < 0 or > max_answer_length.
+                    if (
+                        end_index < start_index
+                        or end_index - start_index + 1 > max_answer_length
+                    ):
+                        continue
+
+                    # Skip pairs where exactly one endpoint falls outside the context
+                    if (offsets[start_index] is None) ^ (offsets[end_index] is None):
+                        continue
+                    if (offsets[start_index] is None) & (offsets[end_index] is None):
+                        answers.append(
+                            {
+                                "text": '',
+                                "logit_score": start_logit[start_index] + end_logit[end_index],
+                            }
+                        )
+                    else:
+                        answers.append(
+                            {
+                                "text": context[offsets[start_index][0] : offsets[end_index][1]],
+                                "logit_score": start_logit[start_index] + end_logit[end_index],
+                            }
+                        )
+        answer_logits = [a['logit_score'] for a in answers]
+        answer_probs = softmax(answer_logits)
+
+        if len(answers) > 0:
+            # Report the softmax probability of the best-scoring candidate
+            best_index = int(np.argmax(answer_logits))
+            best_answer = answers[best_index]
+            predicted_answers.append(
+                {'id': example_id, 'prediction_text': best_answer['text'], 'confidence': answer_probs[best_index]}
+            )
+        else:
+            # No valid candidate spans: return an empty answer with zero confidence
+            predicted_answers.append({'id': example_id, 'prediction_text': '', 'confidence': 0.0})
+    for pred in predicted_answers:
+        if pred['prediction_text'] == '':
+            pred['prediction_text'] = "I don't have an answer based on the context provided."
+    return predicted_answers
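For reference, here is a minimal sketch (not part of the commit) of how these two helpers could be driven outside of Streamlit, assuming the same etweedy/roberta-base-squad-v2 checkpoint that roberta_app.py loads; the toy context and question are made up for illustration.

# Minimal usage sketch for lib/utils.py (hypothetical toy example).
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from lib.utils import preprocess_examples, make_predictions

repo_id = 'etweedy/roberta-base-squad-v2'
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForQuestionAnswering.from_pretrained(repo_id)

# A single example; 'id', 'question', and 'context' columns are required.
examples = Dataset.from_dict({
    'id': [0],
    'context': ['The Eiffel Tower is located in Paris, France.'],
    'question': ['Where is the Eiffel Tower?'],
})

# Tokenize into model-ready features, then decode span predictions.
features = examples.map(
    preprocess_examples,
    batched=True,
    remove_columns=examples.column_names,
    fn_kwargs={'tokenizer': tokenizer},
)
print(make_predictions(model, tokenizer, features, examples, n_best=20))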
requirements.txt
ADDED
@@ -0,0 +1,71 @@
+aiohttp==3.8.4
+aiosignal==1.3.1
+altair==5.0.1
+async-timeout==4.0.2
+attrs==23.1.0
+blinker==1.6.2
+cachetools==5.3.1
+certifi==2023.5.7
+charset-normalizer==3.1.0
+click==8.1.4
+datasets==2.13.1
+decorator==5.1.1
+dill==0.3.6
+filelock==3.12.2
+frozenlist==1.3.3
+fsspec==2023.6.0
+gitdb==4.0.10
+GitPython==3.1.31
+huggingface-hub==0.16.2
+idna==3.4
+importlib-metadata==6.7.0
+Jinja2==3.1.2
+jsonschema==4.18.0
+jsonschema-specifications==2023.6.1
+markdown-it-py==3.0.0
+MarkupSafe==2.1.3
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.0.4
+multiprocess==0.70.14
+networkx==3.1
+numpy==1.25.0
+packaging==23.1
+pandas==2.0.3
+Pillow==9.5.0
+protobuf==4.23.3
+pyarrow==12.0.1
+pydeck==0.8.1b0
+Pygments==2.15.1
+Pympler==1.0.1
+python-dateutil==2.8.2
+pytz==2023.3
+pytz-deprecation-shim==0.1.0.post0
+PyYAML==6.0
+referencing==0.29.1
+regex==2023.6.3
+requests==2.31.0
+rich==13.4.2
+rpds-py==0.8.8
+safetensors==0.3.1
+scipy==1.11.1
+six==1.16.0
+smmap==5.0.0
+streamlit==1.24.0
+sympy==1.12
+tenacity==8.2.2
+tokenizers==0.13.3
+toml==0.10.2
+toolz==0.12.0
+torch==2.0.1
+tornado==6.3.2
+tqdm==4.65.0
+transformers==4.30.2
+typing_extensions==4.7.1
+tzdata==2023.3
+tzlocal==4.3.1
+urllib3==2.0.3
+validators==0.20.0
+xxhash==3.2.0
+yarl==1.9.2
+zipp==3.15.0
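These pins appear to capture a full working environment for the app; installing them with `pip install -r requirements.txt` (ideally in a fresh virtual environment) should reproduce the Space's Python dependencies locally.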
roberta_app.py
ADDED
@@ -0,0 +1,97 @@
+import torch
+import streamlit as st
+from datasets import Dataset
+from torch.utils.data import DataLoader
+from transformers import (
+    AutoTokenizer,
+    AutoModelForQuestionAnswering,
+    TrainingArguments,
+    Trainer,
+    default_data_collator,
+)
+from lib.utils import preprocess_examples, make_predictions
+
+if torch.backends.mps.is_available():
+    device = "mps"
+elif torch.cuda.is_available():
+    device = "cuda"
+else:
+    device = "cpu"
+
+# TO DO:
+# - make it pretty
+# - add support for multiple questions corresponding to same context
+# - add examples
+# What else??
+
+
+if 'response' not in st.session_state:
+    st.session_state['response'] = ''
+if 'context' not in st.session_state:
+    st.session_state['context'] = ''
+if 'question' not in st.session_state:
+    st.session_state['question'] = ''
+
+# Load the model and tokenizer from the Hugging Face repo (cached across reruns)
+@st.cache_resource(show_spinner=False)
+def get_model():
+    repo_id = 'etweedy/roberta-base-squad-v2'
+    model = AutoModelForQuestionAnswering.from_pretrained(repo_id)
+    tokenizer = AutoTokenizer.from_pretrained(repo_id)
+    return model, tokenizer
+
+with st.spinner('Loading the model...'):
+    model, tokenizer = get_model()
+
+input_container = st.container()
+st.divider()
+response_container = st.container()
+
+# Form for user inputs
+with input_container:
+    with st.form(key='input_form', clear_on_submit=False):
+        context = st.text_area(
+            label='Context',
+            value='',
+            key='context_field',
+            label_visibility='hidden',
+            placeholder='Enter your context paragraph here.',
+            height=300,
+        )
+        question = st.text_input(
+            label='Question',
+            value='',
+            key='question_field',
+            label_visibility='hidden',
+            placeholder='Enter your question here.',
+        )
+        query_submitted = st.form_submit_button("Submit")
+        if query_submitted:
+            with st.spinner('Generating response...'):
+                data_raw = Dataset.from_dict(
+                    {
+                        'id': [0],
+                        'context': [context],
+                        'question': [question]
+                    }
+                )
+                data_proc = data_raw.map(
+                    preprocess_examples,
+                    remove_columns=data_raw.column_names,
+                    batched=True,
+                    fn_kwargs={
+                        'tokenizer': tokenizer,
+                    }
+                )
+                predicted_answers = make_predictions(model, tokenizer,
+                                                     data_proc, data_raw,
+                                                     n_best=20)
+                answer = predicted_answers[0]['prediction_text']
+                confidence = predicted_answers[0]['confidence']
+                st.session_state['response'] = f"""
+                Answer: {answer}\n
+                Confidence: {confidence:.2%}
+                """
+with response_container:
+    st.write(st.session_state['response'])
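The app wraps the helpers from lib/utils.py: the submitted context and question become a one-row Dataset, are tokenized by preprocess_examples, and make_predictions returns the answer and confidence that are written to the response container. With the dependencies installed, it can presumably be launched locally with `streamlit run roberta_app.py`; on Hugging Face Spaces the entrypoint is set by the Space configuration, which is not part of this commit.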