Update app.py
app.py CHANGED
@@ -1,32 +1,219 @@
-from transformers import AutoModelForQuestionAnswering, AutoTokenizer
-
-model = AutoModelForQuestionAnswering.from_pretrained(model_name)
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-

from transformers import pipeline

-# Load pre-trained question-answering model
-qa_pipeline = pipeline("question-answering", model="google-bert/bert-large-uncased-whole-word-masking-finetuned-squad", tokenizer="google-bert/bert-large-uncased-whole-word-masking-finetuned-squad", max_answer_length=6000, from_tf=True)
-
-#
-question = "What is it called in Portuguese"
-
-#
answer = qa_pipeline(question=question, context=context)

-# Print answer
print("Question:", question)
-print("Answer:", answer[
-

import gradio as gr
+from huggingface_hub import notebook_login
+notebook_login()
+
+from datasets import load_dataset
+squad = load_dataset("squad", split="train[:5000]")
+squad = squad.train_test_split(test_size=0.2)
+
+import pandas as pd
+
+# Convert the training split to a dictionary
+data_dict = squad["train"].to_dict()
+# Create a DataFrame from the dictionary
+df = pd.DataFrame.from_dict(data_dict)
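Each SQuAD record pairs a question and a context with an answers dict holding the answer text and its character offset; that offset is what the start/end-position bookkeeping below depends on. Inspecting one record shows the shape (the exact record you get depends on the shuffled split, so the values here are illustrative):

print(squad["train"][0])
# {'id': '...', 'title': '...',
#  'context': 'Architecturally, the school has a Catholic character. ...',
#  'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
#  'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}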
+
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+
+questions = [q.strip() for q in df["question"]]
+context = [q.strip() for q in df["context"]]
+inputs = tokenizer(
+    questions,
+    context,
+    max_length=384,
+    truncation="only_second",
+    return_offsets_mapping=True,
+    padding="max_length",
+)
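return_offsets_mapping=True makes the (fast) tokenizer return each token's (start_char, end_char) span in the original string, and truncation="only_second" truncates only the context (the second sequence), never the question. A minimal sketch of what the mapping looks like (token boundaries are illustrative):

enc = tokenizer("Who led it?", "Shackleton led three expeditions.", return_offsets_mapping=True)
print(enc["offset_mapping"])
# e.g. [(0, 0), (0, 3), (4, 7), (8, 10), (10, 11), (0, 0), (0, 5), (5, 10), ...]
# (0, 0) marks special tokens such as [CLS] and [SEP]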
+
+offset_mapping = inputs.pop("offset_mapping")
+
+start_positions = []
+end_positions = []
+answers = df['answers']
+for i, offset in enumerate(offset_mapping):
+    answer = answers[i]
+    start_char = answer["answer_start"][0]
+    end_char = answer["answer_start"][0] + len(answer["text"][0])
+    sequence_ids = inputs.sequence_ids(i)
+
+    # Find the start and end of the context
+    idx = 0
+    while sequence_ids[idx] != 1:
+        idx += 1
+    context_start = idx
+    while sequence_ids[idx] == 1:
+        idx += 1
+    context_end = idx - 1
+
+    # If the answer is not fully inside the context, label it (0, 0)
+    if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
+        start_positions.append(0)
+        end_positions.append(0)
+    else:
+        # Otherwise it's the start and end token positions
+        idx = context_start
+        while idx <= context_end and offset[idx][0] <= start_char:
+            idx += 1
+        start_positions.append(idx - 1)
+
+        idx = context_end
+        while idx >= context_start and offset[idx][1] >= end_char:
+            idx -= 1
+        end_positions.append(idx + 1)
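sequence_ids(i) labels every position of the i-th encoded pair: None for special tokens, 0 for tokens from the question, and 1 for tokens from the context, which is what the two scanning loops above rely on to find the context boundaries:

print(inputs.sequence_ids(0))
# e.g. [None, 0, 0, 0, None, 1, 1, 1, ..., 1, None, None, ...]  (trailing None for padding)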
+
+df["start_positions"] = start_positions
+df["end_positions"] = end_positions
+
+from datasets import Dataset
+data = {'input_ids': inputs['input_ids'],
+        'attention_mask': inputs['attention_mask'],
+        'start_positions': start_positions,
+        'end_positions': end_positions,
+        }
+df = pd.DataFrame(data)
+df.to_csv('encoding_train.csv', index=False)
+train = Dataset.from_pandas(df)
+
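Dataset.from_pandas keeps exactly the four columns the Trainer consumes for extractive QA. With 5,000 examples split 80/20, the result should look roughly like:

print(train)
# Dataset({
#     features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
#     num_rows: 4000
# })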
+
+# Convert the test split to a dictionary
+data_dict = squad["test"].to_dict()
+# Create a DataFrame from the dictionary
+df = pd.DataFrame.from_dict(data_dict)
+
+questions = [q.strip() for q in df["question"]]
+context = [q.strip() for q in df["context"]]
+inputs = tokenizer(
+    questions,
+    context,
+    max_length=384,
+    truncation="only_second",
+    return_offsets_mapping=True,
+    padding="max_length",
+)
+
+offset_mapping = inputs.pop("offset_mapping")
+
+start_positions = []
+end_positions = []
+answers = df['answers']
+for i, offset in enumerate(offset_mapping):
+    answer = answers[i]
+    start_char = answer["answer_start"][0]
+    end_char = answer["answer_start"][0] + len(answer["text"][0])
+    sequence_ids = inputs.sequence_ids(i)
+
+    # Find the start and end of the context
+    idx = 0
+    while sequence_ids[idx] != 1:
+        idx += 1
+    context_start = idx
+    while sequence_ids[idx] == 1:
+        idx += 1
+    context_end = idx - 1
+
+    # If the answer is not fully inside the context, label it (0, 0)
+    if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
+        start_positions.append(0)
+        end_positions.append(0)
+    else:
+        # Otherwise it's the start and end token positions
+        idx = context_start
+        while idx <= context_end and offset[idx][0] <= start_char:
+            idx += 1
+        start_positions.append(idx - 1)
+
+        idx = context_end
+        while idx >= context_start and offset[idx][1] >= end_char:
+            idx -= 1
+        end_positions.append(idx + 1)
+
+df["start_positions"] = start_positions
+df["end_positions"] = end_positions
+
+data = {'input_ids': inputs['input_ids'],
+        'attention_mask': inputs['attention_mask'],
+        'start_positions': start_positions,
+        'end_positions': end_positions,
+        }
+df = pd.DataFrame(data)
+df.to_csv('encoding_test.csv', index=False)
+test = Dataset.from_pandas(df)
+
+from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
+model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+
+from transformers import DefaultDataCollator
+data_collator = DefaultDataCollator()
+
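distilbert-base-uncased is a base checkpoint, so AutoModelForQuestionAnswering attaches a freshly initialized span-classification head (qa_outputs) on top of it; transformers warns that those weights are newly initialized, which is expected here since they get trained below. The head is just a linear layer over the hidden states:

print(model.qa_outputs)
# Linear(in_features=768, out_features=2, bias=True)  — one logit each for answer start and end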
+training_args = TrainingArguments(
+    output_dir="my_awesome_qa_model",
+    evaluation_strategy="epoch",
+    learning_rate=2e-5,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=16,
+    num_train_epochs=4,
+    weight_decay=0.01,
+    push_to_hub=True,
+)
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train,
+    eval_dataset=test,
+    tokenizer=tokenizer,
+    data_collator=data_collator,
+)
+
+trainer.train()
+
+# Evaluate the model on the test dataset
+eval_results = trainer.evaluate(eval_dataset=test)
+
+# Print evaluation results
+print("Evaluation results:", eval_results)
+
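Because no compute_metrics was passed to the Trainer, eval_results contains only the evaluation loss plus runtime statistics. Exact-match and F1 would come from the SQuAD metric in the evaluate library; a minimal sketch with hand-made predictions (decoding the model's start/end logits into prediction_text is a separate post-processing step not shown here):

import evaluate
squad_metric = evaluate.load("squad")
results = squad_metric.compute(
    predictions=[{"id": "56e10a3be3433e1400422b22", "prediction_text": "1976"}],
    references=[{"id": "56e10a3be3433e1400422b22",
                 "answers": {"text": ["1976"], "answer_start": [97]}}],
)
print(results)  # {'exact_match': 100.0, 'f1': 100.0}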
+# The fine-tuned model from training above is held in the 'model' variable
+
+# Define the directory where you want to save the model
+output_dir = "path_to_save_model"
+
+# Create the directory if it doesn't exist
+import os
+os.makedirs(output_dir, exist_ok=True)
+
+# Save the model and tokenizer
+model.save_pretrained(output_dir)
+tokenizer.save_pretrained(output_dir)
+
+print("Model and tokenizer saved to:", output_dir)
+
+# Load the model back from the directory it was saved to
+model = AutoModelForQuestionAnswering.from_pretrained(output_dir)
+
+# Check the model configuration
+print(model.config)

from transformers import pipeline

+# Create a question answering pipeline
+qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
+
+# Now you can use the pipeline to answer questions
+question = "what does SQuAD stand for"
+context = "Ernest Shackleton (15 February 1874 – 5 January 1922) led three British expeditions to the Antarctic during the Heroic Age of Antarctic Exploration. He and three companions established a new record Farthest South latitude, 112 miles (180 km) from the South Pole, as part of the Nimrod Expedition of 1907–1909, and Shackleton was knighted on his return home. He planned the Imperial Trans-Antarctic Expedition of 1914–1917 but his ship, Endurance, became trapped in pack ice and then sank on 21 November 1915. The crew escaped and used the lifeboats to reach Elephant Island and ultimately the island of South Georgia in a stormy ocean voyage of more than 700 nautical miles (800 mi; 1,300 km), Shackleton's most famous exploit. He returned to the Antarctic in 1921 with the Shackleton–Rowett Expedition, but died of a heart attack on South Georgia; at his wife's request, he was buried there. In the latter part of the 20th century, Shackleton became a role model for leadership in extreme circumstance. Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable."
answer = qa_pipeline(question=question, context=context)

print("Question:", question)
+print("Answer:", answer["answer"])
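For reference, the question-answering pipeline returns a dict with the best answer span, its character offsets into the context, and a confidence score (the numbers here are illustrative):

print(answer)
# e.g. {'score': 0.97, 'start': 997, 'end': 1032, 'answer': 'Stanford Question Answering Dataset'}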

import gradio as gr