Jahanzeb1 committed on
Commit 8309726 · verified · 1 Parent(s): 7bf7d9e

Update app.py

Files changed (1)
  1. app.py +205 -18
app.py CHANGED
@@ -1,32 +1,219 @@
- from transformers import AutoModelForQuestionAnswering, AutoTokenizer
-
- # Model name or identifier from Hugging Face model hub
- model_name = "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad"
-
- # Load the pre-trained model and tokenizer
- model = AutoModelForQuestionAnswering.from_pretrained(model_name)
- tokenizer = AutoTokenizer.from_pretrained(model_name)
-
- # Save the model and tokenizer to a directory
- model.save_pretrained("model_directory")
- tokenizer.save_pretrained("model_directory")

  from transformers import pipeline

- # Load pre-trained question-answering model
- qa_pipeline = pipeline("question-answering", model="google-bert/bert-large-uncased-whole-word-masking-finetuned-squad", tokenizer="google-bert/bert-large-uncased-whole-word-masking-finetuned-squad", max_answer_length=6000, from_tf=True)
-
- # Define context and question
- context = "The Amazon rainforest (Portuguese: Floresta Amazônica or Amazônia; Spanish: Selva Amazónica, Amazonía or usually Amazonia; French: Forêt amazonienne; Dutch: Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes territory belonging to nine nations. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain 'Amazonas' in their names. The Amazon represents over half of the planet's remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual trees divided into 16,000 species."
- question = "What is it called in Portuguese"
-
- # Get answer
  answer = qa_pipeline(question=question, context=context)

- # Print answer
  print("Question:", question)
- print("Answer:", answer['answer'])

  import gradio as gr

+ from huggingface_hub import notebook_login
+ notebook_login()
+
+ from datasets import load_dataset
+ squad = load_dataset("squad", split="train[:5000]")
+ squad = squad.train_test_split(test_size=0.2)
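+ # Note: only the first 5,000 SQuAD training examples are loaded to keep
+ # fine-tuning quick; train_test_split(test_size=0.2) holds out 20% of that
+ # subset for evaluation instead of using SQuAD's own validation split.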
+
+ import pandas as pd
+
+ # Convert the train split to a dictionary and build a DataFrame from it
+ data_dict = squad["train"].to_dict()
+ df = pd.DataFrame.from_dict(data_dict)
+
+ from transformers import AutoTokenizer
+ tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+
+ questions = [q.strip() for q in df["question"]]
+ context = [c.strip() for c in df["context"]]
+ inputs = tokenizer(
+     questions,
+     context,
+     max_length=384,
+     truncation="only_second",
+     return_offsets_mapping=True,
+     padding="max_length",
+ )
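+
+ # truncation="only_second" truncates only the context (the second sequence),
+ # never the question, and return_offsets_mapping=True returns character
+ # offsets for each token, which the labeling loop below relies on.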
+
+ offset_mapping = inputs.pop("offset_mapping")
+
+ start_positions = []
+ end_positions = []
+ answers = df["answers"]
+ for i, offset in enumerate(offset_mapping):
+     answer = answers[i]
+     start_char = answer["answer_start"][0]
+     end_char = answer["answer_start"][0] + len(answer["text"][0])
+     sequence_ids = inputs.sequence_ids(i)
+
+     # Find the start and end of the context
+     idx = 0
+     while sequence_ids[idx] != 1:
+         idx += 1
+     context_start = idx
+     while sequence_ids[idx] == 1:
+         idx += 1
+     context_end = idx - 1
+
+     # If the answer is not fully inside the context, label it (0, 0)
+     if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
+         start_positions.append(0)
+         end_positions.append(0)
+     else:
+         # Otherwise it's the start and end token positions
+         idx = context_start
+         while idx <= context_end and offset[idx][0] <= start_char:
+             idx += 1
+         start_positions.append(idx - 1)
+
+         idx = context_end
+         while idx >= context_start and offset[idx][1] >= end_char:
+             idx -= 1
+         end_positions.append(idx + 1)
+
+ df["start_positions"] = start_positions
+ df["end_positions"] = end_positions
+
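+ # start_positions/end_positions now hold token indices rather than character
+ # offsets; examples whose answer was truncated away are labeled (0, 0),
+ # which points at the [CLS] token.
+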
+ from datasets import Dataset
+ data = {
+     "input_ids": inputs["input_ids"],
+     "attention_mask": inputs["attention_mask"],
+     "start_positions": start_positions,
+     "end_positions": end_positions,
+ }
+ df = pd.DataFrame(data)
+ df.to_csv("encoding_train.csv", index=False)
+ train = Dataset.from_pandas(df)
+
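+ # Writing encoding_train.csv is optional; it just keeps a copy of the
+ # encoded features on disk before wrapping them in a datasets.Dataset.
+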
+ # Repeat the same preprocessing for the test split
+ data_dict = squad["test"].to_dict()
+ df = pd.DataFrame.from_dict(data_dict)
+
+ questions = [q.strip() for q in df["question"]]
+ context = [c.strip() for c in df["context"]]
+ inputs = tokenizer(
+     questions,
+     context,
+     max_length=384,
+     truncation="only_second",
+     return_offsets_mapping=True,
+     padding="max_length",
+ )
+
+ offset_mapping = inputs.pop("offset_mapping")
+
+ start_positions = []
+ end_positions = []
+ answers = df["answers"]
+ for i, offset in enumerate(offset_mapping):
+     answer = answers[i]
+     start_char = answer["answer_start"][0]
+     end_char = answer["answer_start"][0] + len(answer["text"][0])
+     sequence_ids = inputs.sequence_ids(i)
+
+     # Find the start and end of the context
+     idx = 0
+     while sequence_ids[idx] != 1:
+         idx += 1
+     context_start = idx
+     while sequence_ids[idx] == 1:
+         idx += 1
+     context_end = idx - 1
+
+     # If the answer is not fully inside the context, label it (0, 0)
+     if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
+         start_positions.append(0)
+         end_positions.append(0)
+     else:
+         # Otherwise it's the start and end token positions
+         idx = context_start
+         while idx <= context_end and offset[idx][0] <= start_char:
+             idx += 1
+         start_positions.append(idx - 1)
+
+         idx = context_end
+         while idx >= context_start and offset[idx][1] >= end_char:
+             idx -= 1
+         end_positions.append(idx + 1)
+
+ df["start_positions"] = start_positions
+ df["end_positions"] = end_positions
+
+ data = {
+     "input_ids": inputs["input_ids"],
+     "attention_mask": inputs["attention_mask"],
+     "start_positions": start_positions,
+     "end_positions": end_positions,
+ }
+ df = pd.DataFrame(data)
+ df.to_csv("encoding_test.csv", index=False)
+ test = Dataset.from_pandas(df)
+
+ from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
+ model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+
+ from transformers import DefaultDataCollator
+ data_collator = DefaultDataCollator()
+
+ training_args = TrainingArguments(
+     output_dir="my_awesome_qa_model",
+     evaluation_strategy="epoch",
+     learning_rate=2e-5,
+     per_device_train_batch_size=16,
+     per_device_eval_batch_size=16,
+     num_train_epochs=4,
+     weight_decay=0.01,
+     push_to_hub=True,
+ )
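+
+ # push_to_hub=True uploads checkpoints to the Hugging Face Hub, which is
+ # why notebook_login() was called at the top of the script.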
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=train,
+     eval_dataset=test,
+     tokenizer=tokenizer,
+     data_collator=data_collator,
+ )
+
+ trainer.train()
+
+ # Evaluate the model on the test dataset and print the results
+ eval_results = trainer.evaluate(eval_dataset=test)
+ print("Evaluation results:", eval_results)
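+
+ # With no compute_metrics function supplied, Trainer.evaluate reports the
+ # evaluation loss and runtime statistics; exact-match/F1 are not computed.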
+
+ # Define the directory where the fine-tuned model will be saved
+ output_dir = "path_to_save_model"
+
+ # Create the directory if it doesn't exist, then save the model and tokenizer
+ import os
+ os.makedirs(output_dir, exist_ok=True)
+ model.save_pretrained(output_dir)
+ tokenizer.save_pretrained(output_dir)
+ print("Model and tokenizer saved to:", output_dir)
+
+ # Load the fine-tuned model back from the save directory
+ model = AutoModelForQuestionAnswering.from_pretrained(output_dir)
+
+ # Check the model configuration
+ print(model.config)
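+ # Printing model.config is a quick sanity check that the reloaded model
+ # kept the expected architecture settings (model type, hidden size, etc.).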

  from transformers import pipeline

+ # Create a question-answering pipeline from the fine-tuned model
+ qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
+
+ # Now you can use the pipeline to answer questions
+ question = "What does SQuAD stand for?"
+ context = "Ernest Shackleton (15 February 1874 – 5 January 1922) led three British expeditions to the Antarctic during the Heroic Age of Antarctic Exploration. He and three companions established a new record Farthest South latitude, 112 miles (180 km) from the South Pole, as part of the Nimrod Expedition of 1907–1909, and Shackleton was knighted on his return home. He planned the Imperial Trans-Antarctic Expedition of 1914–1917 but his ship, Endurance, became trapped in pack ice and then sank on 21 November 1915. The crew escaped and used the lifeboats to reach Elephant Island and ultimately the island of South Georgia in a stormy ocean voyage of more than 700 nautical miles (800 mi; 1,300 km), Shackleton's most famous exploit. He returned to the Antarctic in 1921 with the Shackleton–Rowett Expedition, but died of a heart attack on South Georgia; at his wife's request, he was buried there. In the latter part of the 20th century, Shackleton became a role model for leadership in extreme circumstance. Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable."
  answer = qa_pipeline(question=question, context=context)

  print("Question:", question)
+ print("Answer:", answer["answer"])

  import gradio as gr