Update app.py
app.py CHANGED
@@ -1,32 +1,219 @@
-from transformers import AutoModelForQuestionAnswering, AutoTokenizer
-
-model = AutoModelForQuestionAnswering.from_pretrained(model_name)
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-

from transformers import pipeline

-# Load pre-trained question-answering model
-qa_pipeline = pipeline("question-answering", model="google-bert/bert-large-uncased-whole-word-masking-finetuned-squad", tokenizer="google-bert/bert-large-uncased-whole-word-masking-finetuned-squad", max_answer_length=6000, from_tf=True)
-
-#
-question = "What is it called in Portuguese"
-
-#
answer = qa_pipeline(question=question, context=context)

-# Print answer
print("Question:", question)
-print("Answer:", answer[
-

import gradio as gr
+from huggingface_hub import notebook_login
+notebook_login()
+
+from datasets import load_dataset
+squad = load_dataset("squad", split="train[:5000]")
+squad = squad.train_test_split(test_size=0.2)
+
+import pandas as pd
+
+# Convert the training split to a dictionary
+data_dict = squad["train"].to_dict()
+# Create a DataFrame from the dictionary
+df = pd.DataFrame.from_dict(data_dict)
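Each SQuAD record pairs a question and a context with an answers dict holding the answer text and its character offset; that offset is what the start/end-position bookkeeping below depends on. Inspecting one record shows the shape (the exact record you get depends on the shuffled split, so the values here are illustrative):

print(squad["train"][0])
# {'id': '...', 'title': '...',
#  'context': 'Architecturally, the school has a Catholic character. ...',
#  'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
#  'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}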
+
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+
+questions = [q.strip() for q in df["question"]]
+context = [q.strip() for q in df["context"]]
+inputs = tokenizer(
+    questions,
+    context,
+    max_length=384,
+    truncation="only_second",
+    return_offsets_mapping=True,
+    padding="max_length",
+)
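return_offsets_mapping=True makes the (fast) tokenizer return each token's (start_char, end_char) span in the original string, and truncation="only_second" truncates only the context (the second sequence), never the question. A minimal sketch of what the mapping looks like (token boundaries are illustrative):

enc = tokenizer("Who led it?", "Shackleton led three expeditions.", return_offsets_mapping=True)
print(enc["offset_mapping"])
# e.g. [(0, 0), (0, 3), (4, 7), (8, 10), (10, 11), (0, 0), (0, 5), (5, 10), ...]
# (0, 0) marks special tokens such as [CLS] and [SEP]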
+
+offset_mapping = inputs.pop("offset_mapping")
+
+start_positions = []
+end_positions = []
+answers = df['answers']
+for i, offset in enumerate(offset_mapping):
+    answer = answers[i]
+    start_char = answer["answer_start"][0]
+    end_char = answer["answer_start"][0] + len(answer["text"][0])
+    sequence_ids = inputs.sequence_ids(i)
+
+    # Find the start and end of the context
+    idx = 0
+    while sequence_ids[idx] != 1:
+        idx += 1
+    context_start = idx
+    while sequence_ids[idx] == 1:
+        idx += 1
+    context_end = idx - 1
+
+    # If the answer is not fully inside the context, label it (0, 0)
+    if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
+        start_positions.append(0)
+        end_positions.append(0)
+    else:
+        # Otherwise it's the start and end token positions
+        idx = context_start
+        while idx <= context_end and offset[idx][0] <= start_char:
+            idx += 1
+        start_positions.append(idx - 1)
+
+        idx = context_end
+        while idx >= context_start and offset[idx][1] >= end_char:
+            idx -= 1
+        end_positions.append(idx + 1)
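sequence_ids(i) labels every position of the i-th encoded pair: None for special tokens, 0 for tokens from the question, and 1 for tokens from the context, which is what the two scanning loops above rely on to find the context boundaries:

print(inputs.sequence_ids(0))
# e.g. [None, 0, 0, 0, None, 1, 1, 1, ..., 1, None, None, ...]  (trailing None for padding)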
+
+df["start_positions"] = start_positions
+df["end_positions"] = end_positions
+
+from datasets import Dataset
+data = {'input_ids': inputs['input_ids'],
+        'attention_mask': inputs['attention_mask'],
+        'start_positions': start_positions,
+        'end_positions': end_positions,
+        }
+df = pd.DataFrame(data)
+df.to_csv('encoding_train.csv', index=False)
+train = Dataset.from_pandas(df)
+
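Dataset.from_pandas keeps exactly the four columns the Trainer consumes for extractive QA. With 5,000 examples split 80/20, the result should look roughly like:

print(train)
# Dataset({
#     features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
#     num_rows: 4000
# })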
+
+# Convert the test split to a dictionary
+data_dict = squad["test"].to_dict()
+# Create a DataFrame from the dictionary
+df = pd.DataFrame.from_dict(data_dict)
+
+questions = [q.strip() for q in df["question"]]
+context = [q.strip() for q in df["context"]]
+inputs = tokenizer(
+    questions,
+    context,
+    max_length=384,
+    truncation="only_second",
+    return_offsets_mapping=True,
+    padding="max_length",
+)
+
+offset_mapping = inputs.pop("offset_mapping")
+
+start_positions = []
+end_positions = []
+answers = df['answers']
+for i, offset in enumerate(offset_mapping):
+    answer = answers[i]
+    start_char = answer["answer_start"][0]
+    end_char = answer["answer_start"][0] + len(answer["text"][0])
+    sequence_ids = inputs.sequence_ids(i)
+
+    # Find the start and end of the context
+    idx = 0
+    while sequence_ids[idx] != 1:
+        idx += 1
+    context_start = idx
+    while sequence_ids[idx] == 1:
+        idx += 1
+    context_end = idx - 1
+
+    # If the answer is not fully inside the context, label it (0, 0)
+    if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
+        start_positions.append(0)
+        end_positions.append(0)
+    else:
+        # Otherwise it's the start and end token positions
+        idx = context_start
+        while idx <= context_end and offset[idx][0] <= start_char:
+            idx += 1
+        start_positions.append(idx - 1)
+
+        idx = context_end
+        while idx >= context_start and offset[idx][1] >= end_char:
+            idx -= 1
+        end_positions.append(idx + 1)
+
+df["start_positions"] = start_positions
+df["end_positions"] = end_positions
+
+data = {'input_ids': inputs['input_ids'],
+        'attention_mask': inputs['attention_mask'],
+        'start_positions': start_positions,
+        'end_positions': end_positions,
+        }
+df = pd.DataFrame(data)
+df.to_csv('encoding_test.csv', index=False)
+test = Dataset.from_pandas(df)
+
+from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
+model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+
+from transformers import DefaultDataCollator
+data_collator = DefaultDataCollator()
+
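distilbert-base-uncased is a base checkpoint, so AutoModelForQuestionAnswering attaches a freshly initialized span-classification head (qa_outputs) on top of it; transformers warns that those weights are newly initialized, which is expected here since they get trained below. The head is just a linear layer over the hidden states:

print(model.qa_outputs)
# Linear(in_features=768, out_features=2, bias=True)  — one logit each for answer start and end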
+training_args = TrainingArguments(
+    output_dir="my_awesome_qa_model",
+    evaluation_strategy="epoch",
+    learning_rate=2e-5,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=16,
+    num_train_epochs=4,
+    weight_decay=0.01,
+    push_to_hub=True,
+)
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train,
+    eval_dataset=test,
+    tokenizer=tokenizer,
+    data_collator=data_collator,
+)
+
+trainer.train()
+
+# Evaluate the model on the test dataset
+eval_results = trainer.evaluate(eval_dataset=test)
+
+# Print evaluation results
+print("Evaluation results:", eval_results)
+
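Because no compute_metrics was passed to the Trainer, eval_results contains only the evaluation loss plus runtime statistics. Exact-match and F1 would come from the SQuAD metric in the evaluate library; a minimal sketch with hand-made predictions (decoding the model's start/end logits into prediction_text is a separate post-processing step not shown here):

import evaluate
squad_metric = evaluate.load("squad")
results = squad_metric.compute(
    predictions=[{"id": "56e10a3be3433e1400422b22", "prediction_text": "1976"}],
    references=[{"id": "56e10a3be3433e1400422b22",
                 "answers": {"text": ["1976"], "answer_start": [97]}}],
)
print(results)  # {'exact_match': 100.0, 'f1': 100.0}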
+# The fine-tuned model from training above is held in the 'model' variable
+
+# Define the directory where you want to save the model
+output_dir = "path_to_save_model"
+
+# Create the directory if it doesn't exist
+import os
+os.makedirs(output_dir, exist_ok=True)
+
+# Save the model and tokenizer
+model.save_pretrained(output_dir)
+tokenizer.save_pretrained(output_dir)
+
+print("Model and tokenizer saved to:", output_dir)
+
+# Load the model back from the directory it was saved to
+model = AutoModelForQuestionAnswering.from_pretrained(output_dir)
+
+# Check the model configuration
+print(model.config)

from transformers import pipeline

+# Create a question answering pipeline
+qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
+
+# Now you can use the pipeline to answer questions
+question = "what does SQuAD stand for"
+context = "Ernest Shackleton (15 February 1874 – 5 January 1922) led three British expeditions to the Antarctic during the Heroic Age of Antarctic Exploration. He and three companions established a new record Farthest South latitude, 112 miles (180 km) from the South Pole, as part of the Nimrod Expedition of 1907–1909, and Shackleton was knighted on his return home. He planned the Imperial Trans-Antarctic Expedition of 1914–1917 but his ship, Endurance, became trapped in pack ice and then sank on 21 November 1915. The crew escaped and used the lifeboats to reach Elephant Island and ultimately the island of South Georgia in a stormy ocean voyage of more than 700 nautical miles (800 mi; 1,300 km), Shackleton's most famous exploit. He returned to the Antarctic in 1921 with the Shackleton–Rowett Expedition, but died of a heart attack on South Georgia; at his wife's request, he was buried there. In the latter part of the 20th century, Shackleton became a role model for leadership in extreme circumstance. Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable."
answer = qa_pipeline(question=question, context=context)

print("Question:", question)
+print("Answer:", answer["answer"])
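For reference, the question-answering pipeline returns a dict with the best answer span, its character offsets into the context, and a confidence score (the numbers here are illustrative):

print(answer)
# e.g. {'score': 0.97, 'start': 997, 'end': 1032, 'answer': 'Stanford Question Answering Dataset'}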

import gradio as gr