Spaces:

DpShirazi
/

PupQuizAI

Sleeping

App Files Files Community

DpShirazi committed on Dec 26, 2023

Commit

8c07c55

1 Parent(s): 1dd973d

Upload 3 files

Browse files

Files changed (3) hide show

app.py +123 -0
questiongenerator.py +429 -0
requirements.txt +10 -0

app.py ADDED Viewed

	@@ -0,0 +1,123 @@

+# -*- coding: utf-8 -*-
+"""
+Created on Mon Dec 25 18:18:27 2023
+@author: alish
+"""
+import gradio as gr
+import fitz  # PyMuPDF
+import questiongenerator as qs
+import random
+from questiongenerator import QuestionGenerator
+qg = QuestionGenerator()
def Extract_QA(qlist):
    """Format the first generated question and its answer key for display.

    Args:
        qlist: List of QA dicts as returned by ``QuestionGenerator.generate``.
            Only the first entry is used. Its ``"answer"`` is expected to be
            a list of ``{"answer": str, "correct": bool}`` choices; a plain
            string answer is also accepted.

    Returns:
        A ``(Q, A)`` tuple of strings: the question with its lettered choices,
        and a line naming the letter of the correct choice.
    """
    # The generator returns an empty list when it finds nothing to ask about.
    if not qlist:
        return ("No question could be generated from the selected text.", "")
    first = qlist[0]
    answer = first['answer']
    pad = " " * 13
    tail = "\n" + " " * 12
    lines = [f"{pad}Q: {first['question']}"]
    # Full-sentence answers have no choices to letter.
    if isinstance(answer, str):
        Q = "\n" + "\n".join(lines) + tail
        A = f"\n            The right answer is: {answer}" + tail
        return (Q, A)
    # The generator may produce fewer than four choices for short texts, so
    # letter however many there are instead of indexing 0..3.
    labelled = list(zip("ABCDEFGHIJKLMNOPQRSTUVWXYZ", answer))
    lines.extend(f"{pad}{letter}. {choice['answer']}" for letter, choice in labelled)
    Q = "\n" + "\n".join(lines) + tail
    correct = [letter for letter, choice in labelled if choice['correct']]
    if correct:
        A = f"\n            The right answer is: {correct[0]}" + tail
    else:
        A = "\n            No choice was marked as correct." + tail
    return (Q, A)
def extract_text_from_pdf(pdf_file_path):
    """Read a PDF, split its text into segments and store them globally.

    The segments are stored in the module-level ``extracted_text`` so that
    later button callbacks can pick one of them.

    Args:
        pdf_file_path: Path of the PDF selected in the UI, or a falsy value
            when nothing has been selected yet.

    Returns:
        A status message for the "File Status" textbox.
    """
    global extracted_text
    # The upload button can be clicked before any file has been chosen.
    if not pdf_file_path:
        return "No pdf file was selected. Please select a file first."
    with fitz.open(pdf_file_path) as doc:
        pages = [page.get_text() for page in doc]
    extracted_text = get_sub_text('\n'.join(pages))
    return "The pdf is uploaded Successfully from:" + str(pdf_file_path)
# The module-level ``qg`` created right after the imports is reused by the functions
# below; building a second QuestionGenerator here would load both models again.
def get_sub_text(TXT):
    """Split raw text into segments, always returning a list.

    Delegates to the module-level generator's ``_split_into_segments``; a
    non-list result is wrapped in a single-element list.
    """
    segments = qg._split_into_segments(TXT)
    return segments if isinstance(segments, list) else [segments]
def pick_One_txt(sub_texts):
    """Pick one text segment at random and remember it globally.

    The choice is stored in the module-level ``selected_extracted_text``.

    Args:
        sub_texts: Non-empty list of text segments.

    Returns:
        The selected segment.

    Raises:
        IndexError: If ``sub_texts`` is empty.
    """
    global selected_extracted_text
    if not sub_texts:
        raise IndexError(
            "No text segments available; upload a pdf with extractable text first."
        )
    # random.choice always returns a valid element, unlike
    # int(random.uniform(0, N)), which can land on N itself.
    selected_extracted_text = random.choice(sub_texts)
    return selected_extracted_text
def pipeline():
    """Generate one multiple-choice question from the currently selected text.

    Reads the module-level ``selected_extracted_text`` and stores the results
    in the globals ``Q`` (question with choices) and ``A`` (answer line
    followed by the source text).

    Returns:
        The ``(Q, A)`` tuple.
    """
    global Q, A
    source_text = selected_extracted_text
    generated = qg.generate(
        source_text,
        num_questions=1,
        answer_style="multiple_choice",
    )
    Q, answer_line = Extract_QA(generated)
    # Show the passage the question was drawn from underneath the answer.
    A = answer_line + '\n' + source_text
    return (Q, A)
def ReurnAnswer():
    """Return the answer text for the most recently generated question.

    Returns:
        The module-level ``A`` set by ``pipeline``, or a hint message when no
        question has been generated yet.
    """
    # ``A`` only exists after a question has been generated; looking it up
    # through globals() avoids a NameError when this is clicked first.
    return globals().get("A", "Please generate a question first.")
def GetQuestion():
    """Pick a random segment of the uploaded text and generate a question.

    Returns:
        The formatted question, or a hint message when no PDF text has been
        extracted yet.
    """
    # ``extracted_text`` only exists after a PDF has been processed.
    segments = globals().get("extracted_text")
    if not segments:
        return "Please upload a pdf file with readable text first."
    pick_One_txt(segments)
    question_text, _ = pipeline()
    return question_text
+with gr.Blocks() as demo:  # Build the Gradio UI; `demo` is launched on the last line of the script.
+    with gr.Row():  # First row: the file picker.
+        #input_file=gr.File(type="filepath", label="Upload PDF Document")
+        input_file=gr.UploadButton(label='Select a file!', file_types=[".pdf"])  # Picker restricted to .pdf files.
+        #upload_btn = gr.Button(value="Upload File")
+        #txt= extract_text_from_pdf(input_file)
+    with gr.Row():  # Second row: one column of buttons, one column of text outputs.
+        with gr.Column():  # Action buttons, wired to their handlers below.
+            upload_btn = gr.Button(value="Upload the pdf File.")
+            Gen_Question = gr.Button(value="Show the Question")
+            Gen_Answer = gr.Button(value="Show the Answer")
+        with gr.Column():  # Output textboxes, one per button.
+            file_stat= gr.Textbox(label="File Status")
+            question = gr.Textbox(label="Question(s)")
+            Answer = gr.Textbox(label="Answer(s)")
+    upload_btn.click(extract_text_from_pdf, inputs=input_file, outputs=file_stat, api_name="QuestioGenerator")  # Extract and segment the selected PDF; the status message goes to `file_stat`.
+    Gen_Question.click(GetQuestion, inputs=None, outputs=question, api_name="QuestioGenerator")  # NOTE(review): same api_name as the handler above - confirm how Gradio resolves duplicates.
+    Gen_Answer.click(ReurnAnswer, inputs=None, outputs=Answer, api_name="QuestioGenerator")  # Shows the answer stored by the last generated question.
+    #examples = gr.Examples(examples=["I went to the supermarket yesterday.", "Helen is a good swimmer."],
+    #                       inputs=[english])
+demo.launch()  # Runs at import time; there is no __main__ guard.

questiongenerator.py ADDED Viewed

	@@ -0,0 +1,429 @@

+import en_core_web_sm
+import json
+import numpy as np
+import random
+import re
+import torch
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSeq2SeqLM,
+    AutoModelForSequenceClassification,
+)
+from typing import Any, List, Mapping, Tuple
class QuestionGenerator:
    """A transformer-based NLP system for generating reading comprehension-style questions from
    texts. It can generate full sentence questions, multiple choice questions, or a mix of the
    two styles.

    To filter out low quality questions, questions are assigned a score and ranked once they have
    been generated. Only the top k questions will be returned. This behaviour can be turned off
    by setting use_evaluator=False.
    """

    def __init__(self) -> None:
        """Load the question-generation model, its tokenizer and the QA evaluator."""
        QG_PRETRAINED = "iarfmoose/t5-base-question-generator"
        self.ANSWER_TOKEN = "<answer>"
        self.CONTEXT_TOKEN = "<context>"
        self.SEQ_LENGTH = 512
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.qg_tokenizer = AutoTokenizer.from_pretrained(
            QG_PRETRAINED, use_fast=False)
        self.qg_model = AutoModelForSeq2SeqLM.from_pretrained(QG_PRETRAINED)
        self.qg_model.to(self.device)
        self.qg_model.eval()
        self.qa_evaluator = QAEvaluator()
        # spaCy pipeline used for multiple-choice questions; loaded on first use.
        self._spacy_nlp = None

    def generate(
        self,
        article: str,
        use_evaluator: bool = True,
        num_questions: int = None,
        answer_style: str = "all"
    ) -> List:
        """Takes an article and generates a set of question and answer pairs.

        Args:
            article: The text to generate questions from.
            use_evaluator: If True, QA pairs are ranked and filtered by quality.
            num_questions: Maximum number of ranked pairs to return. When falsy,
                the ranking step's own default is used. Ignored when
                use_evaluator is False.
            answer_style: One of "all", "sentences" or "multiple_choice".

        Returns:
            A list of {"question": ..., "answer": ...} dicts. It may be empty
            when nothing could be extracted from the text.

        Raises:
            ValueError: If answer_style is not a valid style.
        """
        print("Generating questions...\n")
        qg_inputs, qg_answers = self.generate_qg_inputs(article, answer_style)
        generated_questions = self.generate_questions_from_inputs(qg_inputs)
        message = "{} questions doesn't match {} answers".format(
            len(generated_questions), len(qg_answers)
        )
        assert len(generated_questions) == len(qg_answers), message
        if not use_evaluator:
            print("Skipping evaluation step.\n")
            return self._get_all_qa_pairs(generated_questions, qg_answers)
        print("Evaluating QA pairs...\n")
        encoded_qa_pairs = self.qa_evaluator.encode_qa_pairs(
            generated_questions, qg_answers
        )
        scores = self.qa_evaluator.get_scores(encoded_qa_pairs)
        if num_questions:
            return self._get_ranked_qa_pairs(
                generated_questions, qg_answers, scores, num_questions
            )
        return self._get_ranked_qa_pairs(generated_questions, qg_answers, scores)

    def generate_qg_inputs(self, text: str, answer_style: str) -> Tuple[List[str], List[str]]:
        """Given a text, returns a list of model inputs and a list of corresponding answers.

        Model inputs take the form "answer_token <answer text> context_token <context text>" where
        the answer is a string extracted from the text, and the context is the wider text
        surrounding the answer.

        Raises:
            ValueError: If answer_style is not "all", "sentences" or "multiple_choice".
        """
        VALID_ANSWER_STYLES = ["all", "sentences", "multiple_choice"]
        if answer_style not in VALID_ANSWER_STYLES:
            raise ValueError(
                "Invalid answer style {}. Please choose from {}".format(
                    answer_style, VALID_ANSWER_STYLES
                )
            )
        inputs = []
        answers = []
        if answer_style in ("sentences", "all"):
            for segment in self._split_into_segments(text):
                sentences = self._split_text(segment)
                prepped_inputs, prepped_answers = self._prepare_qg_inputs(
                    sentences, segment
                )
                inputs.extend(prepped_inputs)
                answers.extend(prepped_answers)
        if answer_style in ("multiple_choice", "all"):
            sentences = self._split_text(text)
            prepped_inputs, prepped_answers = self._prepare_qg_inputs_MC(
                sentences
            )
            inputs.extend(prepped_inputs)
            answers.extend(prepped_answers)
        return inputs, answers

    def generate_questions_from_inputs(self, qg_inputs: List) -> List[str]:
        """Given a list of concatenated answers and contexts, with the form:
        "answer_token <answer text> context_token <context text>", generates a list of
        questions (one per input, in the same order).
        """
        return [self._generate_question(qg_input) for qg_input in qg_inputs]

    def _split_text(self, text: str) -> List[str]:
        """Splits the text into sentences, and also adds fragments of long sentences.

        Sentences longer than MAX_SENTENCE_LEN characters are additionally split on
        ",", ";", ":" and ")"; fragments of more than five words are kept alongside
        the full sentences. Duplicates are removed and first-seen order is preserved.
        """
        MAX_SENTENCE_LEN = 128
        sentences = re.findall(r".*?[.!?]", text)
        fragments = []
        for sentence in sentences:
            if len(sentence) > MAX_SENTENCE_LEN:
                fragments.extend(re.split(r"[,;:)]", sentence))
        # Remove useless short fragments. The filter has to run over the
        # fragments; filtering the full sentences would throw the fragments away.
        fragments = [f for f in fragments if len(f.split()) > 5]
        stripped = (s.strip(" ") for s in sentences + fragments)
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(stripped))

    def _split_into_segments(self, text: str) -> List[str]:
        """Splits a long text into segments short enough to be input into the transformer network.
        Segments are used as context for question generation.

        Non-empty lines are tokenized and appended to the current segment until it
        holds at least MAX_TOKENS tokens, so a segment may exceed MAX_TOKENS by up to
        one paragraph.
        """
        MAX_TOKENS = 490
        tokenized_paragraphs = [
            self.qg_tokenizer(p)["input_ids"] for p in text.split("\n") if len(p) > 0
        ]
        segments = []
        current = []
        # Single pass instead of repeated list.pop(0), which is O(n) per call.
        for paragraph in tokenized_paragraphs:
            current.extend(paragraph)
            if len(current) >= MAX_TOKENS:
                segments.append(current)
                current = []
        if current:
            segments.append(current)
        return [self.qg_tokenizer.decode(s, skip_special_tokens=True) for s in segments]

    def _prepare_qg_inputs(
        self,
        sentences: List[str],
        text: str
    ) -> Tuple[List[str], List[str]]:
        """Uses sentences as answers and the text as context. Returns a tuple of (model inputs, answers).
        Model inputs are "answer_token <answer text> context_token <context text>"
        """
        inputs = [
            f"{self.ANSWER_TOKEN} {sentence} {self.CONTEXT_TOKEN} {text}"
            for sentence in sentences
        ]
        return inputs, list(sentences)

    def _prepare_qg_inputs_MC(self, sentences: List[str]) -> Tuple[List[str], List[str]]:
        """Performs NER on the text, and uses extracted entities as candidate answers for
        multiple-choice questions. Sentences are used as context, and entities as answers.
        Returns a tuple of (model inputs, answers).
        Model inputs are "answer_token <answer text> context_token <context text>"
        """
        # Loading the spaCy model is expensive, so do it once per instance.
        spacy_nlp = getattr(self, "_spacy_nlp", None)
        if spacy_nlp is None:
            spacy_nlp = self._spacy_nlp = en_core_web_sm.load()
        docs = list(spacy_nlp.pipe(sentences, disable=["parser"]))
        inputs_from_text = []
        answers_from_text = []
        for doc, sentence in zip(docs, sentences):
            for entity in doc.ents:
                qg_input = f"{self.ANSWER_TOKEN} {entity} {self.CONTEXT_TOKEN} {sentence}"
                inputs_from_text.append(qg_input)
                answers_from_text.append(self._get_MC_answers(entity, docs))
        return inputs_from_text, answers_from_text

    def _get_MC_answers(self, correct_answer: Any, docs: Any) -> List[Mapping[str, Any]]:
        """Finds a set of alternative answers for a multiple-choice question. Will attempt to find
        alternatives of the same entity type as correct_answer if possible.

        Args:
            correct_answer: Entity with ``text`` and ``label_`` attributes.
            docs: Iterable of processed documents exposing ``ents``.

        Returns:
            A shuffled list of up to four {"answer": str, "correct": bool} dicts,
            exactly one of which is marked correct.
        """
        correct_label = correct_answer.label_
        # Map each distinct distractor text to whether it ever appears with the
        # same label as the correct answer. Comparing labels directly avoids
        # false matches from substring tests, and keying on the text prevents a
        # distractor that reads the same as the correct answer.
        candidates = {}
        for doc in docs:
            for entity in doc.ents:
                if entity.text == correct_answer.text:
                    continue
                same_label = entity.label_ == correct_label
                candidates[entity.text] = candidates.get(entity.text, False) or same_label
        same_type = [text for text, same in candidates.items() if same]
        other_type = [text for text, same in candidates.items() if not same]
        num_choices = min(3, len(candidates))
        # random.sample needs a sequence; sets are rejected on Python 3.11+.
        if len(same_type) >= num_choices:
            picked = random.sample(same_type, num_choices)
        else:
            # Not enough same-type entities: top up with random other entities.
            picked = same_type + random.sample(other_type, num_choices - len(same_type))
        final_choices = [{"answer": correct_answer.text, "correct": True}]
        final_choices.extend({"answer": text, "correct": False} for text in picked)
        random.shuffle(final_choices)
        return final_choices

    @torch.no_grad()
    def _generate_question(self, qg_input: str) -> str:
        """Takes qg_input which is the concatenated answer and context, and uses it to generate
        a question sentence. The generated question is decoded and then returned.
        """
        encoded_input = self._encode_qg_input(qg_input)
        output = self.qg_model.generate(input_ids=encoded_input["input_ids"])
        return self.qg_tokenizer.decode(output[0], skip_special_tokens=True)

    def _encode_qg_input(self, qg_input: str) -> Any:
        """Tokenizes a string, padded/truncated to SEQ_LENGTH, and returns the tokenizer
        output (moved to ``self.device``) whose "input_ids" hold the vocab indices.
        """
        return self.qg_tokenizer(
            qg_input,
            padding='max_length',
            max_length=self.SEQ_LENGTH,
            truncation=True,
            return_tensors="pt",
        ).to(self.device)

    def _get_ranked_qa_pairs(
        self, generated_questions: List[str], qg_answers: List[str], scores, num_questions: int = 10
    ) -> List[Mapping[str, str]]:
        """Ranks generated questions according to scores, and returns the top num_questions examples.

        ``scores`` is the list of question indices ordered best-first, as produced by
        ``QAEvaluator.get_scores``.
        """
        if num_questions > len(scores):
            num_questions = len(scores)
            # Print one message rather than the repr of a tuple.
            print(
                f"\nWas only able to generate {num_questions} questions. "
                "For more questions, please input a longer text."
            )
        return [
            {
                # Keep only the text up to the first question mark.
                "question": generated_questions[index].split("?")[0] + "?",
                "answer": qg_answers[index],
            }
            for index in scores[:num_questions]
        ]

    def _get_all_qa_pairs(self, generated_questions: List[str], qg_answers: List[str]):
        """Formats question and answer pairs without ranking or filtering."""
        return [
            {"question": question.split("?")[0] + "?", "answer": answer}
            for question, answer in zip(generated_questions, qg_answers)
        ]
class QAEvaluator:
    """Wrapper for a transformer model which evaluates the quality of question-answer pairs.
    Given a QA pair, the model will generate a score. Scores can be used to rank and filter
    QA pairs.
    """

    def __init__(self) -> None:
        """Load the pretrained QA-evaluator model and its tokenizer."""
        QAE_PRETRAINED = "iarfmoose/bert-base-cased-qa-evaluator"
        self.SEQ_LENGTH = 512
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.qae_tokenizer = AutoTokenizer.from_pretrained(QAE_PRETRAINED)
        self.qae_model = AutoModelForSequenceClassification.from_pretrained(
            QAE_PRETRAINED
        )
        self.qae_model.to(self.device)
        self.qae_model.eval()

    def encode_qa_pairs(self, questions: List[str], answers: List[Any]) -> List[Any]:
        """Takes a list of questions and a list of answers and returns one encoded input per
        pair, moved to ``self.device``.
        """
        return [
            self._encode_qa(question, answer).to(self.device)
            for question, answer in zip(questions, answers)
        ]

    def get_scores(self, encoded_qa_pairs: List[Any]) -> List[int]:
        """Scores a list of encoded QA pairs.

        Returns:
            The indices of ``encoded_qa_pairs`` ordered from highest to lowest
            score (not the scores themselves). Ties keep their input order.
        """
        scores = [self._evaluate_qa(pair) for pair in encoded_qa_pairs]
        return sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    def _encode_qa(self, question: str, answer: Any) -> Any:
        """Tokenizes a question together with its (correct) answer.

        Args:
            question: The question text.
            answer: Either the answer text, or a list of multiple-choice dicts
                ({"answer": str, "correct": bool}) from which the correct one is used.

        Returns:
            The tokenizer output, padded/truncated to SEQ_LENGTH.

        Raises:
            ValueError: If a multiple-choice list has no choice marked correct.
        """
        if isinstance(answer, list):
            correct = [a["answer"] for a in answer if a["correct"]]
            if not correct:
                raise ValueError("Multiple-choice answer has no choice marked as correct.")
            # Use the last correct choice if several are marked.
            correct_answer = correct[-1]
        else:
            correct_answer = answer
        return self.qae_tokenizer(
            text=question,
            text_pair=correct_answer,
            padding="max_length",
            max_length=self.SEQ_LENGTH,
            truncation=True,
            return_tensors="pt",
        )

    @torch.no_grad()
    def _evaluate_qa(self, encoded_qa_pair: Any) -> float:
        """Takes an encoded QA pair and returns its score as a plain float."""
        output = self.qae_model(**encoded_qa_pair)
        # First output, first row, second column is used as the score.
        return float(output[0][0][1])
def print_qa(qa_list: List[Mapping[str, str]], show_answers: bool = True) -> None:
    """Formats and prints a list of generated questions and answers.

    Args:
        qa_list: List of {"question": ..., "answer": ...} dicts, where the answer
            is either a string or a list of {"answer": str, "correct": bool} choices.
        show_answers: If True, full-sentence answers are printed and the correct
            multiple-choice option is marked. Multiple-choice options are always listed.
    """
    for number, qa in enumerate(qa_list, start=1):
        # wider space for 2 digit q nums
        space = " " * (3 if number < 10 else 4)
        print(f"{number}) Q: {qa['question']}")
        answer = qa["answer"]
        # print a list of multiple choice answers
        if isinstance(answer, list):
            for position, choice in enumerate(answer, start=1):
                # Only the first option carries the "A:" label; the rest are aligned under it.
                prefix = f"{space}A: " if position == 1 else space + "   "
                line = f"{prefix}{position}. {choice['answer']}"
                if show_answers:
                    line += " " + ("(correct)" if choice["correct"] else "")
                print(line)
            print("")
        # print full sentence answers
        elif show_answers:
            print(f"{space}A: {answer}\n")

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+datasets==1.16.1
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
+numpy==1.22.0
+sentencepiece==0.1.96
+spacy
+tokenizers==0.10.3
+torch==1.7.1
+transformers==4.12.5
+gradio
+pymupdf