"""Gradio app that generates question-answer pairs from an uploaded PDF.

The PDF text is split into sentence-like fragments, each fragment is passed
through a seq2seq question/answer generation model, and the resulting pairs
are shown in a textbox and written to a CSV file offered for download.
"""

import os
import re

import fitz
import gradio as gr
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

MODEL_NAME = "potsawee/t5-large-generation-squad-QuestionAnswer"
CSV_PATH = "QAPairs.csv"
# Upper bound on encoded input length, so that text without sentence
# punctuation cannot turn into one arbitrarily long model input.
MAX_INPUT_TOKENS = 512
MAX_OUTPUT_TOKENS = 100
# Zero-width split point right after '.', '!' or '?'; the punctuation stays
# attached to the preceding fragment.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])")

# Loaded once at import time so every request reuses the same weights.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)


def extract_text_from_pdf(pdf_file_path):
    """Return the concatenated text of every page of the PDF at *pdf_file_path*."""
    # The context manager closes the document even if text extraction fails.
    with fitz.open(pdf_file_path) as doc:
        return "".join(page.get_text() for page in doc)


def _split_sentences(text):
    """Split *text* after sentence-ending punctuation, dropping blank fragments."""
    return [part for part in _SENTENCE_BOUNDARY.split(text) if part.strip()]


def _split_question_answer(generated):
    """Split generated text at its first '?' into ``(question, answer)``.

    Returns ``None`` when the text contains no '?'. Everything after the first
    '?' is kept as the answer, so an answer that itself contains '?' is not
    cut short.
    """
    question, mark, answer = generated.partition("?")
    if not mark:
        return None
    return question + mark, answer.strip()


def generate_question_answer_pairs(pdf_file):
    """Generate question-answer pairs from an uploaded PDF.

    Args:
        pdf_file: The uploaded file. Either an object exposing the file path
            as ``.name`` or a plain path string; ``None`` when nothing was
            uploaded.

    Returns:
        A ``(text, csv_path)`` tuple: the formatted pairs and the path of the
        CSV file they were written to. When no file was uploaded, the text is
        a prompt to upload one and ``csv_path`` is ``None``.
    """
    if pdf_file is None:
        # The interface declares two outputs, so both must always be returned.
        return "Please upload a PDF file", None

    pdf_path = getattr(pdf_file, "name", pdf_file)
    pdf_text = extract_text_from_pdf(pdf_path)

    pairs = []
    for sentence in _split_sentences(pdf_text):
        input_ids = tokenizer.encode(
            sentence,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_INPUT_TOKENS,
        )
        outputs = model.generate(
            input_ids,
            max_length=MAX_OUTPUT_TOKENS,
            num_return_sequences=1,
        )
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        pair = _split_question_answer(generated)
        if pair is not None:
            pairs.append(pair)

    # Build the frame once instead of concatenating a one-row frame per pair.
    df = pd.DataFrame(pairs, columns=["Question", "Answer"])
    df.to_csv(CSV_PATH)

    result = "".join(
        f"Question: {question}\nAnswer: {answer}\n\n"
        for question, answer in pairs
    )
    return result, CSV_PATH


title = "Question-Answer Pairs Generation"
input_file = gr.File(label="Upload a PDF file")
output_file = gr.File(label="Download as csv")
output_text = gr.Textbox()

interface = gr.Interface(
    fn=generate_question_answer_pairs,
    inputs=input_file,
    outputs=[output_text, output_file],
    title=title,
)
interface.launch()