# Hugging Face Space (status when captured: Sleeping)
import functools
import json
import os

import gradio as gr
import PyPDF2
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# Extract the text of a PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: Anything ``PyPDF2.PdfReader`` accepts as its source
            (this script hands it the value produced by the Gradio file
            input).

    Returns:
        The text of all pages joined in page order, with no separator
        inserted between pages. Pages that yield no text contribute an
        empty string.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # `or ""` keeps the join safe if a page yields a falsy result (e.g. an
    # image-only page); joining once also avoids repeated string re-copying.
    return "".join(page.extract_text() or "" for page in reader.pages)
# Generate questions using a Hugging Face model
@functools.lru_cache(maxsize=1)
def _load_question_generator():
    """Load the question-generation tokenizer and model once per process."""
    tokenizer = AutoTokenizer.from_pretrained("valhalla/t5-base-qg-hl")
    model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-base-qg-hl")
    return tokenizer, model


def generate_questions(text):
    """Generate questions from ``text`` with the ``valhalla/t5-base-qg-hl`` model.

    The prompt is ``"generate questions: " + text``, truncated to at most
    512 tokens, and decoding uses beam search with 4 beams.

    Args:
        text: Source text to generate questions about.

    Returns:
        A list with one decoded string per sequence returned by the model.
        An empty list is returned, without loading the model, when ``text``
        is empty or only whitespace.
    """
    # Nothing to ask about: skip the (expensive) model load and generation.
    if not text or not text.strip():
        return []

    # The tokenizer/model pair is cached so repeated calls do not reload it.
    tokenizer, model = _load_question_generator()
    inputs = tokenizer.encode(
        "generate questions: " + text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    outputs = model.generate(inputs, max_length=512, num_beams=4, early_stopping=True)
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
# Answer questions using a question-answering pipeline
@functools.lru_cache(maxsize=1)
def _load_qa_pipeline():
    """Build the default question-answering pipeline once per process."""
    return pipeline("question-answering")


def answer_questions(context, questions):
    """Answer each question against ``context`` with a QA pipeline.

    Args:
        context: Text in which the answers are searched.
        questions: Iterable of question strings.

    Returns:
        A list of dicts, one per non-blank question, with the keys
        ``"question"``, ``"answer"`` (the pipeline's ``answer`` field) and
        ``"answer_start"`` (the pipeline's ``start`` field). The list is
        empty, and no pipeline is built, when ``context`` is blank or there
        is no non-blank question.
    """
    # Blank questions/context cannot produce a meaningful answer, and the
    # pipeline is expected to reject empty strings, so filter them up front.
    usable_questions = [q for q in questions if q and q.strip()]
    if not usable_questions or not context or not context.strip():
        return []

    # Cached so the pipeline (and its model) is not rebuilt on every call.
    qa_pipeline = _load_qa_pipeline()
    qas = []
    for question in usable_questions:
        answer = qa_pipeline(question=question, context=context)
        qas.append({
            "question": question,
            "answer": answer['answer'],
            "answer_start": answer['start'],
        })
    return qas
# Convert the QA pairs to the SQuAD format
def convert_to_squad_format(qas, context):
    """Turn question/answer dicts into SQuAD-style records.

    Args:
        qas: Sequence of dicts with the keys ``"question"``, ``"answer"``
            and ``"answer_start"``.
        context: The text all questions refer to; it is copied into every
            record.

    Returns:
        A list with one dict per input pair. Each record has the fixed title
        ``"Generated Data"``, the pair's position in ``qas`` as a string
        ``"id"``, and an ``"answers"`` dict holding single-element
        ``"answer_start"`` and ``"text"`` lists.
    """
    return [
        {
            "title": "Generated Data",
            "context": context,
            "question": qa['question'],
            "id": str(index),
            "answers": {
                "answer_start": [qa['answer_start']],
                "text": [qa['answer']],
            },
        }
        for index, qa in enumerate(qas)
    ]
# Save the data in SQuAD format
def save_to_json(data, file_name):
    """Write ``data`` to a UTF-8 JSON file and return the path used.

    Args:
        data: Any JSON-serialisable object.
        file_name: Destination path. ``".json"`` is appended unless the name
            already ends with that extension (compared case-insensitively,
            so ``"x.JSON"`` is not turned into ``"x.JSON.json"``).

    Returns:
        The path that was actually written, including the extension.
    """
    if not file_name.lower().endswith(".json"):
        file_name += ".json"
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    with open(file_name, "w", encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    return file_name
# Main function used by the Gradio interface
def process_pdf(pdf_file, file_name):
    """Build a SQuAD-style JSON dataset from an uploaded PDF.

    Runs the full pipeline: extract the PDF text, generate questions from
    it, answer them against the same text, convert the pairs to SQuAD-style
    records and save them as JSON.

    Args:
        pdf_file: The uploaded PDF as delivered by the Gradio file input.
        file_name: Desired output file name, typed by the user.

    Returns:
        The path of the JSON file that was written.

    Raises:
        gr.Error: If no PDF was uploaded.
    """
    if pdf_file is None:
        # Surface a readable message in the UI instead of an obscure failure
        # inside the PDF reader.
        raise gr.Error("Please upload a PDF file before processing.")

    # The name comes from a free-text box: keep only its final path component
    # so it cannot point outside the working directory, and fall back to the
    # textbox's default when nothing usable is left.
    safe_name = os.path.basename((file_name or "").strip()) or "squad_dataset"

    context = extract_text_from_pdf(pdf_file)
    questions = generate_questions(context)
    qas = answer_questions(context, questions)
    squad_data = convert_to_squad_format(qas, context)
    return save_to_json(squad_data, safe_name)
# Gradio interface
# NOTE(review): the original indentation was lost, so the nesting below
# (inputs side by side in a row, button and download beneath) is a
# reconstruction — confirm against the intended layout.
with gr.Blocks() as demo:
    with gr.Row():
        pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
        file_name = gr.Textbox(label="Output JSON File Name", value="squad_dataset")
    process_button = gr.Button("Process PDF")
    download_link = gr.File(label="Download JSON", interactive=False)
    # Clicking the button runs the whole pipeline and offers the JSON file.
    process_button.click(fn=process_pdf, inputs=[pdf_file, file_name], outputs=download_link)

# Only start the server when run as a script, so importing this module
# (e.g. to reuse the helpers) does not launch the app.
if __name__ == "__main__":
    demo.launch()