Spaces: Runtime error
import fitz  # PyMuPDF
from fastapi import FastAPI, File, UploadFile
#from pyngrok import ngrok
from typing import List
import pytesseract
import requests
from io import BytesIO
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from top2vec import Top2Vec
from llama_index.node_parser import SimpleNodeParser
from llama_index import Document  # needed to wrap the raw text before node parsing
description = """
## DocQA
This app shows how to do Document Question Answering.
Check out the docs for the `/predict` endpoint below to try it out!
"""
app = FastAPI(docs_url="/", description=description)
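# Hedged note (not part of the original Space): the imports above imply these
# pip dependencies: fastapi, uvicorn, requests, PyMuPDF (imported as fitz),
# pytesseract, transformers, torch, top2vec[sentence_encoders], and llama-index.
# If any of them is missing from requirements.txt, the app fails at import time,
# which is one common cause of the "Runtime error" status shown above.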
def doc_chunk(data):
    # Wrap the raw text in a llama_index Document so the node parser can chunk it
    node_parser = SimpleNodeParser.from_defaults(chunk_size=256)
    nodes = node_parser.get_nodes_from_documents([Document(text=data)])
    return nodes
def create_train_data(nodes):
    # Collect the plain-text content of each chunk for Top2Vec
    data = []
    for node in nodes:
        #print(node.get_content())
        data.append(node.get_content())
    return data
def get_model(data):
    # Embed the chunks with the universal-sentence-encoder backend
    # (requires the top2vec[sentence_encoders] extra and enough chunks to cluster)
    model = Top2Vec(data, embedding_model='universal-sentence-encoder')
    return model
def get_search_result(model, question):
    # Return the single best-matching chunk as a plain string (not an array),
    # so it can be passed directly as the QA pipeline's context
    documents, doc_scores, doc_ids = model.query_documents(question, 1)
    return documents[0]
# pipe = pipeline("document-question-answering", model="impira/layoutlm-document-qa")
# @app.post("/predict")
# def predict(image_file: bytes = File(...), question: str = Form(...)):
# (a sketch of a working /predict route is included at the end of this file)
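# Hedged sketch (not in the original code): pytesseract is imported above, but the
# extraction loop below only reads the embedded text layer via page.get_text().
# For scanned pages with no text layer, a page could be rasterised and OCR'd as
# shown here; the helper name ocr_page is an assumption, not part of the original app.
from PIL import Image

def ocr_page(page):
    # Render the PyMuPDF page to an RGB pixmap and run Tesseract on it
    pix = page.get_pixmap()
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    return pytesseract.image_to_string(img)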
def load_file(file_url: str, sentences: List[str]):
    # URL to the PDF file
    pdf_url = file_url
    # Initialize an empty variable to store the extracted text
    all_text = ''
    # Download the PDF from the URL
    response = requests.get(pdf_url)
    if response.status_code == 200:
        pdf_bytes = BytesIO(response.content)
        # Open the PDF file using PyMuPDF
        pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
        # Loop through each page and extract its embedded text
        # (page.get_text() reads the text layer only; see ocr_page above for scanned pages)
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            print(f"Processing page {page_num + 1}...")
            text = page.get_text()
            all_text += text + '\n'
    # Print or do something with the collected text
    print(all_text)
    # Load the extractive question-answering model
    model_name = "deepset/roberta-base-squad2"
    nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
    ##########################
    # Chunk the extracted text, embed the chunks, and build the Top2Vec index
    nodes = doc_chunk(all_text)
    data = create_train_data(nodes)
    model = get_model(data)
    #context = get_search_result(model, question)
    # Define the common context
    #context = all_text
    # List of questions
    questions = sentences
    # Initialize an empty dictionary to store questions and answers
    qa_dict = {}
    # Retrieve the most relevant chunk for each question and answer against it
    for question in questions:
        context = get_search_result(model, question)
        QA_input = {
            'question': question,
            'context': context
        }
        res = nlp(QA_input)
        print(f"Question: {question}")
        print(f"Answer: {res['answer']}")
        qa_dict[question] = res['answer']
    return qa_dict
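# Hedged sketch (an assumption, not the original author's endpoint): the
# description promises a /predict route, but only a commented-out stub exists
# above. One way to wire load_file up is shown below; the request model and its
# field names (file_url, questions) are invented here for illustration.
from pydantic import BaseModel

class PredictRequest(BaseModel):
    file_url: str         # URL of the PDF to download
    questions: List[str]  # questions to answer against that PDF

@app.post("/predict")
def predict(request: PredictRequest):
    # load_file returns a {question: answer} dictionary
    return load_file(request.file_url, request.questions)

# Example call once the app is running (also an illustration):
# requests.post("<space-url>/predict",
#               json={"file_url": "<pdf-url>", "questions": ["Who is the author?"]})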