# PDF QA Generator -- Streamlit Space app
import fitz  # PyMuPDF
import streamlit as st
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
def process_pdf(uploaded_file, qa_model, tokenizer):
    """Extract the text of an uploaded PDF and run the QA model over it.

    Args:
        uploaded_file: Streamlit ``UploadedFile`` from ``st.file_uploader``,
            or ``None`` when nothing has been uploaded yet.
        qa_model: a HuggingFace ``AutoModelForQuestionAnswering`` instance.
        tokenizer: the tokenizer matching ``qa_model``.

    Side effects:
        Writes the decoded answer span(s) to the Streamlit page via
        ``st.write``; shows ``st.error`` if the PDF cannot be opened.
    """
    if uploaded_file is None:
        return  # nothing to do until the user uploads a file

    file_contents = uploaded_file.read()
    try:
        # BUG FIX: raw bytes must be passed via ``stream=``. The first
        # positional argument of fitz.open() is a *filename*, so passing
        # the file's bytes there raised instead of opening the document.
        doc = fitz.open(stream=file_contents, filetype="pdf")
    except Exception:
        # fitz.open raises on bad input rather than returning None, so the
        # original ``if doc is not None`` check could never report failure.
        st.error("Error occurred while opening the PDF file.")
        return

    with doc:  # ensure the document handle is closed after extraction
        # join() avoids the quadratic cost of repeated ``text +=``
        text = "".join(page.get_text() for page in doc)

    # NOTE(review): a SQuAD-style QA model expects a (question, context)
    # pair; here the whole document is fed as a single sequence, so the
    # extracted "answer" spans are of limited meaning -- confirm intended UX.
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        outputs = qa_model(**inputs)

    # One (start, end) logit vector per batch item (batch size is 1 here).
    for i, (start, end) in enumerate(zip(outputs.start_logits, outputs.end_logits)):
        answer = tokenizer.decode(
            inputs["input_ids"][i][start.argmax():end.argmax() + 1]
        )
        st.write("Answer:", answer)
        st.write("---")
@st.cache_resource
def _load_qa_components():
    """Load and cache the QA model + tokenizer.

    Streamlit re-executes the whole script on every widget interaction;
    without caching, the BERT-large checkpoint (~1.3 GB) would be reloaded
    on each rerun. ``st.cache_resource`` keeps one shared instance alive.
    """
    checkpoint = "bert-large-uncased-whole-word-masking-finetuned-squad"
    model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)
    tok = AutoTokenizer.from_pretrained(checkpoint)
    return model, tok


def main():
    """Entry point: render the page, accept a PDF upload, and process it."""
    qa_model, tokenizer = _load_qa_components()

    # Page header and instructions.
    st.title("PDF QA Generator")
    st.write("Upload a PDF file and generate questions and answers!")

    # Sidebar upload widget; returns None until the user picks a file.
    st.sidebar.title("Upload File")
    uploaded_file = st.sidebar.file_uploader("Choose a PDF file", type=['pdf'])

    # process_pdf handles the None (no upload yet) case itself.
    process_pdf(uploaded_file, qa_model, tokenizer)
# Launch the app only when the script is executed directly, not on import.
if __name__ == "__main__":
    main()