# Spaces: Running
import streamlit as st | |
from transformers import pipeline | |
from PyPDF2 import PdfReader | |
import docx | |
# Initialize the NLP pipeline.
# Streamlit re-executes this script on every widget interaction, so the model
# is built through st.cache_resource: the loader runs once and later reruns
# reuse the same pipeline object instead of reloading the model.
@st.cache_resource
def _load_document_qa_pipeline():
    """Build the document-question-answering pipeline used by the app."""
    return pipeline(
        "document-question-answering",
        model="impira/layoutlm-document-qa",
    )


nlp = _load_document_qa_pipeline()
# Page heading
st.title("LayoutLM Example")

# Upload widget, limited to the extensions listed below
_ACCEPTED_EXTENSIONS = ['txt', 'pdf', 'docx']
uploaded_file = st.file_uploader(
    "Drag and drop a document here",
    type=_ACCEPTED_EXTENSIONS,
)

# Free-form question typed by the user
question = st.text_area("What would you like to know?")
def extract_text_from_file(uploaded_file):
    """Return the plain text of an uploaded document, or None if unsupported.

    The branch is chosen from the MIME type in ``uploaded_file.type``:

    * ``text/plain``: the bytes from ``read()`` are decoded as UTF-8.
    * ``application/pdf``: the text of every page is concatenated.
    * Word ``.docx``: paragraph texts are joined with newlines.

    Args:
        uploaded_file: File-like object exposing a ``type`` attribute (MIME
            string) and ``read()``. It is also passed directly to
            ``PdfReader`` and ``docx.Document``.

    Returns:
        The extracted text (possibly an empty string), or None when the MIME
        type is not one of the three handled above.
    """
    mime_type = uploaded_file.type

    if mime_type == "text/plain":
        # "utf-8-sig" drops a leading byte-order mark if one is present, and
        # errors="replace" keeps a file saved in another encoding from raising
        # UnicodeDecodeError (undecodable bytes become U+FFFD).
        return uploaded_file.read().decode("utf-8-sig", errors="replace")

    if mime_type == "application/pdf":
        reader = PdfReader(uploaded_file)
        # `or ""` keeps a page that yields no text from breaking the join.
        return "".join(page.extract_text() or "" for page in reader.pages)

    if mime_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        doc = docx.Document(uploaded_file)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)

    return None
# Answer the question once a document and a non-blank question are both present.
if uploaded_file and question and question.strip():
    # Extract text from the uploaded document
    document_text = extract_text_from_file(uploaded_file)
    # split() yields no words for None/empty/whitespace-only text, so all of
    # those cases fall through to the failure message below.
    words = document_text.split() if document_text else []
    if words:
        # The document-question-answering pipeline has no "context" input: it
        # expects an image or pre-extracted (word, box) pairs. Only plain text
        # is available here, so each word is paired with a placeholder box.
        # NOTE(review): without real layout coordinates the model can only use
        # the word sequence; answer quality may be lower than with an image.
        word_boxes = [(word, [0, 0, 0, 0]) for word in words]
        result = nlp(image=None, question=question, word_boxes=word_boxes)
        # The pipeline is documented to return a dict or a list of dicts;
        # normalise to a list so both shapes are handled the same way.
        candidates = result if isinstance(result, list) else [result]
        # Display the answer
        st.write("Answer:")
        if candidates:
            st.write(candidates[0]["answer"])
        else:
            st.write("No answer found in the document.")
    else:
        st.write("Unsupported file type or failed to extract text from the document.")