Spaces:
Running
Running
import os | |
import streamlit as st | |
from transformers import pipeline | |
import re | |
from PyPDF2 import PdfFileReader | |
def truncate_to_word_boundary(text, max_words=100):
    """Return the leading part of *text* containing at most *max_words* words.

    A "word" is a run of ``\\w`` characters. The result is a prefix of the
    original string that ends right after the last kept word, so
    punctuation, apostrophes and whitespace between the kept words are
    preserved instead of being replaced by single spaces.

    Args:
        text: The string to shorten.
        max_words: Maximum number of words to keep. Values <= 0 yield "".

    Returns:
        *text* unchanged when it has no more than *max_words* words,
        otherwise the prefix of *text* ending at the *max_words*-th word.
    """
    if max_words <= 0:
        return ""

    cut = None
    for count, match in enumerate(re.finditer(r"\w+", text), start=1):
        if count == max_words:
            # Remember where the last word we are allowed to keep ends.
            cut = match.end()
        elif count > max_words:
            # One more word exists, so the text really needs truncating.
            return text[:cut]

    # The text holds max_words words or fewer: nothing to cut.
    return text
def question_answering(question, text):
    """Run extractive question answering for *question* over *text*.

    Builds a Hugging Face Transformers "question-answering" pipeline with
    the ``distilbert-base-cased-distilled-squad`` checkpoint (used for both
    model and tokenizer) and applies it to the given question/context pair.

    Args:
        question: The question to ask.
        text: The context passed to the pipeline as ``context``.

    Returns:
        Whatever the pipeline returns for this input; the caller in this
        file reads its ``'answer'``, ``'score'`` and ``'start'`` entries.
    """
    checkpoint = "distilbert-base-cased-distilled-squad"
    # NOTE: the pipeline is constructed on every call.
    qa_pipeline = pipeline("question-answering", model=checkpoint, tokenizer=checkpoint)
    return qa_pipeline(question=question, context=text)
def _page_for_offset(page_texts, offset):
    """Return the 1-based page whose text contains character *offset*.

    *page_texts* is the list of per-page strings that were concatenated (in
    order, without separators) to form the context; *offset* is a character
    index into that concatenation. Offsets past the end map to the last page.
    """
    consumed = 0
    for page_number, page_text in enumerate(page_texts, start=1):
        consumed += len(page_text)
        if offset < consumed:
            return page_number
    return len(page_texts)


def main():
    """Render the Streamlit UI and answer a question about an uploaded file.

    The user uploads a file and types a question; when the "Answer" button
    is pressed the file's text is extracted (PDF via PyPDF2, TXT decoded as
    UTF-8), handed to ``question_answering`` and the answer, its score and
    a truncated context are displayed. For PDFs the page containing the
    answer is shown as well. Missing input, unsupported file types and
    files without extractable text are reported to the user instead of
    raising.
    """
    st.title("Question Answering on an Uploaded File")
    uploaded_file = st.file_uploader(
        "Upload a file:", type=["pdf", "txt", "docx", "csv", "json"]
    )
    question = st.text_input("Ask your question:")

    if not st.button("Answer"):
        return
    if uploaded_file is None:
        st.warning("Please upload a file before asking a question.")
        return
    if not question.strip():
        st.warning("Please enter a question.")
        return

    file_extension = os.path.splitext(uploaded_file.name)[1].lower()
    page_texts = None  # Only populated for PDFs, where pages are meaningful.

    if file_extension == ".pdf":
        # Hand the untouched upload to PyPDF2; reading it beforehand would
        # leave the stream positioned at EOF.
        # NOTE(review): PdfFileReader/getNumPages/getPage/extractText are the
        # legacy PyPDF2 names - confirm the installed version still supports them.
        pdf_reader = PdfFileReader(uploaded_file)
        page_texts = [
            pdf_reader.getPage(page_num).extractText() or ""
            for page_num in range(pdf_reader.getNumPages())
        ]
        context = "".join(page_texts)
    elif file_extension == ".txt":
        # Replace undecodable bytes rather than crashing on non-UTF-8 files.
        context = uploaded_file.read().decode("utf-8", errors="replace")
    else:
        # The uploader accepts more extensions than are implemented here.
        st.error(
            f"Unsupported file type '{file_extension}'. "
            "Only .pdf and .txt files can be processed."
        )
        return

    if not context.strip():
        st.error("No text could be extracted from the uploaded file.")
        return

    answer = question_answering(question, context)

    st.write(f"Question: '{question}'")
    st.write("Answer:", answer['answer'])
    st.write("Score:", answer['score'])
    if page_texts is not None:
        # answer['start'] is a character offset into the context, so it has
        # to be mapped onto the page it falls in before being shown as a page.
        st.write("Page Number:", _page_for_offset(page_texts, answer['start']))

    # Display truncated context
    st.write("Context:", truncate_to_word_boundary(context))
# Script entry point: start the app only when this file is executed directly.
if __name__ == "__main__":
    main()