Spaces:

ivyblossom
/

question-answering

Running

App Files Files Community

question-answering / app.py

ivyblossom

Update app.py

6b1590b over 1 year ago

raw

history blame

2.39 kB

	import os
	import streamlit as st
	from transformers import pipeline
	import re
	from PyPDF2 import PdfFileReader

	def truncate_to_word_boundary(text, max_words=100):
	    """Return the first ``max_words`` whitespace-separated words of ``text``.

	    Words keep their punctuation and are re-joined with single spaces, so
	    the cut always falls between two words and never inside one.

	    Args:
	        text: The text to shorten.
	        max_words: Maximum number of words to keep. Values <= 0 yield "".

	    Returns:
	        The truncated text, or "" when ``text`` contains no words.
	    """
	    if max_words <= 0:
	        return ""
	    # Split on whitespace instead of matching r'\w+': the regex dropped all
	    # punctuation and broke contractions apart ("It's" -> "It s").
	    # maxsplit stops scanning as soon as enough words have been found.
	    words = text.split(maxsplit=max_words)
	    return ' '.join(words[:max_words])

	def _build_question_answerer():
	    """Load the DistilBERT (SQuAD-distilled) question-answering pipeline."""
	    return pipeline(
	        "question-answering",
	        model="distilbert-base-cased-distilled-squad",
	        tokenizer="distilbert-base-cased-distilled-squad",
	    )


	# Streamlit re-executes the script on every interaction, so without a
	# Streamlit-level cache the model would be reloaded for each question.
	# st.cache_resource only exists on newer Streamlit releases; on older ones
	# the loader simply stays uncached, which matches the previous behavior.
	if hasattr(st, "cache_resource"):
	    _build_question_answerer = st.cache_resource(_build_question_answerer)


	def question_answering(question, text):
	    """Answer ``question`` using ``text`` as the context.

	    Args:
	        question: The question to ask.
	        text: The context the answer is extracted from.

	    Returns:
	        The pipeline's result for a single question/context pair; the caller
	        in this file reads its 'answer', 'score' and 'start' entries.
	    """
	    question_answerer = _build_question_answerer()
	    return question_answerer(question=question, context=text)

	def _read_pdf_pages(pdf_file):
	    """Return the extracted text of every page of ``pdf_file``, in page order."""
	    pdf_reader = PdfFileReader(pdf_file)
	    return [
	        pdf_reader.getPage(page_num).extractText()
	        for page_num in range(pdf_reader.getNumPages())
	    ]


	def _page_number_at(page_texts, offset):
	    """Return the 1-based page containing character ``offset`` of the joined pages."""
	    consumed = 0
	    for page_number, page_text in enumerate(page_texts, start=1):
	        consumed += len(page_text)
	        if offset < consumed:
	            return page_number
	    # An offset at or past the end of the text belongs to the last page.
	    return len(page_texts)


	def main():
	    """Run the Streamlit app: upload a file, ask a question, show the answer.

	    PDF and plain-text uploads are supported. For PDFs the page on which the
	    answer starts is reported; the answer's 'start' value is a character
	    offset into the context, so it is mapped to a page rather than shown
	    directly as a page number.
	    """
	    st.title("Question Answering on an Uploaded File")

	    uploaded_file = st.file_uploader("Upload a file:", type=["pdf", "txt", "docx", "csv", "json"])
	    question = st.text_input("Ask your question:")

	    if not st.button("Answer"):
	        return
	    if uploaded_file is None:
	        st.warning("Please upload a file first.")
	        return
	    if not question.strip():
	        st.warning("Please enter a question.")
	        return

	    file_extension = os.path.splitext(uploaded_file.name)[1].lower()
	    page_texts = None

	    if file_extension == ".pdf":
	        # Handle PDF files using PyPDF2
	        page_texts = _read_pdf_pages(uploaded_file)
	        context = "".join(page_texts)
	    elif file_extension == ".txt":
	        # Handle plain text files; undecodable bytes are replaced instead of
	        # aborting the whole request with a UnicodeDecodeError.
	        context = uploaded_file.read().decode("utf-8", errors="replace")
	    else:
	        # Add support for other file types (e.g., docx, csv, json) if needed.
	        # Until then, report it instead of failing later on an unbound name.
	        st.error(f"Unsupported file type: '{file_extension}'")
	        return

	    if not context.strip():
	        # The pipeline rejects an empty context (e.g. a scanned, image-only PDF).
	        st.warning("No text could be extracted from the uploaded file.")
	        return

	    # Perform question-answering
	    answer = question_answering(question, context)

	    st.write(f"Question: '{question}'")
	    st.write("Answer:", answer['answer'])
	    st.write("Score:", answer['score'])
	    if page_texts is not None:
	        st.write("Page Number:", _page_number_at(page_texts, answer['start']))

	    # Display truncated context
	    truncated_context = truncate_to_word_boundary(context)
	    st.write("Context:", truncated_context)

	# Script entry point: start the app only when this file is executed directly.
	if __name__ == "__main__":
	    main()