Spaces:

ivyblossom
/

question-answering

Running

App Files Files Community

ivyblossom committed on Aug 3, 2023

Commit

f601880

1 Parent(s): 7b208e8

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -35

app.py CHANGED Viewed

@@ -1,19 +1,8 @@
 import os
-import fitz  # PyMuPDF for parsing PDF
 import streamlit as st
 from transformers import pipeline
 import re
-# Function to extract text from a PDF file
-def extract_text_from_pdf(pdf_path):
-    text = ""
-    with fitz.open(pdf_path) as pdf_document:
-        for page_num in range(pdf_document.page_count):
-            page = pdf_document.load_page(page_num)
-            page_text = page.get_text()
-            text += page_text
-            yield page_num + 1, page_text  # Return the page number (1-based) and the extracted text
 # Function to truncate text to the nearest word boundary
 def truncate_to_word_boundary(text, max_words=100):
     words = re.findall(r'\w+', text)
@@ -21,45 +10,53 @@ def truncate_to_word_boundary(text, max_words=100):
     return truncated_text
 # Function to perform question-answering
-def question_answering(question, pdf_text_with_pages):
-    pdf_text = "\n".join([text for _, text in pdf_text_with_pages])
     # Perform question-answering using Hugging Face's Transformers
     question_answerer = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", tokenizer="distilbert-base-cased-distilled-squad")
-    answer = question_answerer(question=question, context=pdf_text)
     return answer
 def main():
-    st.title("Question Answering using a PDF Document")
-    pdf_file = st.file_uploader("Upload a PDF file:", type=["pdf"])
     question = st.text_input("Ask your question:")
-    if st.button("Answer"):
-        if pdf_file:
-            pdf_path = os.path.join(os.getcwd(), pdf_file.name)
-            with open(pdf_path, "wb") as f:
-                f.write(pdf_file.read())
-            # Extract text from the PDF along with page numbers
-            pdf_text_with_pages = list(extract_text_from_pdf(pdf_path))
             # Perform question-answering
-            answer = question_answering(question, pdf_text_with_pages)
-            os.remove(pdf_path)  # Delete the uploaded file after processing
-            st.write(f"Question: '{question}'")
-            st.write("Answer:", answer['answer'])
-            st.write("Score:", answer['score'])
-            st.write("Page Number:", answer['start'] + 1)  # Add 1 to convert 0-based index to 1-based page number
-            # Display truncated context
-            start_page = answer['start']
-            context = pdf_text_with_pages[start_page][1]
-            truncated_context = truncate_to_word_boundary(context)
-            st.write("Context:", truncated_context)
 if __name__ == "__main__":
     main()

 import os
 import streamlit as st
 from transformers import pipeline
 import re
 # Function to truncate text to the nearest word boundary
 def truncate_to_word_boundary(text, max_words=100):
     words = re.findall(r'\w+', text)
     return truncated_text
 # Function to perform question-answering
def question_answering(question, text):
    """Answer *question* using *text* as the context.

    A Hugging Face ``question-answering`` pipeline is created with the
    ``distilbert-base-cased-distilled-squad`` checkpoint for both the model
    and the tokenizer, then invoked with the given question and context.

    Args:
        question: The question to ask.
        text: The context the answer is extracted from.

    Returns:
        Whatever the pipeline call returns, unchanged.
    """
    checkpoint = "distilbert-base-cased-distilled-squad"
    # NOTE: the pipeline is constructed on every call.
    qa_pipeline = pipeline("question-answering", model=checkpoint, tokenizer=checkpoint)
    return qa_pipeline(question=question, context=text)
 def _page_index_for_offset(page_texts, offset):
     """Return the 0-based index of the page whose text contains *offset*.
 
     *page_texts* is the list of per-page strings that were concatenated
     (without separators) to form the context; *offset* is a character
     index into that concatenation.
     """
     consumed = 0
     for index, page_text in enumerate(page_texts):
         consumed += len(page_text)
         if offset < consumed:
             return index
     # Offset at or past the end of the text: clamp to the last page.
     return max(len(page_texts) - 1, 0)
 def main():
     """Streamlit entry point: answer a question about an uploaded PDF or TXT file.
 
     Renders the upload widget, the question box and the "Answer" button.
     When the button is pressed, the file's text is extracted, passed to
     ``question_answering`` together with the question, and the answer,
     score and a truncated context are written to the page. For PDFs the
     1-based number of the page containing the answer is shown as well.
     """
     st.title("Question Answering on Uploaded Files")
     uploaded_file = st.file_uploader("Upload a file:", type=["pdf", "txt"])
     question = st.text_input("Ask your question:")
     if not st.button("Answer"):
         return
     # Guard clauses: tell the user what is missing instead of doing nothing
     # or handing empty input to the model.
     if uploaded_file is None:
         st.warning("Please upload a file first.")
         return
     if not question.strip():
         st.warning("Please enter a question.")
         return
     file_extension = os.path.splitext(uploaded_file.name)[1].lower()
     file_contents = uploaded_file.read()
     page_texts = None  # stays None for formats that have no pages
     if file_extension == ".pdf":
         # Handle PDF files
         import fitz  # PyMuPDF for parsing PDF
         # Pass the bytes that were already read; the upload object itself
         # has been consumed by read() and is not a plain bytes/BytesIO value.
         with fitz.open(stream=file_contents, filetype="pdf") as pdf_document:
             page_texts = [page.get_text() for page in pdf_document]
         text = "".join(page_texts)
     elif file_extension == ".txt":
         # Handle plain text files; undecodable bytes are replaced rather
         # than aborting the whole request.
         text = file_contents.decode("utf-8", errors="replace")
     else:
         # Add support for other file types (e.g., docx, csv, json) if needed
         st.error(f"Unsupported file type: '{file_extension}'")
         return
     if not text.strip():
         st.warning("No text could be extracted from the uploaded file.")
         return
     # Perform question-answering
     answer = question_answering(question, text)
     st.write(f"Question: '{question}'")
     st.write("Answer:", answer['answer'])
     st.write("Score:", answer['score'])
     if page_texts is not None:
         # answer['start'] is a character offset into the context, so it has
         # to be mapped back to the page it falls on.
         page_index = _page_index_for_offset(page_texts, answer['start'])
         st.write("Page Number:", page_index + 1)
         context = page_texts[page_index]
     else:
         context = text
     # Display truncated context
     truncated_context = truncate_to_word_boundary(context)
     st.write("Context:", truncated_context)
 if __name__ == "__main__":
     main()