Spaces:

Shankarm08
/

pdfreader

Sleeping

App Files Files Community

Shankarm08 committed on Oct 5, 2024

Commit

9e487ab

verified ·

1 Parent(s): 5fa3c44

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -12

app.py CHANGED Viewed

@@ -1,7 +1,9 @@
 import streamlit as st
 import torch
 from transformers import BertTokenizer, BertModel
 import pdfplumber
 # Load the pre-trained BERT model and tokenizer once
 model_name = "bert-base-uncased"
@@ -27,12 +29,11 @@ def get_embeddings(text):
     with torch.no_grad():  # Disable gradient calculation for inference
         outputs = model(**inputs)
-    # Check if the output contains the last hidden state
     if hasattr(outputs, 'last_hidden_state'):
-        # Extract the embeddings from the last hidden state
         return outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()  # Move to CPU before converting to numpy
     else:
-        raise ValueError("Model output does not contain 'last_hidden_state'. Please check the model configuration.")
 # Extract text from PDF
 def extract_text_from_pdf(pdf_file):
@@ -42,9 +43,9 @@ def extract_text_from_pdf(pdf_file):
             text += page.extract_text() + "\n"  # Add newline for better separation
     return text
-# Store the PDF text and embeddings
-pdf_text = ""
-pdf_embeddings = None
 # Streamlit app
 st.title("PDF Chatbot using BERT")
@@ -52,10 +53,15 @@ st.title("PDF Chatbot using BERT")
 # PDF file upload
 pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])
 if pdf_file:
     pdf_text = extract_text_from_pdf(pdf_file)
     try:
-        pdf_embeddings = get_embeddings(pdf_text)
         st.success("PDF loaded successfully!")
     except Exception as e:
         st.error(f"Error while processing PDF: {e}")
@@ -64,16 +70,23 @@ if pdf_file:
 user_input = st.text_input("Ask a question about the PDF:")
 if st.button("Get Response"):
-    if pdf_text == "":
         st.warning("Please upload a PDF file first.")
     else:
-        # Get embeddings for user input
         try:
             user_embeddings = get_embeddings(user_input)
-            # For demonstration, simply return the PDF text.
-            # Implement similarity matching logic here as needed.
             st.write("### Response:")
-            st.write(pdf_text)  # For simplicity, returning all text
         except Exception as e:
             st.error(f"Error while processing user input: {e}")

 import streamlit as st
 import torch
+import numpy as np
 from transformers import BertTokenizer, BertModel
 import pdfplumber
+from sklearn.metrics.pairwise import cosine_similarity
 # Load the pre-trained BERT model and tokenizer once
 model_name = "bert-base-uncased"
     with torch.no_grad():  # Disable gradient calculation for inference
         outputs = model(**inputs)
+    # Extract the embeddings from the last hidden state
     if hasattr(outputs, 'last_hidden_state'):
         return outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()  # Move to CPU before converting to numpy
     else:
+        raise ValueError("Model output does not contain 'last_hidden_state'.")
 # Extract text from PDF
 def extract_text_from_pdf(pdf_file):
             text += page.extract_text() + "\n"  # Add newline for better separation
     return text
# Split text into sentences for better matching
def split_text_into_sentences(text):
    """Split *text* into non-empty, whitespace-trimmed lines.

    Despite the name, this is a line-based split, not real sentence
    segmentation: each line of the input becomes one candidate "sentence".

    Blank and whitespace-only lines are dropped so that callers never embed
    or display an empty string, and so that an input with no usable text
    yields an empty (falsy) list.

    Args:
        text: The text to split. ``None`` or ``""`` is treated as no text.

    Returns:
        A list of stripped, non-empty lines in their original order.
    """
    if not text:
        return []
    # splitlines() handles "\n", "\r\n" and other line boundaries, and does
    # not produce a trailing empty element when the text ends with a newline.
    trimmed = (line.strip() for line in text.splitlines())
    return [line for line in trimmed if line]
 # Streamlit app
 st.title("PDF Chatbot using BERT")
 # PDF file upload
 pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])
+# Store the PDF text and embeddings
+pdf_text = ""
+pdf_embeddings = None
 if pdf_file:
     pdf_text = extract_text_from_pdf(pdf_file)
     try:
+        pdf_sentences = split_text_into_sentences(pdf_text)  # Split PDF text into sentences
+        pdf_embeddings = np.array([get_embeddings(sentence) for sentence in pdf_sentences])  # Get embeddings for each sentence
         st.success("PDF loaded successfully!")
     except Exception as e:
         st.error(f"Error while processing PDF: {e}")
 user_input = st.text_input("Ask a question about the PDF:")
 if st.button("Get Response"):
+    if not pdf_sentences:
         st.warning("Please upload a PDF file first.")
+    elif not user_input.strip():
+        st.warning("Please enter a question.")
     else:
         try:
             user_embeddings = get_embeddings(user_input)
+            user_embeddings = user_embeddings.reshape(1, -1)  # Reshape for cosine similarity calculation
+            # Calculate cosine similarity between user input and PDF sentence embeddings
+            similarities = cosine_similarity(user_embeddings, pdf_embeddings)
+            best_match_index = np.argmax(similarities)  # Get the index of the best match
+            # Display the most relevant sentence
             st.write("### Response:")
+            st.write(pdf_sentences[best_match_index])  # Return the most relevant sentence
         except Exception as e:
             st.error(f"Error while processing user input: {e}")