Spaces:

adinarayana
/

Sample

Sleeping

App Files Files Community

adinarayana committed on Feb 16, 2024

Commit

f9b6f1e

verified ·

1 Parent(s): 3f45e56

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -53

app.py CHANGED Viewed

@@ -7,79 +7,41 @@ from transformers import pipeline, TFBertForQuestionAnswering, AutoTokenizer
 import tensorflow as tf
 import streamlit as st
 def preprocess_text(element):
-    """Preprocesses text elements from the PDF.
-    Args:
-        element: A PDFminer text element.
-    Returns:
-        The preprocessed text.
-    """
     if isinstance(element, pdfminer.layout.LTTextBoxHorizontal):
         text = element.get_text().strip()
         # Remove non-textual elements
-        text = re.sub(r'[^\w\s]', '', text)  # Replace with your preferred regular expression
-        # Remove stop words (optional)
-        # from nltk.corpus import stopwords
-        # stop_words = set(stopwords.words('english'))
-        # text = " ".join([word for word in text.split() if word not in stop_words])
-        # Convert to lowercase (optional)
         text = text.lower()
         return text
     else:
         return ""
 def answer_question(text, question, max_length=512):
-    """Answers a question using the provided text and a pre-trained model.
-    Args:
-        text: The preprocessed text from the PDF.
-        question: The user's question.
-    Returns:
-        The answer extracted from the text using the model.
-    """
-    qa_model_name = "bert-base-uncased"  # Replace with your model
     qa_model = TFBertForQuestionAnswering.from_pretrained(qa_model_name)
     tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
-    # Truncate text if necessary:
-    if len(text) > max_length:
-        text = text[:max_length]
-    # Add special tokens and tokenize:
-    inputs = tokenizer(
-        question, text, return_tensors="tf", padding="max_length", truncation=True
-    )
     outputs = qa_model(inputs)
     start_logits = outputs.start_logits
     end_logits = outputs.end_logits
-    # Ensure start_logits and end_logits are tensors
-    start_logits = tf.convert_to_tensor(start_logits)
-    end_logits = tf.convert_to_tensor(end_logits)
     # Find the indices of the start and end positions
     answer_start = tf.argmax(start_logits, axis=1).numpy()[0]
-    answer_end = (tf.argmax(end_logits, axis=1) + 1).numpy()[0]  # Increment by 1 for exclusive end index
     # Extract the answer text from the original text
-    answer = text[answer_start:answer_end].strip()
     return answer if answer else "No answer found."
-## Streamlit app
 st.set_page_config(page_title="PDF Summarizer and Q&A")
 st.header("PDF Summarizer and Q&A")
@@ -101,9 +63,7 @@ if uploaded_file is not None:
             summarize_button = st.button("Generate Summary")
             if summarize_button:
                 with st.spinner("Summarizing..."):
-                    max_input_length = 1024  # Example value, adjust according to your model
-                    truncated_text = text[:max_input_length]  # Truncate the text
-                    summary_response = pipeline("summarization", model=summarization_model)(truncated_text, min_length=min_summary_length)
                     st.subheader("Summary")
                     st.write(summary_response[0]["summary_text"])
             if question:
@@ -113,4 +73,3 @@ if uploaded_file is not None:
                     st.write(answer)
         else:
             st.error("No text found in the PDF.")

 import tensorflow as tf
 import streamlit as st
 def preprocess_text(element):
     """Return the cleaned, lowercased text of a horizontal PDF text box.

     Args:
         element: A layout element; only instances of
             ``pdfminer.layout.LTTextBoxHorizontal`` yield text.

     Returns:
         The element's text with surrounding whitespace stripped, every
         character that is neither a word character nor whitespace removed,
         and the result lowercased. Any other element type yields "".
     """
     # Guard clause: anything that is not a horizontal text box has no text.
     if not isinstance(element, pdfminer.layout.LTTextBoxHorizontal):
         return ""
     raw = element.get_text().strip()
     # Drop punctuation and other symbols, keeping word characters and whitespace.
     cleaned = re.sub(r'[^\w\s]', '', raw)
     return cleaned.lower()
 def answer_question(text, question, max_length=512):
     """Answer a question from the given text with a SQuAD-fine-tuned BERT model.

     Args:
         text: The context passage to search for the answer.
         question: The user's question.
         max_length: Maximum length passed to the tokenizer for the encoded
             question/text pair; longer input is truncated.

     Returns:
         The decoded answer span, or "No answer found." when the predicted
         span is empty or contains only special tokens.
     """
     qa_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
     # NOTE(review): model and tokenizer are loaded on every call; consider
     # caching them at the call site if this becomes a latency problem.
     qa_model = TFBertForQuestionAnswering.from_pretrained(qa_model_name)
     tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
     # Encode the question and context as one pair (special tokens are added).
     inputs = tokenizer(question, text, return_tensors="tf", padding=True, truncation=True, max_length=max_length)
     # Model prediction
     outputs = qa_model(inputs)
     start_logits = outputs.start_logits
     end_logits = outputs.end_logits
     # Most likely start position and (exclusive) end position of the answer.
     answer_start = tf.argmax(start_logits, axis=1).numpy()[0]
     answer_end = (tf.argmax(end_logits, axis=1) + 1).numpy()[0]
     # Decode the span from the encoded input ids. Special tokens are skipped
     # so a prediction landing on [CLS]/[SEP] falls through to the fallback
     # message instead of being returned as the literal marker text.
     span_ids = inputs["input_ids"][0][answer_start:answer_end]
     answer = tokenizer.decode(span_ids, skip_special_tokens=True).strip()
     return answer if answer else "No answer found."
+# Streamlit app
 st.set_page_config(page_title="PDF Summarizer and Q&A")
 st.header("PDF Summarizer and Q&A")
             summarize_button = st.button("Generate Summary")
             if summarize_button:
                 with st.spinner("Summarizing..."):
+                    summary_response = pipeline("summarization", model=summarization_model)(text, min_length=min_summary_length)
                     st.subheader("Summary")
                     st.write(summary_response[0]["summary_text"])
             if question:
                     st.write(answer)
         else:
             st.error("No text found in the PDF.")