adinarayana committed
Commit f9b6f1e · verified · 1 Parent(s): 3f45e56

Update app.py

Files changed (1): app.py +12 -53
app.py CHANGED
@@ -7,79 +7,41 @@ from transformers import pipeline, TFBertForQuestionAnswering, AutoTokenizer
 import tensorflow as tf
 import streamlit as st
 
-
 def preprocess_text(element):
-    """Preprocesses text elements from the PDF.
-
-    Args:
-        element: A PDFminer text element.
-
-    Returns:
-        The preprocessed text.
-    """
+    """Preprocesses text elements from the PDF."""
     if isinstance(element, pdfminer.layout.LTTextBoxHorizontal):
         text = element.get_text().strip()
         # Remove non-textual elements
-        text = re.sub(r'[^\w\s]', '', text)  # Replace with your preferred regular expression
-
-        # Remove stop words (optional)
-        # from nltk.corpus import stopwords
-        # stop_words = set(stopwords.words('english'))
-        # text = " ".join([word for word in text.split() if word not in stop_words])
-
-        # Convert to lowercase (optional)
+        text = re.sub(r'[^\w\s]', '', text)
+        # Convert to lowercase
         text = text.lower()
         return text
     else:
         return ""
 
-
 def answer_question(text, question, max_length=512):
-    """Answers a question using the provided text and a pre-trained model.
-
-    Args:
-        text: The preprocessed text from the PDF.
-        question: The user's question.
-
-    Returns:
-        The answer extracted from the text using the model.
-    """
-
-    qa_model_name = "bert-base-uncased"  # Replace with your model
-
+    """Answers a question using the provided text and a pre-trained model."""
+    qa_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
     qa_model = TFBertForQuestionAnswering.from_pretrained(qa_model_name)
     tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
 
-    # Truncate text if necessary:
-    if len(text) > max_length:
-        text = text[:max_length]
-
-    # Add special tokens and tokenize:
-    inputs = tokenizer(
-        question, text, return_tensors="tf", padding="max_length", truncation=True
-    )
+    # Add special tokens and tokenize
+    inputs = tokenizer(question, text, return_tensors="tf", padding=True, truncation=True, max_length=max_length)
 
+    # Model prediction
     outputs = qa_model(inputs)
-
    start_logits = outputs.start_logits
     end_logits = outputs.end_logits
 
-    # Ensure start_logits and end_logits are tensors
-    start_logits = tf.convert_to_tensor(start_logits)
-    end_logits = tf.convert_to_tensor(end_logits)
-
     # Find the indices of the start and end positions
     answer_start = tf.argmax(start_logits, axis=1).numpy()[0]
-    answer_end = (tf.argmax(end_logits, axis=1) + 1).numpy()[0]  # Increment by 1 for exclusive end index
+    answer_end = (tf.argmax(end_logits, axis=1) + 1).numpy()[0]
 
     # Extract the answer text from the original text
-    answer = text[answer_start:answer_end].strip()
-
+    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))
     return answer if answer else "No answer found."
 
-
-## Streamlit app
-
+# Streamlit app
 st.set_page_config(page_title="PDF Summarizer and Q&A")
 st.header("PDF Summarizer and Q&A")
 
@@ -101,9 +63,7 @@ if uploaded_file is not None:
     summarize_button = st.button("Generate Summary")
     if summarize_button:
         with st.spinner("Summarizing..."):
-            max_input_length = 1024  # Example value, adjust according to your model
-            truncated_text = text[:max_input_length]  # Truncate the text
-            summary_response = pipeline("summarization", model=summarization_model)(truncated_text, min_length=min_summary_length)
+            summary_response = pipeline("summarization", model=summarization_model)(text, min_length=min_summary_length)
             st.subheader("Summary")
             st.write(summary_response[0]["summary_text"])
     if question:
@@ -113,4 +73,3 @@ if uploaded_file is not None:
         st.write(answer)
     else:
         st.error("No text found in the PDF.")
-