Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -30,6 +30,30 @@ def question_model():
|
|
| 30 |
question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer, handle_impossible_answer=True)
|
| 31 |
return question_answerer
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
# # get the answer by passing the context & question to the model
|
| 34 |
# def question_answering(context, question):
|
| 35 |
# with st.spinner(text="Loading question model..."):
|
|
@@ -90,29 +114,6 @@ def question_answering(context, question):
|
|
| 90 |
container.write("<h5><b>Answer:</b></h5>" + answer + "<p><small>(F1 score: " + answer_score + ")</small></p><br>",
|
| 91 |
unsafe_allow_html=True)
|
| 92 |
|
| 93 |
-
@st.cache_data(show_spinner=False)
def extract_text(file_path):
    """Return the OCR text of the PDF stored at ``file_path``.

    The file is first parsed with ``PyPDF2`` and the embedded text of each
    page is collected, but that text is not part of the result: the value
    returned is built only from the ``pytesseract`` output of the page images
    produced by ``convert_from_path``.

    Before returning, every newline that has no newline directly before or
    after it is replaced by a space; runs of two or more newlines are kept.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The recognized text as a single string.
    """
    with st.spinner(text="Extracting text from file..."):
        # Embedded-text pass. Its result is replaced by the OCR text below,
        # so it only matters for the errors it can raise on a bad file.
        with open(file_path, "rb") as handle:
            reader = PyPDF2.PdfReader(handle)
            embedded_text = "".join(
                pdf_page.extract_text() for pdf_page in reader.pages
            )

        # OCR pass: render each page to an image, then recognize it.
        rendered_pages = convert_from_path(file_path)
        ocr_text = "".join(
            pytesseract.image_to_string(rendered) for rendered in rendered_pages
        )

        # Only the OCR text is kept (the combined variant was disabled).
        embedded_text = ocr_text

        # Join lines split by a lone newline; blank-line breaks are untouched.
        return re.sub(r"(?<!\n)\n(?!\n)", " ", embedded_text)
|
| 116 |
|
| 117 |
|
| 118 |
#-------------------- Main Webpage --------------------
|
|
@@ -178,8 +179,8 @@ with tab2:
|
|
| 178 |
if not st.session_state.text_extracted:
|
| 179 |
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
|
| 180 |
temp_file.write(uploaded_file.read()) # Save uploaded file to a temporary path
|
| 181 |
-
|
| 182 |
-
|
| 183 |
st.session_state.text_extracted = True
|
| 184 |
|
| 185 |
|
|
|
|
| 30 |
question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer, handle_impossible_answer=True)
|
| 31 |
return question_answerer
|
| 32 |
|
| 33 |
+
@st.cache_data(show_spinner=False)
def extract_text(file_path):
    """Return the text of the PDF at ``file_path``, obtained primarily by OCR.

    Each page is rendered to an image with ``convert_from_path`` and read
    with ``pytesseract``. The PDF's embedded text layer (via ``PyPDF2``) is
    consulted only when OCR yields nothing but whitespace, so the file is no
    longer parsed a second time just to have that result thrown away.

    Before returning, every newline that has no newline directly before or
    after it is replaced by a space; runs of two or more newlines are kept.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The extracted text as a single string (possibly empty).
    """
    with st.spinner(text="Extracting text from file..."):
        # OCR is the primary source: rasterize every page, then recognize it.
        page_images = convert_from_path(file_path)
        text = "".join(
            pytesseract.image_to_string(image) for image in page_images
        )

        if not text.strip():
            # OCR found nothing usable; fall back to the embedded text layer.
            with open(file_path, "rb") as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                # `or ""` guards against a page yielding no string at all.
                text = "".join(
                    page.extract_text() or "" for page in pdf_reader.pages
                )

        # Join lines split by a lone newline; blank-line breaks are untouched.
        text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)
    return text
|
| 56 |
+
|
| 57 |
# # get the answer by passing the context & question to the model
|
| 58 |
# def question_answering(context, question):
|
| 59 |
# with st.spinner(text="Loading question model..."):
|
|
|
|
| 114 |
container.write("<h5><b>Answer:</b></h5>" + answer + "<p><small>(F1 score: " + answer_score + ")</small></p><br>",
|
| 115 |
unsafe_allow_html=True)
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
|
| 119 |
#-------------------- Main Webpage --------------------
|
|
|
|
| 179 |
if not st.session_state.text_extracted:
|
| 180 |
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
|
| 181 |
temp_file.write(uploaded_file.read()) # Save uploaded file to a temporary path
|
| 182 |
+
raw_text = extract_text(temp_file.name)
|
| 183 |
+
context2 = raw_text
|
| 184 |
st.session_state.text_extracted = True
|
| 185 |
|
| 186 |
|