Update app.py
Browse files
app.py
CHANGED
@@ -1,93 +1,7 @@
|
|
1 |
-
# import streamlit as st
|
2 |
-
# from transformers import pipeline
|
3 |
-
# from sentence_transformers import SentenceTransformer, util
|
4 |
-
# import pdfplumber
|
5 |
-
|
6 |
-
# # ---- App Setup ----
|
7 |
-
# st.set_page_config(page_title='Gender Strategy Chatbot', layout='wide', initial_sidebar_state='expanded')
|
8 |
-
# st.title("Chatbot for Gender Strategy Document")
|
9 |
-
|
10 |
-
# # ---- Helper Functions ----
|
11 |
-
# def extract_text_from_pdf(pdf_path):
|
12 |
-
# """Extracts text from a PDF file."""
|
13 |
-
# text = ""
|
14 |
-
# with pdfplumber.open(pdf_path) as pdf:
|
15 |
-
# for page in pdf.pages:
|
16 |
-
# text += page.extract_text()
|
17 |
-
# return text
|
18 |
-
|
19 |
-
# def preprocess_text(document_text):
|
20 |
-
# """Processes the text, removes hard line breaks, and ensures clean paragraphs."""
|
21 |
-
# # 1. Remove hyphenation and line breaks, but keep the word intact
|
22 |
-
# document_text = re.sub(r'(?<=\S)-\n(?=\S)', '', document_text) # Remove hyphenation and \n
|
23 |
-
# # 2. Merge hard line breaks that occur between two words without hyphenation into a single space
|
24 |
-
# document_text = re.sub(r'(?<=\S)\n(?=\S)', ' ', document_text)
|
25 |
-
# # 3. Remove unnecessary whitespace at the beginning and end of the text
|
26 |
-
# document_text = document_text.strip()
|
27 |
-
# # 4. Optional: Reduce multiple consecutive spaces to a single space
|
28 |
-
# document_text = re.sub(r'\s{2,}', ' ', document_text)
|
29 |
-
# # Return the processed text
|
30 |
-
# standardized_text = document_text
|
31 |
-
# return standardized_text
|
32 |
-
|
33 |
-
|
34 |
-
# def semantic_search(query, corpus, model):
|
35 |
-
# """Performs semantic search to find the most relevant text in the corpus."""
|
36 |
-
# query_embedding = model.encode(query, convert_to_tensor=True)
|
37 |
-
# corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
|
38 |
-
|
39 |
-
# scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
|
40 |
-
# best_match_idx = scores.argmax().item()
|
41 |
-
# return corpus[best_match_idx], scores[best_match_idx].item()
|
42 |
-
|
43 |
-
# # ---- Load PDF and Extract Text ----
|
44 |
-
# @st.cache_data
|
45 |
-
# def load_pdf_and_prepare_embeddings(pdf_path):
|
46 |
-
# """Loads a PDF, extracts text, standardizes formatting, splits into chunks, and prepares embeddings."""
|
47 |
-
# document_text = extract_text_from_pdf(pdf_path)
|
48 |
-
# standardized_text = preprocess_text(document_text)
|
49 |
-
# chunks = standardized_text.split("\n\n") # Splitting text into chunks by paragraphs
|
50 |
-
# model = SentenceTransformer('all-MiniLM-L6-v2')
|
51 |
-
# return chunks, model
|
52 |
-
|
53 |
-
# pdf_path = "giz-2019-en-gender-strategy-web-version-with-bookmarks.pdf"
|
54 |
-
# chunks, embedding_model = load_pdf_and_prepare_embeddings(pdf_path)
|
55 |
-
|
56 |
-
# # ---- User Input Section ----
|
57 |
-
# st.sidebar.header("Ask a Question")
|
58 |
-
# query = st.sidebar.text_area("Type your question here:")
|
59 |
-
|
60 |
-
# if st.sidebar.button("Submit"):
|
61 |
-
# if query.strip() == "":
|
62 |
-
# st.sidebar.error("Please enter a question.")
|
63 |
-
# else:
|
64 |
-
# with st.spinner("Searching for the best answer..."):
|
65 |
-
# answer, score = semantic_search(query, chunks, embedding_model)
|
66 |
-
# st.write("### Your Question:")
|
67 |
-
# st.write(query)
|
68 |
-
# st.write("### Best Match:")
|
69 |
-
# st.write(answer)
|
70 |
-
# st.write(f"**Relevance Score:** {score:.2f}")
|
71 |
-
|
72 |
-
# # ---- Info Section ----
|
73 |
-
# with st.expander("ℹ️ - About this app"):
|
74 |
-
# st.write(
|
75 |
-
# """
|
76 |
-
# This chatbot allows users to ask questions about the Gender Strategy document.
|
77 |
-
# It uses a semantic search model (`all-MiniLM-L6-v2`) to find the most relevant passages from the document.
|
78 |
-
|
79 |
-
# - The document is pre-loaded and processed into searchable chunks.
|
80 |
-
# - The model ranks the relevance of the results based on cosine similarity.
|
81 |
-
|
82 |
-
# For feedback or improvements, please contact the developer.
|
83 |
-
# """
|
84 |
-
# )
|
85 |
-
|
86 |
import streamlit as st
|
87 |
from transformers import pipeline
|
88 |
from sentence_transformers import SentenceTransformer, util
|
89 |
import pdfplumber
|
90 |
-
import re  # import for regular expressions
|
91 |
|
92 |
# ---- App Setup ----
|
93 |
st.set_page_config(page_title='Gender Strategy Chatbot', layout='wide', initial_sidebar_state='expanded')
|
@@ -102,26 +16,19 @@ def extract_text_from_pdf(pdf_path):
|
|
102 |
text += page.extract_text()
|
103 |
return text
|
104 |
|
105 |
-
|
106 |
def preprocess_text(document_text):
    """Normalize PDF-extracted text: fix hyphenation, line breaks, and whitespace.

    Returns the cleaned text with paragraphs separated by exactly one blank
    line, ready for the caller to split into chunks on a double newline.
    """
    # 1. Join words that were hyphenated across a line break, keeping the word intact.
    document_text = re.sub(r'(?<=\S)-\n(?=\S)', '', document_text)
    # 2. Replace hard line breaks between two words with a single space.
    document_text = re.sub(r'(?<=\S)\n(?=\S)', ' ', document_text)
    # 3. Collapse runs of blank lines into a single paragraph break.
    document_text = re.sub(r'\n{2,}', '\n\n', document_text)
    # 4. Trim leading/trailing whitespace.
    document_text = document_text.strip()
    # 5. Collapse repeated spaces/tabs within lines only. The previous
    #    pattern \s{2,} also matched the "\n\n" paragraph breaks created in
    #    step 3, flattening them to a space and breaking paragraph chunking.
    document_text = re.sub(r'[ \t]{2,}', ' ', document_text)
    # Bug fix: the previous version ended with a bare `return`, which
    # returned None and crashed the caller's .split("\n\n").
    return document_text
|
125 |
|
126 |
|
127 |
def semantic_search(query, corpus, model):
|
@@ -137,15 +44,14 @@ def semantic_search(query, corpus, model):
|
|
137 |
@st.cache_data
def load_pdf_and_prepare_embeddings(pdf_path):
    """Load a PDF, extract and normalize its text, split it into paragraph
    chunks, and load the sentence-embedding model.

    Cached via st.cache_data so the PDF is only parsed once per session.
    Returns (chunks, model).
    """
    # Bug fix: the body had been gutted ("chunks =" with no right-hand
    # side, a SyntaxError); restore the extract -> preprocess -> split
    # pipeline so chunks are real paragraph strings.
    document_text = extract_text_from_pdf(pdf_path)
    standardized_text = preprocess_text(document_text)
    chunks = standardized_text.split("\n\n")  # split into paragraph chunks
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return chunks, model
|
145 |
|
146 |
-
# ---- Main Application Logic ----
pdf_path = "giz-2019-en-gender-strategy-web-version-with-bookmarks.pdf"
# Bug fix: "chunks, embedding_model" stood alone as a bare expression — the
# call that produces them had been deleted. Restore the assignment so the
# search UI below has data to work with.
chunks, embedding_model = load_pdf_and_prepare_embeddings(pdf_path)
|
149 |
|
150 |
# ---- User Input Section ----
|
151 |
st.sidebar.header("Ask a Question")
|
@@ -163,21 +69,17 @@ if st.sidebar.button("Submit"):
|
|
163 |
st.write(answer)
|
164 |
st.write(f"**Relevance Score:** {score:.2f}")
|
165 |
|
166 |
-
# ---- Before & After Section ----
# Bug fix: raw_text and processed_text were referenced but never defined
# anywhere in the script (NameError at runtime). Compute them here.
# NOTE(review): this re-parses the PDF outside the cached loader — cheap
# enough for a demo, but consider returning these from the cached function.
st.write("## Original vs Processed Text")
raw_text = extract_text_from_pdf(pdf_path)
processed_text = preprocess_text(raw_text)
with st.expander("View Original Text"):
    st.text(raw_text[:2000])  # display the first 2000 characters of the raw text
with st.expander("View Processed Text"):
    st.text(processed_text[:2000])  # display the first 2000 characters of the processed text
|
172 |
-
|
173 |
# ---- Info Section ----
# Static "about" panel describing how the chatbot works.
with st.expander("ℹ️ - About this app"):
    about_text = """
    This chatbot allows users to ask questions about the Gender Strategy document.
    It uses a semantic search model (`all-MiniLM-L6-v2`) to find the most relevant passages from the document.

    - The document is pre-loaded and processed into searchable chunks.
    - The model ranks the relevance of the results based on cosine similarity.

    For feedback or improvements, please contact the developer.
    """
    st.write(about_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
from transformers import pipeline
|
3 |
from sentence_transformers import SentenceTransformer, util
|
4 |
import pdfplumber
|
|
|
5 |
|
6 |
# ---- App Setup ----
|
7 |
st.set_page_config(page_title='Gender Strategy Chatbot', layout='wide', initial_sidebar_state='expanded')
|
|
|
16 |
text += page.extract_text()
|
17 |
return text
|
18 |
|
|
|
19 |
def preprocess_text(document_text):
    """Process the text, remove hard line breaks, and ensure clean paragraphs.

    Returns the normalized text; paragraph breaks (blank lines) are
    preserved so the caller can split on a double newline.
    """
    # 1. Remove hyphenation across line breaks, keeping the word intact.
    document_text = re.sub(r'(?<=\S)-\n(?=\S)', '', document_text)
    # 2. Merge hard line breaks between two words into a single space.
    document_text = re.sub(r'(?<=\S)\n(?=\S)', ' ', document_text)
    # 3. Trim leading/trailing whitespace.
    document_text = document_text.strip()
    # 4. Collapse repeated spaces/tabs only. Bug fix: the previous \s{2,}
    #    pattern also matched paragraph breaks ("\n\n"), flattening them to
    #    a single space and making the caller's split("\n\n") yield one
    #    giant chunk instead of paragraphs.
    document_text = re.sub(r'[ \t]{2,}', ' ', document_text)
    return document_text
|
32 |
|
33 |
|
34 |
def semantic_search(query, corpus, model):
|
|
|
44 |
@st.cache_data
def load_pdf_and_prepare_embeddings(pdf_path):
    """Load a PDF, extract and normalize its text, split it into paragraph
    chunks, and load the sentence-embedding model."""
    raw = extract_text_from_pdf(pdf_path)
    cleaned = preprocess_text(raw)
    # Paragraphs (separated by blank lines) become the searchable chunks.
    paragraph_chunks = cleaned.split("\n\n")
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    return paragraph_chunks, embedder
|
52 |
|
|
|
53 |
# Path to the bundled strategy PDF; the cached loader returns the paragraph
# chunks and the embedding model used by the search UI below.
pdf_path = "giz-2019-en-gender-strategy-web-version-with-bookmarks.pdf"
chunks, embedding_model = load_pdf_and_prepare_embeddings(pdf_path)
|
55 |
|
56 |
# ---- User Input Section ----
|
57 |
st.sidebar.header("Ask a Question")
|
|
|
69 |
st.write(answer)
|
70 |
st.write(f"**Relevance Score:** {score:.2f}")
|
71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
# ---- Info Section ----
|
73 |
with st.expander("ℹ️ - About this app"):
|
74 |
st.write(
|
75 |
"""
|
76 |
This chatbot allows users to ask questions about the Gender Strategy document.
|
77 |
It uses a semantic search model (`all-MiniLM-L6-v2`) to find the most relevant passages from the document.
|
78 |
+
|
79 |
- The document is pre-loaded and processed into searchable chunks.
|
80 |
- The model ranks the relevance of the results based on cosine similarity.
|
81 |
+
|
82 |
For feedback or improvements, please contact the developer.
|
83 |
"""
|
84 |
)
|
85 |
+
|