alronlam committed on
Commit
613c93d
1 Parent(s): bba0b87

Add app and data files

app.py ADDED
@@ -0,0 +1,145 @@
+ import os
+ import time
+ from pathlib import Path
+
+ import streamlit as st
+
+ from src import bible_loader
+ from src.embeddings import EmbeddingsManager
+ from src.reranker import (
+     CombinedScoreAndNumberReranker,
+     MaxVerseReranker,
+     Reranker,
+     SemanticSimScoreReranker,
+ )
+ from src.retriever import Retriever, SemanticRetriever
+
+
+ def display_chapter(chapter):
+     st.header(f"[{str(chapter)}]({chapter.get_biblegateway_url()})")
+     chapter_text = chapter.get_formatted_text()
+     st.markdown(chapter_text, unsafe_allow_html=True)
+     # st.write(chapter.highlight_verses_df)
+
+
+ def config():
+     n_results = st.sidebar.slider("Maximum Results?", 5, 30, 10)
+     # bible_version = st.sidebar.selectbox("Bible Version", ["NIV", "ESV"])  # TODO
+     bible_version = "NIV"
+     new_testament = st.sidebar.checkbox("Search New Testament?", True)
+     old_testament = st.sidebar.checkbox("Search Old Testament?", False)
+
+     return n_results, new_testament, old_testament, bible_version
+
+
+ def main():
+
+     st.set_page_config(page_title="Bible Search", layout="wide")
+
+     n_results, new_testament, old_testament, bible_version = config()
+
+     # Config
+     ROOT_DIR = Path(os.path.abspath(os.path.dirname(__file__)))
+     DATA_DIR = ROOT_DIR / "data"
+
+     n_candidates = n_results * 2
+     metadata_csv = DATA_DIR / "key_english.csv"
+     verses_csv = DATA_DIR / f"{bible_version}.csv"
+
+     semantic_sim_model = "msmarco-distilbert-base-v4"
+
+     # Initialize / Index
+     bible_df = bible_loader.load_bible(metadata_csv, verses_csv)
+     embeddings_manager = EmbeddingsManager(
+         model_name=semantic_sim_model,
+         bible_version=bible_version,
+         embeddings_cache_dir=DATA_DIR,
+         texts=bible_df["text"].tolist(),
+     )
+
+     # Trim down the search space if needed
+     if not new_testament:
+         bible_df = bible_df[bible_df["testament"] != "NT"]
+     if not old_testament:
+         bible_df = bible_df[bible_df["testament"] != "OT"]
+
+     # Initialize the retriever and reranker based on the filtered texts
+     retriever = SemanticRetriever(bible_df, embeddings_manager)
+     reranker = CombinedScoreAndNumberReranker()
+     # reranker = SemanticSimScoreReranker()
+     # reranker = MaxVerseReranker()
+
+     _, main_col, _ = st.columns([1, 2, 1])
+
+     with main_col:
+
+         # Get user input
+         st.title("Verse Similarity Search")
+         st.markdown(
+             "- Have you ever been stumped by a verse and wondered what related things the Bible says about it?\n"
+             "- Or do you have a verse of interest and simply want to find related ones?\n"
+             "- Or do you vaguely recall a verse's idea, but can't remember the exact text?\n\n"
+             "This tool was made just for that!"
+         )
+
+         st.markdown("---")
+
+         demo_query = st.selectbox(
+             "Try some demo queries...",
+             [
+                 "",
+                 "For God so loved the world that he gave his one and only Son, that whoever believes in him shall not perish but have eternal life.",
+                 "In the same way, faith by itself, if it is not accompanied by action, is dead.",
+                 "I tell you the truth, no one can enter the kingdom of God unless he is born of water and the Spirit.",
+                 "the Lord is patient with us, not wanting us to perish",
+                 "is it ok for believers to continue in sin?",
+                 "it is possible to resist every temptation",
+                 "heavenly rewards",
+                 "the old is gone, the new has come",
+                 "suffering for Christ",
+                 "rejoicing in trials",
+                 "Be careful of false prophets, wolves in sheep skin",
+                 "will there be marriage in heaven?",
+             ],
+             index=1,
+         )
+
+         query = st.text_area(
+             "Or type a verse's text here to find similar verses",
+             demo_query if demo_query.strip() else "",
+         )
+
+         clicked_search = st.button("Search", type="primary")
+
+         if query or clicked_search:
+
+             if len(bible_df) == 0:
+                 st.markdown(
+                     "---\n:red[Please select at least one testament to search through (left-hand side of the screen). :)]"
+                 )
+             else:
+                 with st.spinner("Searching..."):
+
+                     start = time.time()
+
+                     # Retrieve and re-rank
+                     candidate_chapters = retriever.retrieve(query, n=n_candidates)
+                     candidate_chapters = reranker.rerank(candidate_chapters)
+
+                     # Trim because there can be more candidates than desired results
+                     final_chapter_results = candidate_chapters[:n_results]
+
+                     # Display quick stats
+                     st.markdown(
+                         f"_{len(final_chapter_results)} results found in {time.time() - start:.2f}s_"
+                     )
+                     st.markdown("---")
+
+                     # Display results
+                     for chapter in final_chapter_results:
+                         display_chapter(chapter)
+                         st.markdown("---")
+
+
+ if __name__ == "__main__":
+     main()
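Note: the retrieve-then-rerank flow in `main()` can also be exercised outside Streamlit, e.g. from a quick script or notebook. A minimal sketch assuming the repo layout and data files above (the query string is only an example; `load_bible` still carries the `st.cache` decorator, so Streamlit will print a cache warning when run this way):

```python
from pathlib import Path

from src import bible_loader
from src.embeddings import EmbeddingsManager
from src.reranker import CombinedScoreAndNumberReranker
from src.retriever import SemanticRetriever

DATA_DIR = Path("data")

# Load verses and (cached) embeddings, mirroring main()
bible_df = bible_loader.load_bible(DATA_DIR / "key_english.csv", DATA_DIR / "NIV.csv")
embeddings_manager = EmbeddingsManager(
    model_name="msmarco-distilbert-base-v4",
    bible_version="NIV",
    embeddings_cache_dir=DATA_DIR,
    texts=bible_df["text"].tolist(),
)

# Retrieve chapter candidates for a query, then re-rank and show the top 10
retriever = SemanticRetriever(bible_df, embeddings_manager)
reranker = CombinedScoreAndNumberReranker()
chapters = reranker.rerank(retriever.retrieve("rejoicing in trials", n=20))
for chapter in chapters[:10]:
    print(chapter, "-", chapter.get_biblegateway_url())
```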
data/NIV.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/key_english.csv ADDED
@@ -0,0 +1,67 @@
+ b,n,t,g
+ 1,Genesis,OT,1
+ 2,Exodus,OT,1
+ 3,Leviticus,OT,1
+ 4,Numbers,OT,1
+ 5,Deuteronomy,OT,1
+ 6,Joshua,OT,2
+ 7,Judges,OT,2
+ 8,Ruth,OT,2
+ 9,1 Samuel,OT,2
+ 10,2 Samuel,OT,2
+ 11,1 Kings,OT,2
+ 12,2 Kings,OT,2
+ 13,1 Chronicles,OT,2
+ 14,2 Chronicles,OT,2
+ 15,Ezra,OT,2
+ 16,Nehemiah,OT,2
+ 17,Esther,OT,2
+ 18,Job,OT,3
+ 19,Psalms,OT,3
+ 20,Proverbs,OT,3
+ 21,Ecclesiastes,OT,3
+ 22,Song of Solomon,OT,3
+ 23,Isaiah,OT,4
+ 24,Jeremiah,OT,4
+ 25,Lamentations,OT,4
+ 26,Ezekiel,OT,4
+ 27,Daniel,OT,4
+ 28,Hosea,OT,4
+ 29,Joel,OT,4
+ 30,Amos,OT,4
+ 31,Obadiah,OT,4
+ 32,Jonah,OT,4
+ 33,Micah,OT,4
+ 34,Nahum,OT,4
+ 35,Habakkuk,OT,4
+ 36,Zephaniah,OT,4
+ 37,Haggai,OT,4
+ 38,Zechariah,OT,4
+ 39,Malachi,OT,4
+ 40,Matthew,NT,5
+ 41,Mark,NT,5
+ 42,Luke,NT,5
+ 43,John,NT,5
+ 44,Acts,NT,6
+ 45,Romans,NT,7
+ 46,1 Corinthians,NT,7
+ 47,2 Corinthians,NT,7
+ 48,Galatians,NT,7
+ 49,Ephesians,NT,7
+ 50,Philippians,NT,7
+ 51,Colossians,NT,7
+ 52,1 Thessalonians,NT,7
+ 53,2 Thessalonians,NT,7
+ 54,1 Timothy,NT,7
+ 55,2 Timothy,NT,7
+ 56,Titus,NT,7
+ 57,Philemon,NT,7
+ 58,Hebrews,NT,7
+ 59,James,NT,7
+ 60,1 Peter,NT,7
+ 61,2 Peter,NT,7
+ 62,1 John,NT,7
+ 63,2 John,NT,7
+ 64,3 John,NT,7
+ 65,Jude,NT,7
+ 66,Revelation,NT,8
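The column names are terse; based on how `src/bible_loader.py` consumes this file, `b` is the book number (the merge key), `n` the book name, `t` the testament, and `g` looks like a genre/section group that the app does not use. A small sketch of inspecting it, assuming pandas:

```python
import pandas as pd

# b = book number (merge key), n = book name, t = testament, g = apparent genre/group id (unused by the app)
metadata_df = pd.read_csv("data/key_english.csv")
print(metadata_df.head())
print(metadata_df["t"].value_counts())  # 39 OT books, 27 NT books
```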
requirements.txt ADDED
@@ -0,0 +1,255 @@
+ #
+ # This file is autogenerated by pip-compile with Python 3.7
+ # by the following command:
+ #
+ #    pip-compile --output-file=requirements.txt requirements.in
+ #
+ altair==4.2.0
+     # via streamlit
+ attrs==22.1.0
+     # via jsonschema
+ backports-zoneinfo==0.2.1
+     # via
+     #   pytz-deprecation-shim
+     #   tzlocal
+ black==22.12.0
+     # via -r requirements.in
+ blinker==1.5
+     # via streamlit
+ cachetools==5.2.0
+     # via streamlit
+ certifi==2022.12.7
+     # via requests
+ charset-normalizer==2.1.1
+     # via requests
+ click==8.0.4
+     # via
+     #   black
+     #   nltk
+     #   streamlit
+ decorator==5.1.1
+     # via validators
+ entrypoints==0.4
+     # via altair
+ filelock==3.8.2
+     # via
+     #   huggingface-hub
+     #   transformers
+ gitdb==4.0.10
+     # via gitpython
+ gitpython==3.1.29
+     # via streamlit
+ h5py==3.7.0
+     # via -r requirements.in
+ huggingface-hub==0.11.1
+     # via
+     #   sentence-transformers
+     #   transformers
+ idna==3.4
+     # via requests
+ importlib-metadata==5.1.0
+     # via
+     #   click
+     #   huggingface-hub
+     #   jsonschema
+     #   streamlit
+     #   transformers
+ importlib-resources==5.10.1
+     # via jsonschema
+ isort==5.11.4
+     # via -r requirements.in
+ jinja2==3.1.2
+     # via
+     #   altair
+     #   pydeck
+ joblib==1.2.0
+     # via
+     #   nltk
+     #   scikit-learn
+ jsonschema==4.17.3
+     # via altair
+ loguru==0.6.0
+     # via -r requirements.in
+ markdown-it-py==2.1.0
+     # via rich
+ markupsafe==2.1.1
+     # via jinja2
+ mdurl==0.1.2
+     # via markdown-it-py
+ mypy-extensions==0.4.3
+     # via black
+ nltk==3.8
+     # via sentence-transformers
+ numpy==1.21.6
+     # via
+     #   -r requirements.in
+     #   altair
+     #   h5py
+     #   pandas
+     #   pyarrow
+     #   pydeck
+     #   scikit-learn
+     #   scipy
+     #   sentence-transformers
+     #   sparse-dot-topn
+     #   streamlit
+     #   torchvision
+     #   transformers
+ nvidia-cublas-cu11==11.10.3.66
+     # via
+     #   nvidia-cudnn-cu11
+     #   torch
+ nvidia-cuda-nvrtc-cu11==11.7.99
+     # via torch
+ nvidia-cuda-runtime-cu11==11.7.99
+     # via torch
+ nvidia-cudnn-cu11==8.5.0.96
+     # via torch
+ packaging==22.0
+     # via
+     #   huggingface-hub
+     #   streamlit
+     #   transformers
+ pandas==1.3.5
+     # via
+     #   -r requirements.in
+     #   altair
+     #   streamlit
+ pathspec==0.10.3
+     # via black
+ pillow==9.3.0
+     # via
+     #   streamlit
+     #   torchvision
+ pkgutil-resolve-name==1.3.10
+     # via jsonschema
+ platformdirs==2.6.0
+     # via black
+ protobuf==3.19.6
+     # via
+     #   -r requirements.in
+     #   streamlit
+ pyarrow==10.0.1
+     # via streamlit
+ pydantic==1.10.2
+     # via -r requirements.in
+ pydeck==0.8.0
+     # via streamlit
+ pygments==2.14.0
+     # via rich
+ pympler==1.0.1
+     # via streamlit
+ pyrsistent==0.19.2
+     # via jsonschema
+ python-dateutil==2.8.2
+     # via
+     #   pandas
+     #   streamlit
+ pytz==2022.6
+     # via pandas
+ pytz-deprecation-shim==0.1.0.post0
+     # via tzlocal
+ pyyaml==6.0
+     # via
+     #   huggingface-hub
+     #   transformers
+ regex==2022.10.31
+     # via
+     #   nltk
+     #   transformers
+ requests==2.28.1
+     # via
+     #   huggingface-hub
+     #   streamlit
+     #   torchvision
+     #   transformers
+ rich==13.3.1
+     # via streamlit
+ scikit-learn==1.0.2
+     # via sentence-transformers
+ scipy==1.7.3
+     # via
+     #   scikit-learn
+     #   sentence-transformers
+     #   sparse-dot-topn
+ semver==2.13.0
+     # via streamlit
+ sentence-transformers==2.2.2
+     # via -r requirements.in
+ sentencepiece==0.1.91
+     # via sentence-transformers
+ six==1.16.0
+     # via python-dateutil
+ sklearn==0.0.post1
+     # via -r requirements.in
+ smmap==5.0.0
+     # via gitdb
+ sparse-dot-topn==0.3.3
+     # via -r requirements.in
+ streamlit==1.17.0
+     # via -r requirements.in
+ threadpoolctl==3.1.0
+     # via scikit-learn
+ tokenizers==0.13.2
+     # via transformers
+ toml==0.10.2
+     # via streamlit
+ tomli==2.0.1
+     # via black
+ toolz==0.12.0
+     # via altair
+ torch==1.13.1
+     # via
+     #   -r requirements.in
+     #   sentence-transformers
+     #   torchvision
+ torchvision==0.14.1
+     # via sentence-transformers
+ tornado==6.2
+     # via streamlit
+ tqdm==4.64.1
+     # via
+     #   huggingface-hub
+     #   nltk
+     #   sentence-transformers
+     #   transformers
+ transformers==4.25.1
+     # via
+     #   -r requirements.in
+     #   sentence-transformers
+ typed-ast==1.5.4
+     # via black
+ typing-extensions==4.4.0
+     # via
+     #   black
+     #   gitpython
+     #   huggingface-hub
+     #   importlib-metadata
+     #   jsonschema
+     #   markdown-it-py
+     #   pydantic
+     #   rich
+     #   streamlit
+     #   torch
+     #   torchvision
+ tzdata==2022.7
+     # via pytz-deprecation-shim
+ tzlocal==4.2
+     # via streamlit
+ urllib3==1.26.13
+     # via requests
+ validators==0.20.0
+     # via streamlit
+ watchdog==2.2.0
+     # via streamlit
+ wheel==0.38.4
+     # via
+     #   nvidia-cublas-cu11
+     #   nvidia-cuda-runtime-cu11
+ zipp==3.11.0
+     # via
+     #   importlib-metadata
+     #   importlib-resources
+
+ # The following packages are considered to be unsafe in a requirements file:
+ # setuptools
src/__pycache__/bible_loader.cpython-37.pyc ADDED
Binary file (1.01 kB). View file
 
src/__pycache__/embeddings.cpython-37.pyc ADDED
Binary file (1.91 kB). View file
 
src/__pycache__/models.cpython-37.pyc ADDED
Binary file (1.69 kB). View file
 
src/__pycache__/reranker.cpython-37.pyc ADDED
Binary file (3.43 kB). View file
 
src/__pycache__/retriever.cpython-37.pyc ADDED
Binary file (4.82 kB). View file
 
src/bible_loader.py ADDED
@@ -0,0 +1,37 @@
+ import pandas as pd
+ import streamlit as st
+ from loguru import logger
+
+
+ @st.cache()
+ def load_bible(metadata_csv, verses_csv):
+     # There is one constant metadata file (metadata_csv),
+     # and another CSV file containing the actual verses in the specified version (verses_csv).
+     metadata_df = pd.read_csv(metadata_csv)
+     verses_df = pd.read_csv(verses_csv, escapechar="\\")
+     df = pd.merge(verses_df, metadata_df, on="b")
+     df = df.fillna("")  # Some verses are blank in some versions
+
+     df = df[["n", "c", "v", "t_x", "t_y"]]
+
+     # The data sources use terse column names by convention.
+     # Rename them here for readability.
+     col_rename = {
+         "t_y": "testament",
+         "n": "book",
+         "c": "chapter",
+         "v": "verse",
+         "t_x": "text",
+     }
+     df = df.rename(columns=col_rename)
+
+     # Create a human-friendly string identifying a verse (e.g. Genesis 1:1)
+     df["source"] = df.apply(
+         lambda row: f"{row['book']} {row['chapter']}:{row['verse']}", axis=1
+     )
+
+     logger.info(
+         f"Successfully loaded Bible DF with {len(df):,} rows. Columns: {df.columns.tolist()}"
+     )
+
+     return df
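For reference, the merge above implies the two input CSVs share the `b` (book number) key, with the verses CSV (e.g. `data/NIV.csv`) supplying `c` (chapter), `v` (verse), and `t` (text), and the metadata CSV supplying `n` (book name) and `t` (testament); the duplicate `t` columns become `t_x`/`t_y` after the merge. A tiny sketch with toy stand-in frames (illustration only):

```python
import pandas as pd

# Toy stand-ins for data/key_english.csv and data/NIV.csv (the real files have many more rows)
metadata_df = pd.DataFrame({"b": [1], "n": ["Genesis"], "t": ["OT"], "g": [1]})
verses_df = pd.DataFrame({"b": [1], "c": [1], "v": [1], "t": ["In the beginning God created..."]})

df = pd.merge(verses_df, metadata_df, on="b")  # duplicate "t" columns become t_x (text) and t_y (testament)
df = df.rename(columns={"t_y": "testament", "n": "book", "c": "chapter", "v": "verse", "t_x": "text"})
df["source"] = df["book"] + " " + df["chapter"].astype(str) + ":" + df["verse"].astype(str)
print(df[["source", "testament", "text"]])
```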
src/embeddings.py ADDED
@@ -0,0 +1,60 @@
+ import os
+ import traceback
+
+ import h5py
+ import numpy as np
+ from loguru import logger
+ from sentence_transformers import SentenceTransformer
+
+
+ class EmbeddingsManager:
+     def __init__(self, model_name, bible_version, texts, embeddings_cache_dir) -> None:
+
+         # Load the embeddings model
+         self.model = SentenceTransformer(model_name)
+
+         # Load or generate embeddings based on the corpus
+         sanitized_model_name = model_name.replace("\\", "-").replace("/", "-")
+         self.cache_filename = f"{bible_version}_{sanitized_model_name}.h5"
+         self.emb_cache_filepath = os.path.join(
+             embeddings_cache_dir, self.cache_filename
+         )
+
+         # Load the cached embeddings if they exist
+         try:
+             with h5py.File(self.emb_cache_filepath, "r") as h:
+                 self.embeddings = np.array(h["embeddings"])
+         except Exception:
+             traceback.print_exc()
+             # If they don't, generate the embeddings and save them to a file
+             logger.info(
+                 f"Generating embeddings and saving to {self.emb_cache_filepath}"
+             )
+             self.embeddings = self.model.encode(texts)
+             with h5py.File(self.emb_cache_filepath, "w") as f:
+                 f.create_dataset("embeddings", data=self.embeddings)
+
+         # Create a look-up dict to quickly retrieve embeddings of texts
+         self.text_emb_dict = {}
+         for text, embedding in zip(texts, self.embeddings):
+             self.text_emb_dict[text] = embedding
+
+         logger.info(
+             f"Successfully loaded {model_name} embeddings for {bible_version} from {self.emb_cache_filepath}."
+         )
+
+     def get_embeddings(self, texts):
+         embeddings = []
+         for text in texts:
+             if text not in self.text_emb_dict:
+                 self.text_emb_dict[text] = self.model.encode([text])[0]
+             embeddings.append(self.text_emb_dict[text])
+         return embeddings
+
+     def __str__(self):
+         return self.emb_cache_filepath
+
+
+ def score_semantic_similarity(query, texts_df):
+     """Returns a copy of texts_df with semantic similarity scores."""
+     pass
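A minimal usage sketch of `EmbeddingsManager` (the corpus and cache directory below are placeholders):

```python
from src.embeddings import EmbeddingsManager

texts = ["For God so loved the world...", "Jesus wept."]  # placeholder corpus

# The first run encodes the texts and writes NIV_msmarco-distilbert-base-v4.h5 into the cache dir;
# subsequent runs load the embeddings back from that file instead of re-encoding.
manager = EmbeddingsManager(
    model_name="msmarco-distilbert-base-v4",
    bible_version="NIV",
    texts=texts,
    embeddings_cache_dir="data",
)

# Unseen texts (e.g. user queries) are encoded on the fly and memoized in text_emb_dict
query_embedding = manager.get_embeddings(["rejoicing in trials"])[0]
print(query_embedding.shape)
```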
src/models.py ADDED
@@ -0,0 +1,36 @@
+ import urllib.parse
+
+ import pandas as pd
+ from pydantic import BaseModel
+
+
+ class Chapter(BaseModel):
+     book_name: str
+     chapter_num: int
+     verses_df: pd.DataFrame
+     highlight_verses_df: pd.DataFrame
+
+     class Config:
+         arbitrary_types_allowed = True
+
+     def __str__(self) -> str:
+         return f"{self.book_name} {self.chapter_num}"
+
+     def get_formatted_text(self):
+
+         # Construct the chapter text, highlighting the matched verses
+         texts = []
+         for _, row in self.verses_df.iterrows():
+             text = row["text"]
+             if text in self.highlight_verses_df["text"].tolist():
+                 text = f"**:green[{text}]**"
+             text = f"<sup>{row['verse']}</sup> {text}"
+             texts.append(text)
+         chapter_text = " ".join(texts)
+         return chapter_text
+
+     def get_biblegateway_url(self, version="NIV"):
+         return f"https://www.biblegateway.com/passage/?search={urllib.parse.quote(self.book_name)}+{self.chapter_num}&version={version}"
+
+     def get_num_unique_highlight_verse(self):
+         return len(self.highlight_verses_df.drop_duplicates(subset="text"))
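For illustration, a `Chapter` can be constructed directly from small dataframes (toy data below); `get_formatted_text()` prefixes each verse number as a `<sup>` tag and wraps matched verses in Streamlit's `**:green[...]**` highlight markup:

```python
import pandas as pd

from src.models import Chapter

verses_df = pd.DataFrame(
    {
        "verse": [1, 2],
        "text": [
            "In the beginning God created the heavens and the earth.",
            "Now the earth was formless and empty...",
        ],
    }
)
highlight_verses_df = verses_df.iloc[[0]]  # pretend verse 1 matched the query

chapter = Chapter(
    book_name="Genesis",
    chapter_num=1,
    verses_df=verses_df,
    highlight_verses_df=highlight_verses_df,
)
print(str(chapter))                    # Genesis 1
print(chapter.get_biblegateway_url())  # BibleGateway link for Genesis 1 (NIV by default)
print(chapter.get_formatted_text())    # verse 1 highlighted, both verses prefixed with <sup> numbers
```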
src/reranker.py ADDED
@@ -0,0 +1,69 @@
+ from typing import List
+
+ import numpy as np
+ import streamlit as st
+
+ from src.models import Chapter
+
+
+ class Reranker:
+     def rerank(self, chapters: List[Chapter]) -> List[Chapter]:
+         # TODO
+         return chapters
+
+
+ # Rerankers applicable to SemanticRetriever results
+
+
+ def sort_chapters(chapters, scores):
+     reranked_chapters = sorted(zip(chapters, scores), key=lambda x: x[1], reverse=True)
+     reranked_chapters = [x[0] for x in reranked_chapters]
+     return reranked_chapters
+
+
+ class CombinedScoreAndNumberReranker(Reranker):
+     def __init__(self, num_verse_weight=0.3, semantic_sim_weight=0.7):
+         self.num_verse_weight = num_verse_weight
+         self.semantic_sim_weight = semantic_sim_weight
+
+     def rerank(self, chapters: List[Chapter]) -> List[Chapter]:
+         num_verse_score = compute_num_verse_scores(chapters)
+         max_sem_sim_score = compute_sem_sim_scores(chapters)
+
+         final_scores = (
+             self.num_verse_weight * num_verse_score
+             + self.semantic_sim_weight * max_sem_sim_score
+         )
+         return sort_chapters(chapters, final_scores)
+
+
+ class SemanticSimScoreReranker(Reranker):
+     def rerank(self, chapters: List[Chapter]) -> List[Chapter]:
+         sem_sim_scores = np.array(
+             [chapter.highlight_verses_df["score"].max() for chapter in chapters]
+         )
+         return sort_chapters(chapters, sem_sim_scores)
+
+
+ class MaxVerseReranker(Reranker):
+     def rerank(self, chapters: List[Chapter]) -> List[Chapter]:
+
+         num_verses = [chapter.get_num_unique_highlight_verse() for chapter in chapters]
+
+         return sort_chapters(chapters, num_verses)
+
+
+ def compute_num_verse_scores(chapters):
+     num_verses = np.array(
+         [chapter.get_num_unique_highlight_verse() for chapter in chapters]
+     )
+     max_verses = max(num_verses)
+     num_verse_scores = num_verses / max_verses
+     return num_verse_scores
+
+
+ def compute_sem_sim_scores(chapters):
+     sem_sim_scores = np.array(
+         [chapter.highlight_verses_df["score"].max() for chapter in chapters]
+     )
+     return sem_sim_scores
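The combined reranker's score is a weighted sum of (a) the chapter's unique matched-verse count, normalized by the maximum count across candidates, and (b) the chapter's best per-verse semantic-similarity score. A small numeric illustration of that formula (the numbers are made up):

```python
import numpy as np

num_verse_weight, semantic_sim_weight = 0.3, 0.7  # defaults above

# Two candidate chapters: unique matched verses and best per-verse similarity score
num_verses = np.array([3, 1])
max_sem_sim = np.array([0.55, 0.80])

num_verse_scores = num_verses / num_verses.max()  # [1.0, 0.33...]
final_scores = num_verse_weight * num_verse_scores + semantic_sim_weight * max_sem_sim
print(final_scores)  # [0.685, 0.66] -> the 3-verse chapter edges out the higher-similarity one
```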
src/retriever.py ADDED
@@ -0,0 +1,162 @@
+ import abc
+ from typing import List
+
+ import numpy as np
+ import pandas as pd
+ import sklearn
+ import streamlit as st
+ from sentence_transformers.cross_encoder import CrossEncoder
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.preprocessing import MinMaxScaler
+ from sparse_dot_topn import awesome_cossim_topn
+
+ from src.models import Chapter
+
+
+ class Retriever:
+     @abc.abstractmethod
+     def retrieve(self, query, n=10) -> List[Chapter]:
+         pass
+
+
+ class SemanticRetriever:
+     def __init__(
+         self,
+         bible_df,
+         embeddings_manager,
+         threshold=0.4,
+         cross_encoder_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
+     ):
+         self.bible_df = bible_df
+         self.embeddings_manager = embeddings_manager
+         self.threshold = threshold
+         self.cross_encoder_model = (
+             CrossEncoder(cross_encoder_model) if cross_encoder_model else None
+         )
+
+         # 'cross-encoder/stsb-distilroberta-base'
+         # cross-encoder/ms-marco-MiniLM-L-12-v2
+
+     def retrieve(self, query, n=10) -> List[Chapter]:
+
+         verse_candidates_df = self.semantic_search(
+             query=query,
+             texts=self.bible_df["text"].tolist(),
+             embeddings_manager=self.embeddings_manager,
+             n=n * 2,
+             threshold=self.threshold,
+         )
+
+         if len(verse_candidates_df) == 0:
+             return []
+
+         if self.cross_encoder_model is not None:
+             verse_candidates_df = self.cross_encode(
+                 query, verse_candidates_df["text"].tolist()
+             )
+
+         # TODO: revisit this logic, as some verses can have the exact same text.
+         # For now, the workaround is to drop duplicates.
+         verse_candidates_df.drop_duplicates(subset="text", inplace=True)
+
+         # Join back the verse metadata
+         verse_candidates_df = pd.merge(
+             verse_candidates_df, self.bible_df, how="left", on="text"
+         )
+         # DEBUG
+         # st.write(verse_candidates_df)
+
+         chapter_candidates = self.extract_chapters_from_verses(
+             self.bible_df, verse_candidates_df
+         )
+         return chapter_candidates
+
+     def cross_encode(self, query, texts):
+         combinations = [[query, text] for text in texts]
+         sim_scores = self.cross_encoder_model.predict(combinations)
+         sim_scores = MinMaxScaler().fit_transform(sim_scores.reshape(-1, 1)).flatten()
+         reranked_texts_scores = sorted(
+             zip(texts, sim_scores), key=lambda x: x[1], reverse=True
+         )
+         df = pd.DataFrame(reranked_texts_scores, columns=["text", "score"])
+         return df
+
+     def semantic_search(self, query, texts, embeddings_manager, n=None, threshold=0):
+         embeddings = embeddings_manager.get_embeddings(texts)
+         query_embedding = embeddings_manager.get_embeddings([query])
+         sim_scores = sklearn.metrics.pairwise.cosine_similarity(
+             query_embedding, embeddings
+         )[0]
+
+         # results is a list of tuples: [(text, score)]
+         results = sorted(list(zip(texts, sim_scores)), key=lambda x: x[1], reverse=True)
+
+         # Take the top n only if specified
+         if n:
+             results = results[:n]
+
+         # Apply a threshold to filter out irrelevant results
+         if threshold:
+             results = [x for x in results if x[1] >= threshold]
+
+         df = pd.DataFrame(results, columns=["text", "score"])
+
+         return df
+
+     def extract_chapters_from_verses(self, bible_df, verse_results_df) -> List[Chapter]:
+         # Simple, naive assumption for now: just follow the order of first appearance,
+         # i.e. the per-verse scores dictate the order.
+         # TODO: Revisit ranking
+
+         # The goal here is to extract all the unique chapters based on the top verse results
+         verse_results_df = verse_results_df.copy()
+         verse_results_df["book_chapter"] = (
+             verse_results_df["book"] + " " + verse_results_df["chapter"].astype(str)
+         )
+         unique_chapters = verse_results_df["book_chapter"].unique()
+
+         bible_df = bible_df.copy()
+         bible_df["book_chapter"] = (
+             bible_df["book"] + " " + bible_df["chapter"].astype(str)
+         )
+
+         chapters = []
+         for unique_chapter in unique_chapters:
+             chapter_verses_df = bible_df[bible_df["book_chapter"] == unique_chapter]
+             book = chapter_verses_df["book"].tolist()[0]
+             chapter_num = chapter_verses_df["chapter"].tolist()[0]
+
+             # Keep track of the matched verses as highlight verses
+             highlight_verses_df = pd.merge(
+                 chapter_verses_df,
+                 verse_results_df[["text", "score", "book", "chapter"]],
+                 how="inner",
+                 on=["text", "book", "chapter"],
+             )
+
+             chapter = Chapter(
+                 book_name=book,
+                 chapter_num=chapter_num,
+                 verses_df=chapter_verses_df,
+                 highlight_verses_df=highlight_verses_df,
+             )
+
+             chapters.append(chapter)
+
+         return chapters
+
+
+ class TfIdfRetriever(Retriever):
+     def __init__(self, texts, preprocessors=[]) -> None:
+         self.vectorizer = TfidfVectorizer(analyzer="word", stop_words="english")
+         self.preprocessors = preprocessors
+         # TODO: pre-process the texts
+         self.tfidf_vectors = self.vectorizer.fit_transform(texts)
+         self.tfidf_vectors_transposed = self.tfidf_vectors.transpose()
+
+     def search(self, query, n=10):
+         query_tfidf_vector = self.vectorizer.transform([query])
+         results = awesome_cossim_topn(
+             query_tfidf_vector, self.tfidf_vectors_transposed, n, 0.01
+         )
+         return results
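The heart of `SemanticRetriever.semantic_search` is a plain cosine-similarity ranking over pre-computed verse embeddings, followed by a top-n cut and a score threshold. A standalone sketch of that step, assuming the same bi-encoder model and the default 0.4 threshold (the verse texts are placeholders):

```python
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer("msmarco-distilbert-base-v4")
texts = [
    "Consider it pure joy, my brothers, whenever you face trials of many kinds...",
    "Rejoice in the Lord always. I will say it again: Rejoice!",
    "In the beginning God created the heavens and the earth.",
]  # placeholder verses

embeddings = model.encode(texts)
query_embedding = model.encode(["rejoicing in trials"])

sim_scores = cosine_similarity(query_embedding, embeddings)[0]
results = sorted(zip(texts, sim_scores), key=lambda x: x[1], reverse=True)
results = [(text, score) for text, score in results[:2] if score >= 0.4]  # top-n cut, then threshold
print(results)
```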