Add app and data files
Files changed:
- app.py (+145)
- data/NIV.csv (diff too large to render)
- data/key_english.csv (+67)
- requirements.txt (+255)
- src/__pycache__/bible_loader.cpython-37.pyc (binary)
- src/__pycache__/embeddings.cpython-37.pyc (binary)
- src/__pycache__/models.cpython-37.pyc (binary)
- src/__pycache__/reranker.cpython-37.pyc (binary)
- src/__pycache__/retriever.cpython-37.pyc (binary)
- src/bible_loader.py (+37)
- src/embeddings.py (+60)
- src/models.py (+36)
- src/reranker.py (+69)
- src/retriever.py (+162)
app.py (ADDED, 145 lines)
```python
import os
import time
from pathlib import Path

import streamlit as st

from src import bible_loader
from src.embeddings import EmbeddingsManager
from src.reranker import (
    CombinedScoreAndNumberReranker,
    MaxVerseReranker,
    Reranker,
    SemanticSimScoreReranker,
)
from src.retriever import Retriever, SemanticRetriever


def display_chapter(chapter):
    st.header(f"[{str(chapter)}]({chapter.get_biblegateway_url()})")
    chapter_text = chapter.get_formatted_text()
    st.markdown(chapter_text, unsafe_allow_html=True)
    # st.write(chapter.highlight_verses_df)


def config():
    n_results = st.sidebar.slider("Maximum Results?", 5, 30, 10)
    # bible_version = st.sidebar.selectbox("Bible Version", ["NIV", "ESV"])  # TODO
    bible_version = "NIV"
    new_testament = st.sidebar.checkbox("Search New Testament?", True)
    old_testament = st.sidebar.checkbox("Search Old Testament?", False)

    return n_results, new_testament, old_testament, bible_version


def main():

    st.set_page_config(page_title="Bible Search", layout="wide")

    n_results, new_testament, old_testament, bible_version = config()

    # Config
    ROOT_DIR = Path(os.path.abspath(os.path.dirname(__file__)))
    DATA_DIR = ROOT_DIR / "data"

    n_candidates = n_results * 2
    metadata_csv = DATA_DIR / "key_english.csv"
    verses_csv = DATA_DIR / f"{bible_version}.csv"

    semantic_sim_model = "msmarco-distilbert-base-v4"

    # Initialize / Index
    bible_df = bible_loader.load_bible(metadata_csv, verses_csv)
    embeddings_manager = EmbeddingsManager(
        model_name=semantic_sim_model,
        bible_version=bible_version,
        embeddings_cache_dir=DATA_DIR,
        texts=bible_df["text"].tolist(),
    )

    # Trim down the search space if needed
    if not new_testament:
        bible_df = bible_df[bible_df["testament"] != "NT"]
    if not old_testament:
        bible_df = bible_df[bible_df["testament"] != "OT"]

    # Initialize the retriever and reranker based on the filtered texts
    retriever = SemanticRetriever(bible_df, embeddings_manager)
    reranker = CombinedScoreAndNumberReranker()
    # reranker = SemanticSimScoreReranker()
    # reranker = MaxVerseReranker()

    _, main_col, _ = st.columns([1, 2, 1])

    with main_col:

        # Get user input
        st.title("Verse Similarity Search")
        st.markdown(
            "- Have you ever been stumped by a verse and wondered what related things the Bible says about it?\n"
            "- Or you have a verse of interest and you simply want to find related ones?\n"
            "- Or you vaguely recall a verse's idea, but can't recall the exact text?\n"
            "\n"
            "This tool was made just for that!"
        )

        st.markdown("---")

        demo_query = st.selectbox(
            "Try some demo queries...",
            [
                "",
                "For God so loved the world that he gave his one and only Son, that whoever believes in him shall not perish but have eternal life.",
                "In the same way, faith by itself, if it is not accompanied by action, is dead.",
                "I tell you the truth, no one can enter the kingdom of God unless he is born of water and the Spirit.",
                "the Lord is patient with us, not wanting us to perish",
                "is it ok for believers to continue in sin?",
                "it is possible to resist every temptation",
                "heavenly rewards",
                "the old is gone, the new has come",
                "suffering for Christ",
                "rejoicing in trials",
                "Be careful of false prophets, wolves in sheep skin",
                "will there be marriage in heaven?",
            ],
            index=1,
        )

        query = st.text_area(
            "Or type a verse's text here to find similar verses",
            demo_query if demo_query.strip() else "",
        )

        clicked_search = st.button("Search", type="primary")

        if query or clicked_search:

            if len(bible_df) == 0:
                st.markdown(
                    "---\n:red[Please select at least one testament to search through (left-hand side of the screen). :)]"
                )
            else:
                with st.spinner("Searching..."):

                    start = time.time()

                    # Retrieve and re-rank
                    candidate_chapters = retriever.retrieve(query, n=n_candidates)
                    candidate_chapters = reranker.rerank(candidate_chapters)

                    # Trim, because there can be more candidates than desired results
                    final_chapter_results = candidate_chapters[:n_results]

                    # Display quick stats
                    st.markdown(
                        f"_{len(final_chapter_results)} results found in {time.time() - start:.2f}s_"
                    )
                    st.markdown("---")

                    # Display results
                    for chapter in final_chapter_results:
                        display_chapter(chapter)
                        st.markdown("---")


if __name__ == "__main__":
    main()
```
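To try the Space locally, the standard Streamlit workflow applies: install the pinned dependencies with `pip install -r requirements.txt`, then run `streamlit run app.py`. Since `ROOT_DIR` is derived from `__file__`, the `data/` paths resolve regardless of the working directory.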
data/NIV.csv (ADDED)

Diff too large to render.
data/key_english.csv (ADDED, 67 lines)
```csv
b,n,t,g
1,Genesis,OT,1
2,Exodus,OT,1
3,Leviticus,OT,1
4,Numbers,OT,1
5,Deuteronomy,OT,1
6,Joshua,OT,2
7,Judges,OT,2
8,Ruth,OT,2
9,1 Samuel,OT,2
10,2 Samuel,OT,2
11,1 Kings,OT,2
12,2 Kings,OT,2
13,1 Chronicles,OT,2
14,2 Chronicles,OT,2
15,Ezra,OT,2
16,Nehemiah,OT,2
17,Esther,OT,2
18,Job,OT,3
19,Psalms,OT,3
20,Proverbs,OT,3
21,Ecclesiastes,OT,3
22,Song of Solomon,OT,3
23,Isaiah,OT,4
24,Jeremiah,OT,4
25,Lamentations,OT,4
26,Ezekiel,OT,4
27,Daniel,OT,4
28,Hosea,OT,4
29,Joel,OT,4
30,Amos,OT,4
31,Obadiah,OT,4
32,Jonah,OT,4
33,Micah,OT,4
34,Nahum,OT,4
35,Habakkuk,OT,4
36,Zephaniah,OT,4
37,Haggai,OT,4
38,Zechariah,OT,4
39,Malachi,OT,4
40,Matthew,NT,5
41,Mark,NT,5
42,Luke,NT,5
43,John,NT,5
44,Acts,NT,6
45,Romans,NT,7
46,1 Corinthians,NT,7
47,2 Corinthians,NT,7
48,Galatians,NT,7
49,Ephesians,NT,7
50,Philippians,NT,7
51,Colossians,NT,7
52,1 Thessalonians,NT,7
53,2 Thessalonians,NT,7
54,1 Timothy,NT,7
55,2 Timothy,NT,7
56,Titus,NT,7
57,Philemon,NT,7
58,Hebrews,NT,7
59,James,NT,7
60,1 Peter,NT,7
61,2 Peter,NT,7
62,1 John,NT,7
63,2 John,NT,7
64,3 John,NT,7
65,Jude,NT,7
66,Revelation,NT,8
```
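The columns appear to follow the common SQL Bible-database key file convention: `b` is the book number, `n` the book name, `t` the testament, and `g` a genre/section grouping (1 covering the Law, 5 the Gospels, and so on). `bible_loader.py` merges on `b` and keeps only the name and testament.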
requirements.txt (ADDED, 255 lines)
```text
#
# This file is autogenerated by pip-compile with Python 3.7
# by the following command:
#
#    pip-compile --output-file=requirements.txt requirements.in
#
altair==4.2.0
    # via streamlit
attrs==22.1.0
    # via jsonschema
backports-zoneinfo==0.2.1
    # via
    #   pytz-deprecation-shim
    #   tzlocal
black==22.12.0
    # via -r requirements.in
blinker==1.5
    # via streamlit
cachetools==5.2.0
    # via streamlit
certifi==2022.12.7
    # via requests
charset-normalizer==2.1.1
    # via requests
click==8.0.4
    # via
    #   black
    #   nltk
    #   streamlit
decorator==5.1.1
    # via validators
entrypoints==0.4
    # via altair
filelock==3.8.2
    # via
    #   huggingface-hub
    #   transformers
gitdb==4.0.10
    # via gitpython
gitpython==3.1.29
    # via streamlit
h5py==3.7.0
    # via -r requirements.in
huggingface-hub==0.11.1
    # via
    #   sentence-transformers
    #   transformers
idna==3.4
    # via requests
importlib-metadata==5.1.0
    # via
    #   click
    #   huggingface-hub
    #   jsonschema
    #   streamlit
    #   transformers
importlib-resources==5.10.1
    # via jsonschema
isort==5.11.4
    # via -r requirements.in
jinja2==3.1.2
    # via
    #   altair
    #   pydeck
joblib==1.2.0
    # via
    #   nltk
    #   scikit-learn
jsonschema==4.17.3
    # via altair
loguru==0.6.0
    # via -r requirements.in
markdown-it-py==2.1.0
    # via rich
markupsafe==2.1.1
    # via jinja2
mdurl==0.1.2
    # via markdown-it-py
mypy-extensions==0.4.3
    # via black
nltk==3.8
    # via sentence-transformers
numpy==1.21.6
    # via
    #   -r requirements.in
    #   altair
    #   h5py
    #   pandas
    #   pyarrow
    #   pydeck
    #   scikit-learn
    #   scipy
    #   sentence-transformers
    #   sparse-dot-topn
    #   streamlit
    #   torchvision
    #   transformers
nvidia-cublas-cu11==11.10.3.66
    # via
    #   nvidia-cudnn-cu11
    #   torch
nvidia-cuda-nvrtc-cu11==11.7.99
    # via torch
nvidia-cuda-runtime-cu11==11.7.99
    # via torch
nvidia-cudnn-cu11==8.5.0.96
    # via torch
packaging==22.0
    # via
    #   huggingface-hub
    #   streamlit
    #   transformers
pandas==1.3.5
    # via
    #   -r requirements.in
    #   altair
    #   streamlit
pathspec==0.10.3
    # via black
pillow==9.3.0
    # via
    #   streamlit
    #   torchvision
pkgutil-resolve-name==1.3.10
    # via jsonschema
platformdirs==2.6.0
    # via black
protobuf==3.19.6
    # via
    #   -r requirements.in
    #   streamlit
pyarrow==10.0.1
    # via streamlit
pydantic==1.10.2
    # via -r requirements.in
pydeck==0.8.0
    # via streamlit
pygments==2.14.0
    # via rich
pympler==1.0.1
    # via streamlit
pyrsistent==0.19.2
    # via jsonschema
python-dateutil==2.8.2
    # via
    #   pandas
    #   streamlit
pytz==2022.6
    # via pandas
pytz-deprecation-shim==0.1.0.post0
    # via tzlocal
pyyaml==6.0
    # via
    #   huggingface-hub
    #   transformers
regex==2022.10.31
    # via
    #   nltk
    #   transformers
requests==2.28.1
    # via
    #   huggingface-hub
    #   streamlit
    #   torchvision
    #   transformers
rich==13.3.1
    # via streamlit
scikit-learn==1.0.2
    # via sentence-transformers
scipy==1.7.3
    # via
    #   scikit-learn
    #   sentence-transformers
    #   sparse-dot-topn
semver==2.13.0
    # via streamlit
sentence-transformers==2.2.2
    # via -r requirements.in
sentencepiece==0.1.91
    # via sentence-transformers
six==1.16.0
    # via python-dateutil
sklearn==0.0.post1
    # via -r requirements.in
smmap==5.0.0
    # via gitdb
sparse-dot-topn==0.3.3
    # via -r requirements.in
streamlit==1.17.0
    # via -r requirements.in
threadpoolctl==3.1.0
    # via scikit-learn
tokenizers==0.13.2
    # via transformers
toml==0.10.2
    # via streamlit
tomli==2.0.1
    # via black
toolz==0.12.0
    # via altair
torch==1.13.1
    # via
    #   -r requirements.in
    #   sentence-transformers
    #   torchvision
torchvision==0.14.1
    # via sentence-transformers
tornado==6.2
    # via streamlit
tqdm==4.64.1
    # via
    #   huggingface-hub
    #   nltk
    #   sentence-transformers
    #   transformers
transformers==4.25.1
    # via
    #   -r requirements.in
    #   sentence-transformers
typed-ast==1.5.4
    # via black
typing-extensions==4.4.0
    # via
    #   black
    #   gitpython
    #   huggingface-hub
    #   importlib-metadata
    #   jsonschema
    #   markdown-it-py
    #   pydantic
    #   rich
    #   streamlit
    #   torch
    #   torchvision
tzdata==2022.7
    # via pytz-deprecation-shim
tzlocal==4.2
    # via streamlit
urllib3==1.26.13
    # via requests
validators==0.20.0
    # via streamlit
watchdog==2.2.0
    # via streamlit
wheel==0.38.4
    # via
    #   nvidia-cublas-cu11
    #   nvidia-cuda-runtime-cu11
zipp==3.11.0
    # via
    #   importlib-metadata
    #   importlib-resources

# The following packages are considered to be unsafe in a requirements file:
# setuptools
```
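This file is regenerated from `requirements.in` with the `pip-compile` command shown in its header; consumers only need `pip install -r requirements.txt`. Note the Python 3.7-era pins (e.g. `numpy==1.21.6`, `typed-ast`), which match the `cpython-37` bytecode files below.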
src/__pycache__/bible_loader.cpython-37.pyc (ADDED, binary, 1.01 kB)
src/__pycache__/embeddings.cpython-37.pyc (ADDED, binary, 1.91 kB)
src/__pycache__/models.cpython-37.pyc (ADDED, binary, 1.69 kB)
src/__pycache__/reranker.cpython-37.pyc (ADDED, binary, 3.43 kB)
src/__pycache__/retriever.cpython-37.pyc (ADDED, binary, 4.82 kB)
src/bible_loader.py (ADDED, 37 lines)
```python
import pandas as pd
import streamlit as st
from loguru import logger


@st.cache()
def load_bible(metadata_csv, verses_csv):
    # There is one constant metadata file (metadata_csv),
    # and another CSV file containing the actual verses in the specified version (verses_csv).
    metadata_df = pd.read_csv(metadata_csv)
    verses_df = pd.read_csv(verses_csv, escapechar="\\")
    df = pd.merge(verses_df, metadata_df, on="b")
    df = df.fillna("")  # Some verses are blank in some versions

    df = df[["n", "c", "v", "t_x", "t_y"]]

    # The data sources used have this convention in the columns.
    # Renaming them here for ease of remembrance.
    col_rename = {
        "t_y": "testament",
        "n": "book",
        "c": "chapter",
        "v": "verse",
        "t_x": "text",
    }
    df = df.rename(columns=col_rename)

    # Create a human-friendly string specifying a verse (e.g. Genesis 1:1)
    df["source"] = df.apply(
        lambda row: f"{row['book']} {row['chapter']}:{row['verse']}", axis=1
    )

    logger.info(
        f"Successfully loaded Bible DF with {len(df):,} rows. Columns: {df.columns.tolist()}"
    )

    return df
```
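As a quick sanity check, the loader can be exercised on its own (a hypothetical snippet; paths assume the repository root as the working directory):

```python
# Hypothetical standalone check of load_bible (run from the repo root).
from src import bible_loader

bible_df = bible_loader.load_bible("data/key_english.csv", "data/NIV.csv")
print(bible_df.columns.tolist())
# ['book', 'chapter', 'verse', 'text', 'testament', 'source']
print(bible_df["source"].iloc[0])  # e.g. "Genesis 1:1"
```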
src/embeddings.py (ADDED, 60 lines)
```python
import os
import traceback

import h5py
import numpy as np
from loguru import logger
from sentence_transformers import SentenceTransformer


class EmbeddingsManager:
    def __init__(self, model_name, bible_version, texts, embeddings_cache_dir) -> None:

        # Load the embeddings model
        self.model = SentenceTransformer(model_name)

        # Load or generate embeddings based on the corpus
        sanitized_model_name = model_name.replace("\\", "-").replace("/", "-")
        self.cache_filename = f"{bible_version}_{sanitized_model_name}.h5"
        self.emb_cache_filepath = os.path.join(
            embeddings_cache_dir, self.cache_filename
        )

        # Load the embeddings if a cache file exists
        try:
            with h5py.File(self.emb_cache_filepath, "r") as h:
                self.embeddings = np.array(h["embeddings"])
        except Exception:
            traceback.print_exc()
            # If it doesn't, generate embeddings and save them to a file
            logger.info(
                f"Generating embeddings and saving to {self.emb_cache_filepath}"
            )
            self.embeddings = self.model.encode(texts)
            with h5py.File(self.emb_cache_filepath, "w") as f:
                f.create_dataset("embeddings", data=self.embeddings)

        # Create a look-up dict to quickly retrieve embeddings of texts
        self.text_emb_dict = {}
        for text, embedding in zip(texts, self.embeddings):
            self.text_emb_dict[text] = embedding

        logger.info(
            f"Successfully loaded {model_name} embeddings for {bible_version} from {self.emb_cache_filepath}."
        )

    def get_embeddings(self, texts):
        embeddings = []
        for text in texts:
            if text not in self.text_emb_dict:
                self.text_emb_dict[text] = self.model.encode([text])[0]
            embeddings.append(self.text_emb_dict[text])
        return embeddings

    def __str__(self):
        return self.emb_cache_filepath


def score_semantic_similarity(query, texts_df):
    """Returns a copy of texts_df with semantic similarity scores."""
    pass  # TODO: not yet implemented
```
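A minimal usage sketch, assuming the same model and cache directory as `app.py` (the toy corpus keeps the first-time encoding fast; in the real app the full verse list is passed in):

```python
# Hypothetical sketch; the cache lands in data/NIV_msmarco-distilbert-base-v4.h5.
from src.embeddings import EmbeddingsManager

manager = EmbeddingsManager(
    model_name="msmarco-distilbert-base-v4",
    bible_version="NIV",
    texts=["Love your neighbor as yourself.", "Rejoice in the Lord always."],
    embeddings_cache_dir="data",
)
query_emb = manager.get_embeddings(["how should I treat others?"])[0]
print(query_emb.shape)  # (768,) for this model
```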
src/models.py (ADDED, 36 lines)
```python
import urllib.parse

import pandas as pd
from pydantic import BaseModel


class Chapter(BaseModel):
    book_name: str
    chapter_num: int
    verses_df: pd.DataFrame
    highlight_verses_df: pd.DataFrame

    class Config:
        arbitrary_types_allowed = True

    def __str__(self) -> str:
        return f"{self.book_name} {self.chapter_num}"

    def get_formatted_text(self):

        # Construct the chapter text, highlighting the matched verses
        texts = []
        for _, row in self.verses_df.iterrows():
            text = row["text"]
            if text in self.highlight_verses_df["text"].tolist():
                text = f"**:green[{text}]**"
            text = f"<sup>{row['verse']}</sup> {text}"
            texts.append(text)
        chapter_text = " ".join(texts)
        return chapter_text

    def get_biblegateway_url(self, version="NIV"):
        return f"https://www.biblegateway.com/passage/?search={urllib.parse.quote(self.book_name)}+{self.chapter_num}&version={version}"

    def get_num_unique_highlight_verse(self):
        return len(self.highlight_verses_df.drop_duplicates(subset="text"))
```
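A toy instantiation of the model (hypothetical; in the app the two DataFrames come from `bible_loader` and the retriever):

```python
# Hypothetical toy Chapter; only the columns used by the methods are provided.
import pandas as pd
from src.models import Chapter

verses = pd.DataFrame(
    {"verse": [1, 2], "text": ["In the beginning...", "Now the earth was formless..."]}
)
chapter = Chapter(
    book_name="Genesis",
    chapter_num=1,
    verses_df=verses,
    highlight_verses_df=verses.iloc[[0]],  # treat verse 1 as the matched verse
)
print(chapter)                         # Genesis 1
print(chapter.get_biblegateway_url())  # ...?search=Genesis+1&version=NIV
print(chapter.get_formatted_text())    # verse 1 wrapped in **:green[...]**
```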
src/reranker.py (ADDED, 69 lines)
```python
from typing import List

import numpy as np
import streamlit as st

from src.models import Chapter


class Reranker:
    def rerank(self, chapters: List[Chapter]) -> List[Chapter]:
        # TODO
        return chapters


# Rerankers applicable to SemanticRetriever results


def sort_chapters(chapters, scores):
    reranked_chapters = sorted(zip(chapters, scores), key=lambda x: x[1], reverse=True)
    reranked_chapters = [x[0] for x in reranked_chapters]
    return reranked_chapters


class CombinedScoreAndNumberReranker(Reranker):
    def __init__(self, num_verse_weight=0.3, semantic_sim_weight=0.7):
        self.num_verse_weight = num_verse_weight
        self.semantic_sim_weight = semantic_sim_weight

    def rerank(self, chapters: List[Chapter]) -> List[Chapter]:
        num_verse_score = compute_num_verse_scores(chapters)
        max_sem_sim_score = compute_sem_sim_scores(chapters)

        final_scores = (
            self.num_verse_weight * num_verse_score
            + self.semantic_sim_weight * max_sem_sim_score
        )
        return sort_chapters(chapters, final_scores)


class SemanticSimScoreReranker(Reranker):
    def rerank(self, chapters: List[Chapter]) -> List[Chapter]:
        sem_sim_scores = np.array(
            [chapter.highlight_verses_df["score"].max() for chapter in chapters]
        )
        return sort_chapters(chapters, sem_sim_scores)


class MaxVerseReranker(Reranker):
    def rerank(self, chapters: List[Chapter]) -> List[Chapter]:

        num_verses = [chapter.get_num_unique_highlight_verse() for chapter in chapters]

        return sort_chapters(chapters, num_verses)


def compute_num_verse_scores(chapters):
    num_verses = np.array(
        [chapter.get_num_unique_highlight_verse() for chapter in chapters]
    )
    max_verses = max(num_verses)
    num_verse_scores = num_verses / max_verses
    return num_verse_scores


def compute_sem_sim_scores(chapters):
    sem_sim_scores = np.array(
        [chapter.highlight_verses_df["score"].max() for chapter in chapters]
    )
    return sem_sim_scores
```
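For intuition, a worked example of the blend in `CombinedScoreAndNumberReranker` with its default weights (the numbers are made up):

```python
# Made-up scores illustrating the default 0.3 / 0.7 blend.
num_verse_score = 2 / 4  # chapter has 2 highlighted verses; the best chapter has 4
max_sem_sim = 0.82       # highest per-verse similarity within the chapter
final = 0.3 * num_verse_score + 0.7 * max_sem_sim
print(round(final, 3))   # 0.724
```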
src/retriever.py (ADDED, 162 lines)
```python
import abc
from typing import List

import numpy as np
import pandas as pd
import sklearn.metrics.pairwise
import streamlit as st
from sentence_transformers.cross_encoder import CrossEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sparse_dot_topn import awesome_cossim_topn

from src.models import Chapter


class Retriever:
    @abc.abstractmethod
    def retrieve(self, query, n=10) -> List[Chapter]:
        pass


class SemanticRetriever(Retriever):
    def __init__(
        self,
        bible_df,
        embeddings_manager,
        threshold=0.4,
        cross_encoder_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
    ):
        self.bible_df = bible_df
        self.embeddings_manager = embeddings_manager
        self.threshold = threshold
        self.cross_encoder_model = (
            CrossEncoder(cross_encoder_model) if cross_encoder_model else None
        )

        # Alternative cross-encoder checkpoints:
        # 'cross-encoder/stsb-distilroberta-base'
        # 'cross-encoder/ms-marco-MiniLM-L-12-v2'

    def retrieve(self, query, n=10) -> List[Chapter]:

        verse_candidates_df = self.semantic_search(
            query=query,
            texts=self.bible_df["text"].tolist(),
            embeddings_manager=self.embeddings_manager,
            n=n * 2,
            threshold=self.threshold,
        )

        if len(verse_candidates_df) == 0:
            return []

        if self.cross_encoder_model is not None:
            verse_candidates_df = self.cross_encode(
                query, verse_candidates_df["text"].tolist()
            )

        # TODO: revisit this logic, as some verses can have the exact same text.
        # For now, the workaround is to drop duplicates.
        verse_candidates_df.drop_duplicates(subset="text", inplace=True)

        # Join the verse metadata back in
        verse_candidates_df = pd.merge(
            verse_candidates_df, self.bible_df, how="left", on="text"
        )
        # DEBUG
        # st.write(verse_candidates_df)

        chapter_candidates = self.extract_chapters_from_verses(
            self.bible_df, verse_candidates_df
        )
        return chapter_candidates

    def cross_encode(self, query, texts):
        combinations = [[query, text] for text in texts]
        sim_scores = self.cross_encoder_model.predict(combinations)
        sim_scores = MinMaxScaler().fit_transform(sim_scores.reshape(-1, 1)).flatten()
        reranked_texts_scores = sorted(
            zip(texts, sim_scores), key=lambda x: x[1], reverse=True
        )
        df = pd.DataFrame(reranked_texts_scores, columns=["text", "score"])
        return df

    def semantic_search(self, query, texts, embeddings_manager, n=None, threshold=0):
        embeddings = embeddings_manager.get_embeddings(texts)
        query_embedding = embeddings_manager.get_embeddings([query])
        sim_scores = sklearn.metrics.pairwise.cosine_similarity(
            query_embedding, embeddings
        )[0]

        # Results is a list of tuples: [(text, score)]
        results = sorted(list(zip(texts, sim_scores)), key=lambda x: x[1], reverse=True)

        # Take the top n only if specified
        if n:
            results = results[:n]

        # Apply a threshold to filter out irrelevant results
        if threshold:
            results = [x for x in results if x[1] >= threshold]

        df = pd.DataFrame(results, columns=["text", "score"])

        return df

    def extract_chapters_from_verses(self, bible_df, verse_results_df) -> List[Chapter]:
        # The simple, naive assumption for now is to just follow the order of first appearance,
        # i.e. the per-verse scores dictate the order.
        # TODO: revisit this ranking

        # The goal here is to extract all the unique chapters based on the top verse results
        verse_results_df = verse_results_df.copy()
        verse_results_df["book_chapter"] = (
            verse_results_df["book"] + " " + verse_results_df["chapter"].astype(str)
        )
        unique_chapters = verse_results_df["book_chapter"].unique()

        bible_df = bible_df.copy()
        bible_df["book_chapter"] = (
            bible_df["book"] + " " + bible_df["chapter"].astype(str)
        )

        chapters = []
        for unique_chapter in unique_chapters:
            chapter_verses_df = bible_df[bible_df["book_chapter"] == unique_chapter]
            book = chapter_verses_df["book"].tolist()[0]
            chapter_num = chapter_verses_df["chapter"].tolist()[0]

            # Keep track of the matched verses as highlight verses
            highlight_verses_df = pd.merge(
                chapter_verses_df,
                verse_results_df[["text", "score", "book", "chapter"]],
                how="inner",
                on=["text", "book", "chapter"],
            )

            chapter = Chapter(
                book_name=book,
                chapter_num=chapter_num,
                verses_df=chapter_verses_df,
                highlight_verses_df=highlight_verses_df,
            )

            chapters.append(chapter)

        return chapters


class TfIdfRetriever(Retriever):
    def __init__(self, texts, preprocessors=[]) -> None:
        self.vectorizer = TfidfVectorizer(analyzer="word", stop_words="english")
        self.preprocessors = preprocessors
        # TODO: pre-process the texts
        self.tfidf_vectors = self.vectorizer.fit_transform(texts)
        self.tfidf_vectors_transposed = self.tfidf_vectors.transpose()

    def search(self, query, n=10):
        query_tfidf_vector = self.vectorizer.transform([query])
        results = awesome_cossim_topn(
            query_tfidf_vector, self.tfidf_vectors_transposed, n, 0.01
        )
        return results
```
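Putting it together, a minimal sketch of the same pipeline `app.py` builds in `main()` (hypothetical script, run from the repo root; the first run encodes and caches embeddings for every verse, which takes a while):

```python
# Hypothetical end-to-end run, mirroring app.py's defaults.
from src import bible_loader
from src.embeddings import EmbeddingsManager
from src.reranker import CombinedScoreAndNumberReranker
from src.retriever import SemanticRetriever

bible_df = bible_loader.load_bible("data/key_english.csv", "data/NIV.csv")
embeddings_manager = EmbeddingsManager(
    model_name="msmarco-distilbert-base-v4",
    bible_version="NIV",
    embeddings_cache_dir="data",
    texts=bible_df["text"].tolist(),
)

retriever = SemanticRetriever(bible_df, embeddings_manager)
reranker = CombinedScoreAndNumberReranker()

chapters = reranker.rerank(retriever.retrieve("rejoicing in trials", n=20))
for chapter in chapters[:10]:
    print(chapter)  # "<book> <chapter>" via Chapter.__str__
```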