Spaces: Build error
Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes.
- .DS_Store +0 -0
- .cache/huggingface/.gitignore +1 -0
- .cache/huggingface/download/openbiollm-llama3-8b.Q5_K_M.gguf.lock +0 -0
- .cache/huggingface/download/openbiollm-llama3-8b.Q5_K_M.gguf.metadata +3 -0
- .env +4 -0
- .gitattributes +155 -0
- .github/workflows/update_space.yml +28 -0
- .gitignore +7 -0
- .streamlit/secrets.toml +3 -0
- Data/AC-Aids-for-Dogs_Canine-Periodontal-Disease.pdf +0 -0
- Data/cancer_and_cure__a_critical_analysis.27.pdf +0 -0
- Data/medical_oncology_handbook_june_2020_edition.pdf +0 -0
- DockerFile +20 -0
- MultimodalRAG.ipynb +0 -0
- MultimodalRAGUpdatedVersion.ipynb +0 -0
- README.md +125 -8
- Streaming.py +223 -0
- Streamingnewversion.py +244 -0
- __pycache__/app.cpython-310.pyc +0 -0
- __pycache__/clip_helpers.cpython-310.pyc +0 -0
- __pycache__/combinedmultimodal.cpython-310.pyc +0 -0
- __pycache__/imagebind.cpython-310.pyc +0 -0
- __pycache__/images.cpython-310.pyc +0 -0
- __pycache__/ingest.cpython-310.pyc +0 -0
- app.py +83 -0
- app1.py +119 -0
- combinedmultimodal.py +621 -0
- freeze +0 -0
- images.py +12 -0
- images/architecture.png +0 -0
- images/figure-1-1.jpg +0 -0
- images/figure-1-10.jpg +0 -0
- images/figure-1-11.jpg +0 -0
- images/figure-1-2.jpg +0 -0
- images/figure-1-3.jpg +0 -0
- images/figure-1-4.jpg +0 -0
- images/figure-1-5.jpg +0 -0
- images/figure-1-6.jpg +0 -0
- images/figure-1-7.jpg +0 -0
- images/figure-1-8.jpg +0 -0
- images/figure-1-9.jpg +0 -0
- images/multimodal.png +3 -0
- images1/figure-1-1.jpg +0 -0
- images1/figure-1-10.jpg +0 -0
- images1/figure-1-11.jpg +0 -0
- images1/figure-1-2.jpg +0 -0
- images1/figure-1-3.jpg +0 -0
- images1/figure-1-4.jpg +0 -0
- images1/figure-1-5.jpg +0 -0
- images1/figure-1-6.jpg +0 -0
.DS_Store
ADDED
Binary file (6.15 kB).
.cache/huggingface/.gitignore
ADDED
@@ -0,0 +1 @@
+*
.cache/huggingface/download/openbiollm-llama3-8b.Q5_K_M.gguf.lock
ADDED
File without changes
.cache/huggingface/download/openbiollm-llama3-8b.Q5_K_M.gguf.metadata
ADDED
@@ -0,0 +1,3 @@
+d1248c48f0ade670847d05fb2cb356a75df4db3a
+1753c629bf99c261e8b92498d813f382f811e903cdc0e685a11d1689612b34ce
+1723860909.403446
.env
ADDED
@@ -0,0 +1,4 @@
+QDRANT_URL=https://f1e9a70a-afb9-498d-b66d-cb248e0d5557.us-east4-0.gcp.cloud.qdrant.io:6333
+QDRANT_API_KEY=REXlX_PeDvCoXeS9uKCzC--e3-LQV0lw3_jBTdcLZ7P5_F6EOdwklA
+NVIDIA_API_KEY=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
+GEMINI_API_KEY=AIzaSyCXGnm-n6aF962jeorkjo2IsMCwxDwj4bo
.gitattributes
CHANGED
@@ -33,3 +33,158 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+images/multimodal.png filter=lfs diff=lfs merge=lfs -text
+multimodal.png filter=lfs diff=lfs merge=lfs -text
+myenv/bin/python filter=lfs diff=lfs merge=lfs -text
+myenv/bin/python3 filter=lfs diff=lfs merge=lfs -text
+myenv/bin/python3.10 filter=lfs diff=lfs merge=lfs -text
+myenv/bin/ruff filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/Cython/Compiler/Code.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/PIL/.dylibs/libfreetype.6.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/PIL/.dylibs/libharfbuzz.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/_soundfile_data/libsndfile_x86_64.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/altair/vegalite/v5/schema/__pycache__/channels.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/altair/vegalite/v5/schema/__pycache__/core.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libaom.3.2.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libavcodec.60.31.102.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libavfilter.9.12.100.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libavformat.60.16.100.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libdav1d.7.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libfreetype.6.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libharfbuzz.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libswscale.7.5.100.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libvpx.9.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libx264.164.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libx265.199.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libxml2.2.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cmake/data/bin/ccmake filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cmake/data/bin/cmake filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cmake/data/bin/cpack filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cmake/data/bin/ctest filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cmake/data/doc/cmake/CMake.qch filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cryptography/hazmat/bindings/_rust.abi3.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/avx/ctransformers.dll filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/avx/libctransformers.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/avx/libctransformers.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/avx2/ctransformers.dll filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/avx2/libctransformers.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/avx2/libctransformers.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/basic/ctransformers.dll filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/basic/libctransformers.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/basic/libctransformers.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/cuda/ctransformers.dll filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/cuda/libctransformers.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libSvtAv1Enc.1.8.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libX11.6.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libaom.3.8.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libavcodec.60.31.102.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libavformat.60.16.100.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libcrypto.3.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libdav1d.7.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libgnutls.30.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libjxl.0.9.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libp11-kit.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/librav1e.0.6.6.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libunistring.5.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libvpx.8.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libx264.164.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libx265.199.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/cv2.abi3.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/decord/.dylibs/libavcodec.58.35.100.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/decord/.dylibs/libavfilter.7.40.101.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/decord/.dylibs/libavformat.58.20.100.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/decord/.dylibs/libvpx.8.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/decord/.dylibs/libx264.164.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/decord/libdecord.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/emoji/unicode_codes/__pycache__/data_dict.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/gradio/frpc_darwin_amd64_v0.2 filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/grpc/_cython/cygrpc.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/grpc_tools/_protoc_compiler.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/layoutparser/misc/NotoSerifCJKjp-Regular.otf filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/lib/libllama.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/llama_cpp/libllama.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/lxml/etree.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/lxml/objectify.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/magic/libmagic/magic.mgc filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/minijinja/_lowlevel.abi3.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/numpy/.dylibs/libgfortran.5.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/numpy/.dylibs/libopenblas64_.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/numpy/core/_multiarray_umath.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/numpy/core/_simd.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/onnx/onnx_cpp2py_export.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/onnxruntime/capi/onnxruntime_pybind11_state.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pandas/_libs/algos.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pandas/_libs/groupby.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pandas/_libs/hashtable.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pandas/_libs/interval.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pandas/_libs/join.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pandas/_libs/tslibs/offsets.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pikepdf/.dylibs/libgnutls.30.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pikepdf/.dylibs/libp11-kit.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pikepdf/.dylibs/libqpdf.29.8.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pikepdf/.dylibs/libunistring.5.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pikepdf/_core.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pillow_heif/.dylibs/libaom.3.8.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pillow_heif/.dylibs/libjxl.0.8.2.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pillow_heif/.dylibs/libx265.199.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/_compute.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/_dataset.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/_flight.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/lib.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/libarrow.1601.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/libarrow_acero.1601.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/libarrow_dataset.1601.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/libarrow_flight.1601.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/libarrow_python.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/libarrow_substrait.1601.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/libparquet.1601.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pydantic_core/_pydantic_core.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pydeck/nbextension/static/index.js.map filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pypdf/_codecs/__pycache__/adobe_glyphs.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pypdfium2_raw/libpdfium.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/rapidfuzz/distance/metrics_cpp.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/rapidfuzz/distance/metrics_cpp_avx2.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/rapidfuzz/fuzz_cpp.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/rapidfuzz/fuzz_cpp_avx2.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/safetensors/_safetensors_rust.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/.dylibs/libgfortran.5.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/.dylibs/libopenblas.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/fft/_pocketfft/pypocketfft.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/io/_fast_matrix_market/_fmm_core.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/linalg/_flapack.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/misc/face.dat filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/optimize/_highs/_highs_wrapper.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/sparse/_sparsetools.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/spatial/_qhull.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/special/_ufuncs.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/special/cython_special.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/stats/_unuran/unuran_wrapper.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/sentencepiece/_sentencepiece.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/skimage/filters/rank/generic_cy.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/sklearn/_loss/_loss.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/tiktoken/_tiktoken.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/tokenizers/tokenizers.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/torch/.dylibs/libiomp5.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/torch/bin/protoc filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/torch/bin/protoc-3.13.0.0 filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/torch/lib/libiomp5.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/torch/lib/libtorch_cpu.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/torch/lib/libtorch_python.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/torchaudio/_torchaudio.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/torchaudio/lib/libflashlight-text.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/torchaudio/lib/libtorchaudio.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/torchvision/.dylibs/libc++.1.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/unicorn/lib/libunicorn.2.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/unicorn/lib/libunicorn.a filter=lfs diff=lfs merge=lfs -text
+myenv/share/jupyter/nbextensions/pydeck/index.js.map filter=lfs diff=lfs merge=lfs -text
+openbiollm-llama3-8b.Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+path/to/data/collections/image_data/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
+path/to/data/collections/image_data/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
+path/to/data/collections/medical_img/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
+path/to/data/collections/medical_img/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
+qdrant_data/collections/vector_db/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
+qdrant_data/collections/vector_db/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
+qdrant_storage/collections/medical_img/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
+qdrant_storage/collections/medical_img/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
+qdrant_storage/collections/vector_db/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
+qdrant_storage/collections/vector_db/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
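Entries in this form are what Git LFS writes when a path is tracked. A minimal sketch of adding one such rule by hand, using the quantized model file from this commit as the example pattern:

```bash
# Track the GGUF model with Git LFS so a pointer, not the large binary, is committed
git lfs track "openbiollm-llama3-8b.Q5_K_M.gguf"
git add .gitattributes
```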
.github/workflows/update_space.yml
ADDED
@@ -0,0 +1,28 @@
+name: Run Python script
+
+on:
+  push:
+    branches:
+      - surbhi
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v2
+
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.9'
+
+    - name: Install Gradio
+      run: python -m pip install gradio
+
+    - name: Log in to Hugging Face
+      run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
+
+    - name: Deploy to Spaces
+      run: gradio deploy
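The workflow reads a repository secret named `hf_token`. A minimal sketch of creating it with the GitHub CLI, assuming `gh` is installed and authenticated against this repository; the token value below is a placeholder:

```bash
# Store a Hugging Face access token under the secret name the workflow expects
gh secret set hf_token --body "hf_xxxxxxxxxxxxxxxxx"
```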
.gitignore
ADDED
@@ -0,0 +1,7 @@
+qdrant_data
+myenv
+openbiollm-llama3-8b.Q5_K_M.gguf
+__pycache__
+secrets.toml
+.streamlit/
+.env
.streamlit/secrets.toml
ADDED
@@ -0,0 +1,3 @@
+# .streamlit/secrets.toml
+QDRANT_URL = "https://f1e9a70a-afb9-498d-b66d-cb248e0d5557.us-east4-0.gcp.cloud.qdrant.io:6333"
+QDRANT_API_KEY = "REXlX_PeDvCoXeS9uKCzC--e3-LQV0lw3_jBTdcLZ7P5_F6EOdwklA"
Data/AC-Aids-for-Dogs_Canine-Periodontal-Disease.pdf
ADDED
Binary file (485 kB).
Data/cancer_and_cure__a_critical_analysis.27.pdf
ADDED
Binary file (226 kB).
Data/medical_oncology_handbook_june_2020_edition.pdf
ADDED
Binary file (818 kB).
DockerFile
ADDED
@@ -0,0 +1,20 @@
+# Use the official Python image from the Docker Hub
+FROM python:3.10
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Copy the requirements file into the container at /app
+COPY requirements.txt .
+
+# Install the required libraries
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the application code into the container
+COPY . .
+
+# Expose the port the app runs on
+EXPOSE 8501
+
+# Command to run the application
+CMD ["streamlit", "run", "stream.py", "--server.port=8501", "--server.address=0.0.0.0"]
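A minimal sketch of building and running this image locally. The tag `medical-rag` is an assumption, and the CMD above points at `stream.py`, so the script name may need to be adjusted to match the repository (for example `Streaming.py`):

```bash
# Build from the DockerFile in the repository root (non-default file name, hence -f)
docker build -t medical-rag -f DockerFile .

# Run it, exposing the Streamlit port declared in the image and passing the API keys from .env
docker run --rm -p 8501:8501 --env-file .env medical-rag
```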
MultimodalRAG.ipynb
ADDED
The diff for this file is too large to render.
MultimodalRAGUpdatedVersion.ipynb
ADDED
The diff for this file is too large to render.
README.md
CHANGED
@@ -1,12 +1,129 @@
 ---
-title:
-
-colorFrom: gray
-colorTo: indigo
+title: Medical_RAG
+app_file: combinedmultimodal.py
 sdk: gradio
-sdk_version: 4.
-app_file: app.py
-pinned: false
+sdk_version: 4.41.0
 ---
+# Advancing Text Searching with Advanced Indexing Techniques in Healthcare Applications (In Progress)
 
-
+Welcome to the project repository for advancing text searching with advanced indexing techniques in healthcare applications. This project implements a powerful Retrieval-Augmented Generation (RAG) system using cutting-edge AI technologies, specifically designed to enhance text searching capabilities within the healthcare domain. I have also implemented multimodal text searching for medical documents.
+
+## 🚀 Features of the Text-Based Medical Query System
+
+- **BioLLM 8B**: Advanced language model for generating and processing medical text.
+- **ClinicalBert**: State-of-the-art embedding model for accurate representation of medical texts.
+- **Qdrant**: Self-hosted vector database (Vector DB) for efficient storage and retrieval of embeddings.
+- **Langchain & Llama CPP**: Orchestration frameworks for seamless integration and workflow management.
+
+# Medical Knowledge Base Query System
+
+A multimodal medical information retrieval system combining text and image-based querying for comprehensive medical knowledge access.
+
+## Features of the Multimodal Medical Query System
+[Watch the video on YouTube](https://youtu.be/pNy7RqfRUrc?si=1HQgq54oHT6YoR0B)
+
+### 🧠 Multimodal Medical Information Retrieval
+- Combines text and image-based querying for comprehensive medical knowledge access
+- Uses the Qdrant vector database to store and retrieve both text and image embeddings
+
+### 🔤 Advanced Natural Language Processing
+- Utilizes ClinicalBERT for domain-specific text embeddings
+- Implements NVIDIA's Palmyra-med-70b model for medical language understanding with fast inference times
+
+### 🖼️ Image Analysis Capabilities
+- Incorporates CLIP (Contrastive Language-Image Pre-training) for image feature extraction
+- Generates image summaries using Google's Gemini 1.5 Flash model
+
+### 📄 PDF Processing
+- Extracts text and images from medical PDF documents
+- Implements intelligent chunking strategies for text processing
+
+### 🔍 Vector Search
+- Uses Qdrant for efficient similarity search on both text and image vectors
+- Implements hybrid search combining CLIP-based image similarity and text-based summary similarity
+
+### 🖥️ Interactive User Interface
+- Gradio-based web interface for easy querying and result visualization
+- Displays relevant text responses alongside related medical images
+
+### 🧩 Extensible Architecture
+- Modular design allowing for easy integration of new models or data sources
+- Supports both local and cloud-based model deployment
+
+The high-level architectural framework for this application is shown below:
+![System Architecture Diagram](images/architecture.png)
+
+### ⚡ Performance Optimization
+- Implements batching and multi-threading for efficient processing of large document sets
+- Utilizes GPU acceleration where available
+
+### 🎛️ Customizable Retrieval
+- Adjustable similarity thresholds for image retrieval
+- Configurable number of top-k results for both text and image queries
+
+### 📊 Comprehensive Visualization
+- Displays query results with both textual information and related images
+- Provides a gallery view of all extracted images from the knowledge base
+
+### 🔐 Environment Management
+- Uses a .env file for secure API key management
+- Supports both CPU and GPU environments
+
+### DEMO SCREENSHOT
+![DEMO-SCREENSHOT](images/multimodal.png)
+
+## 🎥 Video Demonstration
+
+Explore the capabilities of our project with our detailed [YouTube video](https://youtu.be/nKCKUcnQ390).
+
+## Installation
+
+To get started with this project, follow these steps:
+
+1. **Install Dependencies**:
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+2. **Set up Qdrant**:
+   - Follow the [Qdrant Installation Guide](https://qdrant.tech/documentation/quick_start/) to install and configure Qdrant.
+
+3. **Configure the Application**:
+   - Ensure configuration files for BioLLM, ClinicalBert, Langchain, and Llama CPP are correctly set up.
+
+4. **Run the Application**:
+   To run the text retrieval application as a FastAPI service:
+   ```bash
+   uvicorn app:app
+   ```
+   To run the text retrieval application through Streamlit:
+   ```bash
+   streamlit run Streaming.py
+   ```
+
+   To run the multimodal application through the Gradio interface:
+   ```bash
+   python combinedmultimodal.py
+   ```
+
+## 💡 Usage
+
+- **Querying the System**: Input medical queries via the application's interface for detailed information retrieval.
+- **Text Generation**: Utilize BioLLM 8B to generate comprehensive medical responses.
+
+## 👥 Contributing
+
+We welcome contributions to enhance this project! Here's how you can contribute:
+
+1. Fork the repository.
+2. Create a new branch (`git checkout -b feature-name`).
+3. Commit your changes (`git commit -am 'Add feature'`).
+4. Push to the branch (`git push origin feature-name`).
+5. Open a Pull Request with detailed information about your changes.
+
+## 📜 License
+
+This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
+
+## 📞 Contact
+
+For questions or suggestions, please open an issue or contact the repository owner at [surbhisharma9099@gmail.com](mailto:surbhisharma9099@gmail.com).
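Before launching any of the entry points above, it can help to confirm that the Qdrant instance is reachable. A minimal check against Qdrant's REST API, using the collection name `vector_db` and the environment variable names that appear elsewhere in this commit:

```bash
# A "status": "ok" response means the collection the apps read from is reachable
curl -s "$QDRANT_URL/collections/vector_db" -H "api-key: $QDRANT_API_KEY"
```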
Streaming.py
ADDED
@@ -0,0 +1,223 @@
+import streamlit as st
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader, PDFMinerLoader
+from langchain_community.vectorstores import Qdrant
+from langchain_community.embeddings import SentenceTransformerEmbeddings
+from langchain_community.retrievers import BM25Retriever
+from qdrant_client import QdrantClient
+from qdrant_client.http.exceptions import ResponseHandlingException
+from glob import glob
+from llama_index.vector_stores.qdrant import QdrantVectorStore
+from langchain.chains import RetrievalQA
+from transformers import AutoTokenizer, AutoModel
+from sentence_transformers import models, SentenceTransformer
+from langchain.embeddings.base import Embeddings
+from qdrant_client.models import VectorParams
+import torch
+import base64
+from langchain_community.llms import LlamaCpp
+from langchain_core.prompts import PromptTemplate
+from huggingface_hub import hf_hub_download
+from tempfile import NamedTemporaryFile
+from langchain.retrievers import EnsembleRetriever
+
+# Set page configuration
+st.set_page_config(layout="wide")
+st.markdown("""
+<meta http-equiv="Content-Security-Policy"
+content="default-src 'self'; object-src 'self'; frame-src 'self' data:;
+script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline';">
+""", unsafe_allow_html=True)
+# Streamlit secrets
+qdrant_url = st.secrets["QDRANT_URL"]
+qdrant_api_key = st.secrets["QDRANT_API_KEY"]
+
+# For debugging only - remove or comment out these lines after verification
+#st.write(f"QDRANT_URL: {qdrant_url}")
+#st.write(f"QDRANT_API_KEY: {qdrant_api_key}")
+
+class ClinicalBertEmbeddings(Embeddings):
+    def __init__(self, model_name: str = "medicalai/ClinicalBERT"):
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModel.from_pretrained(model_name)
+        self.model.eval()
+
+    def embed(self, text: str):
+        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+        embeddings = self.mean_pooling(outputs, inputs['attention_mask'])
+        return embeddings.squeeze().numpy()
+
+    def mean_pooling(self, model_output, attention_mask):
+        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+    def embed_documents(self, texts):
+        return [self.embed(text) for text in texts]
+
+    def embed_query(self, text):
+        return self.embed(text)
+
+@st.cache_resource
+def load_model():
+    model_name = "aaditya/OpenBioLLM-Llama3-8B-GGUF"
+    model_file = "openbiollm-llama3-8b.Q5_K_M.gguf"
+    model_path = hf_hub_download(model_name, filename=model_file, local_dir='./')
+    return LlamaCpp(
+        model_path=model_path,
+        temperature=0.3,
+        n_ctx=2048,
+        top_p=1
+    )
+
+# Initialize embeddings
+@st.cache_resource
+def load_embeddings():
+    return ClinicalBertEmbeddings(model_name="medicalai/ClinicalBERT")
+
+# Initialize database
+@st.cache_resource
+def setup_qdrant():
+    try:
+        if not qdrant_url or not qdrant_api_key:
+            raise ValueError("QDRANT_URL or QDRANT_API_KEY not set in environment variables.")
+
+        # Initialize Qdrant client
+        client = QdrantClient(
+            url=qdrant_url,
+            api_key=qdrant_api_key,
+            port=443,  # Assuming HTTPS should use port 443
+        )
+        st.write("Qdrant client initialized successfully.")
+
+        # Create or recreate collection
+        collection_name = "vector_db"
+        try:
+            collection_info = client.get_collection(collection_name=collection_name)
+            st.write(f"Collection '{collection_name}' already exists.")
+        except ResponseHandlingException:
+            st.write(f"Collection '{collection_name}' does not exist. Creating a new one.")
+            client.recreate_collection(
+                collection_name=collection_name,
+                vectors_config=VectorParams(size=768, distance="Cosine")
+            )
+            st.write(f"Collection '{collection_name}' created successfully.")
+
+        embeddings = load_embeddings()
+        st.write("Embeddings model loaded successfully.")
+
+        return Qdrant(client=client, embeddings=embeddings, collection_name=collection_name)
+
+    except Exception as e:
+        st.error(f"Failed to initialize Qdrant: {e}")
+        return None
+
+# Initialize database
+db = setup_qdrant()
+
+if db is None:
+    st.error("Qdrant setup failed, exiting.")
+else:
+    st.success("Qdrant setup successful.")
+
+# Load models
+llm = load_model()
+embeddings = load_embeddings()
+
+# Define prompt template
+prompt_template = """Use the following pieces of information to answer the user's question.
+If you don't know the answer, just say that you don't know, don't try to make up an answer.
+
+Context: {context}
+Question: {question}
+
+Only return the helpful answer. Answer must be detailed and well explained.
+Helpful answer:
+"""
+prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
+# Define retriever
+
+# Define Streamlit app
+
+def process_answer(query):
+    chain_type_kwargs = {"prompt": prompt}
+    global ensemble_retriever
+    qa = RetrievalQA.from_chain_type(
+        llm=llm,
+        chain_type="stuff",
+        retriever=ensemble_retriever,
+        return_source_documents=True,
+        chain_type_kwargs=chain_type_kwargs,
+        verbose=True
+    )
+    response = qa(query)
+    answer = response['result']
+    source_document = response['source_documents'][0].page_content
+    doc = response['source_documents'][0].metadata['source']
+    return answer, source_document, doc
+
+def display_pdf(file):
+    with open(file, "rb") as f:
+        base64_pdf = base64.b64encode(f.read()).decode('utf-8')
+    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
+    st.markdown(pdf_display, unsafe_allow_html=True)
+
+def main():
+    st.title("PDF Question Answering System")
+
+    uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
+
+    if uploaded_file is not None:
+        # Save uploaded PDF
+        with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+            temp_file.write(uploaded_file.read())
+            temp_file_path = temp_file.name
+
+        # Display PDF
+        st.subheader("PDF Preview")
+        display_pdf(temp_file_path)
+
+        # Load and process PDF
+        loader = PDFMinerLoader(temp_file_path)
+        documents = loader.load()
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+        texts = text_splitter.split_documents(documents)
+
+        # Update the Qdrant database with the new PDF content
+
+        try:
+            db.add_documents(texts)
+            st.success("PDF processed and vector database updated!")
+            global ensemble_retriever
+            # Initialize retriever after documents are added
+            bm25_retriever = BM25Retriever.from_documents(documents=texts)
+            bm25_retriever.k = 3
+            qdrant_retriever = db.as_retriever(search_kwargs={"k":1})
+            # Combine both retrievers using EnsembleRetriever
+            ensemble_retriever = EnsembleRetriever(
+                retrievers=[qdrant_retriever, bm25_retriever],
+                weights=[0.5, 0.5]  # Adjust weights based on desired contribution
+            )
+
+        except Exception as e:
+            st.error(f"Error updating database: {e}")
+
+        st.subheader("Ask a question about the PDF")
+        user_input = st.text_input("Your question:")
+
+        if st.button('Get Response'):
+            if user_input:
+                try:
+                    answer, source_document, doc = process_answer(user_input)
+                    st.write("*Answer:*", answer)
+                    st.write("*Source Document:*", source_document)
+                    st.write("*Document Source:*", doc)
+                except Exception as e:
+                    st.error(f"Error processing query: {e}")
+            else:
+                st.warning("Please enter a query.")
+
+if __name__ == "__main__":
+    main()
Streamingnewversion.py
ADDED
@@ -0,0 +1,244 @@
+import streamlit as st
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader, PDFMinerLoader
+from langchain_community.vectorstores import Qdrant
+from langchain_community.embeddings import SentenceTransformerEmbeddings
+from langchain_community.retrievers import BM25Retriever
+from qdrant_client import QdrantClient
+from qdrant_client.http.exceptions import ResponseHandlingException
+from glob import glob
+from llama_index.vector_stores.qdrant import QdrantVectorStore
+from langchain.chains import RetrievalQA
+from transformers import AutoTokenizer, AutoModel
+from sentence_transformers import models, SentenceTransformer
+from langchain.embeddings.base import Embeddings
+from qdrant_client.models import VectorParams
+import torch
+import base64
+from langchain_community.llms import LlamaCpp
+from langchain_core.prompts import PromptTemplate
+from huggingface_hub import hf_hub_download
+from tempfile import NamedTemporaryFile
+from langchain.retrievers import EnsembleRetriever
+import urllib
+import nltk
+import os
+# Add this at the beginning of your script
+import logging
+logging.basicConfig(level=logging.DEBUG)
+
+
+# Define the path for NLTK data
+nltk_data_path = '/tmp/nltk_data'
+os.makedirs(nltk_data_path, exist_ok=True)
+
+# Set NLTK data path environment variable
+nltk.data.path.append(nltk_data_path)
+
+# Download required NLTK data
+try:
+    nltk.data.find('tokenizers/punkt')
+except LookupError:
+    nltk.download('punkt', download_dir=nltk_data_path)
+
+# Set page configuration
+st.set_page_config(layout="wide")
+st.markdown("""
+<meta http-equiv="Content-Security-Policy"
+content="default-src 'self'; object-src 'self'; frame-src 'self' data:;
+script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline';">
+""", unsafe_allow_html=True)
+# Streamlit secrets
+qdrant_url = st.secrets["QDRANT_URL"]
+qdrant_api_key = st.secrets["QDRANT_API_KEY"]
+
+# For debugging only - remove or comment out these lines after verification
+#st.write(f"QDRANT_URL: {qdrant_url}")
+#st.write(f"QDRANT_API_KEY: {qdrant_api_key}")
+
+class ClinicalBertEmbeddings(Embeddings):
+    def __init__(self, model_name: str = "medicalai/ClinicalBERT"):
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModel.from_pretrained(model_name)
+        self.model.eval()
+
+    def embed(self, text: str):
+        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+        embeddings = self.mean_pooling(outputs, inputs['attention_mask'])
+        return embeddings.squeeze().numpy()
+
+    def mean_pooling(self, model_output, attention_mask):
+        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+    def embed_documents(self, texts):
+        return [self.embed(text) for text in texts]
+
+    def embed_query(self, text):
+        return self.embed(text)
+
+@st.cache_resource
+def load_model():
+    model_name = "aaditya/OpenBioLLM-Llama3-8B-GGUF"
+    model_file = "openbiollm-llama3-8b.Q5_K_M.gguf"
+    model_path = hf_hub_download(model_name, filename=model_file, local_dir='./')
+    return LlamaCpp(
+        model_path=model_path,
+        temperature=0.3,
+        n_ctx=2048,
+        top_p=1
+    )
+
+# Initialize embeddings
+@st.cache_resource
+def load_embeddings():
+    return ClinicalBertEmbeddings(model_name="medicalai/ClinicalBERT")
+
+# Initialize database
+@st.cache_resource
+def setup_qdrant():
+    try:
+        if not qdrant_url or not qdrant_api_key:
+            raise ValueError("QDRANT_URL or QDRANT_API_KEY not set in environment variables.")
+
+        # Initialize Qdrant client
+        client = QdrantClient(
+            url=qdrant_url,
+            api_key=qdrant_api_key,
+            port=443,  # Assuming HTTPS should use port 443
+        )
+        st.write("Qdrant client initialized successfully.")
+
+        # Create or recreate collection
+        collection_name = "vector_db"
+        try:
+            collection_info = client.get_collection(collection_name=collection_name)
+            st.write(f"Collection '{collection_name}' already exists.")
+        except ResponseHandlingException:
+            st.write(f"Collection '{collection_name}' does not exist. Creating a new one.")
+            client.recreate_collection(
+                collection_name=collection_name,
+                vectors_config=VectorParams(size=768, distance="Cosine")
+            )
+            st.write(f"Collection '{collection_name}' created successfully.")
+
+        embeddings = load_embeddings()
+        st.write("Embeddings model loaded successfully.")
+
+        return Qdrant(client=client, embeddings=embeddings, collection_name=collection_name)
+
+    except Exception as e:
+        st.error(f"Failed to initialize Qdrant: {e}")
+        return None
+
+# Initialize database
+db = setup_qdrant()
+
+if db is None:
+    st.error("Qdrant setup failed, exiting.")
+else:
+    st.success("Qdrant setup successful.")
+
+# Load models
+llm = load_model()
+embeddings = load_embeddings()
+
+# Define prompt template
+prompt_template = """Use the following pieces of information to answer the user's question.
+If you don't know the answer, just say that you don't know, don't try to make up an answer.
+
+Context: {context}
+Question: {question}
+
+Only return the helpful answer. Answer must be detailed and well explained.
+Helpful answer:
+"""
+prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
+# Define retriever
+
+# Define Streamlit app
+
+def process_answer(query):
+    chain_type_kwargs = {"prompt": prompt}
+    global ensemble_retriever
+    qa = RetrievalQA.from_chain_type(
+        llm=llm,
+        chain_type="stuff",
+        retriever=ensemble_retriever,
+        return_source_documents=True,
+        chain_type_kwargs=chain_type_kwargs,
+        verbose=True
+    )
+    response = qa(query)
+    answer = response['result']
+    source_document = response['source_documents'][0].page_content
+    doc = response['source_documents'][0].metadata['source']
+    return answer, source_document, doc
+
+def display_pdf(file):
+    with open(file, "rb") as f:
+        base64_pdf = base64.b64encode(f.read()).decode('utf-8')
+    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
+    st.markdown(pdf_display, unsafe_allow_html=True)
+
+def main():
+    st.title("PDF Question Answering System")
+
+    # Displaying File
+    uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
+
+    if uploaded_file is not None:
+        # Save uploaded PDF
+        with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+            temp_file.write(uploaded_file.read())
+            temp_file_path = temp_file.name
+
+        # Display PDF
+        st.subheader("PDF Preview")
+        display_pdf(temp_file_path)
+
+        # Load and process PDF
+        loader = PDFMinerLoader(temp_file_path)
+        documents = loader.load()
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+        texts = text_splitter.split_documents(documents)
+
+        # Update the Qdrant database with the new PDF content
+
+        try:
+            db.add_documents(texts)
+            st.success("PDF processed and vector database updated!")
+            global ensemble_retriever
+            # Initialize retriever after documents are added
+            bm25_retriever = BM25Retriever.from_documents(documents=texts)
+            bm25_retriever.k = 3
+            qdrant_retriever = db.as_retriever(search_kwargs={"k":1})
+            # Combine both retrievers using EnsembleRetriever
+            ensemble_retriever = EnsembleRetriever(
+                retrievers=[qdrant_retriever, bm25_retriever],
+                weights=[0.5, 0.5]  # Adjust weights based on desired contribution
+            )
+
+        except Exception as e:
+            st.error(f"Error updating database: {e}")
+
+        st.subheader("Ask a question about the PDF")
+        user_input = st.text_input("Your question:")
+
+        if st.button('Get Response'):
+            if user_input:
+                try:
+                    answer, source_document, doc = process_answer(user_input)
+                    st.write("*Answer:*", answer)
+                    st.write("*Source Document:*", source_document)
+                    st.write("*Document Source:*", doc)
+                except Exception as e:
+                    st.error(f"Error processing query: {e}")
+            else:
+                st.warning("Please enter a query.")
+
+if __name__ == "__main__":
+    main()
__pycache__/app.cpython-310.pyc
ADDED
Binary file (2.97 kB).
__pycache__/clip_helpers.cpython-310.pyc
ADDED
Binary file (644 Bytes).
__pycache__/combinedmultimodal.cpython-310.pyc
ADDED
Binary file (15.3 kB).
__pycache__/imagebind.cpython-310.pyc
ADDED
Binary file (2.9 kB).
__pycache__/images.cpython-310.pyc
ADDED
Binary file (543 Bytes).
__pycache__/ingest.cpython-310.pyc
ADDED
Binary file (3.68 kB).
app.py
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from langchain import PromptTemplate
from langchain_community.llms import LlamaCpp
from langchain.chains import RetrievalQA
from langchain_community.embeddings import SentenceTransformerEmbeddings
from fastapi import FastAPI, Request, Form, Response
from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates
from fastapi.staticfiles import StaticFiles
from fastapi.encoders import jsonable_encoder
from qdrant_client import QdrantClient
from langchain_community.vectorstores import Qdrant
import os
import json
from huggingface_hub import hf_hub_download
from langchain.retrievers import EnsembleRetriever
from ingest import keyword_retriever

app = FastAPI()

templates = Jinja2Templates(directory="templates")
app.mount("/static", StaticFiles(directory="static"), name="static")

model_name = "aaditya/OpenBioLLM-Llama3-8B-GGUF"
model_file = "openbiollm-llama3-8b.Q5_K_M.gguf"

model_path = hf_hub_download(model_name, filename=model_file, local_dir='./')

local_llm = "openbiollm-llama3-8b.Q5_K_M.gguf"

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path=local_llm,
    temperature=0.3,
    # max_tokens=2048,
    n_ctx=2048,
    top_p=1
)

print("LLM Initialized....")

prompt_template = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer. Answer must be detailed and well explained.
Helpful answer:
"""

embeddings = SentenceTransformerEmbeddings(model_name="medicalai/ClinicalBERT")

url = "http://localhost:6333"

client = QdrantClient(url=url, prefer_grpc=False)

db = Qdrant(client=client, embeddings=embeddings, collection_name="vector_db")

prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])

retriever = db.as_retriever(search_kwargs={"k": 1})
ensemble_retriever = EnsembleRetriever(retrievers=[retriever, keyword_retriever],
                                       weights=[0.5, 0.5])


@app.get("/", response_class=HTMLResponse)
async def read_root(request: Request):
    return templates.TemplateResponse("index.html", {"request": request})


@app.post("/get_response")
async def get_response(query: str = Form(...)):
    chain_type_kwargs = {"prompt": prompt}
    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=ensemble_retriever,
                                     return_source_documents=True, chain_type_kwargs=chain_type_kwargs,
                                     verbose=True)
    response = qa(query)
    print(response)
    answer = response['result']
    source_document = response['source_documents'][0].page_content
    doc = response['source_documents'][0].metadata['source']
    response_data = jsonable_encoder(json.dumps({"answer": answer, "source_document": source_document, "doc": doc}))

    res = Response(response_data)
    return res
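app.py imports keyword_retriever from ingest.py, which is not part of this diff. A minimal sketch of what such a keyword retriever could look like with LangChain's BM25 support is shown below; the loader path and chunk sizes mirror the ones used elsewhere in this repo, but the actual ingest.py may differ.

# Hypothetical sketch of ingest.keyword_retriever (assumes rank_bm25 is installed; not the actual ingest.py)
from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.retrievers import BM25Retriever

docs = DirectoryLoader("./Data/", glob="**/*.pdf", loader_cls=UnstructuredFileLoader).load()
chunks = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=70).split_documents(docs)

# Sparse keyword retriever that the EnsembleRetriever above mixes with the dense Qdrant retriever.
keyword_retriever = BM25Retriever.from_documents(chunks)
keyword_retriever.k = 1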
app1.py
ADDED
@@ -0,0 +1,119 @@
import streamlit as st
from langchain import PromptTemplate
from langchain_community.llms import LlamaCpp
from langchain.chains import RetrievalQA
from langchain_community.embeddings import SentenceTransformerEmbeddings
from qdrant_client import QdrantClient
from langchain_community.vectorstores import Qdrant
import os
import json
from huggingface_hub import hf_hub_download
from langchain.retrievers import EnsembleRetriever
# from ingest import ClinicalBertEmbeddings, keyword_retriever
from langchain_community.llms import CTransformers
from transformers import AutoTokenizer, AutoModel

# # Initialize Streamlit app
# st.set_page_config(page_title="Document Retrieval App", layout='wide')

# # Download and initialize LLM model
# MODEL_PATH = './'

# # Some basic configurations for the model
# config = {
#     "max_new_tokens": 2048,
#     "context_length": 4096,
#     "repetition_penalty": 1.1,
#     "temperature": 0.5,
#     "top_k": 50,
#     "top_p": 0.9,
#     "stream": True,
#     "threads": int(os.cpu_count() / 2)
# }

# # We use Langchain's CTransformers llm class to load our quantized model
# llm = CTransformers(model=MODEL_PATH, config=config)

# # Tokenizer for Mistral-7B-Instruct from HuggingFace
# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
# model_name = "aaditya/OpenBioLLM-Llama3-8B-GGUF"
# model_file = "openbiollm-llama3-8b.Q5_K_M.gguf"
# model_path = hf_hub_download(model_name, filename=model_file, local_dir='./')

# local_llm = "openbiollm-llama3-8b.Q5_K_M.gguf"
# llm = LlamaCpp(
#     model_path=local_llm,
#     temperature=0.3,
#     n_ctx=2048,
#     top_p=1
# )

# st.sidebar.title("Document Retrieval App")

# # Initialize embeddings
# embeddings = ClinicalBertEmbeddings()

# # Qdrant setup for medical_image collection
# url = "http://localhost:6333"
# client_medical = QdrantClient(url=url, prefer_grpc=False)
# db_medical = Qdrant(client=client_medical, embeddings=embeddings, collection_name="medical_image")

# # Qdrant setup for pdf collection
# client_pdf = QdrantClient(url=url, prefer_grpc=False)
# db_pdf = Qdrant(client=client_pdf, embeddings=embeddings, collection_name="pdf")

# # Define retrievers for both collections
# retriever_medical = db_medical.as_retriever(search_kwargs={"k": 1})
# retriever_pdf = db_pdf.as_retriever(search_kwargs={"k": 1})

# # Ensemble retriever combining both retrievers
# ensemble_retriever = EnsembleRetriever(retrievers=[retriever_medical, retriever_pdf], weights=[0.5, 0.5])

# # Prompt template for querying
# prompt_template = """Use the following pieces of information to answer the user's question.
# If you don't know the answer, just say that you don't know, don't try to make up an answer.

# Context: {context}
# Question: {question}

# Only return the helpful answer. Answer must be detailed and well explained.
# Helpful answer:
# """
# prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])

# # Streamlit app layout
# with st.sidebar:
#     query = st.text_area("Enter your query here:")
#     if st.button("Get Response"):
#         st.write("Processing query...")
#         chain_type_kwargs = {"prompt": prompt}
#         qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=ensemble_retriever, return_source_documents=True, chain_type_kwargs=chain_type_kwargs, verbose=True)
#         response = qa(query)

#         # Process response to extract answer, source document, and metadata
#         answer = response['result']
#         source_document = response['source_documents'][0].page_content
#         doc = response['source_documents'][0].metadata['source']

#         # Display response
#         st.subheader("Answer:")
#         st.write(answer)
#         st.subheader("Source Document:")
#         st.write(source_document)
#         st.subheader("Document Metadata:")
#         st.write(doc)

# # Run the app
# if __name__ == '__main__':
#     st.title("Document Retrieval App")
#     st.write("Enter your query in the sidebar and click 'Get Response' to retrieve relevant documents.")

# Define model and prompt template

# Set your Hugging Face API token locally; do not commit a real token to the repo
os.environ['HUGGINGFACE_HUB_TOKEN'] = ''

model_name = "mistralai/Mistral-7B-Instruct-v0.1"
model_file = "mistral-7b-instruct.q4_0.bin"

# Pass the token value from the environment, not the literal variable name
model_path = hf_hub_download(model_name, filename=model_file, local_dir='./',
                             use_auth_token=os.environ['HUGGINGFACE_HUB_TOKEN'])
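The rest of app1.py is commented out, so the downloaded Mistral file is never loaded. A minimal sketch of how it could be wired into the CTransformers wrapper referenced above might look like the following; the config values are taken from the commented block, while the local file name and model_type are assumptions, not verified against the Hub repo.

# Hypothetical continuation: load the quantized Mistral file with CTransformers and ask one question.
from langchain_community.llms import CTransformers

ct_config = {
    "max_new_tokens": 512,
    "context_length": 4096,
    "temperature": 0.5,
    "top_k": 50,
    "top_p": 0.9,
}

mistral_llm = CTransformers(
    model="./mistral-7b-instruct.q4_0.bin",  # path assumed to be written by hf_hub_download above
    model_type="mistral",
    config=ct_config,
)

print(mistral_llm.invoke("In one sentence, what is canine periodontal disease?"))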
combinedmultimodal.py
ADDED
@@ -0,0 +1,621 @@
import os
import uuid
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex, StorageContext
import qdrant_client
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
import clip
from llama_index.core import Document
from langchain_community.llms import LlamaCpp
import numpy as np
from huggingface_hub import hf_hub_download
from llama_index.core import (
    ServiceContext,
    SimpleDirectoryReader,
)
import threading
from dotenv import load_dotenv
from llama_index.llms.nvidia import NVIDIA
from open_clip import create_model_from_pretrained, get_tokenizer
from llama_index.core import Settings
from llama_index.core.vector_stores import VectorStoreQuery
from llama_index.core.query_engine import RetrieverQueryEngine
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from langchain.embeddings.base import Embeddings
from llama_index.embeddings.langchain import LangchainEmbedding
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from transformers import AutoProcessor
import hashlib
import gradio as gr
import open_clip
from llama_index.core.schema import ImageDocument
import cv2
import matplotlib.pyplot as plt
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from unstructured.partition.pdf import partition_pdf
from pathlib import Path
from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader
from PIL import Image
import logging
import concurrent.futures
from llama_index.core import set_global_service_context
from llama_index.core import Document as LlamaIndexDocument
import getpass
from transformers import VisionEncoderDecoderModel, ViTImageProcessor
from sentence_transformers import util
from transformers import AutoModelForCausalLM
import base64
from google.generativeai import GenerativeModel, configure
import google.generativeai as genai

# Configure logging
# logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')


class MetadataMode:
    EMBED = "embed"
    INLINE = "inline"
    NONE = "none"


# Define the vectors configuration
vectors_config = {
    "vector_size": 768,  # or whatever the dimensionality of your vectors is
    "distance": "Cosine"  # can be "Cosine", "Euclidean", etc.
}


class ClinicalBertEmbeddingWrapper:
    def __init__(self, model_name: str = "medicalai/ClinicalBERT"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()

    def embed(self, text: str):
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        embeddings = self.mean_pooling(outputs, inputs['attention_mask'])
        return embeddings.squeeze().tolist()

    def mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def embed_documents(self, texts):
        return [self.embed(text) for text in texts]

    def embed_query(self, text):
        return self.embed(text)

    # Batch helper expected by llama_index when indexing documents
    def get_text_embedding_batch(self, text_batch, show_progress=False):
        embeddings = []
        num_batches = len(text_batch)

        # Process in batches of size 8
        batch_size = 8
        for i in tqdm(range(0, num_batches, batch_size), desc="Processing Batches", disable=not show_progress):
            batch_texts = text_batch[i:i + batch_size]
            batch_embeddings = self.embed_documents(batch_texts)
            embeddings.extend(batch_embeddings)

        return embeddings

    def get_agg_embedding_from_queries(self, queries):
        # Get embeddings for each query using the embed method
        embeddings = [torch.tensor(self.embed(query)) for query in queries]

        # Convert list of tensors to a single tensor for aggregation
        embeddings_tensor = torch.stack(embeddings)

        # Aggregate by averaging the query embeddings
        agg_embedding = embeddings_tensor.mean(dim=0)

        return agg_embedding.tolist()


# Load environment variables
load_dotenv()
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
nvidia_api_key = os.getenv("NVIDIA_API_KEY")
if not nvidia_api_key:
    raise ValueError("NVIDIA_API_KEY not found in .env file")

os.environ["NVIDIA_API_KEY"] = nvidia_api_key

model_name = "aaditya/OpenBioLLM-Llama3-8B-GGUF"
model_file = "openbiollm-llama3-8b.Q5_K_M.gguf"
QDRANT_URL = "https://f1e9a70a-afb9-498d-b66d-cb248e0d5557.us-east4-0.gcp.cloud.qdrant.io:6333"
QDRANT_API_KEY = "REXlX_PeDvCoXeS9uKCzC--e3-LQV0lw3_jBTdcLZ7P5_F6EOdwklA"

# Download the local GGUF model and set up the hosted NVIDIA model
model_path = hf_hub_download(model_name, filename=model_file, local_dir='./')
llm = NVIDIA(model="writer/palmyra-med-70b")
local_llm = "openbiollm-llama3-8b.Q5_K_M.gguf"

# Initialize the ClinicalBERT embeddings model
# text_embed_model = ClinicalBertEmbeddings(model_name="medicalai/ClinicalBERT")
text_embed_model = ClinicalBertEmbeddingWrapper(model_name="medicalai/ClinicalBERT")

# Initially OpenBioLLM was used for text answers as well, but for faster responses at inference
# time the hosted model above is used; the local model below also works fine.
llm1 = LlamaCpp(
    model_path=local_llm,
    temperature=0.3,
    n_ctx=2048,
    top_p=1
)
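Before the wrapper is registered on Settings below, it can be sanity-checked on its own. This is a hypothetical snippet, not part of the original script; the 768 figure is ClinicalBERT's hidden size and matches vectors_config above.

# Hypothetical sanity check of ClinicalBertEmbeddingWrapper (illustrative only).
sample_vec = text_embed_model.embed_query("canine periodontal disease")
print(len(sample_vec))  # expected: 768, ClinicalBERT's hidden size

batch_vecs = text_embed_model.get_text_embedding_batch(
    ["gingivitis", "oral radiograph", "tooth extraction"], show_progress=False
)
print(len(batch_vecs), len(batch_vecs[0]))  # 3 texts, 768 dimensions each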
Settings.llm = llm
Settings.embed_model = text_embed_model

# Define ServiceContext with ClinicalBERT embeddings for text
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=text_embed_model  # Use the ClinicalBERT embeddings model
)
set_global_service_context(service_context)

# Logging/debugging helpers, kept for reference
# logging.debug(f"LLM: {service_context.llm}")
# logging.debug(f"Embed Model: {service_context.embed_model}")
# logging.debug(f"Node Parser: {service_context.node_parser}")
# logging.debug(f"Prompt Helper: {service_context.prompt_helper}")

# Create the Qdrant client that backs the text and image collections (Qdrant Cloud, not in-memory)
try:
    text_client = qdrant_client.QdrantClient(
        url=QDRANT_URL,
        api_key=QDRANT_API_KEY,
        port=443,
    )
    print("Qdrant client initialized successfully.")
except Exception as e:
    print(f"Error initializing Qdrant client: {e}")
    raise

# Load text documents from the Data directory
# text_documents = SimpleDirectoryReader("./Data").load_data()
loader = DirectoryLoader("./Data/", glob="**/*.pdf", show_progress=True, loader_cls=UnstructuredFileLoader)
documents = loader.load()

# Print document names
for doc in documents:
    print(f"Processing document: {doc.metadata.get('source', 'Unknown')}")

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=70)
texts = text_splitter.split_documents(documents)

print(f"Loaded {len(documents)} documents")
print(f"Split into {len(texts)} chunks")

# Convert langchain documents to llama_index documents
text_documents = [
    LlamaIndexDocument(text=t.page_content, metadata=t.metadata)
    for t in texts
]

# Initialize the Qdrant vector store for text chunks
try:
    text_vector_store = QdrantVectorStore(
        client=text_client, collection_name="pdf_text"
    )
    print("Qdrant vector store initialized successfully.")
except Exception as e:
    print(f"Error initializing Qdrant vector store: {e}")
    raise

# Initialize the Qdrant vector store for image embeddings
try:
    image_vector_store = QdrantVectorStore(
        client=text_client, collection_name="pdf_img"
    )
    print("Qdrant vector store initialized successfully.")
except Exception as e:
    print(f"Error initializing Qdrant vector store: {e}")
    raise

storage_context = StorageContext.from_defaults(vector_store=text_vector_store)

wiki_text_index = VectorStoreIndex.from_documents(
    text_documents,
    # storage_context=storage_context,
    service_context=service_context
)
print(f"VectorStoreIndex created with {len(wiki_text_index.docstore.docs)} documents")

# Define the streaming query engine over the text index
streaming_qe = wiki_text_index.as_query_engine(streaming=True)
print(len(text_documents))
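The streaming engine can be exercised on its own before the image pipeline below is built. A minimal sketch (the question is only an example, not part of the original script):

# Hypothetical standalone check of the streaming text engine.
example_response = streaming_qe.query("What are the stages of canine periodontal disease?")
for token in example_response.response_gen:
    print(token, end="", flush=True)
print()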
model, preprocess = clip.load("ViT-B/32")
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print(
    "Model parameters:",
    f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}",
)
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

pdf_directory = Path("./data")
image_path = Path("./images1")
image_path.mkdir(exist_ok=True, parents=True)

# Dictionary to store image metadata
image_metadata_dict = {}

# Limit the number of images processed per PDF
MAX_IMAGES_PER_PDF = 15

# Counter used while extracting images
image_uuid = 0

# Iterate over each PDF file in the data folder and extract its images
for pdf_file in pdf_directory.glob("*.pdf"):
    images_per_pdf = 0
    print(f"Processing: {pdf_file}")

    # Extract images from the PDF
    try:
        raw_pdf_elements = partition_pdf(
            filename=str(pdf_file),
            extract_images_in_pdf=True,
            infer_table_structure=True,
            chunking_strategy="by_title",
            max_characters=4000,
            new_after_n_chars=3800,
            combine_text_under_n_chars=2000,
            extract_image_block_output_dir=image_path,
        )
    except Exception as e:
        print(f"Error processing {pdf_file}: {e}")
        import traceback
        traceback.print_exc()
        continue


# Function to summarize images with Gemini
def summarize_image(image_path):
    # Load and encode the image
    with open(image_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode('utf-8')

    # Create a GenerativeModel object
    model = GenerativeModel('gemini-1.5-flash')

    # Prepare the prompt
    prompt = """
    You are an expert in analyzing medical images, particularly images related to canine (dog) health.
    Please provide a detailed description of this medical image, including:
    1. The body part or area being examined
    2. Any visible structures, organs, or tissues
    3. Any abnormalities, lesions, or notable features
    4. Any other relevant medical diagram description

    Please be as specific and detailed as possible in your analysis.
    """

    # Generate the response
    response = model.generate_content([
        prompt,
        {"mime_type": "image/jpeg", "data": encoded_image}
    ])

    return response.text


# Iterate through each extracted image file and build the metadata dictionary
for image_file in os.listdir(image_path):
    if image_file.endswith(('.jpg', '.jpeg', '.png')):
        # Generate a standard UUID for the image
        image_uuid = str(uuid.uuid4())
        image_file_name = image_file
        image_file_path = image_path / image_file

        # Generate image summary
        # image_summary = generate_image_summary_with(str(image_file_path), model, feature_extractor, tokenizer, device)
        # image_summary = generate_summary_with_lm(str(image_file_path), preprocess, model, device, tokenizer, lm_model)
        image_summary = summarize_image(image_file_path)

        # Construct metadata entry for the image
        image_metadata_dict[image_uuid] = {
            "filename": image_file_name,
            "img_path": str(image_file_path),  # Store the path to the image
            "summary": image_summary  # Add the summary to the metadata
        }

        # Limit the number of images processed from the folder
        if len(image_metadata_dict) >= MAX_IMAGES_PER_PDF:
            break

print(f"Number of items in image_dict: {len(image_metadata_dict)}")

# Print the metadata dictionary
for key, value in image_metadata_dict.items():
    print(f"UUID: {key}, Metadata: {value}")


def plot_images_with_opencv(image_metadata_dict):
    original_images_urls = []
    images_shown = 0

    plt.figure(figsize=(16, 16))  # Adjust the figure size as needed

    for image_id in image_metadata_dict:
        img_path = image_metadata_dict[image_id]["img_path"]
        if os.path.isfile(img_path):
            try:
                img = cv2.imread(img_path)
                if img is not None:
                    # Convert BGR (OpenCV) to RGB (matplotlib)
                    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

                    plt.subplot(8, 8, len(original_images_urls) + 1)
                    plt.imshow(img_rgb)
                    plt.xticks([])
                    plt.yticks([])

                    original_images_urls.append(image_metadata_dict[image_id]["filename"])
                    images_shown += 1
                    if images_shown >= 64:
                        break
            except Exception as e:
                print(f"Error processing image {img_path}: {e}")

    plt.tight_layout()
    plt.show()


plot_images_with_opencv(image_metadata_dict)
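For reference, a single extracted figure can be summarized in isolation like this (hypothetical check; the file name is just one of the figures in the images1 folder):

# Hypothetical one-off check of the Gemini summarizer, not part of the original flow.
example_summary = summarize_image("./images1/figure-1-1.jpg")
print(example_summary[:300])  # first few hundred characters of the description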
# Set the device for the CLIP model: CUDA (GPU) if available, otherwise CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)


# Preprocess an image loaded with OpenCV for CLIP
def preprocess_image(img):
    # Convert BGR to RGB
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # Convert the image to a PIL Image and then apply the CLIP preprocess transform
    img_pil = Image.fromarray(img_rgb)
    return preprocess(img_pil)
    # Alternative: use the BiomedCLIP processor for preprocessing
    # return preprocess(images=img_pil, return_tensors="pt")
    # return preprocess(img_pil).unsqueeze(0)


# Compute a CLIP embedding for every extracted image
img_emb_dict = {}
with torch.no_grad():
    for image_id in image_metadata_dict:
        img_file_path = image_metadata_dict[image_id]["img_path"]
        if os.path.isfile(img_file_path):
            try:
                # Load image using OpenCV
                img = cv2.imread(img_file_path)

                if img is not None:
                    # Preprocess the image and add a batch dimension
                    image = preprocess_image(img).unsqueeze(0).to(device)
                    # image = preprocess_image(img).to(device)

                    # Extract image features
                    image_features = model.encode_image(image)

                    # Store image features
                    img_emb_dict[image_id] = image_features
                else:
                    print(f"Failed to load image {img_file_path}")
            except Exception as e:
                print(f"Error processing image {img_file_path}: {e}")

print(len(img_emb_dict))  # one CLIP embedding per successfully processed image
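The retrieval function further down scores images by comparing a CLIP text embedding against these stored image embeddings. The core of that comparison can be sketched in a few lines (illustrative only; cosine similarity over normalized CLIP features, example query text):

# Illustrative sketch of CLIP text-to-image scoring over img_emb_dict, not part of the original script.
with torch.no_grad():
    text_tokens = clip.tokenize(["dental radiograph of a dog"]).to(device)
    text_features = model.encode_text(text_tokens)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    for image_id, image_features in img_emb_dict.items():
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        score = (text_features @ image_features.T).item()
        print(image_metadata_dict[image_id]["filename"], round(score, 4))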
# Create a list of ImageDocument objects, one for each image in the dataset
img_documents = []
for image_filename in image_metadata_dict:
    # the img_emb_dict dictionary contains the image embeddings
    if image_filename in img_emb_dict:
        filename = image_metadata_dict[image_filename]["filename"]
        filepath = image_metadata_dict[image_filename]["img_path"]
        summary = image_metadata_dict[image_filename]["summary"]

        # Create an ImageDocument for each image
        newImgDoc = ImageDocument(
            text=filename,
            metadata={"filepath": filepath, "summary": summary}  # Include the summary in the metadata
        )

        # Set the CLIP image embedding on the ImageDocument
        newImgDoc.embedding = img_emb_dict[image_filename].tolist()[0]
        img_documents.append(newImgDoc)

# Define the storage context for the image collection
storage_context = StorageContext.from_defaults(vector_store=image_vector_store)

# Define the image index
image_index = VectorStoreIndex.from_documents(
    img_documents,
    storage_context=storage_context
)
# for doc in img_documents:
#     print(f"ImageDocument: {doc.text}, Embedding: {doc.embedding}, Metadata: {doc.metadata}")


def retrieve_results_from_image_index(query):
    """Take a text query and return the most similar image from the vector store with its score."""

    # First tokenize the text query and convert it to a tensor
    text = clip.tokenize(query).to(device)

    # Encode the text tensor using the CLIP model to produce a query embedding
    query_embedding = model.encode_text(text).tolist()[0]
    # Encode the query using ClinicalBERT for text similarity against the image summaries
    clinical_query_embedding = text_embed_model.embed_query(query)

    # Create a VectorStoreQuery
    image_vector_store_query = VectorStoreQuery(
        query_embedding=query_embedding,
        similarity_top_k=1,  # returns 1 image
        mode="default",
    )

    # Execute the query against the image vector store
    image_retrieval_results = image_vector_store.query(
        image_vector_store_query
    )
    if image_retrieval_results.nodes:
        best_score = -1
        best_image = None

        for node, clip_score in zip(image_retrieval_results.nodes, image_retrieval_results.similarities):
            image_path = node.metadata["filepath"]
            image_summary = node.metadata.get("summary", "")  # Summaries are stored in metadata

            # Calculate text similarity between the query and the image summary
            summary_embedding = text_embed_model.embed_query(image_summary)
            # text_score = util.cosine_similarity(
            #     [clinical_query_embedding], [summary_embedding]
            # )[0][0]
            # Use util.cos_sim for cosine similarity
            text_score = util.cos_sim(torch.tensor([clinical_query_embedding]),
                                      torch.tensor([summary_embedding]))[0][0].item()

            # Average the CLIP score and the summary similarity score
            avg_score = (clip_score + text_score) / 2

            if avg_score > best_score:
                best_score = avg_score
                best_image = image_path

        return best_image, best_score

    return None, 0.0
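A direct call to this hybrid retriever, independent of the Gradio UI defined further down, would look roughly like this (the query string is taken from the commented smoke tests at the end of the file):

# Hypothetical direct use of the hybrid image retriever.
best_image_path, best_image_score = retrieve_results_from_image_index("What is gingivitis?")
print(best_image_path, round(best_image_score, 4))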
def plot_image_retrieve_results(image_retrieval_results):
    """Take image retrieval results and plot each returned image with its score."""

    plt.figure(figsize=(16, 5))

    img_cnt = 0

    # Iterate over the image retrieval results and display each image and its score in a subplot.
    # The subplot title is the score formatted to four decimal places.
    for returned_image, score in zip(
        image_retrieval_results.nodes, image_retrieval_results.similarities
    ):
        img_name = returned_image.text
        img_path = returned_image.metadata["filepath"]

        # Read image using OpenCV
        image = cv2.imread(img_path)
        # Convert image to RGB format (OpenCV reads in BGR by default)
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        plt.subplot(2, 3, img_cnt + 1)
        plt.title("{:.4f}".format(score))

        plt.imshow(image_rgb)
        plt.xticks([])
        plt.yticks([])
        img_cnt += 1

    plt.tight_layout()
    plt.show()


def get_all_images():
    image_paths = []
    for _, metadata in image_metadata_dict.items():
        image_paths.append(metadata["img_path"])
    return image_paths


def load_image(image_path):
    return Image.open(image_path)


# Define the combined text + image query function
def combined_query(query, similarity_threshold=0.3):
    # Text query
    text_response = streaming_qe.query(query)
    text_result = ""
    for text in text_response.response_gen:
        text_result += text

    # Image query
    top_image_path, similarity_score = retrieve_results_from_image_index(query)

    if similarity_score >= similarity_threshold:
        return text_result, top_image_path, similarity_score
    else:
        return text_result, None, similarity_score


def gradio_interface(query):
    text_result, image_path, similarity_score = combined_query(query)
    top_image = load_image(image_path) if image_path else None
    all_images = [load_image(path) for path in get_all_images()]
    return text_result, top_image, all_images, f"Similarity Score: {similarity_score:.4f}"


with gr.Blocks() as iface:
    gr.Markdown("# Medical Knowledge Base Query System")

    with gr.Row():
        query_input = gr.Textbox(lines=2, placeholder="Enter your medical query here...")
        submit_button = gr.Button("Submit")

    with gr.Row():
        text_output = gr.Textbox(label="Text Response")
        image_output = gr.Image(label="Top Related Image (if similarity > threshold)")

    similarity_score_output = gr.Textbox(label="Similarity Score")

    gallery_output = gr.Gallery(label="All Extracted Images", show_label=True, elem_id="gallery")

    submit_button.click(
        fn=gradio_interface,
        inputs=query_input,
        outputs=[text_output, image_output, gallery_output, similarity_score_output]
    )

    # Load all images on startup
    iface.load(lambda: ["", None, [load_image(path) for path in get_all_images()], ""],
               outputs=[text_output, image_output, gallery_output, similarity_score_output])

# Launch the Gradio interface
iface.launch(share=True)

# Earlier smoke tests, kept for reference
# def image_query(query):
#     image_retrieval_results = retrieve_results_from_image_index(query)
#     plot_image_retrieve_results(image_retrieval_results)

# query1 = "What is gingivitis?"
# # generate image retrieval results
# image_query(query1)

# # def text_query(query):
# #     text_retrieval_results = process_query(query, text_embed_model, k=10)
# #     return text_retrieval_results

# # Function to query the text vector database
# def text_query(query: str, k: int = 10):
#     # Create a VectorStoreIndex from the existing vector store
#     index = VectorStoreIndex.from_vector_store(text_vector_store)
#
#     # Create a retriever with top-k configuration
#     retriever = index.as_retriever(similarity_top_k=k)
#
#     # Create a query engine
#     query_engine = RetrieverQueryEngine.from_args(retriever)
#
#     # Execute the query
#     response = query_engine.query(query)
#
#     return response

# # text_retrieval_results = text_query(query1)
# streaming_response = streaming_qe.query(
#     query1
# )
# streaming_response.print_response_stream()
freeze
ADDED
File without changes
images.py
ADDED
@@ -0,0 +1,12 @@
from unstructured.partition.pdf import partition_pdf

output_path = "./images"

raw_pdf_elements = partition_pdf(
    filename="./Data/AC-Aids-for-Dogs_Canine-Periodontal-Disease.pdf",
    extract_images_in_pdf=True,
    infer_table_structure=True,
    chunking_strategy="by_title",
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    extract_image_block_output_dir=output_path,
)
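The raw_pdf_elements returned above also contain the chunked text and any detected tables, not only the extracted figures written to ./images. A small inspection loop might look like this (illustrative sketch relying on the category attribute exposed by unstructured elements):

# Hypothetical inspection of the partitioned elements, not part of images.py itself.
for element in raw_pdf_elements:
    category = getattr(element, "category", type(element).__name__)
    preview = str(element)[:80].replace("\n", " ")
    print(f"{category}: {preview}")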
images/architecture.png
ADDED
images/figure-1-1.jpg
ADDED
images/figure-1-10.jpg
ADDED
images/figure-1-11.jpg
ADDED
images/figure-1-2.jpg
ADDED
images/figure-1-3.jpg
ADDED
images/figure-1-4.jpg
ADDED
images/figure-1-5.jpg
ADDED
images/figure-1-6.jpg
ADDED
images/figure-1-7.jpg
ADDED
images/figure-1-8.jpg
ADDED
images/figure-1-9.jpg
ADDED
images/multimodal.png
ADDED
Git LFS Details
images1/figure-1-1.jpg
ADDED
images1/figure-1-10.jpg
ADDED
images1/figure-1-11.jpg
ADDED
images1/figure-1-2.jpg
ADDED
images1/figure-1-3.jpg
ADDED
images1/figure-1-4.jpg
ADDED
images1/figure-1-5.jpg
ADDED
images1/figure-1-6.jpg
ADDED