Surbhi123 committed
Commit 64772a4
1 Parent(s): 9e7ce10

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50):
  1. .DS_Store +0 -0
  2. .cache/huggingface/.gitignore +1 -0
  3. .cache/huggingface/download/openbiollm-llama3-8b.Q5_K_M.gguf.lock +0 -0
  4. .cache/huggingface/download/openbiollm-llama3-8b.Q5_K_M.gguf.metadata +3 -0
  5. .env +4 -0
  6. .gitattributes +155 -0
  7. .github/workflows/update_space.yml +28 -0
  8. .gitignore +7 -0
  9. .streamlit/secrets.toml +3 -0
  10. Data/AC-Aids-for-Dogs_Canine-Periodontal-Disease.pdf +0 -0
  11. Data/cancer_and_cure__a_critical_analysis.27.pdf +0 -0
  12. Data/medical_oncology_handbook_june_2020_edition.pdf +0 -0
  13. DockerFile +20 -0
  14. MultimodalRAG.ipynb +0 -0
  15. MultimodalRAGUpdatedVersion.ipynb +0 -0
  16. README.md +125 -8
  17. Streaming.py +223 -0
  18. Streamingnewversion.py +244 -0
  19. __pycache__/app.cpython-310.pyc +0 -0
  20. __pycache__/clip_helpers.cpython-310.pyc +0 -0
  21. __pycache__/combinedmultimodal.cpython-310.pyc +0 -0
  22. __pycache__/imagebind.cpython-310.pyc +0 -0
  23. __pycache__/images.cpython-310.pyc +0 -0
  24. __pycache__/ingest.cpython-310.pyc +0 -0
  25. app.py +83 -0
  26. app1.py +119 -0
  27. combinedmultimodal.py +621 -0
  28. freeze +0 -0
  29. images.py +12 -0
  30. images/architecture.png +0 -0
  31. images/figure-1-1.jpg +0 -0
  32. images/figure-1-10.jpg +0 -0
  33. images/figure-1-11.jpg +0 -0
  34. images/figure-1-2.jpg +0 -0
  35. images/figure-1-3.jpg +0 -0
  36. images/figure-1-4.jpg +0 -0
  37. images/figure-1-5.jpg +0 -0
  38. images/figure-1-6.jpg +0 -0
  39. images/figure-1-7.jpg +0 -0
  40. images/figure-1-8.jpg +0 -0
  41. images/figure-1-9.jpg +0 -0
  42. images/multimodal.png +3 -0
  43. images1/figure-1-1.jpg +0 -0
  44. images1/figure-1-10.jpg +0 -0
  45. images1/figure-1-11.jpg +0 -0
  46. images1/figure-1-2.jpg +0 -0
  47. images1/figure-1-3.jpg +0 -0
  48. images1/figure-1-4.jpg +0 -0
  49. images1/figure-1-5.jpg +0 -0
  50. images1/figure-1-6.jpg +0 -0
.DS_Store ADDED
Binary file (6.15 kB).
 
.cache/huggingface/.gitignore ADDED
@@ -0,0 +1 @@
+ *
.cache/huggingface/download/openbiollm-llama3-8b.Q5_K_M.gguf.lock ADDED
File without changes
.cache/huggingface/download/openbiollm-llama3-8b.Q5_K_M.gguf.metadata ADDED
@@ -0,0 +1,3 @@
+ d1248c48f0ade670847d05fb2cb356a75df4db3a
+ 1753c629bf99c261e8b92498d813f382f811e903cdc0e685a11d1689612b34ce
+ 1723860909.403446
.env ADDED
@@ -0,0 +1,4 @@
+ QDRANT_URL=https://f1e9a70a-afb9-498d-b66d-cb248e0d5557.us-east4-0.gcp.cloud.qdrant.io:6333
+ QDRANT_API_KEY=REXlX_PeDvCoXeS9uKCzC--e3-LQV0lw3_jBTdcLZ7P5_F6EOdwklA
+ NVIDIA_API_KEY=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
+ GEMINI_API_KEY=AIzaSyCXGnm-n6aF962jeorkjo2IsMCwxDwj4bo
.gitattributes CHANGED
@@ -33,3 +33,158 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ images/multimodal.png filter=lfs diff=lfs merge=lfs -text
37
+ multimodal.png filter=lfs diff=lfs merge=lfs -text
38
+ myenv/bin/python filter=lfs diff=lfs merge=lfs -text
39
+ myenv/bin/python3 filter=lfs diff=lfs merge=lfs -text
40
+ myenv/bin/python3.10 filter=lfs diff=lfs merge=lfs -text
41
+ myenv/bin/ruff filter=lfs diff=lfs merge=lfs -text
42
+ myenv/lib/python3.10/site-packages/Cython/Compiler/Code.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
43
+ myenv/lib/python3.10/site-packages/PIL/.dylibs/libfreetype.6.dylib filter=lfs diff=lfs merge=lfs -text
44
+ myenv/lib/python3.10/site-packages/PIL/.dylibs/libharfbuzz.0.dylib filter=lfs diff=lfs merge=lfs -text
45
+ myenv/lib/python3.10/site-packages/_soundfile_data/libsndfile_x86_64.dylib filter=lfs diff=lfs merge=lfs -text
46
+ myenv/lib/python3.10/site-packages/altair/vegalite/v5/schema/__pycache__/channels.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
47
+ myenv/lib/python3.10/site-packages/altair/vegalite/v5/schema/__pycache__/core.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
48
+ myenv/lib/python3.10/site-packages/av/.dylibs/libaom.3.2.0.dylib filter=lfs diff=lfs merge=lfs -text
49
+ myenv/lib/python3.10/site-packages/av/.dylibs/libavcodec.60.31.102.dylib filter=lfs diff=lfs merge=lfs -text
50
+ myenv/lib/python3.10/site-packages/av/.dylibs/libavfilter.9.12.100.dylib filter=lfs diff=lfs merge=lfs -text
51
+ myenv/lib/python3.10/site-packages/av/.dylibs/libavformat.60.16.100.dylib filter=lfs diff=lfs merge=lfs -text
52
+ myenv/lib/python3.10/site-packages/av/.dylibs/libdav1d.7.dylib filter=lfs diff=lfs merge=lfs -text
53
+ myenv/lib/python3.10/site-packages/av/.dylibs/libfreetype.6.dylib filter=lfs diff=lfs merge=lfs -text
54
+ myenv/lib/python3.10/site-packages/av/.dylibs/libharfbuzz.0.dylib filter=lfs diff=lfs merge=lfs -text
55
+ myenv/lib/python3.10/site-packages/av/.dylibs/libswscale.7.5.100.dylib filter=lfs diff=lfs merge=lfs -text
56
+ myenv/lib/python3.10/site-packages/av/.dylibs/libvpx.9.dylib filter=lfs diff=lfs merge=lfs -text
57
+ myenv/lib/python3.10/site-packages/av/.dylibs/libx264.164.dylib filter=lfs diff=lfs merge=lfs -text
58
+ myenv/lib/python3.10/site-packages/av/.dylibs/libx265.199.dylib filter=lfs diff=lfs merge=lfs -text
59
+ myenv/lib/python3.10/site-packages/av/.dylibs/libxml2.2.dylib filter=lfs diff=lfs merge=lfs -text
60
+ myenv/lib/python3.10/site-packages/cmake/data/bin/ccmake filter=lfs diff=lfs merge=lfs -text
61
+ myenv/lib/python3.10/site-packages/cmake/data/bin/cmake filter=lfs diff=lfs merge=lfs -text
62
+ myenv/lib/python3.10/site-packages/cmake/data/bin/cpack filter=lfs diff=lfs merge=lfs -text
63
+ myenv/lib/python3.10/site-packages/cmake/data/bin/ctest filter=lfs diff=lfs merge=lfs -text
64
+ myenv/lib/python3.10/site-packages/cmake/data/doc/cmake/CMake.qch filter=lfs diff=lfs merge=lfs -text
65
+ myenv/lib/python3.10/site-packages/cryptography/hazmat/bindings/_rust.abi3.so filter=lfs diff=lfs merge=lfs -text
66
+ myenv/lib/python3.10/site-packages/ctransformers/lib/avx/ctransformers.dll filter=lfs diff=lfs merge=lfs -text
67
+ myenv/lib/python3.10/site-packages/ctransformers/lib/avx/libctransformers.dylib filter=lfs diff=lfs merge=lfs -text
68
+ myenv/lib/python3.10/site-packages/ctransformers/lib/avx/libctransformers.so filter=lfs diff=lfs merge=lfs -text
69
+ myenv/lib/python3.10/site-packages/ctransformers/lib/avx2/ctransformers.dll filter=lfs diff=lfs merge=lfs -text
70
+ myenv/lib/python3.10/site-packages/ctransformers/lib/avx2/libctransformers.dylib filter=lfs diff=lfs merge=lfs -text
71
+ myenv/lib/python3.10/site-packages/ctransformers/lib/avx2/libctransformers.so filter=lfs diff=lfs merge=lfs -text
72
+ myenv/lib/python3.10/site-packages/ctransformers/lib/basic/ctransformers.dll filter=lfs diff=lfs merge=lfs -text
73
+ myenv/lib/python3.10/site-packages/ctransformers/lib/basic/libctransformers.dylib filter=lfs diff=lfs merge=lfs -text
74
+ myenv/lib/python3.10/site-packages/ctransformers/lib/basic/libctransformers.so filter=lfs diff=lfs merge=lfs -text
75
+ myenv/lib/python3.10/site-packages/ctransformers/lib/cuda/ctransformers.dll filter=lfs diff=lfs merge=lfs -text
76
+ myenv/lib/python3.10/site-packages/ctransformers/lib/cuda/libctransformers.so filter=lfs diff=lfs merge=lfs -text
77
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libSvtAv1Enc.1.8.0.dylib filter=lfs diff=lfs merge=lfs -text
78
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libX11.6.dylib filter=lfs diff=lfs merge=lfs -text
79
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libaom.3.8.0.dylib filter=lfs diff=lfs merge=lfs -text
80
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libavcodec.60.31.102.dylib filter=lfs diff=lfs merge=lfs -text
81
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libavformat.60.16.100.dylib filter=lfs diff=lfs merge=lfs -text
82
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libcrypto.3.dylib filter=lfs diff=lfs merge=lfs -text
83
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libdav1d.7.dylib filter=lfs diff=lfs merge=lfs -text
84
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libgnutls.30.dylib filter=lfs diff=lfs merge=lfs -text
85
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libjxl.0.9.0.dylib filter=lfs diff=lfs merge=lfs -text
86
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libp11-kit.0.dylib filter=lfs diff=lfs merge=lfs -text
87
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/librav1e.0.6.6.dylib filter=lfs diff=lfs merge=lfs -text
88
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libunistring.5.dylib filter=lfs diff=lfs merge=lfs -text
89
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libvpx.8.dylib filter=lfs diff=lfs merge=lfs -text
90
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libx264.164.dylib filter=lfs diff=lfs merge=lfs -text
91
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libx265.199.dylib filter=lfs diff=lfs merge=lfs -text
92
+ myenv/lib/python3.10/site-packages/cv2/cv2.abi3.so filter=lfs diff=lfs merge=lfs -text
93
+ myenv/lib/python3.10/site-packages/decord/.dylibs/libavcodec.58.35.100.dylib filter=lfs diff=lfs merge=lfs -text
94
+ myenv/lib/python3.10/site-packages/decord/.dylibs/libavfilter.7.40.101.dylib filter=lfs diff=lfs merge=lfs -text
95
+ myenv/lib/python3.10/site-packages/decord/.dylibs/libavformat.58.20.100.dylib filter=lfs diff=lfs merge=lfs -text
96
+ myenv/lib/python3.10/site-packages/decord/.dylibs/libvpx.8.dylib filter=lfs diff=lfs merge=lfs -text
97
+ myenv/lib/python3.10/site-packages/decord/.dylibs/libx264.164.dylib filter=lfs diff=lfs merge=lfs -text
98
+ myenv/lib/python3.10/site-packages/decord/libdecord.dylib filter=lfs diff=lfs merge=lfs -text
99
+ myenv/lib/python3.10/site-packages/emoji/unicode_codes/__pycache__/data_dict.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
100
+ myenv/lib/python3.10/site-packages/gradio/frpc_darwin_amd64_v0.2 filter=lfs diff=lfs merge=lfs -text
101
+ myenv/lib/python3.10/site-packages/grpc/_cython/cygrpc.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
102
+ myenv/lib/python3.10/site-packages/grpc_tools/_protoc_compiler.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
103
+ myenv/lib/python3.10/site-packages/layoutparser/misc/NotoSerifCJKjp-Regular.otf filter=lfs diff=lfs merge=lfs -text
104
+ myenv/lib/python3.10/site-packages/lib/libllama.dylib filter=lfs diff=lfs merge=lfs -text
105
+ myenv/lib/python3.10/site-packages/llama_cpp/libllama.dylib filter=lfs diff=lfs merge=lfs -text
106
+ myenv/lib/python3.10/site-packages/lxml/etree.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
107
+ myenv/lib/python3.10/site-packages/lxml/objectify.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
108
+ myenv/lib/python3.10/site-packages/magic/libmagic/magic.mgc filter=lfs diff=lfs merge=lfs -text
109
+ myenv/lib/python3.10/site-packages/minijinja/_lowlevel.abi3.so filter=lfs diff=lfs merge=lfs -text
110
+ myenv/lib/python3.10/site-packages/numpy/.dylibs/libgfortran.5.dylib filter=lfs diff=lfs merge=lfs -text
111
+ myenv/lib/python3.10/site-packages/numpy/.dylibs/libopenblas64_.0.dylib filter=lfs diff=lfs merge=lfs -text
112
+ myenv/lib/python3.10/site-packages/numpy/core/_multiarray_umath.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
113
+ myenv/lib/python3.10/site-packages/numpy/core/_simd.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
114
+ myenv/lib/python3.10/site-packages/onnx/onnx_cpp2py_export.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
115
+ myenv/lib/python3.10/site-packages/onnxruntime/capi/onnxruntime_pybind11_state.so filter=lfs diff=lfs merge=lfs -text
116
+ myenv/lib/python3.10/site-packages/pandas/_libs/algos.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
117
+ myenv/lib/python3.10/site-packages/pandas/_libs/groupby.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
118
+ myenv/lib/python3.10/site-packages/pandas/_libs/hashtable.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
119
+ myenv/lib/python3.10/site-packages/pandas/_libs/interval.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
120
+ myenv/lib/python3.10/site-packages/pandas/_libs/join.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
121
+ myenv/lib/python3.10/site-packages/pandas/_libs/tslibs/offsets.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
122
+ myenv/lib/python3.10/site-packages/pikepdf/.dylibs/libgnutls.30.dylib filter=lfs diff=lfs merge=lfs -text
123
+ myenv/lib/python3.10/site-packages/pikepdf/.dylibs/libp11-kit.0.dylib filter=lfs diff=lfs merge=lfs -text
124
+ myenv/lib/python3.10/site-packages/pikepdf/.dylibs/libqpdf.29.8.0.dylib filter=lfs diff=lfs merge=lfs -text
125
+ myenv/lib/python3.10/site-packages/pikepdf/.dylibs/libunistring.5.dylib filter=lfs diff=lfs merge=lfs -text
126
+ myenv/lib/python3.10/site-packages/pikepdf/_core.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
127
+ myenv/lib/python3.10/site-packages/pillow_heif/.dylibs/libaom.3.8.0.dylib filter=lfs diff=lfs merge=lfs -text
128
+ myenv/lib/python3.10/site-packages/pillow_heif/.dylibs/libjxl.0.8.2.dylib filter=lfs diff=lfs merge=lfs -text
129
+ myenv/lib/python3.10/site-packages/pillow_heif/.dylibs/libx265.199.dylib filter=lfs diff=lfs merge=lfs -text
130
+ myenv/lib/python3.10/site-packages/pyarrow/_compute.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
131
+ myenv/lib/python3.10/site-packages/pyarrow/_dataset.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
132
+ myenv/lib/python3.10/site-packages/pyarrow/_flight.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
133
+ myenv/lib/python3.10/site-packages/pyarrow/lib.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
134
+ myenv/lib/python3.10/site-packages/pyarrow/libarrow.1601.dylib filter=lfs diff=lfs merge=lfs -text
135
+ myenv/lib/python3.10/site-packages/pyarrow/libarrow_acero.1601.dylib filter=lfs diff=lfs merge=lfs -text
136
+ myenv/lib/python3.10/site-packages/pyarrow/libarrow_dataset.1601.dylib filter=lfs diff=lfs merge=lfs -text
137
+ myenv/lib/python3.10/site-packages/pyarrow/libarrow_flight.1601.dylib filter=lfs diff=lfs merge=lfs -text
138
+ myenv/lib/python3.10/site-packages/pyarrow/libarrow_python.dylib filter=lfs diff=lfs merge=lfs -text
139
+ myenv/lib/python3.10/site-packages/pyarrow/libarrow_substrait.1601.dylib filter=lfs diff=lfs merge=lfs -text
140
+ myenv/lib/python3.10/site-packages/pyarrow/libparquet.1601.dylib filter=lfs diff=lfs merge=lfs -text
141
+ myenv/lib/python3.10/site-packages/pydantic_core/_pydantic_core.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
142
+ myenv/lib/python3.10/site-packages/pydeck/nbextension/static/index.js.map filter=lfs diff=lfs merge=lfs -text
143
+ myenv/lib/python3.10/site-packages/pypdf/_codecs/__pycache__/adobe_glyphs.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
144
+ myenv/lib/python3.10/site-packages/pypdfium2_raw/libpdfium.dylib filter=lfs diff=lfs merge=lfs -text
145
+ myenv/lib/python3.10/site-packages/rapidfuzz/distance/metrics_cpp.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
146
+ myenv/lib/python3.10/site-packages/rapidfuzz/distance/metrics_cpp_avx2.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
147
+ myenv/lib/python3.10/site-packages/rapidfuzz/fuzz_cpp.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
148
+ myenv/lib/python3.10/site-packages/rapidfuzz/fuzz_cpp_avx2.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
149
+ myenv/lib/python3.10/site-packages/safetensors/_safetensors_rust.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
150
+ myenv/lib/python3.10/site-packages/scipy/.dylibs/libgfortran.5.dylib filter=lfs diff=lfs merge=lfs -text
151
+ myenv/lib/python3.10/site-packages/scipy/.dylibs/libopenblas.0.dylib filter=lfs diff=lfs merge=lfs -text
152
+ myenv/lib/python3.10/site-packages/scipy/fft/_pocketfft/pypocketfft.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
153
+ myenv/lib/python3.10/site-packages/scipy/io/_fast_matrix_market/_fmm_core.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
154
+ myenv/lib/python3.10/site-packages/scipy/linalg/_flapack.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
155
+ myenv/lib/python3.10/site-packages/scipy/misc/face.dat filter=lfs diff=lfs merge=lfs -text
156
+ myenv/lib/python3.10/site-packages/scipy/optimize/_highs/_highs_wrapper.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
157
+ myenv/lib/python3.10/site-packages/scipy/sparse/_sparsetools.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
158
+ myenv/lib/python3.10/site-packages/scipy/spatial/_qhull.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
159
+ myenv/lib/python3.10/site-packages/scipy/special/_ufuncs.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
160
+ myenv/lib/python3.10/site-packages/scipy/special/cython_special.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
161
+ myenv/lib/python3.10/site-packages/scipy/stats/_unuran/unuran_wrapper.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
162
+ myenv/lib/python3.10/site-packages/sentencepiece/_sentencepiece.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
163
+ myenv/lib/python3.10/site-packages/skimage/filters/rank/generic_cy.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
164
+ myenv/lib/python3.10/site-packages/sklearn/_loss/_loss.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
165
+ myenv/lib/python3.10/site-packages/tiktoken/_tiktoken.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
166
+ myenv/lib/python3.10/site-packages/tokenizers/tokenizers.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
167
+ myenv/lib/python3.10/site-packages/torch/.dylibs/libiomp5.dylib filter=lfs diff=lfs merge=lfs -text
168
+ myenv/lib/python3.10/site-packages/torch/bin/protoc filter=lfs diff=lfs merge=lfs -text
169
+ myenv/lib/python3.10/site-packages/torch/bin/protoc-3.13.0.0 filter=lfs diff=lfs merge=lfs -text
170
+ myenv/lib/python3.10/site-packages/torch/lib/libiomp5.dylib filter=lfs diff=lfs merge=lfs -text
171
+ myenv/lib/python3.10/site-packages/torch/lib/libtorch_cpu.dylib filter=lfs diff=lfs merge=lfs -text
172
+ myenv/lib/python3.10/site-packages/torch/lib/libtorch_python.dylib filter=lfs diff=lfs merge=lfs -text
173
+ myenv/lib/python3.10/site-packages/torchaudio/_torchaudio.so filter=lfs diff=lfs merge=lfs -text
174
+ myenv/lib/python3.10/site-packages/torchaudio/lib/libflashlight-text.so filter=lfs diff=lfs merge=lfs -text
175
+ myenv/lib/python3.10/site-packages/torchaudio/lib/libtorchaudio.so filter=lfs diff=lfs merge=lfs -text
176
+ myenv/lib/python3.10/site-packages/torchvision/.dylibs/libc++.1.0.dylib filter=lfs diff=lfs merge=lfs -text
177
+ myenv/lib/python3.10/site-packages/unicorn/lib/libunicorn.2.dylib filter=lfs diff=lfs merge=lfs -text
178
+ myenv/lib/python3.10/site-packages/unicorn/lib/libunicorn.a filter=lfs diff=lfs merge=lfs -text
179
+ myenv/share/jupyter/nbextensions/pydeck/index.js.map filter=lfs diff=lfs merge=lfs -text
180
+ openbiollm-llama3-8b.Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
181
+ path/to/data/collections/image_data/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
182
+ path/to/data/collections/image_data/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
183
+ path/to/data/collections/medical_img/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
184
+ path/to/data/collections/medical_img/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
185
+ qdrant_data/collections/vector_db/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
186
+ qdrant_data/collections/vector_db/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
187
+ qdrant_storage/collections/medical_img/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
188
+ qdrant_storage/collections/medical_img/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
189
+ qdrant_storage/collections/vector_db/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
190
+ qdrant_storage/collections/vector_db/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
.github/workflows/update_space.yml ADDED
@@ -0,0 +1,28 @@
+ name: Run Python script
+
+ on:
+   push:
+     branches:
+       - surbhi
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+
+     steps:
+     - name: Checkout
+       uses: actions/checkout@v2
+
+     - name: Set up Python
+       uses: actions/setup-python@v2
+       with:
+         python-version: '3.9'
+
+     - name: Install Gradio
+       run: python -m pip install gradio
+
+     - name: Log in to Hugging Face
+       run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
+
+     - name: Deploy to Spaces
+       run: gradio deploy
.gitignore ADDED
@@ -0,0 +1,7 @@
+ qdrant_data
+ myenv
+ openbiollm-llama3-8b.Q5_K_M.gguf
+ __pycache__
+ secrets.toml
+ .streamlit/
+ .env
.streamlit/secrets.toml ADDED
@@ -0,0 +1,3 @@
+ # .streamlit/secrets.toml
+ QDRANT_URL = "https://f1e9a70a-afb9-498d-b66d-cb248e0d5557.us-east4-0.gcp.cloud.qdrant.io:6333"
+ QDRANT_API_KEY = "REXlX_PeDvCoXeS9uKCzC--e3-LQV0lw3_jBTdcLZ7P5_F6EOdwklA"
Data/AC-Aids-for-Dogs_Canine-Periodontal-Disease.pdf ADDED
Binary file (485 kB).
 
Data/cancer_and_cure__a_critical_analysis.27.pdf ADDED
Binary file (226 kB).
 
Data/medical_oncology_handbook_june_2020_edition.pdf ADDED
Binary file (818 kB).
 
DockerFile ADDED
@@ -0,0 +1,20 @@
+ # Use the official Python image from the Docker Hub
+ FROM python:3.10
+
+ # Set the working directory in the container
+ WORKDIR /app
+
+ # Copy the requirements file into the container at /app
+ COPY requirements.txt .
+
+ # Install the required libraries
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the rest of the application code into the container
+ COPY . .
+
+ # Expose the port the app runs on
+ EXPOSE 8501
+
+ # Command to run the application
+ CMD ["streamlit", "run", "stream.py", "--server.port=8501", "--server.address=0.0.0.0"]
MultimodalRAG.ipynb ADDED
The diff for this file is too large to render.
 
MultimodalRAGUpdatedVersion.ipynb ADDED
The diff for this file is too large to render.
 
README.md CHANGED
@@ -1,12 +1,129 @@
  ---
- title: Medical RAG
- emoji: 📉
- colorFrom: gray
- colorTo: indigo
  sdk: gradio
- sdk_version: 4.42.0
- app_file: app.py
- pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
+ title: Medical_RAG
+ app_file: combinedmultimodal.py
  sdk: gradio
+ sdk_version: 4.41.0
  ---
+ # Advancing Text Searching with Advanced Indexing Techniques in Healthcare Applications (In Progress)
+
+ Welcome to the project repository for advancing text searching with advanced indexing techniques in healthcare applications. This project implements a powerful Retrieval-Augmented Generation (RAG) system using cutting-edge AI technologies, specifically designed to enhance text searching capabilities within the healthcare domain. I have also implemented multimodal text searching for medical documents.
+
+ ## 🚀 Features of the Text-Based Medical Query System
+
+ - **BioLLM 8B**: Advanced language model for generating and processing medical text.
+ - **ClinicalBERT**: State-of-the-art embedding model for accurate representation of medical texts.
+ - **Qdrant**: Self-hosted vector database (Vector DB) for efficient storage and retrieval of embeddings.
+ - **Langchain & Llama CPP**: Orchestration frameworks for seamless integration and workflow management.
+
+ # Medical Knowledge Base Query System
+
+ A multimodal medical information retrieval system combining text- and image-based querying for comprehensive medical knowledge access.
+
+ ## Features of the Multimodal Medical Query System
+ [Watch the video on YouTube](https://youtu.be/pNy7RqfRUrc?si=1HQgq54oHT6YoR0B)
+
+ ### 🧠 Multimodal Medical Information Retrieval
+ - Combines text- and image-based querying for comprehensive medical knowledge access
+ - Uses the Qdrant vector database to store and retrieve both text and image embeddings
+
+ ### 🔤 Advanced Natural Language Processing
+ - Utilizes ClinicalBERT for domain-specific text embeddings
+ - Implements NVIDIA's Palmyra-med-70b model for medical language understanding with fast inference times
+
+ ### 🖼️ Image Analysis Capabilities
+ - Incorporates CLIP (Contrastive Language-Image Pre-training) for image feature extraction
+ - Generates image summaries using Google's Gemini 1.5 Flash model
+
+ ### 📄 PDF Processing
+ - Extracts text and images from medical PDF documents
+ - Implements intelligent chunking strategies for text processing
+
+ ### 🔍 Vector Search
+ - Uses Qdrant for efficient similarity search on both text and image vectors
+ - Implements hybrid search combining CLIP-based image similarity and text-based summary similarity (a minimal sketch of this idea follows below)
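
Editor's note (not part of the committed README): a minimal sketch of how such a CLIP + Qdrant image lookup can be wired up, assuming the OpenAI `clip` package and `qdrant-client` that this repository already imports. The ViT-B/32 checkpoint, the collection name `medical_img` (taken from paths elsewhere in this commit), and the threshold value are assumptions, not the project's exact configuration.

```python
import clip
import torch
from PIL import Image
from qdrant_client import QdrantClient

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)  # assumed CLIP variant
client = QdrantClient(url="http://localhost:6333")             # or the cloud URL from .env

def embed_image(path: str) -> list:
    # Encode an extracted figure into a CLIP feature vector for indexing.
    image = preprocess(Image.open(path)).unsqueeze(0).to(device)
    with torch.no_grad():
        feats = clip_model.encode_image(image)
    return feats.squeeze().cpu().numpy().tolist()

def search_images(query: str, top_k: int = 3, threshold: float = 0.25):
    # Encode the text query into the same CLIP space and search the image collection,
    # dropping weak matches via the adjustable score threshold.
    tokens = clip.tokenize([query]).to(device)
    with torch.no_grad():
        query_vec = clip_model.encode_text(tokens).squeeze().cpu().numpy().tolist()
    return client.search(
        collection_name="medical_img",   # image collection (name assumed from this commit)
        query_vector=query_vec,
        limit=top_k,
        score_threshold=threshold,
    )
```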
+
+ ### 🖥️ Interactive User Interface
+ - Gradio-based web interface for easy querying and result visualization
+ - Displays relevant text responses alongside related medical images
+
+ ### 🧩 Extensible Architecture
+ - Modular design allowing for easy integration of new models or data sources
+ - Supports both local and cloud-based model deployment
+
+ The high-level architectural framework for this application is shown below:
+ ![System Architecture Diagram](images/architecture.png)
+
+ ### ⚡ Performance Optimization
+ - Implements batching and multi-threading for efficient processing of large document sets (see the sketch after this list)
+ - Utilizes GPU acceleration where available
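
A rough sketch (not from the repository) of the batching and multi-threading idea, using `concurrent.futures`, which `combinedmultimodal.py` imports; the helper `summarize_image` is hypothetical and stands in for the Gemini-based summarizer:

```python
from concurrent.futures import ThreadPoolExecutor

def summarize_image(path: str) -> str:
    # Placeholder: the real pipeline would send the figure to Gemini 1.5 Flash for a summary.
    return f"summary of {path}"

def summarize_all(image_paths, batch_size: int = 8, max_workers: int = 4):
    # Process extracted figures batch by batch, with a small thread pool per batch.
    summaries = []
    for start in range(0, len(image_paths), batch_size):
        batch = image_paths[start:start + batch_size]
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            summaries.extend(pool.map(summarize_image, batch))
    return summaries
```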
+
+ ### 🎛️ Customizable Retrieval
+ - Adjustable similarity thresholds for image retrieval
+ - Configurable number of top-k results for both text and image queries
+
+ ### 📊 Comprehensive Visualization
+ - Displays query results with both textual information and related images
+ - Provides a gallery view of all extracted images from the knowledge base
+
+ ### 🔐 Environment Management
+ - Uses a .env file for secure API key management
+ - Supports both CPU and GPU environments
+
+ ### DEMO SCREENSHOT
+ ![DEMO-SCREENSHOT](images/multimodal.png)
+
+ ## 🎥 Video Demonstration
+
+ Explore the capabilities of our project with our detailed [YouTube video](https://youtu.be/nKCKUcnQ390).
+
+ ## Installation
+
+ To get started with this project, follow these steps:
+
+ 1. **Install Dependencies**:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ 2. **Set up Qdrant**:
+ - Follow the [Qdrant Installation Guide](https://qdrant.tech/documentation/quick_start/) to install and configure Qdrant.
+
+ 3. **Configure the Application**:
+ - Ensure the configuration files for BioLLM, ClinicalBERT, Langchain, and Llama CPP are correctly set up.
+
+ 4. **Run the Application**:
+ To run the text retrieval application as a FastAPI service:
+ ```bash
+ uvicorn app:app
+ ```
+ To run the text retrieval application through Streamlit:
+ ```bash
+ streamlit run Streaming.py
+ ```
+
+ To run the multimodal application through the Gradio interface:
+ ```bash
+ python combinedmultimodal.py
+ ```
+
+ ## 💡 Usage
+
+ - **Querying the System**: Input medical queries via the application's interface for detailed information retrieval.
+ - **Text Generation**: Utilize BioLLM 8B to generate comprehensive medical responses.
+
+ ## 👥 Contributing
+
+ We welcome contributions to enhance this project! Here's how you can contribute:
+
+ 1. Fork the repository.
+ 2. Create a new branch (`git checkout -b feature-name`).
+ 3. Commit your changes (`git commit -am 'Add feature'`).
+ 4. Push to the branch (`git push origin feature-name`).
+ 5. Open a Pull Request with detailed information about your changes.
+
+ ## 📜 License
+
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
+
+ ## 📞 Contact
+
+ For questions or suggestions, please open an issue or contact the repository owner at [surbhisharma9099@gmail.com](mailto:surbhisharma9099@gmail.com).
Streaming.py ADDED
@@ -0,0 +1,223 @@
1
+ import streamlit as st
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader, PDFMinerLoader
4
+ from langchain_community.vectorstores import Qdrant
5
+ from langchain_community.embeddings import SentenceTransformerEmbeddings
6
+ from langchain_community.retrievers import BM25Retriever
7
+ from qdrant_client import QdrantClient
8
+ from qdrant_client.http.exceptions import ResponseHandlingException
9
+ from glob import glob
10
+ from llama_index.vector_stores.qdrant import QdrantVectorStore
11
+ from langchain.chains import RetrievalQA
12
+ from transformers import AutoTokenizer, AutoModel
13
+ from sentence_transformers import models, SentenceTransformer
14
+ from langchain.embeddings.base import Embeddings
15
+ from qdrant_client.models import VectorParams
16
+ import torch
17
+ import base64
18
+ from langchain_community.llms import LlamaCpp
19
+ from langchain_core.prompts import PromptTemplate
20
+ from huggingface_hub import hf_hub_download
21
+ from tempfile import NamedTemporaryFile
22
+ from langchain.retrievers import EnsembleRetriever
23
+
24
+ # Set page configuration
25
+ st.set_page_config(layout="wide")
26
+ st.markdown("""
27
+ <meta http-equiv="Content-Security-Policy"
28
+ content="default-src 'self'; object-src 'self'; frame-src 'self' data:;
29
+ script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline';">
30
+ """, unsafe_allow_html=True)
31
+ # Streamlit secrets
32
+ qdrant_url = st.secrets["QDRANT_URL"]
33
+ qdrant_api_key = st.secrets["QDRANT_API_KEY"]
34
+
35
+ # For debugging only - remove or comment out these lines after verification
36
+ #st.write(f"QDRANT_URL: {qdrant_url}")
37
+ #st.write(f"QDRANT_API_KEY: {qdrant_api_key}")
38
+
39
+ class ClinicalBertEmbeddings(Embeddings):
40
+ def __init__(self, model_name: str = "medicalai/ClinicalBERT"):
41
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
42
+ self.model = AutoModel.from_pretrained(model_name)
43
+ self.model.eval()
44
+
45
+ def embed(self, text: str):
46
+ inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
47
+ with torch.no_grad():
48
+ outputs = self.model(**inputs)
49
+ embeddings = self.mean_pooling(outputs, inputs['attention_mask'])
50
+ return embeddings.squeeze().numpy()
51
+
52
+ def mean_pooling(self, model_output, attention_mask):
53
+ token_embeddings = model_output[0] # First element of model_output contains all token embeddings
54
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
55
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
56
+
57
+ def embed_documents(self, texts):
58
+ return [self.embed(text) for text in texts]
59
+
60
+ def embed_query(self, text):
61
+ return self.embed(text)
62
+
63
+ @st.cache_resource
64
+ def load_model():
65
+ model_name = "aaditya/OpenBioLLM-Llama3-8B-GGUF"
66
+ model_file = "openbiollm-llama3-8b.Q5_K_M.gguf"
67
+ model_path = hf_hub_download(model_name, filename=model_file, local_dir='./')
68
+ return LlamaCpp(
69
+ model_path=model_path,
70
+ temperature=0.3,
71
+ n_ctx=2048,
72
+ top_p=1
73
+ )
74
+
75
+ # Initialize embeddings
76
+ @st.cache_resource
77
+ def load_embeddings():
78
+ return ClinicalBertEmbeddings(model_name="medicalai/ClinicalBERT")
79
+
80
+ # Initialize database
81
+ @st.cache_resource
82
+ def setup_qdrant():
83
+ try:
84
+ if not qdrant_url or not qdrant_api_key:
85
+ raise ValueError("QDRANT_URL or QDRANT_API_KEY not set in environment variables.")
86
+
87
+ # Initialize Qdrant client
88
+ client = QdrantClient(
89
+ url=qdrant_url,
90
+ api_key=qdrant_api_key,
91
+ port=443, # Assuming HTTPS should use port 443
92
+ )
93
+ st.write("Qdrant client initialized successfully.")
94
+
95
+ # Create or recreate collection
96
+ collection_name = "vector_db"
97
+ try:
98
+ collection_info = client.get_collection(collection_name=collection_name)
99
+ st.write(f"Collection '{collection_name}' already exists.")
100
+ except ResponseHandlingException:
101
+ st.write(f"Collection '{collection_name}' does not exist. Creating a new one.")
102
+ client.recreate_collection(
103
+ collection_name=collection_name,
104
+ vectors_config=VectorParams(size=768, distance="Cosine")
105
+ )
106
+ st.write(f"Collection '{collection_name}' created successfully.")
107
+
108
+ embeddings = load_embeddings()
109
+ st.write("Embeddings model loaded successfully.")
110
+
111
+ return Qdrant(client=client, embeddings=embeddings, collection_name=collection_name)
112
+
113
+ except Exception as e:
114
+ st.error(f"Failed to initialize Qdrant: {e}")
115
+ return None
116
+
117
+ # Initialize database
118
+ db = setup_qdrant()
119
+
120
+ if db is None:
121
+ st.error("Qdrant setup failed, exiting.")
122
+ else:
123
+ st.success("Qdrant setup successful.")
124
+
125
+ # Load models
126
+ llm = load_model()
127
+ embeddings = load_embeddings()
128
+
129
+ # Define prompt template
130
+ prompt_template = """Use the following pieces of information to answer the user's question.
131
+ If you don't know the answer, just say that you don't know, don't try to make up an answer.
132
+
133
+ Context: {context}
134
+ Question: {question}
135
+
136
+ Only return the helpful answer. Answer must be detailed and well explained.
137
+ Helpful answer:
138
+ """
139
+ prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
140
+ # Define retriever
141
+
142
+ # Define Streamlit app
143
+
144
+ def process_answer(query):
145
+ chain_type_kwargs = {"prompt": prompt}
146
+ global ensemble_retriever
147
+ qa = RetrievalQA.from_chain_type(
148
+ llm=llm,
149
+ chain_type="stuff",
150
+ retriever=ensemble_retriever,
151
+ return_source_documents=True,
152
+ chain_type_kwargs=chain_type_kwargs,
153
+ verbose=True
154
+ )
155
+ response = qa(query)
156
+ answer = response['result']
157
+ source_document = response['source_documents'][0].page_content
158
+ doc = response['source_documents'][0].metadata['source']
159
+ return answer, source_document, doc
160
+
161
+ def display_pdf(file):
162
+ with open(file, "rb") as f:
163
+ base64_pdf = base64.b64encode(f.read()).decode('utf-8')
164
+ pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
165
+ st.markdown(pdf_display, unsafe_allow_html=True)
166
+
167
+ def main():
168
+ st.title("PDF Question Answering System")
169
+
170
+ uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
171
+
172
+ if uploaded_file is not None:
173
+ # Save uploaded PDF
174
+ with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
175
+ temp_file.write(uploaded_file.read())
176
+ temp_file_path = temp_file.name
177
+
178
+ # Display PDF
179
+ st.subheader("PDF Preview")
180
+ display_pdf(temp_file_path)
181
+
182
+ # Load and process PDF
183
+ loader = PDFMinerLoader(temp_file_path)
184
+ documents = loader.load()
185
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
186
+ texts = text_splitter.split_documents(documents)
187
+
188
+ # Update the Qdrant database with the new PDF content
189
+
190
+ try:
191
+ db.add_documents(texts)
192
+ st.success("PDF processed and vector database updated!")
193
+ global ensemble_retriever
194
+ # Initialize retriever after documents are added
195
+ bm25_retriever = BM25Retriever.from_documents(documents=texts)
196
+ bm25_retriever.k = 3
197
+ qdrant_retriever = db.as_retriever(search_kwargs={"k":1})
198
+ # Combine both retrievers using EnsembleRetriever
199
+ ensemble_retriever = EnsembleRetriever(
200
+ retrievers=[qdrant_retriever, bm25_retriever],
201
+ weights=[0.5, 0.5] # Adjust weights based on desired contribution
202
+ )
203
+
204
+ except Exception as e:
205
+ st.error(f"Error updating database: {e}")
206
+
207
+ st.subheader("Ask a question about the PDF")
208
+ user_input = st.text_input("Your question:")
209
+
210
+ if st.button('Get Response'):
211
+ if user_input:
212
+ try:
213
+ answer, source_document, doc = process_answer(user_input)
214
+ st.write("*Answer:*", answer)
215
+ st.write("*Source Document:*", source_document)
216
+ st.write("*Document Source:*", doc)
217
+ except Exception as e:
218
+ st.error(f"Error processing query: {e}")
219
+ else:
220
+ st.warning("Please enter a query.")
221
+
222
+ if __name__ == "__main__":
223
+ main()
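
Editor's aside (not part of the commit): the collection above is created with `VectorParams(size=768)` because the mean-pooled ClinicalBERT output used by `ClinicalBertEmbeddings` is a 768-dimensional vector. A quick standalone check of that assumption, reusing the same model name:

```python
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")
model = AutoModel.from_pretrained("medicalai/ClinicalBERT")

inputs = tokenizer("Patient presents with periodontal disease.",
                   return_tensors="pt", padding=True, truncation=True, max_length=512)
with torch.no_grad():
    outputs = model(**inputs)

# Mean-pool the token embeddings with the attention mask, as ClinicalBertEmbeddings.embed() does.
mask = inputs["attention_mask"].unsqueeze(-1).float()
vector = (outputs[0] * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
print(vector.shape)  # expected: torch.Size([1, 768])
```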
Streamingnewversion.py ADDED
@@ -0,0 +1,244 @@
1
+ import streamlit as st
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader, PDFMinerLoader
4
+ from langchain_community.vectorstores import Qdrant
5
+ from langchain_community.embeddings import SentenceTransformerEmbeddings
6
+ from langchain_community.retrievers import BM25Retriever
7
+ from qdrant_client import QdrantClient
8
+ from qdrant_client.http.exceptions import ResponseHandlingException
9
+ from glob import glob
10
+ from llama_index.vector_stores.qdrant import QdrantVectorStore
11
+ from langchain.chains import RetrievalQA
12
+ from transformers import AutoTokenizer, AutoModel
13
+ from sentence_transformers import models, SentenceTransformer
14
+ from langchain.embeddings.base import Embeddings
15
+ from qdrant_client.models import VectorParams
16
+ import torch
17
+ import base64
18
+ from langchain_community.llms import LlamaCpp
19
+ from langchain_core.prompts import PromptTemplate
20
+ from huggingface_hub import hf_hub_download
21
+ from tempfile import NamedTemporaryFile
22
+ from langchain.retrievers import EnsembleRetriever
23
+ import urllib
24
+ import nltk
25
+ import os
26
+ # Add this at the beginning of your script
27
+ import logging
28
+ logging.basicConfig(level=logging.DEBUG)
29
+
30
+
31
+ # Define the path for NLTK data
32
+ nltk_data_path = '/tmp/nltk_data'
33
+ os.makedirs(nltk_data_path, exist_ok=True)
34
+
35
+ # Set NLTK data path environment variable
36
+ nltk.data.path.append(nltk_data_path)
37
+
38
+ # Download required NLTK data
39
+ try:
40
+ nltk.data.find('tokenizers/punkt')
41
+ except LookupError:
42
+ nltk.download('punkt', download_dir=nltk_data_path)
43
+
44
+ # Set page configuration
45
+ st.set_page_config(layout="wide")
46
+ st.markdown("""
47
+ <meta http-equiv="Content-Security-Policy"
48
+ content="default-src 'self'; object-src 'self'; frame-src 'self' data:;
49
+ script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline';">
50
+ """, unsafe_allow_html=True)
51
+ # Streamlit secrets
52
+ qdrant_url = st.secrets["QDRANT_URL"]
53
+ qdrant_api_key = st.secrets["QDRANT_API_KEY"]
54
+
55
+ # For debugging only - remove or comment out these lines after verification
56
+ #st.write(f"QDRANT_URL: {qdrant_url}")
57
+ #st.write(f"QDRANT_API_KEY: {qdrant_api_key}")
58
+
59
+ class ClinicalBertEmbeddings(Embeddings):
60
+ def __init__(self, model_name: str = "medicalai/ClinicalBERT"):
61
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
62
+ self.model = AutoModel.from_pretrained(model_name)
63
+ self.model.eval()
64
+
65
+ def embed(self, text: str):
66
+ inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
67
+ with torch.no_grad():
68
+ outputs = self.model(**inputs)
69
+ embeddings = self.mean_pooling(outputs, inputs['attention_mask'])
70
+ return embeddings.squeeze().numpy()
71
+
72
+ def mean_pooling(self, model_output, attention_mask):
73
+ token_embeddings = model_output[0] # First element of model_output contains all token embeddings
74
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
75
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
76
+
77
+ def embed_documents(self, texts):
78
+ return [self.embed(text) for text in texts]
79
+
80
+ def embed_query(self, text):
81
+ return self.embed(text)
82
+
83
+ @st.cache_resource
84
+ def load_model():
85
+ model_name = "aaditya/OpenBioLLM-Llama3-8B-GGUF"
86
+ model_file = "openbiollm-llama3-8b.Q5_K_M.gguf"
87
+ model_path = hf_hub_download(model_name, filename=model_file, local_dir='./')
88
+ return LlamaCpp(
89
+ model_path=model_path,
90
+ temperature=0.3,
91
+ n_ctx=2048,
92
+ top_p=1
93
+ )
94
+
95
+ # Initialize embeddings
96
+ @st.cache_resource
97
+ def load_embeddings():
98
+ return ClinicalBertEmbeddings(model_name="medicalai/ClinicalBERT")
99
+
100
+ # Initialize database
101
+ @st.cache_resource
102
+ def setup_qdrant():
103
+ try:
104
+ if not qdrant_url or not qdrant_api_key:
105
+ raise ValueError("QDRANT_URL or QDRANT_API_KEY not set in environment variables.")
106
+
107
+ # Initialize Qdrant client
108
+ client = QdrantClient(
109
+ url=qdrant_url,
110
+ api_key=qdrant_api_key,
111
+ port=443, # Assuming HTTPS should use port 443
112
+ )
113
+ st.write("Qdrant client initialized successfully.")
114
+
115
+ # Create or recreate collection
116
+ collection_name = "vector_db"
117
+ try:
118
+ collection_info = client.get_collection(collection_name=collection_name)
119
+ st.write(f"Collection '{collection_name}' already exists.")
120
+ except ResponseHandlingException:
121
+ st.write(f"Collection '{collection_name}' does not exist. Creating a new one.")
122
+ client.recreate_collection(
123
+ collection_name=collection_name,
124
+ vectors_config=VectorParams(size=768, distance="Cosine")
125
+ )
126
+ st.write(f"Collection '{collection_name}' created successfully.")
127
+
128
+ embeddings = load_embeddings()
129
+ st.write("Embeddings model loaded successfully.")
130
+
131
+ return Qdrant(client=client, embeddings=embeddings, collection_name=collection_name)
132
+
133
+ except Exception as e:
134
+ st.error(f"Failed to initialize Qdrant: {e}")
135
+ return None
136
+
137
+ # Initialize database
138
+ db = setup_qdrant()
139
+
140
+ if db is None:
141
+ st.error("Qdrant setup failed, exiting.")
142
+ else:
143
+ st.success("Qdrant setup successful.")
144
+
145
+ # Load models
146
+ llm = load_model()
147
+ embeddings = load_embeddings()
148
+
149
+ # Define prompt template
150
+ prompt_template = """Use the following pieces of information to answer the user's question.
151
+ If you don't know the answer, just say that you don't know, don't try to make up an answer.
152
+
153
+ Context: {context}
154
+ Question: {question}
155
+
156
+ Only return the helpful answer. Answer must be detailed and well explained.
157
+ Helpful answer:
158
+ """
159
+ prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
160
+ # Define retriever
161
+
162
+ # Define Streamlit app
163
+
164
+ def process_answer(query):
165
+ chain_type_kwargs = {"prompt": prompt}
166
+ global ensemble_retriever
167
+ qa = RetrievalQA.from_chain_type(
168
+ llm=llm,
169
+ chain_type="stuff",
170
+ retriever=ensemble_retriever,
171
+ return_source_documents=True,
172
+ chain_type_kwargs=chain_type_kwargs,
173
+ verbose=True
174
+ )
175
+ response = qa(query)
176
+ answer = response['result']
177
+ source_document = response['source_documents'][0].page_content
178
+ doc = response['source_documents'][0].metadata['source']
179
+ return answer, source_document, doc
180
+
181
+ def display_pdf(file):
182
+ with open(file, "rb") as f:
183
+ base64_pdf = base64.b64encode(f.read()).decode('utf-8')
184
+ pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
185
+ st.markdown(pdf_display, unsafe_allow_html=True)
186
+
187
+ def main():
188
+ st.title("PDF Question Answering System")
189
+
190
+ # Displaying File
191
+ uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
192
+
193
+ if uploaded_file is not None:
194
+ # Save uploaded PDF
195
+ with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
196
+ temp_file.write(uploaded_file.read())
197
+ temp_file_path = temp_file.name
198
+
199
+ # Display PDF
200
+ st.subheader("PDF Preview")
201
+ display_pdf(temp_file_path)
202
+
203
+ # Load and process PDF
204
+ loader = PDFMinerLoader(temp_file_path)
205
+ documents = loader.load()
206
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
207
+ texts = text_splitter.split_documents(documents)
208
+
209
+ # Update the Qdrant database with the new PDF content
210
+
211
+ try:
212
+ db.add_documents(texts)
213
+ st.success("PDF processed and vector database updated!")
214
+ global ensemble_retriever
215
+ # Initialize retriever after documents are added
216
+ bm25_retriever = BM25Retriever.from_documents(documents=texts)
217
+ bm25_retriever.k = 3
218
+ qdrant_retriever = db.as_retriever(search_kwargs={"k":1})
219
+ # Combine both retrievers using EnsembleRetriever
220
+ ensemble_retriever = EnsembleRetriever(
221
+ retrievers=[qdrant_retriever, bm25_retriever],
222
+ weights=[0.5, 0.5] # Adjust weights based on desired contribution
223
+ )
224
+
225
+ except Exception as e:
226
+ st.error(f"Error updating database: {e}")
227
+
228
+ st.subheader("Ask a question about the PDF")
229
+ user_input = st.text_input("Your question:")
230
+
231
+ if st.button('Get Response'):
232
+ if user_input:
233
+ try:
234
+ answer, source_document, doc = process_answer(user_input)
235
+ st.write("*Answer:*", answer)
236
+ st.write("*Source Document:*", source_document)
237
+ st.write("*Document Source:*", doc)
238
+ except Exception as e:
239
+ st.error(f"Error processing query: {e}")
240
+ else:
241
+ st.warning("Please enter a query.")
242
+
243
+ if __name__ == "__main__":
244
+ main()
__pycache__/app.cpython-310.pyc ADDED
Binary file (2.97 kB).
 
__pycache__/clip_helpers.cpython-310.pyc ADDED
Binary file (644 Bytes).
 
__pycache__/combinedmultimodal.cpython-310.pyc ADDED
Binary file (15.3 kB).
 
__pycache__/imagebind.cpython-310.pyc ADDED
Binary file (2.9 kB).
 
__pycache__/images.cpython-310.pyc ADDED
Binary file (543 Bytes).
 
__pycache__/ingest.cpython-310.pyc ADDED
Binary file (3.68 kB).
 
app.py ADDED
@@ -0,0 +1,83 @@
+ from langchain import PromptTemplate
+ from langchain_community.llms import LlamaCpp
+ from langchain.chains import RetrievalQA
+ from langchain_community.embeddings import SentenceTransformerEmbeddings
+ from fastapi import FastAPI, Request, Form, Response
+ from fastapi.responses import HTMLResponse
+ from fastapi.templating import Jinja2Templates
+ from fastapi.staticfiles import StaticFiles
+ from fastapi.encoders import jsonable_encoder
+ from qdrant_client import QdrantClient
+ from langchain_community.vectorstores import Qdrant
+ import os
+ import json
+ from huggingface_hub import hf_hub_download
+ from langchain.retrievers import EnsembleRetriever
+ from ingest import keyword_retriever
+
+ app = FastAPI()
+
+ templates = Jinja2Templates(directory="templates")
+ app.mount("/static", StaticFiles(directory="static"), name="static")
+ model_name = "aaditya/OpenBioLLM-Llama3-8B-GGUF"
+ model_file = "openbiollm-llama3-8b.Q5_K_M.gguf"
+
+ model_path = hf_hub_download(model_name,
+                              filename=model_file, local_dir='./')
+
+ local_llm = "openbiollm-llama3-8b.Q5_K_M.gguf"
+
+ # Make sure the model path is correct for your system!
+ llm = LlamaCpp(
+     model_path=local_llm,
+     temperature=0.3,
+     # max_tokens=2048,
+     n_ctx=2048,
+     top_p=1
+ )
+
+ print("LLM Initialized....")
+
+ prompt_template = """Use the following pieces of information to answer the user's question.
+ If you don't know the answer, just say that you don't know, don't try to make up an answer.
+
+ Context: {context}
+ Question: {question}
+
+ Only return the helpful answer. Answer must be detailed and well explained.
+ Helpful answer:
+ """
+
+ embeddings = SentenceTransformerEmbeddings(model_name="medicalai/ClinicalBERT")
+
+ url = "http://localhost:6333"
+
+ client = QdrantClient(
+     url=url, prefer_grpc=False
+ )
+
+ db = Qdrant(client=client, embeddings=embeddings, collection_name="vector_db")
+
+ prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
+
+ retriever = db.as_retriever(search_kwargs={"k":1})
+ ensemble_retriever = EnsembleRetriever(retrievers=[retriever,
+                                                    keyword_retriever],
+                                        weights=[0.5, 0.5])
+ @app.get("/", response_class=HTMLResponse)
+ async def read_root(request: Request):
+     return templates.TemplateResponse("index.html", {"request": request})
+
+ @app.post("/get_response")
+ async def get_response(query: str = Form(...)):
+     chain_type_kwargs = {"prompt": prompt}
+     qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=ensemble_retriever, return_source_documents=True, chain_type_kwargs=chain_type_kwargs, verbose=True)
+     response = qa(query)
+     print(response)
+     answer = response['result']
+     source_document = response['source_documents'][0].page_content
+     doc = response['source_documents'][0].metadata['source']
+     response_data = jsonable_encoder(json.dumps({"answer": answer, "source_document": source_document, "doc": doc}))
+
+     res = Response(response_data)
+     return res
app1.py ADDED
@@ -0,0 +1,119 @@
1
+ import streamlit as st
2
+ from langchain import PromptTemplate
3
+ from langchain_community.llms import LlamaCpp
4
+ from langchain.chains import RetrievalQA
5
+ from langchain_community.embeddings import SentenceTransformerEmbeddings
6
+ from qdrant_client import QdrantClient
7
+ from langchain_community.vectorstores import Qdrant
8
+ import os
9
+ import json
10
+ from huggingface_hub import hf_hub_download
11
+ from langchain.retrievers import EnsembleRetriever
12
+ # from ingest import ClinicalBertEmbeddings, keyword_retriever
13
+ from langchain_community.llms import CTransformers
14
+ from transformers import AutoTokenizer, AutoModel
15
+ # # Initialize Streamlit app
16
+ # st.set_page_config(page_title="Document Retrieval App", layout='wide')
17
+
18
+ # # Download and initialize LLM model
19
+ # MODEL_PATH = './'
20
+
21
+ # # Some basic configurations for the model
22
+ # config = {
23
+ # "max_new_tokens": 2048,
24
+ # "context_length": 4096,
25
+ # "repetition_penalty": 1.1,
26
+ # "temperature": 0.5,
27
+ # "top_k": 50,
28
+ # "top_p": 0.9,
29
+ # "stream": True,
30
+ # "threads": int(os.cpu_count() / 2)
31
+ # }
32
+
33
+ # # We use Langchain's CTransformers llm class to load our quantized model
34
+ # llm = CTransformers(model=MODEL_PATH,
35
+ # config=config)
36
+
37
+ # # Tokenizer for Mistral-7B-Instruct from HuggingFace
38
+ # tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
39
+ # model_name = "aaditya/OpenBioLLM-Llama3-8B-GGUF"
40
+ # model_file = "openbiollm-llama3-8b.Q5_K_M.gguf"
41
+ # model_path = hf_hub_download(model_name, filename=model_file, local_dir='./')
42
+
43
+ # local_llm = "openbiollm-llama3-8b.Q5_K_M.gguf"
44
+ # llm = LlamaCpp(
45
+ # model_path=local_llm,
46
+ # temperature=0.3,
47
+ # n_ctx=2048,
48
+ # top_p=1
49
+ # )
50
+
51
+ # st.sidebar.title("Document Retrieval App")
52
+
53
+ # # Initialize embeddings
54
+ # embeddings = ClinicalBertEmbeddings()
55
+
56
+ # # Qdrant setup for medical_image collection
57
+ # url = "http://localhost:6333"
58
+ # client_medical = QdrantClient(url=url, prefer_grpc=False)
59
+ # db_medical = Qdrant(client=client_medical, embeddings=embeddings, collection_name="medical_image")
60
+
61
+ # # Qdrant setup for pdf collection
62
+ # client_pdf = QdrantClient(url=url, prefer_grpc=False)
63
+ # db_pdf = Qdrant(client=client_pdf, embeddings=embeddings, collection_name="pdf")
64
+
65
+ # # Define retrievers for both collections
66
+ # retriever_medical = db_medical.as_retriever(search_kwargs={"k": 1})
67
+ # retriever_pdf = db_pdf.as_retriever(search_kwargs={"k": 1})
68
+
69
+ # # Ensemble retriever combining both retrievers
70
+ # ensemble_retriever = EnsembleRetriever(retrievers=[retriever_medical, retriever_pdf], weights=[0.5, 0.5])
71
+
72
+ # # Prompt template for querying
73
+ # prompt_template = """Use the following pieces of information to answer the user's question.
74
+ # If you don't know the answer, just say that you don't know, don't try to make up an answer.
75
+
76
+ # Context: {context}
77
+ # Question: {question}
78
+
79
+ # Only return the helpful answer. Answer must be detailed and well explained.
80
+ # Helpful answer:
81
+ # """
82
+ # prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
83
+
84
+ # # Streamlit app layout
85
+ # with st.sidebar:
86
+ # query = st.text_area("Enter your query here:")
87
+ # if st.button("Get Response"):
88
+ # st.write("Processing query...")
89
+ # chain_type_kwargs = {"prompt": prompt}
90
+ # qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=ensemble_retriever, return_source_documents=True, chain_type_kwargs=chain_type_kwargs, verbose=True)
91
+ # response = qa(query)
92
+
93
+ # # Process response to extract answer, source document, and metadata
94
+ # answer = response['result']
95
+ # source_document = response['source_documents'][0].page_content
96
+ # doc = response['source_documents'][0].metadata['source']
97
+
98
+ # # Display response
99
+ # st.subheader("Answer:")
100
+ # st.write(answer)
101
+ # st.subheader("Source Document:")
102
+ # st.write(source_document)
103
+ # st.subheader("Document Metadata:")
104
+ # st.write(doc)
105
+
106
+ # # Run the app
107
+ # if __name__ == '__main__':
108
+ # st.title("Document Retrieval App")
109
+ # st.write("Enter your query in the sidebar and click 'Get Response' to retrieve relevant documents.")
110
+ # Define model and prompt template
111
+
112
+
113
+ # Set your Hugging Face API token
114
+ os.environ['HUGGINGFACE_HUB_TOKEN'] = ''
115
+
116
+ model_name = "mistralai/Mistral-7B-Instruct-v0.1"
117
+ model_file = "mistral-7b-instruct.q4_0.bin"
118
+
+ model_path = hf_hub_download(model_name, filename=model_file, local_dir='./', use_auth_token=os.environ.get('HUGGINGFACE_HUB_TOKEN'))
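A minimal, hypothetical sketch of how the downloaded file could then be loaded, mirroring the commented-out CTransformers setup above (the config values are illustrative, and model_type="mistral" assumes the ctransformers backend):

    import os
    from langchain_community.llms import CTransformers

    # Generation settings in the spirit of the commented-out config above (illustrative values)
    config = {
        "max_new_tokens": 512,
        "temperature": 0.3,
        "stream": True,
        "threads": int(os.cpu_count() / 2),
    }

    # Load the locally downloaded quantized Mistral model and run a quick test prompt
    llm = CTransformers(model=model_path, model_type="mistral", config=config)
    print(llm.invoke("What are the stages of canine periodontal disease?"))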
combinedmultimodal.py ADDED
@@ -0,0 +1,621 @@
+ import os
+ import uuid
+ from llama_index.vector_stores.qdrant import QdrantVectorStore
+ from llama_index.core import VectorStoreIndex, StorageContext
+ import qdrant_client
+ import torch
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ import clip
+ from llama_index.core import Document
+ from langchain_community.llms import LlamaCpp
+ import numpy as np
+ from huggingface_hub import hf_hub_download
+ from llama_index.core import (
+     ServiceContext,
+     SimpleDirectoryReader,
+ )
+ import threading
+ from dotenv import load_dotenv
+ from llama_index.llms.nvidia import NVIDIA
+ from open_clip import create_model_from_pretrained, get_tokenizer
+ from llama_index.core import Settings
+ from llama_index.core.vector_stores import VectorStoreQuery
+ from llama_index.core.query_engine import RetrieverQueryEngine
+ from tqdm import tqdm
+ from transformers import AutoTokenizer, AutoModel
+ from langchain.embeddings.base import Embeddings
+ from llama_index.embeddings.langchain import LangchainEmbedding
+ from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+ from transformers import AutoProcessor, AutoModel
+ import hashlib
+ import gradio as gr
+ import open_clip
+ from llama_index.core.schema import ImageDocument
+ import cv2
+ import matplotlib.pyplot as plt
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+ from unstructured.partition.pdf import partition_pdf
+ from pathlib import Path
+ from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader
+ from PIL import Image
+ import logging
+ import concurrent.futures
+ from llama_index.core import set_global_service_context
+ from llama_index.core import Document as LlamaIndexDocument
+ import getpass
+ from transformers import VisionEncoderDecoderModel, ViTImageProcessor
+ from sentence_transformers import util
+ from transformers import AutoModelForCausalLM
+ import base64
+ from google.generativeai import GenerativeModel, configure
+ import google.generativeai as genai
+
+ # Configure logging
+ # logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
+
+
+ class MetadataMode:
+     EMBED = "embed"
+     INLINE = "inline"
+     NONE = "none"
+
+ # Define the vectors configuration
+ vectors_config = {
+     "vector_size": 768,  # or whatever the dimensionality of your vectors is
+     "distance": "Cosine"  # can be "Cosine", "Euclidean", etc.
+ }
+ class ClinicalBertEmbeddingWrapper:
+     def __init__(self, model_name: str = "medicalai/ClinicalBERT"):
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.model = AutoModel.from_pretrained(model_name)
+         self.model.eval()
+
+     def embed(self, text: str):
+         inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+         with torch.no_grad():
+             outputs = self.model(**inputs)
+         embeddings = self.mean_pooling(outputs, inputs['attention_mask'])
+         return embeddings.squeeze().tolist()
+
+     def mean_pooling(self, model_output, attention_mask):
+         token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
+         input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+         return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+     def embed_documents(self, texts):
+         return [self.embed(text) for text in texts]
+
+     def embed_query(self, text):
+         return self.embed(text)
+
+     # Implement this method if needed
+     def get_text_embedding_batch(self, text_batch, show_progress=False):
+         embeddings = []
+         num_batches = len(text_batch)
+
+         # Process in batches of size 8
+         batch_size = 8
+         for i in tqdm(range(0, num_batches, batch_size), desc="Processing Batches", disable=not show_progress):
+             batch_texts = text_batch[i:i + batch_size]
+             batch_embeddings = self.embed_documents(batch_texts)
+             embeddings.extend(batch_embeddings)
+
+         return embeddings
+
+     def get_agg_embedding_from_queries(self, queries):
+         # Get embeddings for each query using the embed method
+         embeddings = [torch.tensor(self.embed(query)) for query in queries]
+
+         # Convert list of tensors to a single tensor for aggregation
+         embeddings_tensor = torch.stack(embeddings)
+
+         # Example: averaging embeddings
+         agg_embedding = embeddings_tensor.mean(dim=0)
+
+         return agg_embedding.tolist()
+
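A quick sanity check of the wrapper (a minimal sketch, not part of the committed file): embed_query mean-pools ClinicalBERT's token embeddings into one fixed-size vector per query, which is what the 768-dimension setting in vectors_config above refers to.

    # Illustrative check of the wrapper's output: a plain Python list of floats per query
    embedder = ClinicalBertEmbeddingWrapper()
    vec = embedder.embed_query("signs of periodontal disease in dogs")
    print(type(vec), len(vec))  # expected length: 768 (ClinicalBERT hidden size)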
+ # Load environment variables
+ load_dotenv()
+ genai.configure(api_key=os.environ["GEMINI_API_KEY"])
+ nvidia_api_key = os.getenv("NVIDIA_API_KEY")
+ if not nvidia_api_key:
+     raise ValueError("NVIDIA_API_KEY not found in .env file")
+
+ os.environ["NVIDIA_API_KEY"] = nvidia_api_key
+
+ model_name = "aaditya/OpenBioLLM-Llama3-8B-GGUF"
+ model_file = "openbiollm-llama3-8b.Q5_K_M.gguf"
+ QDRANT_URL = "https://f1e9a70a-afb9-498d-b66d-cb248e0d5557.us-east4-0.gcp.cloud.qdrant.io:6333"
+ QDRANT_API_KEY = "REXlX_PeDvCoXeS9uKCzC--e3-LQV0lw3_jBTdcLZ7P5_F6EOdwklA"
+
+ # Download the quantized OpenBioLLM model
+ model_path = hf_hub_download(model_name, filename=model_file, local_dir='./')
+ llm = NVIDIA(model="writer/palmyra-med-70b")
+ llm.model
+ local_llm = "openbiollm-llama3-8b.Q5_K_M.gguf"
+ # Initialize the ClinicalBERT embeddings model
+ # text_embed_model = ClinicalBertEmbeddings(model_name="medicalai/ClinicalBERT")
+ text_embed_model = ClinicalBertEmbeddingWrapper(model_name="medicalai/ClinicalBERT")
+ # Initially the local OpenBioLLM model below was used for generation; the NVIDIA-hosted model gives
+ # faster responses at inference time, but the pipeline also works with the local model.
+ llm1 = LlamaCpp(
+     model_path=local_llm,
+     temperature=0.3,
+     n_ctx=2048,
+     top_p=1
+ )
+ Settings.llm = llm
+ Settings.embed_model = text_embed_model
+ # Define ServiceContext with ClinicalBERT embeddings for text
+ service_context = ServiceContext.from_defaults(
+     llm=llm,
+     embed_model=text_embed_model  # Use the ClinicalBERT embeddings model
+ )
+ set_global_service_context(service_context)
+ # Just for logging and debugging
+ # logging.debug(f"LLM: {service_context.llm}")
+ # logging.debug(f"Embed Model: {service_context.embed_model}")
+ # logging.debug(f"Node Parser: {service_context.node_parser}")
+ # logging.debug(f"Prompt Helper: {service_context.prompt_helper}")
+ # Create a QdrantClient connected to the hosted Qdrant Cloud cluster defined by QDRANT_URL
+ try:
+     text_client = qdrant_client.QdrantClient(
+         url=QDRANT_URL,
+         api_key=QDRANT_API_KEY,
+         port=443,
+     )
+     print("Qdrant client initialized successfully.")
+ except Exception as e:
+     print(f"Error initializing Qdrant client: {e}")
+     raise
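Note that the vectors_config dictionary defined near the top of the file is never passed to Qdrant. If the two collections were to be created explicitly, a hypothetical sketch with the qdrant-client models API could look like the following (sizes assumed here: ClinicalBERT text vectors are 768-dimensional, CLIP ViT-B/32 image vectors are 512-dimensional):

    from qdrant_client.http import models as qmodels

    # Create the text and image collections up front if they do not already exist
    existing = {c.name for c in text_client.get_collections().collections}
    if "pdf_text" not in existing:
        text_client.create_collection(
            collection_name="pdf_text",
            vectors_config=qmodels.VectorParams(size=768, distance=qmodels.Distance.COSINE),
        )
    if "pdf_img" not in existing:
        text_client.create_collection(
            collection_name="pdf_img",
            vectors_config=qmodels.VectorParams(size=512, distance=qmodels.Distance.COSINE),
        )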
+ # Load text documents from the ./Data directory
+ # text_documents = SimpleDirectoryReader("./Data").load_data()
+ loader = DirectoryLoader("./Data/", glob="**/*.pdf", show_progress=True, loader_cls=UnstructuredFileLoader)
+ documents = loader.load()
+ # Print document names
+ for doc in documents:
+     print(f"Processing document: {doc.metadata.get('source', 'Unknown')}")
+ # Split documents into chunks
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=70)
+ texts = text_splitter.split_documents(documents)
+
+ print(f"Loaded {len(documents)} documents")
+ print(f"Split into {len(texts)} chunks")
+ # Convert langchain documents to llama_index documents
+ text_documents = [
+     LlamaIndexDocument(text=t.page_content, metadata=t.metadata)
+     for t in texts
+ ]
+ # Initialize the Qdrant vector store for text chunks
+ try:
+     text_vector_store = QdrantVectorStore(
+         client=text_client, collection_name="pdf_text"
+     )
+     print("Qdrant text vector store initialized successfully.")
+ except Exception as e:
+     print(f"Error initializing Qdrant vector store: {e}")
+     raise
+
+ # Initialize the Qdrant vector store for image embeddings
+ try:
+     image_vector_store = QdrantVectorStore(
+         client=text_client, collection_name="pdf_img"
+     )
+     print("Qdrant image vector store initialized successfully.")
+ except Exception as e:
+     print(f"Error initializing Qdrant vector store: {e}")
+     raise
+
+ storage_context = StorageContext.from_defaults(vector_store=text_vector_store)
+
+ wiki_text_index = VectorStoreIndex.from_documents(text_documents
+     # , storage_context=storage_context
+     , service_context=service_context
+ )
+ print(f"VectorStoreIndex created with {len(wiki_text_index.docstore.docs)} documents")
+
+ # Define the streaming query engine
+ streaming_qe = wiki_text_index.as_query_engine(streaming=True)
+ print(len(text_documents))
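The streaming engine can be exercised on its own, following the same pattern as the commented-out test at the end of this file; a minimal sketch:

    # Quick smoke test of the streaming query engine
    streaming_response = streaming_qe.query("What is gingivitis?")
    streaming_response.print_response_stream()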
+ # Function to query the text vector database
+ # Modify the process_query function
+
+ model, preprocess = clip.load("ViT-B/32")
+ input_resolution = model.visual.input_resolution
+ context_length = model.context_length
+ vocab_size = model.vocab_size
+
+ print(
+     "Model parameters:",
+     f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}",
+ )
+ print("Input resolution:", input_resolution)
+ print("Context length:", context_length)
+ print("Vocab size:", vocab_size)
+
+ pdf_directory = Path("./Data")
+ image_path = Path("./images1")
+ image_path.mkdir(exist_ok=True, parents=True)
+
+ # Dictionary to store image metadata
+ image_metadata_dict = {}
+
+ # Limit the number of images downloaded per PDF
+ MAX_IMAGES_PER_PDF = 15
+
+ # Generate a UUID for each image
+ image_uuid = 0
+
+ # Iterate over each PDF file in the Data folder
+ for pdf_file in pdf_directory.glob("*.pdf"):
+     images_per_pdf = 0
+     print(f"Processing: {pdf_file}")
+
+     # Extract images from the PDF
+     try:
+         raw_pdf_elements = partition_pdf(
+             filename=str(pdf_file),
+             extract_images_in_pdf=True,
+             infer_table_structure=True,
+             chunking_strategy="by_title",
+             max_characters=4000,
+             new_after_n_chars=3800,
+             combine_text_under_n_chars=2000,
+             extract_image_block_output_dir=image_path,
+         )
+     except Exception as e:
+         print(f"Error processing {pdf_file}: {e}")
+         import traceback
+         traceback.print_exc()
+         continue
+ # Function to summarize images with Gemini
+ def summarize_image(image_path):
+     # Load and encode the image
+     with open(image_path, "rb") as image_file:
+         encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
+
+     # Create a GenerativeModel object
+     model = GenerativeModel('gemini-1.5-flash')
+
+     # Prepare the prompt
+     prompt = """
+     You are an expert in analyzing medical images, particularly those related to canine (dog) health.
+     Please provide a detailed description of this medical image, including:
+     1. The body part or area being examined
+     2. Any visible structures, organs, or tissues
+     3. Any abnormalities, lesions, or notable features
+     4. Any other relevant medical diagram description.
+
+     Please be as specific and detailed as possible in your analysis.
+     """
+
+     # Generate the response
+     response = model.generate_content([
+         prompt,
+         {"mime_type": "image/jpeg", "data": encoded_image}
+     ])
+
+     return response.text
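For a quick smoke test, the summarizer can be pointed at one of the figures extracted into ./images1 (a minimal sketch; figure-1-1.jpg is one of the files committed in this repository):

    # Hypothetical smoke test: summarize a single extracted figure and print the start of the result
    sample_summary = summarize_image("./images1/figure-1-1.jpg")
    print(sample_summary[:300])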
+
+ # Iterate through each image file in the extraction directory
+ for image_file in os.listdir(image_path):
+     if image_file.endswith(('.jpg', '.jpeg', '.png')):
+         # Generate a standard UUID for the image
+         image_uuid = str(uuid.uuid4())
+         image_file_name = image_file
+         image_file_path = image_path / image_file
+         # Generate image summary
+         # image_summary = generate_image_summary_with(str(image_file_path), model, feature_extractor, tokenizer, device)
+         # image_summary = generate_summary_with_lm(str(image_file_path), preprocess, model, device, tokenizer, lm_model)
+         image_summary = summarize_image(image_file_path)
+         # Construct metadata entry for the image
+         image_metadata_dict[image_uuid] = {
+             "filename": image_file_name,
+             "img_path": str(image_file_path),  # Store the path to the image
+             "summary": image_summary  # Add the summary to the metadata
+         }
+
+         # Limit the number of images processed per folder
+         if len(image_metadata_dict) >= MAX_IMAGES_PER_PDF:
+             break
+
+ print(f"Number of items in image_dict: {len(image_metadata_dict)}")
+
+ # Print the metadata dictionary
+ for key, value in image_metadata_dict.items():
+     print(f"UUID: {key}, Metadata: {value}")
+
+
+ def plot_images_with_opencv(image_metadata_dict):
+     original_images_urls = []
+     images_shown = 0
+
+     plt.figure(figsize=(16, 16))  # Adjust the figure size as needed
+
+     for image_id in image_metadata_dict:
+         img_path = image_metadata_dict[image_id]["img_path"]
+         if os.path.isfile(img_path):
+             try:
+                 img = cv2.imread(img_path)
+                 if img is not None:
+                     # Convert BGR (OpenCV) to RGB (matplotlib)
+                     img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+                     plt.subplot(8, 8, len(original_images_urls) + 1)
+                     plt.imshow(img_rgb)
+                     plt.xticks([])
+                     plt.yticks([])
+
+                     original_images_urls.append(image_metadata_dict[image_id]["filename"])
+                     images_shown += 1
+                     if images_shown >= 64:
+                         break
+             except Exception as e:
+                 print(f"Error processing image {img_path}: {e}")
+
+     plt.tight_layout()
+     plt.show()
+
+ plot_images_with_opencv(image_metadata_dict)
+ # Set the device to use for the CLIP model, either CUDA (GPU) or CPU, depending on availability
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(device)
+ # Function to preprocess an OpenCV image for CLIP
+ def preprocess_image(img):
+     # Convert BGR to RGB
+     img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+     # Convert the image to a PIL Image and then preprocess
+     img_pil = Image.fromarray(img_rgb)
+     return preprocess(img_pil)
+     # Use BiomedCLIP processor for preprocessing
+     # return preprocess(images=img_pil, return_tensors="pt")
+     # return preprocess(img_pil).unsqueeze(0)
+
+
+ img_emb_dict = {}
+ with torch.no_grad():
+     for image_id in image_metadata_dict:
+         img_file_path = image_metadata_dict[image_id]["img_path"]
+         if os.path.isfile(img_file_path):
+             try:
+                 # Load image using OpenCV
+                 img = cv2.imread(img_file_path)
+
+                 if img is not None:
+                     # Preprocess image
+                     image = preprocess_image(img).unsqueeze(0).to(device)
+                     # image = preprocess_image(img).to(device)
+
+                     # Extract image features
+                     image_features = model.encode_image(image)
+
+                     # Store image features
+                     img_emb_dict[image_id] = image_features
+                 else:
+                     print(f"Failed to load image {img_file_path}")
+             except Exception as e:
+                 print(f"Error processing image {img_file_path}: {e}")
+
+ # One embedding per successfully processed image
+ print(f"Number of image embeddings: {len(img_emb_dict)}")
+
+
+ # Create a list of ImageDocument objects, one for each image in the dataset
+ img_documents = []
+ for image_filename in image_metadata_dict:
+     # the img_emb_dict dictionary contains the image embeddings
+     if image_filename in img_emb_dict:
+         filename = image_metadata_dict[image_filename]["filename"]
+         filepath = image_metadata_dict[image_filename]["img_path"]
+         summary = image_metadata_dict[image_filename]["summary"]
+         # print(filepath)
+
+         # Create an ImageDocument for each image
+         newImgDoc = ImageDocument(
+             text=filename, metadata={"filepath": filepath, "summary": summary}  # Include the summary in the metadata
+         )
+
+         # Set the image embedding on the ImageDocument
+         newImgDoc.embedding = img_emb_dict[image_filename].tolist()[0]
+         img_documents.append(newImgDoc)
+
+ # Define storage context
+ storage_context = StorageContext.from_defaults(vector_store=image_vector_store)
+
+ # Define image index
+ image_index = VectorStoreIndex.from_documents(
+     img_documents,
+     storage_context=storage_context
+ )
+ # for doc in img_documents:
+ #     print(f"ImageDocument: {doc.text}, Embedding: {doc.embedding}, Metadata: {doc.metadata}")
448
+
449
+ def retrieve_results_from_image_index(query):
450
+ """ take a text query as input and return the most similar image from the vector store """
451
+
452
+ # first tokenize the text query and convert it to a tensor
453
+ text = clip.tokenize(query).to(device)
454
+
455
+ # encode the text tensor using the CLIP model to produce a query embedding
456
+ query_embedding = model.encode_text(text).tolist()[0]
457
+ # Encode the query using ClinicalBERT for text similarity
458
+ clinical_query_embedding = text_embed_model.embed_query(query)
459
+ # create a VectorStoreQuery
460
+ image_vector_store_query = VectorStoreQuery(
461
+ query_embedding=query_embedding,
462
+ similarity_top_k=1, # returns 1 image
463
+ mode="default",
464
+ )
465
+
466
+ # execute the query against the image vector store
467
+ image_retrieval_results = image_vector_store.query(
468
+ image_vector_store_query
469
+ )
470
+ if image_retrieval_results.nodes:
471
+ best_score = -1
472
+ best_image = None
473
+
474
+ for node, clip_score in zip(image_retrieval_results.nodes, image_retrieval_results.similarities):
475
+ image_path = node.metadata["filepath"]
476
+ image_summary = node.metadata.get("summary", "") # Assuming summaries are stored in metadata
477
+
478
+ # Calculate text similarity between query and image summary
479
+ summary_embedding = text_embed_model.embed_query(image_summary)
480
+ # text_score = util.cosine_similarity(
481
+ # [clinical_query_embedding], [summary_embedding]
482
+ # )[0][0]
483
+ # Use util.cos_sim for cosine similarity
484
+ text_score = util.cos_sim(torch.tensor([clinical_query_embedding]),
485
+ torch.tensor([summary_embedding]))[0][0].item()
486
+
487
+
488
+ # Calculate average similarity score
489
+ avg_score = (clip_score + text_score) / 2
490
+
491
+ if avg_score > best_score:
492
+ best_score = avg_score
493
+ best_image = image_path
494
+
495
+ return best_image, best_score
496
+
497
+ return None, 0.0
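The retriever blends two signals: the CLIP text-to-image score returned by Qdrant and a ClinicalBERT cosine similarity between the query and the Gemini summary of the candidate image, averaged into avg_score. A minimal usage sketch (the query mirrors the commented-out test further down):

    # Hypothetical check of the blended CLIP + ClinicalBERT retrieval score
    best_image, best_score = retrieve_results_from_image_index("What is gingivitis?")
    print(f"Best image: {best_image}, blended score: {best_score:.4f}")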
+
+ def plot_image_retrieve_results(image_retrieval_results):
+     """Take a list of image retrieval results and plot them in a new figure."""
+
+     plt.figure(figsize=(16, 5))
+
+     img_cnt = 0
+
+     # Iterate over the image retrieval results, and for each result, display the corresponding image and its score in a subplot.
+     # The title of the subplot is the score of the image, formatted to four decimal places.
+
+     for returned_image, score in zip(
+         image_retrieval_results.nodes, image_retrieval_results.similarities
+     ):
+         img_name = returned_image.text
+         img_path = returned_image.metadata["filepath"]
+
+         # Read image using OpenCV
+         image = cv2.imread(img_path)
+         # Convert image to RGB format (OpenCV reads in BGR by default)
+         image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+         plt.subplot(2, 3, img_cnt + 1)
+         plt.title("{:.4f}".format(score))
+
+         plt.imshow(image_rgb)
+         plt.xticks([])
+         plt.yticks([])
+         img_cnt += 1
+
+     plt.tight_layout()
+     plt.show()
+
+ def get_all_images():
+     image_paths = []
+     for _, metadata in image_metadata_dict.items():
+         image_paths.append(metadata["img_path"])
+     return image_paths
+
+ def load_image(image_path):
+     return Image.open(image_path)
+
+ # Define the combined text + image query function
+ def combined_query(query, similarity_threshold=0.3):
+     # Text query
+     text_response = streaming_qe.query(query)
+     text_result = ""
+     for text in text_response.response_gen:
+         text_result += text
+
+     # Image query
+     top_image_path, similarity_score = retrieve_results_from_image_index(query)
+
+     if similarity_score >= similarity_threshold:
+         return text_result, top_image_path, similarity_score
+     else:
+         return text_result, None, similarity_score
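combined_query only returns an image path when the blended score clears similarity_threshold (0.3 by default); a short hypothetical usage sketch:

    # End-to-end call: streamed text answer plus (optionally) the top matching image
    answer, top_image_path, score = combined_query("What is gingivitis?")
    print(answer)
    print(top_image_path, score)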
+ def gradio_interface(query):
+     text_result, image_path, similarity_score = combined_query(query)
+     top_image = load_image(image_path) if image_path else None
+     all_images = [load_image(path) for path in get_all_images()]
+     return text_result, top_image, all_images, f"Similarity Score: {similarity_score:.4f}"
+
+ with gr.Blocks() as iface:
+     gr.Markdown("# Medical Knowledge Base Query System")
+
+     with gr.Row():
+         query_input = gr.Textbox(lines=2, placeholder="Enter your medical query here...")
+         submit_button = gr.Button("Submit")
+
+     with gr.Row():
+         text_output = gr.Textbox(label="Text Response")
+         image_output = gr.Image(label="Top Related Image (if similarity > threshold)")
+
+     similarity_score_output = gr.Textbox(label="Similarity Score")
+
+     gallery_output = gr.Gallery(label="All Extracted Images", show_label=True, elem_id="gallery")
+
+     submit_button.click(
+         fn=gradio_interface,
+         inputs=query_input,
+         outputs=[text_output, image_output, gallery_output, similarity_score_output]
+     )
+
+     # Load all images on startup
+     iface.load(lambda: ["", None, [load_image(path) for path in get_all_images()], ""],
+                outputs=[text_output, image_output, gallery_output, similarity_score_output])
+ # Launch the Gradio interface
+ iface.launch(share=True)
+ # just to check if it works or not
+ # def image_query(query):
+ #     image_retrieval_results = retrieve_results_from_image_index(query)
+ #     plot_image_retrieve_results(image_retrieval_results)
+
+ # query1 = "What is gingivitis?"
+ # # generate image retrieval results
+ # image_query(query1)
+
+ # # Modify your text query function
+ # # def text_query(query):
+ # #     text_retrieval_results = process_query(query, text_embed_model, k=10)
+ # #     return text_retrieval_results
+ # # Function to query the text vector database
+
+
+ # def text_query(query: str, k: int = 10):
+ #     # Create a VectorStoreIndex from the existing vector store
+ #     index = VectorStoreIndex.from_vector_store(text_vector_store)
+
+ #     # Create a retriever with top-k configuration
+ #     retriever = index.as_retriever(similarity_top_k=k)
+
+ #     # Create a query engine
+ #     query_engine = RetrieverQueryEngine.from_args(retriever)
+
+ #     # Execute the query
+ #     response = query_engine.query(query)
+
+ #     return response
+
+ # # text_retrieval_results = text_query(query1)
+ # streaming_response = streaming_qe.query(
+ #     query1
+ # )
+ # streaming_response.print_response_stream()
freeze ADDED
File without changes
images.py ADDED
@@ -0,0 +1,12 @@
+ from unstructured.partition.pdf import partition_pdf
+ output_path = "./images"
+ raw_pdf_elements = partition_pdf(
+     filename="./Data/AC-Aids-for-Dogs_Canine-Periodontal-Disease.pdf",
+     extract_images_in_pdf=True,
+     infer_table_structure=True,
+     chunking_strategy="by_title",
+     max_characters=4000,
+     new_after_n_chars=3800,
+     combine_text_under_n_chars=2000,
+     extract_image_block_output_dir=output_path,
+ )
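Running this standalone script writes any figures found in the periodontal-disease PDF to ./images (the figure-1-*.jpg files listed below); a small hypothetical follow-up to confirm what was extracted:

    import os

    # List the figures that partition_pdf wrote to the output directory
    for name in sorted(os.listdir(output_path)):
        print(name)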
images/architecture.png ADDED
images/figure-1-1.jpg ADDED
images/figure-1-10.jpg ADDED
images/figure-1-11.jpg ADDED
images/figure-1-2.jpg ADDED
images/figure-1-3.jpg ADDED
images/figure-1-4.jpg ADDED
images/figure-1-5.jpg ADDED
images/figure-1-6.jpg ADDED
images/figure-1-7.jpg ADDED
images/figure-1-8.jpg ADDED
images/figure-1-9.jpg ADDED
images/multimodal.png ADDED

Git LFS Details

  • SHA256: dbc3231d2c2523f245d369566eb3ff16441e399b45fdcc5fa52ceae806a8339d
  • Pointer size: 132 Bytes
  • Size of remote file: 1.79 MB
images1/figure-1-1.jpg ADDED
images1/figure-1-10.jpg ADDED
images1/figure-1-11.jpg ADDED
images1/figure-1-2.jpg ADDED
images1/figure-1-3.jpg ADDED
images1/figure-1-4.jpg ADDED
images1/figure-1-5.jpg ADDED
images1/figure-1-6.jpg ADDED