Surbhi123 committed
Commit 64772a4
1 Parent(s): 9e7ce10

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50):
  1. .DS_Store +0 -0
  2. .cache/huggingface/.gitignore +1 -0
  3. .cache/huggingface/download/openbiollm-llama3-8b.Q5_K_M.gguf.lock +0 -0
  4. .cache/huggingface/download/openbiollm-llama3-8b.Q5_K_M.gguf.metadata +3 -0
  5. .env +4 -0
  6. .gitattributes +155 -0
  7. .github/workflows/update_space.yml +28 -0
  8. .gitignore +7 -0
  9. .streamlit/secrets.toml +3 -0
  10. Data/AC-Aids-for-Dogs_Canine-Periodontal-Disease.pdf +0 -0
  11. Data/cancer_and_cure__a_critical_analysis.27.pdf +0 -0
  12. Data/medical_oncology_handbook_june_2020_edition.pdf +0 -0
  13. DockerFile +20 -0
  14. MultimodalRAG.ipynb +0 -0
  15. MultimodalRAGUpdatedVersion.ipynb +0 -0
  16. README.md +125 -8
  17. Streaming.py +223 -0
  18. Streamingnewversion.py +244 -0
  19. __pycache__/app.cpython-310.pyc +0 -0
  20. __pycache__/clip_helpers.cpython-310.pyc +0 -0
  21. __pycache__/combinedmultimodal.cpython-310.pyc +0 -0
  22. __pycache__/imagebind.cpython-310.pyc +0 -0
  23. __pycache__/images.cpython-310.pyc +0 -0
  24. __pycache__/ingest.cpython-310.pyc +0 -0
  25. app.py +83 -0
  26. app1.py +119 -0
  27. combinedmultimodal.py +621 -0
  28. freeze +0 -0
  29. images.py +12 -0
  30. images/architecture.png +0 -0
  31. images/figure-1-1.jpg +0 -0
  32. images/figure-1-10.jpg +0 -0
  33. images/figure-1-11.jpg +0 -0
  34. images/figure-1-2.jpg +0 -0
  35. images/figure-1-3.jpg +0 -0
  36. images/figure-1-4.jpg +0 -0
  37. images/figure-1-5.jpg +0 -0
  38. images/figure-1-6.jpg +0 -0
  39. images/figure-1-7.jpg +0 -0
  40. images/figure-1-8.jpg +0 -0
  41. images/figure-1-9.jpg +0 -0
  42. images/multimodal.png +3 -0
  43. images1/figure-1-1.jpg +0 -0
  44. images1/figure-1-10.jpg +0 -0
  45. images1/figure-1-11.jpg +0 -0
  46. images1/figure-1-2.jpg +0 -0
  47. images1/figure-1-3.jpg +0 -0
  48. images1/figure-1-4.jpg +0 -0
  49. images1/figure-1-5.jpg +0 -0
  50. images1/figure-1-6.jpg +0 -0
.DS_Store ADDED
Binary file (6.15 kB).
 
.cache/huggingface/.gitignore ADDED
@@ -0,0 +1 @@
+ *
.cache/huggingface/download/openbiollm-llama3-8b.Q5_K_M.gguf.lock ADDED
File without changes
.cache/huggingface/download/openbiollm-llama3-8b.Q5_K_M.gguf.metadata ADDED
@@ -0,0 +1,3 @@
+ d1248c48f0ade670847d05fb2cb356a75df4db3a
+ 1753c629bf99c261e8b92498d813f382f811e903cdc0e685a11d1689612b34ce
+ 1723860909.403446
.env ADDED
@@ -0,0 +1,4 @@
+ QDRANT_URL=https://f1e9a70a-afb9-498d-b66d-cb248e0d5557.us-east4-0.gcp.cloud.qdrant.io:6333
+ QDRANT_API_KEY=REXlX_PeDvCoXeS9uKCzC--e3-LQV0lw3_jBTdcLZ7P5_F6EOdwklA
+ NVIDIA_API_KEY=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
+ GEMINI_API_KEY=AIzaSyCXGnm-n6aF962jeorkjo2IsMCwxDwj4bo
.gitattributes CHANGED
@@ -33,3 +33,158 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ images/multimodal.png filter=lfs diff=lfs merge=lfs -text
37
+ multimodal.png filter=lfs diff=lfs merge=lfs -text
38
+ myenv/bin/python filter=lfs diff=lfs merge=lfs -text
39
+ myenv/bin/python3 filter=lfs diff=lfs merge=lfs -text
40
+ myenv/bin/python3.10 filter=lfs diff=lfs merge=lfs -text
41
+ myenv/bin/ruff filter=lfs diff=lfs merge=lfs -text
42
+ myenv/lib/python3.10/site-packages/Cython/Compiler/Code.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
43
+ myenv/lib/python3.10/site-packages/PIL/.dylibs/libfreetype.6.dylib filter=lfs diff=lfs merge=lfs -text
44
+ myenv/lib/python3.10/site-packages/PIL/.dylibs/libharfbuzz.0.dylib filter=lfs diff=lfs merge=lfs -text
45
+ myenv/lib/python3.10/site-packages/_soundfile_data/libsndfile_x86_64.dylib filter=lfs diff=lfs merge=lfs -text
46
+ myenv/lib/python3.10/site-packages/altair/vegalite/v5/schema/__pycache__/channels.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
47
+ myenv/lib/python3.10/site-packages/altair/vegalite/v5/schema/__pycache__/core.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
48
+ myenv/lib/python3.10/site-packages/av/.dylibs/libaom.3.2.0.dylib filter=lfs diff=lfs merge=lfs -text
49
+ myenv/lib/python3.10/site-packages/av/.dylibs/libavcodec.60.31.102.dylib filter=lfs diff=lfs merge=lfs -text
50
+ myenv/lib/python3.10/site-packages/av/.dylibs/libavfilter.9.12.100.dylib filter=lfs diff=lfs merge=lfs -text
51
+ myenv/lib/python3.10/site-packages/av/.dylibs/libavformat.60.16.100.dylib filter=lfs diff=lfs merge=lfs -text
52
+ myenv/lib/python3.10/site-packages/av/.dylibs/libdav1d.7.dylib filter=lfs diff=lfs merge=lfs -text
53
+ myenv/lib/python3.10/site-packages/av/.dylibs/libfreetype.6.dylib filter=lfs diff=lfs merge=lfs -text
54
+ myenv/lib/python3.10/site-packages/av/.dylibs/libharfbuzz.0.dylib filter=lfs diff=lfs merge=lfs -text
55
+ myenv/lib/python3.10/site-packages/av/.dylibs/libswscale.7.5.100.dylib filter=lfs diff=lfs merge=lfs -text
56
+ myenv/lib/python3.10/site-packages/av/.dylibs/libvpx.9.dylib filter=lfs diff=lfs merge=lfs -text
57
+ myenv/lib/python3.10/site-packages/av/.dylibs/libx264.164.dylib filter=lfs diff=lfs merge=lfs -text
58
+ myenv/lib/python3.10/site-packages/av/.dylibs/libx265.199.dylib filter=lfs diff=lfs merge=lfs -text
59
+ myenv/lib/python3.10/site-packages/av/.dylibs/libxml2.2.dylib filter=lfs diff=lfs merge=lfs -text
60
+ myenv/lib/python3.10/site-packages/cmake/data/bin/ccmake filter=lfs diff=lfs merge=lfs -text
61
+ myenv/lib/python3.10/site-packages/cmake/data/bin/cmake filter=lfs diff=lfs merge=lfs -text
62
+ myenv/lib/python3.10/site-packages/cmake/data/bin/cpack filter=lfs diff=lfs merge=lfs -text
63
+ myenv/lib/python3.10/site-packages/cmake/data/bin/ctest filter=lfs diff=lfs merge=lfs -text
64
+ myenv/lib/python3.10/site-packages/cmake/data/doc/cmake/CMake.qch filter=lfs diff=lfs merge=lfs -text
65
+ myenv/lib/python3.10/site-packages/cryptography/hazmat/bindings/_rust.abi3.so filter=lfs diff=lfs merge=lfs -text
66
+ myenv/lib/python3.10/site-packages/ctransformers/lib/avx/ctransformers.dll filter=lfs diff=lfs merge=lfs -text
67
+ myenv/lib/python3.10/site-packages/ctransformers/lib/avx/libctransformers.dylib filter=lfs diff=lfs merge=lfs -text
68
+ myenv/lib/python3.10/site-packages/ctransformers/lib/avx/libctransformers.so filter=lfs diff=lfs merge=lfs -text
69
+ myenv/lib/python3.10/site-packages/ctransformers/lib/avx2/ctransformers.dll filter=lfs diff=lfs merge=lfs -text
70
+ myenv/lib/python3.10/site-packages/ctransformers/lib/avx2/libctransformers.dylib filter=lfs diff=lfs merge=lfs -text
71
+ myenv/lib/python3.10/site-packages/ctransformers/lib/avx2/libctransformers.so filter=lfs diff=lfs merge=lfs -text
72
+ myenv/lib/python3.10/site-packages/ctransformers/lib/basic/ctransformers.dll filter=lfs diff=lfs merge=lfs -text
73
+ myenv/lib/python3.10/site-packages/ctransformers/lib/basic/libctransformers.dylib filter=lfs diff=lfs merge=lfs -text
74
+ myenv/lib/python3.10/site-packages/ctransformers/lib/basic/libctransformers.so filter=lfs diff=lfs merge=lfs -text
75
+ myenv/lib/python3.10/site-packages/ctransformers/lib/cuda/ctransformers.dll filter=lfs diff=lfs merge=lfs -text
76
+ myenv/lib/python3.10/site-packages/ctransformers/lib/cuda/libctransformers.so filter=lfs diff=lfs merge=lfs -text
77
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libSvtAv1Enc.1.8.0.dylib filter=lfs diff=lfs merge=lfs -text
78
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libX11.6.dylib filter=lfs diff=lfs merge=lfs -text
79
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libaom.3.8.0.dylib filter=lfs diff=lfs merge=lfs -text
80
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libavcodec.60.31.102.dylib filter=lfs diff=lfs merge=lfs -text
81
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libavformat.60.16.100.dylib filter=lfs diff=lfs merge=lfs -text
82
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libcrypto.3.dylib filter=lfs diff=lfs merge=lfs -text
83
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libdav1d.7.dylib filter=lfs diff=lfs merge=lfs -text
84
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libgnutls.30.dylib filter=lfs diff=lfs merge=lfs -text
85
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libjxl.0.9.0.dylib filter=lfs diff=lfs merge=lfs -text
86
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libp11-kit.0.dylib filter=lfs diff=lfs merge=lfs -text
87
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/librav1e.0.6.6.dylib filter=lfs diff=lfs merge=lfs -text
88
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libunistring.5.dylib filter=lfs diff=lfs merge=lfs -text
89
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libvpx.8.dylib filter=lfs diff=lfs merge=lfs -text
90
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libx264.164.dylib filter=lfs diff=lfs merge=lfs -text
91
+ myenv/lib/python3.10/site-packages/cv2/.dylibs/libx265.199.dylib filter=lfs diff=lfs merge=lfs -text
92
+ myenv/lib/python3.10/site-packages/cv2/cv2.abi3.so filter=lfs diff=lfs merge=lfs -text
93
+ myenv/lib/python3.10/site-packages/decord/.dylibs/libavcodec.58.35.100.dylib filter=lfs diff=lfs merge=lfs -text
94
+ myenv/lib/python3.10/site-packages/decord/.dylibs/libavfilter.7.40.101.dylib filter=lfs diff=lfs merge=lfs -text
95
+ myenv/lib/python3.10/site-packages/decord/.dylibs/libavformat.58.20.100.dylib filter=lfs diff=lfs merge=lfs -text
96
+ myenv/lib/python3.10/site-packages/decord/.dylibs/libvpx.8.dylib filter=lfs diff=lfs merge=lfs -text
97
+ myenv/lib/python3.10/site-packages/decord/.dylibs/libx264.164.dylib filter=lfs diff=lfs merge=lfs -text
98
+ myenv/lib/python3.10/site-packages/decord/libdecord.dylib filter=lfs diff=lfs merge=lfs -text
99
+ myenv/lib/python3.10/site-packages/emoji/unicode_codes/__pycache__/data_dict.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
100
+ myenv/lib/python3.10/site-packages/gradio/frpc_darwin_amd64_v0.2 filter=lfs diff=lfs merge=lfs -text
101
+ myenv/lib/python3.10/site-packages/grpc/_cython/cygrpc.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
102
+ myenv/lib/python3.10/site-packages/grpc_tools/_protoc_compiler.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
103
+ myenv/lib/python3.10/site-packages/layoutparser/misc/NotoSerifCJKjp-Regular.otf filter=lfs diff=lfs merge=lfs -text
104
+ myenv/lib/python3.10/site-packages/lib/libllama.dylib filter=lfs diff=lfs merge=lfs -text
105
+ myenv/lib/python3.10/site-packages/llama_cpp/libllama.dylib filter=lfs diff=lfs merge=lfs -text
106
+ myenv/lib/python3.10/site-packages/lxml/etree.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
107
+ myenv/lib/python3.10/site-packages/lxml/objectify.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
108
+ myenv/lib/python3.10/site-packages/magic/libmagic/magic.mgc filter=lfs diff=lfs merge=lfs -text
109
+ myenv/lib/python3.10/site-packages/minijinja/_lowlevel.abi3.so filter=lfs diff=lfs merge=lfs -text
110
+ myenv/lib/python3.10/site-packages/numpy/.dylibs/libgfortran.5.dylib filter=lfs diff=lfs merge=lfs -text
111
+ myenv/lib/python3.10/site-packages/numpy/.dylibs/libopenblas64_.0.dylib filter=lfs diff=lfs merge=lfs -text
112
+ myenv/lib/python3.10/site-packages/numpy/core/_multiarray_umath.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
113
+ myenv/lib/python3.10/site-packages/numpy/core/_simd.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
114
+ myenv/lib/python3.10/site-packages/onnx/onnx_cpp2py_export.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
115
+ myenv/lib/python3.10/site-packages/onnxruntime/capi/onnxruntime_pybind11_state.so filter=lfs diff=lfs merge=lfs -text
116
+ myenv/lib/python3.10/site-packages/pandas/_libs/algos.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
117
+ myenv/lib/python3.10/site-packages/pandas/_libs/groupby.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
118
+ myenv/lib/python3.10/site-packages/pandas/_libs/hashtable.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
119
+ myenv/lib/python3.10/site-packages/pandas/_libs/interval.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
120
+ myenv/lib/python3.10/site-packages/pandas/_libs/join.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
121
+ myenv/lib/python3.10/site-packages/pandas/_libs/tslibs/offsets.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
122
+ myenv/lib/python3.10/site-packages/pikepdf/.dylibs/libgnutls.30.dylib filter=lfs diff=lfs merge=lfs -text
123
+ myenv/lib/python3.10/site-packages/pikepdf/.dylibs/libp11-kit.0.dylib filter=lfs diff=lfs merge=lfs -text
124
+ myenv/lib/python3.10/site-packages/pikepdf/.dylibs/libqpdf.29.8.0.dylib filter=lfs diff=lfs merge=lfs -text
125
+ myenv/lib/python3.10/site-packages/pikepdf/.dylibs/libunistring.5.dylib filter=lfs diff=lfs merge=lfs -text
126
+ myenv/lib/python3.10/site-packages/pikepdf/_core.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
127
+ myenv/lib/python3.10/site-packages/pillow_heif/.dylibs/libaom.3.8.0.dylib filter=lfs diff=lfs merge=lfs -text
128
+ myenv/lib/python3.10/site-packages/pillow_heif/.dylibs/libjxl.0.8.2.dylib filter=lfs diff=lfs merge=lfs -text
129
+ myenv/lib/python3.10/site-packages/pillow_heif/.dylibs/libx265.199.dylib filter=lfs diff=lfs merge=lfs -text
130
+ myenv/lib/python3.10/site-packages/pyarrow/_compute.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
131
+ myenv/lib/python3.10/site-packages/pyarrow/_dataset.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
132
+ myenv/lib/python3.10/site-packages/pyarrow/_flight.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
133
+ myenv/lib/python3.10/site-packages/pyarrow/lib.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
134
+ myenv/lib/python3.10/site-packages/pyarrow/libarrow.1601.dylib filter=lfs diff=lfs merge=lfs -text
135
+ myenv/lib/python3.10/site-packages/pyarrow/libarrow_acero.1601.dylib filter=lfs diff=lfs merge=lfs -text
136
+ myenv/lib/python3.10/site-packages/pyarrow/libarrow_dataset.1601.dylib filter=lfs diff=lfs merge=lfs -text
137
+ myenv/lib/python3.10/site-packages/pyarrow/libarrow_flight.1601.dylib filter=lfs diff=lfs merge=lfs -text
138
+ myenv/lib/python3.10/site-packages/pyarrow/libarrow_python.dylib filter=lfs diff=lfs merge=lfs -text
139
+ myenv/lib/python3.10/site-packages/pyarrow/libarrow_substrait.1601.dylib filter=lfs diff=lfs merge=lfs -text
140
+ myenv/lib/python3.10/site-packages/pyarrow/libparquet.1601.dylib filter=lfs diff=lfs merge=lfs -text
141
+ myenv/lib/python3.10/site-packages/pydantic_core/_pydantic_core.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
142
+ myenv/lib/python3.10/site-packages/pydeck/nbextension/static/index.js.map filter=lfs diff=lfs merge=lfs -text
143
+ myenv/lib/python3.10/site-packages/pypdf/_codecs/__pycache__/adobe_glyphs.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
144
+ myenv/lib/python3.10/site-packages/pypdfium2_raw/libpdfium.dylib filter=lfs diff=lfs merge=lfs -text
145
+ myenv/lib/python3.10/site-packages/rapidfuzz/distance/metrics_cpp.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
146
+ myenv/lib/python3.10/site-packages/rapidfuzz/distance/metrics_cpp_avx2.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
147
+ myenv/lib/python3.10/site-packages/rapidfuzz/fuzz_cpp.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
148
+ myenv/lib/python3.10/site-packages/rapidfuzz/fuzz_cpp_avx2.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
149
+ myenv/lib/python3.10/site-packages/safetensors/_safetensors_rust.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
150
+ myenv/lib/python3.10/site-packages/scipy/.dylibs/libgfortran.5.dylib filter=lfs diff=lfs merge=lfs -text
151
+ myenv/lib/python3.10/site-packages/scipy/.dylibs/libopenblas.0.dylib filter=lfs diff=lfs merge=lfs -text
152
+ myenv/lib/python3.10/site-packages/scipy/fft/_pocketfft/pypocketfft.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
153
+ myenv/lib/python3.10/site-packages/scipy/io/_fast_matrix_market/_fmm_core.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
154
+ myenv/lib/python3.10/site-packages/scipy/linalg/_flapack.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
155
+ myenv/lib/python3.10/site-packages/scipy/misc/face.dat filter=lfs diff=lfs merge=lfs -text
156
+ myenv/lib/python3.10/site-packages/scipy/optimize/_highs/_highs_wrapper.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
157
+ myenv/lib/python3.10/site-packages/scipy/sparse/_sparsetools.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
158
+ myenv/lib/python3.10/site-packages/scipy/spatial/_qhull.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
159
+ myenv/lib/python3.10/site-packages/scipy/special/_ufuncs.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
160
+ myenv/lib/python3.10/site-packages/scipy/special/cython_special.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
161
+ myenv/lib/python3.10/site-packages/scipy/stats/_unuran/unuran_wrapper.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
162
+ myenv/lib/python3.10/site-packages/sentencepiece/_sentencepiece.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
163
+ myenv/lib/python3.10/site-packages/skimage/filters/rank/generic_cy.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
164
+ myenv/lib/python3.10/site-packages/sklearn/_loss/_loss.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
165
+ myenv/lib/python3.10/site-packages/tiktoken/_tiktoken.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
166
+ myenv/lib/python3.10/site-packages/tokenizers/tokenizers.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
167
+ myenv/lib/python3.10/site-packages/torch/.dylibs/libiomp5.dylib filter=lfs diff=lfs merge=lfs -text
168
+ myenv/lib/python3.10/site-packages/torch/bin/protoc filter=lfs diff=lfs merge=lfs -text
169
+ myenv/lib/python3.10/site-packages/torch/bin/protoc-3.13.0.0 filter=lfs diff=lfs merge=lfs -text
170
+ myenv/lib/python3.10/site-packages/torch/lib/libiomp5.dylib filter=lfs diff=lfs merge=lfs -text
171
+ myenv/lib/python3.10/site-packages/torch/lib/libtorch_cpu.dylib filter=lfs diff=lfs merge=lfs -text
172
+ myenv/lib/python3.10/site-packages/torch/lib/libtorch_python.dylib filter=lfs diff=lfs merge=lfs -text
173
+ myenv/lib/python3.10/site-packages/torchaudio/_torchaudio.so filter=lfs diff=lfs merge=lfs -text
174
+ myenv/lib/python3.10/site-packages/torchaudio/lib/libflashlight-text.so filter=lfs diff=lfs merge=lfs -text
175
+ myenv/lib/python3.10/site-packages/torchaudio/lib/libtorchaudio.so filter=lfs diff=lfs merge=lfs -text
176
+ myenv/lib/python3.10/site-packages/torchvision/.dylibs/libc++.1.0.dylib filter=lfs diff=lfs merge=lfs -text
177
+ myenv/lib/python3.10/site-packages/unicorn/lib/libunicorn.2.dylib filter=lfs diff=lfs merge=lfs -text
178
+ myenv/lib/python3.10/site-packages/unicorn/lib/libunicorn.a filter=lfs diff=lfs merge=lfs -text
179
+ myenv/share/jupyter/nbextensions/pydeck/index.js.map filter=lfs diff=lfs merge=lfs -text
180
+ openbiollm-llama3-8b.Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
181
+ path/to/data/collections/image_data/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
182
+ path/to/data/collections/image_data/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
183
+ path/to/data/collections/medical_img/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
184
+ path/to/data/collections/medical_img/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
185
+ qdrant_data/collections/vector_db/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
186
+ qdrant_data/collections/vector_db/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
187
+ qdrant_storage/collections/medical_img/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
188
+ qdrant_storage/collections/medical_img/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
189
+ qdrant_storage/collections/vector_db/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
190
+ qdrant_storage/collections/vector_db/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
.github/workflows/update_space.yml ADDED
@@ -0,0 +1,28 @@
+ name: Run Python script
+
+ on:
+   push:
+     branches:
+       - surbhi
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+
+     steps:
+     - name: Checkout
+       uses: actions/checkout@v2
+
+     - name: Set up Python
+       uses: actions/setup-python@v2
+       with:
+         python-version: '3.9'
+
+     - name: Install Gradio
+       run: python -m pip install gradio
+
+     - name: Log in to Hugging Face
+       run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
+
+     - name: Deploy to Spaces
+       run: gradio deploy
.gitignore ADDED
@@ -0,0 +1,7 @@
+ qdrant_data
+ myenv
+ openbiollm-llama3-8b.Q5_K_M.gguf
+ __pycache__
+ secrets.toml
+ .streamlit/
+ .env
.streamlit/secrets.toml ADDED
@@ -0,0 +1,3 @@
+ # .streamlit/secrets.toml
+ QDRANT_URL = "https://f1e9a70a-afb9-498d-b66d-cb248e0d5557.us-east4-0.gcp.cloud.qdrant.io:6333"
+ QDRANT_API_KEY = "REXlX_PeDvCoXeS9uKCzC--e3-LQV0lw3_jBTdcLZ7P5_F6EOdwklA"
Data/AC-Aids-for-Dogs_Canine-Periodontal-Disease.pdf ADDED
Binary file (485 kB).
 
Data/cancer_and_cure__a_critical_analysis.27.pdf ADDED
Binary file (226 kB).
 
Data/medical_oncology_handbook_june_2020_edition.pdf ADDED
Binary file (818 kB).
 
DockerFile ADDED
@@ -0,0 +1,20 @@
+ # Use the official Python image from the Docker Hub
+ FROM python:3.10
+
+ # Set the working directory in the container
+ WORKDIR /app
+
+ # Copy the requirements file into the container at /app
+ COPY requirements.txt .
+
+ # Install the required libraries
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the rest of the application code into the container
+ COPY . .
+
+ # Expose the port the app runs on
+ EXPOSE 8501
+
+ # Command to run the application
+ CMD ["streamlit", "run", "stream.py", "--server.port=8501", "--server.address=0.0.0.0"]
MultimodalRAG.ipynb ADDED
The diff for this file is too large to render.
 
MultimodalRAGUpdatedVersion.ipynb ADDED
The diff for this file is too large to render.
 
README.md CHANGED
@@ -1,12 +1,129 @@
  ---
- title: Medical RAG
- emoji: 📉
- colorFrom: gray
- colorTo: indigo
  sdk: gradio
- sdk_version: 4.42.0
- app_file: app.py
- pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
+ title: Medical_RAG
+ app_file: combinedmultimodal.py
  sdk: gradio
+ sdk_version: 4.41.0
  ---
+ # Advancing Text Searching with Advanced Indexing Techniques in Healthcare Applications (In Progress)
+
+ Welcome to the project repository for advancing text searching with advanced indexing techniques in healthcare applications. This project implements a powerful Retrieval-Augmented Generation (RAG) system using cutting-edge AI technologies, specifically designed to enhance text searching capabilities within the healthcare domain. I have also implemented multimodal text searching for medical documents.
+
+ ## 🚀 Features of the Text-Based Medical Query System
+
+ - **BioLLM 8B**: Advanced language model for generating and processing medical text.
+ - **ClinicalBERT**: State-of-the-art embedding model for accurate representation of medical texts.
+ - **Qdrant**: Self-hosted vector database (Vector DB) for efficient storage and retrieval of embeddings.
+ - **Langchain & Llama CPP**: Orchestration frameworks for seamless integration and workflow management.
+
+ # Medical Knowledge Base Query System
+
+ A multimodal medical information retrieval system combining text- and image-based querying for comprehensive medical knowledge access.
+
+ ## Features of the Multimodal Medical Query System
+ [Watch the video on YouTube](https://youtu.be/pNy7RqfRUrc?si=1HQgq54oHT6YoR0B)
+
+ ### 🧠 Multimodal Medical Information Retrieval
+ - Combines text- and image-based querying for comprehensive medical knowledge access
+ - Uses the Qdrant vector database to store and retrieve both text and image embeddings
+
+ ### 🔤 Advanced Natural Language Processing
+ - Utilizes ClinicalBERT for domain-specific text embeddings
+ - Implements NVIDIA's Palmyra-med-70b model for medical language understanding with fast inference times
+
+ ### 🖼️ Image Analysis Capabilities
+ - Incorporates CLIP (Contrastive Language-Image Pre-training) for image feature extraction
+ - Generates image summaries using Google's Gemini 1.5 Flash model
+
+ ### 📄 PDF Processing
+ - Extracts text and images from medical PDF documents
+ - Implements intelligent chunking strategies for text processing
+
+ ### 🔍 Vector Search
+ - Uses Qdrant for efficient similarity search on both text and image vectors
+ - Implements hybrid search combining CLIP-based image similarity and text-based summary similarity (a minimal sketch of this idea follows below)
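
Editor's note (not part of the committed README): a minimal sketch of how such a CLIP + Qdrant image lookup can be wired up, assuming the OpenAI `clip` package and `qdrant-client` that this repository already imports. The ViT-B/32 checkpoint, the collection name `medical_img` (taken from paths elsewhere in this commit), and the threshold value are assumptions, not the project's exact configuration.

```python
import clip
import torch
from PIL import Image
from qdrant_client import QdrantClient

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)  # assumed CLIP variant
client = QdrantClient(url="http://localhost:6333")             # or the cloud URL from .env

def embed_image(path: str) -> list:
    # Encode an extracted figure into a CLIP feature vector for indexing.
    image = preprocess(Image.open(path)).unsqueeze(0).to(device)
    with torch.no_grad():
        feats = clip_model.encode_image(image)
    return feats.squeeze().cpu().numpy().tolist()

def search_images(query: str, top_k: int = 3, threshold: float = 0.25):
    # Encode the text query into the same CLIP space and search the image collection,
    # dropping weak matches via the adjustable score threshold.
    tokens = clip.tokenize([query]).to(device)
    with torch.no_grad():
        query_vec = clip_model.encode_text(tokens).squeeze().cpu().numpy().tolist()
    return client.search(
        collection_name="medical_img",   # image collection (name assumed from this commit)
        query_vector=query_vec,
        limit=top_k,
        score_threshold=threshold,
    )
```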
+
+ ### 🖥️ Interactive User Interface
+ - Gradio-based web interface for easy querying and result visualization
+ - Displays relevant text responses alongside related medical images
+
+ ### 🧩 Extensible Architecture
+ - Modular design allowing for easy integration of new models or data sources
+ - Supports both local and cloud-based model deployment
+
+ The high-level architectural framework for this application is shown below:
+ ![System Architecture Diagram](images/architecture.png)
+
+ ### ⚡ Performance Optimization
+ - Implements batching and multi-threading for efficient processing of large document sets (see the sketch after this list)
+ - Utilizes GPU acceleration where available
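
A rough sketch (not from the repository) of the batching and multi-threading idea, using `concurrent.futures`, which `combinedmultimodal.py` imports; the helper `summarize_image` is hypothetical and stands in for the Gemini-based summarizer:

```python
from concurrent.futures import ThreadPoolExecutor

def summarize_image(path: str) -> str:
    # Placeholder: the real pipeline would send the figure to Gemini 1.5 Flash for a summary.
    return f"summary of {path}"

def summarize_all(image_paths, batch_size: int = 8, max_workers: int = 4):
    # Process extracted figures batch by batch, with a small thread pool per batch.
    summaries = []
    for start in range(0, len(image_paths), batch_size):
        batch = image_paths[start:start + batch_size]
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            summaries.extend(pool.map(summarize_image, batch))
    return summaries
```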
+
+ ### 🎛️ Customizable Retrieval
+ - Adjustable similarity thresholds for image retrieval
+ - Configurable number of top-k results for both text and image queries
+
+ ### 📊 Comprehensive Visualization
+ - Displays query results with both textual information and related images
+ - Provides a gallery view of all extracted images from the knowledge base
+
+ ### 🔐 Environment Management
+ - Uses a .env file for secure API key management
+ - Supports both CPU and GPU environments
+
+ ### DEMO SCREENSHOT
+ ![DEMO-SCREENSHOT](images/multimodal.png)
+
+ ## 🎥 Video Demonstration
+
+ Explore the capabilities of our project with our detailed [YouTube video](https://youtu.be/nKCKUcnQ390).
+
+ ## Installation
+
+ To get started with this project, follow these steps:
+
+ 1. **Install Dependencies**:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ 2. **Set up Qdrant**:
+ - Follow the [Qdrant Installation Guide](https://qdrant.tech/documentation/quick_start/) to install and configure Qdrant.
+
+ 3. **Configure the Application**:
+ - Ensure the configuration files for BioLLM, ClinicalBERT, Langchain, and Llama CPP are correctly set up.
+
+ 4. **Run the Application**:
+ To run the text retrieval application as a FastAPI service:
+ ```bash
+ uvicorn app:app
+ ```
+ To run the text retrieval application through Streamlit:
+ ```bash
+ streamlit run Streaming.py
+ ```
+
+ To run the multimodal application through the Gradio interface:
+ ```bash
+ python combinedmultimodal.py
+ ```
+
+ ## 💡 Usage
+
+ - **Querying the System**: Input medical queries via the application's interface for detailed information retrieval.
+ - **Text Generation**: Utilize BioLLM 8B to generate comprehensive medical responses.
+
+ ## 👥 Contributing
+
+ We welcome contributions to enhance this project! Here's how you can contribute:
+
+ 1. Fork the repository.
+ 2. Create a new branch (`git checkout -b feature-name`).
+ 3. Commit your changes (`git commit -am 'Add feature'`).
+ 4. Push to the branch (`git push origin feature-name`).
+ 5. Open a Pull Request with detailed information about your changes.
+
+ ## 📜 License
+
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
+
+ ## 📞 Contact
+
+ For questions or suggestions, please open an issue or contact the repository owner at [surbhisharma9099@gmail.com](mailto:surbhisharma9099@gmail.com).
Streaming.py ADDED
@@ -0,0 +1,223 @@
1
+ import streamlit as st
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader, PDFMinerLoader
4
+ from langchain_community.vectorstores import Qdrant
5
+ from langchain_community.embeddings import SentenceTransformerEmbeddings
6
+ from langchain_community.retrievers import BM25Retriever
7
+ from qdrant_client import QdrantClient
8
+ from qdrant_client.http.exceptions import ResponseHandlingException
9
+ from glob import glob
10
+ from llama_index.vector_stores.qdrant import QdrantVectorStore
11
+ from langchain.chains import RetrievalQA
12
+ from transformers import AutoTokenizer, AutoModel
13
+ from sentence_transformers import models, SentenceTransformer
14
+ from langchain.embeddings.base import Embeddings
15
+ from qdrant_client.models import VectorParams
16
+ import torch
17
+ import base64
18
+ from langchain_community.llms import LlamaCpp
19
+ from langchain_core.prompts import PromptTemplate
20
+ from huggingface_hub import hf_hub_download
21
+ from tempfile import NamedTemporaryFile
22
+ from langchain.retrievers import EnsembleRetriever
23
+
24
+ # Set page configuration
25
+ st.set_page_config(layout="wide")
26
+ st.markdown("""
27
+ <meta http-equiv="Content-Security-Policy"
28
+ content="default-src 'self'; object-src 'self'; frame-src 'self' data:;
29
+ script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline';">
30
+ """, unsafe_allow_html=True)
31
+ # Streamlit secrets
32
+ qdrant_url = st.secrets["QDRANT_URL"]
33
+ qdrant_api_key = st.secrets["QDRANT_API_KEY"]
34
+
35
+ # For debugging only - remove or comment out these lines after verification
36
+ #st.write(f"QDRANT_URL: {qdrant_url}")
37
+ #st.write(f"QDRANT_API_KEY: {qdrant_api_key}")
38
+
39
+ class ClinicalBertEmbeddings(Embeddings):
40
+ def __init__(self, model_name: str = "medicalai/ClinicalBERT"):
41
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
42
+ self.model = AutoModel.from_pretrained(model_name)
43
+ self.model.eval()
44
+
45
+ def embed(self, text: str):
46
+ inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
47
+ with torch.no_grad():
48
+ outputs = self.model(**inputs)
49
+ embeddings = self.mean_pooling(outputs, inputs['attention_mask'])
50
+ return embeddings.squeeze().numpy()
51
+
52
+ def mean_pooling(self, model_output, attention_mask):
53
+ token_embeddings = model_output[0] # First element of model_output contains all token embeddings
54
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
55
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
56
+
57
+ def embed_documents(self, texts):
58
+ return [self.embed(text) for text in texts]
59
+
60
+ def embed_query(self, text):
61
+ return self.embed(text)
62
+
63
+ @st.cache_resource
64
+ def load_model():
65
+ model_name = "aaditya/OpenBioLLM-Llama3-8B-GGUF"
66
+ model_file = "openbiollm-llama3-8b.Q5_K_M.gguf"
67
+ model_path = hf_hub_download(model_name, filename=model_file, local_dir='./')
68
+ return LlamaCpp(
69
+ model_path=model_path,
70
+ temperature=0.3,
71
+ n_ctx=2048,
72
+ top_p=1
73
+ )
74
+
75
+ # Initialize embeddings
76
+ @st.cache_resource
77
+ def load_embeddings():
78
+ return ClinicalBertEmbeddings(model_name="medicalai/ClinicalBERT")
79
+
80
+ # Initialize database
81
+ @st.cache_resource
82
+ def setup_qdrant():
83
+ try:
84
+ if not qdrant_url or not qdrant_api_key:
85
+ raise ValueError("QDRANT_URL or QDRANT_API_KEY not set in environment variables.")
86
+
87
+ # Initialize Qdrant client
88
+ client = QdrantClient(
89
+ url=qdrant_url,
90
+ api_key=qdrant_api_key,
91
+ port=443, # Assuming HTTPS should use port 443
92
+ )
93
+ st.write("Qdrant client initialized successfully.")
94
+
95
+ # Create or recreate collection
96
+ collection_name = "vector_db"
97
+ try:
98
+ collection_info = client.get_collection(collection_name=collection_name)
99
+ st.write(f"Collection '{collection_name}' already exists.")
100
+ except ResponseHandlingException:
101
+ st.write(f"Collection '{collection_name}' does not exist. Creating a new one.")
102
+ client.recreate_collection(
103
+ collection_name=collection_name,
104
+ vectors_config=VectorParams(size=768, distance="Cosine")
105
+ )
106
+ st.write(f"Collection '{collection_name}' created successfully.")
107
+
108
+ embeddings = load_embeddings()
109
+ st.write("Embeddings model loaded successfully.")
110
+
111
+ return Qdrant(client=client, embeddings=embeddings, collection_name=collection_name)
112
+
113
+ except Exception as e:
114
+ st.error(f"Failed to initialize Qdrant: {e}")
115
+ return None
116
+
117
+ # Initialize database
118
+ db = setup_qdrant()
119
+
120
+ if db is None:
121
+ st.error("Qdrant setup failed, exiting.")
122
+ else:
123
+ st.success("Qdrant setup successful.")
124
+
125
+ # Load models
126
+ llm = load_model()
127
+ embeddings = load_embeddings()
128
+
129
+ # Define prompt template
130
+ prompt_template = """Use the following pieces of information to answer the user's question.
131
+ If you don't know the answer, just say that you don't know, don't try to make up an answer.
132
+
133
+ Context: {context}
134
+ Question: {question}
135
+
136
+ Only return the helpful answer. Answer must be detailed and well explained.
137
+ Helpful answer:
138
+ """
139
+ prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
140
+ # Define retriever
141
+
142
+ # Define Streamlit app
143
+
144
+ def process_answer(query):
145
+ chain_type_kwargs = {"prompt": prompt}
146
+ global ensemble_retriever
147
+ qa = RetrievalQA.from_chain_type(
148
+ llm=llm,
149
+ chain_type="stuff",
150
+ retriever=ensemble_retriever,
151
+ return_source_documents=True,
152
+ chain_type_kwargs=chain_type_kwargs,
153
+ verbose=True
154
+ )
155
+ response = qa(query)
156
+ answer = response['result']
157
+ source_document = response['source_documents'][0].page_content
158
+ doc = response['source_documents'][0].metadata['source']
159
+ return answer, source_document, doc
160
+
161
+ def display_pdf(file):
162
+ with open(file, "rb") as f:
163
+ base64_pdf = base64.b64encode(f.read()).decode('utf-8')
164
+ pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
165
+ st.markdown(pdf_display, unsafe_allow_html=True)
166
+
167
+ def main():
168
+ st.title("PDF Question Answering System")
169
+
170
+ uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
171
+
172
+ if uploaded_file is not None:
173
+ # Save uploaded PDF
174
+ with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
175
+ temp_file.write(uploaded_file.read())
176
+ temp_file_path = temp_file.name
177
+
178
+ # Display PDF
179
+ st.subheader("PDF Preview")
180
+ display_pdf(temp_file_path)
181
+
182
+ # Load and process PDF
183
+ loader = PDFMinerLoader(temp_file_path)
184
+ documents = loader.load()
185
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
186
+ texts = text_splitter.split_documents(documents)
187
+
188
+ # Update the Qdrant database with the new PDF content
189
+
190
+ try:
191
+ db.add_documents(texts)
192
+ st.success("PDF processed and vector database updated!")
193
+ global ensemble_retriever
194
+ # Initialize retriever after documents are added
195
+ bm25_retriever = BM25Retriever.from_documents(documents=texts)
196
+ bm25_retriever.k = 3
197
+ qdrant_retriever = db.as_retriever(search_kwargs={"k":1})
198
+ # Combine both retrievers using EnsembleRetriever
199
+ ensemble_retriever = EnsembleRetriever(
200
+ retrievers=[qdrant_retriever, bm25_retriever],
201
+ weights=[0.5, 0.5] # Adjust weights based on desired contribution
202
+ )
203
+
204
+ except Exception as e:
205
+ st.error(f"Error updating database: {e}")
206
+
207
+ st.subheader("Ask a question about the PDF")
208
+ user_input = st.text_input("Your question:")
209
+
210
+ if st.button('Get Response'):
211
+ if user_input:
212
+ try:
213
+ answer, source_document, doc = process_answer(user_input)
214
+ st.write("*Answer:*", answer)
215
+ st.write("*Source Document:*", source_document)
216
+ st.write("*Document Source:*", doc)
217
+ except Exception as e:
218
+ st.error(f"Error processing query: {e}")
219
+ else:
220
+ st.warning("Please enter a query.")
221
+
222
+ if __name__ == "__main__":
223
+ main()
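
Editor's aside (not part of the commit): the collection above is created with `VectorParams(size=768)` because the mean-pooled ClinicalBERT output used by `ClinicalBertEmbeddings` is a 768-dimensional vector. A quick standalone check of that assumption, reusing the same model name:

```python
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")
model = AutoModel.from_pretrained("medicalai/ClinicalBERT")

inputs = tokenizer("Patient presents with periodontal disease.",
                   return_tensors="pt", padding=True, truncation=True, max_length=512)
with torch.no_grad():
    outputs = model(**inputs)

# Mean-pool the token embeddings with the attention mask, as ClinicalBertEmbeddings.embed() does.
mask = inputs["attention_mask"].unsqueeze(-1).float()
vector = (outputs[0] * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
print(vector.shape)  # expected: torch.Size([1, 768])
```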
Streamingnewversion.py ADDED
@@ -0,0 +1,244 @@
1
+ import streamlit as st
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader, PDFMinerLoader
4
+ from langchain_community.vectorstores import Qdrant
5
+ from langchain_community.embeddings import SentenceTransformerEmbeddings
6
+ from langchain_community.retrievers import BM25Retriever
7
+ from qdrant_client import QdrantClient
8
+ from qdrant_client.http.exceptions import ResponseHandlingException
9
+ from glob import glob
10
+ from llama_index.vector_stores.qdrant import QdrantVectorStore
11
+ from langchain.chains import RetrievalQA
12
+ from transformers import AutoTokenizer, AutoModel
13
+ from sentence_transformers import models, SentenceTransformer
14
+ from langchain.embeddings.base import Embeddings
15
+ from qdrant_client.models import VectorParams
16
+ import torch
17
+ import base64
18
+ from langchain_community.llms import LlamaCpp
19
+ from langchain_core.prompts import PromptTemplate
20
+ from huggingface_hub import hf_hub_download
21
+ from tempfile import NamedTemporaryFile
22
+ from langchain.retrievers import EnsembleRetriever
23
+ import urllib
24
+ import nltk
25
+ import os
26
+ # Add this at the beginning of your script
27
+ import logging
28
+ logging.basicConfig(level=logging.DEBUG)
29
+
30
+
31
+ # Define the path for NLTK data
32
+ nltk_data_path = '/tmp/nltk_data'
33
+ os.makedirs(nltk_data_path, exist_ok=True)
34
+
35
+ # Set NLTK data path environment variable
36
+ nltk.data.path.append(nltk_data_path)
37
+
38
+ # Download required NLTK data
39
+ try:
40
+ nltk.data.find('tokenizers/punkt')
41
+ except LookupError:
42
+ nltk.download('punkt', download_dir=nltk_data_path)
43
+
44
+ # Set page configuration
45
+ st.set_page_config(layout="wide")
46
+ st.markdown("""
47
+ <meta http-equiv="Content-Security-Policy"
48
+ content="default-src 'self'; object-src 'self'; frame-src 'self' data:;
49
+ script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline';">
50
+ """, unsafe_allow_html=True)
51
+ # Streamlit secrets
52
+ qdrant_url = st.secrets["QDRANT_URL"]
53
+ qdrant_api_key = st.secrets["QDRANT_API_KEY"]
54
+
55
+ # For debugging only - remove or comment out these lines after verification
56
+ #st.write(f"QDRANT_URL: {qdrant_url}")
57
+ #st.write(f"QDRANT_API_KEY: {qdrant_api_key}")
58
+
59
+ class ClinicalBertEmbeddings(Embeddings):
60
+ def __init__(self, model_name: str = "medicalai/ClinicalBERT"):
61
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
62
+ self.model = AutoModel.from_pretrained(model_name)
63
+ self.model.eval()
64
+
65
+ def embed(self, text: str):
66
+ inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
67
+ with torch.no_grad():
68
+ outputs = self.model(**inputs)
69
+ embeddings = self.mean_pooling(outputs, inputs['attention_mask'])
70
+ return embeddings.squeeze().numpy()
71
+
72
+ def mean_pooling(self, model_output, attention_mask):
73
+ token_embeddings = model_output[0] # First element of model_output contains all token embeddings
74
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
75
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
76
+
77
+ def embed_documents(self, texts):
78
+ return [self.embed(text) for text in texts]
79
+
80
+ def embed_query(self, text):
81
+ return self.embed(text)
82
+
83
+ @st.cache_resource
84
+ def load_model():
85
+ model_name = "aaditya/OpenBioLLM-Llama3-8B-GGUF"
86
+ model_file = "openbiollm-llama3-8b.Q5_K_M.gguf"
87
+ model_path = hf_hub_download(model_name, filename=model_file, local_dir='./')
88
+ return LlamaCpp(
89
+ model_path=model_path,
90
+ temperature=0.3,
91
+ n_ctx=2048,
92
+ top_p=1
93
+ )
94
+
95
+ # Initialize embeddings
96
+ @st.cache_resource
97
+ def load_embeddings():
98
+ return ClinicalBertEmbeddings(model_name="medicalai/ClinicalBERT")
99
+
100
+ # Initialize database
101
+ @st.cache_resource
102
+ def setup_qdrant():
103
+ try:
104
+ if not qdrant_url or not qdrant_api_key:
105
+ raise ValueError("QDRANT_URL or QDRANT_API_KEY not set in environment variables.")
106
+
107
+ # Initialize Qdrant client
108
+ client = QdrantClient(
109
+ url=qdrant_url,
110
+ api_key=qdrant_api_key,
111
+ port=443, # Assuming HTTPS should use port 443
112
+ )
113
+ st.write("Qdrant client initialized successfully.")
114
+
115
+ # Create or recreate collection
116
+ collection_name = "vector_db"
117
+ try:
118
+ collection_info = client.get_collection(collection_name=collection_name)
119
+ st.write(f"Collection '{collection_name}' already exists.")
120
+ except ResponseHandlingException:
121
+ st.write(f"Collection '{collection_name}' does not exist. Creating a new one.")
122
+ client.recreate_collection(
123
+ collection_name=collection_name,
124
+ vectors_config=VectorParams(size=768, distance="Cosine")
125
+ )
126
+ st.write(f"Collection '{collection_name}' created successfully.")
127
+
128
+ embeddings = load_embeddings()
129
+ st.write("Embeddings model loaded successfully.")
130
+
131
+ return Qdrant(client=client, embeddings=embeddings, collection_name=collection_name)
132
+
133
+ except Exception as e:
134
+ st.error(f"Failed to initialize Qdrant: {e}")
135
+ return None
136
+
137
+ # Initialize database
138
+ db = setup_qdrant()
139
+
140
+ if db is None:
141
+ st.error("Qdrant setup failed, exiting.")
142
+ else:
143
+ st.success("Qdrant setup successful.")
144
+
145
+ # Load models
146
+ llm = load_model()
147
+ embeddings = load_embeddings()
148
+
149
+ # Define prompt template
150
+ prompt_template = """Use the following pieces of information to answer the user's question.
151
+ If you don't know the answer, just say that you don't know, don't try to make up an answer.
152
+
153
+ Context: {context}
154
+ Question: {question}
155
+
156
+ Only return the helpful answer. Answer must be detailed and well explained.
157
+ Helpful answer:
158
+ """
159
+ prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
160
+ # Define retriever
161
+
162
+ # Define Streamlit app
163
+
164
+ def process_answer(query):
165
+ chain_type_kwargs = {"prompt": prompt}
166
+ global ensemble_retriever
167
+ qa = RetrievalQA.from_chain_type(
168
+ llm=llm,
169
+ chain_type="stuff",
170
+ retriever=ensemble_retriever,
171
+ return_source_documents=True,
172
+ chain_type_kwargs=chain_type_kwargs,
173
+ verbose=True
174
+ )
175
+ response = qa(query)
176
+ answer = response['result']
177
+ source_document = response['source_documents'][0].page_content
178
+ doc = response['source_documents'][0].metadata['source']
179
+ return answer, source_document, doc
180
+
181
+ def display_pdf(file):
182
+ with open(file, "rb") as f:
183
+ base64_pdf = base64.b64encode(f.read()).decode('utf-8')
184
+ pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
185
+ st.markdown(pdf_display, unsafe_allow_html=True)
186
+
187
+ def main():
188
+ st.title("PDF Question Answering System")
189
+
190
+ # Displaying File
191
+ uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
192
+
193
+ if uploaded_file is not None:
194
+ # Save uploaded PDF
195
+ with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
196
+ temp_file.write(uploaded_file.read())
197
+ temp_file_path = temp_file.name
198
+
199
+ # Display PDF
200
+ st.subheader("PDF Preview")
201
+ display_pdf(temp_file_path)
202
+
203
+ # Load and process PDF
204
+ loader = PDFMinerLoader(temp_file_path)
205
+ documents = loader.load()
206
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
207
+ texts = text_splitter.split_documents(documents)
208
+
209
+ # Update the Qdrant database with the new PDF content
210
+
211
+ try:
212
+ db.add_documents(texts)
213
+ st.success("PDF processed and vector database updated!")
214
+ global ensemble_retriever
215
+ # Initialize retriever after documents are added
216
+ bm25_retriever = BM25Retriever.from_documents(documents=texts)
217
+ bm25_retriever.k = 3
218
+ qdrant_retriever = db.as_retriever(search_kwargs={"k":1})
219
+ # Combine both retrievers using EnsembleRetriever
220
+ ensemble_retriever = EnsembleRetriever(
221
+ retrievers=[qdrant_retriever, bm25_retriever],
222
+ weights=[0.5, 0.5] # Adjust weights based on desired contribution
223
+ )
224
+
225
+ except Exception as e:
226
+ st.error(f"Error updating database: {e}")
227
+
228
+ st.subheader("Ask a question about the PDF")
229
+ user_input = st.text_input("Your question:")
230
+
231
+ if st.button('Get Response'):
232
+ if user_input:
233
+ try:
234
+ answer, source_document, doc = process_answer(user_input)
235
+ st.write("*Answer:*", answer)
236
+ st.write("*Source Document:*", source_document)
237
+ st.write("*Document Source:*", doc)
238
+ except Exception as e:
239
+ st.error(f"Error processing query: {e}")
240
+ else:
241
+ st.warning("Please enter a query.")
242
+
243
+ if __name__ == "__main__":
244
+ main()
__pycache__/app.cpython-310.pyc ADDED
Binary file (2.97 kB).
 
__pycache__/clip_helpers.cpython-310.pyc ADDED
Binary file (644 Bytes).
 
__pycache__/combinedmultimodal.cpython-310.pyc ADDED
Binary file (15.3 kB).
 
__pycache__/imagebind.cpython-310.pyc ADDED
Binary file (2.9 kB).
 
__pycache__/images.cpython-310.pyc ADDED
Binary file (543 Bytes).
 
__pycache__/ingest.cpython-310.pyc ADDED
Binary file (3.68 kB).
 
app.py ADDED
@@ -0,0 +1,83 @@
+ from langchain import PromptTemplate
+ from langchain_community.llms import LlamaCpp
+ from langchain.chains import RetrievalQA
+ from langchain_community.embeddings import SentenceTransformerEmbeddings
+ from fastapi import FastAPI, Request, Form, Response
+ from fastapi.responses import HTMLResponse
+ from fastapi.templating import Jinja2Templates
+ from fastapi.staticfiles import StaticFiles
+ from fastapi.encoders import jsonable_encoder
+ from qdrant_client import QdrantClient
+ from langchain_community.vectorstores import Qdrant
+ import os
+ import json
+ from huggingface_hub import hf_hub_download
+ from langchain.retrievers import EnsembleRetriever
+ from ingest import keyword_retriever
+
+ app = FastAPI()
+
+ templates = Jinja2Templates(directory="templates")
+ app.mount("/static", StaticFiles(directory="static"), name="static")
+ model_name = "aaditya/OpenBioLLM-Llama3-8B-GGUF"
+ model_file = "openbiollm-llama3-8b.Q5_K_M.gguf"
+
+ model_path = hf_hub_download(model_name,
+                              filename=model_file, local_dir='./')
+
+ local_llm = "openbiollm-llama3-8b.Q5_K_M.gguf"
+
+ # Make sure the model path is correct for your system!
+ llm = LlamaCpp(
+     model_path=local_llm,
+     temperature=0.3,
+     # max_tokens=2048,
+     n_ctx=2048,
+     top_p=1
+ )
+
+ print("LLM Initialized....")
+
+ prompt_template = """Use the following pieces of information to answer the user's question.
+ If you don't know the answer, just say that you don't know, don't try to make up an answer.
+
+ Context: {context}
+ Question: {question}
+
+ Only return the helpful answer. Answer must be detailed and well explained.
+ Helpful answer:
+ """
+
+ embeddings = SentenceTransformerEmbeddings(model_name="medicalai/ClinicalBERT")
+
+ url = "http://localhost:6333"
+
+ client = QdrantClient(
+     url=url, prefer_grpc=False
+ )
+
+ db = Qdrant(client=client, embeddings=embeddings, collection_name="vector_db")
+
+ prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
+
+ retriever = db.as_retriever(search_kwargs={"k":1})
+ ensemble_retriever = EnsembleRetriever(retrievers=[retriever,
+                                                    keyword_retriever],
+                                        weights=[0.5, 0.5])
+ @app.get("/", response_class=HTMLResponse)
+ async def read_root(request: Request):
+     return templates.TemplateResponse("index.html", {"request": request})
+
+ @app.post("/get_response")
+ async def get_response(query: str = Form(...)):
+     chain_type_kwargs = {"prompt": prompt}
+     qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=ensemble_retriever, return_source_documents=True, chain_type_kwargs=chain_type_kwargs, verbose=True)
+     response = qa(query)
+     print(response)
+     answer = response['result']
+     source_document = response['source_documents'][0].page_content
+     doc = response['source_documents'][0].metadata['source']
+     response_data = jsonable_encoder(json.dumps({"answer": answer, "source_document": source_document, "doc": doc}))
+
+     res = Response(response_data)
+     return res
app1.py ADDED
@@ -0,0 +1,119 @@
1
+ import streamlit as st
2
+ from langchain import PromptTemplate
3
+ from langchain_community.llms import LlamaCpp
4
+ from langchain.chains import RetrievalQA
5
+ from langchain_community.embeddings import SentenceTransformerEmbeddings
6
+ from qdrant_client import QdrantClient
7
+ from langchain_community.vectorstores import Qdrant
8
+ import os
9
+ import json
10
+ from huggingface_hub import hf_hub_download
11
+ from langchain.retrievers import EnsembleRetriever
12
+ # from ingest import ClinicalBertEmbeddings, keyword_retriever
13
+ from langchain_community.llms import CTransformers
14
+ from transformers import AutoTokenizer, AutoModel
15
+ # # Initialize Streamlit app
16
+ # st.set_page_config(page_title="Document Retrieval App", layout='wide')
17
+
18
+ # # Download and initialize LLM model
19
+ # MODEL_PATH = './'
20
+
21
+ # # Some basic configurations for the model
22
+ # config = {
23
+ # "max_new_tokens": 2048,
24
+ # "context_length": 4096,
25
+ # "repetition_penalty": 1.1,
26
+ # "temperature": 0.5,
27
+ # "top_k": 50,
28
+ # "top_p": 0.9,
29
+ # "stream": True,
30
+ # "threads": int(os.cpu_count() / 2)
31
+ # }
32
+
33
+ # # We use Langchain's CTransformers llm class to load our quantized model
34
+ # llm = CTransformers(model=MODEL_PATH,
35
+ # config=config)
36
+
37
+ # # Tokenizer for Mistral-7B-Instruct from HuggingFace
38
+ # tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
39
+ # model_name = "aaditya/OpenBioLLM-Llama3-8B-GGUF"
40
+ # model_file = "openbiollm-llama3-8b.Q5_K_M.gguf"
41
+ # model_path = hf_hub_download(model_name, filename=model_file, local_dir='./')
42
+
43
+ # local_llm = "openbiollm-llama3-8b.Q5_K_M.gguf"
44
+ # llm = LlamaCpp(
45
+ # model_path=local_llm,
46
+ # temperature=0.3,
47
+ # n_ctx=2048,
48
+ # top_p=1
49
+ # )
50
+
51
+ # st.sidebar.title("Document Retrieval App")
52
+
53
+ # # Initialize embeddings
54
+ # embeddings = ClinicalBertEmbeddings()
55
+
56
+ # # Qdrant setup for medical_image collection
57
+ # url = "http://localhost:6333"
58
+ # client_medical = QdrantClient(url=url, prefer_grpc=False)
59
+ # db_medical = Qdrant(client=client_medical, embeddings=embeddings, collection_name="medical_image")
60
+
61
+ # # Qdrant setup for pdf collection
62
+ # client_pdf = QdrantClient(url=url, prefer_grpc=False)
63
+ # db_pdf = Qdrant(client=client_pdf, embeddings=embeddings, collection_name="pdf")
64
+
65
+ # # Define retrievers for both collections
66
+ # retriever_medical = db_medical.as_retriever(search_kwargs={"k": 1})
67
+ # retriever_pdf = db_pdf.as_retriever(search_kwargs={"k": 1})
68
+
69
+ # # Ensemble retriever combining both retrievers
70
+ # ensemble_retriever = EnsembleRetriever(retrievers=[retriever_medical, retriever_pdf], weights=[0.5, 0.5])
71
+
72
+ # # Prompt template for querying
73
+ # prompt_template = """Use the following pieces of information to answer the user's question.
74
+ # If you don't know the answer, just say that you don't know, don't try to make up an answer.
75
+
76
+ # Context: {context}
77
+ # Question: {question}
78
+
79
+ # Only return the helpful answer. Answer must be detailed and well explained.
80
+ # Helpful answer:
81
+ # """
82
+ # prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
83
+
84
+ # # Streamlit app layout
85
+ # with st.sidebar:
86
+ # query = st.text_area("Enter your query here:")
87
+ # if st.button("Get Response"):
88
+ # st.write("Processing query...")
89
+ # chain_type_kwargs = {"prompt": prompt}
90
+ # qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=ensemble_retriever, return_source_documents=True, chain_type_kwargs=chain_type_kwargs, verbose=True)
91
+ # response = qa(query)
92
+
93
+ # # Process response to extract answer, source document, and metadata
94
+ # answer = response['result']
95
+ # source_document = response['source_documents'][0].page_content
96
+ # doc = response['source_documents'][0].metadata['source']
97
+
98
+ # # Display response
99
+ # st.subheader("Answer:")
100
+ # st.write(answer)
101
+ # st.subheader("Source Document:")
102
+ # st.write(source_document)
103
+ # st.subheader("Document Metadata:")
104
+ # st.write(doc)
105
+
106
+ # # Run the app
107
+ # if __name__ == '__main__':
108
+ # st.title("Document Retrieval App")
109
+ # st.write("Enter your query in the sidebar and click 'Get Response' to retrieve relevant documents.")
110
+ # Define model and prompt template
111
+
112
+
113
+ # Set your Hugging Face API token
114
+ os.environ['HUGGINGFACE_HUB_TOKEN'] = ''
115
+
116
+ model_name = "mistralai/Mistral-7B-Instruct-v0.1"
117
+ model_file = "mistral-7b-instruct.q4_0.bin"
118
+
+ model_path = hf_hub_download(model_name, filename=model_file, local_dir='./', use_auth_token=os.environ.get('HUGGINGFACE_HUB_TOKEN'))
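A minimal, hypothetical sketch of how the downloaded file could then be loaded, mirroring the commented-out CTransformers setup above (the config values are illustrative, and model_type="mistral" assumes the ctransformers backend):

    import os
    from langchain_community.llms import CTransformers

    # Generation settings in the spirit of the commented-out config above (illustrative values)
    config = {
        "max_new_tokens": 512,
        "temperature": 0.3,
        "stream": True,
        "threads": int(os.cpu_count() / 2),
    }

    # Load the locally downloaded quantized Mistral model and run a quick test prompt
    llm = CTransformers(model=model_path, model_type="mistral", config=config)
    print(llm.invoke("What are the stages of canine periodontal disease?"))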
combinedmultimodal.py ADDED
@@ -0,0 +1,621 @@
+ import os
+ import uuid
+ from llama_index.vector_stores.qdrant import QdrantVectorStore
+ from llama_index.core import VectorStoreIndex, StorageContext
+ import qdrant_client
+ import torch
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ import clip
+ from llama_index.core import Document
+ from langchain_community.llms import LlamaCpp
+ import numpy as np
+ from huggingface_hub import hf_hub_download
+ from llama_index.core import (
+     ServiceContext,
+     SimpleDirectoryReader,
+ )
+ import threading
+ from dotenv import load_dotenv
+ from llama_index.llms.nvidia import NVIDIA
+ from open_clip import create_model_from_pretrained, get_tokenizer
+ from llama_index.core import Settings
+ from llama_index.core.vector_stores import VectorStoreQuery
+ from llama_index.core.query_engine import RetrieverQueryEngine
+ from tqdm import tqdm
+ from transformers import AutoTokenizer, AutoModel
+ from langchain.embeddings.base import Embeddings
+ from llama_index.embeddings.langchain import LangchainEmbedding
+ from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+ from transformers import AutoProcessor, AutoModel
+ import hashlib
+ import gradio as gr
+ import open_clip
+ from llama_index.core.schema import ImageDocument
+ import cv2
+ import matplotlib.pyplot as plt
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+ from unstructured.partition.pdf import partition_pdf
+ from pathlib import Path
+ from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader
+ from PIL import Image
+ import logging
+ import concurrent.futures
+ from llama_index.core import set_global_service_context
+ from llama_index.core import Document as LlamaIndexDocument
+ import getpass
+ from transformers import VisionEncoderDecoderModel, ViTImageProcessor
+ from sentence_transformers import util
+ from transformers import AutoModelForCausalLM
+ import base64
+ from google.generativeai import GenerativeModel, configure
+ import google.generativeai as genai
+
+ # Configure logging
+ # logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
+
+
+ class MetadataMode:
+     EMBED = "embed"
+     INLINE = "inline"
+     NONE = "none"
+
+ # Define the vectors configuration
+ vectors_config = {
+     "vector_size": 768,  # or whatever the dimensionality of your vectors is
+     "distance": "Cosine"  # can be "Cosine", "Euclidean", etc.
+ }
+ class ClinicalBertEmbeddingWrapper:
+     def __init__(self, model_name: str = "medicalai/ClinicalBERT"):
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.model = AutoModel.from_pretrained(model_name)
+         self.model.eval()
+
+     def embed(self, text: str):
+         inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+         with torch.no_grad():
+             outputs = self.model(**inputs)
+         embeddings = self.mean_pooling(outputs, inputs['attention_mask'])
+         return embeddings.squeeze().tolist()
+
+     def mean_pooling(self, model_output, attention_mask):
+         token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
+         input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+         return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+     def embed_documents(self, texts):
+         return [self.embed(text) for text in texts]
+
+     def embed_query(self, text):
+         return self.embed(text)
+
+     # Implement this method if needed
+     def get_text_embedding_batch(self, text_batch, show_progress=False):
+         embeddings = []
+         num_batches = len(text_batch)
+
+         # Process in batches of size 8
+         batch_size = 8
+         for i in tqdm(range(0, num_batches, batch_size), desc="Processing Batches", disable=not show_progress):
+             batch_texts = text_batch[i:i + batch_size]
+             batch_embeddings = self.embed_documents(batch_texts)
+             embeddings.extend(batch_embeddings)
+
+         return embeddings
+
+     def get_agg_embedding_from_queries(self, queries):
+         # Get embeddings for each query using the embed method
+         embeddings = [torch.tensor(self.embed(query)) for query in queries]
+
+         # Convert list of tensors to a single tensor for aggregation
+         embeddings_tensor = torch.stack(embeddings)
+
+         # Example: averaging embeddings
+         agg_embedding = embeddings_tensor.mean(dim=0)
+
+         return agg_embedding.tolist()
+
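A quick sanity check of the wrapper (a minimal sketch, not part of the committed file): embed_query mean-pools ClinicalBERT's token embeddings into one fixed-size vector per query, which is what the 768-dimension setting in vectors_config above refers to.

    # Illustrative check of the wrapper's output: a plain Python list of floats per query
    embedder = ClinicalBertEmbeddingWrapper()
    vec = embedder.embed_query("signs of periodontal disease in dogs")
    print(type(vec), len(vec))  # expected length: 768 (ClinicalBERT hidden size)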
+ # Load environment variables
+ load_dotenv()
+ genai.configure(api_key=os.environ["GEMINI_API_KEY"])
+ nvidia_api_key = os.getenv("NVIDIA_API_KEY")
+ if not nvidia_api_key:
+     raise ValueError("NVIDIA_API_KEY not found in .env file")
+
+ os.environ["NVIDIA_API_KEY"] = nvidia_api_key
+
+ model_name = "aaditya/OpenBioLLM-Llama3-8B-GGUF"
+ model_file = "openbiollm-llama3-8b.Q5_K_M.gguf"
+ QDRANT_URL = "https://f1e9a70a-afb9-498d-b66d-cb248e0d5557.us-east4-0.gcp.cloud.qdrant.io:6333"
+ QDRANT_API_KEY = "REXlX_PeDvCoXeS9uKCzC--e3-LQV0lw3_jBTdcLZ7P5_F6EOdwklA"
+
+ # Download the quantized OpenBioLLM model
+ model_path = hf_hub_download(model_name, filename=model_file, local_dir='./')
+ llm = NVIDIA(model="writer/palmyra-med-70b")
+ llm.model
+ local_llm = "openbiollm-llama3-8b.Q5_K_M.gguf"
+ # Initialize the ClinicalBERT embeddings model
+ # text_embed_model = ClinicalBertEmbeddings(model_name="medicalai/ClinicalBERT")
+ text_embed_model = ClinicalBertEmbeddingWrapper(model_name="medicalai/ClinicalBERT")
+ # Initially the local OpenBioLLM model below was used for generation; the NVIDIA-hosted model gives
+ # faster responses at inference time, but the pipeline also works with the local model.
+ llm1 = LlamaCpp(
+     model_path=local_llm,
+     temperature=0.3,
+     n_ctx=2048,
+     top_p=1
+ )
+ Settings.llm = llm
+ Settings.embed_model = text_embed_model
+ # Define ServiceContext with ClinicalBERT embeddings for text
+ service_context = ServiceContext.from_defaults(
+     llm=llm,
+     embed_model=text_embed_model  # Use the ClinicalBERT embeddings model
+ )
+ set_global_service_context(service_context)
+ # Just for logging and debugging
+ # logging.debug(f"LLM: {service_context.llm}")
+ # logging.debug(f"Embed Model: {service_context.embed_model}")
+ # logging.debug(f"Node Parser: {service_context.node_parser}")
+ # logging.debug(f"Prompt Helper: {service_context.prompt_helper}")
+ # Create a QdrantClient connected to the hosted Qdrant Cloud cluster defined by QDRANT_URL
+ try:
+     text_client = qdrant_client.QdrantClient(
+         url=QDRANT_URL,
+         api_key=QDRANT_API_KEY,
+         port=443,
+     )
+     print("Qdrant client initialized successfully.")
+ except Exception as e:
+     print(f"Error initializing Qdrant client: {e}")
+     raise
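Note that the vectors_config dictionary defined near the top of the file is never passed to Qdrant. If the two collections were to be created explicitly, a hypothetical sketch with the qdrant-client models API could look like the following (sizes assumed here: ClinicalBERT text vectors are 768-dimensional, CLIP ViT-B/32 image vectors are 512-dimensional):

    from qdrant_client.http import models as qmodels

    # Create the text and image collections up front if they do not already exist
    existing = {c.name for c in text_client.get_collections().collections}
    if "pdf_text" not in existing:
        text_client.create_collection(
            collection_name="pdf_text",
            vectors_config=qmodels.VectorParams(size=768, distance=qmodels.Distance.COSINE),
        )
    if "pdf_img" not in existing:
        text_client.create_collection(
            collection_name="pdf_img",
            vectors_config=qmodels.VectorParams(size=512, distance=qmodels.Distance.COSINE),
        )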
+ # Load text documents from the ./Data directory
+ # text_documents = SimpleDirectoryReader("./Data").load_data()
+ loader = DirectoryLoader("./Data/", glob="**/*.pdf", show_progress=True, loader_cls=UnstructuredFileLoader)
+ documents = loader.load()
+ # Print document names
+ for doc in documents:
+     print(f"Processing document: {doc.metadata.get('source', 'Unknown')}")
+ # Split documents into chunks
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=70)
+ texts = text_splitter.split_documents(documents)
+
+ print(f"Loaded {len(documents)} documents")
+ print(f"Split into {len(texts)} chunks")
+ # Convert langchain documents to llama_index documents
+ text_documents = [
+     LlamaIndexDocument(text=t.page_content, metadata=t.metadata)
+     for t in texts
+ ]
+ # Initialize the Qdrant vector store for text chunks
+ try:
+     text_vector_store = QdrantVectorStore(
+         client=text_client, collection_name="pdf_text"
+     )
+     print("Qdrant text vector store initialized successfully.")
+ except Exception as e:
+     print(f"Error initializing Qdrant vector store: {e}")
+     raise
+
+ # Initialize the Qdrant vector store for image embeddings
+ try:
+     image_vector_store = QdrantVectorStore(
+         client=text_client, collection_name="pdf_img"
+     )
+     print("Qdrant image vector store initialized successfully.")
+ except Exception as e:
+     print(f"Error initializing Qdrant vector store: {e}")
+     raise
+
+ storage_context = StorageContext.from_defaults(vector_store=text_vector_store)
+
+ wiki_text_index = VectorStoreIndex.from_documents(text_documents
+     # , storage_context=storage_context
+     , service_context=service_context
+ )
+ print(f"VectorStoreIndex created with {len(wiki_text_index.docstore.docs)} documents")
+
+ # Define the streaming query engine
+ streaming_qe = wiki_text_index.as_query_engine(streaming=True)
+ print(len(text_documents))
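The streaming engine can be exercised on its own, following the same pattern as the commented-out test at the end of this file; a minimal sketch:

    # Quick smoke test of the streaming query engine
    streaming_response = streaming_qe.query("What is gingivitis?")
    streaming_response.print_response_stream()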
+ # Function to query the text vector database
+ # Modify the process_query function
+
+ model, preprocess = clip.load("ViT-B/32")
+ input_resolution = model.visual.input_resolution
+ context_length = model.context_length
+ vocab_size = model.vocab_size
+
+ print(
+     "Model parameters:",
+     f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}",
+ )
+ print("Input resolution:", input_resolution)
+ print("Context length:", context_length)
+ print("Vocab size:", vocab_size)
+
+ pdf_directory = Path("./Data")
+ image_path = Path("./images1")
+ image_path.mkdir(exist_ok=True, parents=True)
+
+ # Dictionary to store image metadata
+ image_metadata_dict = {}
+
+ # Limit the number of images downloaded per PDF
+ MAX_IMAGES_PER_PDF = 15
+
+ # Generate a UUID for each image
+ image_uuid = 0
+
+ # Iterate over each PDF file in the Data folder
+ for pdf_file in pdf_directory.glob("*.pdf"):
+     images_per_pdf = 0
+     print(f"Processing: {pdf_file}")
+
+     # Extract images from the PDF
+     try:
+         raw_pdf_elements = partition_pdf(
+             filename=str(pdf_file),
+             extract_images_in_pdf=True,
+             infer_table_structure=True,
+             chunking_strategy="by_title",
+             max_characters=4000,
+             new_after_n_chars=3800,
+             combine_text_under_n_chars=2000,
+             extract_image_block_output_dir=image_path,
+         )
+     except Exception as e:
+         print(f"Error processing {pdf_file}: {e}")
+         import traceback
+         traceback.print_exc()
+         continue
+ # Function to summarize images with Gemini
+ def summarize_image(image_path):
+     # Load and encode the image
+     with open(image_path, "rb") as image_file:
+         encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
+
+     # Create a GenerativeModel object
+     model = GenerativeModel('gemini-1.5-flash')
+
+     # Prepare the prompt
+     prompt = """
+     You are an expert in analyzing medical images, particularly those related to canine (dog) health.
+     Please provide a detailed description of this medical image, including:
+     1. The body part or area being examined
+     2. Any visible structures, organs, or tissues
+     3. Any abnormalities, lesions, or notable features
+     4. Any other relevant medical diagram description.
+
+     Please be as specific and detailed as possible in your analysis.
+     """
+
+     # Generate the response
+     response = model.generate_content([
+         prompt,
+         {"mime_type": "image/jpeg", "data": encoded_image}
+     ])
+
+     return response.text
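For a quick smoke test, the summarizer can be pointed at one of the figures extracted into ./images1 (a minimal sketch; figure-1-1.jpg is one of the files committed in this repository):

    # Hypothetical smoke test: summarize a single extracted figure and print the start of the result
    sample_summary = summarize_image("./images1/figure-1-1.jpg")
    print(sample_summary[:300])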
+
+ # Iterate through each image file in the extraction directory
+ for image_file in os.listdir(image_path):
+     if image_file.endswith(('.jpg', '.jpeg', '.png')):
+         # Generate a standard UUID for the image
+         image_uuid = str(uuid.uuid4())
+         image_file_name = image_file
+         image_file_path = image_path / image_file
+         # Generate image summary
+         # image_summary = generate_image_summary_with(str(image_file_path), model, feature_extractor, tokenizer, device)
+         # image_summary = generate_summary_with_lm(str(image_file_path), preprocess, model, device, tokenizer, lm_model)
+         image_summary = summarize_image(image_file_path)
+         # Construct metadata entry for the image
+         image_metadata_dict[image_uuid] = {
+             "filename": image_file_name,
+             "img_path": str(image_file_path),  # Store the path to the image
+             "summary": image_summary  # Add the summary to the metadata
+         }
+
+         # Limit the number of images processed per folder
+         if len(image_metadata_dict) >= MAX_IMAGES_PER_PDF:
+             break
+
+ print(f"Number of items in image_dict: {len(image_metadata_dict)}")
+
+ # Print the metadata dictionary
+ for key, value in image_metadata_dict.items():
+     print(f"UUID: {key}, Metadata: {value}")
+
+
+ def plot_images_with_opencv(image_metadata_dict):
+     original_images_urls = []
+     images_shown = 0
+
+     plt.figure(figsize=(16, 16))  # Adjust the figure size as needed
+
+     for image_id in image_metadata_dict:
+         img_path = image_metadata_dict[image_id]["img_path"]
+         if os.path.isfile(img_path):
+             try:
+                 img = cv2.imread(img_path)
+                 if img is not None:
+                     # Convert BGR (OpenCV) to RGB (matplotlib)
+                     img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+                     plt.subplot(8, 8, len(original_images_urls) + 1)
+                     plt.imshow(img_rgb)
+                     plt.xticks([])
+                     plt.yticks([])
+
+                     original_images_urls.append(image_metadata_dict[image_id]["filename"])
+                     images_shown += 1
+                     if images_shown >= 64:
+                         break
+             except Exception as e:
+                 print(f"Error processing image {img_path}: {e}")
+
+     plt.tight_layout()
+     plt.show()
+
+ plot_images_with_opencv(image_metadata_dict)
+ # Set the device to use for the CLIP model, either CUDA (GPU) or CPU, depending on availability
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(device)
+ # Function to preprocess an OpenCV image for CLIP
+ def preprocess_image(img):
+     # Convert BGR to RGB
+     img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+     # Convert the image to a PIL Image and then preprocess
+     img_pil = Image.fromarray(img_rgb)
+     return preprocess(img_pil)
+     # Use BiomedCLIP processor for preprocessing
+     # return preprocess(images=img_pil, return_tensors="pt")
+     # return preprocess(img_pil).unsqueeze(0)
+
+
+ img_emb_dict = {}
+ with torch.no_grad():
+     for image_id in image_metadata_dict:
+         img_file_path = image_metadata_dict[image_id]["img_path"]
+         if os.path.isfile(img_file_path):
+             try:
+                 # Load image using OpenCV
+                 img = cv2.imread(img_file_path)
+
+                 if img is not None:
+                     # Preprocess image
+                     image = preprocess_image(img).unsqueeze(0).to(device)
+                     # image = preprocess_image(img).to(device)
+
+                     # Extract image features
+                     image_features = model.encode_image(image)
+
+                     # Store image features
+                     img_emb_dict[image_id] = image_features
+                 else:
+                     print(f"Failed to load image {img_file_path}")
+             except Exception as e:
+                 print(f"Error processing image {img_file_path}: {e}")
+
+ # One embedding per successfully processed image
+ print(f"Number of image embeddings: {len(img_emb_dict)}")
+
+
+ # Create a list of ImageDocument objects, one for each image in the dataset
+ img_documents = []
+ for image_filename in image_metadata_dict:
+     # the img_emb_dict dictionary contains the image embeddings
+     if image_filename in img_emb_dict:
+         filename = image_metadata_dict[image_filename]["filename"]
+         filepath = image_metadata_dict[image_filename]["img_path"]
+         summary = image_metadata_dict[image_filename]["summary"]
+         # print(filepath)
+
+         # Create an ImageDocument for each image
+         newImgDoc = ImageDocument(
+             text=filename, metadata={"filepath": filepath, "summary": summary}  # Include the summary in the metadata
+         )
+
+         # Set the image embedding on the ImageDocument
+         newImgDoc.embedding = img_emb_dict[image_filename].tolist()[0]
+         img_documents.append(newImgDoc)
+
+ # Define storage context
+ storage_context = StorageContext.from_defaults(vector_store=image_vector_store)
+
+ # Define image index
+ image_index = VectorStoreIndex.from_documents(
+     img_documents,
+     storage_context=storage_context
+ )
+ # for doc in img_documents:
+ #     print(f"ImageDocument: {doc.text}, Embedding: {doc.embedding}, Metadata: {doc.metadata}")
448
+
449
+ def retrieve_results_from_image_index(query):
450
+ """ take a text query as input and return the most similar image from the vector store """
451
+
452
+ # first tokenize the text query and convert it to a tensor
453
+ text = clip.tokenize(query).to(device)
454
+
455
+ # encode the text tensor using the CLIP model to produce a query embedding
456
+ query_embedding = model.encode_text(text).tolist()[0]
457
+ # Encode the query using ClinicalBERT for text similarity
458
+ clinical_query_embedding = text_embed_model.embed_query(query)
459
+ # create a VectorStoreQuery
460
+ image_vector_store_query = VectorStoreQuery(
461
+ query_embedding=query_embedding,
462
+ similarity_top_k=1, # returns 1 image
463
+ mode="default",
464
+ )
465
+
466
+ # execute the query against the image vector store
467
+ image_retrieval_results = image_vector_store.query(
468
+ image_vector_store_query
469
+ )
470
+ if image_retrieval_results.nodes:
471
+ best_score = -1
472
+ best_image = None
473
+
474
+ for node, clip_score in zip(image_retrieval_results.nodes, image_retrieval_results.similarities):
475
+ image_path = node.metadata["filepath"]
476
+ image_summary = node.metadata.get("summary", "") # Assuming summaries are stored in metadata
477
+
478
+ # Calculate text similarity between query and image summary
479
+ summary_embedding = text_embed_model.embed_query(image_summary)
480
+ # text_score = util.cosine_similarity(
481
+ # [clinical_query_embedding], [summary_embedding]
482
+ # )[0][0]
483
+ # Use util.cos_sim for cosine similarity
484
+ text_score = util.cos_sim(torch.tensor([clinical_query_embedding]),
485
+ torch.tensor([summary_embedding]))[0][0].item()
486
+
487
+
488
+ # Calculate average similarity score
489
+ avg_score = (clip_score + text_score) / 2
490
+
491
+ if avg_score > best_score:
492
+ best_score = avg_score
493
+ best_image = image_path
494
+
495
+ return best_image, best_score
496
+
497
+ return None, 0.0
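The retriever blends two signals: the CLIP text-to-image score returned by Qdrant and a ClinicalBERT cosine similarity between the query and the Gemini summary of the candidate image, averaged into avg_score. A minimal usage sketch (the query mirrors the commented-out test further down):

    # Hypothetical check of the blended CLIP + ClinicalBERT retrieval score
    best_image, best_score = retrieve_results_from_image_index("What is gingivitis?")
    print(f"Best image: {best_image}, blended score: {best_score:.4f}")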
+
+ def plot_image_retrieve_results(image_retrieval_results):
+     """Take a list of image retrieval results and plot them in a new figure."""
+
+     plt.figure(figsize=(16, 5))
+
+     img_cnt = 0
+
+     # Iterate over the image retrieval results, and for each result, display the corresponding image and its score in a subplot.
+     # The title of the subplot is the score of the image, formatted to four decimal places.
+
+     for returned_image, score in zip(
+         image_retrieval_results.nodes, image_retrieval_results.similarities
+     ):
+         img_name = returned_image.text
+         img_path = returned_image.metadata["filepath"]
+
+         # Read image using OpenCV
+         image = cv2.imread(img_path)
+         # Convert image to RGB format (OpenCV reads in BGR by default)
+         image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+         plt.subplot(2, 3, img_cnt + 1)
+         plt.title("{:.4f}".format(score))
+
+         plt.imshow(image_rgb)
+         plt.xticks([])
+         plt.yticks([])
+         img_cnt += 1
+
+     plt.tight_layout()
+     plt.show()
+
+ def get_all_images():
+     image_paths = []
+     for _, metadata in image_metadata_dict.items():
+         image_paths.append(metadata["img_path"])
+     return image_paths
+
+ def load_image(image_path):
+     return Image.open(image_path)
+
+ # Define the combined text + image query function
+ def combined_query(query, similarity_threshold=0.3):
+     # Text query
+     text_response = streaming_qe.query(query)
+     text_result = ""
+     for text in text_response.response_gen:
+         text_result += text
+
+     # Image query
+     top_image_path, similarity_score = retrieve_results_from_image_index(query)
+
+     if similarity_score >= similarity_threshold:
+         return text_result, top_image_path, similarity_score
+     else:
+         return text_result, None, similarity_score
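combined_query only returns an image path when the blended score clears similarity_threshold (0.3 by default); a short hypothetical usage sketch:

    # End-to-end call: streamed text answer plus (optionally) the top matching image
    answer, top_image_path, score = combined_query("What is gingivitis?")
    print(answer)
    print(top_image_path, score)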
+ def gradio_interface(query):
+     text_result, image_path, similarity_score = combined_query(query)
+     top_image = load_image(image_path) if image_path else None
+     all_images = [load_image(path) for path in get_all_images()]
+     return text_result, top_image, all_images, f"Similarity Score: {similarity_score:.4f}"
+
+ with gr.Blocks() as iface:
+     gr.Markdown("# Medical Knowledge Base Query System")
+
+     with gr.Row():
+         query_input = gr.Textbox(lines=2, placeholder="Enter your medical query here...")
+         submit_button = gr.Button("Submit")
+
+     with gr.Row():
+         text_output = gr.Textbox(label="Text Response")
+         image_output = gr.Image(label="Top Related Image (if similarity > threshold)")
+
+     similarity_score_output = gr.Textbox(label="Similarity Score")
+
+     gallery_output = gr.Gallery(label="All Extracted Images", show_label=True, elem_id="gallery")
+
+     submit_button.click(
+         fn=gradio_interface,
+         inputs=query_input,
+         outputs=[text_output, image_output, gallery_output, similarity_score_output]
+     )
+
+     # Load all images on startup
+     iface.load(lambda: ["", None, [load_image(path) for path in get_all_images()], ""],
+                outputs=[text_output, image_output, gallery_output, similarity_score_output])
+ # Launch the Gradio interface
+ iface.launch(share=True)
+ # just to check if it works or not
+ # def image_query(query):
+ #     image_retrieval_results = retrieve_results_from_image_index(query)
+ #     plot_image_retrieve_results(image_retrieval_results)
+
+ # query1 = "What is gingivitis?"
+ # # generate image retrieval results
+ # image_query(query1)
+
+ # # Modify your text query function
+ # # def text_query(query):
+ # #     text_retrieval_results = process_query(query, text_embed_model, k=10)
+ # #     return text_retrieval_results
+ # # Function to query the text vector database
+
+
+ # def text_query(query: str, k: int = 10):
+ #     # Create a VectorStoreIndex from the existing vector store
+ #     index = VectorStoreIndex.from_vector_store(text_vector_store)
+
+ #     # Create a retriever with top-k configuration
+ #     retriever = index.as_retriever(similarity_top_k=k)
+
+ #     # Create a query engine
+ #     query_engine = RetrieverQueryEngine.from_args(retriever)
+
+ #     # Execute the query
+ #     response = query_engine.query(query)
+
+ #     return response
+
+ # # text_retrieval_results = text_query(query1)
+ # streaming_response = streaming_qe.query(
+ #     query1
+ # )
+ # streaming_response.print_response_stream()
freeze ADDED
File without changes
images.py ADDED
@@ -0,0 +1,12 @@
+ from unstructured.partition.pdf import partition_pdf
+ output_path = "./images"
+ raw_pdf_elements = partition_pdf(
+     filename="./Data/AC-Aids-for-Dogs_Canine-Periodontal-Disease.pdf",
+     extract_images_in_pdf=True,
+     infer_table_structure=True,
+     chunking_strategy="by_title",
+     max_characters=4000,
+     new_after_n_chars=3800,
+     combine_text_under_n_chars=2000,
+     extract_image_block_output_dir=output_path,
+ )
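Running this standalone script writes any figures found in the periodontal-disease PDF to ./images (the figure-1-*.jpg files listed below); a small hypothetical follow-up to confirm what was extracted:

    import os

    # List the figures that partition_pdf wrote to the output directory
    for name in sorted(os.listdir(output_path)):
        print(name)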
images/architecture.png ADDED
images/figure-1-1.jpg ADDED
images/figure-1-10.jpg ADDED
images/figure-1-11.jpg ADDED
images/figure-1-2.jpg ADDED
images/figure-1-3.jpg ADDED
images/figure-1-4.jpg ADDED
images/figure-1-5.jpg ADDED
images/figure-1-6.jpg ADDED
images/figure-1-7.jpg ADDED
images/figure-1-8.jpg ADDED
images/figure-1-9.jpg ADDED
images/multimodal.png ADDED

Git LFS Details

  • SHA256: dbc3231d2c2523f245d369566eb3ff16441e399b45fdcc5fa52ceae806a8339d
  • Pointer size: 132 Bytes
  • Size of remote file: 1.79 MB
images1/figure-1-1.jpg ADDED
images1/figure-1-10.jpg ADDED
images1/figure-1-11.jpg ADDED
images1/figure-1-2.jpg ADDED
images1/figure-1-3.jpg ADDED
images1/figure-1-4.jpg ADDED
images1/figure-1-5.jpg ADDED
images1/figure-1-6.jpg ADDED