sajal2692 commited on
Commit
274be20
0 Parent(s):

add hfspace files

Browse files
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # default python gitignore
2
+ # Byte-compiled / optimized / DLL files
3
+ __pycache__/
4
+ *.py[cod]
5
+ *$py.class
6
+
7
+ # C extensions
8
+ *.so
9
+
10
+ # Distribution / packaging
11
+ .Python
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+ cover/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ *.log
61
+ local_settings.py
62
+ db.sqlite3
63
+ db.sqlite3-journal
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # Sphinx documentation
73
+ docs/_build/
74
+
75
+ # PyBuilder
76
+ .pybuilder/
77
+ target/
78
+
79
+ # Jupyter Notebook
80
+ .ipynb_checkpoints
81
+
82
+ # IPython
83
+ profile_default/
84
+ ipython_config.py
85
+
86
+ # pyenv
87
+ # For a library or package, you might want to ignore these files since the code is
88
+ # intended to run in multiple environments; otherwise, check them in:
89
+ # .python-version
90
+
91
+ # pipenv
92
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
94
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
95
+ # install all needed dependencies.
96
+ #Pipfile.lock
97
+
98
+ # poetry
99
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
100
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
101
+ # commonly ignored for libraries.
102
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
103
+ #poetry.lock
104
+
105
+ # pdm
106
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
107
+ #pdm.lock
108
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
109
+ # in version control.
110
+ # https://pdm.fming.dev/#use-with-ide
111
+ .pdm.toml
112
+
113
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
114
+ __pypackages__/
115
+
116
+ # Celery stuff
117
+ celerybeat-schedule
118
+ celerybeat.pid
119
+
120
+ # SageMath parsed files
121
+ *.sage.py
122
+
123
+ # Environments
124
+ .env
125
+ .venv
126
+ env/
127
+ venv/
128
+ ENV/
129
+ env.bak/
130
+ venv.bak/
131
+
132
+ # Spyder project settings
133
+ .spyderproject
134
+ .spyproject
135
+
136
+ # Rope project settings
137
+ .ropeproject
138
+
139
+ # mkdocs documentation
140
+ /site
141
+
142
+ # mypy
143
+ .mypy_cache/
144
+ .dmypy.json
145
+ dmypy.json
146
+
147
+ # Pyre type checker
148
+ .pyre/
149
+
150
+ # pytype static type analyzer
151
+ .pytype/
152
+
153
+ # Cython debug symbols
154
+ cython_debug/
155
+
156
+ # PyCharm
157
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
158
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
159
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
160
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
161
+ #.idea/
162
+
163
+ # Mac OS
164
+ .DS_Store
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: {{title}}
3
+ emoji: {{emoji}}
4
+ colorFrom: {{colorFrom}}
5
+ colorTo: {{colorTo}}
6
+ sdk: {{sdk}}
7
+ sdk_version: {{sdkVersion}}
8
+ app_file: src/app.py
9
+ pinned: false
10
+ ---
11
+
12
+ # hello-sajal-sap
13
+ Repo for SAP Gen AI Developer assessment.
data/source.md ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Sajal Sharma
2
+
3
+ ## Contact Info
4
+
5
+ +65 9077-9093 | contact@sajalsharma.com | [LinkedIn](linkedin.com/in/sajals) | [GitHub](github.com/sajal2692)
6
+
7
+ ## Professional Summary
8
+ With over 7 years in the AI & ML space, my career is a testament
9
+ to the pursuit of creating AI systems that not only enhance but
10
+ harmonize with human life. Specializing in Natural Language
11
+ Processing, I've led groundbreaking projects across Asia, focusing
12
+ on transforming data into intuitive, meaningful experiences. My role
13
+ is not just about engineering solutions; it's about envisioning a future
14
+ where AI and humans coexist in a symbiotic relationship, fostering
15
+ creativity and innovation.
16
+
17
+ My expertise in Deep Learning technologies drives me to push
18
+ boundaries in AI applications. Having led teams and projects from
19
+ concept to implementation, I've helped businesses integrate AI
20
+ into their core strategies, redefining efficiency and effectiveness.
21
+
22
+ This experience underscores my commitment to leveraging AI for
23
+ enhancing human capabilities and enriching everyday experiences.
24
+
25
+ I am always eager to engage in discussions about AI's potential,
26
+ whether it be through collaboration, consulting, or a simple exchange
27
+ of ideas. The power of AI lies in collective effort and shared vision,
28
+ and I'm passionate about being at the forefront of this exciting and
29
+ evolving field.
30
+
31
+ ## Experience
32
+
33
+ ### Lead AI Engineer
34
+ **Aug 2023 – Present**
35
+ **OneByZero (contracted via Unscrambl), Singapore**
36
+ - Spearheading the AI Engineering team, pioneering bleeding-edge AI solutions by integrating proprietary and open-source LLMs, RAG, and vector databases, alongside techniques like prompt engineering, chaining, function
37
+ calling, and agent-based systems. Currently delivering transformative AI strategies across multiple high-value projects for clients throughout Asia.
38
+ Page 1 of 5
39
+ - Collaborated with key stakeholders to conceptualize and realize generative AI use cases, aligning innovative solutions with core business objectives to maximize operational and strategic impact.
40
+ - Assisted the sales team in securing deals by preparing and presenting technical collateral, driving client engagement through expert communication of complex AI concepts.
41
+ - Conducted technical workshops at client locations, promoting generative AI awareness and knowledge sharing, and reinforcing the company's position as an industry thought leader.
42
+ - Developed a POC for an LLM-powered chatbot for a major bank in the Philippines, showcased at the Singapore Fintech Festival, utilizing AWS infrastructure, including Bedrock, Sagemaker & Opensearch, focusing on LLM
43
+ safety and quality.
44
+ - Revamped data processing for a major appliance retailer by leveraging LLMs to automate data extraction, cutting product data entry time from 2 hours to just 15 minutes, significantly boosting e-commerce efficiency and accuracy.
45
+ - Improved the accuracy of a demand forecasting model for a major Thai Home Improvement retailer from 30% to 70%, employing deep learning time series forecasting, feature engineering and addressing data quality issues.
46
+
47
+ ### Senior AI Engineer
48
+ **Jan 2023 – Aug 2023**
49
+ **Splore, a Temasek-backed AI startup (contracted via Unscrambl), Singapore**
50
+ - Led overall efforts to enhance the user search experience, resulting in a 50% reduction in incorrectly answered queries and a notable increase in user
51
+ engagement.
52
+ - Collaborated with Product Managers, Data Engineers, Data Scientists, and MLOps Engineers to align AI development with business objectives, enhancing
53
+ strategic integration of AI capabilities.
54
+ - Assisted in the expansion of the AI team from 2 to 6, focusing on roles in search engineering, NLP, MLOps with a diverse set of expertise areas including ranking algorithms and large language model applications.
55
+ - Engineered an advanced NLP pipeline to analyze user queries, incorporating Symspell for spell checking, SpaCy for NER, semantic similarity for small talk detection, and ToxicBERT for profanity filtering, thereby enhancing query understanding and categorization.
56
+ - Developed a question-answering system for queries about current affairs using Langchain, GPT model and Bing Search, which tripled user engagement and daily queries.
57
+ - Developed an automated query-result dataset generation and labeling pipeline using GPT, LabelStudio, and Huggingface, facilitating pre-production performance evaluation of the search system.
58
+ - Implemented rigorous data quality checkpoints using Great Expectations, in collaboration with the Data Engineering team, improving system efficiency and search result relevance.
59
+ - Enhanced Vespa search engine performance, improving the Mean Average Precision (MAP) metric from 0.3 to 0.8 by refining ranking algorithms and integrating semantic search with vector embeddings, resulting in markedly
60
+ improved search accuracy and user engagement.
61
+ - Fine-tuned sentence transformer models for better embeddings, specifically targeting gaming-related vocabularies, which enhanced the relevance and precision of gaming query responses and contributed to improved user interaction with AI systems.
62
+
63
+ ### Senior Machine Learning Engineer
64
+ **Apr 2020 – Dec 2022**
65
+ **Unscrambl, India**
66
+ - Key member of Unscrambl’s NLP Engineering team, helping enhance the natural language understanding of their business analytics platform. Focused on advancing NER, intent recognition, and ANNOY model functionalities.
67
+ - Developed the NL to SQL system data preparation pipeline using NLTK and spaCy, cutting over 5 hours of manual effort daily and boosting system efficiency.
68
+
69
+ ### Machine Learning Engineer
70
+ **Aug 2017 – Apr 2020**
71
+ **Unscrambl, India**
72
+ - Collaborated in a dynamic, cross-functional team of 6 to research, design, and develop NLP-driven chatbot products, aligning technological capabilities with market needs and user experiences.
73
+ - Strategically coordinated with the customer solutions team to lead the deployment of chatbot solutions for clients across Asia, significantly impacting over 100,000 monthly users. Utilized Microsoft Bot Framework and Azure Cognitive Services for intent recognition and dialogue flows to ensure optimal performance and user engagement in diverse markets.
74
+
75
+ ## Education
76
+
77
+ **The University of Melbourne**
78
+ Master of Information Technology, Major in Computing
79
+ Melbourne, Australia
80
+ Aug 2014 – Aug 2016
81
+
82
+ **Bharatiya Vidyapeeth University**
83
+ Bachelor of Computer Applications
84
+ New Delhi, India
85
+ Jul 2010 – Jul 2013
86
+
87
+ ## Technical Skills
88
+
89
+ - Languages: Python (Fluent), SQL (Proficient), Javascript (Proficient), Ruby (Familiar)
90
+ - AI & ML Technologies: Huggingface, Sentence Transformers, Pytorch, GPT, Claude, llama, Langchain, BERT, SpaCy, NLTK
91
+ - Tools & Frameworks: Git, Docker, AWS, Google Cloud Platform, Opensearch, Vespa, Pinecone, VsCode, Unix
92
+
93
+ ## Activities
94
+
95
+ - Mentor & Project Reviewer, Udacity: Coached 100+ international students enrolled in Data Science courses. Recognised as an elite mentor in 2021 with A+ mentor performance grade based on student feedback scores.
96
+ - Mentor, STEM Industry Mentoring Programme, The University of Melbourne: Jul 2020 - Present
97
+ - Creator, Data Science Portfolio: Github repo with 900+ stars showcasing various classical Data Science projects.
98
+
99
+ ## Languages
100
+
101
+ - Hindi (Native or Bilingual)
102
+ - English (Native or Bilingual)
103
+ - German (Elementary)
104
+
105
+
106
+ ## Certifications
107
+
108
+ - **Practical Data Science on the AWS Cloud**
109
+ DeepLearning.AI, Jan 2023
110
+
111
+ - **Machine Learning Specialization**
112
+ DeepLearning.AI, Jul 2022
113
+ Skills: TensorFlow, Keras, Data Science
114
+
115
+ - **Deep Learning Specialization**
116
+ DeepLearning.AI, Apr 2022
117
+ Skills: TensorFlow, Keras, Deep Learning, Data Science
118
+
119
+ - **Natural Language Processing Nanodegree**
120
+ Udacity, Jun 2020
121
+ Skills: Deep Learning
122
+
123
+ - **Mathematics for Machine Learning Specialization**
124
+ Coursera, Nov 2019
125
+
126
+ - **React Nanodegree**
127
+ Udacity, Jan 2018
128
+
129
+ - **Machine Learning Nanodegree**
130
+ Udacity, Mar 2017
131
+
132
+ - **Inferential Statistics**
133
+ Coursera Course Certificates, Feb 2017
134
+
135
+ - **Data Science and Machine Learning with R**
136
+ Udemy, Dec 2016
137
+
138
+ - **Introduction to Probability and Data**
139
+ Coursera Course Certificates, Dec 2016
140
+
141
+ ## Hobbies
142
+ - I love playing video games, listening to music and reading books in my free time!
143
+ - My favourite games include The Last of Us series from Naughty Dog, and God of War from Sony Santa Monica.
144
+ - My favourite music artists include Kendrick Lamar, Radiohead, & The Beatles. I love listening to indie rock and hip hop.
145
+ - My favourite genre of books is non-fiction and magical realism. My favourite book is To Kill a Mockingbird.
requirements.txt ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ aiohttp==3.9.3
3
+ aiosignal==1.3.1
4
+ altair==5.2.0
5
+ annotated-types==0.6.0
6
+ anyio==4.3.0
7
+ asgiref==3.7.2
8
+ attrs==23.2.0
9
+ backoff==2.2.1
10
+ bcrypt==4.1.2
11
+ beautifulsoup4==4.12.3
12
+ build==1.0.3
13
+ cachetools==5.3.2
14
+ certifi==2024.2.2
15
+ chardet==5.2.0
16
+ charset-normalizer==3.3.2
17
+ chroma-hnswlib==0.7.3
18
+ chromadb==0.4.23
19
+ click==8.1.7
20
+ colorama==0.4.6
21
+ coloredlogs==15.0.1
22
+ contourpy==1.2.0
23
+ cycler==0.12.1
24
+ dataclasses-json==0.6.4
25
+ dataclasses-json-speakeasy==0.5.11
26
+ Deprecated==1.2.14
27
+ distro==1.9.0
28
+ emoji==2.10.1
29
+ fastapi==0.109.2
30
+ ffmpy==0.3.2
31
+ filelock==3.13.1
32
+ filetype==1.2.0
33
+ flatbuffers==23.5.26
34
+ fonttools==4.49.0
35
+ frozenlist==1.4.1
36
+ fsspec==2024.2.0
37
+ google-auth==2.28.1
38
+ googleapis-common-protos==1.62.0
39
+ gradio==4.19.2
40
+ gradio_client==0.10.1
41
+ grpcio==1.62.0
42
+ h11==0.14.0
43
+ httpcore==1.0.4
44
+ httptools==0.6.1
45
+ httpx==0.27.0
46
+ huggingface-hub==0.20.3
47
+ humanfriendly==10.0
48
+ idna==3.6
49
+ importlib-metadata==6.11.0
50
+ importlib-resources==6.1.1
51
+ Jinja2==3.1.3
52
+ joblib==1.3.2
53
+ jsonpatch==1.33
54
+ jsonpath-python==1.0.6
55
+ jsonpointer==2.4
56
+ jsonschema==4.21.1
57
+ jsonschema-specifications==2023.12.1
58
+ kiwisolver==1.4.5
59
+ kubernetes==29.0.0
60
+ langchain==0.1.9
61
+ langchain-community==0.0.24
62
+ langchain-core==0.1.26
63
+ langchain-openai==0.0.7
64
+ langchainhub==0.1.14
65
+ langdetect==1.0.9
66
+ langgraph==0.0.26
67
+ langsmith==0.1.7
68
+ lxml==5.1.0
69
+ Markdown==3.5.2
70
+ markdown-it-py==3.0.0
71
+ MarkupSafe==2.1.5
72
+ marshmallow==3.20.2
73
+ matplotlib==3.8.3
74
+ mdurl==0.1.2
75
+ mmh3==4.1.0
76
+ monotonic==1.6
77
+ mpmath==1.3.0
78
+ multidict==6.0.5
79
+ mypy-extensions==1.0.0
80
+ nltk==3.8.1
81
+ numpy==1.26.4
82
+ oauthlib==3.2.2
83
+ onnxruntime==1.17.0
84
+ openai==1.12.0
85
+ opentelemetry-api==1.23.0
86
+ opentelemetry-exporter-otlp-proto-common==1.23.0
87
+ opentelemetry-exporter-otlp-proto-grpc==1.23.0
88
+ opentelemetry-instrumentation==0.44b0
89
+ opentelemetry-instrumentation-asgi==0.44b0
90
+ opentelemetry-instrumentation-fastapi==0.44b0
91
+ opentelemetry-proto==1.23.0
92
+ opentelemetry-sdk==1.23.0
93
+ opentelemetry-semantic-conventions==0.44b0
94
+ opentelemetry-util-http==0.44b0
95
+ orjson==3.9.15
96
+ overrides==7.7.0
97
+ packaging==23.2
98
+ pandas==2.2.1
99
+ pillow==10.2.0
100
+ posthog==3.4.2
101
+ protobuf==4.25.3
102
+ pulsar-client==3.4.0
103
+ pyasn1==0.5.1
104
+ pyasn1-modules==0.3.0
105
+ pydantic==2.6.2
106
+ pydantic_core==2.16.3
107
+ pydub==0.25.1
108
+ Pygments==2.17.2
109
+ pyparsing==3.1.1
110
+ PyPika==0.48.9
111
+ pyproject_hooks==1.0.0
112
+ python-dateutil==2.8.2
113
+ python-dotenv==1.0.1
114
+ python-iso639==2024.2.7
115
+ python-magic==0.4.27
116
+ python-multipart==0.0.9
117
+ pytz==2024.1
118
+ PyYAML==6.0.1
119
+ rapidfuzz==3.6.1
120
+ referencing==0.33.0
121
+ regex==2023.12.25
122
+ requests==2.31.0
123
+ requests-oauthlib==1.3.1
124
+ rich==13.7.0
125
+ rpds-py==0.18.0
126
+ rsa==4.9
127
+ ruff==0.2.2
128
+ semantic-version==2.10.0
129
+ setuptools==68.2.2
130
+ shellingham==1.5.4
131
+ six==1.16.0
132
+ sniffio==1.3.0
133
+ soupsieve==2.5
134
+ SQLAlchemy==2.0.27
135
+ starlette==0.36.3
136
+ sympy==1.12
137
+ tabulate==0.9.0
138
+ tenacity==8.2.3
139
+ tiktoken==0.6.0
140
+ tokenizers==0.15.2
141
+ tomlkit==0.12.0
142
+ toolz==0.12.1
143
+ tqdm==4.66.2
144
+ typer==0.9.0
145
+ types-requests==2.31.0.20240218
146
+ typing-inspect==0.9.0
147
+ typing_extensions==4.9.0
148
+ tzdata==2024.1
149
+ unstructured==0.11.8
150
+ unstructured-client==0.18.0
151
+ urllib3==2.2.1
152
+ uvicorn==0.27.1
153
+ uvloop==0.19.0
154
+ watchfiles==0.21.0
155
+ websocket-client==1.7.0
156
+ websockets==11.0.3
157
+ wheel==0.41.2
158
+ wrapt==1.16.0
159
+ yarl==1.9.4
160
+ zipp==3.17.0
src/app.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import gradio as gr
4
+
5
+ from langchain_core.messages import AIMessage, HumanMessage
6
+ from langchain_openai import ChatOpenAI
7
+ from dotenv import load_dotenv
8
+
9
+ from graph import AssistantGraph
10
+
11
+
12
# load the environment variables (OPENAI_MODEL, OPENAI_API_KEY, ...)
load_dotenv()

# on-disk locations for the chroma vector store and the markdown source document
VECTOR_DB_PATH = "data/chroma_db"
SOURCE_DATA_PATH = "data/source.md"

# define llm
# os.environ[...] (rather than os.getenv) fails fast with a KeyError when
# OPENAI_MODEL is unset, instead of silently passing model=None to ChatOpenAI.
# This also matches how chains/document_grader.py reads the same variable.
llm = ChatOpenAI(model=os.environ["OPENAI_MODEL"], temperature=0)

# create instance of assistant graph
app = AssistantGraph(llm=llm, vector_db_path=VECTOR_DB_PATH, source_data_path=SOURCE_DATA_PATH)
23
+
24
def process_history(history):
    """Convert gradio-style [user, assistant] pairs into langchain messages.

    Each pair becomes a HumanMessage followed by an AIMessage, preserving
    the original conversation order.
    """
    messages = []
    for user_text, assistant_text in history:
        messages.extend(
            (HumanMessage(content=user_text), AIMessage(content=assistant_text))
        )
    return messages
32
+
33
def run(message, history):
    """Handle one chat turn: run the assistant graph and return its reply."""
    # history[0] is the seeded greeting pair shown by the Chatbot widget;
    # skip it before converting to langchain messages
    prior_turns = process_history(history[1:])
    state = {"keys": {"message": message, "history": prior_turns}}
    # the graph returns its final state; the reply lives under keys["response"]
    return app.run(state)["keys"]["response"]
39
+
40
# greeting pre-seeded into the chat widget before the user's first message
initial_message = "Hi there! I'm Saj, an AI assistant built by Sajal Sharma. I'm here to answer any questions you may have about Sajal. Ask me anything!"

if __name__ == "__main__":
    # SECURITY: the auth credentials were previously hard-coded in source.
    # Read them from the environment, keeping the original values as defaults
    # so existing deployments keep working; override via GRADIO_AUTH_USER /
    # GRADIO_AUTH_PASSWORD and rotate the committed password.
    auth_user = os.getenv("GRADIO_AUTH_USER", "sap-review")
    auth_password = os.getenv("GRADIO_AUTH_PASSWORD", "polo1010")
    chatbot = gr.Chatbot(value=[[None, initial_message]])
    gr.ChatInterface(run, chatbot=chatbot).launch(auth=(auth_user, auth_password))
src/chains/document_grader.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Implements the document grader chain"""
2
+
3
+ from langchain_core.pydantic_v1 import BaseModel, Field
4
+ from langchain_core.utils.function_calling import convert_to_openai_tool
5
+ from langchain.output_parsers.openai_tools import PydanticToolsParser
6
+ from langchain_openai import ChatOpenAI
7
+ from langchain_core.prompts import PromptTemplate
8
+ from textwrap import dedent
9
+
10
+ import os
11
+ from dotenv import load_dotenv
12
+
13
+ load_dotenv()
14
+
15
# Data model
class grade(BaseModel):
    """Binary score for relevance check."""
    # NOTE: the class docstring above and the Field description below are
    # serialized into the OpenAI tool schema by convert_to_openai_tool, so
    # they are prompt text seen by the model — do not reword casually.
    binary_score: str = Field(description="Relevance score 'yes' or 'no'")
19
+
20
class DocumentGrader:

    """Implements the document grader chain"""

    _GRADER_PROMPT_TEMPLATE = """
    You are a grader assessing relevance of a retrieved document to a user question. \n
    Retrieved document: \n\n {context} \n\n
    User Question: {question} \n
    When assessing the relevance of a retrieved document to a user question, consider whether the document can provide a complete answer to the question posed. A document is considered relevant only if it contains all the necessary information to fully answer the user's inquiry without requiring additional context or assumptions.
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.
    Do not return anything other than a 'yes' or 'no'.
    """

    _GRADER_PROMPT = PromptTemplate(template=dedent(_GRADER_PROMPT_TEMPLATE), input_variables=["context", "question"])

    def __init__(self):
        # a dedicated model wrapper is built here (instead of reusing a shared
        # llm) because the grading tool is bound onto this instance
        model = ChatOpenAI(temperature=0, model=os.environ["OPENAI_MODEL"])
        grade_tool = convert_to_openai_tool(grade)
        # bind the tool and force the model to invoke it on every call
        bound_model = model.bind(
            tools=[grade_tool],
            tool_choice={"type": "function", "function": {"name": "grade"}},
        )
        # parse the forced tool call back into `grade` pydantic objects
        self._grader_chain = self._GRADER_PROMPT | bound_model | PydanticToolsParser(tools=[grade])

    def run(self, question, context):
        """Returns the response from the document grader"""
        return self._grader_chain.invoke({"context": context, "question": question})
src/chains/intent_detection.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Implements the intents detection chain"""
2
+
3
+ from langchain.chains import create_tagging_chain
4
+ from langchain_core.prompts import ChatPromptTemplate
5
+ from textwrap import dedent
6
+
7
class IntentDetection:

    """Implements the intents detection chain"""

    # tagging schema handed to the LLM; the description strings below are
    # prompt text (implicitly concatenated) — keep wording as-is
    _SCHEMA = {
        "properties": {
            "intent": {
                "type": "string",
                "enum": ["smalltalk", "sajal_question"],
                "description": "The intent of the user's message. The intent is sajal_question, if the user is asking questions about Sajal Sharma"
                "for example related to his contact info, work experience, educational background, certifications, hobbies, etc, or any other questions about him."
                "Any questions about contact info, work experience, educational background, certifications, hobbies, should also be sajal_question."
                "General greetings or smalltalk messages are smalltalk. Questions about anyone other than sajal are also smalltalk."
            }
        }
    }

    _TAGGING_PROMPT = """Extract the desired information from the following passage.

    Only extract the properties mentioned in the 'information_extraction' function.
    Use the chat history to guide your extraction.

    Chat History:
    {history}

    Passage:
    {input}
    """
    _TAGGING_PROMPT_TEMPLATE = ChatPromptTemplate.from_template(dedent(_TAGGING_PROMPT))

    def __init__(self, llm):
        # the tagging chain constrains the model's output to _SCHEMA
        self.tagging_chain = create_tagging_chain(self._SCHEMA, llm, prompt=self._TAGGING_PROMPT_TEMPLATE)

    def run(self, message, history):
        """Returns the detected intent"""
        tagged = self.tagging_chain.invoke({"input": message, "history": history})
        # create_tagging_chain places the extracted properties under "text"
        return tagged["text"]["intent"]
44
+
src/chains/qa_all_data.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Implements a QA chain to run using the full data."""
2
+
3
+ from langchain_core.prompts import ChatPromptTemplate
4
+ from langchain_core.output_parsers import StrOutputParser
5
+
6
class QAAllData:
    """Implements a QA chain to run using the full data"""

    _PROMPT_TEMPLATE = """
    You are an AI assistant, Saj, built by Sajal Sharma, an AI Engineer.
    Your main task is to answer questions people may have about Sajal.
    Use the following information about Sajal context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
    Question: {question}
    Context: {context}
    Answer:
    """

    _PROMPT = ChatPromptTemplate.from_template(_PROMPT_TEMPLATE)

    def __init__(self, llm, source_data_path):
        """Load the full markdown document and build the prompt -> llm chain.

        Args:
            llm: a langchain chat model used to answer questions.
            source_data_path: path to the markdown file with all of Sajal's data.
        """
        # explicit encoding: the platform default (e.g. cp1252 on Windows)
        # can corrupt non-ASCII characters in the markdown source
        with open(source_data_path, "r", encoding="utf-8") as file:
            self.full_markdown_document = file.read()
        self.qa_all_data_chain = self._PROMPT | llm | StrOutputParser()

    def run(self, question):
        """Returns the response from the LLM to the user's message using all data."""
        return self.qa_all_data_chain.invoke({"question": question, "context": self.full_markdown_document})
src/chains/rag.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Implements the RAG chain"""
2
+
3
+ from langchain_core.prompts import format_document, PromptTemplate, ChatPromptTemplate
4
+ from langchain_core.output_parsers import StrOutputParser
5
+ from textwrap import dedent
6
+
7
class RAG:

    """Implements the RAG chain"""

    _RAG_PROMPT = """
    You are an AI assistant, Saj, built by Sajal Sharma, an AI Engineer.
    Your main task is to answer questions people may have about Sajal.
    Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
    Question: {question}
    Context: {context}
    Answer:
    """

    _DOCUMENT_SEPARATOR = "\n\n"
    # pass-through prompt: renders a document as its raw page_content
    _DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")

    _RAG_PROMPT_TEMPLATE = ChatPromptTemplate.from_template(dedent(_RAG_PROMPT))

    def __init__(self, llm):
        # prompt -> chat model -> plain string
        self.rag_chain = self._RAG_PROMPT_TEMPLATE | llm | StrOutputParser()

    def _combine_documents(self, docs):
        # render each retrieved chunk and join them into one context string
        return self._DOCUMENT_SEPARATOR.join(
            format_document(doc, self._DEFAULT_DOCUMENT_PROMPT) for doc in docs
        )

    def run(self, question, documents):
        """Returns the response from the LLM to the user's message using RAG with chunked documents."""
        return self.rag_chain.invoke(
            {"question": question, "context": self._combine_documents(documents)}
        )
src/chains/rephrase_question.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Implements the rephrase question chain"""
2
+
3
+ from langchain_core.prompts import PromptTemplate
4
+ from langchain_core.output_parsers import StrOutputParser
5
+
6
class RephraseQuestion:

    """Implements the rephrase question chain"""

    # NOTE: this constant was previously misspelled `_CONDESE_...`; renamed to
    # match `_CONDENSE_QUESTION_PROMPT` below (private attribute, no callers).
    _CONDENSE_QUESTION_PROMPT_TEMPLATE = """
    Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
    Chat History:
    {chat_history}
    Follow Up Input: {question}
    Standalone question:
    """
    _CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_CONDENSE_QUESTION_PROMPT_TEMPLATE)

    def __init__(self, llm):
        """Build the condense-question chain around the supplied chat model."""
        self.rephrase_question_chain = self._CONDENSE_QUESTION_PROMPT | llm | StrOutputParser()

    def run(self, message, history):
        """Returns the rephrased question from the LLM to the user's message."""
        return self.rephrase_question_chain.invoke({"chat_history": history, "question": message})
src/chains/smalltalk.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Responds to any smalltalk or off-topic messages."""
2
+
3
+ from langchain_core.prompts import ChatPromptTemplate
4
+ from langchain_core.output_parsers import StrOutputParser
5
+ from textwrap import dedent
6
+
7
class Smalltalk:

    """Responds to any smalltalk or off-topic messages."""

    # prompt typo fixes: "allow to repond" -> "allowed to respond"
    _SMALLTALK_PROMPT = """
    You are an AI assistant, Saj, built by Sajal Sharma, an AI Engineer. Given the following message and chat history, please respond to the user.
    You are allowed to respond to smalltalk messages such as greetings or how are yous. For any message that is off topic, or is not a greeting, or not about Sajal, refuse to answer and ask the user to ask a question about Sajal.

    Chat History: {chat_history}

    User Message: {input}
    """

    _SMALLTALK_PROMPT_TEMPLATE = ChatPromptTemplate.from_template(dedent(_SMALLTALK_PROMPT))

    def __init__(self, llm):
        """Build the smalltalk chain around the supplied chat model."""
        self.smalltalk_chain = self._SMALLTALK_PROMPT_TEMPLATE | llm | StrOutputParser()

    def run(self, message, history):
        """Returns the response from the LLM to the user's message."""
        return self.smalltalk_chain.invoke({"input": message, "chat_history": history})
28
+
src/graph.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Implements the graph to handle workflows for the Sajal assistant"""
2
+
3
from typing import Any, Dict, TypedDict

from langgraph.graph import END, StateGraph

from chains.intent_detection import IntentDetection
from chains.smalltalk import Smalltalk
from chains.document_grader import DocumentGrader
from chains.rephrase_question import RephraseQuestion
from chains.qa_all_data import QAAllData
from chains.rag import RAG

from retriever import Retriever
15
+
16
class GraphState(TypedDict):
    """
    Represents the state of our graph.

    Attributes:
        keys: A dictionary where each key is a string.
    """

    # Was ``Dict[str, any]`` — ``any`` is the builtin function, not a type.
    # ``typing.Any`` is the construct that expresses "value of any type".
    keys: Dict[str, Any]
24
+
25
class AssistantGraph:
    """Implements the graph to handle workflows for the Sajal assistant"""

    def __init__(self, llm, vector_db_path, source_data_path):
        """
        Build every chain the assistant needs and compile the workflow graph.

        Args:
            llm: Language model shared by all the chains.
            vector_db_path: Path to the persisted Chroma vector store.
            source_data_path: Path to the full source document, used when
                retrieval yields no relevant chunks.
        """
        self.intent_detector = IntentDetection(llm)
        self.smalltalk = Smalltalk(llm)
        self.document_grader = DocumentGrader()
        self.rephrase_question_chain = RephraseQuestion(llm)
        self.retriever = Retriever(vector_db_path=vector_db_path)
        self.qa_all_data = QAAllData(llm=llm, source_data_path=source_data_path)
        self.rag = RAG(llm)
        self.app = self.compile_graph()

    def run(self, inputs):
        """Invoke the compiled graph on ``inputs`` and return the final state."""
        return self.app.invoke(inputs)

    # define graph nodes and edges and compile graph
    def compile_graph(self):
        """
        Build and compile the state machine.

        Flow: detect_intent routes either to smalltalk ("chat") or into the
        RAG pipeline (rephrase -> retrieve -> grade); grading then decides
        between answering from the retrieved documents or from all data.

        Returns:
            The compiled langgraph application.
        """
        workflow = StateGraph(GraphState)
        ### define the nodes
        workflow.add_node("detect_intent", self.detect_intent)
        workflow.add_node("chat", self.chat)
        workflow.add_node("rephrase_question", self.rephrase_question)
        workflow.add_node("retrieve", self.retrieve)
        workflow.add_node("grade_documents", self.grade_documents)
        workflow.add_node("generate_answer_with_retrieved_documents", self.generate_answer_with_retrieved_documents)
        workflow.add_node("generate_answer_using_all_data", self.generate_answer_using_all_data)
        ### build the graph
        workflow.set_entry_point("detect_intent")
        workflow.add_conditional_edges(
            "detect_intent",
            self.decide_to_rag,
            {
                "rag": "rephrase_question",
                "chat": "chat",
            }
        )
        workflow.add_edge("rephrase_question", "retrieve")
        workflow.add_edge("retrieve", "grade_documents")
        workflow.add_conditional_edges(
            "grade_documents",
            self.decide_to_use_all_data,
            {
                "rag": "generate_answer_with_retrieved_documents",
                "generate_answer_using_all_data": "generate_answer_using_all_data",
            }
        )
        workflow.add_edge("generate_answer_with_retrieved_documents", END)
        workflow.add_edge("generate_answer_using_all_data", END)
        workflow.add_edge("chat", END)
        ### compile the graph
        app = workflow.compile()
        return app

    # define the nodes
    def detect_intent(self, state):
        """
        Detects the intent of a user's message

        Args:
            state (dict): The current graph state

        Returns:
            state (dict): New key added to state, intent, that contains the detected intent
        """
        state = state["keys"]
        message = state["message"]
        history = state["history"]
        intent = self.intent_detector.run(message=message, history=history)
        return {"keys": {"message": message, "intent": intent, "history": history}}

    def chat(self, state):
        """
        Chat with the user

        Args:
            state (dict): The current graph state

        Returns:
            str: Updated graph state after adding response
        """
        state = state["keys"]
        # Renamed from ``input`` to avoid shadowing the builtin.
        user_message = state["message"]
        history = state["history"]
        response = self.smalltalk.run(message=user_message, history=history)
        return {"keys": {"message": user_message, "history": history, "response": response}}

    def grade_documents(self, state):
        """
        Determines whether the retrieved documents are relevant to the question.

        Args:
            state (dict): The current graph state

        Returns:
            state (dict): Updates documents key with relevant documents
        """

        print("---CHECK RELEVANCE---")
        state = state["keys"]
        question = state["standalone_question"]
        documents = state["documents"]

        # Score each document; keep only those the grader marks relevant.
        filtered_docs = []
        all_data = False  # Default do not opt to use all data for generation
        for d in documents:
            score = self.document_grader.run(question=question, context=d.page_content)
            grade = score[0].binary_score
            if grade == "yes":
                print("---GRADE: FOUND RELEVANT DOCUMENT---")
                filtered_docs.append(d)

        if not filtered_docs:
            all_data = True  # Opt to use all data for generation

        return {
            "keys": {
                "documents": filtered_docs,
                "standalone_question": question,
                "run_with_all_data": all_data,
            }
        }

    def rephrase_question(self, state):
        """
        Rephrase the question to be a standalone question

        Args:
            state (dict): The current graph state

        Returns:
            str: Updated graph state after adding standalone question
        """
        state = state["keys"]
        question = state["message"]
        chat_history = state["history"]
        result = self.rephrase_question_chain.run(message=question, history=chat_history)
        return {"keys": {"message": question, "history": chat_history, "standalone_question": result}}

    def retrieve(self, state):
        """
        Retrieve documents

        Args:
            state (dict): The current graph state

        Returns:
            state (dict): New key added to state, documents, that contains retrieved documents
        """
        state = state["keys"]
        question = state["standalone_question"]
        chat_history = state["history"]
        documents = self.retriever.run(query=question)
        return {"keys": {"message": state["message"], "history": chat_history, "standalone_question": question, "documents": documents}}

    def generate_answer_using_all_data(self, state):
        """
        Generate an answer using all documents

        Args:
            state (dict): The current graph state

        Returns:
            str: Updated graph state after adding response
        """
        state = state["keys"]
        question = state["standalone_question"]
        response = self.qa_all_data.run(question=question)
        return {"keys": {"message": question, "response": response}}

    def generate_answer_with_retrieved_documents(self, state):
        """
        Generate an answer using the retrieved documents

        Args:
            state (dict): The current graph state

        Returns:
            str: Updated graph state after adding response
        """
        state = state["keys"]
        question = state["standalone_question"]
        documents = state["documents"]
        response = self.rag.run(question=question, documents=documents)
        return {"keys": {"message": question, "response": response}}

    # define the edges
    def decide_to_rag(self, state):
        """
        Decides whether to use RAG or not

        Args:
            state (dict): The current graph state

        Returns:
            str: Next node to call
        """
        state = state["keys"]
        intent = state["intent"]
        if intent == "sajal_question":
            return "rag"
        return "chat"

    def decide_to_use_all_data(self, state):
        """
        Determines whether to use all data for generation or not.

        Args:
            state (dict): The current state of the agent, including all keys.

        Returns:
            str: Next node to call
        """

        state = state["keys"]
        run_with_all_data = state["run_with_all_data"]

        if run_with_all_data:
            return "generate_answer_using_all_data"
        else:
            return "rag"
src/ingest_data.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Script to ingest data to a ChromaDB vector store, and persist it to disk"""
2
+
3
+ import os
4
+ from dotenv import load_dotenv
5
+
6
+ from langchain.text_splitter import MarkdownHeaderTextSplitter
7
+ from langchain_community.vectorstores import Chroma
8
+ from langchain_openai import OpenAIEmbeddings
9
+
10
def main():
    """Ingest the source markdown into a persistent Chroma vector store."""
    # load the environment variables (OPENAI_API_KEY for the embeddings model)
    load_dotenv()

    # read the markdown file and return the full document as a string
    # (explicit encoding so the read does not depend on the platform default)
    markdown_path = "data/source.md"
    with open(markdown_path, "r", encoding="utf-8") as file:
        full_markdown_document = file.read()

    # split the data into chunks based on the markdown headings
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on, strip_headers=False
    )
    chunked_documents = markdown_splitter.split_text(full_markdown_document)

    # create a vector store; persist_directory writes the index to disk
    embeddings_model = OpenAIEmbeddings()
    Chroma.from_documents(
        chunked_documents, embeddings_model, persist_directory="data/chroma_db"
    )


# Guard so importing this module does not trigger the (costly) ingestion.
if __name__ == "__main__":
    main()
src/retriever.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Impleemnts the Retriever class for retrieving data from the database"""
2
+
3
+ from langchain_community.vectorstores import Chroma
4
+ from langchain_openai import OpenAIEmbeddings
5
+
6
class Retriever:
    """Fetches documents relevant to a query from the persisted vector store."""

    def __init__(self, vector_db_path):
        """Open the Chroma store at ``vector_db_path`` and expose it as a retriever."""
        embeddings = OpenAIEmbeddings()
        store = Chroma(persist_directory=vector_db_path, embedding_function=embeddings)
        self.retriever = store.as_retriever()

    def run(self, query):
        """Return the documents most relevant to ``query``."""
        return self.retriever.get_relevant_documents(query)