iranjan31 committed on
Commit 450b75a
1 Parent(s): dbfd1a8

Synced repo using 'sync_with_huggingface' GitHub Action
.chainlit/config.toml ADDED
@@ -0,0 +1,121 @@
+ [project]
+ # Whether to enable telemetry (default: true). No personal data is collected.
+ enable_telemetry = true
+
+
+ # List of environment variables to be provided by each user to use the app.
+ user_env = []
+
+ # Duration (in seconds) during which the session is saved when the connection is lost
+ session_timeout = 3600
+
+ # Enable third-party caching (e.g. LangChain cache)
+ cache = false
+
+ # Authorized origins
+ allow_origins = ["*"]
+
+ # Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
+ # follow_symlink = false
+
+ [features]
+ # Show the prompt playground
+ prompt_playground = true
+
+ # Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
+ unsafe_allow_html = false
+
+ # Process and display mathematical expressions. This can clash with "$" characters in messages.
+ latex = false
+
+ # Automatically tag threads with the current chat profile (if a chat profile is used)
+ auto_tag_thread = true
+
+ # Authorize users to spontaneously upload files with messages
+ [features.spontaneous_file_upload]
+ enabled = true
+ accept = ["*/*"]
+ max_files = 20
+ max_size_mb = 500
+
+ [features.audio]
+ # Threshold for audio recording
+ min_decibels = -45
+ # Delay for the user to start speaking, in ms
+ initial_silence_timeout = 3000
+ # Delay for the user to continue speaking, in ms. If the user stops speaking for this duration, the recording will stop.
+ silence_timeout = 1500
+ # Above this duration (ms), the recording will forcefully stop.
+ max_duration = 15000
+ # Duration of the audio chunks, in ms
+ chunk_duration = 1000
+ # Sample rate of the audio
+ sample_rate = 44100
+
+ [UI]
+ # Name of the app and chatbot.
+ name = "SCC Sherpa"
+
+ # Show the readme while the thread is empty.
+ show_readme_as_default = true
+
+ # Description of the app and chatbot. This is used for HTML tags.
+ # description = "SCC Guide"
+
+ # Large content is collapsed by default for a cleaner UI
+ default_collapse_content = true
+
+ # The default value for the expand messages settings.
+ default_expand_messages = false
+
+ # Hide the chain of thought details from the user in the UI.
+ hide_cot = false
+
+ # Link to your GitHub repo. This will add a GitHub button in the UI's header.
+ # github = ""
+
+ # Specify a CSS file that can be used to customize the user interface.
+ # The CSS file can be served from the public directory or via an external link.
+ custom_css = "/public/test.css"
+
+ # Specify a JavaScript file that can be used to customize the user interface.
+ # The JavaScript file can be served from the public directory.
+ # custom_js = "/public/test.js"
+
+ # Specify a custom font url.
+ # custom_font = "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap"
+
+ # Specify a custom meta image url.
+ # custom_meta_image_url = "https://chainlit-cloud.s3.eu-west-3.amazonaws.com/logo/chainlit_banner.png"
+
+ # Specify a custom build directory for the frontend.
+ # This can be used to customize the frontend code.
+ # Be careful: if this is a relative path, it should not start with a slash.
+ # custom_build = "./public/build"
+
+ [UI.theme]
+ #layout = "wide"
+ #font_family = "Inter, sans-serif"
+ # Override default MUI light theme. (Check theme.ts)
+ [UI.theme.light]
+ #background = "#FAFAFA"
+ #paper = "#FFFFFF"
+
+ [UI.theme.light.primary]
+ #main = "#F80061"
+ #dark = "#980039"
+ #light = "#FFE7EB"
+
+ # Override default MUI dark theme. (Check theme.ts)
+ [UI.theme.dark]
+ background = "#1C1C1C" # Slightly lighter dark background color
+ paper = "#2A2A2A" # Slightly lighter dark paper color
+
+ [UI.theme.dark.primary]
+ main = "#89CFF0" # Primary color
+ dark = "#3700B3" # Dark variant of primary color
+ light = "#CFBCFF" # Lighter variant of primary color
+
+
+ [meta]
+ generated_by = "1.1.202"
.chainlit/translations/en-US.json ADDED
@@ -0,0 +1,231 @@
+ {
+   "components": {
+     "atoms": {
+       "buttons": {
+         "userButton": {
+           "menu": {
+             "settings": "Settings",
+             "settingsKey": "S",
+             "APIKeys": "API Keys",
+             "logout": "Logout"
+           }
+         }
+       }
+     },
+     "molecules": {
+       "newChatButton": {
+         "newChat": "New Chat"
+       },
+       "tasklist": {
+         "TaskList": {
+           "title": "\ud83d\uddd2\ufe0f Task List",
+           "loading": "Loading...",
+           "error": "An error occurred"
+         }
+       },
+       "attachments": {
+         "cancelUpload": "Cancel upload",
+         "removeAttachment": "Remove attachment"
+       },
+       "newChatDialog": {
+         "createNewChat": "Create new chat?",
+         "clearChat": "This will clear the current messages and start a new chat.",
+         "cancel": "Cancel",
+         "confirm": "Confirm"
+       },
+       "settingsModal": {
+         "settings": "Settings",
+         "expandMessages": "Expand Messages",
+         "hideChainOfThought": "Hide Chain of Thought",
+         "darkMode": "Dark Mode"
+       },
+       "detailsButton": {
+         "using": "Using",
+         "running": "Running",
+         "took_one": "Took {{count}} step",
+         "took_other": "Took {{count}} steps"
+       },
+       "auth": {
+         "authLogin": {
+           "title": "Login to access the app.",
+           "form": {
+             "email": "Email address",
+             "password": "Password",
+             "noAccount": "Don't have an account?",
+             "alreadyHaveAccount": "Already have an account?",
+             "signup": "Sign Up",
+             "signin": "Sign In",
+             "or": "OR",
+             "continue": "Continue",
+             "forgotPassword": "Forgot password?",
+             "passwordMustContain": "Your password must contain:",
+             "emailRequired": "email is a required field",
+             "passwordRequired": "password is a required field"
+           },
+           "error": {
+             "default": "Unable to sign in.",
+             "signin": "Try signing in with a different account.",
+             "oauthsignin": "Try signing in with a different account.",
+             "redirect_uri_mismatch": "The redirect URI is not matching the oauth app configuration.",
+             "oauthcallbackerror": "Try signing in with a different account.",
+             "oauthcreateaccount": "Try signing in with a different account.",
+             "emailcreateaccount": "Try signing in with a different account.",
+             "callback": "Try signing in with a different account.",
+             "oauthaccountnotlinked": "To confirm your identity, sign in with the same account you used originally.",
+             "emailsignin": "The e-mail could not be sent.",
+             "emailverify": "Please verify your email, a new email has been sent.",
+             "credentialssignin": "Sign in failed. Check the details you provided are correct.",
+             "sessionrequired": "Please sign in to access this page."
+           }
+         },
+         "authVerifyEmail": {
+           "almostThere": "You're almost there! We've sent an email to ",
+           "verifyEmailLink": "Please click on the link in that email to complete your signup.",
+           "didNotReceive": "Can't find the email?",
+           "resendEmail": "Resend email",
+           "goBack": "Go Back",
+           "emailSent": "Email sent successfully.",
+           "verifyEmail": "Verify your email address"
+         },
+         "providerButton": {
+           "continue": "Continue with {{provider}}",
+           "signup": "Sign up with {{provider}}"
+         },
+         "authResetPassword": {
+           "newPasswordRequired": "New password is a required field",
+           "passwordsMustMatch": "Passwords must match",
+           "confirmPasswordRequired": "Confirm password is a required field",
+           "newPassword": "New password",
+           "confirmPassword": "Confirm password",
+           "resetPassword": "Reset Password"
+         },
+         "authForgotPassword": {
+           "email": "Email address",
+           "emailRequired": "email is a required field",
+           "emailSent": "Please check the email address {{email}} for instructions to reset your password.",
+           "enterEmail": "Enter your email address and we will send you instructions to reset your password.",
+           "resendEmail": "Resend email",
+           "continue": "Continue",
+           "goBack": "Go Back"
+         }
+       }
+     },
+     "organisms": {
+       "chat": {
+         "history": {
+           "index": {
+             "showHistory": "Show history",
+             "lastInputs": "Last Inputs",
+             "noInputs": "Such empty...",
+             "loading": "Loading..."
+           }
+         },
+         "inputBox": {
+           "input": {
+             "placeholder": "Type your message here..."
+           },
+           "speechButton": {
+             "start": "Start recording",
+             "stop": "Stop recording"
+           },
+           "SubmitButton": {
+             "sendMessage": "Send message",
+             "stopTask": "Stop Task"
+           },
+           "UploadButton": {
+             "attachFiles": "Attach files"
+           },
+           "waterMark": {
+             "text": "Built with"
+           }
+         },
+         "Messages": {
+           "index": {
+             "running": "Running",
+             "executedSuccessfully": "executed successfully",
+             "failed": "failed",
+             "feedbackUpdated": "Feedback updated",
+             "updating": "Updating"
+           }
+         },
+         "dropScreen": {
+           "dropYourFilesHere": "Drop your files here"
+         },
+         "index": {
+           "failedToUpload": "Failed to upload",
+           "cancelledUploadOf": "Cancelled upload of",
+           "couldNotReachServer": "Could not reach the server",
+           "continuingChat": "Continuing previous chat"
+         },
+         "settings": {
+           "settingsPanel": "Settings panel",
+           "reset": "Reset",
+           "cancel": "Cancel",
+           "confirm": "Confirm"
+         }
+       },
+       "threadHistory": {
+         "sidebar": {
+           "filters": {
+             "FeedbackSelect": {
+               "feedbackAll": "Feedback: All",
+               "feedbackPositive": "Feedback: Positive",
+               "feedbackNegative": "Feedback: Negative"
+             },
+             "SearchBar": {
+               "search": "Search"
+             }
+           },
+           "DeleteThreadButton": {
+             "confirmMessage": "This will delete the thread as well as its messages and elements.",
+             "cancel": "Cancel",
+             "confirm": "Confirm",
+             "deletingChat": "Deleting chat",
+             "chatDeleted": "Chat deleted"
+           },
+           "index": {
+             "pastChats": "Past Chats"
+           },
+           "ThreadList": {
+             "empty": "Empty...",
+             "today": "Today",
+             "yesterday": "Yesterday",
+             "previous7days": "Previous 7 days",
+             "previous30days": "Previous 30 days"
+           },
+           "TriggerButton": {
+             "closeSidebar": "Close sidebar",
+             "openSidebar": "Open sidebar"
+           }
+         },
+         "Thread": {
+           "backToChat": "Go back to chat",
+           "chatCreatedOn": "This chat was created on"
+         }
+       },
+       "header": {
+         "chat": "Chat",
+         "readme": "Readme"
+       }
+     }
+   },
+   "hooks": {
+     "useLLMProviders": {
+       "failedToFetchProviders": "Failed to fetch providers:"
+     }
+   },
+   "pages": {
+     "Design": {},
+     "Env": {
+       "savedSuccessfully": "Saved successfully",
+       "requiredApiKeys": "Required API Keys",
+       "requiredApiKeysInfo": "To use this app, the following API keys are required. The keys are stored on your device's local storage."
+     },
+     "Page": {
+       "notPartOfProject": "You are not part of this project."
+     },
+     "ResumeButton": {
+       "resumeChat": "Resume Chat"
+     }
+   }
+ }
Dockerfile ADDED
@@ -0,0 +1,29 @@
+ FROM python:3.9
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ RUN pip install --no-cache-dir transformers==4.36.2 torch==2.1.2
+
+ RUN pip install --upgrade --force-reinstall --no-cache-dir llama-cpp-python==0.2.32
+
+ COPY . /code
+
+ RUN ls -R
+
+ # Change permissions to allow writing to the directory
+ RUN chmod -R 777 /code
+
+ # Create a logs directory and set permissions
+ RUN mkdir /code/logs && chmod 777 /code/logs
+
+ # Create a cache directory within the application's working directory
+ RUN mkdir /.cache && chmod -R 777 /.cache
+
+ # A BuildKit secret mount needs a command to run; `true` simply asserts the secrets exist at build time
+ RUN --mount=type=secret,id=HUGGINGFACEHUB_API_TOKEN,mode=0444,required=true true
+ RUN --mount=type=secret,id=OPENAI_API_KEY,mode=0444,required=true true
+
+ CMD python code/modules/vector_db.py && chainlit run code/main.py --host 0.0.0.0 --port 7860
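A note on the two secret mounts above: BuildKit exposes each secret as a file under /run/secrets/<id> for the duration of its RUN step only. As a minimal sketch (a hypothetical helper, not part of this commit), a build-time Python step could read one like so:

    # read_secret.py -- hypothetical build-time helper; /run/secrets/<id> is
    # the standard BuildKit mount point for a secret with that id.
    from pathlib import Path
    from typing import Optional

    def read_secret(secret_id: str) -> Optional[str]:
        secret_file = Path("/run/secrets") / secret_id
        return secret_file.read_text().strip() if secret_file.exists() else None

    token = read_secret("HUGGINGFACEHUB_API_TOKEN")  # None if not mounted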
Dockerfile.dev ADDED
@@ -0,0 +1,27 @@
+ FROM python:3.11
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --upgrade pip
+
+ RUN pip install --no-cache-dir -r /code/requirements.txt
+
+ COPY . /code
+
+ RUN ls -R
+
+ # Change permissions to allow writing to the directory
+ RUN chmod -R 777 /code
+
+ # Create a logs directory and set permissions
+ RUN mkdir /code/logs && chmod 777 /code/logs
+
+ # Create a cache directory within the application's working directory
+ RUN mkdir /.cache && chmod -R 777 /.cache
+
+ # Expose the port the app runs on
+ EXPOSE 8051
+
+ CMD chainlit run code/main.py --port 8051
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 DL4DS
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
chainlit.md ADDED
@@ -0,0 +1,10 @@
+ # Welcome to DL4DS Tutor! 🚀🤖
+
+ Hi there! This is an LLM chatbot designed to help answer your questions.
+ It is still very much a work in progress.
+
+ ### --- Please wait while the Tutor loads... ---
+
+ ## Useful Links 🔗
+
+ - **Documentation:** [Chainlit Documentation](https://docs.chainlit.io) 📚
code/config.yml ADDED
@@ -0,0 +1,32 @@
+ embedding_options:
+   embedd_files: False # bool
+   data_path: 'storage/data' # str
+   url_file_path: 'storage/data/urls.txt' # str
+   expand_urls: True # bool
+   db_option : 'FAISS' # str [FAISS, Chroma, RAGatouille]
+   db_path : 'vectorstores' # str
+   model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002]
+   search_top_k : 3 # int
+   score_threshold : 0.0 # float
+   lambda_mult: 0.5 # float - determines diversity of the retrieved results
+ llm_params:
+   use_history: True # bool
+   memory_window: 3 # int
+   llm_loader: 'local_llm' # str [local_llm, openai]
+   openai_params:
+     model: 'gpt-4' # str [gpt-3.5-turbo-1106, gpt-4]
+   local_llm_params:
+     model: "storage/models/tinyllama-1.1b-chat-v0.3.Q5_K_M.gguf"
+     model_type: "llama"
+     temperature: 0.2
+ splitter_options:
+   use_splitter: True # bool
+   split_by_token : True # bool
+   remove_leftover_delimiters: True # bool
+   remove_chunks: False # bool
+   chunk_size : 300 # int
+   chunk_overlap : 30 # int
+   chunk_separators : ["\n\n", "\n", " ", ""] # list of strings
+   front_chunks_to_remove : null # int or None
+   last_chunks_to_remove : null # int or None
+   delimiters_to_remove : ['\t', '\n', ' ', ' '] # list of strings
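For orientation, a minimal sketch (assuming it runs from the repository root, mirroring what the chat-profile handler in code/main.py does) of loading this config and switching the backend to OpenAI:

    import yaml

    with open("code/config.yml", "r") as f:
        config = yaml.safe_load(f)

    # Flip the loader the same way code/main.py does for the GPT profiles
    config["llm_params"]["llm_loader"] = "openai"
    config["llm_params"]["openai_params"]["model"] = "gpt-3.5-turbo-1106"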
code/main.py ADDED
@@ -0,0 +1,124 @@
+ from langchain.document_loaders import PyPDFLoader, DirectoryLoader
+ from langchain import PromptTemplate
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.vectorstores import FAISS
+ from langchain.chains import RetrievalQA
+ from langchain.llms import CTransformers
+ import chainlit as cl
+ from langchain_community.chat_models import ChatOpenAI
+ from langchain_community.embeddings import OpenAIEmbeddings
+ import yaml
+ import logging
+ from dotenv import load_dotenv
+
+ from modules.llm_tutor import LLMTutor
+ from modules.constants import *
+ from modules.helpers import get_sources
+
+
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+
+ # Console Handler
+ console_handler = logging.StreamHandler()
+ console_handler.setLevel(logging.INFO)
+ formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+ console_handler.setFormatter(formatter)
+ logger.addHandler(console_handler)
+
+ # File Handler
+ log_file_path = "log_file.log"  # Change this to your desired log file path
+ file_handler = logging.FileHandler(log_file_path)
+ file_handler.setLevel(logging.INFO)
+ file_handler.setFormatter(formatter)
+ logger.addHandler(file_handler)
+
+
+ # Adding option to select the chat profile
+ @cl.set_chat_profiles
+ async def chat_profile():
+     return [
+         cl.ChatProfile(
+             name="Llama",
+             markdown_description="Use the local LLM: **Tiny Llama**.",
+         ),
+         # cl.ChatProfile(
+         #     name="Mistral",
+         #     markdown_description="Use the local LLM: **Mistral**.",
+         # ),
+         cl.ChatProfile(
+             name="gpt-3.5-turbo-1106",
+             markdown_description="Use OpenAI API for **gpt-3.5-turbo-1106**.",
+         ),
+         cl.ChatProfile(
+             name="gpt-4",
+             markdown_description="Use OpenAI API for **gpt-4**.",
+         ),
+     ]
+
+
+ @cl.author_rename
+ def rename(orig_author: str):
+     rename_dict = {"Chatbot": "AI Tutor"}
+     return rename_dict.get(orig_author, orig_author)
+
+
+ # chainlit code
+ @cl.on_chat_start
+ async def start():
+     with open("code/config.yml", "r") as f:
+         config = yaml.safe_load(f)
+         print(config)
+     logger.info("Config file loaded")
+     logger.info(f"Config: {config}")
+     logger.info("Creating llm_tutor instance")
+
+     chat_profile = cl.user_session.get("chat_profile")
+     if chat_profile is not None:
+         if chat_profile.lower() in ["gpt-3.5-turbo-1106", "gpt-4"]:
+             config["llm_params"]["llm_loader"] = "openai"
+             config["llm_params"]["openai_params"]["model"] = chat_profile.lower()
+         elif chat_profile.lower() == "llama":
+             config["llm_params"]["llm_loader"] = "local_llm"
+             config["llm_params"]["local_llm_params"]["model"] = LLAMA_PATH
+             config["llm_params"]["local_llm_params"]["model_type"] = "llama"
+         elif chat_profile.lower() == "mistral":
+             config["llm_params"]["llm_loader"] = "local_llm"
+             config["llm_params"]["local_llm_params"]["model"] = MISTRAL_PATH
+             config["llm_params"]["local_llm_params"]["model_type"] = "mistral"
+         else:
+             pass
+
+     llm_tutor = LLMTutor(config, logger=logger)
+
+     chain = llm_tutor.qa_bot()
+     # Report the model actually selected for this session (OpenAI or local)
+     if config["llm_params"]["llm_loader"] == "openai":
+         model = config["llm_params"]["openai_params"]["model"]
+     else:
+         model = config["llm_params"]["local_llm_params"]["model"]
+     msg = cl.Message(content=f"Starting the bot {model}...")
+     await msg.send()
+     msg.content = f"{opening_message}"
+     await msg.update()
+
+     cl.user_session.set("chain", chain)
+
+
+ @cl.on_message
+ async def main(message):
+     user = cl.user_session.get("user")
+     chain = cl.user_session.get("chain")
+     # cb = cl.AsyncLangchainCallbackHandler(
+     #     stream_final_answer=True, answer_prefix_tokens=["FINAL", "ANSWER"]
+     # )
+     # cb.answer_reached = True
+     # res = await chain.acall(message, callbacks=[cb])
+     res = await chain.acall(message.content)
+     print(f"response: {res}")
+     try:
+         answer = res["answer"]
+     except KeyError:
+         answer = res["result"]
+     print(f"answer: {answer}")
+
+     answer_with_sources, source_elements = get_sources(res, answer)
+
+     await cl.Message(content=answer_with_sources, elements=source_elements).send()
code/modules/__init__.py ADDED
File without changes
code/modules/chat_model_loader.py ADDED
@@ -0,0 +1,39 @@
+ from langchain_community.chat_models import ChatOpenAI
+ from langchain.llms import CTransformers
+ from langchain.llms.huggingface_pipeline import HuggingFacePipeline
+ from transformers import AutoTokenizer, TextStreamer
+ from langchain.llms import LlamaCpp
+ import torch
+ import transformers
+ import os
+ from langchain.callbacks.manager import CallbackManager
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+
+
+ class ChatModelLoader:
+     def __init__(self, config):
+         self.config = config
+         self.huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+
+     def load_chat_model(self):
+         if self.config["llm_params"]["llm_loader"] == "openai":
+             llm = ChatOpenAI(
+                 model_name=self.config["llm_params"]["openai_params"]["model"]
+             )
+         elif self.config["llm_params"]["llm_loader"] == "local_llm":
+             n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
+             model_path = self.config["llm_params"]["local_llm_params"]["model"]
+             llm = LlamaCpp(
+                 model_path=model_path,
+                 n_batch=n_batch,
+                 n_ctx=2048,
+                 f16_kv=True,
+                 verbose=True,
+                 n_threads=2,
+                 temperature=self.config["llm_params"]["local_llm_params"][
+                     "temperature"
+                 ],
+             )
+         else:
+             raise ValueError("Invalid LLM Loader")
+         return llm
code/modules/constants.py ADDED
@@ -0,0 +1,81 @@
+ from dotenv import load_dotenv
+ import os
+
+ load_dotenv()
+
+ # API Keys - Loaded from the .env file
+
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+ HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
+
+ opening_message = "Hello! You can ask me questions about the MSDS program at Boston University."
+
+ # Prompt Templates
+
+ openai_prompt_template = """Use the following pieces of information to answer the user's question.
+ If you don't know the answer, just say that you don't know.
+
+ Context: {context}
+ Question: {question}
+
+ Only return the helpful answer below and nothing else.
+ Helpful answer:
+ """
+
+ openai_prompt_template_with_history = """Use the following pieces of information to answer the user's question.
+ If you don't know the answer, just say that you don't know, don't try to make up an answer.
+ Use the history to answer the question if you can.
+ Chat History:
+ {chat_history}
+ Context: {context}
+ Question: {question}
+
+ Only return the helpful answer below and nothing else.
+ Helpful answer:
+ """
+
+ tinyllama_prompt_template = """
+ <|im_start|>system
+ Assistant is an intelligent chatbot designed to help students with questions regarding the course. Only answer questions using the context below and if you're not sure of an answer, you can say "I don't know". Always give a brief and concise answer to the question. Use the history to answer the question if you can.
+
+ Context:
+ {context}
+ <|im_end|>
+ <|im_start|>user
+ Question: Who is the instructor for this course?
+ <|im_end|>
+ <|im_start|>assistant
+ The instructor for this course is Prof. Thomas Gardos.
+ <|im_end|>
+ <|im_start|>user
+ Question: {question}
+ <|im_end|>
+ <|im_start|>assistant
+ """
+
+ tinyllama_prompt_template_with_history = """
+ <|im_start|>system
+ Assistant is an intelligent chatbot designed to help students with questions regarding the course. Only answer questions using the context below and if you're not sure of an answer, you can say "I don't know". Always give a brief and concise answer to the question.
+
+ Chat History:
+ {chat_history}
+ Context:
+ {context}
+ <|im_end|>
+ <|im_start|>user
+ Question: Who is the instructor for this course?
+ <|im_end|>
+ <|im_start|>assistant
+ The instructor for this course is Prof. Thomas Gardos.
+ <|im_end|>
+ <|im_start|>user
+ Question: {question}
+ <|im_end|>
+ <|im_start|>assistant
+ """
+
+
+ # Model Paths
+
+ LLAMA_PATH = "storage/models/tinyllama-1.1b-chat-v0.3.Q5_K_M.gguf"
+ MISTRAL_PATH = "storage/models/mistral-7b-v0.1.Q4_K_M.gguf"
code/modules/data_loader.py ADDED
@@ -0,0 +1,290 @@
+ import os
+ import re
+ import tempfile
+ import requests
+ import PyPDF2
+ import pysrt
+ from langchain_community.document_loaders import (
+     PyMuPDFLoader,
+     Docx2txtLoader,
+     YoutubeLoader,
+     WebBaseLoader,
+     TextLoader,
+ )
+ from langchain_community.document_loaders import UnstructuredMarkdownLoader
+ from llama_parse import LlamaParse
+ from langchain.schema import Document
+ import logging
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from ragatouille import RAGPretrainedModel
+ from langchain.chains import LLMChain
+ from langchain.llms import OpenAI
+ from langchain import PromptTemplate
+
+ try:
+     from modules.helpers import get_metadata
+ except ImportError:
+     from helpers import get_metadata
+
+ logger = logging.getLogger(__name__)
+
+
+ class PDFReader:
+     def __init__(self):
+         pass
+
+     def get_loader(self, pdf_path):
+         loader = PyMuPDFLoader(pdf_path)
+         return loader
+
+     def get_documents(self, loader):
+         return loader.load()
+
+
+ class FileReader:
+     def __init__(self):
+         self.pdf_reader = PDFReader()
+
+     def extract_text_from_pdf(self, pdf_path):
+         text = ""
+         with open(pdf_path, "rb") as file:
+             reader = PyPDF2.PdfReader(file)
+             num_pages = len(reader.pages)
+             for page_num in range(num_pages):
+                 page = reader.pages[page_num]
+                 text += page.extract_text()
+         return text
+
+     def download_pdf_from_url(self, pdf_url):
+         response = requests.get(pdf_url)
+         if response.status_code == 200:
+             with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+                 temp_file.write(response.content)
+                 temp_file_path = temp_file.name
+             return temp_file_path
+         else:
+             print("Failed to download PDF from URL:", pdf_url)
+             return None
+
+     def read_pdf(self, temp_file_path: str):
+         loader = self.pdf_reader.get_loader(temp_file_path)
+         documents = self.pdf_reader.get_documents(loader)
+         return documents
+
+     def read_txt(self, temp_file_path: str):
+         loader = TextLoader(temp_file_path, autodetect_encoding=True)
+         return loader.load()
+
+     def read_docx(self, temp_file_path: str):
+         loader = Docx2txtLoader(temp_file_path)
+         return loader.load()
+
+     def read_srt(self, temp_file_path: str):
+         subs = pysrt.open(temp_file_path)
+         text = ""
+         for sub in subs:
+             text += sub.text
+         return [Document(page_content=text)]
+
+     def read_youtube_transcript(self, url: str):
+         loader = YoutubeLoader.from_youtube_url(
+             url, add_video_info=True, language=["en"], translation="en"
+         )
+         return loader.load()
+
+     def read_html(self, url: str):
+         loader = WebBaseLoader(url)
+         return loader.load()
+
+     def read_tex_from_url(self, tex_url):
+         response = requests.get(tex_url)
+         if response.status_code == 200:
+             return [Document(page_content=response.text)]
+         else:
+             print("Failed to fetch .tex file from URL:", tex_url)
+             return None
+
+
+ class ChunkProcessor:
+     def __init__(self, config):
+         self.config = config
+
+         if config["splitter_options"]["use_splitter"]:
+             if config["splitter_options"]["split_by_token"]:
+                 self.splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+                     chunk_size=config["splitter_options"]["chunk_size"],
+                     chunk_overlap=config["splitter_options"]["chunk_overlap"],
+                     separators=config["splitter_options"]["chunk_separators"],
+                     disallowed_special=(),
+                 )
+             else:
+                 # disallowed_special is a tiktoken-only option, so it is not passed here
+                 self.splitter = RecursiveCharacterTextSplitter(
+                     chunk_size=config["splitter_options"]["chunk_size"],
+                     chunk_overlap=config["splitter_options"]["chunk_overlap"],
+                     separators=config["splitter_options"]["chunk_separators"],
+                 )
+         else:
+             self.splitter = None
+         logger.info("ChunkProcessor instance created")
+
+     def remove_delimiters(self, document_chunks: list):
+         for chunk in document_chunks:
+             for delimiter in self.config["splitter_options"]["delimiters_to_remove"]:
+                 chunk.page_content = re.sub(delimiter, " ", chunk.page_content)
+         return document_chunks
+
+     def remove_chunks(self, document_chunks: list):
+         # Key names match front_chunks_to_remove / last_chunks_to_remove in config.yml
+         front = self.config["splitter_options"]["front_chunks_to_remove"]
+         end = self.config["splitter_options"]["last_chunks_to_remove"]
+         for _ in range(front):
+             del document_chunks[0]
+         for _ in range(end):
+             document_chunks.pop()
+         logger.info(f"\tNumber of pages after skipping: {len(document_chunks)}")
+         return document_chunks
+
+     def process_chunks(
+         self, documents, file_type="txt", source="", page=0, metadata={}
+     ):
+         # Wrap the raw text in a Document; source and page belong in its metadata
+         documents = [
+             Document(page_content=documents, metadata={"source": source, "page": page})
+         ]
+         if (
+             file_type == "txt"
+             or file_type == "docx"
+             or file_type == "srt"
+             or file_type == "tex"
+         ):
+             document_chunks = self.splitter.split_documents(documents)
+         elif file_type == "pdf":
+             document_chunks = documents  # Full page for now
+
+         # add the source and page number back to the metadata
+         for chunk in document_chunks:
+             chunk.metadata["source"] = source
+             chunk.metadata["page"] = page
+
+             # add the metadata extracted from the document
+             for key, value in metadata.items():
+                 chunk.metadata[key] = value
+
+         if self.config["splitter_options"]["remove_leftover_delimiters"]:
+             document_chunks = self.remove_delimiters(document_chunks)
+         if self.config["splitter_options"]["remove_chunks"]:
+             document_chunks = self.remove_chunks(document_chunks)
+
+         return document_chunks
+
+     def get_chunks(self, file_reader, uploaded_files, weblinks):
+         self.document_chunks_full = []
+         self.parent_document_names = []
+         self.child_document_names = []
+         self.documents = []
+         self.document_metadata = []
+
+         addl_metadata = get_metadata(uploaded_files)  # For any additional metadata
+
+         for file_index, file_path in enumerate(uploaded_files):
+             file_name = os.path.basename(file_path)
+             if file_name not in self.parent_document_names:
+                 file_type = file_name.split(".")[-1].lower()
+
+                 # try:
+                 if file_type == "pdf":
+                     documents = file_reader.read_pdf(file_path)
+                 elif file_type == "txt":
+                     documents = file_reader.read_txt(file_path)
+                 elif file_type == "docx":
+                     documents = file_reader.read_docx(file_path)
+                 elif file_type == "srt":
+                     documents = file_reader.read_srt(file_path)
+                 elif file_type == "tex":
+                     documents = file_reader.read_tex_from_url(file_path)
+                 else:
+                     logger.warning(f"Unsupported file type: {file_type}")
+                     continue
+
+                 for doc in documents:
+                     page_num = doc.metadata.get("page", 0)
+                     self.documents.append(doc.page_content)
+                     self.document_metadata.append(
+                         {"source": file_path, "page": page_num}
+                     )
+                     metadata = addl_metadata.get(file_path, {})
+                     self.document_metadata[-1].update(metadata)
+
+                     self.child_document_names.append(f"{file_name}_{page_num}")
+
+                     self.parent_document_names.append(file_name)
+                     if self.config["embedding_options"]["db_option"] not in [
+                         "RAGatouille"
+                     ]:
+                         document_chunks = self.process_chunks(
+                             self.documents[-1],
+                             file_type,
+                             source=file_path,
+                             page=page_num,
+                             metadata=metadata,
+                         )
+                         self.document_chunks_full.extend(document_chunks)
+
+                 # except Exception as e:
+                 #     logger.error(f"Error processing file {file_name}: {str(e)}")
+
+         self.process_weblinks(file_reader, weblinks)
+
+         logger.info(
+             f"Total document chunks extracted: {len(self.document_chunks_full)}"
+         )
+         return (
+             self.document_chunks_full,
+             self.child_document_names,
+             self.documents,
+             self.document_metadata,
+         )
+
+     def process_weblinks(self, file_reader, weblinks):
+         if weblinks[0] != "":
+             logger.info(f"Splitting weblinks: total of {len(weblinks)}")
+
+             for link_index, link in enumerate(weblinks):
+                 if link not in self.parent_document_names:
+                     try:
+                         logger.info(f"\tSplitting link {link_index+1} : {link}")
+                         if "youtube" in link:
+                             documents = file_reader.read_youtube_transcript(link)
+                         else:
+                             documents = file_reader.read_html(link)
+
+                         for doc in documents:
+                             page_num = doc.metadata.get("page", 0)
+                             self.documents.append(doc.page_content)
+                             self.document_metadata.append(
+                                 {"source": link, "page": page_num}
+                             )
+                             self.child_document_names.append(f"{link}")
+
+                             self.parent_document_names.append(link)
+                             if self.config["embedding_options"]["db_option"] not in [
+                                 "RAGatouille"
+                             ]:
+                                 document_chunks = self.process_chunks(
+                                     self.documents[-1],
+                                     "txt",
+                                     source=link,
+                                     page=0,
+                                     metadata={"source_type": "webpage"},
+                                 )
+                                 self.document_chunks_full.extend(document_chunks)
+                     except Exception as e:
+                         logger.error(
+                             f"Error splitting link {link_index+1} : {link}: {str(e)}"
+                         )
+
+
+ class DataLoader:
+     def __init__(self, config):
+         self.file_reader = FileReader()
+         self.chunk_processor = ChunkProcessor(config)
+
+     def get_chunks(self, uploaded_files, weblinks):
+         return self.chunk_processor.get_chunks(
+             self.file_reader, uploaded_files, weblinks
+         )
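A quick usage sketch for this module (the file path below is a hypothetical example; DataLoader expects a config dict shaped like code/config.yml):

    import yaml
    from modules.data_loader import DataLoader

    with open("code/config.yml", "r") as f:
        config = yaml.safe_load(f)

    loader = DataLoader(config)
    # The second argument is the list of web links; [""] means "no links",
    # matching the weblinks[0] != "" guard in process_weblinks.
    chunks, names, docs, metadata = loader.get_chunks(
        ["storage/data/sample.pdf"], [""]
    )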
code/modules/embedding_model_loader.py ADDED
@@ -0,0 +1,38 @@
+ from langchain_community.embeddings import OpenAIEmbeddings
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.embeddings import LlamaCppEmbeddings
+
+ try:
+     from modules.constants import *
+ except ImportError:
+     from constants import *
+ import os
+
+
+ class EmbeddingModelLoader:
+     def __init__(self, config):
+         self.config = config
+
+     def load_embedding_model(self):
+         if self.config["embedding_options"]["model"] in ["text-embedding-ada-002"]:
+             embedding_model = OpenAIEmbeddings(
+                 deployment="SL-document_embedder",
+                 model=self.config["embedding_options"]["model"],
+                 show_progress_bar=True,
+                 openai_api_key=OPENAI_API_KEY,
+                 disallowed_special=(),
+             )
+         else:
+             embedding_model = HuggingFaceEmbeddings(
+                 model_name=self.config["embedding_options"]["model"],
+                 model_kwargs={
+                     "device": "cpu",
+                     "token": f"{HUGGINGFACE_TOKEN}",
+                     "trust_remote_code": True,
+                 },
+             )
+         # embedding_model = LlamaCppEmbeddings(
+         #     model_path=os.path.abspath("storage/llama-7b.ggmlv3.q4_0.bin")
+         # )
+
+         return embedding_model
code/modules/helpers.py ADDED
@@ -0,0 +1,244 @@
+ import requests
+ from bs4 import BeautifulSoup
+ from tqdm import tqdm
+ import chainlit as cl
+ from langchain import PromptTemplate
+ from urllib.parse import urlparse, urljoin, urldefrag
+ import asyncio
+ import aiohttp
+ from aiohttp import ClientSession
+
+ try:
+     from modules.constants import *
+ except ImportError:
+     from constants import *
+
+ """
+ Ref: https://python.plainenglish.io/scraping-the-subpages-on-a-website-ea2d4e3db113
+ """
+
+
+ class WebpageCrawler:
+     def __init__(self):
+         self.dict_href_links = {}
+
+     async def fetch(self, session: ClientSession, url: str) -> str:
+         async with session.get(url) as response:
+             try:
+                 return await response.text()
+             except UnicodeDecodeError:
+                 return await response.text(encoding="latin1")
+
+     def url_exists(self, url: str) -> bool:
+         try:
+             response = requests.head(url)
+             return response.status_code == 200
+         except requests.ConnectionError:
+             return False
+
+     async def get_links(self, session: ClientSession, website_link: str, base_url: str):
+         html_data = await self.fetch(session, website_link)
+         soup = BeautifulSoup(html_data, "html.parser")
+         list_links = []
+         for link in soup.find_all("a", href=True):
+             href = link["href"].strip()
+             full_url = urljoin(base_url, href)
+             normalized_url = self.normalize_url(full_url)  # sections removed
+             if (
+                 normalized_url not in self.dict_href_links
+                 and self.is_child_url(normalized_url, base_url)
+                 and self.url_exists(normalized_url)
+             ):
+                 self.dict_href_links[normalized_url] = None
+                 list_links.append(normalized_url)
+
+         return list_links
+
+     async def get_subpage_links(
+         self, session: ClientSession, urls: list, base_url: str
+     ):
+         tasks = [self.get_links(session, url, base_url) for url in urls]
+         results = await asyncio.gather(*tasks)
+         all_links = [link for sublist in results for link in sublist]
+         return all_links
+
+     async def get_all_pages(self, url: str, base_url: str):
+         async with aiohttp.ClientSession() as session:
+             dict_links = {url: "Not-checked"}
+             counter = None
+             while counter != 0:
+                 unchecked_links = [
+                     link
+                     for link, status in dict_links.items()
+                     if status == "Not-checked"
+                 ]
+                 if not unchecked_links:
+                     break
+                 new_links = await self.get_subpage_links(
+                     session, unchecked_links, base_url
+                 )
+                 for link in unchecked_links:
+                     dict_links[link] = "Checked"
+                     print(f"Checked: {link}")
+                 dict_links.update(
+                     {
+                         link: "Not-checked"
+                         for link in new_links
+                         if link not in dict_links
+                     }
+                 )
+                 counter = len(
+                     [
+                         status
+                         for status in dict_links.values()
+                         if status == "Not-checked"
+                     ]
+                 )
+
+             checked_urls = [
+                 url for url, status in dict_links.items() if status == "Checked"
+             ]
+             return checked_urls
+
+     def is_webpage(self, url: str) -> bool:
+         try:
+             response = requests.head(url, allow_redirects=True)
+             content_type = response.headers.get("Content-Type", "").lower()
+             return "text/html" in content_type
+         except requests.RequestException:
+             return False
+
+     def clean_url_list(self, urls):
+         files, webpages = [], []
+
+         for url in urls:
+             if self.is_webpage(url):
+                 webpages.append(url)
+             else:
+                 files.append(url)
+
+         return files, webpages
+
+     def is_child_url(self, url, base_url):
+         return url.startswith(base_url)
+
+     def normalize_url(self, url: str):
+         # Strip the fragment identifier
+         defragged_url, _ = urldefrag(url)
+         return defragged_url
+
+
+ def get_urls_from_file(file_path: str):
+     """
+     Function to get urls from a file
+     """
+     with open(file_path, "r") as f:
+         urls = f.readlines()
+         urls = [url.strip() for url in urls]
+     return urls
+
+
+ def get_base_url(url):
+     parsed_url = urlparse(url)
+     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/"
+     return base_url
+
+
+ def get_prompt(config):
+     if config["llm_params"]["use_history"]:
+         if config["llm_params"]["llm_loader"] == "local_llm":
+             custom_prompt_template = tinyllama_prompt_template_with_history
+         elif config["llm_params"]["llm_loader"] == "openai":
+             custom_prompt_template = openai_prompt_template_with_history
+         # else:
+         #     custom_prompt_template = tinyllama_prompt_template_with_history  # default
+         prompt = PromptTemplate(
+             template=custom_prompt_template,
+             input_variables=["context", "chat_history", "question"],
+         )
+     else:
+         if config["llm_params"]["llm_loader"] == "local_llm":
+             custom_prompt_template = tinyllama_prompt_template
+         elif config["llm_params"]["llm_loader"] == "openai":
+             custom_prompt_template = openai_prompt_template
+         # else:
+         #     custom_prompt_template = tinyllama_prompt_template
+         prompt = PromptTemplate(
+             template=custom_prompt_template,
+             input_variables=["context", "question"],
+         )
+     return prompt
+
+
+ def get_sources(res, answer):
+     source_elements = []
+     source_dict = {}  # Dictionary to store URL elements
+
+     for idx, source in enumerate(res["source_documents"]):
+         source_metadata = source.metadata
+         url = source_metadata["source"]
+         score = source_metadata.get("score", "N/A")
+         page = source_metadata.get("page", 1)
+         date = source_metadata.get("date", "N/A")
+
+         url_name = f"{url}_{page}"
+         if url_name not in source_dict:
+             source_dict[url_name] = {
+                 "text": source.page_content,
+                 "url": url,
+                 "score": score,
+                 "page": page,
+                 "date": date,
+             }
+         else:
+             source_dict[url_name]["text"] += f"\n\n{source.page_content}"
+
+     # First, display the answer
+     full_answer = "**Answer:**\n"
+     full_answer += answer
+
+     # Then, display the sources
+     full_answer += "\n\n**Sources:**\n"
+     for idx, (url_name, source_data) in enumerate(source_dict.items()):
+         full_answer += f"\nSource {idx + 1} (Score: {source_data['score']}): {source_data['url']}\n"
+
+         name = f"Source {idx + 1} Text\n"
+         full_answer += name
+         source_elements.append(
+             cl.Text(name=name, content=source_data["text"], display="side")
+         )
+
+         # Add a PDF element if the source is a PDF file
+         if source_data["url"].lower().endswith(".pdf"):
+             name = f"Source {idx + 1} PDF\n"
+             full_answer += name
+             pdf_url = f"{source_data['url']}#page={source_data['page']+1}"
+             source_elements.append(cl.Pdf(name=name, url=pdf_url, display="side"))
+
+     full_answer += "\n**Metadata:**\n"
+     for idx, (url_name, source_data) in enumerate(source_dict.items()):
+         full_answer += f"Source {idx+1} Metadata\n"
+         source_elements.append(
+             cl.Text(
+                 name=f"Source {idx+1} Metadata",
+                 content=f"Page: {source_data['page']}\nDate: {source_data['date']}\n",
+                 display="side",
+             )
+         )
+
+     return full_answer, source_elements
+
+
+ def get_metadata(file_names):
+     """
+     Function to get any additional metadata from the files
+     Returns a dict with the file_name: {metadata: value}
+     """
+     metadata_dict = {}
+     for file in file_names:
+         metadata_dict[file] = {
+             "source_type": "N/A",
+         }
+     return metadata_dict
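The crawler above expands breadth-first: every URL starts as "Not-checked", each pass fetches all unchecked pages and queues newly discovered child links, and the loop ends once nothing is left to check. A minimal driver sketch (the URL is a placeholder):

    import asyncio
    from modules.helpers import WebpageCrawler, get_base_url

    crawler = WebpageCrawler()
    start_url = "https://www.example.edu/program/"  # placeholder
    pages = asyncio.run(crawler.get_all_pages(start_url, start_url))
    print(f"Found {len(pages)} pages under {get_base_url(start_url)}")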
code/modules/llm_tutor.py ADDED
@@ -0,0 +1,92 @@
+ from langchain import PromptTemplate
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain_community.chat_models import ChatOpenAI
+ from langchain_community.embeddings import OpenAIEmbeddings
+ from langchain.vectorstores import FAISS
+ from langchain.chains import RetrievalQA, ConversationalRetrievalChain
+ from langchain.llms import CTransformers
+ from langchain.memory import ConversationBufferWindowMemory
+ from langchain.chains.conversational_retrieval.prompts import QA_PROMPT
+ import os
+ from modules.constants import *
+ from modules.helpers import get_prompt
+ from modules.chat_model_loader import ChatModelLoader
+ from modules.vector_db import VectorDB, VectorDBScore
+
+
+ class LLMTutor:
+     def __init__(self, config, logger=None):
+         self.config = config
+         self.vector_db = VectorDB(config, logger=logger)
+         if self.config["embedding_options"]["embedd_files"]:
+             self.vector_db.create_database()
+             self.vector_db.save_database()
+
+     def set_custom_prompt(self):
+         """
+         Prompt template for QA retrieval for each vectorstore
+         """
+         prompt = get_prompt(self.config)
+         # prompt = QA_PROMPT
+
+         return prompt
+
+     # Retrieval QA Chain
+     def retrieval_qa_chain(self, llm, prompt, db):
+         if self.config["embedding_options"]["db_option"] in ["FAISS", "Chroma"]:
+             retriever = VectorDBScore(
+                 vectorstore=db,
+                 # search_kwargs={
+                 #     "k": self.config["embedding_options"]["search_top_k"],
+                 #     "lambda_mult": self.config["embedding_options"]["lambda_mult"],
+                 # },
+             )
+         elif self.config["embedding_options"]["db_option"] == "RAGatouille":
+             retriever = db.as_langchain_retriever(
+                 k=self.config["embedding_options"]["search_top_k"]
+             )
+         if self.config["llm_params"]["use_history"]:
+             memory = ConversationBufferWindowMemory(
+                 k=self.config["llm_params"]["memory_window"],
+                 memory_key="chat_history",
+                 return_messages=True,
+                 output_key="answer",
+             )
+             qa_chain = ConversationalRetrievalChain.from_llm(
+                 llm=llm,
+                 chain_type="stuff",
+                 retriever=retriever,
+                 return_source_documents=True,
+                 memory=memory,
+                 combine_docs_chain_kwargs={"prompt": prompt},
+             )
+         else:
+             qa_chain = RetrievalQA.from_chain_type(
+                 llm=llm,
+                 chain_type="stuff",
+                 retriever=retriever,
+                 return_source_documents=True,
+                 chain_type_kwargs={"prompt": prompt},
+             )
+         return qa_chain
+
+     # Loading the model
+     def load_llm(self):
+         chat_model_loader = ChatModelLoader(self.config)
+         llm = chat_model_loader.load_chat_model()
+         return llm
+
+     # QA Model Function
+     def qa_bot(self):
+         db = self.vector_db.load_database()
+         self.llm = self.load_llm()
+         qa_prompt = self.set_custom_prompt()
+         qa = self.retrieval_qa_chain(self.llm, qa_prompt, db)
+
+         return qa
+
+     # output function
+     def final_result(self, query):
+         qa_result = self.qa_bot()
+         response = qa_result({"query": query})
+         return response
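A usage sketch for LLMTutor (assuming a config dict shaped like code/config.yml; with the default use_history: True the chain is a ConversationalRetrievalChain, so the input key is "question" and the output key is "answer"):

    import yaml
    from modules.llm_tutor import LLMTutor

    with open("code/config.yml", "r") as f:
        config = yaml.safe_load(f)

    tutor = LLMTutor(config)
    chain = tutor.qa_bot()
    res = chain({"question": "Who is the instructor for this course?"})
    print(res["answer"])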
code/modules/vector_db.py ADDED
@@ -0,0 +1,226 @@
+ import asyncio
+ import logging
+ import os
+ import yaml
+ from langchain_community.vectorstores import FAISS, Chroma
+ from langchain.schema.vectorstore import VectorStoreRetriever
+ from langchain.callbacks.manager import CallbackManagerForRetrieverRun
+ from langchain.schema.document import Document
+ from langchain_core.callbacks import AsyncCallbackManagerForRetrieverRun
+ from ragatouille import RAGPretrainedModel
+
+ try:
+     from modules.embedding_model_loader import EmbeddingModelLoader
+     from modules.data_loader import DataLoader
+     from modules.constants import *
+     from modules.helpers import *
+ except ImportError:
+     from embedding_model_loader import EmbeddingModelLoader
+     from data_loader import DataLoader
+     from constants import *
+     from helpers import *
+
+ from typing import List
+
+
+ class VectorDBScore(VectorStoreRetriever):
+
+     # See https://github.com/langchain-ai/langchain/blob/61dd92f8215daef3d9cf1734b0d1f8c70c1571c3/libs/langchain/langchain/vectorstores/base.py#L500
+     def _get_relevant_documents(
+         self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+     ) -> List[Document]:
+         docs_and_similarities = (
+             self.vectorstore.similarity_search_with_relevance_scores(
+                 query, **self.search_kwargs
+             )
+         )
+         # Make the score part of the document metadata
+         for doc, similarity in docs_and_similarities:
+             doc.metadata["score"] = similarity
+
+         docs = [doc for doc, _ in docs_and_similarities]
+         return docs
+
+     async def _aget_relevant_documents(
+         self, query: str, *, run_manager: AsyncCallbackManagerForRetrieverRun
+     ) -> List[Document]:
+         docs_and_similarities = (
+             self.vectorstore.similarity_search_with_relevance_scores(
+                 query, **self.search_kwargs
+             )
+         )
+         # Make the score part of the document metadata
+         for doc, similarity in docs_and_similarities:
+             doc.metadata["score"] = similarity
+
+         docs = [doc for doc, _ in docs_and_similarities]
+         return docs
+
+
+ class VectorDB:
+     def __init__(self, config, logger=None):
+         self.config = config
+         self.db_option = config["embedding_options"]["db_option"]
+         self.document_names = None
+         self.webpage_crawler = WebpageCrawler()
+
+         # Set up logging to both console and a file
+         if logger is None:
+             self.logger = logging.getLogger(__name__)
+             self.logger.setLevel(logging.INFO)
+
+             # Console Handler
+             console_handler = logging.StreamHandler()
+             console_handler.setLevel(logging.INFO)
+             formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+             console_handler.setFormatter(formatter)
+             self.logger.addHandler(console_handler)
+
+             # File Handler
+             log_file_path = "vector_db.log"  # Change this to your desired log file path
+             file_handler = logging.FileHandler(log_file_path, mode="w")
+             file_handler.setLevel(logging.INFO)
+             file_handler.setFormatter(formatter)
+             self.logger.addHandler(file_handler)
+         else:
+             self.logger = logger
+
+         self.logger.info("VectorDB instance instantiated")
+
+     def load_files(self):
+         files = os.listdir(self.config["embedding_options"]["data_path"])
+         files = [
+             os.path.join(self.config["embedding_options"]["data_path"], file)
+             for file in files
+         ]
+         urls = get_urls_from_file(self.config["embedding_options"]["url_file_path"])
+         if self.config["embedding_options"]["expand_urls"]:
+             all_urls = []
+             for url in urls:
+                 loop = asyncio.get_event_loop()
+                 all_urls.extend(
+                     loop.run_until_complete(
+                         self.webpage_crawler.get_all_pages(
+                             url, url
+                         )  # only get child urls; to get all urls, replace the second argument with the base url
+                     )
+                 )
+             urls = all_urls
+         return files, urls
+
+     def create_embedding_model(self):
+         self.logger.info("Creating embedding function")
+         self.embedding_model_loader = EmbeddingModelLoader(self.config)
+         self.embedding_model = self.embedding_model_loader.load_embedding_model()
+
+     def initialize_database(
+         self,
+         document_chunks: list,
+         document_names: list,
+         documents: list,
+         document_metadata: list,
+     ):
+         if self.db_option in ["FAISS", "Chroma"]:
+             self.create_embedding_model()
+         # Track token usage
+         self.logger.info("Initializing vector_db")
+         self.logger.info("\tUsing {} as db_option".format(self.db_option))
+         if self.db_option == "FAISS":
+             self.vector_db = FAISS.from_documents(
+                 documents=document_chunks, embedding=self.embedding_model
+             )
+         elif self.db_option == "Chroma":
+             self.vector_db = Chroma.from_documents(
+                 documents=document_chunks,
+                 embedding=self.embedding_model,
+                 persist_directory=os.path.join(
+                     self.config["embedding_options"]["db_path"],
+                     "db_"
+                     + self.config["embedding_options"]["db_option"]
+                     + "_"
+                     + self.config["embedding_options"]["model"],
+                 ),
+             )
+         elif self.db_option == "RAGatouille":
+             self.RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
+             index_path = self.RAG.index(
+                 index_name="new_idx",
+                 collection=documents,
+                 document_ids=document_names,
+                 document_metadatas=document_metadata,
+             )
+         self.logger.info("Completed initializing vector_db")
+
+     def create_database(self):
+         data_loader = DataLoader(self.config)
+         self.logger.info("Loading data")
+         files, urls = self.load_files()
+         files, webpages = self.webpage_crawler.clean_url_list(urls)
+         if "storage/data/urls.txt" in files:
+             files.remove("storage/data/urls.txt")
+         document_chunks, document_names, documents, document_metadata = (
+             data_loader.get_chunks(files, webpages)
+         )
+         self.logger.info("Completed loading data")
+         self.initialize_database(
+             document_chunks, document_names, documents, document_metadata
+         )
+
+     def save_database(self):
+         if self.db_option == "FAISS":
+             self.vector_db.save_local(
+                 os.path.join(
+                     self.config["embedding_options"]["db_path"],
+                     "db_"
+                     + self.config["embedding_options"]["db_option"]
+                     + "_"
+                     + self.config["embedding_options"]["model"],
+                 )
+             )
+         elif self.db_option == "Chroma":
+             # db is saved in the persist directory during initialization
+             pass
+         elif self.db_option == "RAGatouille":
+             # index is saved during initialization
+             pass
+         self.logger.info("Saved database")
+
+     def load_database(self):
+         self.create_embedding_model()
+         if self.db_option == "FAISS":
+             self.vector_db = FAISS.load_local(
+                 os.path.join(
+                     self.config["embedding_options"]["db_path"],
+                     "db_"
+                     + self.config["embedding_options"]["db_option"]
+                     + "_"
+                     + self.config["embedding_options"]["model"],
+                 ),
+                 self.embedding_model,
+                 allow_dangerous_deserialization=True,
+             )
+         elif self.db_option == "Chroma":
+             self.vector_db = Chroma(
+                 persist_directory=os.path.join(
+                     self.config["embedding_options"]["db_path"],
+                     "db_"
+                     + self.config["embedding_options"]["db_option"]
+                     + "_"
+                     + self.config["embedding_options"]["model"],
+                 ),
+                 embedding_function=self.embedding_model,
+             )
+         elif self.db_option == "RAGatouille":
+             self.vector_db = RAGPretrainedModel.from_index(
+                 ".ragatouille/colbert/indexes/new_idx"
+             )
+         self.logger.info("Loaded database")
+         return self.vector_db
+
+
+ if __name__ == "__main__":
+     with open("code/config.yml", "r") as f:
+         config = yaml.safe_load(f)
+         print(config)
+     vector_db = VectorDB(config)
+     vector_db.create_database()
+     vector_db.save_database()
public/logo_dark.png ADDED
public/logo_light.png ADDED
public/test.css ADDED
@@ -0,0 +1,16 @@
+ a[href*='https://github.com/Chainlit/chainlit'] {
+     visibility: hidden;
+ }
+
+ .message-avatar .MuiAvatar-root {
+     background-color: transparent; /* Remove the background color */
+     color: #FFFFFF; /* Change this to your desired text color */
+     border: 0.25px solid #FFFFFF; /* Add a white border for the circle */
+     border-radius: 50%; /* Ensure the avatar remains circular */
+     background-image: url('http://localhost:8051/logo?theme=dark'); /* Path to your logo */
+     background-size: cover; /* Ensure the logo covers the entire avatar */
+     background-position: center; /* Center the logo */
+     background-repeat: no-repeat; /* Prevent the logo from repeating */
+     width: 38px; /* Adjust the width as needed */
+     height: 38px; /* Adjust the height as needed */
+ }
requirements.txt ADDED
@@ -0,0 +1,19 @@
+ # Automatically generated by https://github.com/damnever/pigar.
+
+ beautifulsoup4==4.12.3
+ chainlit==1.1.202
+ langchain==0.1.20
+ langchain-community==0.0.38
+ langchain-core==0.1.52
+ llama-parse==0.4.4
+ pysrt==1.1.2
+ python-dotenv==1.0.1
+ PyYAML==6.0.1
+ RAGatouille==0.0.8.post2
+ requests==2.32.3
+ torch==2.3.1
+ tqdm==4.66.4
+ transformers==4.41.2
+ llama-cpp-python==0.2.77
+ fake_useragent==1.5.1
+ chromadb==0.5.0
storage/data/urls.txt ADDED
@@ -0,0 +1 @@
+ https://www.bu.edu/cds-faculty/programs-admissions/ms-data-science/