Synced repo using 'sync_with_huggingface' Github Action
- .chainlit/config.toml +121 -0
- .chainlit/translations/en-US.json +231 -0
- Dockerfile +29 -0
- Dockerfile.dev +27 -0
- LICENSE +21 -0
- chainlit.md +10 -0
- code/config.yml +32 -0
- code/main.py +124 -0
- code/modules/__init__.py +0 -0
- code/modules/chat_model_loader.py +39 -0
- code/modules/constants.py +81 -0
- code/modules/data_loader.py +290 -0
- code/modules/embedding_model_loader.py +38 -0
- code/modules/helpers.py +244 -0
- code/modules/llm_tutor.py +92 -0
- code/modules/vector_db.py +226 -0
- public/logo_dark.png +0 -0
- public/logo_light.png +0 -0
- public/test.css +16 -0
- requirements.txt +19 -0
- storage/data/urls.txt +1 -0
.chainlit/config.toml
ADDED
@@ -0,0 +1,121 @@
[project]
# Whether to enable telemetry (default: true). No personal data is collected.
enable_telemetry = true

# List of environment variables to be provided by each user to use the app.
user_env = []

# Duration (in seconds) during which the session is saved when the connection is lost
session_timeout = 3600

# Enable third parties caching (e.g LangChain cache)
cache = false

# Authorized origins
allow_origins = ["*"]

# Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
# follow_symlink = false

[features]
# Show the prompt playground
prompt_playground = true

# Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
unsafe_allow_html = false

# Process and display mathematical expressions. This can clash with "$" characters in messages.
latex = false

# Automatically tag threads with the current chat profile (if a chat profile is used)
auto_tag_thread = true

# Authorize users to spontaneously upload files with messages
[features.spontaneous_file_upload]
enabled = true
accept = ["*/*"]
max_files = 20
max_size_mb = 500

[features.audio]
# Threshold for audio recording
min_decibels = -45
# Delay for the user to start speaking in MS
initial_silence_timeout = 3000
# Delay for the user to continue speaking in MS. If the user stops speaking for this duration, the recording will stop.
silence_timeout = 1500
# Above this duration (MS), the recording will forcefully stop.
max_duration = 15000
# Duration of the audio chunks in MS
chunk_duration = 1000
# Sample rate of the audio
sample_rate = 44100

[UI]
# Name of the app and chatbot.
name = "SCC Sherpa"

# Show the readme while the thread is empty.
show_readme_as_default = true

# Description of the app and chatbot. This is used for HTML tags.
# description = "SCC Guide"

# Large size content are by default collapsed for a cleaner ui
default_collapse_content = true

# The default value for the expand messages settings.
default_expand_messages = false

# Hide the chain of thought details from the user in the UI.
hide_cot = false

# Link to your github repo. This will add a github button in the UI's header.
# github = ""

# Specify a CSS file that can be used to customize the user interface.
# The CSS file can be served from the public directory or via an external link.
custom_css = "/public/test.css"

# Specify a Javascript file that can be used to customize the user interface.
# The Javascript file can be served from the public directory.
# custom_js = "/public/test.js"

# Specify a custom font url.
# custom_font = "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap"

# Specify a custom meta image url.
# custom_meta_image_url = "https://chainlit-cloud.s3.eu-west-3.amazonaws.com/logo/chainlit_banner.png"

# Specify a custom build directory for the frontend.
# This can be used to customize the frontend code.
# Be careful: If this is a relative path, it should not start with a slash.
# custom_build = "./public/build"

[UI.theme]
#layout = "wide"
#font_family = "Inter, sans-serif"
# Override default MUI light theme. (Check theme.ts)
[UI.theme.light]
#background = "#FAFAFA"
#paper = "#FFFFFF"

[UI.theme.light.primary]
#main = "#F80061"
#dark = "#980039"
#light = "#FFE7EB"

# Override default MUI dark theme. (Check theme.ts)
[UI.theme.dark]
background = "#1C1C1C" # Slightly lighter dark background color
paper = "#2A2A2A" # Slightly lighter dark paper color

[UI.theme.dark.primary]
main = "#89CFF0" # Primary color
dark = "#3700B3" # Dark variant of primary color
light = "#CFBCFF" # Lighter variant of primary color

[meta]
generated_by = "1.1.202"
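
As a quick sanity check that the TOML above stays parseable, it can be read with the standard library. A minimal sketch, assuming Python 3.11+ (for `tomllib`) and that it is run from the repo root:

import tomllib  # standard library in Python 3.11+

# Parse the Chainlit config and spot-check a few of the values set above.
with open(".chainlit/config.toml", "rb") as f:
    cfg = tomllib.load(f)

assert cfg["UI"]["name"] == "SCC Sherpa"
assert cfg["features"]["spontaneous_file_upload"]["max_size_mb"] == 500
print(cfg["UI"]["theme"]["dark"]["primary"]["main"])  # "#89CFF0"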
.chainlit/translations/en-US.json
ADDED
@@ -0,0 +1,231 @@
{
  "components": {
    "atoms": {
      "buttons": {
        "userButton": {
          "menu": {
            "settings": "Settings",
            "settingsKey": "S",
            "APIKeys": "API Keys",
            "logout": "Logout"
          }
        }
      }
    },
    "molecules": {
      "newChatButton": {
        "newChat": "New Chat"
      },
      "tasklist": {
        "TaskList": {
          "title": "\ud83d\uddd2\ufe0f Task List",
          "loading": "Loading...",
          "error": "An error occurred"
        }
      },
      "attachments": {
        "cancelUpload": "Cancel upload",
        "removeAttachment": "Remove attachment"
      },
      "newChatDialog": {
        "createNewChat": "Create new chat?",
        "clearChat": "This will clear the current messages and start a new chat.",
        "cancel": "Cancel",
        "confirm": "Confirm"
      },
      "settingsModal": {
        "settings": "Settings",
        "expandMessages": "Expand Messages",
        "hideChainOfThought": "Hide Chain of Thought",
        "darkMode": "Dark Mode"
      },
      "detailsButton": {
        "using": "Using",
        "running": "Running",
        "took_one": "Took {{count}} step",
        "took_other": "Took {{count}} steps"
      },
      "auth": {
        "authLogin": {
          "title": "Login to access the app.",
          "form": {
            "email": "Email address",
            "password": "Password",
            "noAccount": "Don't have an account?",
            "alreadyHaveAccount": "Already have an account?",
            "signup": "Sign Up",
            "signin": "Sign In",
            "or": "OR",
            "continue": "Continue",
            "forgotPassword": "Forgot password?",
            "passwordMustContain": "Your password must contain:",
            "emailRequired": "email is a required field",
            "passwordRequired": "password is a required field"
          },
          "error": {
            "default": "Unable to sign in.",
            "signin": "Try signing in with a different account.",
            "oauthsignin": "Try signing in with a different account.",
            "redirect_uri_mismatch": "The redirect URI is not matching the oauth app configuration.",
            "oauthcallbackerror": "Try signing in with a different account.",
            "oauthcreateaccount": "Try signing in with a different account.",
            "emailcreateaccount": "Try signing in with a different account.",
            "callback": "Try signing in with a different account.",
            "oauthaccountnotlinked": "To confirm your identity, sign in with the same account you used originally.",
            "emailsignin": "The e-mail could not be sent.",
            "emailverify": "Please verify your email, a new email has been sent.",
            "credentialssignin": "Sign in failed. Check the details you provided are correct.",
            "sessionrequired": "Please sign in to access this page."
          }
        },
        "authVerifyEmail": {
          "almostThere": "You're almost there! We've sent an email to ",
          "verifyEmailLink": "Please click on the link in that email to complete your signup.",
          "didNotReceive": "Can't find the email?",
          "resendEmail": "Resend email",
          "goBack": "Go Back",
          "emailSent": "Email sent successfully.",
          "verifyEmail": "Verify your email address"
        },
        "providerButton": {
          "continue": "Continue with {{provider}}",
          "signup": "Sign up with {{provider}}"
        },
        "authResetPassword": {
          "newPasswordRequired": "New password is a required field",
          "passwordsMustMatch": "Passwords must match",
          "confirmPasswordRequired": "Confirm password is a required field",
          "newPassword": "New password",
          "confirmPassword": "Confirm password",
          "resetPassword": "Reset Password"
        },
        "authForgotPassword": {
          "email": "Email address",
          "emailRequired": "email is a required field",
          "emailSent": "Please check the email address {{email}} for instructions to reset your password.",
          "enterEmail": "Enter your email address and we will send you instructions to reset your password.",
          "resendEmail": "Resend email",
          "continue": "Continue",
          "goBack": "Go Back"
        }
      }
    },
    "organisms": {
      "chat": {
        "history": {
          "index": {
            "showHistory": "Show history",
            "lastInputs": "Last Inputs",
            "noInputs": "Such empty...",
            "loading": "Loading..."
          }
        },
        "inputBox": {
          "input": {
            "placeholder": "Type your message here..."
          },
          "speechButton": {
            "start": "Start recording",
            "stop": "Stop recording"
          },
          "SubmitButton": {
            "sendMessage": "Send message",
            "stopTask": "Stop Task"
          },
          "UploadButton": {
            "attachFiles": "Attach files"
          },
          "waterMark": {
            "text": "Built with"
          }
        },
        "Messages": {
          "index": {
            "running": "Running",
            "executedSuccessfully": "executed successfully",
            "failed": "failed",
            "feedbackUpdated": "Feedback updated",
            "updating": "Updating"
          }
        },
        "dropScreen": {
          "dropYourFilesHere": "Drop your files here"
        },
        "index": {
          "failedToUpload": "Failed to upload",
          "cancelledUploadOf": "Cancelled upload of",
          "couldNotReachServer": "Could not reach the server",
          "continuingChat": "Continuing previous chat"
        },
        "settings": {
          "settingsPanel": "Settings panel",
          "reset": "Reset",
          "cancel": "Cancel",
          "confirm": "Confirm"
        }
      },
      "threadHistory": {
        "sidebar": {
          "filters": {
            "FeedbackSelect": {
              "feedbackAll": "Feedback: All",
              "feedbackPositive": "Feedback: Positive",
              "feedbackNegative": "Feedback: Negative"
            },
            "SearchBar": {
              "search": "Search"
            }
          },
          "DeleteThreadButton": {
            "confirmMessage": "This will delete the thread as well as its messages and elements.",
            "cancel": "Cancel",
            "confirm": "Confirm",
            "deletingChat": "Deleting chat",
            "chatDeleted": "Chat deleted"
          },
          "index": {
            "pastChats": "Past Chats"
          },
          "ThreadList": {
            "empty": "Empty...",
            "today": "Today",
            "yesterday": "Yesterday",
            "previous7days": "Previous 7 days",
            "previous30days": "Previous 30 days"
          },
          "TriggerButton": {
            "closeSidebar": "Close sidebar",
            "openSidebar": "Open sidebar"
          }
        },
        "Thread": {
          "backToChat": "Go back to chat",
          "chatCreatedOn": "This chat was created on"
        }
      },
      "header": {
        "chat": "Chat",
        "readme": "Readme"
      }
    }
  },
  "hooks": {
    "useLLMProviders": {
      "failedToFetchProviders": "Failed to fetch providers:"
    }
  },
  "pages": {
    "Design": {},
    "Env": {
      "savedSuccessfully": "Saved successfully",
      "requiredApiKeys": "Required API Keys",
      "requiredApiKeysInfo": "To use this app, the following API keys are required. The keys are stored on your device's local storage."
    },
    "Page": {
      "notPartOfProject": "You are not part of this project."
    },
    "ResumeButton": {
      "resumeChat": "Resume Chat"
    }
  }
}
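
Chainlit loads this translation file as-is, so it has to remain valid JSON. A minimal sanity-check sketch, assuming it is run from the repo root:

import json

with open(".chainlit/translations/en-US.json", "r", encoding="utf-8") as f:
    translations = json.load(f)  # raises ValueError if the file is malformed

# Spot-check a couple of leaf strings used by the UI.
print(translations["components"]["molecules"]["newChatButton"]["newChat"])  # New Chat
print(translations["pages"]["ResumeButton"]["resumeChat"])  # Resume Chat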
Dockerfile
ADDED
@@ -0,0 +1,29 @@
FROM python:3.9

WORKDIR /code

COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

RUN pip install --no-cache-dir transformers==4.36.2 torch==2.1.2

RUN pip install --upgrade --force-reinstall --no-cache-dir llama-cpp-python==0.2.32

COPY . /code

RUN ls -R

# Change permissions to allow writing to the directory
RUN chmod -R 777 /code

# Create a logs directory and set permissions
RUN mkdir /code/logs && chmod 777 /code/logs

# Create a cache directory within the application's working directory
RUN mkdir /.cache && chmod -R 777 /.cache

# Mount build secrets. A RUN instruction needs a command, so `true` is assumed
# here as a no-op: with required=true the build fails if a secret is missing,
# and the secret file is only visible under /run/secrets during that RUN step.
RUN --mount=type=secret,id=HUGGINGFACEHUB_API_TOKEN,mode=0444,required=true true
RUN --mount=type=secret,id=OPENAI_API_KEY,mode=0444,required=true true

CMD python code/modules/vector_db.py && chainlit run code/main.py --host 0.0.0.0 --port 7860
Dockerfile.dev
ADDED
@@ -0,0 +1,27 @@
FROM python:3.11

WORKDIR /code

COPY ./requirements.txt /code/requirements.txt

RUN pip install --upgrade pip

RUN pip install --no-cache-dir -r /code/requirements.txt

COPY . /code

RUN ls -R

# Change permissions to allow writing to the directory
RUN chmod -R 777 /code

# Create a logs directory and set permissions
RUN mkdir /code/logs && chmod 777 /code/logs

# Create a cache directory within the application's working directory
RUN mkdir /.cache && chmod -R 777 /.cache

# Expose the port the app runs on
EXPOSE 8051

CMD chainlit run code/main.py --port 8051
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 DL4DS

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
chainlit.md
ADDED
@@ -0,0 +1,10 @@
# Welcome to DL4DS Tutor! 🚀🤖

Hi there! This is an LLM chatbot designed to help answer your questions.
It is still very much a work in progress.

### --- Please wait while the Tutor loads... ---

## Useful Links 🔗

- **Documentation:** [Chainlit Documentation](https://docs.chainlit.io) 📚
code/config.yml
ADDED
@@ -0,0 +1,32 @@
embedding_options:
  embedd_files: False # bool
  data_path: 'storage/data' # str
  url_file_path: 'storage/data/urls.txt' # str
  expand_urls: True # bool
  db_option: 'FAISS' # str [FAISS, Chroma, RAGatouille]
  db_path: 'vectorstores' # str
  model: 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002]
  search_top_k: 3 # int
  score_threshold: 0.0 # float
  lambda_mult: 0.5 # float - determines diversity of the retrieved results
llm_params:
  use_history: True # bool
  memory_window: 3 # int
  llm_loader: 'local_llm' # str [local_llm, openai]
  openai_params:
    model: 'gpt-4' # str [gpt-3.5-turbo-1106, gpt-4]
  local_llm_params:
    model: "storage/models/tinyllama-1.1b-chat-v0.3.Q5_K_M.gguf"
    model_type: "llama"
    temperature: 0.2
splitter_options:
  use_splitter: True # bool
  split_by_token: True # bool
  remove_leftover_delimiters: True # bool
  remove_chunks: False # bool
  chunk_size: 300 # int
  chunk_overlap: 30 # int
  chunk_separators: ["\n\n", "\n", " ", ""] # list of strings
  front_chunks_to_remove: null # int or None
  last_chunks_to_remove: null # int or None
  delimiters_to_remove: ['\t', '\n', ' ', ' '] # list of strings
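
`code/main.py` reads this file with `yaml.safe_load`, so every value above arrives as a native Python type. A minimal sketch of loading and inspecting it, assuming it is run from the repo root:

import yaml

with open("code/config.yml", "r") as f:
    config = yaml.safe_load(f)

# Nested keys mirror the sections above; booleans, ints, and floats parse natively.
print(config["embedding_options"]["db_option"])           # FAISS
print(config["llm_params"]["local_llm_params"]["model"])  # storage/models/tinyllama-1.1b-chat-v0.3.Q5_K_M.gguf
print(config["splitter_options"]["chunk_size"])           # 300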
code/main.py
ADDED
@@ -0,0 +1,124 @@
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import CTransformers
import chainlit as cl
from langchain_community.chat_models import ChatOpenAI
from langchain_community.embeddings import OpenAIEmbeddings
import yaml
import logging
from dotenv import load_dotenv

from modules.llm_tutor import LLMTutor
from modules.constants import *
from modules.helpers import get_sources


logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Console Handler
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

# File Handler
log_file_path = "log_file.log"  # Change this to your desired log file path
file_handler = logging.FileHandler(log_file_path)
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)


# Adding option to select the chat profile
@cl.set_chat_profiles
async def chat_profile():
    return [
        cl.ChatProfile(
            name="Llama",
            markdown_description="Use the local LLM: **Tiny Llama**.",
        ),
        # cl.ChatProfile(
        #     name="Mistral",
        #     markdown_description="Use the local LLM: **Mistral**.",
        # ),
        cl.ChatProfile(
            name="gpt-3.5-turbo-1106",
            markdown_description="Use OpenAI API for **gpt-3.5-turbo-1106**.",
        ),
        cl.ChatProfile(
            name="gpt-4",
            markdown_description="Use OpenAI API for **gpt-4**.",
        ),
    ]


@cl.author_rename
def rename(orig_author: str):
    rename_dict = {"Chatbot": "AI Tutor"}
    return rename_dict.get(orig_author, orig_author)


# chainlit code
@cl.on_chat_start
async def start():
    with open("code/config.yml", "r") as f:
        config = yaml.safe_load(f)
    print(config)
    logger.info("Config file loaded")
    logger.info(f"Config: {config}")
    logger.info("Creating llm_tutor instance")

    chat_profile = cl.user_session.get("chat_profile")
    if chat_profile is not None:
        if chat_profile.lower() in ["gpt-3.5-turbo-1106", "gpt-4"]:
            config["llm_params"]["llm_loader"] = "openai"
            config["llm_params"]["openai_params"]["model"] = chat_profile.lower()
        elif chat_profile.lower() == "llama":
            config["llm_params"]["llm_loader"] = "local_llm"
            config["llm_params"]["local_llm_params"]["model"] = LLAMA_PATH
            config["llm_params"]["local_llm_params"]["model_type"] = "llama"
        elif chat_profile.lower() == "mistral":
            config["llm_params"]["llm_loader"] = "local_llm"
            config["llm_params"]["local_llm_params"]["model"] = MISTRAL_PATH
            config["llm_params"]["local_llm_params"]["model_type"] = "mistral"

    llm_tutor = LLMTutor(config, logger=logger)

    chain = llm_tutor.qa_bot()
    model = config["llm_params"]["local_llm_params"]["model"]
    msg = cl.Message(content=f"Starting the bot {model}...")
    await msg.send()
    msg.content = f"{opening_message}"
    await msg.update()

    cl.user_session.set("chain", chain)


@cl.on_message
async def main(message):
    user = cl.user_session.get("user")
    chain = cl.user_session.get("chain")
    # cb = cl.AsyncLangchainCallbackHandler(
    #     stream_final_answer=True, answer_prefix_tokens=["FINAL", "ANSWER"]
    # )
    # cb.answer_reached = True
    # res = await chain.acall(message, callbacks=[cb])
    res = await chain.acall(message.content)
    print(f"response: {res}")
    # ConversationalRetrievalChain returns "answer"; RetrievalQA returns "result"
    try:
        answer = res["answer"]
    except KeyError:
        answer = res["result"]
    print(f"answer: {answer}")

    answer_with_sources, source_elements = get_sources(res, answer)

    await cl.Message(content=answer_with_sources, elements=source_elements).send()
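
The `try`/`except KeyError` in `main` exists because the two chain types built by `LLMTutor` return different keys: `ConversationalRetrievalChain` (history enabled) puts the reply under `answer`, while `RetrievalQA` uses `result`. A minimal sketch of that normalization in isolation:

def extract_answer(res: dict) -> str:
    # ConversationalRetrievalChain -> "answer"; RetrievalQA -> "result"
    if "answer" in res:
        return res["answer"]
    return res["result"]

print(extract_answer({"answer": "42", "source_documents": []}))  # 42
print(extract_answer({"result": "42", "source_documents": []}))  # 42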
code/modules/__init__.py
ADDED
File without changes
code/modules/chat_model_loader.py
ADDED
@@ -0,0 +1,39 @@
from langchain_community.chat_models import ChatOpenAI
from langchain.llms import CTransformers
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoTokenizer, TextStreamer
from langchain.llms import LlamaCpp
import torch
import transformers
import os
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler


class ChatModelLoader:
    def __init__(self, config):
        self.config = config
        self.huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

    def load_chat_model(self):
        if self.config["llm_params"]["llm_loader"] == "openai":
            llm = ChatOpenAI(
                model_name=self.config["llm_params"]["openai_params"]["model"]
            )
        elif self.config["llm_params"]["llm_loader"] == "local_llm":
            n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
            model_path = self.config["llm_params"]["local_llm_params"]["model"]
            llm = LlamaCpp(
                model_path=model_path,
                n_batch=n_batch,
                n_ctx=2048,
                f16_kv=True,
                verbose=True,
                n_threads=2,
                temperature=self.config["llm_params"]["local_llm_params"]["temperature"],
            )
        else:
            raise ValueError("Invalid LLM Loader")
        return llm
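
A minimal sketch of driving `ChatModelLoader` on its own, assuming the GGUF file from `config.yml` exists at the given path and a langchain version that exposes the Runnable `invoke` API:

from modules.chat_model_loader import ChatModelLoader

config = {
    "llm_params": {
        "llm_loader": "local_llm",
        "openai_params": {"model": "gpt-4"},
        "local_llm_params": {
            "model": "storage/models/tinyllama-1.1b-chat-v0.3.Q5_K_M.gguf",
            "model_type": "llama",
            "temperature": 0.2,
        },
    }
}

llm = ChatModelLoader(config).load_chat_model()  # a LlamaCpp instance for this config
print(llm.invoke("Q: What is 2 + 2? A:"))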
code/modules/constants.py
ADDED
@@ -0,0 +1,81 @@
from dotenv import load_dotenv
import os

load_dotenv()

# API Keys - Loaded from the .env file

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

opening_message = "Hello! You can ask me questions about the MSDS program at Boston University."

# Prompt Templates

openai_prompt_template = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

openai_prompt_template_with_history = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use the history to answer the question if you can.
Chat History:
{chat_history}
Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

tinyllama_prompt_template = """
<|im_start|>system
Assistant is an intelligent chatbot designed to help students with questions regarding the course. Only answer questions using the context below and if you're not sure of an answer, you can say "I don't know". Always give a brief and concise answer to the question. Use the history to answer the question if you can.

Context:
{context}
<|im_end|>
<|im_start|>user
Question: Who is the instructor for this course?
<|im_end|>
<|im_start|>assistant
The instructor for this course is Prof. Thomas Gardos.
<|im_end|>
<|im_start|>user
Question: {question}
<|im_end|>
<|im_start|>assistant
"""

tinyllama_prompt_template_with_history = """
<|im_start|>system
Assistant is an intelligent chatbot designed to help students with questions regarding the course. Only answer questions using the context below and if you're not sure of an answer, you can say "I don't know". Always give a brief and concise answer to the question.

Chat History:
{chat_history}
Context:
{context}
<|im_end|>
<|im_start|>user
Question: Who is the instructor for this course?
<|im_end|>
<|im_start|>assistant
The instructor for this course is Prof. Thomas Gardos.
<|im_end|>
<|im_start|>user
Question: {question}
<|im_end|>
<|im_start|>assistant
"""


# Model Paths

LLAMA_PATH = "storage/models/tinyllama-1.1b-chat-v0.3.Q5_K_M.gguf"
MISTRAL_PATH = "storage/models/mistral-7b-v0.1.Q4_K_M.gguf"
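
A minimal sketch of rendering one of these templates through LangChain's `PromptTemplate`, which is what `helpers.get_prompt` does with them:

from langchain import PromptTemplate
from modules.constants import openai_prompt_template

prompt = PromptTemplate(
    template=openai_prompt_template,
    input_variables=["context", "question"],
)
print(prompt.format(
    context="The MSDS program is offered by Boston University.",
    question="Who offers the MSDS program?",
))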
code/modules/data_loader.py
ADDED
@@ -0,0 +1,290 @@
import os
import re
import requests
import pysrt
import tempfile  # needed by download_pdf_from_url (missing in the original)
import PyPDF2  # needed by extract_text_from_pdf (missing in the original)
from langchain_community.document_loaders import (
    PyMuPDFLoader,
    Docx2txtLoader,
    YoutubeLoader,
    WebBaseLoader,
    TextLoader,
)
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from llama_parse import LlamaParse
from langchain.schema import Document
import logging
from langchain.text_splitter import RecursiveCharacterTextSplitter
from ragatouille import RAGPretrainedModel
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain import PromptTemplate

try:
    from modules.helpers import get_metadata
except ImportError:
    from helpers import get_metadata

logger = logging.getLogger(__name__)


class PDFReader:
    def __init__(self):
        pass

    def get_loader(self, pdf_path):
        loader = PyMuPDFLoader(pdf_path)
        return loader

    def get_documents(self, loader):
        return loader.load()


class FileReader:
    def __init__(self):
        self.pdf_reader = PDFReader()

    def extract_text_from_pdf(self, pdf_path):
        text = ""
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            num_pages = len(reader.pages)
            for page_num in range(num_pages):
                page = reader.pages[page_num]
                text += page.extract_text()
        return text

    def download_pdf_from_url(self, pdf_url):
        response = requests.get(pdf_url)
        if response.status_code == 200:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
                temp_file.write(response.content)
                temp_file_path = temp_file.name
            return temp_file_path
        else:
            print("Failed to download PDF from URL:", pdf_url)
            return None

    def read_pdf(self, temp_file_path: str):
        loader = self.pdf_reader.get_loader(temp_file_path)
        documents = self.pdf_reader.get_documents(loader)
        return documents

    def read_txt(self, temp_file_path: str):
        loader = TextLoader(temp_file_path, autodetect_encoding=True)
        return loader.load()

    def read_docx(self, temp_file_path: str):
        loader = Docx2txtLoader(temp_file_path)
        return loader.load()

    def read_srt(self, temp_file_path: str):
        subs = pysrt.open(temp_file_path)
        text = ""
        for sub in subs:
            text += sub.text
        return [Document(page_content=text)]

    def read_youtube_transcript(self, url: str):
        loader = YoutubeLoader.from_youtube_url(
            url, add_video_info=True, language=["en"], translation="en"
        )
        return loader.load()

    def read_html(self, url: str):
        loader = WebBaseLoader(url)
        return loader.load()

    def read_tex_from_url(self, tex_url):
        response = requests.get(tex_url)
        if response.status_code == 200:
            return [Document(page_content=response.text)]
        else:
            print("Failed to fetch .tex file from URL:", tex_url)
            return None


class ChunkProcessor:
    def __init__(self, config):
        self.config = config

        if config["splitter_options"]["use_splitter"]:
            if config["splitter_options"]["split_by_token"]:
                self.splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                    chunk_size=config["splitter_options"]["chunk_size"],
                    chunk_overlap=config["splitter_options"]["chunk_overlap"],
                    separators=config["splitter_options"]["chunk_separators"],
                    disallowed_special=(),
                )
            else:
                self.splitter = RecursiveCharacterTextSplitter(
                    chunk_size=config["splitter_options"]["chunk_size"],
                    chunk_overlap=config["splitter_options"]["chunk_overlap"],
                    separators=config["splitter_options"]["chunk_separators"],
                    disallowed_special=(),
                )
        else:
            self.splitter = None
        logger.info("ChunkProcessor instance created")

    def remove_delimiters(self, document_chunks: list):
        for chunk in document_chunks:
            for delimiter in self.config["splitter_options"]["delimiters_to_remove"]:
                chunk.page_content = re.sub(delimiter, " ", chunk.page_content)
        return document_chunks

    def remove_chunks(self, document_chunks: list):
        # Key name matches config.yml ("front_chunks_to_remove"; the original
        # read the misspelled "front_chunk_to_remove")
        front = self.config["splitter_options"]["front_chunks_to_remove"]
        end = self.config["splitter_options"]["last_chunks_to_remove"]
        for _ in range(front):
            del document_chunks[0]
        for _ in range(end):
            document_chunks.pop()
        logger.info(f"\tNumber of pages after skipping: {len(document_chunks)}")
        return document_chunks

    def process_chunks(
        self, documents, file_type="txt", source="", page=0, metadata={}
    ):
        documents = [Document(page_content=documents, source=source, page=page)]
        if (
            file_type == "txt"
            or file_type == "docx"
            or file_type == "srt"
            or file_type == "tex"
        ):
            document_chunks = self.splitter.split_documents(documents)
        elif file_type == "pdf":
            document_chunks = documents  # Full page for now

        # add the source and page number back to the metadata
        for chunk in document_chunks:
            chunk.metadata["source"] = source
            chunk.metadata["page"] = page

            # add the metadata extracted from the document
            for key, value in metadata.items():
                chunk.metadata[key] = value

        if self.config["splitter_options"]["remove_leftover_delimiters"]:
            document_chunks = self.remove_delimiters(document_chunks)
        if self.config["splitter_options"]["remove_chunks"]:
            document_chunks = self.remove_chunks(document_chunks)

        return document_chunks

    def get_chunks(self, file_reader, uploaded_files, weblinks):
        self.document_chunks_full = []
        self.parent_document_names = []
        self.child_document_names = []
        self.documents = []
        self.document_metadata = []

        addl_metadata = get_metadata(uploaded_files)  # For any additional metadata

        for file_index, file_path in enumerate(uploaded_files):
            file_name = os.path.basename(file_path)
            if file_name not in self.parent_document_names:
                file_type = file_name.split(".")[-1].lower()

                # try:
                if file_type == "pdf":
                    documents = file_reader.read_pdf(file_path)
                elif file_type == "txt":
                    documents = file_reader.read_txt(file_path)
                elif file_type == "docx":
                    documents = file_reader.read_docx(file_path)
                elif file_type == "srt":
                    documents = file_reader.read_srt(file_path)
                elif file_type == "tex":
                    documents = file_reader.read_tex_from_url(file_path)
                else:
                    logger.warning(f"Unsupported file type: {file_type}")
                    continue

                for doc in documents:
                    page_num = doc.metadata.get("page", 0)
                    self.documents.append(doc.page_content)
                    self.document_metadata.append(
                        {"source": file_path, "page": page_num}
                    )
                    metadata = addl_metadata.get(file_path, {})
                    self.document_metadata[-1].update(metadata)

                    self.child_document_names.append(f"{file_name}_{page_num}")

                self.parent_document_names.append(file_name)
                if self.config["embedding_options"]["db_option"] not in [
                    "RAGatouille"
                ]:
                    document_chunks = self.process_chunks(
                        self.documents[-1],
                        file_type,
                        source=file_path,
                        page=page_num,
                        metadata=metadata,
                    )
                    self.document_chunks_full.extend(document_chunks)

                # except Exception as e:
                #     logger.error(f"Error processing file {file_name}: {str(e)}")

        self.process_weblinks(file_reader, weblinks)

        logger.info(
            f"Total document chunks extracted: {len(self.document_chunks_full)}"
        )
        return (
            self.document_chunks_full,
            self.child_document_names,
            self.documents,
            self.document_metadata,
        )

    def process_weblinks(self, file_reader, weblinks):
        if weblinks[0] != "":
            logger.info(f"Splitting weblinks: total of {len(weblinks)}")

            for link_index, link in enumerate(weblinks):
                if link not in self.parent_document_names:
                    try:
                        logger.info(f"\tSplitting link {link_index+1} : {link}")
                        if "youtube" in link:
                            documents = file_reader.read_youtube_transcript(link)
                        else:
                            documents = file_reader.read_html(link)

                        for doc in documents:
                            page_num = doc.metadata.get("page", 0)
                            self.documents.append(doc.page_content)
                            self.document_metadata.append(
                                {"source": link, "page": page_num}
                            )
                            self.child_document_names.append(f"{link}")

                        self.parent_document_names.append(link)
                        if self.config["embedding_options"]["db_option"] not in [
                            "RAGatouille"
                        ]:
                            document_chunks = self.process_chunks(
                                self.documents[-1],
                                "txt",
                                source=link,
                                page=0,
                                metadata={"source_type": "webpage"},
                            )
                            self.document_chunks_full.extend(document_chunks)
                    except Exception as e:
                        logger.error(
                            f"Error splitting link {link_index+1} : {link}: {str(e)}"
                        )


class DataLoader:
    def __init__(self, config):
        self.file_reader = FileReader()
        self.chunk_processor = ChunkProcessor(config)

    def get_chunks(self, uploaded_files, weblinks):
        return self.chunk_processor.get_chunks(
            self.file_reader, uploaded_files, weblinks
        )
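
A minimal sketch of exercising `DataLoader` end to end with the `config.yml` shown earlier; the file path and web link are hypothetical placeholders:

import yaml
from modules.data_loader import DataLoader

with open("code/config.yml", "r") as f:
    config = yaml.safe_load(f)

loader = DataLoader(config)
chunks, child_names, documents, metadata = loader.get_chunks(
    uploaded_files=["storage/data/syllabus.pdf"],  # hypothetical file
    weblinks=["https://example.edu/course"],       # hypothetical link
)
print(len(chunks), child_names[:3])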
code/modules/embedding_model_loader.py
ADDED
@@ -0,0 +1,38 @@
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import LlamaCppEmbeddings

try:
    from modules.constants import *
except ImportError:
    from constants import *
import os


class EmbeddingModelLoader:
    def __init__(self, config):
        self.config = config

    def load_embedding_model(self):
        if self.config["embedding_options"]["model"] in ["text-embedding-ada-002"]:
            embedding_model = OpenAIEmbeddings(
                deployment="SL-document_embedder",
                model=self.config["embedding_options"]["model"],
                show_progress_bar=True,
                openai_api_key=OPENAI_API_KEY,
                disallowed_special=(),
            )
        else:
            embedding_model = HuggingFaceEmbeddings(
                model_name=self.config["embedding_options"]["model"],
                model_kwargs={
                    "device": "cpu",
                    "token": f"{HUGGINGFACE_TOKEN}",
                    "trust_remote_code": True,
                },
            )
            # embedding_model = LlamaCppEmbeddings(
            #     model_path=os.path.abspath("storage/llama-7b.ggmlv3.q4_0.bin")
            # )

        return embedding_model
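
A minimal sketch of loading the default sentence-transformers model and embedding a query; `HUGGINGFACE_TOKEN` is assumed to be set in the environment, since the loader forwards it to `HuggingFaceEmbeddings`:

from modules.embedding_model_loader import EmbeddingModelLoader

config = {"embedding_options": {"model": "sentence-transformers/all-MiniLM-L6-v2"}}
embedding_model = EmbeddingModelLoader(config).load_embedding_model()

vector = embedding_model.embed_query("What courses are offered?")
print(len(vector))  # 384 dimensions for all-MiniLM-L6-v2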
code/modules/helpers.py
ADDED
@@ -0,0 +1,244 @@
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import chainlit as cl
from langchain import PromptTemplate
from urllib.parse import urlparse, urljoin, urldefrag
import asyncio
import aiohttp
from aiohttp import ClientSession

try:
    from modules.constants import *
except ImportError:
    from constants import *

"""
Ref: https://python.plainenglish.io/scraping-the-subpages-on-a-website-ea2d4e3db113
"""


class WebpageCrawler:
    def __init__(self):
        self.dict_href_links = {}

    async def fetch(self, session: ClientSession, url: str) -> str:
        async with session.get(url) as response:
            try:
                return await response.text()
            except UnicodeDecodeError:
                return await response.text(encoding="latin1")

    def url_exists(self, url: str) -> bool:
        try:
            response = requests.head(url)
            return response.status_code == 200
        except requests.ConnectionError:
            return False

    async def get_links(self, session: ClientSession, website_link: str, base_url: str):
        html_data = await self.fetch(session, website_link)
        soup = BeautifulSoup(html_data, "html.parser")
        list_links = []
        for link in soup.find_all("a", href=True):
            href = link["href"].strip()
            full_url = urljoin(base_url, href)
            normalized_url = self.normalize_url(full_url)  # sections removed
            if (
                normalized_url not in self.dict_href_links
                and self.is_child_url(normalized_url, base_url)
                and self.url_exists(normalized_url)
            ):
                self.dict_href_links[normalized_url] = None
                list_links.append(normalized_url)

        return list_links

    async def get_subpage_links(
        self, session: ClientSession, urls: list, base_url: str
    ):
        tasks = [self.get_links(session, url, base_url) for url in urls]
        results = await asyncio.gather(*tasks)
        all_links = [link for sublist in results for link in sublist]
        return all_links

    async def get_all_pages(self, url: str, base_url: str):
        async with aiohttp.ClientSession() as session:
            dict_links = {url: "Not-checked"}
            counter = None
            while counter != 0:
                unchecked_links = [
                    link
                    for link, status in dict_links.items()
                    if status == "Not-checked"
                ]
                if not unchecked_links:
                    break
                new_links = await self.get_subpage_links(
                    session, unchecked_links, base_url
                )
                for link in unchecked_links:
                    dict_links[link] = "Checked"
                    print(f"Checked: {link}")
                dict_links.update(
                    {
                        link: "Not-checked"
                        for link in new_links
                        if link not in dict_links
                    }
                )
                counter = len(
                    [
                        status
                        for status in dict_links.values()
                        if status == "Not-checked"
                    ]
                )

            checked_urls = [
                url for url, status in dict_links.items() if status == "Checked"
            ]
            return checked_urls

    def is_webpage(self, url: str) -> bool:
        try:
            response = requests.head(url, allow_redirects=True)
            content_type = response.headers.get("Content-Type", "").lower()
            return "text/html" in content_type
        except requests.RequestException:
            return False

    def clean_url_list(self, urls):
        files, webpages = [], []

        for url in urls:
            if self.is_webpage(url):
                webpages.append(url)
            else:
                files.append(url)

        return files, webpages

    def is_child_url(self, url, base_url):
        return url.startswith(base_url)

    def normalize_url(self, url: str):
        # Strip the fragment identifier
        defragged_url, _ = urldefrag(url)
        return defragged_url


def get_urls_from_file(file_path: str):
    """
    Function to get urls from a file
    """
    with open(file_path, "r") as f:
        urls = f.readlines()
    urls = [url.strip() for url in urls]
    return urls


def get_base_url(url):
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/"
    return base_url


def get_prompt(config):
    if config["llm_params"]["use_history"]:
        if config["llm_params"]["llm_loader"] == "local_llm":
            custom_prompt_template = tinyllama_prompt_template_with_history
        elif config["llm_params"]["llm_loader"] == "openai":
            custom_prompt_template = openai_prompt_template_with_history
        # else:
        #     custom_prompt_template = tinyllama_prompt_template_with_history  # default
        prompt = PromptTemplate(
            template=custom_prompt_template,
            input_variables=["context", "chat_history", "question"],
        )
    else:
        if config["llm_params"]["llm_loader"] == "local_llm":
            custom_prompt_template = tinyllama_prompt_template
        elif config["llm_params"]["llm_loader"] == "openai":
            custom_prompt_template = openai_prompt_template
        # else:
        #     custom_prompt_template = tinyllama_prompt_template
        prompt = PromptTemplate(
            template=custom_prompt_template,
            input_variables=["context", "question"],
        )
    return prompt


def get_sources(res, answer):
    source_elements = []
    source_dict = {}  # Dictionary to store URL elements

    for idx, source in enumerate(res["source_documents"]):
        source_metadata = source.metadata
        url = source_metadata["source"]
        score = source_metadata.get("score", "N/A")
        page = source_metadata.get("page", 1)
        date = source_metadata.get("date", "N/A")

        url_name = f"{url}_{page}"
        if url_name not in source_dict:
            source_dict[url_name] = {
                "text": source.page_content,
                "url": url,
                "score": score,
                "page": page,
                "date": date,
            }
        else:
            source_dict[url_name]["text"] += f"\n\n{source.page_content}"

    # First, display the answer
    full_answer = "**Answer:**\n"
    full_answer += answer

    # Then, display the sources
    full_answer += "\n\n**Sources:**\n"
    for idx, (url_name, source_data) in enumerate(source_dict.items()):
        full_answer += f"\nSource {idx + 1} (Score: {source_data['score']}): {source_data['url']}\n"

        name = f"Source {idx + 1} Text\n"
        full_answer += name
        source_elements.append(
            cl.Text(name=name, content=source_data["text"], display="side")
        )

        # Add a PDF element if the source is a PDF file
        if source_data["url"].lower().endswith(".pdf"):
            name = f"Source {idx + 1} PDF\n"
            full_answer += name
            pdf_url = f"{source_data['url']}#page={source_data['page']+1}"
            source_elements.append(cl.Pdf(name=name, url=pdf_url, display="side"))

    full_answer += "\n**Metadata:**\n"
    for idx, (url_name, source_data) in enumerate(source_dict.items()):
        full_answer += f"Source {idx+1} Metadata\n"
        source_elements.append(
            cl.Text(
                name=f"Source {idx+1} Metadata",
                content=f"Page: {source_data['page']}\nDate: {source_data['date']}\n",
                display="side",
            )
        )

    return full_answer, source_elements


def get_metadata(file_names):
    """
    Function to get any additional metadata from the files
    Returns a dict with the file_name: {metadata: value}
    """
    metadata_dict = {}
    for file in file_names:
        metadata_dict[file] = {
            "source_type": "N/A",
        }
    return metadata_dict
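
A minimal sketch of running the crawler end to end; `get_all_pages` is a coroutine, so it needs an event loop, and the seed URL here is a hypothetical placeholder:

import asyncio
from modules.helpers import WebpageCrawler, get_base_url

url = "https://example.edu/course/"  # hypothetical seed URL
crawler = WebpageCrawler()
pages = asyncio.run(crawler.get_all_pages(url, get_base_url(url)))
for page in pages:
    print(page)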
code/modules/llm_tutor.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from langchain import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.chat_models import ChatOpenAI
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.llms import CTransformers
from langchain.memory import ConversationBufferWindowMemory
from langchain.chains.conversational_retrieval.prompts import QA_PROMPT
import os
from modules.constants import *
from modules.helpers import get_prompt
from modules.chat_model_loader import ChatModelLoader
from modules.vector_db import VectorDB, VectorDBScore


class LLMTutor:
    def __init__(self, config, logger=None):
        self.config = config
        self.vector_db = VectorDB(config, logger=logger)
        # Optionally (re)build the vector database from the raw files at startup.
        if self.config["embedding_options"]["embedd_files"]:
            self.vector_db.create_database()
            self.vector_db.save_database()

    def set_custom_prompt(self):
        """
        Prompt template for QA retrieval, shared across vector stores.
        """
        prompt = get_prompt(self.config)
        # prompt = QA_PROMPT

        return prompt

    # Retrieval QA Chain
    def retrieval_qa_chain(self, llm, prompt, db):
        # FAISS/Chroma use the score-annotating retriever; RAGatouille exposes
        # its own LangChain-compatible retriever.
        if self.config["embedding_options"]["db_option"] in ["FAISS", "Chroma"]:
            retriever = VectorDBScore(
                vectorstore=db,
                # search_kwargs={
                #     "k": self.config["embedding_options"]["search_top_k"],
                #     "lambda_mult": self.config["embedding_options"]["lambda_mult"],
                # },
            )
        elif self.config["embedding_options"]["db_option"] == "RAGatouille":
            retriever = db.as_langchain_retriever(
                k=self.config["embedding_options"]["search_top_k"]
            )
        if self.config["llm_params"]["use_history"]:
            memory = ConversationBufferWindowMemory(
                k=self.config["llm_params"]["memory_window"],
                memory_key="chat_history",
                return_messages=True,
                output_key="answer",
            )
            qa_chain = ConversationalRetrievalChain.from_llm(
                llm=llm,
                chain_type="stuff",
                retriever=retriever,
                return_source_documents=True,
                memory=memory,
                combine_docs_chain_kwargs={"prompt": prompt},
            )
        else:
            qa_chain = RetrievalQA.from_chain_type(
                llm=llm,
                chain_type="stuff",
                retriever=retriever,
                return_source_documents=True,
                chain_type_kwargs={"prompt": prompt},
            )
        return qa_chain

    # Loading the model
    def load_llm(self):
        chat_model_loader = ChatModelLoader(self.config)
        llm = chat_model_loader.load_chat_model()
        return llm

    # QA Model Function
    def qa_bot(self):
        db = self.vector_db.load_database()
        self.llm = self.load_llm()
        qa_prompt = self.set_custom_prompt()
        qa = self.retrieval_qa_chain(self.llm, qa_prompt, db)

        return qa

    # output function
    # Fixed: this method was missing `self` and called qa_bot() unqualified,
    # which would raise a NameError at runtime.
    def final_result(self, query):
        qa_result = self.qa_bot()
        response = qa_result({"query": query})
        return response
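A minimal usage sketch for LLMTutor, assuming a config shaped like code/config.yml and that embedd_files is disabled so an existing database is loaded rather than rebuilt (the query string is illustrative):

    import yaml
    from modules.llm_tutor import LLMTutor

    with open("code/config.yml", "r") as f:
        config = yaml.safe_load(f)

    tutor = LLMTutor(config)
    chain = tutor.qa_bot()
    # With use_history off this is a RetrievalQA chain keyed by "query";
    # with history on, ConversationalRetrievalChain expects "question" instead.
    result = chain({"query": "What are the program prerequisites?"})
    print(result["result"])  # the conversational chain returns an "answer" key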
code/modules/vector_db.py
ADDED
@@ -0,0 +1,226 @@
import asyncio  # used in load_files; previously resolved only via the wildcard helper imports
import logging
import os
import yaml
from langchain_community.vectorstores import FAISS, Chroma
from langchain.schema.vectorstore import VectorStoreRetriever
from langchain.callbacks.manager import CallbackManagerForRetrieverRun
from langchain.schema.document import Document
from langchain_core.callbacks import AsyncCallbackManagerForRetrieverRun
from ragatouille import RAGPretrainedModel

try:
    from modules.embedding_model_loader import EmbeddingModelLoader
    from modules.data_loader import DataLoader
    from modules.constants import *
    from modules.helpers import *
except ImportError:  # fall back to flat imports when run as a standalone script
    from embedding_model_loader import EmbeddingModelLoader
    from data_loader import DataLoader
    from constants import *
    from helpers import *

from typing import List


class VectorDBScore(VectorStoreRetriever):

    # See https://github.com/langchain-ai/langchain/blob/61dd92f8215daef3d9cf1734b0d1f8c70c1571c3/libs/langchain/langchain/vectorstores/base.py#L500
    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        docs_and_similarities = (
            self.vectorstore.similarity_search_with_relevance_scores(
                query, **self.search_kwargs
            )
        )
        # Make the score part of the document metadata
        for doc, similarity in docs_and_similarities:
            doc.metadata["score"] = similarity

        docs = [doc for doc, _ in docs_and_similarities]
        return docs

    async def _aget_relevant_documents(
        self, query: str, *, run_manager: AsyncCallbackManagerForRetrieverRun
    ) -> List[Document]:
        docs_and_similarities = (
            self.vectorstore.similarity_search_with_relevance_scores(
                query, **self.search_kwargs
            )
        )
        # Make the score part of the document metadata
        for doc, similarity in docs_and_similarities:
            doc.metadata["score"] = similarity

        docs = [doc for doc, _ in docs_and_similarities]
        return docs


class VectorDB:
    def __init__(self, config, logger=None):
        self.config = config
        self.db_option = config["embedding_options"]["db_option"]
        self.document_names = None
        self.webpage_crawler = WebpageCrawler()

        # Set up logging to both console and a file
        if logger is None:
            self.logger = logging.getLogger(__name__)
            self.logger.setLevel(logging.INFO)

            # Console Handler
            console_handler = logging.StreamHandler()
            console_handler.setLevel(logging.INFO)
            formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
            console_handler.setFormatter(formatter)
            self.logger.addHandler(console_handler)

            # File Handler
            log_file_path = "vector_db.log"  # Change this to your desired log file path
            file_handler = logging.FileHandler(log_file_path, mode="w")
            file_handler.setLevel(logging.INFO)
            file_handler.setFormatter(formatter)
            self.logger.addHandler(file_handler)
        else:
            self.logger = logger

        self.logger.info("VectorDB instance instantiated")

    def load_files(self):
        files = os.listdir(self.config["embedding_options"]["data_path"])
        files = [
            os.path.join(self.config["embedding_options"]["data_path"], file)
            for file in files
        ]
        urls = get_urls_from_file(self.config["embedding_options"]["url_file_path"])
        if self.config["embedding_options"]["expand_urls"]:
            all_urls = []
            for url in urls:
                loop = asyncio.get_event_loop()
                all_urls.extend(
                    loop.run_until_complete(
                        self.webpage_crawler.get_all_pages(
                            url, url
                        )  # only gets child urls; to crawl everything, pass the base url as the second argument
                    )
                )
            urls = all_urls
        return files, urls

    def create_embedding_model(self):
        self.logger.info("Creating embedding function")
        self.embedding_model_loader = EmbeddingModelLoader(self.config)
        self.embedding_model = self.embedding_model_loader.load_embedding_model()

    def initialize_database(
        self,
        document_chunks: list,
        document_names: list,
        documents: list,
        document_metadata: list,
    ):
        if self.db_option in ["FAISS", "Chroma"]:
            self.create_embedding_model()
        # Track token usage
        self.logger.info("Initializing vector_db")
        self.logger.info("\tUsing {} as db_option".format(self.db_option))
        if self.db_option == "FAISS":
            self.vector_db = FAISS.from_documents(
                documents=document_chunks, embedding=self.embedding_model
            )
        elif self.db_option == "Chroma":
            self.vector_db = Chroma.from_documents(
                documents=document_chunks,
                embedding=self.embedding_model,
                persist_directory=os.path.join(
                    self.config["embedding_options"]["db_path"],
                    "db_"
                    + self.config["embedding_options"]["db_option"]
                    + "_"
                    + self.config["embedding_options"]["model"],
                ),
            )
        elif self.db_option == "RAGatouille":
            self.RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
            index_path = self.RAG.index(
                index_name="new_idx",
                collection=documents,
                document_ids=document_names,
                document_metadatas=document_metadata,
            )
        self.logger.info("Completed initializing vector_db")

    def create_database(self):
        data_loader = DataLoader(self.config)
        self.logger.info("Loading data")
        files, urls = self.load_files()
        files, webpages = self.webpage_crawler.clean_url_list(urls)
        if "storage/data/urls.txt" in files:
            files.remove("storage/data/urls.txt")
        document_chunks, document_names, documents, document_metadata = (
            data_loader.get_chunks(files, webpages)
        )
        self.logger.info("Completed loading data")
        self.initialize_database(
            document_chunks, document_names, documents, document_metadata
        )

    def save_database(self):
        if self.db_option == "FAISS":
            self.vector_db.save_local(
                os.path.join(
                    self.config["embedding_options"]["db_path"],
                    "db_"
                    + self.config["embedding_options"]["db_option"]
                    + "_"
                    + self.config["embedding_options"]["model"],
                )
            )
        elif self.db_option == "Chroma":
            # db is saved in the persist directory during initialization
            pass
        elif self.db_option == "RAGatouille":
            # index is saved during initialization
            pass
        self.logger.info("Saved database")

    def load_database(self):
        self.create_embedding_model()
        if self.db_option == "FAISS":
            self.vector_db = FAISS.load_local(
                os.path.join(
                    self.config["embedding_options"]["db_path"],
                    "db_"
                    + self.config["embedding_options"]["db_option"]
                    + "_"
                    + self.config["embedding_options"]["model"],
                ),
                self.embedding_model,
                allow_dangerous_deserialization=True,
            )
        elif self.db_option == "Chroma":
            self.vector_db = Chroma(
                persist_directory=os.path.join(
                    self.config["embedding_options"]["db_path"],
                    "db_"
                    + self.config["embedding_options"]["db_option"]
                    + "_"
                    + self.config["embedding_options"]["model"],
                ),
                embedding_function=self.embedding_model,
            )
        elif self.db_option == "RAGatouille":
            self.vector_db = RAGPretrainedModel.from_index(
                ".ragatouille/colbert/indexes/new_idx"
            )
        self.logger.info("Loaded database")
        return self.vector_db


if __name__ == "__main__":
    with open("code/config.yml", "r") as f:
        config = yaml.safe_load(f)
    print(config)
    vector_db = VectorDB(config)
    vector_db.create_database()
    vector_db.save_database()
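As a brief illustration of what VectorDBScore adds over the stock retriever, namely relevance scores copied onto each document's metadata, a hedged sketch reusing the config loaded in the __main__ block above (the query string and k value are illustrative):

    db = VectorDB(config).load_database()  # assumes a FAISS/Chroma database already exists on disk
    retriever = VectorDBScore(vectorstore=db, search_kwargs={"k": 3})
    for doc in retriever.get_relevant_documents("admission requirements"):
        print(doc.metadata["score"], doc.metadata.get("source"))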
public/logo_dark.png
ADDED
public/logo_light.png
ADDED
public/test.css
ADDED
@@ -0,0 +1,16 @@
a[href*='https://github.com/Chainlit/chainlit'] {
    visibility: hidden;
}

.message-avatar .MuiAvatar-root {
    background-color: transparent; /* Remove the background color */
    color: #FFFFFF; /* Change this to your desired text color */
    border: 0.25px solid #FFFFFF; /* Add a white border for the circle */
    border-radius: 50%; /* Ensure the avatar remains circular */
    background-image: url('http://localhost:8051/logo?theme=dark'); /* Path to your logo */
    background-size: cover; /* Ensure the logo covers the entire avatar */
    background-position: center; /* Center the logo */
    background-repeat: no-repeat; /* Prevent the logo from repeating */
    width: 38px; /* Adjust the width as needed */
    height: 38px; /* Adjust the height as needed */
}
requirements.txt
ADDED
@@ -0,0 +1,19 @@
# Automatically generated by https://github.com/damnever/pigar.

beautifulsoup4==4.12.3
chainlit==1.1.202
langchain==0.1.20
langchain-community==0.0.38
langchain-core==0.1.52
llama-parse==0.4.4
pysrt==1.1.2
python-dotenv==1.0.1
PyYAML==6.0.1
RAGatouille==0.0.8.post2
requests==2.32.3
torch==2.3.1
tqdm==4.66.4
transformers==4.41.2
llama-cpp-python==0.2.77
fake_useragent==1.5.1
chromadb==0.5.0
storage/data/urls.txt
ADDED
@@ -0,0 +1 @@
https://www.bu.edu/cds-faculty/programs-admissions/ms-data-science/