diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000000000000000000000000000000000000..6e5668f1c0fc2e2b0b0f2b7c743744f31a9fcada --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 88 +extend-ignore = E203, E266, E501, W503 diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..a5ac8ad264bb64f5d1f3ee935841b1507bdcc272 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +apps/ai_tutor/public/assets/images/avatars/ai-tutor.png filter=lfs diff=lfs merge=lfs -text +apps/ai_tutor/public/assets/images/avatars/ai_tutor.png filter=lfs diff=lfs merge=lfs -text +apps/ai_tutor/public/logo_dark.png filter=lfs diff=lfs merge=lfs -text +apps/ai_tutor/public/logo_light.png filter=lfs diff=lfs merge=lfs -text +apps/chainlit_base/public/assets/images/avatars/ai-tutor.png filter=lfs diff=lfs merge=lfs -text +apps/chainlit_base/public/assets/images/avatars/ai_tutor.png filter=lfs diff=lfs merge=lfs -text +apps/chainlit_base/public/logo_dark.png filter=lfs diff=lfs merge=lfs -text +apps/chainlit_base/public/logo_light.png filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/code_quality_check.yml b/.github/workflows/code_quality_check.yml new file mode 100644 index 0000000000000000000000000000000000000000..679b3799bb8a7e342883811f2abaa8337485d016 --- /dev/null +++ b/.github/workflows/code_quality_check.yml @@ -0,0 +1,33 @@ +name: Code Quality and Security Checks + +on: + push: + branches: [ main] + pull_request: + branches: [ main ] + +jobs: + code-quality: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install 
flake8 black bandit + + - name: Run Black + run: black --check . + + - name: Run Flake8 + run: flake8 . + + - name: Run Bandit + run: | + bandit -r . \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..9c41fd818264e4ff64576a01f0739da23f4225bc --- /dev/null +++ b/.gitignore @@ -0,0 +1,19 @@ +venv/ +venv/* +**/*.pyc + +**/vectorstores/* +**/private/students.json +.ragatouille/* +*/__pycache__/* +**/.chainlit/translations/* +storage/logs/* +vectorstores/* +**/apps/*/storage/logs/* +**/apps/*/private/* +*.log +**/.files/* +.env + +**/*.pyc + diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..ad270d8ad46460d63000a01b385b884666910d7a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,30 @@ +FROM python:3.11 + +WORKDIR /code + +RUN pip install --upgrade pip +RUN pip install --no-cache-dir edubotics_core +RUN pip install chainlit +RUN pip install literalai + +COPY . /code + +# List the contents of the /code directory to verify files are copied correctly +RUN ls -R /code + +# Change permissions to allow writing to the directory +RUN chmod -R 777 /code + +# Create a logs directory and set permissions +RUN mkdir /code/apps/ai_tutor/logs && chmod 777 /code/apps/ai_tutor/logs + +# Create a cache directory within the application's working directory +RUN mkdir /.cache && chmod -R 777 /.cache + +WORKDIR /code/apps/ai_tutor + +# Expose the port the app runs on +EXPOSE 7860 + +# Default command to run the application +CMD vectorstore_creator --config_file config/config.yml --project_config_file config/project_config.yml && python -m uvicorn app:app --host 0.0.0.0 --port 7860 diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..f8ac098862418f47b1b4ac2842413486dde815ae --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Edubotics AI + +Permission is hereby granted, free of charge, to 
any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 7d39bf1865f48c3b4ed5c75536055b7422d8587d..ef7690dc5c59c1450e84b3698ccf596d088cff99 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,12 @@ --- -title: Cs111 Assistant -emoji: 📈 -colorFrom: yellow -colorTo: pink + +--- +title: Intro to CS (CS111) +description: AI Assistant for Intro to CS class (CS111) +emoji: 🎓 +colorFrom: red +colorTo: green sdk: docker -pinned: false +app_port: 7860 +--- --- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/apps/ai_tutor/.chainlit/config.toml b/apps/ai_tutor/.chainlit/config.toml new file mode 100644 index 0000000000000000000000000000000000000000..e1ecdb7c9767c341423545a17f3b74aa6731176c --- /dev/null +++ b/apps/ai_tutor/.chainlit/config.toml @@ -0,0 +1,120 @@ +[project] +# Whether to enable telemetry (default: true). No personal data is collected. +enable_telemetry = true + + +# List of environment variables to be provided by each user to use the app. 
+user_env = [] + +# Duration (in seconds) during which the session is saved when the connection is lost +session_timeout = 3600 + +# Enable third parties caching (e.g LangChain cache) +cache = false + +# Authorized origins +allow_origins = ["*"] + +# Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317) +# follow_symlink = false + +[features] +# Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript) +unsafe_allow_html = true + +# Process and display mathematical expressions. This can clash with "$" characters in messages. +latex = true + +# Automatically tag threads with the current chat profile (if a chat profile is used) +auto_tag_thread = true + +# Authorize users to spontaneously upload files with messages +[features.spontaneous_file_upload] + enabled = true + accept = ["*/*"] + max_files = 20 + max_size_mb = 500 + +[features.audio] + # Threshold for audio recording + min_decibels = -45 + # Delay for the user to start speaking in MS + initial_silence_timeout = 3000 + # Delay for the user to continue speaking in MS. If the user stops speaking for this duration, the recording will stop. + silence_timeout = 1500 + # Above this duration (MS), the recording will forcefully stop. + max_duration = 15000 + # Duration of the audio chunks in MS + chunk_duration = 1000 + # Sample rate of the audio + sample_rate = 44100 + +edit_message = true + +[UI] +# Name of the assistant. +name = "AI Tutor" + +# Description of the assistant. This is used for HTML tags. +# description = "" + +# Large size content are by default collapsed for a cleaner ui +default_collapse_content = true + +# Chain of Thought (CoT) display mode. Can be "hidden", "tool_call" or "full". +cot = "hidden" + +# Link to your github repo. This will add a github button in the UI's header. 
+github = "https://github.com/edubotics-ai/edubot-core" + +# Specify a CSS file that can be used to customize the user interface. +# The CSS file can be served from the public directory or via an external link. +custom_css = "/public/files/test.css" + +# Specify a Javascript file that can be used to customize the user interface. +# The Javascript file can be served from the public directory. +# custom_js = "/public/test.js" + +# Specify a custom font url. +# custom_font = "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap" + +# Specify a custom meta image url. +custom_meta_image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/f/f5/Boston_University_seal.svg/1200px-Boston_University_seal.svg.png" + +# Specify a custom build directory for the frontend. +# This can be used to customize the frontend code. +# Be careful: If this is a relative path, it should not start with a slash. +# custom_build = "./public/build" + +[UI.theme] + default = "light" + #layout = "wide" + #font_family = "Inter, sans-serif" +# Override default MUI light theme. (Check theme.ts) +[UI.theme.light] + #background = "#FAFAFA" + #paper = "#FFFFFF" + + [UI.theme.light.primary] + #main = "#F80061" + #dark = "#980039" + #light = "#FFE7EB" + [UI.theme.light.text] + #primary = "#212121" + #secondary = "#616161" + +# Override default MUI dark theme. 
(Check theme.ts) +[UI.theme.dark] + #background = "#FAFAFA" + #paper = "#FFFFFF" + + [UI.theme.dark.primary] + #main = "#F80061" + #dark = "#980039" + #light = "#FFE7EB" + [UI.theme.dark.text] + #primary = "#EEEEEE" + #secondary = "#BDBDBD" + +[meta] +generated_by = "1.1.402" diff --git a/apps/ai_tutor/README.md b/apps/ai_tutor/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ce60b629a88e9e59e51aec4e503994cd7bc9411f --- /dev/null +++ b/apps/ai_tutor/README.md @@ -0,0 +1,12 @@ +# WIP + + +## Run the encrypt_students script + +- If you don't want the emails to be public, run this script to encrypt the emails of the students. +- This will create a new file in the public/files/ directory. +- Place your file with the students' emails in the private/ directory (do not commit this file to the repository). + +```bash +python encrypt_students.py --students-file private/students.json --encrypted-students-file public/files/students_encrypted.json +``` diff --git a/apps/ai_tutor/app.py b/apps/ai_tutor/app.py new file mode 100644 index 0000000000000000000000000000000000000000..2089e48a385b414fdfac3576ae15110da4c94bd8 --- /dev/null +++ b/apps/ai_tutor/app.py @@ -0,0 +1,401 @@ +from fastapi import FastAPI, Request, Response, HTTPException +from fastapi.responses import HTMLResponse, RedirectResponse +from fastapi.templating import Jinja2Templates +from google.oauth2 import id_token +from google.auth.transport import requests as google_requests +from google_auth_oauthlib.flow import Flow +from chainlit.utils import mount_chainlit +import secrets +import json +import base64 +from config.constants import ( + OAUTH_GOOGLE_CLIENT_ID, + OAUTH_GOOGLE_CLIENT_SECRET, + CHAINLIT_URL, + EMAIL_ENCRYPTION_KEY, +) +from fastapi.middleware.cors import CORSMiddleware +from fastapi.staticfiles import StaticFiles +from helpers import ( + get_time, + reset_tokens_for_user, + check_user_cooldown, +) +from edubotics_core.chat_processor.helpers import get_user_details, 
update_user_info +from config.config_manager import config_manager +import hashlib +from pprint import pprint +# set config +config = config_manager.get_config() + +# set constants +GITHUB_REPO = config["misc"]["github_repo"] +DOCS_WEBSITE = config["misc"]["docs_website"] +ALL_TIME_TOKENS_ALLOCATED = config["token_config"]["all_time_tokens_allocated"] +TOKENS_LEFT = config["token_config"]["tokens_left"] +COOLDOWN_TIME = config["token_config"]["cooldown_time"] +REGEN_TIME = config["token_config"]["regen_time"] + +GOOGLE_CLIENT_ID = OAUTH_GOOGLE_CLIENT_ID +GOOGLE_CLIENT_SECRET = OAUTH_GOOGLE_CLIENT_SECRET +GOOGLE_REDIRECT_URI = f"{CHAINLIT_URL}/auth/oauth/google/callback" + +CLASS_METADATA = { + "class_name": config["metadata"]["class_name"], + "class_number": config["metadata"]["class_number"], + "instructor_name": config["metadata"]["instructor_name"], +} + +app = FastAPI() +app.mount("/public", StaticFiles(directory="public"), name="public") +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], # Update with appropriate origins + allow_methods=["*"], + allow_headers=["*"], # or specify the headers you want to allow + expose_headers=["X-User-Info"], # Expose the custom header +) + +templates = Jinja2Templates(directory="templates") +session_store = {} +CHAINLIT_PATH = "/chainlit_tutor" + +# only admin is given any additional permissions for now -- no limits on tokens +with open("public/files/students_encrypted.json", "r") as file: + USER_ROLES = json.load(file) + +# Create a Google OAuth flow +flow = Flow.from_client_config( + { + "web": { + "client_id": GOOGLE_CLIENT_ID, + "client_secret": GOOGLE_CLIENT_SECRET, + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "redirect_uris": [GOOGLE_REDIRECT_URI], + "scopes": [ + "openid", + # "https://www.googleapis.com/auth/userinfo.email", + # "https://www.googleapis.com/auth/userinfo.profile", + ], + } + }, + scopes=[ + "openid", + 
"https://www.googleapis.com/auth/userinfo.email", + "https://www.googleapis.com/auth/userinfo.profile", + ], + redirect_uri=GOOGLE_REDIRECT_URI, +) + + +def get_user_role(username: str): + + # Function to deterministically hash emails + def deterministic_hash(email, salt): + return hashlib.pbkdf2_hmac("sha256", email.encode(), salt, 100000).hex() + + # encrypt email (#FIXME: this is not the best way to do this, not really encryption, more like a hash) + encryption_salt = EMAIL_ENCRYPTION_KEY.encode() + encrypted_email = deterministic_hash(username, encryption_salt) + role = USER_ROLES.get(encrypted_email, ["guest"]) + + if "guest" in role: + return "unauthorized" + + return role + + +async def get_user_info_from_cookie(request: Request): + user_info_encoded = request.cookies.get("X-User-Info") + if user_info_encoded: + try: + user_info_json = base64.b64decode(user_info_encoded).decode() + return json.loads(user_info_json) + except Exception as e: + print(f"Error decoding user info: {e}") + return None + return None + + +async def del_user_info_from_cookie(request: Request, response: Response): + # Delete cookies from the response + response.delete_cookie("X-User-Info") + response.delete_cookie("session_token") + # Get the session token from the request cookies + session_token = request.cookies.get("session_token") + # Check if the session token exists in the session_store before deleting + if session_token and session_token in session_store: + del session_store[session_token] + + +def get_user_info(request: Request): + session_token = request.cookies.get("session_token") + if session_token and session_token in session_store: + return session_store[session_token] + return None + + +@app.get("/", response_class=HTMLResponse) +async def login_page(request: Request): + user_info = await get_user_info_from_cookie(request) + if user_info and user_info.get("google_signed_in"): + return RedirectResponse("/post-signin") + return templates.TemplateResponse( + "login.html", + 
{ + "request": request, + "GITHUB_REPO": GITHUB_REPO, + "DOCS_WEBSITE": DOCS_WEBSITE, + "CLASS_METADATA": CLASS_METADATA, + }, + ) + + +# @app.get("/login/guest") +# async def login_guest(): +# username = "guest" +# session_token = secrets.token_hex(16) +# unique_session_id = secrets.token_hex(8) +# username = f"{username}_{unique_session_id}" +# session_store[session_token] = { +# "email": username, +# "name": "Guest", +# "profile_image": "", +# "google_signed_in": False, # Ensure guest users do not have this flag +# } +# user_info_json = json.dumps(session_store[session_token]) +# user_info_encoded = base64.b64encode(user_info_json.encode()).decode() + +# # Set cookies +# response = RedirectResponse(url="/post-signin", status_code=303) +# response.set_cookie(key="session_token", value=session_token) +# response.set_cookie(key="X-User-Info", value=user_info_encoded, httponly=True) +# return response + + +@app.get("/unauthorized", response_class=HTMLResponse) +async def unauthorized(request: Request): + return templates.TemplateResponse("unauthorized.html", {"request": request}) + + +@app.get("/login/google") +async def login_google(request: Request): + # Clear any existing session cookies to avoid conflicts with guest sessions + response = RedirectResponse(url="/post-signin") + response.delete_cookie(key="session_token") + response.delete_cookie(key="X-User-Info") + + user_info = await get_user_info_from_cookie(request) + # Check if user is already signed in using Google + if user_info and user_info.get("google_signed_in"): + return RedirectResponse("/post-signin") + else: + authorization_url, _ = flow.authorization_url(prompt="consent") + return RedirectResponse(authorization_url, headers=response.headers) + + +@app.get("/auth/oauth/google/callback") +async def auth_google(request: Request): + try: + flow.fetch_token(code=request.query_params.get("code")) + credentials = flow.credentials + user_info = id_token.verify_oauth2_token( + credentials.id_token, 
google_requests.Request(), GOOGLE_CLIENT_ID + ) + + email = user_info["email"] + name = user_info.get("name", "") + profile_image = user_info.get("picture", "") + role = get_user_role(email) + + if role == "unauthorized": + return RedirectResponse("/unauthorized") + + session_token = secrets.token_hex(16) + session_store[session_token] = { + "email": email, + "name": name, + "profile_image": profile_image, + "google_signed_in": True, # Set this flag to True for Google-signed users + } + + # add literalai user info to session store to be sent to chainlit + literalai_user = await get_user_details(email) + session_store[session_token]["literalai_info"] = literalai_user.to_dict() + session_store[session_token]["literalai_info"]["metadata"]["role"] = role + + user_info_json = json.dumps(session_store[session_token]) + user_info_encoded = base64.b64encode(user_info_json.encode()).decode() + + # Set cookies + response = RedirectResponse(url="/post-signin", status_code=303) + response.set_cookie(key="session_token", value=session_token) + response.set_cookie( + key="X-User-Info", value=user_info_encoded, httponly=True + ) # TODO: is the flag httponly=True necessary? 
+ return response + except Exception as e: + print(f"Error during Google OAuth callback: {e}") + return RedirectResponse(url="/", status_code=302) + + +@app.get("/cooldown") +async def cooldown(request: Request): + user_info = await get_user_info_from_cookie(request) + user_details = await get_user_details(user_info["email"]) + current_datetime = get_time() + cooldown, cooldown_end_time = await check_user_cooldown( + user_details, current_datetime, COOLDOWN_TIME, TOKENS_LEFT, REGEN_TIME + ) + print(f"User in cooldown: {cooldown}") + print(f"Cooldown end time: {cooldown_end_time}") + if cooldown and "admin" not in get_user_role(user_info["email"]): + return templates.TemplateResponse( + "cooldown.html", + { + "request": request, + "username": user_info["email"], + "role": get_user_role(user_info["email"]), + "cooldown_end_time": cooldown_end_time, + "tokens_left": user_details.metadata["tokens_left"], + }, + ) + else: + user_details.metadata["in_cooldown"] = False + await update_user_info(user_details) + await reset_tokens_for_user( + user_details, + config["token_config"]["tokens_left"], + config["token_config"]["regen_time"], + ) + return RedirectResponse("/post-signin") + + +@app.get("/post-signin", response_class=HTMLResponse) +async def post_signin(request: Request): + user_info = await get_user_info_from_cookie(request) + if not user_info: + user_info = get_user_info(request) + user_details = await get_user_details(user_info["email"]) + current_datetime = get_time() + user_details.metadata["last_login"] = current_datetime + # if new user, set the number of tries + if "tokens_left" not in user_details.metadata: + user_details.metadata["tokens_left"] = ( + TOKENS_LEFT # set the number of tokens left for the new user + ) + if "last_message_time" not in user_details.metadata: + user_details.metadata["last_message_time"] = current_datetime + if "all_time_tokens_allocated" not in user_details.metadata: + user_details.metadata["all_time_tokens_allocated"] = 
ALL_TIME_TOKENS_ALLOCATED + if "in_cooldown" not in user_details.metadata: + user_details.metadata["in_cooldown"] = False + await update_user_info(user_details) + + if "last_message_time" in user_details.metadata and "admin" not in get_user_role( + user_info["email"] + ): + cooldown, _ = await check_user_cooldown( + user_details, current_datetime, COOLDOWN_TIME, TOKENS_LEFT, REGEN_TIME + ) + if cooldown: + user_details.metadata["in_cooldown"] = True + return RedirectResponse("/cooldown") + else: + user_details.metadata["in_cooldown"] = False + await reset_tokens_for_user( + user_details, + config["token_config"]["tokens_left"], + config["token_config"]["regen_time"], + ) + + if user_info: + username = user_info["email"] + role = get_user_role(username) + jwt_token = request.cookies.get("X-User-Info") + return templates.TemplateResponse( + "dashboard.html", + { + "request": request, + "username": username, + "role": role, + "jwt_token": jwt_token, + "tokens_left": user_details.metadata["tokens_left"], + "all_time_tokens_allocated": user_details.metadata[ + "all_time_tokens_allocated" + ], + "total_tokens_allocated": ALL_TIME_TOKENS_ALLOCATED, + }, + ) + return RedirectResponse("/") + + +@app.get("/start-tutor") +@app.post("/start-tutor") +async def start_tutor(request: Request): + user_info = await get_user_info_from_cookie(request) + if user_info: + user_info_json = json.dumps(user_info) + user_info_encoded = base64.b64encode(user_info_json.encode()).decode() + + response = RedirectResponse(CHAINLIT_PATH, status_code=303) + response.set_cookie(key="X-User-Info", value=user_info_encoded, httponly=True) + return response + + return RedirectResponse(url="/") + + +@app.exception_handler(HTTPException) +async def http_exception_handler(request: Request, exc: HTTPException): + if exc.status_code == 404: + return templates.TemplateResponse( + "error_404.html", {"request": request}, status_code=404 + ) + return templates.TemplateResponse( + "error.html", + {"request": 
request, "error": str(exc)}, + status_code=exc.status_code, + ) + + +@app.exception_handler(Exception) +async def exception_handler(request: Request, exc: Exception): + return templates.TemplateResponse( + "error.html", {"request": request, "error": str(exc)}, status_code=500 + ) + + +@app.get("/logout", response_class=HTMLResponse) +async def logout(request: Request, response: Response): + await del_user_info_from_cookie(request=request, response=response) + response = RedirectResponse(url="/", status_code=302) + # Set cookies to empty values and expire them immediately + response.set_cookie(key="session_token", value="", expires=0) + response.set_cookie(key="X-User-Info", value="", expires=0) + return response + + +@app.get("/get-tokens-left") +async def get_tokens_left(request: Request): + try: + user_info = await get_user_info_from_cookie(request) + user_details = await get_user_details(user_info["email"]) + await reset_tokens_for_user( + user_details, + config["token_config"]["tokens_left"], + config["token_config"]["regen_time"], + ) + tokens_left = user_details.metadata["tokens_left"] + return {"tokens_left": tokens_left} + except Exception as e: + print(f"Error getting tokens left: {e}") + return {"tokens_left": 0} + + +mount_chainlit(app=app, target="chainlit_app.py", path=CHAINLIT_PATH) + +if __name__ == "__main__": + import uvicorn + + uvicorn.run(app, host="127.0.0.1", port=7860) diff --git a/apps/ai_tutor/chainlit_app.py b/apps/ai_tutor/chainlit_app.py new file mode 100644 index 0000000000000000000000000000000000000000..980e8a99fe9d6abc46b87d3153d558702bf0cc00 --- /dev/null +++ b/apps/ai_tutor/chainlit_app.py @@ -0,0 +1,563 @@ +import chainlit.data as cl_data +import asyncio +from config.constants import ( + LITERAL_API_KEY_LOGGING, + LITERAL_API_URL, +) +from edubotics_core.chat_processor.literal_ai import CustomLiteralDataLayer +import json +from typing import Any, Dict, no_type_check +import chainlit as cl +from edubotics_core.chat.llm_tutor import 
LLMTutor +from edubotics_core.chat.helpers import ( + get_sources, + get_history_chat_resume, + get_history_setup_llm, + # get_last_config, +) +from edubotics_core.chat_processor.helpers import ( + update_user_info, + get_user_details, +) +from helpers import ( + check_user_cooldown, + reset_tokens_for_user, +) +from helpers import get_time +import copy +from typing import Optional +from chainlit.types import ThreadDict +import base64 +from langchain_community.callbacks import get_openai_callback +from datetime import datetime, timezone +from config.config_manager import config_manager + +USER_TIMEOUT = 60_000 +SYSTEM = "System" +LLM = "AI Tutor" +AGENT = "Agent" +YOU = "User" +ERROR = "Error" + +# set config +config = config_manager.get_config() + + +async def setup_data_layer(): + """ + Set up the data layer for chat logging. + """ + if config["chat_logging"]["log_chat"]: + data_layer = CustomLiteralDataLayer( + api_key=LITERAL_API_KEY_LOGGING, server=LITERAL_API_URL + ) + else: + data_layer = None + + return data_layer + + +async def update_user_from_chainlit(user, token_count=0): + if "admin" not in user.metadata["role"]: + user.metadata["tokens_left"] = user.metadata["tokens_left"] - token_count + user.metadata["all_time_tokens_allocated"] = ( + user.metadata["all_time_tokens_allocated"] - token_count + ) + user.metadata["tokens_left_at_last_message"] = user.metadata[ + "tokens_left" + ] # tokens_left will keep regenerating outside of chainlit + user.metadata["last_message_time"] = get_time() + await update_user_info(user) + + tokens_left = user.metadata["tokens_left"] + if tokens_left < 0: + tokens_left = 0 + return tokens_left + + +class Chatbot: + def __init__(self, config): + """ + Initialize the Chatbot class. + """ + self.config = config + + @no_type_check + async def setup_llm(self): + """ + Set up the LLM with the provided settings. Update the configuration and initialize the LLM tutor. + + #TODO: Clean this up. 
+ """ + + llm_settings = cl.user_session.get("llm_settings", {}) + ( + chat_profile, + retriever_method, + memory_window, + llm_style, + generate_follow_up, + chunking_mode, + ) = ( + llm_settings.get("chat_model"), + llm_settings.get("retriever_method"), + llm_settings.get("memory_window"), + llm_settings.get("llm_style"), + llm_settings.get("follow_up_questions"), + llm_settings.get("chunking_mode"), + ) + + chain = cl.user_session.get("chain") + memory_list = cl.user_session.get( + "memory", + ( + list(chain.store.values())[0].messages + if len(chain.store.values()) > 0 + else [] + ), + ) + conversation_list = get_history_setup_llm(memory_list) + + old_config = copy.deepcopy(self.config) + self.config["vectorstore"]["db_option"] = retriever_method + self.config["llm_params"]["memory_window"] = memory_window + self.config["llm_params"]["llm_style"] = llm_style + self.config["llm_params"]["llm_loader"] = chat_profile + self.config["llm_params"]["generate_follow_up"] = generate_follow_up + self.config["splitter_options"]["chunking_mode"] = chunking_mode + + self.llm_tutor.update_llm( + old_config, self.config + ) # update only llm attributes that are changed + self.chain = self.llm_tutor.qa_bot( + memory=conversation_list, + ) + + cl.user_session.set("chain", self.chain) + cl.user_session.set("llm_tutor", self.llm_tutor) + + @no_type_check + async def update_llm(self, new_settings: Dict[str, Any]): + """ + Update the LLM settings and reinitialize the LLM with the new settings. + + Args: + new_settings (Dict[str, Any]): The new settings to update. + """ + cl.user_session.set("llm_settings", new_settings) + await self.inform_llm_settings() + await self.setup_llm() + + async def make_llm_settings_widgets(self, config=None): + """ + Create and send the widgets for LLM settings configuration. + + Args: + config: The configuration to use for setting up the widgets. 
+ """ + config = config or self.config + await cl.ChatSettings( + [ + cl.input_widget.Select( + id="chat_model", + label="Model Name (Default GPT-3)", + values=["local_llm", "gpt-3.5-turbo-1106", "gpt-4", "gpt-4o-mini"], + initial_index=[ + "local_llm", + "gpt-3.5-turbo-1106", + "gpt-4", + "gpt-4o-mini", + ].index(config["llm_params"]["llm_loader"]), + ), + cl.input_widget.Select( + id="retriever_method", + label="Retriever (Default FAISS)", + values=["FAISS", "Chroma", "RAGatouille", "RAPTOR"], + initial_index=["FAISS", "Chroma", "RAGatouille", "RAPTOR"].index( + config["vectorstore"]["db_option"] + ), + ), + cl.input_widget.Slider( + id="memory_window", + label="Memory Window (Default 3)", + initial=3, + min=0, + max=10, + step=1, + ), + cl.input_widget.Switch( + id="view_sources", label="View Sources", initial=False + ), + cl.input_widget.Switch( + id="stream_response", + label="Stream response", + initial=config["llm_params"]["stream"], + ), + cl.input_widget.Select( + id="chunking_mode", + label="Chunking mode", + values=["fixed", "semantic"], + initial_index=1, + ), + cl.input_widget.Switch( + id="follow_up_questions", + label="Generate follow up questions", + initial=False, + ), + cl.input_widget.Select( + id="llm_style", + label="Type of Conversation (Default Normal)", + values=["Normal", "ELI5"], + initial_index=0, + ), + ] + ).send() + + @no_type_check + async def inform_llm_settings(self): + """ + Inform the user about the updated LLM settings and display them as a message. + """ + await cl.Message( + author=SYSTEM, + content="LLM settings have been updated. You can continue with your Query!", + ).send() + + async def set_starters(self): + """ + Set starter messages for the chatbot. 
+ """ + # Return Starters only if the chat is new + + try: + thread = cl_data._data_layer.get_thread( + cl.context.session.thread_id + ) # see if the thread has any steps + if thread.steps or len(thread.steps) > 0: + return None + except Exception as e: + print(e) + return [ + cl.Starter( + label="recording on Transformers?", + message="Where can I find the recording for the lecture on Transformers?", + icon="/public/assets/images/starter_icons/adv-screen-recorder-svgrepo-com.svg", + ), + cl.Starter( + label="where's the schedule?", + message="When are the lectures? I can't find the schedule.", + icon="/public/assets/images/starter_icons/alarmy-svgrepo-com.svg", + ), + cl.Starter( + label="Due Date?", + message="When is the final project due?", + icon="/public/assets/images/starter_icons/calendar-samsung-17-svgrepo-com.svg", + ), + cl.Starter( + label="Explain backprop.", + message="I didn't understand the math behind backprop, could you explain it?", + icon="/public/assets/images/starter_icons/acastusphoton-svgrepo-com.svg", + ), + ] + + def rename(self, orig_author: str): + """ + Rename the original author to a more user-friendly name. + + Args: + orig_author (str): The original author's name. + + Returns: + str: The renamed author. + """ + rename_dict = {"Chatbot": LLM} + return rename_dict.get(orig_author, orig_author) + + async def start(self): + """ + Start the chatbot, initialize settings widgets, + and display and load previous conversation if chat logging is enabled. 
+ """ + + await self.make_llm_settings_widgets(self.config) # Reload the settings widgets + + user = cl.user_session.get("user") + + # TODO: remove self.user with cl.user_session.get("user") + try: + self.user = { + "user_id": user.identifier, + "session_id": cl.context.session.thread_id, + } + except Exception as e: + print(e) + self.user = { + "user_id": "guest", + "session_id": cl.context.session.thread_id, + } + + memory = cl.user_session.get("memory", []) + self.llm_tutor = LLMTutor(self.config, user=self.user) + + self.chain = self.llm_tutor.qa_bot( + memory=memory, + ) + self.question_generator = self.llm_tutor.question_generator + cl.user_session.set("llm_tutor", self.llm_tutor) + cl.user_session.set("chain", self.chain) + + async def stream_response(self, response): + """ + Stream the response from the LLM. + + Args: + response: The response from the LLM. + """ + msg = cl.Message(content="") + await msg.send() + + output = {} + for chunk in response: + if "answer" in chunk: + await msg.stream_token(chunk["answer"]) + + for key in chunk: + if key not in output: + output[key] = chunk[key] + else: + output[key] += chunk[key] + return output + + async def main(self, message): + """ + Process and Display the Conversation. + + Args: + message: The incoming chat message. 
+ """ + + chain = cl.user_session.get("chain") + token_count = 0 # initialize token count + if not chain: + await self.start() # start the chatbot if the chain is not present + chain = cl.user_session.get("chain") + + # update user info with last message time + user = cl.user_session.get("user") + await reset_tokens_for_user( + user, + self.config["token_config"]["tokens_left"], + self.config["token_config"]["regen_time"], + ) + updated_user = await get_user_details(user.identifier) + user.metadata = updated_user.metadata + cl.user_session.set("user", user) + + # see if user has token credits left + # if not, return message saying they have run out of tokens + if user.metadata["tokens_left"] <= 0 and "admin" not in user.metadata["role"]: + current_datetime = get_time() + cooldown, cooldown_end_time = await check_user_cooldown( + user, + current_datetime, + self.config["token_config"]["cooldown_time"], + self.config["token_config"]["tokens_left"], + self.config["token_config"]["regen_time"], + ) + if cooldown: + # get time left in cooldown + # convert both to datetime objects + cooldown_end_time = datetime.fromisoformat(cooldown_end_time).replace( + tzinfo=timezone.utc + ) + current_datetime = datetime.fromisoformat(current_datetime).replace( + tzinfo=timezone.utc + ) + cooldown_time_left = cooldown_end_time - current_datetime + # Get the total seconds + total_seconds = int(cooldown_time_left.total_seconds()) + # Calculate hours, minutes, and seconds + hours, remainder = divmod(total_seconds, 3600) + minutes, seconds = divmod(remainder, 60) + # Format the time as 00 hrs 00 mins 00 secs + formatted_time = f"{hours:02} hrs {minutes:02} mins {seconds:02} secs" + await cl.Message( + content=( + "Ah, seems like you have run out of tokens...Click " + 'here for more info. 
Please come back after {}'.format( + formatted_time + ) + ), + author=SYSTEM, + ).send() + user.metadata["in_cooldown"] = True + await update_user_info(user) + return + else: + await cl.Message( + content=( + "Ah, seems like you don't have any tokens left...Please wait while we regenerate your tokens. Click " + 'here to view your token credits.' + ), + author=SYSTEM, + ).send() + return + + user.metadata["in_cooldown"] = False + + llm_settings = cl.user_session.get("llm_settings", {}) + view_sources = llm_settings.get("view_sources", False) + stream = llm_settings.get("stream_response", False) + stream = False # Fix streaming + user_query_dict = {"input": message.content} + # Define the base configuration + cb = cl.AsyncLangchainCallbackHandler() + chain_config = { + "configurable": { + "user_id": self.user["user_id"], + "conversation_id": self.user["session_id"], + "memory_window": self.config["llm_params"]["memory_window"], + }, + "callbacks": ( + [cb] + if cl_data._data_layer and self.config["chat_logging"]["callbacks"] + else None + ), + } + + with get_openai_callback() as token_count_cb: + if stream: + res = chain.stream(user_query=user_query_dict, config=chain_config) + res = await self.stream_response(res) + else: + res = await chain.invoke( + user_query=user_query_dict, + config=chain_config, + ) + token_count += token_count_cb.total_tokens + + answer = res.get("answer", res.get("result")) + + answer_with_sources, source_elements, sources_dict = get_sources( + res, answer, stream=stream, view_sources=view_sources + ) + answer_with_sources = answer_with_sources.replace("$$", "$") + + actions = [] + + if self.config["llm_params"]["generate_follow_up"]: + cb_follow_up = cl.AsyncLangchainCallbackHandler() + config = { + "callbacks": ( + [cb_follow_up] + if cl_data._data_layer and self.config["chat_logging"]["callbacks"] + else None + ) + } + with get_openai_callback() as token_count_cb: + list_of_questions = await self.question_generator.generate_questions( + 
query=user_query_dict["input"], + response=answer, + chat_history=res.get("chat_history"), + context=res.get("context"), + config=config, + ) + + token_count += token_count_cb.total_tokens + + for question in list_of_questions: + actions.append( + cl.Action( + name="follow up question", + value="example_value", + description=question, + label=question, + ) + ) + + # # update user info with token count + tokens_left = await update_user_from_chainlit(user, token_count) + + answer_with_sources += ( + '\n\n\n" + ) + + await cl.Message( + content=answer_with_sources, + elements=source_elements, + author=LLM, + actions=actions, + ).send() + + async def on_chat_resume(self, thread: ThreadDict): + # thread_config = None + steps = thread["steps"] + k = self.config["llm_params"][ + "memory_window" + ] # on resume, alwyas use the default memory window + conversation_list = get_history_chat_resume(steps, k, SYSTEM, LLM) + # thread_config = get_last_config( + # steps + # ) # TODO: Returns None for now - which causes config to be reloaded with default values + cl.user_session.set("memory", conversation_list) + await self.start() + + @cl.header_auth_callback + def header_auth_callback(headers: dict) -> Optional[cl.User]: + # try: # TODO: Add try-except block after testing + # TODO: Implement to get the user information from the headers (not the cookie) + cookie = headers.get("cookie") # gets back a str + # Create a dictionary from the pairs + cookie_dict = {} + for pair in cookie.split("; "): + key, value = pair.split("=", 1) + # Strip surrounding quotes if present + cookie_dict[key] = value.strip('"') + + decoded_user_info = base64.b64decode( + cookie_dict.get("X-User-Info", "") + ).decode() + decoded_user_info = json.loads(decoded_user_info) + + return cl.User( + id=decoded_user_info["literalai_info"]["id"], + identifier=decoded_user_info["literalai_info"]["identifier"], + metadata=decoded_user_info["literalai_info"]["metadata"], + ) + + async def on_follow_up(self, action: 
chatbot = Chatbot(config=config)


async def start_app():
    """
    Wire the chatbot's handlers into Chainlit and set up the data layer.

    Registers starters, author renaming, chat lifecycle hooks, message
    handling, settings updates, and the follow-up action callback.
    """
    cl_data._data_layer = await setup_data_layer()
    chatbot.literal_client = cl_data._data_layer.client if cl_data._data_layer else None
    cl.set_starters(chatbot.set_starters)
    cl.author_rename(chatbot.rename)
    cl.on_chat_start(chatbot.start)
    cl.on_chat_resume(chatbot.on_chat_resume)
    cl.on_message(chatbot.main)
    cl.on_settings_update(chatbot.update_llm)
    cl.action_callback("follow up question")(chatbot.on_follow_up)


# Schedule start_app() on the active event loop when one is already running
# (e.g. under an ASGI server); otherwise run it to completion ourselves.
# Fix: asyncio.get_event_loop() is deprecated when no loop is running, so
# probe for a running loop explicitly instead.
try:
    loop = asyncio.get_running_loop()
except RuntimeError:
    loop = None

if loop is not None and loop.is_running():
    asyncio.ensure_future(start_app())
else:
    asyncio.run(start_app())
"new_idx" # str + +llm_params: + llm_arch: "langchain" # [langchain] + use_history: true # bool + generate_follow_up: false # bool + memory_window: 3 # int + llm_style: "Normal" # str [Normal, ELI5] + llm_loader: "gpt-4o-mini" # str [local_llm, gpt-3.5-turbo-1106, gpt-4, gpt-4o-mini] + openai_params: + temperature: 0.7 # float + local_llm_params: + temperature: 0.7 # float + repo_id: "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF" # HuggingFace repo id + filename: "tinyllama-1.1b-chat-v1.0.Q5_0.gguf" # Specific name of gguf file in the repo + model_path: "storage/models/tinyllama-1.1b-chat-v1.0.Q5_0.gguf" # Path to the model file + stream: false # bool + pdf_reader: "gpt" # str [llama, pymupdf, gpt] + +chat_logging: + log_chat: true # bool + platform: "literalai" + callbacks: true # bool + +splitter_options: + use_splitter: true # bool + split_by_token: true # bool + remove_leftover_delimiters: true # bool + remove_chunks: false # bool + chunking_mode: "semantic" # str [fixed, semantic] + chunk_size: 300 # int + chunk_overlap: 30 # int + chunk_separators: ["\n\n", "\n", " ", ""] # list of strings + front_chunks_to_remove: # int or None + last_chunks_to_remove: # int or None + delimiters_to_remove: ['\t', '\n', " ", " "] # list of strings diff --git a/apps/ai_tutor/config/config_manager.py b/apps/ai_tutor/config/config_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..91714efe599d778c1aab00bd54789eada402eb02 --- /dev/null +++ b/apps/ai_tutor/config/config_manager.py @@ -0,0 +1,202 @@ +from pydantic import BaseModel, conint, confloat, HttpUrl +from typing import Optional, List +import yaml +import os + + +class FaissParams(BaseModel): + index_path: str = "vectorstores/faiss.index" + index_type: str = "Flat" # Options: [Flat, HNSW, IVF] + index_dimension: int = 384 + index_nlist: int = 100 + index_nprobe: int = 10 + + +class ColbertParams(BaseModel): + index_name: str = "new_idx" + + +class VectorStoreConfig(BaseModel): + load_from_HF: bool = 
class LLMParams(BaseModel):
    """Settings selecting which LLM backend is used and how it behaves."""

    llm_arch: str = "langchain"  # Options: [langchain]
    use_history: bool = True  # include chat history in the prompt
    generate_follow_up: bool = False  # generate follow-up question suggestions
    memory_window: int = 3  # size of the conversation memory window
    llm_style: str = "Normal"  # Options: [Normal, ELI5]
    llm_loader: str = (
        "gpt-4o-mini"  # Options: [local_llm, gpt-3.5-turbo-1106, gpt-4, gpt-4o-mini]
    )
    openai_params: Optional[OpenAIParams] = None  # used when llm_loader is an OpenAI model
    local_llm_params: Optional[LocalLLMParams] = None  # used when llm_loader == "local_llm"
    stream: bool = False  # stream responses token by token
    pdf_reader: str = "gpt"  # Options: [llama, pymupdf, gpt]
last_chunks_to_remove: Optional[int] = None + delimiters_to_remove: List[str] = ["\t", "\n", " ", " "] + + +class RetrieverConfig(BaseModel): + retriever_hf_paths: dict[str, str] = {"RAGatouille": "XThomasBU/Colbert_Index"} + + +class MetadataConfig(BaseModel): + class_name: str + class_number: str + instructor_name: str + metadata_links: List[HttpUrl] = [ + "https://dl4ds.github.io/sp2024/lectures/", + "https://dl4ds.github.io/sp2024/schedule/", + ] + slide_base_link: HttpUrl = "https://dl4ds.github.io" + assignment_base_link: HttpUrl = "https://dl4ds.github.io/sp2024/assignments/" + content_types: List[str] + lectures_pattern: str + assignments_pattern: str + discussion_pattern: str + project_pattern: str + lecture_metadata_fields: List[str] + assignment_metadata_fields: List[str] + + +class TokenConfig(BaseModel): + cooldown_time: int = 60 + regen_time: int = 180 + tokens_left: int = 2000 + all_time_tokens_allocated: int = 1000000 + + +class MiscConfig(BaseModel): + github_repo: HttpUrl = "https://github.com/edubotics-ai/edubot-core" + docs_website: HttpUrl = "https://dl4ds.github.io/dl4ds_tutor/" + + +class APIConfig(BaseModel): + timeout: int = 60 + + +class Config(BaseModel): + log_dir: str = "storage/logs" + log_chunk_dir: str = "storage/logs/chunks" + device: str = "cpu" # Options: ['cuda', 'cpu'] + + vectorstore: VectorStoreConfig + llm_params: LLMParams + chat_logging: ChatLoggingConfig + splitter_options: SplitterOptions + retriever: RetrieverConfig + metadata: MetadataConfig + token_config: TokenConfig + misc: MiscConfig + api_config: APIConfig + + +class ConfigManager: + def __init__(self, config_path: str, project_config_path: str): + self.config_path = config_path + self.project_config_path = project_config_path + self.config = self.load_config() + self.validate_config() + + def load_config(self) -> Config: + with open(self.config_path, "r") as f: + config_data = yaml.safe_load(f) + + with open(self.project_config_path, "r") as f: + 
class ConfigWrapper:
    """
    Thin adapter exposing a ``Config`` model through both attribute access
    (``wrapper.llm_params``) and subscript access (``wrapper["llm_params"]``),
    plus a ``dict()`` conversion.
    """

    def __init__(self, config: Config):
        # Stored under a private name so __getattr__ (which only fires for
        # missing attributes) never recurses on it.
        self._config = config

    def __getattr__(self, name):
        # Fall through to the wrapped model for any unknown attribute.
        return getattr(self._config, name)

    def __getitem__(self, key):
        # Subscript syntax is just attribute access on the wrapped model.
        return getattr(self._config, key)

    def dict(self):
        """Serialize the wrapped pydantic model to a plain dictionary."""
        return self._config.dict()
os.getenv("HUGGINGFACE_TOKEN") +LITERAL_API_KEY_LOGGING = os.getenv("LITERAL_API_KEY_LOGGING") +LITERAL_API_URL = os.getenv("LITERAL_API_URL") +CHAINLIT_URL = os.getenv("CHAINLIT_URL") +EMAIL_ENCRYPTION_KEY = os.getenv("EMAIL_ENCRYPTION_KEY") + +OAUTH_GOOGLE_CLIENT_ID = os.getenv("OAUTH_GOOGLE_CLIENT_ID") +OAUTH_GOOGLE_CLIENT_SECRET = os.getenv("OAUTH_GOOGLE_CLIENT_SECRET") + +opening_message = "Hey, What Can I Help You With?\n\nYou can me ask me questions about the course logistics, course content, about the final project, or anything else!" +chat_end_message = ( + "I hope I was able to help you. If you have any more questions, feel free to ask!" +) + +# Model Paths + +LLAMA_PATH = "../storage/models/tinyllama" diff --git a/apps/ai_tutor/config/project_config.yml b/apps/ai_tutor/config/project_config.yml new file mode 100644 index 0000000000000000000000000000000000000000..c6510da5d1dc3ec01b071b7af672d660712a62e2 --- /dev/null +++ b/apps/ai_tutor/config/project_config.yml @@ -0,0 +1,61 @@ +retriever: + retriever_hf_paths: + RAGatouille: "XThomasBU/Colbert_Index" + +metadata: + class_name: "Intro to CS" + class_number: "CS111" + instructor_name: "Farid Karimli" + + base_link: "https://example.com/" + metadata_links: ["https://example.com/lectures/", "https://example.com/schedule/"] + slide_base_link: "https://example.com/lectures/" + + # Assignment base link is used to find the webpage where the assignment is described/posted + assignment_base_link: "https://example.com/assignments/" + + # Define content types - assignments, lectures, etc. 
+ content_types: + - "lecture" + - "assignment" + - "discussion" + - "project" + - "other" + + # These need to be patterns from URLs that can be used to identify the type of content uniquely + lectures_pattern: "/lectures/" + assignments_pattern: "/assignments/" + discussion_pattern: "/discussion/" + project_pattern: "/project/" + + # These are fields that can be extracted from the webpages of the course content + lecture_metadata_fields: + - "title" + - "tldr" + - "date" + - "lecture_recording" + - "suggested_readings" + + assignment_metadata_fields: + - "title" + - "release_date" + - "due_date" + - "source_file" + +token_config: + cooldown_time: 60 + regen_time: 180 + tokens_left: 50000 + all_time_tokens_allocated: 1000000 + +content: + notebookheaders_to_split_on: + - ["##", "Section"] + - ["#", "Title"] + +misc: + github_repo: "https://github.com/edubotics-ai/edubot-core" + docs_website: "https://edubotics.ai/docs/" + +api_config: + timeout: 60 diff --git a/apps/ai_tutor/config/prompts.py b/apps/ai_tutor/config/prompts.py new file mode 100644 index 0000000000000000000000000000000000000000..69fa4f5869c882cf0c98381fe3a055d852e5e923 --- /dev/null +++ b/apps/ai_tutor/config/prompts.py @@ -0,0 +1,109 @@ +from .config_manager import config_manager + +config = config_manager.config + +class_name = config.metadata.class_name +class_number = config.metadata.class_number +instructor_name = config.metadata.instructor_name + +prompts = { + "openai": { + "rephrase_prompt": ( + "You are someone that rephrases statements. Rephrase the student's question to add context from their chat history if relevant, ensuring it remains from the student's point of view. " + "Incorporate relevant details from the chat history to make the question clearer and more specific. " + "Do not change the meaning of the original statement, and maintain the student's tone and perspective. " + "If the question is conversational and doesn't require context, do not rephrase it. 
" + "Example: If the student previously asked about backpropagation in the context of deep learning and now asks 'what is it', rephrase to 'What is backpropagation.'. " + "Example: Do not rephrase if the user is asking something specific like 'cool, suggest a project with transformers to use as my final project' " + "Chat history: \n{chat_history}\n" + "Rephrase the following question only if necessary: '{input}'" + "Rephrased Question:'" + ), + "prompt_with_history": { + "normal": ( + f"You are an AI Tutor for the course {class_name} ({class_number}), taught by {instructor_name}. Answer the user's question using the provided context. Only use the context if it is relevant. The context is ordered by relevance. " + "If you don't know the answer, do not make things up, just say you don't know and ask the user to rephrase. Keep the conversation flowing naturally. " + "Use chat history and context as guides but avoid repeating past responses. Provide links from the source_file metadata. Use the source context that is most relevant. " + "Render math equations in LaTeX format between $ or $$ signs, stick to the parameter and variable icons found in your context. Be sure to explain the parameters and variables in the equations." + "Speak in a friendly and engaging manner, like talking to a friend. Avoid sounding repetitive or robotic.\n\n" + "Do not get influenced by the style of conversation in the chat history. Follow the instructions given here." + "Chat History:\n{chat_history}\n\n" + "Context:\n{context}\n\n" + "Answer the student's question below in a friendly, concise, and engaging manner. Use the context and history only if relevant, otherwise, engage in a free-flowing conversation.\n" + "Student: {input}\n" + "AI Tutor:" + ), + "eli5": ( + f"You are an AI Tutor for the course {class_name} ({class_number}), taught by {instructor_name}. Your job is to explain things in the simplest and most engaging way possible, just like the 'Explain Like I'm 5' (ELI5) concept." 
+ "If you don't know the answer, do your best without making things up. Keep your explanations straightforward and very easy to understand." + "Use the chat history and context to help you, but avoid repeating past responses. Provide links from the source_file metadata when they're helpful." + "Use very simple language and examples to explain any math equations, and put the equations in LaTeX format between $ or $$ signs." + "Be friendly and engaging, like you're chatting with a young child who's curious and eager to learn. Avoid complex terms and jargon." + "Include simple and clear examples wherever you can to make things easier to understand." + "Do not get influenced by the style of conversation in the chat history. Follow the instructions given here." + "Chat History:\n{chat_history}\n\n" + "Context:\n{context}\n\n" + "Answer the student's question below in a friendly, simple, and engaging way, just like the ELI5 concept. Use the context and history only if they're relevant, otherwise, just have a natural conversation." + "Give a clear and detailed explanation with simple examples to make it easier to understand. Remember, your goal is to break down complex topics into very simple terms, just like ELI5." + "Student: {input}\n" + "AI Tutor:" + ), + "socratic": ( + f"You are an AI Tutor for the course {class_name} ({class_number}), taught by {instructor_name}. Engage the student in a Socratic dialogue to help them discover answers on their own. Use the provided context to guide your questioning." + "If you don't know the answer, do your best without making things up. Keep the conversation engaging and inquisitive." + "Use chat history and context as guides but avoid repeating past responses. Provide links from the source_file metadata when relevant. Use the source context that is most relevant." + "Speak in a friendly and engaging manner, encouraging critical thinking and self-discovery." 
+ "Use questions to lead the student to explore the topic and uncover answers." + "Chat History:\n{chat_history}\n\n" + "Context:\n{context}\n\n" + "Answer the student's question below by guiding them through a series of questions and insights that lead to deeper understanding. Use the context and history only if relevant, otherwise, engage in a free-flowing conversation." + "Foster an inquisitive mindset and help the student discover answers through dialogue." + "Student: {input}\n" + "AI Tutor:" + ), + }, + "prompt_no_history": ( + f"You are an AI Tutor for the course {class_name} ({class_number}), taught by {instructor_name}. Answer the user's question using the provided context. Only use the context if it is relevant. The context is ordered by relevance. " + "If you don't know the answer, do your best without making things up. Keep the conversation flowing naturally. " + "Provide links from the source_file metadata. Use the source context that is most relevant. " + "Speak in a friendly and engaging manner, like talking to a friend. Avoid sounding repetitive or robotic.\n\n" + "Context:\n{context}\n\n" + "Answer the student's question below in a friendly, concise, and engaging manner. Use the context and history only if relevant, otherwise, engage in a free-flowing conversation.\n" + "Student: {input}\n" + "AI Tutor:" + ), + }, + "tiny_llama": { + "prompt_no_history": ( + "system\n" + f"Assistant is an intelligent chatbot designed to help students with questions regarding the course {class_name} ({class_number}), taught by {instructor_name}. Answer the user's question using the provided context. Only use the context if it is relevant. The context is ordered by relevance.\n" + "If you don't know the answer, do your best without making things up. Keep the conversation flowing naturally.\n" + "Provide links from the source_file metadata. Use the source context that is most relevant.\n" + "Speak in a friendly and engaging manner, like talking to a friend. 
def deterministic_hash(email, salt):
    """
    Derive a stable, salted hex digest for an email address.

    Uses PBKDF2-HMAC-SHA256 with 100,000 iterations, so the same
    (email, salt) pair always yields the same digest while the salt keeps
    the hashes deployment-specific.

    Args:
        email (str): The email address to hash.
        salt (bytes): Salt fed into the key-derivation function.

    Returns:
        str: Hex-encoded digest.
    """
    iterations = 100000
    digest = hashlib.pbkdf2_hmac("sha256", email.encode(), salt, iterations)
    return digest.hex()
def get_time():
    """Return the current UTC time as an ISO 8601 formatted string."""
    now_utc = datetime.now(timezone.utc)
    return now_utc.isoformat()
async def reset_tokens_for_user(user_info, TOKENS_LEFT, REGEN_TIME):
    """
    Proportionally regenerate a user's token budget based on the time
    elapsed since their last message, and persist the updated count.

    Tokens regenerate at a rate that brings the balance from its current
    value back to max_tokens over REGEN_TIME seconds.

    Args:
        user_info: User record (object or dict); normalized to a dict via
            convert_to_dict before use.
        TOKENS_LEFT: Default token budget, used both as the cap on the
            current balance and as the fallback for max_tokens.
        REGEN_TIME: Seconds for a depleted budget to fully regenerate.

    Note:
        The record is only persisted (via update_user_info) when the user
        is below max_tokens; otherwise this function is a no-op.
    """
    user_info = convert_to_dict(user_info)
    last_message_time_str = user_info["metadata"].get("last_message_time")

    try:
        last_message_time = datetime.fromisoformat(last_message_time_str).replace(
            tzinfo=timezone.utc
        )
    except Exception:  # this probably means the user has never sent a message before
        # No parsable timestamp: treat the last message as infinitely old so
        # the budget regenerates fully.
        last_message_time = datetime.min.replace(tzinfo=timezone.utc)

    current_time = datetime.fromisoformat(get_time()).replace(tzinfo=timezone.utc)

    # Calculate the elapsed time since the last message
    elapsed_time_in_seconds = (current_time - last_message_time).total_seconds()

    # Current token count (can be negative)
    current_tokens = user_info["metadata"].get("tokens_left_at_last_message", 0)
    current_tokens = min(current_tokens, TOKENS_LEFT)

    # Maximum tokens that can be regenerated
    max_tokens = user_info["metadata"].get("max_tokens", TOKENS_LEFT)

    # Calculate how many tokens should have been regenerated proportionally
    if current_tokens < max_tokens:
        # Calculate the regeneration rate per second based on REGEN_TIME for full regeneration
        # If current_tokens is close to 0, then the regeneration rate is relatively high, and if current_tokens is close to max_tokens, then the regeneration rate is relatively low
        regeneration_rate_per_second = (
            max_tokens - max(current_tokens, 0)
        ) / REGEN_TIME

        # Calculate how many tokens should have been regenerated based on the elapsed time
        tokens_to_regenerate = int(
            elapsed_time_in_seconds * regeneration_rate_per_second
        )

        # Ensure the new token count does not exceed max_tokens
        new_token_count = min(current_tokens + tokens_to_regenerate, max_tokens)

        # Update the user's token count
        user_info["metadata"]["tokens_left"] = new_token_count

        await update_user_info(user_info)
b/apps/ai_tutor/public/assets/images/starter_icons/adv-screen-recorder-svgrepo-com.svg new file mode 100644 index 0000000000000000000000000000000000000000..439e638eaf976a2278a2aa043de2feb2d65c3862 --- /dev/null +++ b/apps/ai_tutor/public/assets/images/starter_icons/adv-screen-recorder-svgrepo-com.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/apps/ai_tutor/public/assets/images/starter_icons/alarmy-svgrepo-com.svg b/apps/ai_tutor/public/assets/images/starter_icons/alarmy-svgrepo-com.svg new file mode 100644 index 0000000000000000000000000000000000000000..d2dee1b557beab61067452d4700fa1d62ba0b0e8 --- /dev/null +++ b/apps/ai_tutor/public/assets/images/starter_icons/alarmy-svgrepo-com.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/apps/ai_tutor/public/assets/images/starter_icons/calendar-samsung-17-svgrepo-com.svg b/apps/ai_tutor/public/assets/images/starter_icons/calendar-samsung-17-svgrepo-com.svg new file mode 100644 index 0000000000000000000000000000000000000000..8f9caa7ac74d4ea04369169e830e4042b267de89 --- /dev/null +++ b/apps/ai_tutor/public/assets/images/starter_icons/calendar-samsung-17-svgrepo-com.svg @@ -0,0 +1,36 @@ + + + + \ No newline at end of file diff --git a/apps/ai_tutor/public/files/students_encrypted.json b/apps/ai_tutor/public/files/students_encrypted.json new file mode 100644 index 0000000000000000000000000000000000000000..4a337d5d0de26119e3d7956b48d774c1b792b4d1 --- /dev/null +++ b/apps/ai_tutor/public/files/students_encrypted.json @@ -0,0 +1 @@ +{"0645db6f7b415e3b04a4fc327151c3c7bbcd25ec546ee0b3604957b571a79bc2": ["instructor", "bu"], "51ebf87ac51618300acfef8bfa9768fdee40e2d3f39cfb4ae8a76722ee336de4": ["admin", "instructor", "bu"], "7810b25bef84317130e2a59da978ee716bb96f6a8a9296c051b7ad4108aa8e6a": ["instructor", "bu"], "a95f36e2700c554639d3522834b47733f5ed1f05c5a43d04ac2575571dd43563": ["student", "bu"]} \ No newline at end of file diff --git a/apps/ai_tutor/public/files/test.css 
b/apps/ai_tutor/public/files/test.css new file mode 100644 index 0000000000000000000000000000000000000000..dc4787b22a872e4050074bb2854632dd4b0b9e80 --- /dev/null +++ b/apps/ai_tutor/public/files/test.css @@ -0,0 +1,32 @@ +a[href*='https://github.com/Chainlit/chainlit'] { + visibility: hidden; +} + +/* Hide the default avatar image */ +.MuiAvatar-root img.MuiAvatar-img { + display: none; + } + +/* Target the container of the image and set a custom background image */ +.MuiAvatar-root.MuiAvatar-circular.css-m2icte { + background-image: url('/public/assets/images/avatars/ai-tutor.png'); /* Replace with your custom image URL */ + background-size: cover; /* Ensure the image covers the entire container */ + background-position: center; /* Center the image */ + width: 100px; /* Ensure the dimensions match the original */ + height: 100px; /* Ensure the dimensions match the original */ + border-radius: 50%; /* Maintain circular shape */ +} +.MuiAvatar-root.MuiAvatar-circular.css-v72an7 { + background-image: url('/public/assets/images/avatars/ai-tutor.png'); /* Replace with your custom image URL */ + background-size: cover; /* Ensure the image covers the entire container */ + background-position: center; /* Center the image */ + width: 40px; /* Ensure the dimensions match the original */ + height: 40px; /* Ensure the dimensions match the original */ + border-radius: 50%; /* Maintain circular shape */ +} + +.MuiStack-root.css-14k6mw7 img { + content: url('/public/assets/images/avatars/ai-tutor.png'); /* Replace with the path to your custom image */ + max-height: 45px; /* Ensure the height remains consistent */ + max-width: 45px; /* Ensure the width remains consistent */ +} \ No newline at end of file diff --git a/apps/ai_tutor/public/logo_dark.png b/apps/ai_tutor/public/logo_dark.png new file mode 100644 index 0000000000000000000000000000000000000000..ca144c996c7ed7baef6c2d1addc6912131e0d9cd --- /dev/null +++ b/apps/ai_tutor/public/logo_dark.png @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:a899bc45dc6d84ef265aecdc21fbfb085c2fc0c16760107c5e782811b19ddfb7 +size 168742 diff --git a/apps/ai_tutor/public/logo_light.png b/apps/ai_tutor/public/logo_light.png new file mode 100644 index 0000000000000000000000000000000000000000..ca144c996c7ed7baef6c2d1addc6912131e0d9cd --- /dev/null +++ b/apps/ai_tutor/public/logo_light.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a899bc45dc6d84ef265aecdc21fbfb085c2fc0c16760107c5e782811b19ddfb7 +size 168742 diff --git a/apps/ai_tutor/storage/data/urls.txt b/apps/ai_tutor/storage/data/urls.txt new file mode 100644 index 0000000000000000000000000000000000000000..340efca5b9f59ff4d4a4937ea82a439117e5d93e --- /dev/null +++ b/apps/ai_tutor/storage/data/urls.txt @@ -0,0 +1 @@ +https://www.cs.bu.edu/courses/cs111/ \ No newline at end of file diff --git a/apps/ai_tutor/templates/cooldown.html b/apps/ai_tutor/templates/cooldown.html new file mode 100644 index 0000000000000000000000000000000000000000..a735a21a1b2e47d02b7cd0081102f29c588ad98c --- /dev/null +++ b/apps/ai_tutor/templates/cooldown.html @@ -0,0 +1,181 @@ + + +
+ + +It seems like you need to wait a bit before starting a new session.
+ + +Tokens Left: {{ tokens_left }}
+ + + +Ready to start your AI tutoring session?
+Tokens Left: {{ tokens_left }}
+All-Time Tokens Allocated: {{ all_time_tokens_allocated }} / {{ total_tokens_allocated }}
+ + + +An unexpected error occurred. The details are below:
+{{ error }}
+ To get back to reality, click the button below.
+ ++ Welcome to the AI Tutor for {{CLASS_METADATA.class_number}} - + {{CLASS_METADATA.class_name}}. Please sign in to continue. +
+ + +Logging out... If you are not redirected, click here.
+ + \ No newline at end of file diff --git a/apps/ai_tutor/templates/unauthorized.html b/apps/ai_tutor/templates/unauthorized.html new file mode 100644 index 0000000000000000000000000000000000000000..423cc12cd2195bc9b44e352ba365d28eee7a1858 --- /dev/null +++ b/apps/ai_tutor/templates/unauthorized.html @@ -0,0 +1,94 @@ + + + + + +
+ We're currently testing things out for the DS598 course.
+ Access is restricted to students of the course. If you're enrolled in DS598 and seeing this message,
+ please reach out to us, and we'll help you get access.
+ P.S. Don't forget to use your BU email when logging in!
+