"""Gradio demo of a retrieval-augmented course recommendation system.

Two pre-built FAISS indexes (course CSV data and curriculum web pages) are
loaded from ``./data``.  For each question, documents are retrieved from both
indexes, an LLM answers from the retrieved context several times, and the
individual answers are condensed into one summary that is shown in the UI.
"""

# Imports
import getpass
import os
import pickle
import time  # the module; the original's shadowed ``from time import time`` was dead

import gradio as gr
import numpy as np
import pandas as pd
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.document_loaders import CSVLoader
from langchain.document_loaders import TextLoader
from langchain.prompts import PromptTemplate  # rebinds the name imported above
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAI
from langchain_openai import OpenAIEmbeddings
# Must stay after the langchain_openai import: ``OpenAI`` below has to be the
# openai-package client, which is what ``client`` is built from.
from openai import OpenAI

# Fail early (with the same exception type as the original bare lookup) when
# the OpenAI key is not configured.
if "OPENAI_API_KEY" not in os.environ:
    raise KeyError("OPENAI_API_KEY")

# Password for the Gradio login; None when HF_TOKEN is not set.
read_key = os.environ.get('HF_TOKEN', None)

CHAT_MODEL_NAME = "gpt-3.5-turbo-0125"
# CHAT_MODEL_NAME = "gpt-4-turbo"

client = OpenAI()

# embedding = GPT4AllEmbeddings()
embedding = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf")


def get_openai_embedding(text):
    """Return the OpenAI ``text-embedding-3-small`` embedding for *text*.

    Uses the module-level ``client`` (openai v1 interface), consistent with
    the chat-completion call in ``summarize_response``.  The original version
    referenced an ``openai`` module that was never imported.

    Args:
        text: The string to embed.

    Returns:
        The embedding vector of the first (only) result.
    """
    response = client.embeddings.create(
        input=text,
        model="text-embedding-3-small",
    )
    return response.data[0].embedding


# CSV
# ========================================
# save
db_path_csv = "./data/penn_course_csv"
# vectordb_faiss_csv.save_local(db_path_csv)

# load
# NOTE: allow_dangerous_deserialization unpickles the stored index; only load
# index files that this project produced itself.
vectorstore_faiss_csv = FAISS.load_local(
    db_path_csv, embedding, allow_dangerous_deserialization=True
)
# vectorstore_faiss_csv = FAISS.load_local(db_path_csv, embedding_function=get_openai_embedding, allow_dangerous_deserialization=True)
# ========================================

# WEB
# ========================================
# save
db_path_web = "./data/penn_curriculum_web"
# vectordb_faiss_web.save_local(db_path_web)

# load
vectorstore_faiss_web = FAISS.load_local(
    db_path_web, embedding, allow_dangerous_deserialization=True
)
# vectorstore_faiss_web = FAISS.load_local(db_path_web, embedding_function=get_openai_embedding, allow_dangerous_deserialization=True)
# ========================================

llm = ChatOpenAI(model_name=CHAT_MODEL_NAME, temperature=0)
# llm = ChatOpenAI(model_name="gpt-4-turbo",temperature=0)

# Setting up separate retriever
retriever_csv = MultiQueryRetriever.from_llm(
    retriever=vectorstore_faiss_csv.as_retriever(
        search_type="mmr", search_kwargs={"k": 35}
    ),
    llm=llm,
)

retriever_web = MultiQueryRetriever.from_llm(
    retriever=vectorstore_faiss_web.as_retriever(
        search_type="mmr", search_kwargs={"k": 2}
    ),
    llm=llm,
)

# The declared variables must match the placeholders in the template and the
# keys passed to ``qa_chain.invoke`` ("query" and "context"); the original
# declared "contexts", which matched neither.
qa_prompt = PromptTemplate(
    input_variables=['query', 'context'],
    template="""
You are a course recommendation system that analyze user's interest and query the vector database to create a personalized course recommendation for the user.
Answer the question based on the context below where the context is the most similar result of the courses that matches the user query.
Generate as detailed and accurate response as possible and do not limite the number of responses.
If the question cannot be answered using the information provided answer with 'I don't know'
Make sure to include course code, title, description and reasoning for recommending the course in the answer.

Context: {context}

Question: {query},
""",
)

qa_chain = LLMChain(llm=llm, prompt=qa_prompt)

# System instruction used when condensing the individual answers.
_SUMMARY_SYSTEM_PROMPT = """
Please summarize the following input into a detailed statement that captures the most important information.
Make sure to extract the all the course code and course title as well as their prerequisite information as well as the reasoning for suggesting the courses that were recommended from the given string.
Do not consider 'I don't know' response but make sure to include as many responses as possible in the summary.
Make sure the tone of the summary is as a school councelor but do not include phrase such as 'as a school councelor' or 'as a councelor'.
Provide the summary in extensive detail and organized.
"""


def summarize_response(strings):
    """Condense several answer strings into one summary via the chat model.

    Args:
        strings: Iterable of answer strings; they are joined with newlines
            and sent as the user message.

    Returns:
        The text content of the first completion choice.
    """
    combined_string = '\n'.join(strings)
    response = client.chat.completions.create(
        model=CHAT_MODEL_NAME,
        messages=[
            {"role": "system", "content": _SUMMARY_SYSTEM_PROMPT},
            {"role": "user", "content": combined_string},
        ],
    )
    return response.choices[0].message.content


def query_retriever_mod_question_seperate(question, n=5):
    """Answer *question* ``n`` times from retrieved context and summarize.

    Each iteration retrieves documents from the CSV and the web retriever,
    merges them, and runs the QA chain on the merged context.

    Args:
        question: The user's question.
        n: Number of retrieve-and-answer iterations (default 5).

    Returns:
        A tuple ``(summary, retrieved_answers, docs_csv, docs_web)`` where
        ``docs_csv`` and ``docs_web`` come from the last iteration.  As in
        the original code, ``docs_csv`` also contains the web documents,
        because the two lists are merged in place.  Both lists are empty
        when ``n`` is 0 (the original raised on unbound locals in that case).
    """
    retrieved_answers = []
    # Pre-bind so the return statement is valid even when the loop never runs.
    docs_csv = []
    docs_web = []

    for i in range(n):
        print(f"{i+1} iteration")
        # revised_question = gpt_improved_query(question)
        revised_question = question
        # print(f"improved question is \n {revised_question}")
        start = time.perf_counter()

        # first get relevant doc from csv
        docs_csv = retriever_csv.get_relevant_documents(query=revised_question)
        # second get relevant doc from curriculum
        docs_web = retriever_web.get_relevant_documents(query=revised_question)
        # Merge in place: docs_csv now holds the CSV and web documents.
        docs_csv.extend(docs_web)

        out = qa_chain.invoke(
            input={
                "query": revised_question,
                "context": "\n---\n".join(d.page_content for d in docs_csv),
            }
        )
        retrieved_answers.append(out["text"])

        end = time.perf_counter()
        print("completed", "time:", end - start, "sec")

    summary = summarize_response(retrieved_answers)
    return summary, retrieved_answers, docs_csv, docs_web


def call_rag(question):
    """Gradio callback: return only the summary for *question*."""
    # Local names no longer shadow the builtins ``sum`` and ``list``.
    summary, _answers, _docs_csv, _docs_web = (
        query_retriever_mod_question_seperate(question)
    )
    return summary


interface = gr.Interface(
    fn=call_rag,
    inputs="text",
    outputs="text",
    title="RAG Demo System - Penn Course Recommendation",
    description="""
    Try input below example prompts in the model!
    Example prompt: \n
    1. I want to major in Design. Can you provide all the required courses for the major?
    2. I want to major in Computer Science. Can you provide all the required courses for the major?
    """,
)

# Refuse to start the login-protected UI with an undefined password.
if read_key is None:
    raise RuntimeError(
        "HF_TOKEN is not set; it is required as the password for the demo login."
    )

interface.launch(auth=('user', read_key))