from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import torch
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from textwrap import fill
import gradio as gr
import time
# Loading the PDF files describing each master's programme
pdf_paths = [
    "AAUs_Cand_Merc_Masters_Chatbot/Business_Data_Science_Aalborg_University.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Business_Data_Science_Curriculum.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Finance_Aalborg_University.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Finance_Curriculum.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Innovation_Management_Aalborg_University.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Innovation_Management_Curriculumn_2023.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/International_Business_Aalborg_University.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/International_Business_Curriculum.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Marketing_and_Sales_Aalborg_University.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Marketing_and_Sales_Curriculumn.pdf",
]
# PyPDFLoader takes a single file path, so load each PDF in turn and collect the pages
docs = []
for path in pdf_paths:
    docs.extend(PyPDFLoader(path).load())
# Splitting the text into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
texts = text_splitter.split_documents(docs)
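# Optional sanity check: confirm how many pages were loaded and how many chunks they were split into
print(f"Loaded {len(docs)} pages, split into {len(texts)} chunks")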
# Creating embeddings
embeddings = HuggingFaceEmbeddings(
    model_name="thenlper/gte-large",
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},
)
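# Quick check that the embedding model responds; gte-large should return a 1024-dimensional vector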
query_result = embeddings.embed_query(texts[0].page_content)
# Saving the embeddings in the Chroma database
db = Chroma.from_documents(texts, embeddings, persist_directory="db")
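# persist_directory="db" keeps the index on disk; with older langchain/Chroma versions, db.persist()
# can be called to force a flush so the index can be reloaded later without re-embedding the documents.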
results = db.similarity_search("Transformer models", k=2)
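# Optional: inspect what the store returns for a test query, e.g.
# for doc in results:
#     print(doc.metadata.get("source"), "->", doc.page_content[:200])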
# Loading the transformer model
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, torch_dtype=torch.float16, trust_remote_code=True, device_map="auto"
)
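# Note: loading the 7B model in float16 needs roughly 15 GB of GPU memory; on smaller GPUs,
# 4-bit quantization via bitsandbytes (load_in_4bit=True) is a common alternative, not used here.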
# Create a configuration for text generation based on the specified model name
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
# Set the maximum number of new tokens in the generated text to 1024.
# This limits the length of the generated output to 1024 tokens.
generation_config.max_new_tokens = 1024
# Set the temperature for text generation. Lower values (e.g., 0.0001) make output more deterministic, following likely predictions.
# Higher values make the output more random.
generation_config.temperature = 0.0001
# Set the top-p sampling value. A value of 0.95 means focusing on the most likely words that make up 95% of the probability distribution.
generation_config.top_p = 0.95
# Enable text sampling. When set to True, the model randomly selects words based on their probabilities, introducing randomness.
generation_config.do_sample = True
# Set the repetition penalty. A value of 1.15 discourages the model from repeating the same words or phrases too frequently in the output.
generation_config.repetition_penalty = 1.15
# Create a text generation pipeline using the initialized model, tokenizer, and generation configuration
text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    generation_config=generation_config,
)
# Wrap the text-generation pipeline as a LangChain LLM; model_kwargs passes an additional temperature setting
llm = HuggingFacePipeline(pipeline=text_pipeline, model_kwargs={"temperature": 0})
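# Optional smoke test of the bare LLM (no retrieval), e.g.:
# print(fill(llm("What is a master's curriculum?"), width=100))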
template_3 = """
<s>[INST] <<SYS>>
Act as a student counselor at Aalborg University Business School and answer the question at the end.
The answer should be about the master programs found in the provided documents ONLY.
The answer should be MAXIMUM 40 words.
Use the examples in {context} to generate the answer, without directly mentioning any of it.
<</SYS>>
{context}
N-shot prompting:
N-1
Q: How do I find out what master's degree I want to study?
A: To determine which master's degree you would like to study, you should consider which business-related modules are within your interest. Which modules from the bachelor's degree did you find interesting?
N-2
Q: I liked the modules [input] in the bachelor, what master's could be relevant for me?
A: Based on your interests in [input], it may be beneficial to consider studying [output].
The curriculum for this program includes several modules that align with your
interests.
ReAct prompting:
Q: "How do I find out what master's degree I want to study?"
A: "To determine which master's degree you would like to study, you should consider which business-related modules are within your interest.
Which modules from the bachelor's degree did you find interesting?"
Q: "I liked macroeconomics and organisation"
A: "Based on your interests in macroeconomics and organizations, it may be
beneficial to consider studying the Master of Science (MSc) in Economics and
Business Administration (Finance) program at Aalborg University Business School.
The curriculum for this program includes several modules that align with your
interests, such as "Network Theory and Analysis" and "Data-Driven Business
Modeling and Strategy". These modules cover topics related to macroeconomics and
organizational behavior, providing you with valuable insights and skills that
could help you achieve your career goals. Additionally, the program offers an
application-focused approach, allowing you to apply your knowledge to real-world
problems and develop practical solutions."
Feedback: The advice should focus on unique modules in the 1st and 2nd semester of each master's, as the 3rd-semester modules are elective options shared by all masters.
{question} [/INST]
"""
prompt_3 = PromptTemplate(template=template_3, input_variables=["context", "question"])
qa_chain_3 = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 2}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt_3},
)
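# Optional: exercise the chain directly before wiring it into the UI, e.g.
# out = qa_chain_3("Which modules does the Finance programme cover in the first semester?")
# print(fill(out["result"].strip(), width=100))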
# Simple helper for testing the chain outside the UI (the Gradio callback below supersedes it)
def reply_bot(txt):
    bot_result = qa_chain_3(txt)
    return bot_result["result"].strip()
bot_name = "Master Supervisor"
with gr.Blocks() as demo:
    gr.Markdown("### Master's Degree Program Advisor")
    gr.Markdown("I can help you find the master's degree program that's right for you. Ask me any question related to choosing a master's program.")
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.ClearButton([msg, chatbot])
    def reply_bot(message, chat_history):
        bot_result = qa_chain_3(message)
        chat_history.append((message, bot_result["result"].strip()))
        time.sleep(2)
        return "", chat_history
    msg.submit(reply_bot, [msg, chatbot], [msg, chatbot])
demo.queue().launch(share=True)