from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import torch
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from textwrap import fill
import gradio as gr
import time
# Loading the PDF files describing each master's programme
pdf_paths = [
    "AAUs_Cand_Merc_Masters_Chatbot/Business_Data_Science_Aalborg_University.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Business_Data_Science_Curriculum.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Finance_Aalborg_University.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Finance_Curriculum.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Innovation_Management_Aalborg_University.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Innovation_Management_Curriculumn_2023.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/International_Business_Aalborg_University.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/International_Business_Curriculum.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Marketing_and_Sales_Aalborg_University.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Marketing_and_Sales_Curriculumn.pdf",
]
# PyPDFLoader takes a single file path, so load each PDF in turn and collect the pages
docs = []
for path in pdf_paths:
    docs.extend(PyPDFLoader(path).load())
# Splitting the text into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
texts = text_splitter.split_documents(docs)
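# Optional sanity check: confirm how many pages were loaded and how many chunks they were split into
print(f"Loaded {len(docs)} pages, split into {len(texts)} chunks")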
# Creating embeddings
embeddings = HuggingFaceEmbeddings(
    model_name="thenlper/gte-large",
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},
)
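# Quick check that the embedding model responds; gte-large should return a 1024-dimensional vector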
query_result = embeddings.embed_query(texts[0].page_content)
# Saving the embeddings in the Chroma database
db = Chroma.from_documents(texts, embeddings, persist_directory="db")
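# persist_directory="db" keeps the index on disk; with older langchain/Chroma versions, db.persist()
# can be called to force a flush so the index can be reloaded later without re-embedding the documents.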
results = db.similarity_search("Transformer models", k=2)
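# Optional: inspect what the store returns for a test query, e.g.
# for doc in results:
#     print(doc.metadata.get("source"), "->", doc.page_content[:200])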
# Loading the transformer model
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, torch_dtype=torch.float16, trust_remote_code=True, device_map="auto"
)
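# Note: loading the 7B model in float16 needs roughly 15 GB of GPU memory; on smaller GPUs,
# 4-bit quantization via bitsandbytes (load_in_4bit=True) is a common alternative, not used here.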
# Create a configuration for text generation based on the specified model name
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
# Set the maximum number of new tokens in the generated text to 1024.
# This limits the length of the generated output to 1024 tokens.
generation_config.max_new_tokens = 1024
# Set the temperature for text generation. Lower values (e.g., 0.0001) make output more deterministic, following likely predictions.
# Higher values make the output more random.
generation_config.temperature = 0.0001
# Set the top-p sampling value. A value of 0.95 means focusing on the most likely words that make up 95% of the probability distribution.
generation_config.top_p = 0.95
# Enable text sampling. When set to True, the model randomly selects words based on their probabilities, introducing randomness.
generation_config.do_sample = True
# Set the repetition penalty. A value of 1.15 discourages the model from repeating the same words or phrases too frequently in the output.
generation_config.repetition_penalty = 1.15
# Create a text generation pipeline using the initialized model, tokenizer, and generation configuration
text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    generation_config=generation_config,
)
# Wrap the text-generation pipeline as a LangChain LLM; model_kwargs passes an additional temperature setting
llm = HuggingFacePipeline(pipeline=text_pipeline, model_kwargs={"temperature": 0})
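# Optional smoke test of the bare LLM (no retrieval), e.g.:
# print(fill(llm("What is a master's curriculum?"), width=100))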
template_3 = """
<s>[INST] <<SYS>>
Act as a student counselor at Aalborg University Business School and answer the question at the end.
The answer should be about the master programs found in the provided documents ONLY.
The answer should be MAXIMUM 40 words.
Use the examples in {context} to generate the answer, without directly mentioning any of it.
<</SYS>>
{context}
N-shot prompting:
N-1
Q: How do I find out what master's degree I want to study?
A: To determine which master's degree you would like to study, you should consider which business-related modules are within your interest. Which modules from the bachelor's degree did you find interesting?
N-2
Q: I liked the modules [input] in the bachelor, what master's could be relevant for me?
A: Based on your interests in [input], it may be beneficial to consider studying [output].
The curriculum for this program includes several modules that align with your
interests.
ReAct prompting:
Q: "How do I find out what master's degree I want to study?"
A: "To determine which master's degree you would like to study, you should consider which business-related modules are within your interest.
Which modules from the bachelor's degree did you find interesting?"
Q: "I liked macroeconomics and organisation"
A: "Based on your interests in macroeconomics and organizations, it may be
beneficial to consider studying the Master of Science (MSc) in Economics and
Business Administration (Finance) program at Aalborg University Business School.
The curriculum for this program includes several modules that align with your
interests, such as "Network Theory and Analysis" and "Data-Driven Business
Modeling and Strategy". These modules cover topics related to macroeconomics and
organizational behavior, providing you with valuable insights and skills that
could help you achieve your career goals. Additionally, the program offers an
application-focused approach, allowing you to apply your knowledge to real-world
problems and develop practical solutions."
Feedback: The advice should focus on unique modules in the 1st and 2nd semester of each master's, as the 3rd-semester modules are elective options shared by all masters.
{question} [/INST]
"""
prompt_3 = PromptTemplate(template=template_3, input_variables=["context", "question"])
qa_chain_3 = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 2}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt_3},
)
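# Optional: exercise the chain directly before wiring it into the UI, e.g.
# out = qa_chain_3("Which modules does the Finance programme cover in the first semester?")
# print(fill(out["result"].strip(), width=100))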
# Simple helper for testing the chain outside the UI (the Gradio callback below supersedes it)
def reply_bot(txt):
    bot_result = qa_chain_3(txt)
    return bot_result["result"].strip()
bot_name = "Master Supervisor"
with gr.Blocks() as demo:
    gr.Markdown("### Master's Degree Program Advisor")
    gr.Markdown("I can help you find the master's degree program that's right for you. Ask me any question related to choosing a master's program.")
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.ClearButton([msg, chatbot])
    def reply_bot(message, chat_history):
        bot_result = qa_chain_3(message)
        chat_history.append((message, bot_result["result"].strip()))
        time.sleep(2)
        return "", chat_history
    msg.submit(reply_bot, [msg, chatbot], [msg, chatbot])
demo.queue().launch(share=True)