File size: 7,116 Bytes
c19f439
6d88f6f
 
 
 
 
 
 
 
 
 
 
 
 
 
999bfbd
 
 
 
 
 
 
 
 
 
6d88f6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import torch
from langchain import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline
from langchain.chains import RetrievalQA
from langchain import PromptTemplate
from textwrap import fill
import gradio as gr
import time


# Load the source PDFs (paths are relative to the working directory).
# PyPDFLoader handles exactly one file per instance: extra positional
# arguments are not treated as further documents, so every path gets its own
# loader and the pages of all files are collected into a single list.
PDF_PATHS = (
    "AAUs_Cand_Merc_Masters_Chatbot/Business_Data_Science_Aalborg_University.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Business_Data_Science_Curriculum.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Finance_Aalborg_University.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Finance_Curriculum.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Innovation_Management_Aalborg_University.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Innovation_Management_Curriculumn_2023.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/International_Business_Aalborg_University.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/International_Business_Curriculum.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Marketing_and_Sales_Aalborg_University.pdf",
    "AAUs_Cand_Merc_Masters_Chatbot/Marketing_and_Sales_Curriculumn.pdf",
)

docs = []
for pdf_path in PDF_PATHS:
    # `loader` stays bound at module level (to the last file's loader).
    loader = PyPDFLoader(pdf_path)
    docs.extend(loader.load())

# Split the loaded pages into smaller, slightly overlapping chunks so each
# chunk can be embedded and later inserted into the prompt on its own.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
texts = text_splitter.split_documents(docs)
if not texts:
    # Fail with a clear message instead of an IndexError further down.
    raise ValueError("No text chunks were produced from the loaded PDF documents.")

# Embed on the GPU when one is available; fall back to the CPU rather than
# crashing at start-up on hosts without CUDA.
EMBEDDING_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
embeddings = HuggingFaceEmbeddings(
    model_name="thenlper/gte-large",
    model_kwargs={"device": EMBEDDING_DEVICE},
    encode_kwargs={"normalize_embeddings": True},
)

# Start-up smoke check of the embedding model on the first chunk.
# The value is not used anywhere else in this file.
query_result = embeddings.embed_query(texts[0].page_content)

# Index all chunks in a Chroma vector store persisted under ./db.
# NOTE(review): if ./db survives restarts, from_documents presumably adds the
# same chunks again on every start -- confirm and switch to loading the
# existing store if so.
db = Chroma.from_documents(texts, embeddings, persist_directory="db")

# Retrieval smoke check; `results` is not used anywhere else in this file.
results = db.similarity_search("Transformer models", k=2)

# Language model that writes the answers.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Weights are loaded in float16; device placement is left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
)

# Start from the generation defaults published with the model, then override:
#   max_new_tokens      - cap on the number of newly generated tokens
#   temperature         - near zero, so sampling follows the most likely tokens
#   top_p               - sample only from the top 95% of probability mass
#   do_sample           - sampling enabled (made near-deterministic by the
#                         tiny temperature)
#   repetition_penalty  - discourages repeating the same words or phrases
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
_GENERATION_OVERRIDES = {
    "max_new_tokens": 1024,
    "temperature": 0.0001,
    "top_p": 0.95,
    "do_sample": True,
    "repetition_penalty": 1.15,
}
for _option, _value in _GENERATION_OVERRIDES.items():
    setattr(generation_config, _option, _value)

# Transformers text-generation pipeline built from the objects above.
text_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    generation_config=generation_config,
)

# LangChain wrapper around the pipeline so it can be used as the chain's LLM.
# NOTE(review): model_kwargs asks for temperature 0 while the generation
# config above uses 0.0001 with sampling enabled -- confirm which value the
# wrapper actually applies.
llm = HuggingFacePipeline(
    pipeline=text_pipeline,
    model_kwargs={"temperature": 0},
)

# Prompt for the QA chain. It has two placeholders:
#   {context}  - the retrieved document text, inserted exactly once, after the
#                system instructions (the instruction sentence refers to it in
#                words instead of embedding the whole text mid-sentence)
#   {question} - the user's question, placed last inside the [INST] block
# Below the context the prompt carries few-shot examples and one worked
# dialogue with feedback to steer the style of the answer.
# NOTE(review): the <<SYS>> markers and the literal <s> token follow the
# Llama-2 chat layout; confirm they are intended for the Mistral model.
template_3 = """
<s>[INST] <<SYS>>
Act as a student counselor at Aalborg University Business School and answer the question at the end.
The answer should be about the master programs found in the provided documents ONLY.
The answer should be MAXIMUM 40 words.
Use the context provided below to generate the answer, without directly mentioning any of it.

<</SYS>>

{context}

N-shot prompting:
N-1
Q: How do I find out what masters degree I want to study
A: To determine which master's degree you would like to study, you should consider which business-related modules are within your interest, which modeules from the bachelor's degree did you find intresting?

N-2
Q: I liked the modules [input] in the bachelor, what masters could be relevant for me?
A: Based on your interests in [input], it may be beneficial to consider studying [output].
The curriculum for this program includes several modules that align with your
interests.

ReAct prompting:
Q: "how do i find out what masters degree i want to study"
A: “To determine which master's degree you would like to study, you should consider which business-related modules are within your interest,
which modules from the bachelor's degree did you find interesting?
Q: "I liked macro economics and organisation"
A: “Based on your interests in macroeconomics and organizations, it may be
beneficial to consider studying the Master of Science (MSc) in Economics and
Business Administration (Finance) program at Aalborg University Business School.
The curriculum for this program includes several modules that align with your
interests, such as "Network Theory and Analysis" and "Data-Driven Business
Modeling and Strategy". These modules cover topics related to macroeconomics and
organizational behavior, providing you with valuable insights and skills that
could help you achieve your career goals. Additionally, the program offers an
application-focused approach, allowing you to apply your knowledge to real-world
problems and develop practical solutions.”
Feedback: The advice should focus on unique modules in the 1st and 2nd semester for each master, as the 3rd semester modules are elective options for all masters.

{question} [/INST]
"""

prompt_3 = PromptTemplate(template=template_3, input_variables=["context", "question"])


# Retrieval QA chain: the retriever fetches 2 chunks from the vector store
# per query, the "stuff" chain type feeds them to the LLM through prompt_3,
# and the source documents are returned together with the answer.
_doc_retriever = db.as_retriever(search_kwargs={"k": 2})
qa_chain_3 = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=_doc_retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt_3},
    return_source_documents=True,
)

def reply_bot(txt):
    """Run the QA chain on ``txt`` and return the answer text, stripped.

    NOTE: this module-level name is rebound further down by the chat handler
    defined inside the ``gr.Blocks`` context, because a ``with`` block does
    not open a new scope.
    """
    answer = qa_chain_3(txt)["result"]
    return answer.strip()

bot_name = "Master Supervisor"

# Gradio chat UI: a chat window, a text box for the question and a button
# that clears both.
with gr.Blocks() as demo:
    gr.Markdown("### Master's Degree Program Advisor")
    gr.Markdown("I can help you find the master's degree program that's right for you. Ask me any question related to choosing a master's program.")

    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.ClearButton([msg, chatbot])

    # NOTE: a `with` block does not open a new scope, so this definition
    # rebinds the module-level name `reply_bot`.
    def reply_bot(message, chat_history):
        """Answer ``message`` with the QA chain and append the turn to the history.

        Args:
            message: Text submitted from the text box.
            chat_history: List of (user message, bot answer) pairs shown in
                the chat window; a missing history is treated as empty.

        Returns:
            A pair ``("", chat_history)``: the empty string clears the text
            box and the history refreshes the chat window.
        """
        chat_history = chat_history or []
        # Do not spend an LLM call on blank input; just clear the text box.
        if not message or not message.strip():
            return "", chat_history
        bot_result = qa_chain_3(message)
        chat_history.append((message, bot_result["result"].strip()))
        return "", chat_history

    # Submitting the text box sends (msg, chatbot) to the handler and writes
    # its two return values back to (msg, chatbot).
    msg.submit(reply_bot, [msg, chatbot], [msg, chatbot])

demo.queue().launch(share=True)