Annikaijak committed on
Commit
6d88f6f
1 Parent(s): 91190fb

Create app.py

Files changed (1)
  1. app.py +155 -0
app.py ADDED
@@ -0,0 +1,155 @@
+ # Cloning our GitHub repository so the PDF documents are available in the local file system
+ import os
+ os.system("git clone https://github.com/annikaijak/deeplearning_assignment_4")
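+ # Note: this assumes git is available in the runtime; on repeated runs, a guard
+ # such as os.path.exists("deeplearning_assignment_4") could skip the clone.
+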
+ from langchain.document_loaders import PyPDFDirectoryLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.vectorstores import Chroma
+ import torch
+ from langchain import HuggingFacePipeline
+ from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline
+ from langchain.chains import RetrievalQA
+ from langchain import PromptTemplate
+ from textwrap import fill
+ import gradio as gr
+ import time
+
+
+ # Loading the PDF files from the cloned repository
+ loader = PyPDFDirectoryLoader("/content/deeplearning_assignment_4/data/PDF_Documents")
+ docs = loader.load()
+
+ # Splitting the text into smaller chunks
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
+ texts = text_splitter.split_documents(docs)
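+ # With chunk_size=1024 and chunk_overlap=64, consecutive chunks share 64
+ # characters, so text cut at a chunk boundary still appears intact in at
+ # least one chunk and remains retrievable.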
+
+ # Creating embeddings
+ embeddings = HuggingFaceEmbeddings(
+     model_name="thenlper/gte-large",
+     model_kwargs={"device": "cuda"},
+     encode_kwargs={"normalize_embeddings": True},
+ )
+
+ query_result = embeddings.embed_query(texts[0].page_content)
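+ # (Sanity check: the embedding above is a plain list of floats; gte-large
+ # produces 1024-dimensional vectors, per its model card.)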
+
+ # Saving the embeddings in the Chroma database
+ db = Chroma.from_documents(texts, embeddings, persist_directory="db")
+ results = db.similarity_search("Transformer models", k=2)
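+ # Quick smoke test: `results` holds the two chunks most similar to the query;
+ # inspecting e.g. results[0].page_content confirms retrieval works. It is not
+ # used by the app below.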
+
+ # Loading the transformer model
+ MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
+
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
+
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_NAME, torch_dtype=torch.float16, trust_remote_code=True, device_map="auto"
+ )
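+ # torch.float16 halves memory use versus float32, and device_map="auto" lets
+ # accelerate place the weights on the available device(s); a GPU is assumed
+ # here, matching the "cuda" setting used for the embeddings above.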
+
+ # Create a text generation configuration based on the specified model name
+ generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
+
+ # Limit the generated output to at most 1024 new tokens
+ generation_config.max_new_tokens = 1024
+
+ # Temperature for sampling: lower values (e.g. 0.0001) make the output close to
+ # deterministic, while higher values make it more random
+ generation_config.temperature = 0.0001
+
+ # Top-p (nucleus) sampling: only the most likely words covering 95% of the
+ # probability mass are considered
+ generation_config.top_p = 0.95
+
+ # Enable sampling, so the model selects words according to their probabilities
+ generation_config.do_sample = True
+
+ # Repetition penalty: 1.15 discourages the model from repeating the same words
+ # or phrases too frequently
+ generation_config.repetition_penalty = 1.15
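+ # With temperature this close to 0, the distribution sharpens toward the most
+ # likely token, so sampling behaves almost like greedy decoding; do_sample=True
+ # is still required for top_p to take effect.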
+
+
+ # Create a text generation pipeline using the model, tokenizer and generation configuration
+ text_pipeline = pipeline(
+     "text-generation",
+     model=model,
+     tokenizer=tokenizer,
+     generation_config=generation_config,
+ )
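+ # Optional quick check: text_pipeline("Hello")[0]["generated_text"] returns the
+ # prompt followed by the model's continuation (the default transformers
+ # text-generation output format).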
+
+ # Wrap the text generation pipeline as a LangChain-compatible LLM
+ llm = HuggingFacePipeline(pipeline=text_pipeline, model_kwargs={"temperature": 0})
+
+ template_3 = """
+ <s>[INST] <<SYS>>
+ Act as a student counselor at Aalborg University Business School and answer the question at the end.
+ The answer should be about the master's programs found in the provided documents ONLY.
+ The answer should be MAXIMUM 40 words.
+ Use the examples in {context} to generate the answer, without directly mentioning any of it.
+
+ <</SYS>>
+
+ {context}
+
+ N-shot prompting:
+ N-1
+ Q: How do I find out what masters degree I want to study
+ A: To determine which master's degree you would like to study, you should consider which business-related modules are within your interest; which modules from the bachelor's degree did you find interesting?
+
+ N-2
+ Q: I liked the modules [input] in the bachelor, what masters could be relevant for me?
+ A: Based on your interests in [input], it may be beneficial to consider studying [output].
+ The curriculum for this program includes several modules that align with your
+ interests.
+
+ ReAct prompting:
+ Q: "how do i find out what masters degree i want to study"
+ A: "To determine which master's degree you would like to study, you should consider which business-related modules are within your interest;
+ which modules from the bachelor's degree did you find interesting?"
+ Q: "I liked macro economics and organisation"
+ A: "Based on your interests in macroeconomics and organizations, it may be
+ beneficial to consider studying the Master of Science (MSc) in Economics and
+ Business Administration (Finance) program at Aalborg University Business School.
+ The curriculum for this program includes several modules that align with your
+ interests, such as "Network Theory and Analysis" and "Data-Driven Business
+ Modeling and Strategy". These modules cover topics related to macroeconomics and
+ organizational behavior, providing you with valuable insights and skills that
+ could help you achieve your career goals. Additionally, the program offers an
+ application-focused approach, allowing you to apply your knowledge to real-world
+ problems and develop practical solutions."
+ Feedback: The advice should focus on unique modules in the 1st and 2nd semester of each master's program, as the 3rd-semester modules are elective options for all masters.
+
+ {question} [/INST]
+ """
+
+ prompt_3 = PromptTemplate(template=template_3, input_variables=["context", "question"])
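+ # At query time, the chain fills {context} with the retrieved chunks and
+ # {question} with the user's message before passing the prompt to the model.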
+
+
+ qa_chain_3 = RetrievalQA.from_chain_type(
+     llm=llm,
+     chain_type="stuff",
+     retriever=db.as_retriever(search_kwargs={"k": 2}),
+     return_source_documents=True,
+     chain_type_kwargs={"prompt": prompt_3},
+ )
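+ # chain_type="stuff" concatenates ("stuffs") the k=2 retrieved chunks into the
+ # {context} slot of prompt_3; return_source_documents=True makes the chain's
+ # output dict include those chunks under "source_documents" alongside "result".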
+
+ # Standalone helper for querying the chain directly; the Gradio app below
+ # defines its own reply_bot callback for the chat interface
+ def reply_bot(txt):
+     bot_result = qa_chain_3(txt)
+     return bot_result["result"].strip()
+
+ bot_name = "Master Supervisor"
+
+ with gr.Blocks() as demo:
+     gr.Markdown("### Master's Degree Program Advisor")
+     gr.Markdown("I can help you find the master's degree program that's right for you. Ask me any question related to choosing a master's program.")
+
+     chatbot = gr.Chatbot()
+     msg = gr.Textbox()
+     clear = gr.ClearButton([msg, chatbot])
+
+     def reply_bot(message, chat_history):
+         bot_result = qa_chain_3(message)
+         chat_history.append((message, bot_result["result"].strip()))
+         time.sleep(2)
+         return "", chat_history
+
+     msg.submit(reply_bot, [msg, chatbot], [msg, chatbot])
+
+ demo.queue().launch(share=True)
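+ # share=True creates a temporary public gradio.live link when run locally or in
+ # a notebook; on Hugging Face Spaces the app is already public and Gradio
+ # ignores this flag with a warning.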