justinj92 committed on
Commit
9ba0e23
1 Parent(s): bbabd2a

Update app.py

Files changed (1)
  1. app.py +67 -368
app.py CHANGED
@@ -1,378 +1,77 @@
- # import gradio as gr
- # import torch
- # from transformers import (
- #     AutoModelForCausalLM,
- #     AutoTokenizer,
- #     TextIteratorStreamer,
- #     pipeline
- # )
- # import os
- # from threading import Thread
- # import spaces
- # import time
-
- # import langchain
- # import os
- # import glob
- # import gc
-
- # # loaders
- # from langchain.document_loaders import PyPDFLoader, DirectoryLoader
-
- # # splits
- # from langchain.text_splitter import RecursiveCharacterTextSplitter
-
- # # prompts
- # from langchain import PromptTemplate
-
- # # vector stores
- # from langchain_community.vectorstores import FAISS
-
- # # models
- # from langchain.llms import HuggingFacePipeline
- # from langchain.embeddings import HuggingFaceInstructEmbeddings
-
- # # retrievers
- # from langchain.chains import RetrievalQA
-
-
- # import subprocess
-
- # subprocess.run(
- #     "pip install flash-attn --no-build-isolation",
- #     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
- #     shell=True,
- # )
-
-
- # class CFG:
- #     DEBUG = False
-
- #     ### LLM
- #     model_name = 'justinj92/phi3-orpo'
- #     temperature = 0.7
- #     top_p = 0.90
- #     repetition_penalty = 1.15
- #     max_len = 8192
- #     max_new_tokens = 512
-
- #     ### splitting
- #     split_chunk_size = 800
- #     split_overlap = 400
-
- #     ### embeddings
- #     embeddings_model_repo = 'BAAI/bge-base-en-v1.5'
-
- #     ### similar passages
- #     k = 6
-
- #     ### paths
- #     PDFs_path = './data'
- #     Embeddings_path = './embeddings/input'
- #     Output_folder = './ml-papers-vector'
-
- # loader = DirectoryLoader(CFG.PDFs_path, glob="*.pdf", loader_cls=PyPDFLoader)
-
- # documents = loader.load()
-
-
- # text_splitter = RecursiveCharacterTextSplitter(chunk_size = CFG.split_chunk_size, chunk_overlap = CFG.split_overlap)
- # texts = text_splitter.split_documents(documents)
-
- # if not os.path.exists(CFG.Embeddings_path + '/index.faiss'):
- #     embeddings = HuggingFaceInstructEmbeddings(model_name = CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
- #     vectordb = FAISS.from_documents(documents=texts, embedding=embeddings)
- #     vectordb.save_local(f"{CFG.Output_folder}/faiss_index_ml_papers")
-
- # embeddings = HuggingFaceInstructEmbeddings(model_name = CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
- # vectordb = FAISS.load_local(CFG.Output_folder + '/faiss_index_ml_papers', embeddings, allow_dangerous_deserialization=True)
-
-
- # def build_model(model_repo = CFG.model_name):
- #     tokenizer = AutoTokenizer.from_pretrained(model_repo)
- #     model = AutoModelForCausalLM.from_pretrained(model_repo, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
- #     if torch.cuda.is_available():
- #         device = torch.device("cuda")
- #         print(f"Using GPU: {torch.cuda.get_device_name(device)}")
- #     else:
- #         device = torch.device("cpu")
- #         print("Using CPU")
- #     device = torch.device("cuda")
- #     model = model.to(device)
-
- #     return tokenizer, model
-
-
- # tok, model = build_model(model_repo = CFG.model_name)
-
- # terminators = [
- #     tok.eos_token_id,
- #     32007,
- #     32011,
- #     32001,
- #     32000
- # ]
-
-
-
-
- # pipe = pipeline(task="text-generation", model=model, tokenizer=tok, eos_token_id=terminators, do_sample=True, max_new_tokens=CFG.max_new_tokens, temperature=CFG.temperature, top_p=CFG.top_p, repetition_penalty=CFG.repetition_penalty)
-
- # llm = HuggingFacePipeline(pipeline = pipe)
-
- # prompt_template = """
- # <|system|>
-
- # You are an expert assistant that answers questions about machine learning and Large Language Models (LLMs).
-
- # You are given some extracted parts from machine learning papers along with a question.
-
- # If you don't know the answer, just say "I don't know." Don't try to make up an answer.
-
- # It is very important that you ALWAYS answer the question in the same language the question is in. Remember to always do that.
-
- # Use only the following pieces of context to answer the question at the end.
-
- # <|end|>
-
- # <|user|>
-
- # Context: {context}
-
- # Question is below. Remember to answer in the same language:
-
- # Question: {question}
-
- # <|end|>
-
- # <|assistant|>
-
- # """
-
-
- # PROMPT = PromptTemplate(
- #     template = prompt_template,
- #     input_variables = ["context", "question"]
- # )
-
- # retriever = vectordb.as_retriever(
- #     search_type = "similarity",
- #     search_kwargs = {"k": CFG.k}
- # )
-
- # qa_chain = RetrievalQA.from_chain_type(
- #     llm = llm,
- #     chain_type = "stuff", # map_reduce, map_rerank, stuff, refine
- #     retriever = retriever,
- #     chain_type_kwargs = {"prompt": PROMPT},
- #     return_source_documents = True,
- #     verbose = False
- # )
-
-
- # def wrap_text_preserve_newlines(text, width=1500):
- #     # Split the input text into lines based on newline characters
- #     lines = text.split('\n')
-
- #     # Wrap each line individually
- #     wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
-
- #     # Join the wrapped lines back together using newline characters
- #     wrapped_text = '\n'.join(wrapped_lines)
-
- #     return wrapped_text
-
-
- # def process_llm_response(llm_response):
- #     ans = wrap_text_preserve_newlines(llm_response['result'])
-
- #     sources_used = ' \n'.join(
- #         [
- #             source.metadata['source'].split('/')[-1][:-4]
- #             + ' - page: '
- #             + str(source.metadata['page'])
- #             for source in llm_response['source_documents']
- #         ]
- #     )
-
- #     ans = ans + '\n\nSources: \n' + sources_used
-
- #     ### return only the text after the pattern
- #     pattern = "<|assistant|>"
- #     index = ans.find(pattern)
- #     if index != -1:
- #         ans = ans[index + len(pattern):]
-
- #     return ans.strip()
-
- # @spaces.GPU
- # def llm_ans(message, history):
-
- #     llm_response = qa_chain.invoke(message)
- #     ans = process_llm_response(llm_response)
-
- #     return ans
-
-
- # # @spaces.GPU(duration=60)
- # # def chat(message, history, temperature, do_sample, max_tokens):
- # #     chat = [{"role": "system", "content": "You are ORPO Tuned Phi Beast. Answer all questions in the most helpful way. No yapping."}]
- # #     for item in history:
- # #         chat.append({"role": "user", "content": item[0]})
- # #         if item[1] is not None:
- # #             chat.append({"role": "assistant", "content": item[1]})
- # #     chat.append({"role": "user", "content": message})
- # #     messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
- # #     model_inputs = tok([messages], return_tensors="pt").to(device)
- # #     streamer = TextIteratorStreamer(
- # #         tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
- # #     )
- # #     generate_kwargs = dict(
- # #         model_inputs,
- # #         streamer=streamer,
- # #         max_new_tokens=max_tokens,
- # #         do_sample=True,
- # #         temperature=temperature,
- # #         eos_token_id=terminators,
- # #     )
-
- # #     if temperature == 0:
- # #         generate_kwargs["do_sample"] = False
-
- # #     t = Thread(target=model.generate, kwargs=generate_kwargs)
- # #     t.start()
-
- # #     partial_text = ""
- # #     for new_text in streamer:
- # #         partial_text += new_text
- # #         yield partial_text
-
- # #     yield partial_text
-
-
- # demo = gr.ChatInterface(
- #     fn=llm_ans,
- #     examples=[["Write me a poem about Machine Learning."]],
- #     # multimodal=False,
- #     stop_btn="Stop Generation",
- #     title="Chat With LLMs",
- #     description="Now Running Phi3-ORPO",
- # )
- # demo.launch()
-
-
- import gradio as gr
  import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
- import os
  import spaces
- from threading import Thread
-
- import langchain
- from langchain.document_loaders import DirectoryLoader, PyPDFLoader
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain import PromptTemplate
- from langchain_community.vectorstores import FAISS
- from langchain.llms import HuggingFacePipeline
- from langchain.embeddings import HuggingFaceInstructEmbeddings
- from langchain.chains import RetrievalQA
- import subprocess
- import textwrap
-
- # Installation command for specific libraries
- subprocess.run("pip install flash-attn --no-build-isolation", env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, shell=True)
-
- class CFG:
-     DEBUG = False
-     model_name = 'justinj92/phi3-orpo'
-     temperature = 0.7
-     top_p = 0.90
-     repetition_penalty = 1.15
-     max_len = 8192
-     max_new_tokens = 512
-     split_chunk_size = 800
-     split_overlap = 400
-     embeddings_model_repo = 'BAAI/bge-base-en-v1.5'
-     k = 6
-     PDFs_path = './data'
-     Embeddings_path = './embeddings/input'
-     Output_folder = './ml-papers-vector'
-
- loader = DirectoryLoader(CFG.PDFs_path, glob="*.pdf", loader_cls=PyPDFLoader)
- documents = loader.load()
-
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.split_chunk_size, chunk_overlap=CFG.split_overlap)
- texts = text_splitter.split_documents(documents)
-
- if not os.path.exists(f"{CFG.Embeddings_path}/index.faiss"):
-     embeddings = HuggingFaceInstructEmbeddings(model_name=CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
-     vectordb = FAISS.from_documents(documents=texts, embedding=embeddings)
-     vectordb.save_local(f"{CFG.Output_folder}/faiss_index_ml_papers")
-
- embeddings = HuggingFaceInstructEmbeddings(model_name=CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
- vectordb = FAISS.load_local(f"{CFG.Output_folder}/faiss_index_ml_papers", embeddings, allow_dangerous_deserialization=True)
-
-
- def build_model(model_repo=CFG.model_name):
-     tokenizer = AutoTokenizer.from_pretrained(model_repo)
-     model = AutoModelForCausalLM.from_pretrained(model_repo, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
-     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-     model = model.to(device)
-     return tokenizer, model
-
- tok, model = build_model()
-
- terminators = [tok.eos_token_id, 32007, 32011, 32001, 32000]
-
- pipe = pipeline(task="text-generation", model=model, tokenizer=tok, eos_token_id=terminators, do_sample=True, max_new_tokens=CFG.max_new_tokens, temperature=CFG.temperature, top_p=CFG.top_p, repetition_penalty=CFG.repetition_penalty)
- llm = HuggingFacePipeline(pipeline=pipe)
-
- prompt_template = """
- You are an expert assistant that answers questions about machine learning and Large Language Models (LLMs).
- You are given some extracted parts from machine learning papers along with a question.
- If you don't know the answer, just say "I don't know." Don't try to make up an answer.
- It is very important that you ALWAYS answer the question in the same language the question is in. Remember to always do that.
- Use only the following pieces of context to answer the question at the end.
- Context: {context}
- Question is below. Remember to answer in the same language:
- Question: {question}
- """
-
- PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
-
- retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": CFG.k})
-
-
- def process_llm_response(llm_response):
-     ans = textwrap.fill(llm_response['result'], width=1500)
-     sources_used = ' \n'.join([f"{source.metadata['source'].split('/')[-1][:-4]} - page: {str(source.metadata['page'])}" for source in llm_response['source_documents']])
-
-     return f"{ans}\n\nSources:\n{sources_used}"





- tok, model = build_model()
-
- @spaces.GPU(duration=60)
- def llm_ans(message, history):
-     terminators = [tok.eos_token_id, 32007, 32011, 32001, 32000]
-     pipe = pipeline(task="text-generation", model=model, tokenizer=tok, eos_token_id=terminators, do_sample=True, max_new_tokens=CFG.max_new_tokens, temperature=CFG.temperature, top_p=CFG.top_p, repetition_penalty=CFG.repetition_penalty)
-     llm = HuggingFacePipeline(pipeline=pipe)
-     qa_chain = RetrievalQA(llm=llm, retriever=retriever, prompt_template=PROMPT, return_source_documents=True, verbose=False)
-
-
-     llm_response = qa_chain.invoke(message)
-     return process_llm_response(llm_response)


- demo = gr.ChatInterface(
-     fn=llm_ans,
-     examples=[["Write me a poem about Machine Learning."]],
-     # multimodal=False,
-     stop_btn="Stop Generation",
-     title="Chat With LLMs",
-     description="Now Running Phi3-ORPO",
- )
- demo.launch()

+ from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, SummaryIndex
+ from llama_index.llms.huggingface import HuggingFaceLLM
+ from llama_index.core import Settings
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
  import torch
  import spaces


+ documents = SimpleDirectoryReader("./data").load_data()
+ # vector_index = VectorStoreIndex.from_documents(documents)
+ summary_index = SummaryIndex.from_documents(documents)
+
+ def messages_to_prompt(messages):
+     prompt = ""
+     system_found = False
+     for message in messages:
+         if message.role == "system":
+             prompt += f"<|system|>\n{message.content}<|end|>\n"
+             system_found = True
+         elif message.role == "user":
+             prompt += f"<|user|>\n{message.content}<|end|>\n"
+         elif message.role == "assistant":
+             prompt += f"<|assistant|>\n{message.content}<|end|>\n"
+         else:
+             prompt += f"<|user|>\n{message.content}<|end|>\n"
+
+     # trailing prompt
+     prompt += "<|assistant|>\n"
+
+     if not system_found:
+         prompt = (
+             "<|system|>\nYou are a helpful AI research assistant built by Justin. You only answer from the context provided.<|end|>\n" + prompt
+         )
+
+     return prompt
+
+ llm = HuggingFaceLLM(
+     model_name="justinj92/phi3-orpo",
+     model_kwargs={
+         "trust_remote_code": True,
+         "torch_dtype": torch.bfloat16
+     },
+     generate_kwargs={"do_sample": True, "temperature": 0.7},
+     tokenizer_name="justinj92/phi3-orpo",
+     query_wrapper_prompt=(
+         "<|system|>\n"
+         "You are a helpful AI research assistant built by Justin. You only answer from the context provided.<|end|>\n"
+         "<|user|>\n"
+         "{query_str}<|end|>\n"
+         "<|assistant|>\n"
+     ),
+     messages_to_prompt=messages_to_prompt,
+     is_chat_model=True,
+ )

+ Settings.llm = llm
+ Settings.embed_model = HuggingFaceEmbedding(
+     model_name="BAAI/bge-small-en-v1.5"
+ )

+ service_context = ServiceContext.from_defaults(
+     chunk_size=1024,
+     llm=llm,
+     embed_model=Settings.embed_model
+ )

+ index = VectorStoreIndex.from_documents(documents, service_context=service_context)

+ query_engine = index.as_query_engine()

+ @spaces.GPU
+ def predict(input, history):
+     response = query_engine.query(input)
+     return str(response)

+ import gradio as gr
+ gr.ChatInterface(predict).launch(share=True)
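
For readers unfamiliar with the Phi-3 chat format, the query_wrapper_prompt above wraps every query in <|system|> / <|user|> / <|assistant|> turns before it reaches the model. Below is a minimal, standalone sketch of how that template renders; the wrapper string is taken verbatim from the diff, while the example question is made up for illustration.

wrapper = (
    "<|system|>\n"
    "You are a helpful AI research assistant built by Justin. You only answer from the context provided.<|end|>\n"
    "<|user|>\n"
    "{query_str}<|end|>\n"
    "<|assistant|>\n"
)

# The {query_str} placeholder is filled with the user's question; printing the
# result shows the prompt layout the Phi-3 model is expected to receive.
print(wrapper.format(query_str="What does ORPO change about preference tuning?"))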
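
A retrieval-only smoke test can exercise the indexing half of the new pipeline without downloading the Phi-3 weights. The sketch below is an illustration under stated assumptions, not part of the commit: it presumes ./data contains readable documents, that llama-index-core and llama-index-embeddings-huggingface are installed, and it reuses the same BAAI/bge-small-en-v1.5 embedding as app.py.

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Same embedding model as the Space; no LLM is configured because plain
# retrieval never calls one (LlamaIndex substitutes a MockLLM when llm is None).
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.llm = None

# Build an in-memory vector index over the same ./data folder the Space uses.
documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(documents)

# Fetch the top-3 chunks for a sample (hypothetical) question and print each
# similarity score alongside a snippet of the retrieved text.
retriever = index.as_retriever(similarity_top_k=3)
for hit in retriever.retrieve("What is ORPO fine-tuning?"):
    print(hit.score, hit.node.get_content()[:120])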