justinj92 committed on
Commit
9ba0e23
1 Parent(s): bbabd2a

Update app.py

Files changed (1)
  1. app.py +67 -368
app.py CHANGED
@@ -1,378 +1,77 @@
- # import gradio as gr
- # import torch
- # from transformers import (
- #     AutoModelForCausalLM,
- #     AutoTokenizer,
- #     TextIteratorStreamer,
- #     pipeline
- # )
- # import os
- # from threading import Thread
- # import spaces
- # import time
-
- # import langchain
- # import os
- # import glob
- # import gc
-
- # # loaders
- # from langchain.document_loaders import PyPDFLoader, DirectoryLoader
-
- # # splits
- # from langchain.text_splitter import RecursiveCharacterTextSplitter
-
- # # prompts
- # from langchain import PromptTemplate
-
- # # vector stores
- # from langchain_community.vectorstores import FAISS
-
- # # models
- # from langchain.llms import HuggingFacePipeline
- # from langchain.embeddings import HuggingFaceInstructEmbeddings
-
- # # retrievers
- # from langchain.chains import RetrievalQA
-
-
- # import subprocess
-
- # subprocess.run(
- #     "pip install flash-attn --no-build-isolation",
- #     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
- #     shell=True,
- # )
-
-
- # class CFG:
- #     DEBUG = False
-
- #     ### LLM
- #     model_name = 'justinj92/phi3-orpo'
- #     temperature = 0.7
- #     top_p = 0.90
- #     repetition_penalty = 1.15
- #     max_len = 8192
- #     max_new_tokens = 512
-
- #     ### splitting
- #     split_chunk_size = 800
- #     split_overlap = 400
-
- #     ### embeddings
- #     embeddings_model_repo = 'BAAI/bge-base-en-v1.5'
-
- #     ### similar passages
- #     k = 6
-
- #     ### paths
- #     PDFs_path = './data'
- #     Embeddings_path = './embeddings/input'
- #     Output_folder = './ml-papers-vector'
-
- # loader = DirectoryLoader(CFG.PDFs_path, glob="*.pdf", loader_cls=PyPDFLoader)
-
- # documents = loader.load()
-
-
- # text_splitter = RecursiveCharacterTextSplitter(chunk_size = CFG.split_chunk_size, chunk_overlap = CFG.split_overlap)
- # texts = text_splitter.split_documents(documents)
-
- # if not os.path.exists(CFG.Embeddings_path + '/index.faiss'):
- #     embeddings = HuggingFaceInstructEmbeddings(model_name = CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
- #     vectordb = FAISS.from_documents(documents=texts, embedding=embeddings)
- #     vectordb.save_local(f"{CFG.Output_folder}/faiss_index_ml_papers")
-
- # embeddings = HuggingFaceInstructEmbeddings(model_name = CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
- # vectordb = FAISS.load_local(CFG.Output_folder + '/faiss_index_ml_papers', embeddings, allow_dangerous_deserialization=True)
-
-
- # def build_model(model_repo = CFG.model_name):
- #     tokenizer = AutoTokenizer.from_pretrained(model_repo)
- #     model = AutoModelForCausalLM.from_pretrained(model_repo, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
- #     if torch.cuda.is_available():
- #         device = torch.device("cuda")
- #         print(f"Using GPU: {torch.cuda.get_device_name(device)}")
- #     else:
- #         device = torch.device("cpu")
- #         print("Using CPU")
- #     device = torch.device("cuda")
- #     model = model.to(device)
-
- #     return tokenizer, model
-
-
- # tok, model = build_model(model_repo = CFG.model_name)
-
- # terminators = [
- #     tok.eos_token_id,
- #     32007,
- #     32011,
- #     32001,
- #     32000
- # ]
-
-
-
-
- # pipe = pipeline(task="text-generation", model=model, tokenizer=tok, eos_token_id=terminators, do_sample=True, max_new_tokens=CFG.max_new_tokens, temperature=CFG.temperature, top_p=CFG.top_p, repetition_penalty=CFG.repetition_penalty)
-
- # llm = HuggingFacePipeline(pipeline = pipe)
-
- # prompt_template = """
- # <|system|>
-
- # You are an expert assistant that answers questions about machine learning and Large Language Models (LLMs).
-
- # You are given some extracted parts from machine learning papers along with a question.
-
- # If you don't know the answer, just say "I don't know." Don't try to make up an answer.
-
- # It is very important that you ALWAYS answer the question in the same language the question is in. Remember to always do that.
-
- # Use only the following pieces of context to answer the question at the end.
-
- # <|end|>
-
- # <|user|>
-
- # Context: {context}
-
- # Question is below. Remember to answer in the same language:
-
- # Question: {question}
-
- # <|end|>
-
- # <|assistant|>
-
- # """
-
-
- # PROMPT = PromptTemplate(
- #     template = prompt_template,
- #     input_variables = ["context", "question"]
- # )
-
- # retriever = vectordb.as_retriever(
- #     search_type = "similarity",
- #     search_kwargs = {"k": CFG.k}
- # )
-
- # qa_chain = RetrievalQA.from_chain_type(
- #     llm = llm,
- #     chain_type = "stuff", # map_reduce, map_rerank, stuff, refine
- #     retriever = retriever,
- #     chain_type_kwargs = {"prompt": PROMPT},
- #     return_source_documents = True,
- #     verbose = False
- # )
-
-
- # def wrap_text_preserve_newlines(text, width=1500):
- #     # Split the input text into lines based on newline characters
- #     lines = text.split('\n')
-
- #     # Wrap each line individually
- #     wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
-
- #     # Join the wrapped lines back together using newline characters
- #     wrapped_text = '\n'.join(wrapped_lines)
-
- #     return wrapped_text
-
-
- # def process_llm_response(llm_response):
- #     ans = wrap_text_preserve_newlines(llm_response['result'])
-
- #     sources_used = ' \n'.join(
- #         [
- #             source.metadata['source'].split('/')[-1][:-4]
- #             + ' - page: '
- #             + str(source.metadata['page'])
- #             for source in llm_response['source_documents']
- #         ]
- #     )
-
- #     ans = ans + '\n\nSources: \n' + sources_used
-
- #     ### return only the text after the pattern
- #     pattern = "<|assistant|>"
- #     index = ans.find(pattern)
- #     if index != -1:
- #         ans = ans[index + len(pattern):]
-
- #     return ans.strip()
-
- # @spaces.GPU
- # def llm_ans(message, history):
-
- #     llm_response = qa_chain.invoke(message)
- #     ans = process_llm_response(llm_response)
-
- #     return ans
-
-
- # # @spaces.GPU(duration=60)
- # # def chat(message, history, temperature, do_sample, max_tokens):
- # #     chat = [{"role": "system", "content": "You are ORPO Tuned Phi Beast. Answer all questions in the most helpful way. No yapping."}]
- # #     for item in history:
- # #         chat.append({"role": "user", "content": item[0]})
- # #         if item[1] is not None:
- # #             chat.append({"role": "assistant", "content": item[1]})
- # #     chat.append({"role": "user", "content": message})
- # #     messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
- # #     model_inputs = tok([messages], return_tensors="pt").to(device)
- # #     streamer = TextIteratorStreamer(
- # #         tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
- # #     )
- # #     generate_kwargs = dict(
- # #         model_inputs,
- # #         streamer=streamer,
- # #         max_new_tokens=max_tokens,
- # #         do_sample=True,
- # #         temperature=temperature,
- # #         eos_token_id=terminators,
- # #     )
-
- # #     if temperature == 0:
- # #         generate_kwargs["do_sample"] = False
-
- # #     t = Thread(target=model.generate, kwargs=generate_kwargs)
- # #     t.start()
-
- # #     partial_text = ""
- # #     for new_text in streamer:
- # #         partial_text += new_text
- # #         yield partial_text
-
- # #     yield partial_text
-
-
- # demo = gr.ChatInterface(
- #     fn=llm_ans,
- #     examples=[["Write me a poem about Machine Learning."]],
- #     # multimodal=False,
- #     stop_btn="Stop Generation",
- #     title="Chat With LLMs",
- #     description="Now Running Phi3-ORPO",
- # )
- # demo.launch()
-
-
- import gradio as gr
  import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
- import os
  import spaces
- from threading import Thread
-
- import langchain
- from langchain.document_loaders import DirectoryLoader, PyPDFLoader
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain import PromptTemplate
- from langchain_community.vectorstores import FAISS
- from langchain.llms import HuggingFacePipeline
- from langchain.embeddings import HuggingFaceInstructEmbeddings
- from langchain.chains import RetrievalQA
- import subprocess
- import textwrap
-
- # Installation command for specific libraries
- subprocess.run("pip install flash-attn --no-build-isolation", env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, shell=True)
-
- class CFG:
-     DEBUG = False
-     model_name = 'justinj92/phi3-orpo'
-     temperature = 0.7
-     top_p = 0.90
-     repetition_penalty = 1.15
-     max_len = 8192
-     max_new_tokens = 512
-     split_chunk_size = 800
-     split_overlap = 400
-     embeddings_model_repo = 'BAAI/bge-base-en-v1.5'
-     k = 6
-     PDFs_path = './data'
-     Embeddings_path = './embeddings/input'
-     Output_folder = './ml-papers-vector'
-
- loader = DirectoryLoader(CFG.PDFs_path, glob="*.pdf", loader_cls=PyPDFLoader)
- documents = loader.load()
-
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.split_chunk_size, chunk_overlap=CFG.split_overlap)
- texts = text_splitter.split_documents(documents)
-
- if not os.path.exists(f"{CFG.Embeddings_path}/index.faiss"):
-     embeddings = HuggingFaceInstructEmbeddings(model_name=CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
-     vectordb = FAISS.from_documents(documents=texts, embedding=embeddings)
-     vectordb.save_local(f"{CFG.Output_folder}/faiss_index_ml_papers")
-
- embeddings = HuggingFaceInstructEmbeddings(model_name=CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
- vectordb = FAISS.load_local(f"{CFG.Output_folder}/faiss_index_ml_papers", embeddings, allow_dangerous_deserialization=True)
-
-
- def build_model(model_repo=CFG.model_name):
-     tokenizer = AutoTokenizer.from_pretrained(model_repo)
-     model = AutoModelForCausalLM.from_pretrained(model_repo, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
-     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-     model = model.to(device)
-     return tokenizer, model
-
- tok, model = build_model()
-
- terminators = [tok.eos_token_id, 32007, 32011, 32001, 32000]
-
- pipe = pipeline(task="text-generation", model=model, tokenizer=tok, eos_token_id=terminators, do_sample=True, max_new_tokens=CFG.max_new_tokens, temperature=CFG.temperature, top_p=CFG.top_p, repetition_penalty=CFG.repetition_penalty)
- llm = HuggingFacePipeline(pipeline=pipe)
-
- prompt_template = """
- You are an expert assistant that answers questions about machine learning and Large Language Models (LLMs).
- You are given some extracted parts from machine learning papers along with a question.
- If you don't know the answer, just say "I don't know." Don't try to make up an answer.
- It is very important that you ALWAYS answer the question in the same language the question is in. Remember to always do that.
- Use only the following pieces of context to answer the question at the end.
- Context: {context}
- Question is below. Remember to answer in the same language:
- Question: {question}
- """
-
- PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
-
- retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": CFG.k})
-
-
- def process_llm_response(llm_response):
-     ans = textwrap.fill(llm_response['result'], width=1500)
-     sources_used = ' \n'.join([f"{source.metadata['source'].split('/')[-1][:-4]} - page: {str(source.metadata['page'])}" for source in llm_response['source_documents']])
-
-     return f"{ans}\n\nSources:\n{sources_used}"





- tok, model = build_model()
-
- @spaces.GPU(duration=60)
- def llm_ans(message, history):
-     terminators = [tok.eos_token_id, 32007, 32011, 32001, 32000]
-     pipe = pipeline(task="text-generation", model=model, tokenizer=tok, eos_token_id=terminators, do_sample=True, max_new_tokens=CFG.max_new_tokens, temperature=CFG.temperature, top_p=CFG.top_p, repetition_penalty=CFG.repetition_penalty)
-     llm = HuggingFacePipeline(pipeline=pipe)
-     qa_chain = RetrievalQA(llm=llm, retriever=retriever, prompt_template=PROMPT, return_source_documents=True, verbose=False)
-
-
-     llm_response = qa_chain.invoke(message)
-     return process_llm_response(llm_response)


- demo = gr.ChatInterface(
-     fn=llm_ans,
-     examples=[["Write me a poem about Machine Learning."]],
-     # multimodal=False,
-     stop_btn="Stop Generation",
-     title="Chat With LLMs",
-     description="Now Running Phi3-ORPO",
- )
- demo.launch()

+ from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, SummaryIndex
+ from llama_index.llms.huggingface import HuggingFaceLLM
+ from llama_index.core import Settings
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
  import torch
  import spaces


+ documents = SimpleDirectoryReader("./data").load_data()
+ # vector_index = VectorStoreIndex.from_documents(documents)
+ summary_index = SummaryIndex.from_documents(documents)
+
+ def messages_to_prompt(messages):
+     prompt = ""
+     system_found = False
+     for message in messages:
+         if message.role == "system":
+             prompt += f"<|system|>\n{message.content}<|end|>\n"
+             system_found = True
+         elif message.role == "user":
+             prompt += f"<|user|>\n{message.content}<|end|>\n"
+         elif message.role == "assistant":
+             prompt += f"<|assistant|>\n{message.content}<|end|>\n"
+         else:
+             prompt += f"<|user|>\n{message.content}<|end|>\n"
+
+     # trailing prompt
+     prompt += "<|assistant|>\n"
+
+     if not system_found:
+         prompt = (
+             "<|system|>\nYou are a helpful AI research assistant built by Justin. You only answer from the context provided.<|end|>\n" + prompt
+         )
+
+     return prompt
+
+ llm = HuggingFaceLLM(
+     model_name="justinj92/phi3-orpo",
+     model_kwargs={
+         "trust_remote_code": True,
+         "torch_dtype": torch.bfloat16
+     },
+     generate_kwargs={"do_sample": True, "temperature": 0.7},
+     tokenizer_name="justinj92/phi3-orpo",
+     query_wrapper_prompt=(
+         "<|system|>\n"
+         "You are a helpful AI research assistant built by Justin. You only answer from the context provided.<|end|>\n"
+         "<|user|>\n"
+         "{query_str}<|end|>\n"
+         "<|assistant|>\n"
+     ),
+     messages_to_prompt=messages_to_prompt,
+     is_chat_model=True,
+ )

+ Settings.llm = llm
+ Settings.embed_model = HuggingFaceEmbedding(
+     model_name="BAAI/bge-small-en-v1.5"
+ )

+ service_context = ServiceContext.from_defaults(
+     chunk_size=1024,
+     llm=llm,
+     embed_model=Settings.embed_model
+ )

+ index = VectorStoreIndex.from_documents(documents, service_context=service_context)

+ query_engine = index.as_query_engine()

+ @spaces.GPU
+ def predict(input, history):
+     response = query_engine.query(input)
+     return str(response)

+ import gradio as gr
+ gr.ChatInterface(predict).launch(share=True)
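
For readers unfamiliar with the Phi-3 chat format, the query_wrapper_prompt above wraps every query in <|system|> / <|user|> / <|assistant|> turns before it reaches the model. Below is a minimal, standalone sketch of how that template renders; the wrapper string is taken verbatim from the diff, while the example question is made up for illustration.

wrapper = (
    "<|system|>\n"
    "You are a helpful AI research assistant built by Justin. You only answer from the context provided.<|end|>\n"
    "<|user|>\n"
    "{query_str}<|end|>\n"
    "<|assistant|>\n"
)

# The {query_str} placeholder is filled with the user's question; printing the
# result shows the prompt layout the Phi-3 model is expected to receive.
print(wrapper.format(query_str="What does ORPO change about preference tuning?"))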
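
A retrieval-only smoke test can exercise the indexing half of the new pipeline without downloading the Phi-3 weights. The sketch below is an illustration under stated assumptions, not part of the commit: it presumes ./data contains readable documents, that llama-index-core and llama-index-embeddings-huggingface are installed, and it reuses the same BAAI/bge-small-en-v1.5 embedding as app.py.

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Same embedding model as the Space; no LLM is configured because plain
# retrieval never calls one (LlamaIndex substitutes a MockLLM when llm is None).
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.llm = None

# Build an in-memory vector index over the same ./data folder the Space uses.
documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(documents)

# Fetch the top-3 chunks for a sample (hypothetical) question and print each
# similarity score alongside a snippet of the retrieved text.
retriever = index.as_retriever(similarity_top_k=3)
for hit in retriever.retrieve("What is ORPO fine-tuning?"):
    print(hit.score, hit.node.get_content()[:120])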