Added MPT
app.py CHANGED
@@ -7,19 +7,20 @@ import numpy as np
 from typing import Iterator
 
 import gradio as gr
+from gradio_client import Client
 import pandas as pd
 import torch
 
-from easyllm.clients import huggingface
 from transformers import AutoTokenizer
+from awq import AutoAWQForCausalLM
+from transformers import AutoTokenizer
+

-huggingface.prompt_builder = "llama2"
-huggingface.api_key = os.environ["HUGGINGFACE_TOKEN"]
 MAX_MAX_NEW_TOKENS = 250
 DEFAULT_MAX_NEW_TOKENS = 250
 MAX_INPUT_TOKEN_LENGTH = 4000
 EMBED_DIM = 1024
-K =
+K = 2
 EF = 100
 TEXT_FILE = 'data.txt'
 SEARCH_INDEX = "search_index.bin"
@@ -28,20 +29,24 @@ DOCUMENT_DATASET = "chunked_data.parquet"
 COSINE_THRESHOLD = 0.7


-
-
 torch_device = "cuda" if torch.cuda.is_available() else "cpu"
 print("Running on device:", torch_device)
 print("CPU threads:", torch.get_num_threads())

-
-
-
+biencoder = SentenceTransformer("intfloat/e5-large-v2", device="cpu")
+cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2", max_length=512, device="cpu")
+model_name_or_path = "TheBloke/TinyLlama-1.1B-1T-OpenOrca-AWQ"

-
+# Load model
+# model = AutoAWQForCausalLM.from_quantized(model_name_or_path, fuse_layers=True,
+#                                           trust_remote_code=False, safetensors=True)
+# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=False)
+tokenizer = AutoTokenizer.from_pretrained("mosaicml/mpt-30b-chat", trust_remote_code=False)
+chat_client = Client("https://mosaicml-mpt-30b-chat.hf.space/", serialize = False)
+chat_bot = [["", None]]

 def read_text_from_file(file_path):
-    with open(file_path, "r") as text_file:
+    with open(file_path, "r", encoding="utf-8") as text_file:
         text = text_file.read()
     texts = text.split("&&")
     return [t.strip() for t in texts]
@@ -89,37 +94,56 @@ def get_input_token_length(message: str, chat_history: list[tuple[str, str]], sy
     input_ids = tokenizer([prompt], return_tensors="np", add_special_tokens=False)["input_ids"]
     return input_ids.shape[-1]

+def prompt_builder(prompt, system_message="You are a helpful chatbot which gives correct and truthful answers"):
+    return f'''<|im_start|>system
+{system_message}<|im_end|>
+<|im_start|>user
+{prompt}<|im_end|>
+<|im_start|>assistant
+
+
+'''

 # https://www.philschmid.de/llama-2#how-to-prompt-llama-2-chat
 def get_completion(
     prompt,
     system_prompt=None,
-    model=
-    max_new_tokens=
+    # model=model,
+    max_new_tokens=250,
     temperature=0.2,
     top_p=0.95,
     top_k=50,
     stream=False,
     debug=False,
 ):
+    global chat_bot
     if temperature < 1e-2:
         temperature = 1e-2
-
-
-
-
-    response = huggingface.ChatCompletion.create(
-        model=model,
-        messages=messages,
-        temperature=temperature,  # this is the degree of randomness of the model's output
-        max_tokens=250,  # this is the number of new tokens being generated
-        top_p=top_p,
-        top_k=top_k,
-        stream=stream,
-        debug=debug,
+    answer=chat_client.predict(
+        prompt,  # str in 'Type an input and press Enter' Textbox component
+        chat_bot,
+        fn_index=1
     )
-
-
+    chat_bot = answer[1]
+    yield answer[1][0][1]
+    # prompt = prompt_builder(prompt)
+    # tokens = tokenizer(
+    #     prompt,
+    #     return_tensors='pt'
+    # ).input_ids.cuda()
+
+    # # Generate output
+    # for i in range(max_new_tokens):
+    #     generation_output = model.generate(
+    #         tokens,
+    #         do_sample=True,
+    #         temperature=temperature,
+    #         top_p=top_p,
+    #         top_k=top_k,
+    #         max_new_tokens=1
+    #     )
+    #     tokens = generation_output
+    #     yield tokenizer.decode(generation_output[0][-1])

 # load the index for the data
 def load_hnsw_index(index_file):
@@ -177,17 +201,18 @@ def generate_condensed_query(query, history):
         chat_history += f"Assistant: {turn[1]}\n"

     condense_question_prompt = create_condense_question_prompt(query, chat_history)
-    condensed_question = json.loads(get_completion(condense_question_prompt, max_new_tokens=64, temperature=0))
-
+    # condensed_question = json.loads(get_completion(condense_question_prompt, max_new_tokens=64, temperature=0))
+    condensed_question = "".join([token for token in get_completion(condense_question_prompt, max_new_tokens=64, temperature=0)])
+    return condensed_question


 DEFAULT_SYSTEM_PROMPT = """\
 You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
 If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\
 """
-MAX_MAX_NEW_TOKENS = 2048
-DEFAULT_MAX_NEW_TOKENS = 1024
-MAX_INPUT_TOKEN_LENGTH = 4000
+# MAX_MAX_NEW_TOKENS = 2048
+# DEFAULT_MAX_NEW_TOKENS = 1024
+# MAX_INPUT_TOKEN_LENGTH = 4000

 DESCRIPTION = """
 # AVA Southampton Chatbot 🤗
@@ -265,7 +290,8 @@ def generate(

     output = ""
     for idx, response in enumerate(generator):
-        token = response["choices"][0]["delta"].get("content", "") or ""
+        # token = response["choices"][0]["delta"].get("content", "") or ""
+        token = response
         output += token
         if idx == 0:
             history.append((message, output))
@@ -273,7 +299,7 @@
         history[-1] = (message, output)

     history = [
-        (wrap_html_code(history[i][0]
+        (wrap_html_code(history[i][0]), wrap_html_code(history[i][1]))
         for i in range(0, len(history))
     ]
     yield history
@@ -483,4 +509,4 @@ with gr.Blocks(css="style.css") as demo:
         api_name=False,
     )

-demo.queue(max_size=20).launch(debug=True
+demo.queue(max_size=20).launch(debug=True)