Spaces:

georgesung
/

llama2_7b_uncensored_chat

Paused

App Files Files Community

georgesung committed on Jul 20, 2023

Commit

8cf6e52

•

1 Parent(s): 7a57d84

Not using vllm

Browse files

Files changed (1) hide show

app.py +71 -92

app.py CHANGED Viewed

@@ -1,99 +1,78 @@
-import re
-import gradio as gr
 import torch
-from transformers import (AutoConfig, AutoModel, AutoModelForSeq2SeqLM,
-                          AutoTokenizer, LlamaForCausalLM, LlamaTokenizer)
-from vllm import LLM, SamplingParams
-model_id = "georgesung/llama2_7b_chat_uncensored"
-prompt_config = {
-    "system_header": None,
-    "system_footer": None,
-    "user_header": "### HUMAN:",
-    "user_footer": None,
-    "input_header": None,
-    "response_header": "### RESPONSE:",
-}
-def get_llm_response_chat(prompt):
-    outputs = llm.generate(prompt, sampling_params)
-    output = outputs[0].outputs[0].text
-    # Remove trailing eos token
-    eos_token = llm.get_tokenizer().eos_token
-    if output.endswith(eos_token):
-        output = output[:-len(eos_token)]
-    return output
-def hist_to_prompt(history):
-    prompt = ""
-    if prompt_config["system_header"]:
-        system_footer = ""
-        if prompt_config["system_footer"]:
-            system_footer = prompt_config["system_footer"]
-        prompt += f"{prompt_config['system_header']}\n{SYSTEM_MESSAGE}{system_footer}\n\n"
-    for i, (human_text, bot_text) in enumerate(history):
-        user_footer = ""
-        if prompt_config["user_footer"]:
-            user_footer = prompt_config["user_footer"]
-        prompt += f"{prompt_config['user_header']}\n{human_text}{user_footer}\n\n"
-        prompt += f"{prompt_config['response_header']}\n"
-        if bot_text:
-            prompt += f"{bot_text}\n\n"
-    return prompt
-def get_bot_response(text):
-    bot_text_index = text.rfind(prompt_config['response_header'])
-    if bot_text_index != -1:
-        text = text[bot_text_index + len(prompt_config['response_header']):].strip()
-    return text
-def main():
-    # RE llama tokenizer:
-    # RuntimeError: Failed to load the tokenizer.
-    # If you are using a LLaMA-based model, use 'hf-internal-testing/llama-tokenizer' instead of the original tokenizer.
-    llm = LLM(model=model_id, tokenizer='hf-internal-testing/llama-tokenizer')
-    sampling_params = SamplingParams(temperature=0.01, top_p=0.1, top_k=40, max_tokens=2048)
-    tokenizer = llm.get_tokenizer()
-    with gr.Blocks() as demo:
-        gr.Markdown(
-        """
-        # Let's chat
-        """)
-        chatbot = gr.Chatbot()
-        msg = gr.Textbox()
-        clear = gr.Button("Clear")
-        def user(user_message, history):
-            return "", history + [[user_message, None]]
-        def bot(history):
-            hist_text = hist_to_prompt(history)
-            bot_message = get_llm_response_chat(hist_text) #+ tokenizer.eos_token
-            history[-1][1] = bot_message  # add bot message to overall history
-            return history
-        msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
-            bot, chatbot, chatbot
-        )
-        clear.click(lambda: None, None, chatbot, queue=False)
-    demo.queue()
-    demo.launch()
-if __name__ == "__main__":
-    main()

+from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline
 import torch
+import gradio as gr
+# LLM helper functions
+def get_response_text(data):
+    text = data[0]["generated_text"]
+    assistant_text_index = text.rfind('### RESPONSE:')
+    if assistant_text_index != -1:
+        text = text[assistant_text_index+len('### RESPONSE:'):].strip()
+    return text
+def get_llm_response(prompt, pipe):
+    raw_output = pipe(prompt)
+    text = get_response_text(raw_output)
+    return text
+# Load LLM
+model_id = "georgesung/llama2_7b_chat_uncensored"
+tokenizer = LlamaTokenizer.from_pretrained(model_id)
+model = LlamaForCausalLM.from_pretrained(model_id, device_map="auto", load_in_8bit=True)
+# Llama tokenizer missing pad token
+tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+pipe = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    max_length=4096,  # Llama-2 default context window
+    temperature=0.7,
+    top_p=0.95,
+    repetition_penalty=1.15
+)
+with gr.Blocks() as demo:
+    chatbot = gr.Chatbot()
+    msg = gr.Textbox()
+    clear = gr.Button("Clear")
+    def hist_to_prompt(history):
+        prompt = ""
+        for human_text, bot_text in history:
+            prompt += f"### HUMAN:\n{human_text}\n\n### RESPONSE:\n"
+            if bot_text:
+                prompt += f"{bot_text}\n\n"
+        return prompt
+    def get_bot_response(text):
+        bot_text_index = text.rfind('### RESPONSE:')
+        if bot_text_index != -1:
+            text = text[bot_text_index + len('### RESPONSE:'):].strip()
+        return text
+    def user(user_message, history):
+        return "", history + [[user_message, None]]
+    def bot(history):
+        #bot_message = random.choice(["How are you?", "I love you", "I'm very hungry"])
+        #history[-1][1] = bot_message + '</s>'
+        hist_text = hist_to_prompt(history)
+        print(hist_text)
+        bot_message = get_llm_response(hist_text, pipe) + tokenizer.eos_token
+        history[-1][1] = bot_message  # add bot message to overall history
+        return history
+    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+        bot, chatbot, chatbot
+    )
+    clear.click(lambda: None, None, chatbot, queue=False)
+demo.queue()
+demo.launch()