Spaces:

flash88
/

glm4-9b-sft-uncensored-64k

Runtime error

File size: 3,466 Bytes

51a7d9e
 
 
 
22f5f54
51a7d9e
edb9e8a
51a7d9e
 
 
850186a
1ec2e60
 
51a7d9e
9f44fac
51a7d9e
9f44fac
51a7d9e
 
 
 
 
 
 
 
 
 
 
22f5f54
 
 
 
 
 
 
51a7d9e
f663115
51a7d9e
030c23d
fd6304d
 
51a7d9e
 
 
 
 
fd6304d
3b9cb87
22f5f54
803a940
639e063
edb9e8a
030c23d
 
f663115
 
51a7d9e
9b70592
51a7d9e
030c23d
0961bc7
f663115
030c23d
 
b4d1f01
 
 
 
8ea3132
51a7d9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b70592
51a7d9e
9b70592
030c23d
51a7d9e

import torch
from PIL import Image
import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import os
from threading import Thread


# Value of the HF_TOKEN environment variable, or None when it is unset.
# NOTE(review): presumably a Hugging Face access token; nothing in the visible
# code references it.
HF_TOKEN = os.environ.get("HF_TOKEN", None)

# Repository named in TITLE. Used when the MODELS environment variable is
# missing or empty, so start-up no longer fails on ``None.split("/")``.
DEFAULT_MODEL = "JosephusCheung/glm4-9b-sft-uncensored-64k"

# Model identifier handed to ``from_pretrained``; override via the MODELS
# environment variable.
MODELS = os.environ.get("MODELS") or DEFAULT_MODEL
# Last "/"-separated component of MODELS (the bare model name).
MODEL_NAME = MODELS.split("/")[-1]

# HTML heading rendered at the top of the demo page.
TITLE = "<h1><center>JosephusCheung/glm4-9b-sft-uncensored-64k</center></h1>"

# Short blurb rendered under the heading.
DESCRIPTION = "You can use full 64K ctx with this online demo."

# Custom CSS passed to gr.Blocks; styles elements with the
# "duplicate-button" class.
CSS = """
.duplicate-button {
  margin: auto !important;
  color: white !important;
  background: black !important;
  border-radius: 100vh !important;
}
"""

# Load the causal LM named by MODELS with bfloat16 weights.
# trust_remote_code=True is passed for both the model and the tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    MODELS,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
# Move the weights to device index 0 and switch the module to eval mode.
model = model.to(0).eval()

# Tokenizer loaded from the same identifier as the model.
tokenizer = AutoTokenizer.from_pretrained(MODELS, trust_remote_code=True)


@spaces.GPU
def stream_chat(message: str, history: list, temperature: float, max_length: int):
    """Stream the model's reply to ``message`` as a growing string.

    Args:
        message: The newest user message.
        history: Earlier turns as ``(user_text, assistant_text)`` pairs.
        temperature: Sampling temperature. Values <= 0 select greedy decoding
            instead of being forwarded, because ``generate`` rejects a
            non-positive temperature when sampling is enabled.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Yields:
        The reply text accumulated so far, once per chunk produced by the
        streamer.

    Raises:
        Exception: Any error raised by ``model.generate`` in the worker
            thread is re-raised here once the stream has been drained.
    """
    print(f'message is - {message}')
    print(f'history is - {history}')
    conversation = []
    for prompt, answer in history:
        conversation.extend([{"role": "user", "content": prompt}, {"role": "assistant", "content": answer}])
    conversation.append({"role": "user", "content": message})

    print(f"Conversation is -\n{conversation}")

    # With return_dict=True the template call returns a mapping of model
    # inputs, which is unpacked into the generate() keyword arguments below.
    model_inputs = tokenizer.apply_chat_template(
        conversation,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generate_kwargs = dict(
        max_length=int(max_length),  # UI sliders may deliver a float
        streamer=streamer,
        repetition_penalty=1,
    )
    if temperature > 0:
        generate_kwargs.update(do_sample=True, top_k=1, temperature=temperature)
    else:
        # The UI slider allows 0. top_k=1 already makes sampling pick the
        # single most likely token, so greedy decoding gives the same output
        # without tripping the temperature validation.
        generate_kwargs.update(do_sample=False)
    gen_kwargs = {**model_inputs, **generate_kwargs}

    failures = []

    def _generate():
        # Grad mode is thread-local, so no_grad has to be entered inside the
        # worker thread; keeping it here also avoids holding the context open
        # in the consumer across yields.
        try:
            with torch.no_grad():
                model.generate(**gen_kwargs)
        except Exception as exc:  # thread boundary: re-raised in the caller
            failures.append(exc)
            # Push the stop signal so the consumer loop below cannot block
            # forever waiting for text that will never arrive.
            streamer.end()

    thread = Thread(target=_generate)
    thread.start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer
    thread.join()
    if failures:
        raise failures[0]
 



# Chat display component, created up front and handed to ChatInterface below.
chatbot = gr.Chatbot(height=450)

with gr.Blocks(css=CSS) as demo:
    # Page header: title, description and a "duplicate this Space" button.
    gr.HTML(TITLE)
    gr.HTML(DESCRIPTION)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")

    # Extra controls are constructed with render=False and passed to
    # ChatInterface, which receives them as additional inputs.
    _parameters_accordion = gr.Accordion(label="⚙️ Parameters", open=False, render=False)
    _temperature_slider = gr.Slider(
        minimum=0,
        maximum=1,
        step=0.1,
        value=0.8,
        label="Temperature",
        render=False,
    )
    _max_length_slider = gr.Slider(
        minimum=128,
        maximum=64000,
        step=1,
        value=64000,
        label="Max Length",
        render=False,
    )

    # Example prompts offered in the chat UI (one single-element row each).
    _example_prompts = [
        ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],
        ["What are 5 creative things I could do with my kids' art? I don't want to throw them away, but it's also so much clutter."],
        ["Tell me a random fun fact about the Roman Empire."],
        ["Show me a code snippet of a website's sticky header in CSS and JavaScript."],
    ]

    # The slider values are passed to stream_chat after (message, history),
    # in the order listed in additional_inputs.
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=_parameters_accordion,
        additional_inputs=[_temperature_slider, _max_length_slider],
        examples=_example_prompts,
        cache_examples=False,
    )


# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()