| import gradio as gr |
| import torch |
| from transformers import ( |
| AutoModelForCausalLM, |
| AutoTokenizer, |
| TextIteratorStreamer, |
| ) |
| import os |
| from threading import Thread |
| import spaces |
| import time |
| import subprocess |
|
|
# Install flash-attn at startup, asking its build to skip compiling the CUDA
# kernels. The override is layered on top of the current environment: passing
# only the one variable as `env` would replace the whole environment and drop
# PATH, HOME and any virtualenv variables for the child process.
# Best effort: the return code is intentionally not checked.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)
|
|
# Hugging Face access token. Read with .get() so that a missing HF_TOKEN does
# not abort start-up with a KeyError; `token` is then None and is passed on
# as-is to the from_pretrained() calls below.
token = os.environ.get("HF_TOKEN")
|
|
|
|
# Hub repository that provides both the weights and the tokenizer.
_MODEL_REPO = "microsoft/phi-4"

# Load the causal LM in bfloat16; device placement happens later.
model = AutoModelForCausalLM.from_pretrained(
    _MODEL_REPO,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    token=token,
)

# Tokenizer from the same repository.
tok = AutoTokenizer.from_pretrained(_MODEL_REPO, token=token)

# Token ids that end generation; only the tokenizer's EOS id is used.
terminators = [tok.eos_token_id]
|
|
# Pick the execution device: CUDA when torch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"Using GPU: {torch.cuda.get_device_name(device)}")
else:
    print("Using CPU")

# Move the model onto the chosen device and rebind the module-level name.
model = model.to(device)
| |
|
|
|
|
@spaces.GPU(duration=60)
def chat(message, history, temperature, do_sample, max_tokens):
    """Stream the model's reply to ``message`` given the earlier ``history``.

    Args:
        message: The latest user message.
        history: Earlier turns as ``(user, assistant)`` pairs; a side that is
            ``None`` is left out of the prompt.
        temperature: Sampling temperature. A value of 0 forces greedy decoding.
        do_sample: Whether to sample. When false, decoding is greedy and
            ``temperature`` is not passed to ``generate``.
        max_tokens: Upper bound on the number of newly generated tokens.

    Yields:
        The reply accumulated so far, growing as new text is streamed. The
        final value is yielded once more after the stream ends, so at least
        one value is always produced.
    """
    # Rebuild the conversation in the role/content form the chat template takes.
    conversation = []
    for item in history:
        user_turn, assistant_turn = item[0], item[1]
        if user_turn is not None:
            conversation.append({"role": "user", "content": user_turn})
        if assistant_turn is not None:
            conversation.append({"role": "assistant", "content": assistant_turn})
    conversation.append({"role": "user", "content": message})

    prompt = tok.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tok([prompt], return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(
        tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
    )

    # Honour the caller's sampling switch; a temperature of 0 cannot be
    # sampled with, so it also selects greedy decoding.
    sampling = bool(do_sample) and temperature > 0
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=int(max_tokens),
        do_sample=sampling,
        eos_token_id=terminators,
    )
    if sampling:
        # Temperature only has meaning when sampling, so it is omitted otherwise.
        generate_kwargs["temperature"] = temperature

    # generate() blocks until it finishes, so it runs on a worker thread while
    # this generator drains the streamer.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text

    yield partial_text
|
|
|
|
# Collapsed panel that holds the generation controls.
parameters_accordion = gr.Accordion(
    label="⚙️ Parameters", open=False, render=False
)

# Extra inputs for chat(); their order matches its temperature, do_sample and
# max_tokens parameters.
temperature_slider = gr.Slider(
    minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
)
sampling_checkbox = gr.Checkbox(label="Sampling", value=True)
max_tokens_slider = gr.Slider(
    minimum=128,
    maximum=4096,
    step=1,
    value=512,
    label="Max new tokens",
    render=False,
)

# Assemble the chat UI around chat() and start serving it.
demo = gr.ChatInterface(
    fn=chat,
    examples=[["Write me a poem about Machine Learning."]],
    additional_inputs_accordion=parameters_accordion,
    additional_inputs=[temperature_slider, sampling_checkbox, max_tokens_slider],
    stop_btn="Stop Generation",
    title="Chat With LLMs",
    description="Now Running [microsoft/phi-4](https://huggingface.co/microsoft/phi-4)",
)
demo.launch()
|
|