import gradio as gr
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
    LlamaTokenizer,
)
import os
from threading import Thread
import spaces
import subprocess

# Install the flash-attn library, skipping the CUDA build.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

# Read the Hugging Face access token from the environment
token = os.environ["HF_TOKEN"]

# Load the apple/OpenELM-270M model and tokenizer.
# The model's own tokenizer throws errors, so NousResearch/Llama-2-7b-hf is used instead.
# Also tried the Korean-model tokenizer beomi/llama-2-ko-7b.
# Tried bumping only the tokenizer up to apple/OpenELM-1.1B <- didn't work.
# Tried switching both to apple/OpenELM-270M-Instruct <- didn't work.
model = AutoModelForCausalLM.from_pretrained(
    "apple/OpenELM-270M-Instruct",
    token=token,
    trust_remote_code=True,
)
tok = AutoTokenizer.from_pretrained(
    "NousResearch/Llama-2-7b-hf",
    token=token,
    trust_remote_code=True,
    tokenizer_class=LlamaTokenizer,
)

# IDs of the tokens that end generation
terminators = [
    tok.eos_token_id,
]

# Move the model to GPU if one is available, otherwise use the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(device)}")
else:
    device = torch.device("cpu")
    print("Using CPU")

model = model.to(device)
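# Inference only: put the model in eval mode. (Hedged addition, not in the original script;
# generate() does not switch the model out of training mode on its own.)
model.eval()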

# Run the chat function on the Space's GPU; the GPU is allocated for at most 60 seconds per call.
@spaces.GPU(duration=60)
def chat(message, history, temperature, do_sample, max_tokens):
    # Convert the chat history into the message format expected by the chat template
    chat = []
    for item in history:
        chat.append({"role": "user", "content": item[0]})
        if item[1] is not None:
            chat.append({"role": "assistant", "content": item[1]})
    chat.append({"role": "user", "content": message})
    
    # Build the prompt with the tokenizer's chat template and tokenize it
    messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    model_inputs = tok([messages], return_tensors="pt").to(device)
    
    # Stream the model output with TextIteratorStreamer.
    # timeout=20.0: the iterator raises if no new token arrives within 20 seconds.
    streamer = TextIteratorStreamer(
        tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
    )
    
    # Set up the generation parameters.
    # model_inputs (input_ids and attention_mask) is unpacked into the generate() kwargs.
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,  # maximum number of new tokens to generate
        do_sample=do_sample,  # honor the "Sampling" checkbox from the UI
        temperature=temperature,  # higher values give more diverse output
        eos_token_id=terminators,  # termination token IDs
    )

    # With temperature 0, fall back to greedy decoding (no sampling)
    if temperature == 0:
        generate_kwargs["do_sample"] = False

    # Start generation in a separate thread so the streamer can be consumed here
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    # Accumulate and yield the text as it streams in
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text

    yield partial_text
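
# A hedged smoke-test sketch: the generator can also be exercised directly, outside Gradio,
# with the same arguments ChatInterface would pass (history is a list of [user, assistant] pairs):
#   for partial in chat("hello", history=[], temperature=0.7, do_sample=True, max_tokens=64):
#       print(partial)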

# Create the chat UI with Gradio's ChatInterface
demo = gr.ChatInterface(
    fn=chat,
    examples=[["let's talk about korea"]],
    additional_inputs_accordion=gr.Accordion(
        label="โš™๏ธ Parameters", open=False, render=False
    ),
    additional_inputs=[
        gr.Slider(
            minimum=0, maximum=1, step=0.1, value=0.7, label="Temperature", render=False
        ),
        gr.Checkbox(label="Sampling", value=True),
        gr.Slider(
            minimum=128,
            maximum=4096,
            step=1,
            value=512,
            label="Max new tokens",
            render=False,
        ),
    ],
    stop_btn="Stop Generation",
    title="Chat With LLMs",
    description="Now Running [apple/OpenELM-270M](https://huggingface.co/apple/OpenELM-270M)",
)

# Launch the Gradio interface
demo.launch()