Spaces:

Daeyongkwon98
/

chatbot_demo

Sleeping

File size: 3,103 Bytes

5fd0daa
154a87e
a51e5f4
 
 
5fd0daa
 
 
 
154a87e
a51e5f4
 
154a87e
5fd0daa
ed12022
128b55e
d14f87e
154a87e
a51e5f4
 
32006fa
 
 
a51e5f4
 
 
 
32006fa
a51e5f4
154a87e
a51e5f4
 
54b7145
a51e5f4
 
 
f6b2655
 
 
a51e5f4
 
 
154a87e
 
a51e5f4
 
 
 
 
32006fa
 
 
 
 
 
a51e5f4
 
 
 
 
154a87e
a51e5f4
154a87e
 
 
54b7145
a51e5f4
154a87e
 
 
e3ab0ad
32006fa
 
 
154a87e

import os
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import torch
from string import Template
from huggingface_hub import login

# Log in to Hugging Face with the access token read from the ACCESS_TOKEN
# environment variable. The call is skipped when the variable is unset or
# empty: login() without a token falls back to an interactive prompt, which
# cannot be answered in a headless deployment.
hf_token = os.getenv("ACCESS_TOKEN")
if hf_token:
    login(hf_token)

# Prompt template; ${inst} is substituted by respond() when it builds the prompt.
prompt_template = Template("Human: ${inst} </s> Assistant: ")

# Load the model and tokenizer.
model_name = "meta-llama/Llama-3.2-1b-instruct"  # model path
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
# NOTE(review): float16 weights on CPU may be slow or unsupported for some ops
# depending on the installed torch version — confirm on the target runtime.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="cpu").eval()

# Default generation settings; respond() copies these and overrides
# max_new_tokens, temperature and top_p with the values chosen in the UI.
default_generation_config = GenerationConfig(
    temperature=0.1,
    top_k=30,
    top_p=0.5,
    do_sample=True,
    num_beams=1,
    repetition_penalty=1.1,
    min_new_tokens=10,
    max_new_tokens=30
)

# Response generation
def _history_turns(history):
    """Yield ``(speaker, text)`` turns from a chat history.

    Two layouts are accepted: a sequence of ``[user, assistant]`` pairs, and a
    sequence of ``{"role": ..., "content": ...}`` dicts. Pairs without a user
    message are skipped, as are dict entries with an empty content or a role
    other than ``"user"``/``"assistant"``.
    """
    speaker_by_role = {"user": "Human", "assistant": "Assistant"}
    for entry in history or []:
        if isinstance(entry, dict):
            speaker = speaker_by_role.get(entry.get("role"))
            text = entry.get("content")
            if speaker is not None and text:
                yield speaker, text
            continue

        user_text = entry[0]
        if not user_text:
            continue
        bot_text = entry[1]
        yield "Human", user_text
        # A missing reply must not end up in the prompt as the text "None".
        yield "Assistant", "" if bot_text is None else bot_text


def _build_prompt(message, history, system_message):
    """Return the full text prompt: system message, past turns, new message."""
    # NOTE(review): this hand-written "Human/Assistant </s>" layout may not
    # match the chat format the checkpoint was trained on — confirm, and
    # consider tokenizer.apply_chat_template instead.
    parts = [prompt_template.safe_substitute({"inst": system_message})]
    parts.extend(
        f"{speaker}: {text} </s> " for speaker, text in _history_turns(history)
    )
    parts.append(f"Human: {message} </s> Assistant: ")
    return "".join(parts)


def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Generate a reply to ``message`` and yield it incrementally.

    Args:
        message: The new user message.
        history: Previous conversation turns, either ``[user, assistant]``
            pairs or ``{"role", "content"}`` dicts.
        system_message: Text substituted into ``prompt_template`` at the start
            of the prompt.
        max_tokens: Upper bound on newly generated tokens; converted to ``int``.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Yields:
        Progressively longer prefixes of the decoded reply, growing by one
        character per step. The whole reply is generated before the first
        yield, so this only simulates streaming.
    """
    # Start from the module defaults and apply the per-request overrides.
    generation_config = GenerationConfig(
        **default_generation_config.to_dict()
    )
    generation_config.max_new_tokens = int(max_tokens)
    generation_config.temperature = temperature
    generation_config.top_p = top_p

    # The default minimum can exceed a small user-selected maximum; clamp it
    # so the length constraints stay satisfiable.
    min_new_tokens = generation_config.min_new_tokens
    if min_new_tokens is not None and min_new_tokens > generation_config.max_new_tokens:
        generation_config.min_new_tokens = generation_config.max_new_tokens

    prompt = _build_prompt(message, history, system_message)

    # Tokenize the prompt and move the tensors to the model's device.
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]
    response_ids = model.generate(
        **inputs,
        generation_config=generation_config,
        eos_token_id=tokenizer.eos_token_id,  # stop at the tokenizer's EOS token
        pad_token_id=tokenizer.eos_token_id,  # reuse the EOS token for padding
    )

    # Decode only the newly generated tokens (everything after the prompt).
    response_text = tokenizer.decode(response_ids[0][prompt_length:], skip_special_tokens=True)

    # Emit the reply one character at a time as a growing partial text.
    for end in range(1, len(response_text) + 1):
        yield response_text[:end]


# Gradio chat interface setup.
# Extra controls for the chat UI; they are listed in the same order as the
# trailing parameters of respond(): system_message, max_tokens, temperature, top_p.
system_message_box = gr.Textbox(
    label="System message",
    value="You are a friendly and knowledgeable assistant who can discuss a wide range of topics related to music, including genres, artists, albums, instruments, and music history.",
)
max_tokens_slider = gr.Slider(
    label="Max new tokens",
    minimum=1,
    maximum=2048,
    step=1,
    value=30,
)
temperature_slider = gr.Slider(
    label="Temperature",
    minimum=0.1,
    maximum=4.0,
    step=0.1,
    value=0.1,
)
top_p_slider = gr.Slider(
    label="Top-p (nucleus sampling)",
    minimum=0.1,
    maximum=1.0,
    step=0.05,
    value=0.5,
)

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        system_message_box,
        max_tokens_slider,
        temperature_slider,
        top_p_slider,
    ],
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()