Spaces:

jjgomez
/

UCMBot

Sleeping

File size: 2,175 Bytes

b6d9528
d16f61e
 
 
 
c40e28a
d16f61e
 
c40e28a
 
 
3266040
c40e28a
 
 
 
 
d16f61e
d374559
0dec038
4f63a1e
 
c40e28a
d16f61e
c40e28a
d16f61e
 
 
 
 
 
c40e28a
d16f61e
 
c40e28a
 
d16f61e
 
 
 
c40e28a
 
d16f61e
 
 
c40e28a
d16f61e

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, BitsAndBytesConfig, GenerationConfig
import gradio as gr
import torch


# --- Text and sample prompt displayed by the Gradio interface ---------------
title = "????AI ChatBot bajo GPU"
description = "A State-of-the-Art Large-scale Pretrained Response generation model (DialoGPT)"
examples = [["How are you?"]]

# --- Checkpoint and quantization settings -----------------------------------
model_id = "clibrain/Llama-2-13b-ft-instruct-es-gptq-4bit"
config = AutoConfig.from_pretrained(model_id)
# The exllama kernels are switched off here; the opposite setting is kept
# below (commented out) for reference.
# config.quantization_config["use_exllama"] = True
config.quantization_config["disable_exllama"] = True
config.quantization_config["exllama_config"] = {"version": 2}

# --- Device selection: first CUDA device when present, otherwise the CPU ----
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# Log the chosen device between two separator lines.
print("********************", device, "********************", sep="\n")

# --- Model and tokenizer -----------------------------------------------------
# float32 is requested because, per the original author's note, it is
# required when running on CPU. The half-precision variant is kept for
# reference:
# model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=config,
    torch_dtype=torch.float32,
).to(device)

tokenizer = AutoTokenizer.from_pretrained(model_id)


def predict(input, history=None):
    """Generate the next chatbot turn and return the updated conversation.

    Args:
        input: The new user message as plain text.
        history: Token ids of the conversation so far, as returned by a
            previous call (a nested list of ints). ``None`` or an empty
            list starts a fresh conversation.

    Returns:
        A ``(pairs, history)`` tuple. ``pairs`` is a list of
        ``(first, second)`` text tuples built from consecutive
        EOS-separated segments of the decoded conversation, for the
        chatbot output. ``history`` is the full generated token sequence
        as a nested list of ints, to be passed back in on the next call.
    """
    # ``None`` stands in for the former mutable ``[]`` default.
    if history is None:
        history = []

    eos = tokenizer.eos_token

    # Tokenize the new user message, terminated by the EOS marker so that
    # turns can be separated again after decoding.
    new_user_input_ids = tokenizer.encode(
        input + eos, return_tensors="pt"
    ).to(device)

    # Rebuild the previous conversation on the target device. as_tensor
    # accepts the plain list this function returns as well as a tensor.
    past_ids = torch.as_tensor(history, dtype=torch.long, device=device)
    if past_ids.numel():
        bot_input_ids = torch.cat([past_ids, new_user_input_ids], dim=-1)
    else:
        # First turn: nothing to prepend.
        bot_input_ids = new_user_input_ids

    # Generate a response (the output contains the prompt plus the reply).
    generated = model.generate(
        bot_input_ids, max_length=4000, pad_token_id=tokenizer.eos_token_id
    )

    # Decode and split into turns on the tokenizer's own EOS token rather
    # than a hard-coded marker that only matches some tokenizers.
    text = tokenizer.decode(generated[0])
    bos = tokenizer.bos_token
    if bos and bos != eos:
        # Drop BOS markers so they are not shown in the chat window.
        text = text.replace(bos, "")
    turns = [segment.strip() for segment in text.split(eos)]
    print(turns)

    # Pair consecutive segments; an unpaired trailing segment (typically the
    # empty string after the final EOS) is discarded.
    pairs = list(zip(turns[::2], turns[1::2]))

    # Return the state as plain Python lists so it can be stored between
    # calls and rebuilt on any device next time.
    return pairs, generated.tolist()


# Assemble the web UI: a text box plus session state go into `predict`, and
# its results feed a chatbot widget plus the updated session state.
demo = gr.Interface(
    fn=predict,
    inputs=["text", "state"],
    outputs=["chatbot", "state"],
    title=title,
    description=description,
    examples=examples,
    theme="finlaymacklon/boxy_violet",
)

# Start serving the interface.
demo.launch()