import gradio as gr
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the custom model and tokenizer
model_path = 'redael/model_udc'
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
# GPT-2 has no padding token by default; reuse the EOS token so that
# tokenizer(..., padding=True) below does not raise a ValueError
tokenizer.pad_token = tokenizer.eos_token

# Check if CUDA is available and use GPU if possible, enable FP16 precision
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
if device.type == 'cuda':
    model = model.half()  # Use FP16 precision

def generate_response(prompt, model, tokenizer, max_new_tokens=100, num_beams=1, temperature=0.7, top_p=0.9, repetition_penalty=2.0):
    # The prompt is expected to already be formatted as alternating
    # "User: ..." / "Assistant: ..." turns (see respond() below)
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=max_new_tokens,  # cap the length of the reply, not of the whole sequence
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        num_beams=num_beams,
        do_sample=True,  # temperature and top_p only take effect when sampling is enabled
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,  # discourage repeated phrases
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Post-processing to clean up the response
    response = response.split("Assistant:")[-1].strip()
    response_lines = response.split('\n')
    clean_response = []
    for line in response_lines:
        if "User:" not in line and "Assistant:" not in line:
            clean_response.append(line)
    response = ' '.join(clean_response)
    return response.strip()

def respond(message, history: list[tuple[str, str]]):
    # Prepare the prompt from the history and the new message
    system_message = "You are a friendly chatbot."
    conversation = system_message + "\n"
    for user_message, assistant_response in history:
        conversation += f"User: {user_message}\nAssistant: {assistant_response}\n"
    conversation += f"User: {message}\nAssistant:"

    # Fixed values for generation parameters
    max_tokens = 100  # maximum number of newly generated tokens per reply
    temperature = 0.7
    top_p = 0.9

    response = generate_response(conversation, model, tokenizer, max_new_tokens=max_tokens, temperature=temperature, top_p=top_p)
    
    return response

# Gradio Chat Interface without customizable inputs
demo = gr.ChatInterface(
    respond
)

if __name__ == "__main__":
    demo.launch()
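
# Quick sanity check outside the Gradio UI (a sketch, not part of the published
# app; it assumes the 'redael/model_udc' weights load successfully on the
# current machine). Paste into a Python shell after running the model setup
# above, instead of launching the interface:
#
#   prompt = "You are a friendly chatbot.\nUser: Hello there!\nAssistant:"
#   print(generate_response(prompt, model, tokenizer, max_new_tokens=50))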