import gradio as gr
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, GenerationConfig

# Load the fine-tuned PEFT adapter (and its base model) straight from the Hugging Face Hub;
# from_pretrained accepts a repo id, so no local clone via the deprecated Repository class is needed.
model_name = "adi1193/mistral-postv6"
model = AutoPeftModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cuda",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def generate_response(message, history, temperature=0.9, max_new_tokens=256,
                      top_p=0.95, repetition_penalty=1.2, enable_hinglish=False):
    # gr.ChatInterface calls this function with (message, history, *additional_inputs),
    # so the slider and checkbox values arrive as the trailing arguments.
    # history and enable_hinglish are accepted but not used here: the prompt is single-turn,
    # and the Hinglish prompt logic is still commented out below.
    generation_config = GenerationConfig(
        do_sample=True,
        temperature=max(float(temperature), 1e-2),  # clamp to avoid temperature=0 with sampling
        top_p=float(top_p),
        repetition_penalty=float(repetition_penalty),
        max_new_tokens=int(max_new_tokens),
        pad_token_id=tokenizer.eos_token_id,
    )
    input_str = "###Human: " + message + "###Assistant: "
    inputs = tokenizer(input_str, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, generation_config=generation_config)
    return tokenizer.decode(outputs[0], skip_special_tokens=True).replace(input_str, '')
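
# Quick local smoke test (a sketch, not part of the Gradio app): the chat function can be
# called directly with an empty history and explicit sampling settings once the model loads.
# Uncomment to try it:
# print(generate_response("What can you do?", [], temperature=0.7, max_new_tokens=128,
#                         top_p=0.9, repetition_penalty=1.2))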

# def format_prompt(message, history, enable_hinglish=False):
#     prompt = "<s>"
#     # Adding the Hinglish prompt
#     if enable_hinglish and not any("[INST] You are a Hinglish LLM." in user_prompt for user_prompt, bot_response in history):
#         prompt += Hinglish_Prompt

#     for user_prompt, bot_response in history:
#         prompt += f"[INST] {user_prompt} [/INST]"
#         prompt += f" {bot_response}  "
#     prompt += f"[INST] {message} [/INST]"
#     return prompt

# def generate(prompt, history, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0, enable_hinglish=False):
#     temperature = float(temperature)
#     if temperature < 1e-2:
#         temperature = 1e-2
#     top_p = float(top_p)

#     generate_kwargs = {
#         "model": model,
#         "tokenizer": tokenizer,
#         "max_length": max_new_tokens + len(tokenizer.encode(prompt)),
#         "temperature": temperature,
#         "top_p": top_p,
#         "repetition_penalty": repetition_penalty,
#         "do_sample": True,
#         "seed": 42,
#     }

#     formatted_prompt = format_prompt(prompt, history, enable_hinglish)
#     input_ids = tokenizer.encode(formatted_prompt, return_tensors="pt")
#     output = model.generate(input_ids, **generate_kwargs)
#     return tokenizer.decode(output[0], skip_special_tokens=True)

additional_inputs = [
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=256,
        minimum=0,
        maximum=1024,
        step=64,
        interactive=True,
        info="The maximum number of new tokens to generate",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.90,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.2,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    ),
    gr.Checkbox(
        label="Hinglish",
        value=False,
        interactive=True,
        info="Enables the MistralTalk to talk in Hinglish (Combination of Hindi and English)",
    )
]

css = """
  #mkd {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
  }
"""

with gr.Blocks(css=css) as demo:
    gr.HTML("<h1><center>MistralTalk🗣️<h1><center>")
    gr.HTML("<h3><center>In this demo, you can chat with <a href='https://huggingface.co/adi1193/mistral-postv6'>Mistral-8x7B</a> model. 💬<h3><center>")
    gr.HTML("<h3><center>Learn more about the model <a href='https://huggingface.co/docs/transformers/main/model_doc/mistral'>here</a>. 📚<h3><center>")
    gr.ChatInterface(
        generate_response,
        additional_inputs=additional_inputs,
        theme=gr.themes.Soft(),
        examples=[
            ["What is the interest?"],
            ["How does the universe work?"],
            ["What can you do?"],
            ["What is quantum mechanics?"],
            ["Do you believe in an afterlife?"],
        ],
    )

if __name__ == "__main__":
    demo.launch()
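
# If the Space needs to handle several chat requests at once, Gradio's request queue can be
# enabled instead of a plain launch (a sketch, not in the original script):
#
#     if __name__ == "__main__":
#         demo.queue().launch()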