Mikhil-jivus committed on
Commit 37ca5d0
1 Parent(s): 6fc221d

Update app.py

Files changed (1)
  1. app.py +138 -84
app.py CHANGED
@@ -1,97 +1,151 @@
- import gradio as gr
  import os
- import torch
- from transformers import (
-     AutoTokenizer,
-     AutoModelForCausalLM,
-     TextIteratorStreamer,
-     pipeline,
- )
  from threading import Thread

- access_token = os.getenv('HF_TOKEN')
-
- # The huggingface model id for Finetuned model
- checkpoint = "Mikhil-jivus/Llama-32-3B-FineTuned"
-
- # Download and load model and tokenizer
- tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True, token=access_token)
- model = AutoModelForCausalLM.from_pretrained(
-     checkpoint, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True, token=access_token
- )
-
- # Text generation pipeline
- phi2 = pipeline(
-     "text-generation",
-     tokenizer=tokenizer,
-     model=model,
-     pad_token_id=tokenizer.eos_token_id,
-     eos_token_id=tokenizer.eos_token_id,
-     device_map="auto",
- )
-
-
- # Function that accepts a prompt and generates text using the phi2 pipeline
- def generate(message, chat_history, max_new_tokens):
-     instruction = "You are Abhishek Rai, Lead sales At PerfectSprout. Here is your Personality: He is energetic smart and has 17 years of sales experience. You are to strictly follow your personality and respond like Abhishek Rai on a cold Call. Be quick to judge the relevance of the pitch and proceed only if the product/pitch promises to add some value to your work. You possess comprehensive real-world knowledge, enabling you to scrutinize and challenge any implausible claims, inaccuracies, or conventional fallacies presented by a sales rep during a cold call."
-     final_prompt = f"Instruction: {instruction}\n"
-
-     for sent, received in chat_history:
-         final_prompt += "User: " + sent + "\n"
-         final_prompt += "Assistant: " + received + "\n"

-     final_prompt += "User: " + message + "\n"
-     final_prompt += "Output:"

-     # Streamer
-     streamer = TextIteratorStreamer(
-         tokenizer=tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=300.0
-     )
-     thread = Thread(
-         target=phi2,
-         kwargs={
-             "text_inputs": final_prompt,
-             "max_new_tokens": max_new_tokens,
-             "streamer": streamer,
-         },
-     )
-     thread.start()

-     generated_text = ""
-     for word in streamer:
-         generated_text += word
-         response = generated_text.strip()

-         if "User:" in response:
-             response = response.split("User:")[0].strip()
-
-         if "Assistant:" in response:
-             response = response.split("Assistant:")[1].strip()

-         yield response


- # Chat interface with gradio
- with gr.Blocks() as demo:
-     gr.Markdown(
-         """
-         # Jivus AI Chatbot Demo
-         This chatbot was created using Llama 3 billion parameter Transformer model.
-         """
-     )
-
-     tokens_slider = gr.Slider(
-         minimum=8,
-         maximum=512,
-         value=256,
-         label="Maximum new tokens"
      )

-     chatbot = gr.ChatInterface(
-         fn=generate,
-         additional_inputs=[tokens_slider],
-         stop_btn=None,
-         examples=[["Who is Leonhard Euler?"]],
-     )

- demo.queue().launch()
  import os
  from threading import Thread
+ from typing import Iterator
+
+ import gradio as gr
+ import spaces
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer


+ DESCRIPTION = """\
+ # Llama 3.2 3B Instruct
+ Llama 3.2 3B is Meta's latest iteration of open LLMs.
+ This is a demo of [`meta-llama/Llama-3.2-3B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct), fine-tuned for instruction following.
+ For more details, please check [our post](https://huggingface.co/blog/llama32).
+ """

+ # Access token for the model (if required)
+ access_token = os.getenv('HF_TOKEN')

+ # Download the Base model
+ #model_id = "./models/Llama-32-3B-Instruct"
+ model_id = "Mikhil-jivus/Llama-32-3B-FineTuned-Instruct"
+ MAX_MAX_NEW_TOKENS = 2048
+ DEFAULT_MAX_NEW_TOKENS = 1024
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

+ #model_id = "nltpt/Llama-3.2-3B-Instruct"
+ tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     device_map="auto",
+     torch_dtype=torch.bfloat16,
+     token=access_token
+ )
+ model.eval()
+
+
+ @spaces.GPU(duration=90)
+ def generate(
+     message: str,
+     chat_history: list[tuple[str, str]],
+     system_prompt: str,
+     max_new_tokens: int = 1024,
+     temperature: float = 0.6,
+     top_p: float = 0.9,
+     top_k: int = 50,
+     repetition_penalty: float = 1.2,
+ ) -> Iterator[str]:
+     conversation = [{"role": "system", "content": system_prompt}]
+     for user, assistant in chat_history:
+         conversation.extend(
+             [
+                 {"role": "user", "content": user},
+                 {"role": "assistant", "content": assistant},
+             ]
+         )
+     conversation.append({"role": "user", "content": message})
+
+     input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
+     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+     input_ids = input_ids.to(model.device)
+
+     streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+     generate_kwargs = dict(
+         {"input_ids": input_ids},
+         streamer=streamer,
+         max_new_tokens=max_new_tokens,
+         do_sample=True,
+         top_p=top_p,
+         top_k=top_k,
+         temperature=temperature,
+         num_beams=1,
+         repetition_penalty=repetition_penalty,
      )
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()
+
+     outputs = []
+     for text in streamer:
+         outputs.append(text)
+         yield "".join(outputs)
+
+
+ chat_interface = gr.ChatInterface(
+     fn=generate,
+     additional_inputs=[
+         gr.Textbox(
+             label="System Prompt",
+             placeholder="Enter system prompt here...",
+             lines=2,
+         ),
+         gr.Slider(
+             label="Max new tokens",
+             minimum=1,
+             maximum=MAX_MAX_NEW_TOKENS,
+             step=1,
+             value=DEFAULT_MAX_NEW_TOKENS,
+         ),
+         gr.Slider(
+             label="Temperature",
+             minimum=0.1,
+             maximum=4.0,
+             step=0.1,
+             value=0.6,
+         ),
+         gr.Slider(
+             label="Top-p (nucleus sampling)",
+             minimum=0.05,
+             maximum=1.0,
+             step=0.05,
+             value=0.9,
+         ),
+         gr.Slider(
+             label="Top-k",
+             minimum=1,
+             maximum=1000,
+             step=1,
+             value=50,
+         ),
+         gr.Slider(
+             label="Repetition penalty",
+             minimum=1.0,
+             maximum=2.0,
+             step=0.05,
+             value=1.2,
+         ),
+     ],
+     stop_btn=None,
+     examples=[
+         ["Hello there! How are you doing?"],
+         ["Can you explain briefly to me what is the Python programming language?"],
+         ["Explain the plot of Cinderella in a sentence."],
+         ["How many hours does it take a man to eat a Helicopter?"],
+         ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
+     ],
+     cache_examples=False,
+ )

+ with gr.Blocks(css="style.css", fill_height=True) as demo:
+     gr.Markdown(DESCRIPTION)
+     gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
+     chat_interface.render()

+ if __name__ == "__main__":
+     demo.queue(max_size=20).launch()
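
For reference, a minimal, hypothetical sketch (not part of this commit) of driving the new streaming generate() directly from Python instead of through the Gradio UI. It assumes the updated app.py above is importable, HF_TOKEN is exported, the Mikhil-jivus/Llama-32-3B-FineTuned-Instruct weights are reachable, and that the spaces.GPU decorator falls back to a no-op outside a ZeroGPU Space; each value yielded by generate() is the cumulative response so far, so only the last yield is kept.

# Hypothetical usage sketch (not part of the commit): call generate()
# without the ChatInterface. Names below mirror the diff; nothing new
# is assumed beyond importing the module as "app".
from app import generate  # the updated app.py shown above

history = [("Hi, who is this?", "This is the assistant speaking.")]
final = ""
for partial in generate(
    message="Summarise our conversation in one sentence.",
    chat_history=history,
    system_prompt="You are a helpful assistant.",
    max_new_tokens=64,
):
    final = partial  # each yield is the full text generated so far
print(final)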