sarath-shekkizhar committed
Commit fd397ae
1 Parent(s): 4808ae9

Updating to api url

Files changed (2)
  1. app.py +47 -102
  2. requirements.txt +2 -1
app.py CHANGED
@@ -5,125 +5,69 @@ from typing import Iterator
 import gradio as gr
 import spaces
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from openai import OpenAI, APIError
+
+client = OpenAI(
+    base_url="https://hjopms3xd7gembdu.us-east-1.aws.endpoints.huggingface.cloud/v1/",
+    api_key="hf_XXXXX"
+)
 
 MAX_MAX_NEW_TOKENS = 2048
-DEFAULT_MAX_NEW_TOKENS = 1024
+DEFAULT_MAX_NEW_TOKENS = 512
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 DESCRIPTION = """
 Llama3-TenyxChat-70B is part of the TenyxChat series, models trained to function as useful assistants.
 The model is obtained via direct preference tuning using Tenyx's fine-tuning technology. Model details available at our model page.
-
-**The model is currently loaded in 8-bit**.
 """
 
 
 LICENSE = """
 This demo is governed by the license available [here.](https://huggingface.co/spaces/tenyx/Llama3-TenyxChat-70B/blob/main/LICENSE.txt)"""
 
-
-if not torch.cuda.is_available():
-    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
-
-
-if torch.cuda.is_available():
-    model_id = "tenyx/Llama3-TenyxChat-70B"
-    # model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
-    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_4bit=True)
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    tokenizer.use_default_system_prompt = False
-
-
 @spaces.GPU
 def generate(
     message: str,
     chat_history: list[tuple[str, str]],
-    system_prompt: str,
-    max_new_tokens: int = 1024,
-    temperature: float = 0.6,
-    top_p: float = 0.9,
-    top_k: int = 50,
-    repetition_penalty: float = 1.2,
 ) -> Iterator[str]:
     conversation = [{"role": "system", "content": "You are a helpful assistant developed by Tenyx, a conversational voice AI company."}]
-    if system_prompt:
-        conversation.append({"role": "system", "content": system_prompt})
+
     for user, assistant in chat_history:
         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
     conversation.append({"role": "user", "content": message})
-
-    # if not message.strip():
-    #     return "It looks like your message is empty. How can I assist you today?"
-
-    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
-    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-    input_ids = input_ids.to(model.device)
-
-    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-    generate_kwargs = dict(
-        {"input_ids": input_ids},
-        streamer=streamer,
-        max_new_tokens=max_new_tokens,
-        # eos_token_id=[128001, 128009],
-        do_sample=True,
-        top_p=top_p,
-        top_k=top_k,
-        temperature=temperature,
-        num_beams=1,
-        repetition_penalty=repetition_penalty,
-    )
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
-
-    outputs = []
-    for text in streamer:
-        outputs.append(text)
-        yield "".join(outputs)
-
-
-chat_interface = gr.ChatInterface(
+
+    try:
+        response = client.chat.completions.create(
+            model="tgi",
+            messages=conversation,
+            stop=["<|end_of_text|>", "<|eot_id|>"],
+            stream=True,
+            max_tokens=1024,
+        )
+        outputs = []
+        for chunk in response:
+            outputs.append(chunk.choices[0].delta.content)
+            yield "".join(outputs)
+
+    except APIError as e:
+        # Handle API errors or network errors here
+        print(f"Error: {e}")
+        yield "An error occurred. Please try again later."
+
+
+demo = gr.ChatInterface(
     fn=generate,
-    additional_inputs=[
-        gr.Textbox(label="System prompt", lines=6),
-        gr.Slider(
-            label="Max new tokens",
-            minimum=1,
-            maximum=MAX_MAX_NEW_TOKENS,
-            step=1,
-            value=DEFAULT_MAX_NEW_TOKENS,
-        ),
-        gr.Slider(
-            label="Temperature",
-            minimum=0.1,
-            maximum=4.0,
-            step=0.1,
-            value=0.1,
-        ),
-        gr.Slider(
-            label="Top-p (nucleus sampling)",
-            minimum=0.05,
-            maximum=1.0,
-            step=0.05,
-            value=0.9,
-        ),
-        gr.Slider(
-            label="Top-k",
-            minimum=1,
-            maximum=1000,
-            step=1,
-            value=50,
-        ),
-        gr.Slider(
-            label="Repetition penalty",
-            minimum=1.0,
-            maximum=2.0,
-            step=0.05,
-            value=1.2,
-        ),
-    ],
+    # additional_inputs=[
+    #     gr.Textbox(label="System prompt", lines=6),
+    #     gr.Slider(
+    #         label="Max new tokens",
+    #         minimum=1,
+    #         maximum=MAX_MAX_NEW_TOKENS,
+    #         step=1,
+    #         value=DEFAULT_MAX_NEW_TOKENS,
+    #     ),
+    # ],
     stop_btn=None,
     examples=[
        ["Hello there! How are you doing?"],
@@ -134,13 +78,14 @@ chat_interface = gr.ChatInterface(
     ],
 )
 
-with gr.Blocks(css="style.css") as demo:
-    gr.Markdown(DESCRIPTION)
-    gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
-    chat_interface.render()
-    gr.Markdown(LICENSE)
+# with gr.Blocks() as demo:
+#     # gr.Markdown(DESCRIPTION)
+#     # gr.Markdown(LICENSE)
+#     # gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
+#     chat_interface.render()
+
 
 if __name__ == "__main__":
-    demo.queue(max_size=4).launch()
+    demo.queue(max_size=4).launch(share=True)
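A note on the streaming loop added above: with the openai v1 client, a streamed chat completion can emit chunks whose delta.content is None (typically the opening role-only chunk and the final finish chunk), and "".join(outputs) raises a TypeError once a None has been appended. Below is a minimal defensive sketch of the same loop, reusing the client and conversation objects from app.py; the helper name stream_reply is hypothetical, not part of the commit:

    from typing import Iterator

    def stream_reply(client, conversation) -> Iterator[str]:
        # Same request the commit issues against the TGI endpoint.
        response = client.chat.completions.create(
            model="tgi",
            messages=conversation,
            stop=["<|end_of_text|>", "<|eot_id|>"],
            stream=True,
            max_tokens=1024,
        )
        outputs = []
        for chunk in response:
            delta = chunk.choices[0].delta.content
            if delta:  # skip None/empty deltas so the join never sees None
                outputs.append(delta)
                yield "".join(outputs)

The except APIError handler from the commit can wrap this call exactly as it wraps the loop in generate().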
requirements.txt CHANGED
@@ -6,4 +6,5 @@ spaces==0.26.2
 scipy
 sentencepiece
 torch==2.3.0
-transformers==4.40.1
+transformers==4.40.1
+openai==1.25.0
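One follow-up on the client added in app.py: the api_key there is the placeholder hf_XXXXX. An alternative, sketched below and not part of this commit, is to read the token from an environment variable (for a Space, a repository secret) so it never lives in the repo; the variable name HF_API_TOKEN is an assumption:

    import os

    from openai import OpenAI

    # Same endpoint as app.py, but the token comes from the environment.
    # HF_API_TOKEN is an assumed secret name, not taken from the commit.
    client = OpenAI(
        base_url="https://hjopms3xd7gembdu.us-east-1.aws.endpoints.huggingface.cloud/v1/",
        api_key=os.environ["HF_API_TOKEN"],
    )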