Abhaykoul commited on
Commit
a78027d
·
verified ·
1 Parent(s): 9a80034

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +8 -6
  2. app.py +111 -0
  3. requirements.txt +4 -0
README.md CHANGED
@@ -1,12 +1,14 @@
1
  ---
2
- title: HelpingAI 9B
3
- emoji: 📊
4
- colorFrom: blue
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 4.31.5
8
  app_file: app.py
9
  pinned: false
 
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: HelpingAI-9B
3
+ emoji: 👑
4
+ colorFrom: purple
5
+ colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 4.23.0
8
  app_file: app.py
9
  pinned: false
10
+ license: apache-2.0
11
+ header: mini
12
  ---
13
 
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import subprocess
4
+ from threading import Thread
5
+
6
+ import torch
7
+ import spaces
8
+ import gradio as gr
9
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
10
+
11
# Install flash-attn when the app starts.
# subprocess treats ``env=`` as the child's *entire* environment, so the flag
# is merged into a copy of the current environment; otherwise PATH, HOME and
# any proxy/credential variables would be dropped for the pip process.
# NOTE(review): FLASH_ATTENTION_SKIP_CUDA_BUILD presumably tells flash-attn's
# setup to skip compiling CUDA kernels -- confirm against the flash-attn docs.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)


def _require_env(name):
    """Return the non-empty value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty, so a misconfigured
            deployment fails with a message naming the missing setting
            instead of an opaque AttributeError/TypeError further down.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(f"Environment variable {name} must be set")
    return value


# Runtime configuration, read from the process environment.
MODEL_ID = _require_env("MODEL_ID")
CHAT_TEMPLATE = os.environ.get("CHAT_TEMPLATE")
# Last path component of the model id, used as the display name.
MODEL_NAME = MODEL_ID.split("/")[-1]
CONTEXT_LENGTH = int(_require_env("CONTEXT_LENGTH"))
COLOR = os.environ.get("COLOR")
EMOJI = os.environ.get("EMOJI")
DESCRIPTION = os.environ.get("DESCRIPTION")
20
+
21
+
22
@spaces.GPU()
def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
    """Stream the model's reply to ``message`` as a progressively longer string.

    The prompt is built from ``history`` and ``message`` according to the
    module-level ``CHAT_TEMPLATE`` setting, tokenized, cut down to the last
    ``CONTEXT_LENGTH`` tokens, and passed to ``model.generate`` on a background
    thread. Decoded text chunks are read from a ``TextIteratorStreamer``.

    Args:
        message: The new user message.
        history: Iterable of ``(user, assistant)`` pairs from earlier turns.
        system_prompt: System text; used by the "ChatML" and
            "Mistral Instruct" templates (the "Auto" template does not use it).
        temperature: Sampling temperature; a value of 0 or less selects
            greedy decoding instead of sampling.
        max_new_tokens: Upper bound on generated tokens.
        top_k: Top-k sampling cutoff (only used when sampling).
        repetition_penalty: Repetition penalty forwarded to ``generate``.
        top_p: Nucleus sampling cutoff (only used when sampling).

    Yields:
        The reply text accumulated so far, once per streamed chunk.

    Raises:
        ValueError: if ``CHAT_TEMPLATE`` is not one of the supported names.
    """
    # Whether the tokenizer should add its own special tokens to the prompt.
    add_special_tokens = True

    # Format history with a given chat template
    if CHAT_TEMPLATE == "Auto":
        # Streamed chunks are text, so the stop list must hold the EOS *string*
        # (an int token id would never compare equal to a chunk).
        stop_tokens = [tokenizer.eos_token]
        conversation = []
        for user, assistant in history:
            conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
        conversation.append({"role": "user", "content": message})
        # Render the role/content list to prompt text with the tokenizer's own
        # chat template; the tokenizer cannot encode a list of dicts directly.
        instruction = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
        # The rendered template already contains the special tokens it needs.
        add_special_tokens = False
    elif CHAT_TEMPLATE == "ChatML":
        stop_tokens = ["<|endoftext|>", "<|im_end|>"]
        instruction = '<|im_start|>system\n' + system_prompt + '\n<|im_end|>\n'
        for user, assistant in history:
            # Close each past assistant turn the same way user turns are closed,
            # so consecutive turns do not run into each other.
            instruction += '<|im_start|>user\n' + user + '\n<|im_end|>\n<|im_start|>assistant\n' + assistant + '\n<|im_end|>\n'
        instruction += '\n<|im_start|>user\n' + message + '\n<|im_end|>\n<|im_start|>assistant\n'
    elif CHAT_TEMPLATE == "Mistral Instruct":
        stop_tokens = ["</s>", "[INST]", "[INST] ", "<s>", "[/INST]", "[/INST] "]
        instruction = '<s>[INST] ' + system_prompt
        for user, assistant in history:
            instruction += user + ' [/INST] ' + assistant + '</s>[INST]'
        instruction += ' ' + message + ' [/INST]'
    else:
        raise ValueError("Incorrect chat template, select 'Auto', 'ChatML' or 'Mistral Instruct'")
    print(instruction)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    enc = tokenizer(
        [instruction],
        return_tensors="pt",
        padding=True,
        truncation=True,
        add_special_tokens=add_special_tokens,
    )
    input_ids, attention_mask = enc.input_ids, enc.attention_mask

    # Keep only the most recent CONTEXT_LENGTH tokens. The mask must be cut
    # identically, otherwise its shape no longer matches input_ids.
    if input_ids.shape[1] > CONTEXT_LENGTH:
        input_ids = input_ids[:, -CONTEXT_LENGTH:]
        attention_mask = attention_mask[:, -CONTEXT_LENGTH:]

    # A non-positive temperature is not valid for sampling, so fall back to
    # greedy decoding in that case and only pass sampling knobs when sampling.
    sampling = temperature > 0
    generate_kwargs = dict(
        input_ids=input_ids.to(device),
        attention_mask=attention_mask.to(device),
        streamer=streamer,
        do_sample=sampling,
        max_new_tokens=int(max_new_tokens),  # UI sliders may deliver floats
        repetition_penalty=repetition_penalty,
    )
    if sampling:
        generate_kwargs.update(temperature=temperature, top_k=int(top_k), top_p=top_p)

    # generate() blocks until finished, so it runs on a worker thread while
    # this generator consumes the streamer.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
    outputs = []
    for new_token in streamer:
        outputs.append(new_token)
        if new_token in stop_tokens:
            break
        yield "".join(outputs)
72
+
73
+
74
# Load model
# Tensors built in predict() are moved to this device: CUDA when torch reports
# it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# 4-bit bitsandbytes quantization with bfloat16 as the compute dtype.
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)

# Tokenizer and model both come from the checkpoint named by MODEL_ID.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
87
+
88
# Create Gradio interface
# Builds the chat UI around predict(), enables queuing and launches the app.
gr.ChatInterface(
    predict,
    title=EMOJI + " " + MODEL_NAME,
    description=DESCRIPTION,
    examples=[
        ["Can you solve the equation 2x + 3 = 11 for x?"],
        ["Write an epic poem about Ancient Rome."],
        ["Who was the first person to walk on the Moon?"],
        ["Use a list comprehension to create a list of squares for numbers from 1 to 10."],
        ["Recommend some popular science fiction books."],
        ["Can you write a short story about a time-traveling detective?"]
    ],
    # Label previously contained a mis-decoded byte sequence; it is the gear emoji.
    additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
    # Order matters: these map positionally onto predict()'s parameters after
    # (message, history): system_prompt, temperature, max_new_tokens, top_k,
    # repetition_penalty, top_p.
    additional_inputs=[
        gr.Textbox("Perform the task to the best of your ability.", label="System prompt"),
        gr.Slider(0, 1, 0.8, label="Temperature"),
        gr.Slider(128, 4096, 1024, label="Max new tokens"),
        gr.Slider(1, 80, 40, label="Top K sampling"),
        gr.Slider(0, 2, 1.1, label="Repetition penalty"),
        gr.Slider(0, 1, 0.95, label="Top P sampling"),
    ],
    theme=gr.themes.Soft(primary_hue=COLOR),
).queue().launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ transformers==4.38.2
2
+ accelerate
3
+ bitsandbytes
4
+ optimum