Yemin Shi committed on
Commit 76a2333
1 Parent(s): c717e0f

use ggml model

Files changed (1)
  1. model.py +34 -48
model.py CHANGED
@@ -1,32 +1,25 @@
-from threading import Thread
 from typing import Iterator
-
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-
-# Original version
-model_id = "LinkSoul/Chinese-Llama-2-7b"
-# 4 bit version
-# model_id = "LinkSoul/Chinese-Llama-2-7b-4bit"
-
-if torch.cuda.is_available():
-    if model_id.endswith("4bit"):
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            load_in_4bit=True,
-            torch_dtype=torch.float16,
-            device_map='auto'
-        )
-    else:
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch.float16,
-            device_map='auto'
-        )
-else:
-    model = None
-tokenizer = AutoTokenizer.from_pretrained(model_id)
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
+
+
+def download_model():
+    # See https://github.com/OpenAccess-AI-Collective/ggml-webui/blob/main/tabbed.py
+    # https://huggingface.co/spaces/kat33/llama.cpp/blob/main/app.py
+    print(f"Downloading model: {model_repo}/{model_filename}")
+    file = hf_hub_download(
+        repo_id=model_repo, filename=model_filename
+    )
+    print("Downloaded " + file)
+    return file
+
+model_repo = "LinkSoul/Chinese-Llama-2-7b-ggml"
+model_filename = "Chinese-Llama-2-7b.ggmlv3.q4_0.bin"
+# model_filename = "Chinese-Llama-2-7b.ggmlv3.q8_0.bin"
+model_path = download_model()
 
+# load Llama-2
+llm = Llama(model_path=model_path, verbose=False)
 
 
 def get_prompt(message: str, chat_history: list[tuple[str, str]],
@@ -37,11 +30,20 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
     texts.append(f'{message.strip()} [/INST]')
     return ''.join(texts)
 
+def generate(prompt, max_new_tokens, temperature, top_p, top_k):
+    return llm(prompt,
+               max_tokens=max_new_tokens,
+               stop=["</s>"],
+               temperature=temperature,
+               top_p=top_p,
+               top_k=top_k,
+               stream=False)
+
 
 def get_input_token_length(message: str, chat_history: list[tuple[str, str]], system_prompt: str) -> int:
     prompt = get_prompt(message, chat_history, system_prompt)
-    input_ids = tokenizer([prompt], return_tensors='np')['input_ids']
-    return input_ids.shape[-1]
+    input_ids = llm.tokenize(prompt.encode('utf-8'))
+    return len(input_ids)
 
 
 def run(message: str,
@@ -52,26 +54,10 @@ def run(message: str,
         top_p: float = 0.95,
         top_k: int = 50) -> Iterator[str]:
     prompt = get_prompt(message, chat_history, system_prompt)
-    inputs = tokenizer([prompt], return_tensors='pt').to('cuda')
-
-    streamer = TextIteratorStreamer(tokenizer,
-                                    timeout=10.,
-                                    skip_prompt=True,
-                                    skip_special_tokens=True)
-    generate_kwargs = dict(
-        inputs,
-        streamer=streamer,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
-        top_p=top_p,
-        top_k=top_k,
-        temperature=temperature,
-        num_beams=1,
-    )
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
+    output = generate(prompt, max_new_tokens, temperature, top_p, top_k)
+    yield output['choices'][0]['text']
 
-    outputs = []
-    for text in streamer:
-        outputs.append(text)
-        yield ''.join(outputs)
+    # outputs = []
+    # for resp in streamer:
+    #     outputs.append(resp['choices'][0]['text'])
+    #     yield ''.join(outputs)
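Note: the commented-out loop at the end of run() points at a streaming variant that this commit leaves disabled (the llm call uses stream=False). A minimal sketch of how streaming could be restored with llama-cpp-python's stream=True mode, written as if it were added inside model.py so that llm and get_prompt are in scope; the helper name generate_stream is illustrative and not part of this commit:

from typing import Iterator

def generate_stream(prompt: str, max_new_tokens: int, temperature: float,
                    top_p: float, top_k: int) -> Iterator[str]:
    # With stream=True, llama-cpp-python yields completion chunks one at a
    # time; each chunk has the same choices[0]['text'] shape as the
    # non-streaming result used in run() above.
    for chunk in llm(prompt,
                     max_tokens=max_new_tokens,
                     stop=["</s>"],
                     temperature=temperature,
                     top_p=top_p,
                     top_k=top_k,
                     stream=True):
        yield chunk['choices'][0]['text']

Inside run(), the single yield could then be replaced by the accumulating loop that is currently commented out, yielding ''.join(outputs) after each chunk so the UI updates incrementally.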
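For reference, a rough sketch of how a caller (for example the Space's app code) might use the rewritten module. The function names and parameter names come from the diff; the example message, system prompt, history, and sampling values are made up for illustration:

from model import get_input_token_length, run

chat_history: list[tuple[str, str]] = []            # (user, assistant) pairs
system_prompt = "You are a helpful assistant."      # example value only
message = "你好，请介绍一下你自己。"                  # example only ("Hi, please introduce yourself")

# Token count of the assembled [INST] prompt, computed via llm.tokenize().
print(get_input_token_length(message, chat_history, system_prompt))

# run() is a generator; with stream=False it yields the full completion once.
for text in run(message, chat_history, system_prompt,
                max_new_tokens=1024, temperature=0.8):
    print(text)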