- Original model is [yanolja/EEVE-Korean-Instruct-10.8B-v1.0](https://huggingface.co/yanolja/EEVE-Korean-Instruct-10.8B-v1.0)
- Quantized using [llama.cpp](https://github.com/ggerganov/llama.cpp); the snippet below shows how to list the GGUF files this repo ships.
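
To check which quantization variants are actually available before downloading, you can list the repo files with `huggingface_hub` (a minimal sketch; only the `Q4_K_M` file is referenced in this README, so treat any other variants as unverified):

```
from huggingface_hub import list_repo_files

# Print every GGUF file available in this repo
for f in list_repo_files("heegyu/EEVE-Korean-Instruct-10.8B-v1.0-GGUF"):
    if f.endswith(".gguf"):
        print(f)
```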

### Usage

Requirements:
```
# GPU build: compile llama-cpp-python with cuBLAS GPU offloading
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir --verbose

# CPU-only build (no CMAKE_ARGS needed)
pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir --verbose

pip install huggingface_hub
```
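
If you want to confirm that the GPU build took effect, recent llama-cpp-python releases expose the underlying `llama_supports_gpu_offload` binding (assumed to be present in your installed version; older releases may not have it):

```
import llama_cpp

# True only if the library was compiled with a GPU backend such as cuBLAS
print(llama_cpp.llama_supports_gpu_offload())
```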

```
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

import time
from pprint import pprint


# Download the quantized model from the Hugging Face Hub
model_name_or_path = "heegyu/EEVE-Korean-Instruct-10.8B-v1.0-GGUF"  # repo id
# 4-bit quantization (Q4_K_M)
model_basename = "ggml-model-Q4_K_M.gguf"  # file name

model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)
print(model_path)


# CPU-only
# lcpp_llm = Llama(
#     model_path=model_path,
#     n_threads=2,
# )

# To run on GPU, use the code below
lcpp_llm = Llama(
    model_path=model_path,
    n_threads=2,      # CPU threads
    n_batch=512,      # Should be between 1 and n_ctx; consider the amount of VRAM on your GPU.
    n_gpu_layers=43,  # Change this value based on your model and your GPU's VRAM pool.
    n_ctx=4096,       # Context window
)


prompt_template = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\nHuman: {prompt}\nAssistant:\n"
# "What is the capital of Korea? Choose one of the options below.
#  (A) Gyeongseong (B) Busan (C) Pyongyang (D) Seoul (E) Jeonju"
text = '한국의 수도는 어디인가요? 아래 선택지 중 골라주세요.\n\n(A) 경성\n(B) 부산\n(C) 평양\n(D) 서울\n(E) 전주'

prompt = prompt_template.format(prompt=text)

start = time.time()
response = lcpp_llm(
    prompt=prompt,
    max_tokens=256,
    temperature=0.5,
    top_p=0.95,
    top_k=50,
    stop=['</s>'],  # Stop generation when this token is produced.
    echo=True,      # Include the prompt in the returned text
)
pprint(response)
print(time.time() - start)
```
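
For interactive use you can stream tokens as they are generated instead of waiting for the whole completion; the same call accepts `stream=True` and then yields chunks (a minimal sketch reusing `lcpp_llm` and `prompt` from the code above):

```
# Print the completion incrementally as chunks arrive
for chunk in lcpp_llm(
    prompt=prompt,
    max_tokens=256,
    temperature=0.5,
    stop=['</s>'],
    stream=True,
):
    print(chunk["choices"][0]["text"], end="", flush=True)
print()
```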

Output (Colab T4 GPU):
```
llama_print_timings:        load time =     942.53 ms
llama_print_timings:      sample time =      27.60 ms /    37 runs   (    0.75 ms per token,  1340.43 tokens per second)
llama_print_timings: prompt eval time =     942.29 ms /    83 tokens (   11.35 ms per token,    88.08 tokens per second)
llama_print_timings:        eval time =    4530.31 ms /    36 runs   (  125.84 ms per token,     7.95 tokens per second)
llama_print_timings:       total time =    5648.42 ms /   119 tokens
{'choices': [{'finish_reason': 'stop',
              'index': 0,
              'logprobs': None,
              'text': 'A chat between a curious user and an artificial '
                      'intelligence assistant. The assistant gives helpful, '
                      "detailed, and polite answers to the user's questions.\n"
                      'Human: 한국의 수도는 어디인가요? 아래 선택지 중 골라주세요.\n'
                      '\n'
                      '(A) 경성\n'
                      '(B) 부산\n'
                      '(C) 평양\n'
                      '(D) 서울\n'
                      '(E) 전주\n'
                      'Assistant:\n'
                      '한국은 동아시아에 위치한 국가로 공식적으로 대한민국이라고 불립니다. 서울은 대한민국의 수도입니다. '
                      '따라서 정답은 (D) 서울입니다.'}],
 'created': 1710404368,
 'id': 'cmpl-af889267-f64e-4516-b0a3-5c8b918d0e36',
 'model': '/root/.cache/huggingface/hub/models--heegyu--EEVE-Korean-Instruct-10.8B-v1.0-GGUF/snapshots/ff014aa6d73ffa8a2857085261cb7a4e6c630bfe/ggml-model-Q4_K_M.gguf',
 'object': 'text_completion',
 'usage': {'completion_tokens': 36, 'prompt_tokens': 83, 'total_tokens': 119}}
5.662428140640259
```

The model's answer translates to: "Korea is a country in East Asia, officially called the Republic of Korea. Seoul is the capital of the Republic of Korea. Therefore, the answer is (D) Seoul."
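
The prompt template above covers a single turn. To carry a short conversation with the same system prompt, a small helper that rebuilds the prompt from the turn history is enough (an illustrative sketch; `build_prompt` is not part of this repo, and the exact multi-turn separators the model was trained with are an assumption here):

```
SYSTEM = ("A chat between a curious user and an artificial intelligence assistant. "
          "The assistant gives helpful, detailed, and polite answers to the user's questions.")

def build_prompt(turns):
    """turns: list of (human_text, assistant_text_or_None) pairs, oldest first.

    The final pair should use None for the assistant text so the model
    continues from the trailing "Assistant:" header.
    """
    parts = [SYSTEM]
    for human, assistant in turns:
        parts.append(f"Human: {human}")
        parts.append(f"Assistant:\n{assistant}" if assistant else "Assistant:\n")
    return "\n".join(parts)

# Single-turn usage reproduces prompt_template.format(prompt=...) from above
print(build_prompt([("한국의 수도는 어디인가요?", None)]))
```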