|
import functools |
|
from collections import OrderedDict |
|
|
|
import gradio as gr |
|
|
|
from modules import shared |
|
|
|
loaders_and_params = OrderedDict({ |
|
'Transformers': [ |
|
'cpu_memory', |
|
'gpu_memory', |
|
'load_in_8bit', |
|
'bf16', |
|
'cpu', |
|
'disk', |
|
'auto_devices', |
|
'load_in_4bit', |
|
'use_double_quant', |
|
'quant_type', |
|
'compute_dtype', |
|
'trust_remote_code', |
|
'no_use_fast', |
|
'use_flash_attention_2', |
|
'alpha_value', |
|
'rope_freq_base', |
|
'compress_pos_emb', |
|
'disable_exllama', |
|
'disable_exllamav2', |
|
'transformers_info', |
|
], |
|
'llama.cpp': [ |
|
'n_ctx', |
|
'n_gpu_layers', |
|
'tensor_split', |
|
'n_batch', |
|
'threads', |
|
'threads_batch', |
|
'no_mmap', |
|
'mlock', |
|
'no_mul_mat_q', |
|
'alpha_value', |
|
'rope_freq_base', |
|
'compress_pos_emb', |
|
'cpu', |
|
'numa', |
|
'no_offload_kqv', |
|
'row_split', |
|
'tensorcores', |
|
'streaming_llm', |
|
'attention_sink_size', |
|
], |
|
'llamacpp_HF': [ |
|
'n_ctx', |
|
'n_gpu_layers', |
|
'tensor_split', |
|
'n_batch', |
|
'threads', |
|
'threads_batch', |
|
'no_mmap', |
|
'mlock', |
|
'no_mul_mat_q', |
|
'alpha_value', |
|
'rope_freq_base', |
|
'compress_pos_emb', |
|
'cpu', |
|
'numa', |
|
'cfg_cache', |
|
'trust_remote_code', |
|
'no_use_fast', |
|
'logits_all', |
|
'no_offload_kqv', |
|
'row_split', |
|
'tensorcores', |
|
'streaming_llm', |
|
'attention_sink_size', |
|
'llamacpp_HF_info', |
|
], |
|
'ExLlamav2_HF': [ |
|
'gpu_split', |
|
'max_seq_len', |
|
'cfg_cache', |
|
'no_flash_attn', |
|
'num_experts_per_token', |
|
'cache_8bit', |
|
'cache_4bit', |
|
'autosplit', |
|
'alpha_value', |
|
'compress_pos_emb', |
|
'trust_remote_code', |
|
'no_use_fast', |
|
], |
|
'ExLlamav2': [ |
|
'gpu_split', |
|
'max_seq_len', |
|
'no_flash_attn', |
|
'num_experts_per_token', |
|
'cache_8bit', |
|
'cache_4bit', |
|
'autosplit', |
|
'alpha_value', |
|
'compress_pos_emb', |
|
'exllamav2_info', |
|
], |
|
'AutoGPTQ': [ |
|
'triton', |
|
'no_inject_fused_attention', |
|
'no_inject_fused_mlp', |
|
'no_use_cuda_fp16', |
|
'wbits', |
|
'groupsize', |
|
'desc_act', |
|
'disable_exllama', |
|
'disable_exllamav2', |
|
'gpu_memory', |
|
'cpu_memory', |
|
'cpu', |
|
'disk', |
|
'auto_devices', |
|
'trust_remote_code', |
|
'no_use_fast', |
|
'autogptq_info', |
|
], |
|
'AutoAWQ': [ |
|
'cpu_memory', |
|
'gpu_memory', |
|
'auto_devices', |
|
'max_seq_len', |
|
'no_inject_fused_attention', |
|
'trust_remote_code', |
|
'no_use_fast', |
|
], |
|
'GPTQ-for-LLaMa': [ |
|
'wbits', |
|
'groupsize', |
|
'model_type', |
|
'pre_layer', |
|
'trust_remote_code', |
|
'no_use_fast', |
|
'gptq_for_llama_info', |
|
], |
|
'ctransformers': [ |
|
'n_ctx', |
|
'n_gpu_layers', |
|
'n_batch', |
|
'threads', |
|
'model_type', |
|
'no_mmap', |
|
'mlock' |
|
], |
|
'QuIP#': [ |
|
'trust_remote_code', |
|
'no_use_fast', |
|
'no_flash_attn', |
|
'quipsharp_info', |
|
], |
|
'HQQ': [ |
|
'hqq_backend', |
|
'trust_remote_code', |
|
'no_use_fast', |
|
] |
|
}) |
|
|
|
|
|
def transformers_samplers(): |
|
return { |
|
'temperature', |
|
'temperature_last', |
|
'dynamic_temperature', |
|
'dynatemp_low', |
|
'dynatemp_high', |
|
'dynatemp_exponent', |
|
'smoothing_factor', |
|
'smoothing_curve', |
|
'top_p', |
|
'min_p', |
|
'top_k', |
|
'typical_p', |
|
'epsilon_cutoff', |
|
'eta_cutoff', |
|
'tfs', |
|
'top_a', |
|
'repetition_penalty', |
|
'presence_penalty', |
|
'frequency_penalty', |
|
'repetition_penalty_range', |
|
'encoder_repetition_penalty', |
|
'no_repeat_ngram_size', |
|
'min_length', |
|
'seed', |
|
'do_sample', |
|
'penalty_alpha', |
|
'num_beams', |
|
'length_penalty', |
|
'early_stopping', |
|
'mirostat_mode', |
|
'mirostat_tau', |
|
'mirostat_eta', |
|
'grammar_file_row', |
|
'grammar_string', |
|
'guidance_scale', |
|
'negative_prompt', |
|
'ban_eos_token', |
|
'custom_token_bans', |
|
'sampler_priority', |
|
'add_bos_token', |
|
'skip_special_tokens', |
|
'auto_max_new_tokens', |
|
'prompt_lookup_num_tokens' |
|
} |
|
|
|
|
|
loaders_samplers = { |
|
'Transformers': transformers_samplers(), |
|
'AutoGPTQ': transformers_samplers(), |
|
'GPTQ-for-LLaMa': transformers_samplers(), |
|
'AutoAWQ': transformers_samplers(), |
|
'QuIP#': transformers_samplers(), |
|
'HQQ': transformers_samplers(), |
|
'ExLlamav2': { |
|
'temperature', |
|
'temperature_last', |
|
'top_p', |
|
'min_p', |
|
'top_k', |
|
'typical_p', |
|
'tfs', |
|
'top_a', |
|
'repetition_penalty', |
|
'presence_penalty', |
|
'frequency_penalty', |
|
'repetition_penalty_range', |
|
'seed', |
|
'mirostat_mode', |
|
'mirostat_tau', |
|
'mirostat_eta', |
|
'ban_eos_token', |
|
'add_bos_token', |
|
'custom_token_bans', |
|
'skip_special_tokens', |
|
'auto_max_new_tokens', |
|
}, |
|
'ExLlamav2_HF': { |
|
'temperature', |
|
'temperature_last', |
|
'dynamic_temperature', |
|
'dynatemp_low', |
|
'dynatemp_high', |
|
'dynatemp_exponent', |
|
'smoothing_factor', |
|
'smoothing_curve', |
|
'top_p', |
|
'min_p', |
|
'top_k', |
|
'typical_p', |
|
'epsilon_cutoff', |
|
'eta_cutoff', |
|
'tfs', |
|
'top_a', |
|
'repetition_penalty', |
|
'presence_penalty', |
|
'frequency_penalty', |
|
'repetition_penalty_range', |
|
'encoder_repetition_penalty', |
|
'no_repeat_ngram_size', |
|
'min_length', |
|
'seed', |
|
'do_sample', |
|
'mirostat_mode', |
|
'mirostat_tau', |
|
'mirostat_eta', |
|
'grammar_file_row', |
|
'grammar_string', |
|
'guidance_scale', |
|
'negative_prompt', |
|
'ban_eos_token', |
|
'custom_token_bans', |
|
'sampler_priority', |
|
'add_bos_token', |
|
'skip_special_tokens', |
|
'auto_max_new_tokens', |
|
}, |
|
'llama.cpp': { |
|
'temperature', |
|
'top_p', |
|
'min_p', |
|
'top_k', |
|
'typical_p', |
|
'tfs', |
|
'repetition_penalty', |
|
'presence_penalty', |
|
'frequency_penalty', |
|
'seed', |
|
'mirostat_mode', |
|
'mirostat_tau', |
|
'mirostat_eta', |
|
'grammar_file_row', |
|
'grammar_string', |
|
'ban_eos_token', |
|
'custom_token_bans', |
|
}, |
|
'llamacpp_HF': { |
|
'temperature', |
|
'temperature_last', |
|
'dynamic_temperature', |
|
'dynatemp_low', |
|
'dynatemp_high', |
|
'dynatemp_exponent', |
|
'smoothing_factor', |
|
'smoothing_curve', |
|
'top_p', |
|
'min_p', |
|
'top_k', |
|
'typical_p', |
|
'epsilon_cutoff', |
|
'eta_cutoff', |
|
'tfs', |
|
'top_a', |
|
'repetition_penalty', |
|
'presence_penalty', |
|
'frequency_penalty', |
|
'repetition_penalty_range', |
|
'encoder_repetition_penalty', |
|
'no_repeat_ngram_size', |
|
'min_length', |
|
'seed', |
|
'do_sample', |
|
'mirostat_mode', |
|
'mirostat_tau', |
|
'mirostat_eta', |
|
'grammar_file_row', |
|
'grammar_string', |
|
'guidance_scale', |
|
'negative_prompt', |
|
'ban_eos_token', |
|
'custom_token_bans', |
|
'sampler_priority', |
|
'add_bos_token', |
|
'skip_special_tokens', |
|
'auto_max_new_tokens', |
|
}, |
|
'ctransformers': { |
|
'temperature', |
|
'top_p', |
|
'top_k', |
|
'repetition_penalty', |
|
'repetition_penalty_range', |
|
}, |
|
} |
|
|
|
loaders_model_types = { |
|
'GPTQ-for-LLaMa': [ |
|
"None", |
|
"llama", |
|
"opt", |
|
"gptj" |
|
], |
|
'ctransformers': [ |
|
"None", |
|
"gpt2", |
|
"gptj", |
|
"gptneox", |
|
"llama", |
|
"mpt", |
|
"dollyv2", |
|
"replit", |
|
"starcoder", |
|
"gptbigcode", |
|
"falcon" |
|
], |
|
} |
|
|
|
|
|
@functools.cache |
|
def list_all_samplers(): |
|
all_samplers = set() |
|
for k in loaders_samplers: |
|
for sampler in loaders_samplers[k]: |
|
all_samplers.add(sampler) |
|
|
|
return sorted(all_samplers) |
|
|
|
|
|
def blacklist_samplers(loader, dynamic_temperature): |
|
all_samplers = list_all_samplers() |
|
output = [] |
|
|
|
for sampler in all_samplers: |
|
if loader == 'All' or sampler in loaders_samplers[loader]: |
|
if sampler.startswith('dynatemp'): |
|
output.append(gr.update(visible=dynamic_temperature)) |
|
else: |
|
output.append(gr.update(visible=True)) |
|
else: |
|
output.append(gr.update(visible=False)) |
|
|
|
return output |
|
|
|
|
|
def get_model_types(loader): |
|
if loader in loaders_model_types: |
|
return loaders_model_types[loader] |
|
|
|
return ["None"] |
|
|
|
|
|
def get_gpu_memory_keys(): |
|
return [k for k in shared.gradio if k.startswith('gpu_memory')] |
|
|
|
|
|
@functools.cache |
|
def get_all_params(): |
|
all_params = set() |
|
for k in loaders_and_params: |
|
for el in loaders_and_params[k]: |
|
all_params.add(el) |
|
|
|
if 'gpu_memory' in all_params: |
|
all_params.remove('gpu_memory') |
|
for k in get_gpu_memory_keys(): |
|
all_params.add(k) |
|
|
|
return sorted(all_params) |
|
|
|
|
|
def make_loader_params_visible(loader): |
|
params = [] |
|
all_params = get_all_params() |
|
if loader in loaders_and_params: |
|
params = loaders_and_params[loader] |
|
|
|
if 'gpu_memory' in params: |
|
params.remove('gpu_memory') |
|
params += get_gpu_memory_keys() |
|
|
|
return [gr.update(visible=True) if k in params else gr.update(visible=False) for k in all_params] |
|
|