{
  "api_key": null,
  "verify_url": "http://johnrachwan.pythonanywhere.com",
  "smash_config": {
    "pruners": "None",
    "pruning_ratio": 0.0,
    "factorizers": "None",
    "quantizers": "['awq']",
    "weight_quantization_bits": 4,
    "output_deviation": 0.005,
    "compilers": "None",
    "static_batch": true,
    "static_shape": true,
    "controlnet": "None",
    "unet_dim": 4,
    "device": "cuda",
    "cache_dir": "/ceph/hdd/staff/charpent/.cache/modelsxqkhqp_d",
    "batch_size": 1,
    "model_name": "meta-llama/Meta-Llama-3-8B-Instruct",
    "task": "text_text_generation",
    "max_batch_size": 1,
    "qtype_weight": "torch.qint8",
    "qtype_activation": "torch.quint8",
    "qobserver": "<class 'torch.ao.quantization.observer.MinMaxObserver'>",
    "qscheme": "torch.per_tensor_symmetric",
    "qconfig": "x86",
    "group_size": 128,
    "damp_percent": 0.1,
    "save_load_fn": "hf-awq"
  }
}