import torch
import gradio as gr
from transformers import AutoTokenizer, pipeline, logging
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

model_name_or_path = "TheBloke/WizardCoder-Guanaco-15B-V1.1-GPTQ"
model_basename = "gptq_model-4bit-128g"

use_triton = False
device = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

quantize_config = BaseQuantizeConfig(
    bits=4,          # quantize the model to 4-bit
    group_size=128,  # 128 is the recommended group size
    desc_act=False,  # False can significantly speed up inference, but perplexity may be slightly worse
)

model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename=model_basename,
    use_safetensors=True,
    trust_remote_code=False,
    device=device,
    use_triton=use_triton,
    quantize_config=quantize_config,
    cache_dir="models/",
)

"""
To download from a specific branch, use the revision parameter, as in this example:

model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
    revision="gptq-4bit-32g-actorder_True",
    model_basename=model_basename,
    use_safetensors=True,
    trust_remote_code=False,
    device="cuda:0",
    quantize_config=None)
"""

def code_gen(text):
    # Silence transformers warnings so only the generated output is printed.
    logging.set_verbosity(logging.CRITICAL)
    print("*** Pipeline:")
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=124,
        do_sample=True,  # sampling must be enabled for temperature/top_p to take effect
        temperature=0.7,
        top_p=0.95,
        repetition_penalty=1.15,
    )
    response = pipe(text)
    print(response)
    return response[0]["generated_text"]

iface = gr.Interface(
    fn=code_gen,
    inputs=gr.Textbox(label="Input Source Code"),
    outputs="text",
    title="Code Generation",
)
iface.launch()
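
# A minimal smoke-test sketch (not part of the original script): before starting the
# Gradio UI, you can call code_gen() directly to confirm the quantized model loads and
# generates. The prompt below is an illustrative assumption, not the model's required
# template. Note that iface.launch() blocks, so run such a check before launching.
#
#     sample = code_gen("Write a Python function that reverses a string.")
#     print(sample)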