aka7774 committed
Commit 58190b4
1 parent: 81ccdcb

Upload 5 files

Files changed (4):
  1. app.py +104 -119
  2. fn.py +206 -94
  3. main.py +15 -4
  4. requirements.txt +1 -1
app.py CHANGED
@@ -1,133 +1,118 @@
 import fn
 import gradio as gr
-import models
-
-def fn_chat(instruction, input, model, dtype, is_messages, template, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
-    args = {
-        'instruction': instruction,
-        'input': input,
-        'model': model,
-        'dtype': dtype,
-        'is_messages': is_messages,
-        'template': template,
-        'max_new_tokens': int(max_new_tokens),
-        'temperature': float(temperature),
-        'top_p': float(top_p),
-        'top_k': int(top_k),
-        'repetition_penalty': float(repetition_penalty),
-    }
-
-    content = fn.infer(args)
-    return content
 
 with gr.Blocks() as demo:
-    opt = models.get_head_options()
-
-    with gr.Row():
-        with gr.Column(scale=1):
-            model = gr.Textbox(
-                value=opt['model'],
-                label='model',
-                show_label=True,
-                interactive=True,
-                show_copy_button=True,
-            )
-
-            dtype = gr.Dropdown(
-                value=opt['dtype'],
-                choices=['int4','int8','fp16', 'bf16'],
-                label='dtype',
-                show_label=True,
-                interactive=True,
-                allow_custom_value=True,
-            )
-            template = gr.Textbox(
-                value=opt['template'],
-                lines=3,
-                label='template',
-                show_label=True,
-                interactive=True,
-                show_copy_button=True,
-            )
-            is_messages = gr.Checkbox(
-                value=opt['is_messages'],
-                label='is_messages',
-                show_label=True,
-                interactive=True,
-            )
-
-        with gr.Column(scale=1):
-            max_new_tokens = gr.Textbox(
-                value=opt['max_new_tokens'],
-                label='max_new_tokens',
-                show_label=True,
-                interactive=True,
-                show_copy_button=True,
-            )
-            temperature = gr.Textbox(
-                value=opt['temperature'],
-                label='temperature',
-                show_label=True,
-                interactive=True,
-                show_copy_button=True,
-            )
-            top_p = gr.Textbox(
-                value=opt['top_p'],
-                label='top_p',
-                show_label=True,
-                interactive=True,
-                show_copy_button=True,
             )
-            top_k = gr.Textbox(
-                value=opt['top_k'],
-                label='top_k',
-                show_label=True,
-                interactive=True,
-                show_copy_button=True,
             )
-            repetition_penalty = gr.Textbox(
-                value=opt['repetition_penalty'],
-                label='repetition_penalty',
-                show_label=True,
-                interactive=True,
-                show_copy_button=True,
             )
 
-    with gr.Accordion('Preset', open=False):
-        gr.Examples(
-            models.get_examples(),
-            [model, dtype, is_messages, template, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        )
 
-    with gr.Row():
-        with gr.Column(scale=1):
-            instruction = gr.Textbox(
-                lines=20,
-                label='instruction',
-                show_label=True,
-                interactive=True,
-                show_copy_button=True,
-            )
-            user_input = gr.Textbox(
-                lines=1,
-                label='input',
-                show_label=True,
-                interactive=True,
-                show_copy_button=True,
-            )
-            chat_button = gr.Button(value='chat')
 
-        with gr.Column(scale=1):
-            said = gr.Textbox(
-                label='said',
-                lines=15,
-                show_label=True,
-                show_copy_button=True,
-            )
 
-    chat_button.click(
-        fn=fn_chat,
-        inputs=[instruction, user_input, model, dtype, is_messages, template, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[said],
     )
 

 import fn
 import gradio as gr
 
 with gr.Blocks() as demo:
+    with gr.Tab('config'):
+        info = gr.Markdown()
+        with gr.Row():
+            with gr.Column(scale=1):
+                model = gr.Textbox(
+                    value=fn.cfg['model_name'],
+                    label='model',
+                    interactive=True,
+                    show_copy_button=True,
                 )
+                qtype = gr.Dropdown(
+                    value=fn.cfg['qtype'],
+                    choices=['bnb','gptq','gguf', 'awq'],
+                    label='qtype',
+                    interactive=True,
                 )
+                dtype = gr.Dropdown(
+                    value=fn.cfg['dtype'],
+                    choices=['4bit','8bit','fp16', 'bf16'],
+                    label='dtype',
+                    interactive=True,
+                    allow_custom_value=True,
                 )
 
+            with gr.Column(scale=1):
+                max_new_tokens = gr.Textbox(
+                    value=fn.cfg['max_new_tokens'],
+                    label='max_new_tokens',
+                    interactive=True,
+                    show_copy_button=True,
+                )
+                temperature = gr.Textbox(
+                    value=fn.cfg['temperature'],
+                    label='temperature',
+                    interactive=True,
+                    show_copy_button=True,
+                )
+                top_p = gr.Textbox(
+                    value=fn.cfg['top_p'],
+                    label='top_p',
+                    interactive=True,
+                    show_copy_button=True,
+                )
+                top_k = gr.Textbox(
+                    value=fn.cfg['top_k'],
+                    label='top_k',
+                    interactive=True,
+                    show_copy_button=True,
+                )
+                repetition_penalty = gr.Textbox(
+                    value=fn.cfg['repetition_penalty'],
+                    label='repetition_penalty',
+                    interactive=True,
+                    show_copy_button=True,
+                )
 
+        with gr.Row():
+            with gr.Column(scale=1):
+                inst_template = gr.Textbox(
+                    value='',
+                    lines=10,
+                    label='inst_template',
+                    interactive=True,
+                    show_copy_button=True,
+                )
+            with gr.Column(scale=1):
+                chat_template = gr.Textbox(
+                    value='',
+                    lines=10,
+                    label='chat_template',
+                    interactive=True,
+                    show_copy_button=True,
+                )
 
+        set_button = gr.Button(value='Save')
+
+    with gr.Tab('inctruct'):
+        with gr.Row():
+            with gr.Column(scale=1):
+                instruction = gr.Textbox(
+                    lines=20,
+                    label='instruction',
+                    interactive=True,
+                    show_copy_button=True,
+                )
+                input = gr.Textbox(
+                    lines=1,
+                    label='input',
+                    interactive=True,
+                    show_copy_button=True,
+                )
+            with gr.Column(scale=1):
+                said = gr.Textbox(
+                    label='said',
+                    lines=25,
+                    show_copy_button=True,
+                )
+        inst_button = gr.Button(value='inst')
+
+    with gr.Tab('chat'):
+        gr.ChatInterface(fn.chat)
+
+    set_button.click(
+        fn=fn.set_config,
+        inputs=[model, qtype, dtype, instruction, inst_template, chat_template, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=[info],
+    )
 
+    inst_button.click(
+        fn=fn.chat,
+        inputs=[input, input, instruction],
         outputs=[said],
     )
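For quick local testing, the revised Blocks UI can also be launched on its own; a minimal sketch, not part of the commit (in this repo the app is normally mounted under FastAPI by main.py via gr.mount_gradio_app):

    # standalone launcher -- hypothetical helper, not shipped in this commit
    from app import demo

    if __name__ == '__main__':
        demo.queue().launch()  # queue() lets the streaming chat tab deliver partial output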
 
fn.py CHANGED
@@ -5,20 +5,32 @@ import datetime
 import json
 import csv
 import gc
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-from transformers import TextStreamer, TextIteratorStreamer
-from transformers import GenerationConfig, AutoConfig, GPTQConfig, AwqConfig
-from models import models
 
 tokenizer = None
 model = None
-loaded_model_name = None
-loaded_dtype = None
 
-def load_model(model_name, dtype = 'int4'):
-    global tokenizer, model, loaded_model_name, loaded_dtype
 
-    if loaded_model_name == model_name and loaded_dtype == dtype:
         return
 
     del model

 import json
 import csv
 import gc
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import TextIteratorStreamer
+from transformers import BitsAndBytesConfig, GPTQConfig
+from threading import Thread
 
 tokenizer = None
 model = None
+default_cfg = {
+    'model_name': None,
+    'qtype': 'bnb',
+    'dtype': '4bit',
+    'instruction': None,
+    'inst_template': None,
+    'chat_template': None,
+    'max_new_tokens': 1024,
+    'temperature': 0.9,
+    'top_p': 0.95,
+    'top_k': 40,
+    'repetition_penalty': 1.2,
+}
+cfg = default_cfg.copy()
 
+def load_model(model_name, qtype = 'bnb', dtype = '4bit'):
+    global tokenizer, model, cfg
 
+    if cfg['model_name'] == model_name and cfg['qtype'] == qtype and cfg['dtype'] == dtype:
         return
 
     del model
@@ -29,100 +41,200 @@ def load_model(model_name, dtype = 'int4'):
     torch.cuda.empty_cache()
 
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
-    if dtype == 'int4':
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            device_map="auto",
-            trust_remote_code=True,
-            quantization_config=BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_compute_dtype=torch.bfloat16,
-            ),
-        )
-    elif dtype == 'int8':
-        model = AutoModelForCausalLM.from_pretrained(
             model_name,
             device_map="auto",
             trust_remote_code=True,
-            quantization_config=BitsAndBytesConfig(
-                torch_dtype=torch.bfloat16,
-                load_in_8bit=True,
-            ),
-        )
-    elif dtype == 'fp16':
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            device_map="auto",
-            trust_remote_code=True,
-            torch_dtype=torch.float16,
-        )
-    elif dtype == 'bf16':
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            device_map="auto",
-            trust_remote_code=True,
-            torch_dtype=torch.bfloat16,
-        )
-    else:
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            trust_remote_code=True,
-            device_map="auto",
         )
 
-    loaded_model_name = model_name
-    loaded_dtype = dtype
 
-def infer(args: dict):
-    global tokenizer, model, loaded_model_name
 
-    if 'model' in args:
-        args['model_name'] = args['model']
 
-    if not tokenizer or 'model_name' in args and loaded_model_name != args['model_name']:
-        if 'dtype' in args:
-            load_model(args['model_name'], args['dtype'])
-        else:
-            load_model(args['model_name'])
-
-    config = {}
-    if args['model_name'] in models:
-        config = models[args['model_name']]
-    config.update(args)
-
-    if config['is_messages']:
-        messages = []
-        messages.append({"role": "system", "content": args['instruction']})
-        if args['input']:
-            messages.append({"role": "user", "content": args['input']})
-        tprompt = tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False)
     else:
-        tprompt = config['template'].format(bos_token=tokenizer.bos_token, instruction=args['instruction'], input=args['input'])
-
-    kwargs = config.copy()
-    for k in ['model_name', 'template', 'instruction', 'input', 'location', 'endpoint', 'model', 'dtype', 'is_messages']:
-        if k in kwargs:
-            del kwargs[k]
-
-    with torch.no_grad():
-        token_ids = tokenizer.encode(tprompt, add_special_tokens=False, return_tensors="pt")
-        if config['is_messages']:
-            output_ids = model.generate(
-                input_ids=token_ids.to(model.device),
-                do_sample=True,
-                **kwargs,
-            )
         else:
-            output_ids = model.generate(
-                input_ids=token_ids.to(model.device),
-                do_sample=True,
-                pad_token_id=tokenizer.pad_token_id,
-                bos_token_id=tokenizer.bos_token_id,
-                eos_token_id=tokenizer.eos_token_id,
-                **kwargs,
-            )
-        out = output_ids.tolist()[0][token_ids.size(1) :]
-        content = tokenizer.decode(out, skip_special_tokens=True)
 
-    return content

     torch.cuda.empty_cache()
 
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+    match qtype:
+        case 'bnb':
+            match dtype:
+                case '4bit' | 'int4':
+                    kwargs = dict(
+                        quantization_config=BitsAndBytesConfig(
+                            load_in_4bit=True,
+                            bnb_4bit_compute_dtype=torch.bfloat16,
+                        ),
+                    )
+                case '8bit' | 'int8':
+                    kwargs = dict(
+                        quantization_config=BitsAndBytesConfig(
+                            load_in_8bit=True,
+                            bnb_4bit_compute_dtype=torch.bfloat16,
+                        ),
+                    )
+                case 'fp16':
+                    kwargs = dict(
+                        torch_dtype=torch.float16,
+                    )
+                case 'bf16':
+                    kwargs = dict(
+                        torch_dtype=torch.bfloat16,
+                    )
+                case _:
+                    kwargs = dict()
+        case 'gptq':
+            match dtype:
+                case '4bit' | 'int4':
+                    kwargs = dict(
+                        quantization_config=GPTQConfig(
+                            bits=4,
+                            tokenizer=tokenizer,
+                        ),
+                    )
+                case '8bit' | 'int8':
+                    kwargs = dict(
+                        quantization_config=GPTQConfig(
+                            bits=8,
+                            tokenizer=tokenizer,
+                        ),
+                    )
+        case 'gguf':
+            kwargs = dict(
+                gguf_file=qtype,
+            )
+        case 'awq':
+            match dtype:
+                case 'fa2':
+                    kwargs = dict(
+                        use_flash_attention_2=True,
+                    )
+                case _:
+                    kwargs = dict()
 
+    model = AutoModelForCausalLM.from_pretrained(
         model_name,
         device_map="auto",
         trust_remote_code=True,
+        **kwargs,
     )
 
+    cfg['model_name'] = model_name
+    cfg['qtype'] = qtype
+    cfg['dtype'] = dtype
 
+def clear_config():
+    global cfg
+    cfg = default_cfg.copy()
+
+def set_config(model_name, qtype, dtype, instruction, inst_template, chat_template, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
+    global cfg
+    load_model(model_name, qtype, dtype)
+    cfg.update({
+        'instruction': instruction,
+        'inst_template': inst_template,
+        'chat_template': chat_template,
+        'max_new_tokens': int(max_new_tokens),
+        'temperature': float(temperature),
+        'top_p': float(top_p),
+        'top_k': int(top_k),
+        'repetition_penalty': float(repetition_penalty),
+    })
+    return 'done.'
+
+def set_config_args(args):
+    global cfg
+
+    load_model(args['model_name'], args['qtype'], args['dtype'])
+    cfg.update(args)
+
+    return 'done.'
+
+def chatinterface_to_messages(message, history):
+    global cfg
+
+    messages = []
 
+    if cfg['instruction']:
+        messages.append({'role': 'system', 'content': cfg['instruction']})
 
+    for pair in history:
+        [user, assistant] = pair
+        if user:
+            messages.append({'role': 'user', 'content': user})
+        if assistant:
+            messages.append({'role': 'assistant', 'content': assistant})
+
+    if message:
+        messages.append({'role': 'user', 'content': message})
+
+    return messages
+
+def chat(message, history = [], instruction = None, args = {}):
+    global tokenizer, model, cfg
+
+    if instruction:
+        cfg['instruction'] = instruction
+        prompt = apply_template(message)
     else:
+        messages = chatinterface_to_messages(message, history)
+        prompt = apply_template(messages)
+
+    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
+
+    streamer = TextIteratorStreamer(
+        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True,
+    )
+
+    generate_kwargs = dict(
+        model_inputs,
+        streamer=streamer,
+        do_sample=True,
+        num_beams=1,
+    )
+    for k in [
+        'max_new_tokens',
+        'temperature',
+        'top_p',
+        'top_k',
+        'repetition_penalty'
+    ]:
+        if cfg[k]:
+            generate_kwargs[k] = cfg[k]
+
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+
+    model_output = ""
+    for new_text in streamer:
+        model_output += new_text
+        if 'fastapi' in args:
+            # fastapi wants only the newly generated delta
+            yield new_text
         else:
+            # gradio wants the full text generated so far on every yield
+            yield model_output
+
+    return model_output
+
+def infer(args: dict):
+    global cfg
+
+    if 'model_name' in args:
+        load_model(args['model_name'], args['qtype'], args['dtype'])
+
+    for k in [
+        'instruction',
+        'inst_template',
+        'chat_template',
+        'max_new_tokens',
+        'temperature',
+        'top_p',
+        'top_k',
+        'repetition_penalty'
+    ]:
+        cfg[k] = args[k]
+
+    if 'messages' in args:
+        return chat(args['input'], args['messages'])
+    if 'instruction' in args:
+        return instruct(args['instruction'], args['input'])
+
+def apply_template(messages):
+    global tokenizer, cfg
+
+    if cfg['chat_template']:
+        tokenizer.chat_template = cfg['chat_template']
 
+    if type(messages) is str:
+        if cfg['inst_template']:
+            return cfg['inst_template'].format(instruction=cfg['instruction'], input=messages)
+        return cfg['instruction']
+    if type(messages) is list:
+        return tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False)
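Taken together, the reworked fn module is configured once (set_config / set_config_args loads the model and stores the generation settings in cfg) and then queried through the streaming chat() generator. A minimal usage sketch, assuming a CUDA-capable environment; the model name is a placeholder, not something this commit ships:

    import fn

    # Load a model and remember model_name/qtype/dtype in fn.cfg.
    fn.set_config_args({
        'model_name': 'your-org/your-instruct-model',  # placeholder
        'qtype': 'bnb',
        'dtype': '4bit',
    })
    fn.cfg['instruction'] = 'You are a helpful assistant.'

    # Gradio-style consumption: each yield is the full text generated so far.
    for partial in fn.chat('Hello!'):
        print(partial)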
main.py CHANGED
@@ -9,8 +9,7 @@ from fastapi.staticfiles import StaticFiles
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
 from fastapi.exceptions import RequestValidationError
-from fastapi.responses import JSONResponse
-
 import fn
 import gradio as gr
 from app import demo

 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
 from fastapi.exceptions import RequestValidationError
+from fastapi.responses import JSONResponse, StreamingResponse
 import fn
 import gradio as gr
 from app import demo
@@ -27,7 +26,19 @@ app.add_middleware(
 
 gr.mount_gradio_app(app, demo, path="/gradio")
 
 @app.post("/infer")
 async def api_infer(args: dict):
-    content = fn.infer(args)
-    return {'content': content}

 
 gr.mount_gradio_app(app, demo, path="/gradio")
 
+@app.post("/set_config")
+async def api_set_config(args: dict):
+    content = fn.set_config_args(args)
+    return {'content': content}
+
 @app.post("/infer")
 async def api_infer(args: dict):
+    args['fastapi'] = True
+    if 'stream' in args and args['stream']:
+        return StreamingResponse(
+            fn.chat(args['input'], [], args['instruct'], args),
+            media_type="text/event-stream",
+        )
+    else:
+        content = fn.chat(args['input'], [], args['instruct'], args)
+        return {'content': content}
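Once the server is running, the two endpoints can be exercised with any HTTP client. A hedged client sketch using requests; the base URL and model name are assumptions, not part of the commit:

    import requests

    base = 'http://localhost:8000'  # adjust to wherever the FastAPI app is served

    # Load a model and store generation settings via fn.set_config_args().
    requests.post(f'{base}/set_config', json={
        'model_name': 'your-org/your-instruct-model',  # placeholder
        'qtype': 'bnb',
        'dtype': '4bit',
    })

    # Stream tokens from /infer; the handler wraps fn.chat() in a StreamingResponse.
    with requests.post(f'{base}/infer', json={
        'input': 'Hello',
        'instruct': 'You are a helpful assistant.',
        'stream': True,
    }, stream=True) as r:
        for chunk in r.iter_content(chunk_size=None, decode_unicode=True):
            print(chunk, end='', flush=True)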
requirements.txt CHANGED
@@ -4,9 +4,9 @@ transformers
 accelerate
 sentencepiece
 bitsandbytes
 scipy
 tiktoken
 einops
-transformers_stream_generator
 protobuf
 python-multipart

 accelerate
 sentencepiece
 bitsandbytes
+autoawq
 scipy
 tiktoken
 einops
 protobuf
 python-multipart