Update app.py
app.py CHANGED
@@ -15,18 +15,14 @@ from transformers import (
 import subprocess
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

-from vllm import LLM, SamplingParams
-
 DESCRIPTION = """\
 # ORLM LLaMA-3-8B
-
 Hello! I'm ORLM-LLaMA-3-8B, here to automate your optimization modeling tasks! Check our [repo](https://github.com/Cardinal-Operations/ORLM) and [paper](https://arxiv.org/abs/2405.17743)!
 """

 MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 4096
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-model_id = "CardinalOperations/ORLM-LLaMA-3-8B"

 # quantization_config = BitsAndBytesConfig(
 # load_in_4bit=True,
@@ -35,21 +31,19 @@ model_id = "CardinalOperations/ORLM-LLaMA-3-8B"
 # bnb_4bit_quant_type= "nf4")
 # quantization_config = BitsAndBytesConfig(load_in_8bit=True)

-
-
-
-
-
-
-
-#
-
-
-
-
-
-
-@spaces.GPU(duration=60)
+model_id = "CardinalOperations/ORLM-LLaMA-3-8B"
+tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
+    # quantization_config=quantization_config,
+)
+model.eval()
+
+
+@spaces.GPU(duration=100)
 def generate(
     message: str,
     chat_history: list[tuple[str, str]],
@@ -62,33 +56,33 @@ def generate(
     if chat_history != []:
         return "Sorry, I am an instruction-tuned model and currently do not support chatting. Please try clearing the chat history or refreshing the page to ask a new question."

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    tokenized_example = tokenizer(message, return_tensors='pt', max_length=MAX_INPUT_TOKEN_LENGTH, truncation=True)
+    input_ids = tokenized_example.input_ids
+    input_ids = input_ids.to(model.device)
+
+    streamer = TextIteratorStreamer(tokenizer, timeout=50.0, skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = dict(
+        {"input_ids": input_ids},
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=False if temperature == 0.0 else True,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+        num_beams=1,
+        repetition_penalty=repetition_penalty,
+        eos_token_id=[tok.eos_token_id],
+    )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        yield "".join(outputs)
+
+    # outputs.append("\n\nI have now attempted to solve the optimization modeling task! Please try executing the code in your environment, making sure it is equipped with `coptpy`.")
+    # yield "".join(outputs)


 chat_interface = gr.ChatInterface(
@@ -144,4 +138,4 @@ with gr.Blocks(css="style.css", fill_height=True) as demo:
     chat_interface.render()

 if __name__ == "__main__":
-    demo.queue(max_size=20).launch()
+    demo.queue(max_size=20).launch()
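For reference, the loading pattern the updated app.py switches to can be exercised on its own. Below is a minimal sketch, assuming a CUDA machine with accelerate installed and a working flash-attn build (omit attn_implementation to fall back to the default attention kernels); it illustrates the pattern and is not part of the commit.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "CardinalOperations/ORLM-LLaMA-3-8B"

# Fast tokenizer, matching the updated app.py.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",                        # requires the accelerate package
    torch_dtype=torch.bfloat16,               # half-precision weights, as in the Space
    attn_implementation="flash_attention_2",  # requires flash-attn; remove to use default attention
)
model.eval()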
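The new generate() streams partial output to the Gradio chat by running model.generate in a worker thread and iterating a TextIteratorStreamer. Continuing from the loading sketch above, a minimal standalone version of that pattern might look like this; the prompt text and generation settings are illustrative, not taken from the Space.

from threading import Thread

from transformers import TextIteratorStreamer

prompt = "Formulate a linear program for a small production-planning problem."  # illustrative prompt
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096)
input_ids = inputs.input_ids.to(model.device)

# skip_prompt=True keeps the echoed prompt out of the streamed text.
streamer = TextIteratorStreamer(tokenizer, timeout=50.0, skip_prompt=True, skip_special_tokens=True)

generate_kwargs = dict(
    input_ids=input_ids,
    streamer=streamer,
    max_new_tokens=512,                   # illustrative; the Space defaults to 4096
    do_sample=False,                      # greedy decoding, as when temperature == 0.0
    eos_token_id=tokenizer.eos_token_id,
)

# model.generate blocks until decoding finishes, so it runs in a background
# thread while this thread consumes tokens from the streamer as they arrive.
thread = Thread(target=model.generate, kwargs=generate_kwargs)
thread.start()

pieces = []
for text in streamer:
    pieces.append(text)
    print(text, end="", flush=True)
thread.join()
answer = "".join(pieces)

In app.py the same loop yields the accumulated string to gr.ChatInterface instead of printing it.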