Spaces:

chansung
/

vid2persona

Paused

chansung committed on Mar 11

Commit

fe3e540

•

1 Parent(s): 89c0f3c

Update vid2persona/gen/local_openllm.py

Files changed (1) hide show

vid2persona/gen/local_openllm.py CHANGED Viewed

@@ -1,36 +1,26 @@
-# import spaces
 import torch
 from threading import Thread
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers import TextIteratorStreamer
-model = None
-tokenizer = None
-# @spaces.GPU
 def send_message(
     messages: list,
     model_id: str,
     max_input_token_length: int,
     parameters: dict
 ):
-    global tokenizer
-    global model
-    if tokenizer is None:
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        tokenizer.use_default_system_prompt = False
-    if model is None:
-        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
-    input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt")
     if input_ids.shape[1] > max_input_token_length:
         input_ids = input_ids[:, -max_input_token_length:]
         print(f"Trimmed input from conversation as it was longer than {max_input_token_length} tokens.")
-    input_ids = input_ids.to(model.device)
-    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         {"input_ids": input_ids},
         streamer=streamer,
@@ -38,7 +28,7 @@ def send_message(
         num_beams=1,
         **parameters
     )
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
     for text in streamer:

+import spaces
 import torch
 from threading import Thread
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers import TextIteratorStreamer
+from vid2persona import init
+@spaces.GPU
 def send_message(
     messages: list,
     model_id: str,
     max_input_token_length: int,
     parameters: dict
 ):
+    input_ids = init.tokenizer.apply_chat_template(messages, return_tensors="pt")
     if input_ids.shape[1] > max_input_token_length:
         input_ids = input_ids[:, -max_input_token_length:]
         print(f"Trimmed input from conversation as it was longer than {max_input_token_length} tokens.")
+    input_ids = input_ids.to(init.model.device)
+    streamer = TextIteratorStreamer(init.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         {"input_ids": input_ids},
         streamer=streamer,
         num_beams=1,
         **parameters
     )
+    t = Thread(target=init.model.generate, kwargs=generate_kwargs)
     t.start()
     for text in streamer: