chinnadhurai sankar committed on
Commit 9adc663
1 Parent(s): 309a690

initial commit

elm/infer_elm.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, SliceX AI, Inc. All Rights Reserved.
+# Copyright (c) 2024, SliceX AI, Inc.
 
 from elm.model import *
 from elm.utils import batchify
@@ -129,4 +129,4 @@ def generate_elm_responses(elm_model_path,
                 print(json.dumps({"prompt": prompt, "response": response}, indent=4))
                 print("\n***\n")
     return result
-
+
elm/infer_elm_for_demo_app.py ADDED
@@ -0,0 +1,143 @@
+# Copyright (c) 2024, SliceX AI, Inc.
+
+from elm.model import *
+from elm.utils import batchify
+from transformers import AutoTokenizer
+import json
+
+
+def load_elm_model_and_tokenizer(local_path,
+                                 model_config_dict,
+                                 device="cuda",
+                                 load_partial=True,
+                                 get_num_layers_from_ckpt=True):
+    """Load ELM model and tokenizer from local checkpoint."""
+    model_args = ModelArgs(**model_config_dict)
+    model = load_elm_model_from_ckpt(local_path, device=device, model_args=model_args, load_partial=load_partial, get_num_layers_from_ckpt=get_num_layers_from_ckpt)
+
+    tokenizer = AutoTokenizer.from_pretrained(local_path)
+    tokenizer.padding_side = "left"
+    tokenizer.truncation_side = "left"
+    return model, tokenizer
+
+
+def generate_elm_response_given_model(prompts, model, tokenizer,
+                                      device="cuda",
+                                      max_ctx_word_len=1024,
+                                      max_ctx_token_len=0,
+                                      max_new_tokens=500,
+                                      temperature=0.8, # set to 0 for greedy decoding
+                                      top_k=200,
+                                      return_tok_cnt=False,
+                                      return_gen_only=False,
+                                      early_stop_on_eos=False):
+    """Generate responses from ELM model given an input list of prompts ([str])."""
+    if max_ctx_token_len > 0:
+        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=max_ctx_token_len).to(device)
+    else:
+        prompts = [" ".join(p.split(" ")[-max_ctx_word_len:]) for p in prompts]
+        inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(device)
+
+    results = []
+
+    input_tok_cnt = torch.numel(inputs.input_ids)
+
+    model.eval()
+
+    out_tok_cnt = 0
+    with torch.no_grad():
+        temperature = temperature
+        top_k = top_k
+
+        outputs = model.generate(inputs.input_ids, max_new_tokens, temperature=temperature, top_k=top_k,
+                                 return_gen_only=return_gen_only)
+
+        if return_tok_cnt:
+            out_tok_cnt += torch.numel(outputs)
+
+        if early_stop_on_eos:
+            mod_outputs = []
+            for i in range(len(outputs)):
+                curr_out = outputs[i]
+
+                eos_loc_id = -1
+                for j in range(len(outputs[i])):
+                    tok_id = outputs[i][j]
+                    if tok_id == tokenizer.eos_token_id:
+                        eos_loc_id = j
+                        break
+                if eos_loc_id >= 0:
+                    curr_out = outputs[i][:eos_loc_id]
+                mod_outputs.append(curr_out)
+            outputs = mod_outputs
+        detokenized_output = tokenizer.batch_decode(outputs, skip_special_tokens=False)
+
+        results = detokenized_output
+
+    if return_tok_cnt:
+        return results, (input_tok_cnt, out_tok_cnt)
+
+    return results
+
+def load_elm_model_given_path(elm_model_path, elm_model_config={}, device=None):
+    if not device:
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        print(f"Setting device to {device}")
+    model_config_dict = {
+        "hidden_size": elm_model_config.get("hidden_size", 2048),
+        "max_inp_len": elm_model_config.get("max_inp_len", 2048),
+        "num_attention_heads": elm_model_config.get("num_attention_heads", 32),
+        "num_layers": elm_model_config.get("num_layers", 48),
+        "bits": elm_model_config.get("bits", 256),
+        "vocab_size": elm_model_config.get("vocab_size", 50304),
+        "dropout": elm_model_config.get("dropout", 0.1),
+        "use_rotary_embeddings": elm_model_config.get("use_rotary_embeddings", True)
+    }
+
+    model, tokenizer = load_elm_model_and_tokenizer(local_path=elm_model_path, model_config_dict=model_config_dict, device=device, load_partial=True)
+    return {"model": model, "tokenizer": tokenizer}
+
+def generate_elm_responses(elm_model_path,
+                           prompts,
+                           device=None,
+                           elm_model_config={},
+                           eval_batch_size=1,
+                           verbose=True,
+                           model_info=None):
+
+
+    if not device:
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        print(f"Setting device to {device}")
+
+    if not model_info:
+        model_info = load_elm_model_given_path(elm_model_path, elm_model_config=elm_model_config, device=device)
+
+    model, tokenizer = model_info["model"], model_info["tokenizer"]
+
+    #prompts = [prompt if "[INST]" in prompt else f"[INST]{prompt}[/INST]" for prompt in prompts]
+    max_new_tokens = 128
+    if "classification" in elm_model_path or "detection" in elm_model_path:
+        max_new_tokens = 12
+    result = []
+    for prompt_batch in batchify(prompts, eval_batch_size):
+        responses, _ = generate_elm_response_given_model(prompt_batch,
+                                                         model,
+                                                         tokenizer,
+                                                         device=device,
+                                                         max_ctx_word_len=1024,
+                                                         max_ctx_token_len=512,
+                                                         max_new_tokens=max_new_tokens,
+                                                         return_tok_cnt=True,
+                                                         return_gen_only=False,
+                                                         temperature=0.0,
+                                                         early_stop_on_eos=True)
+
+        for prompt, response in zip(prompt_batch, responses):
+            response = response.split("[/INST]")[-1].strip()
+            result.append(response)
+            if verbose:
+                print(json.dumps({"prompt": prompt, "response": response}, indent=4))
+                print("\n***\n")
+    return result
+
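The new demo-app module is driven through generate_elm_responses. A minimal usage sketch (the checkpoint path and prompt below are placeholders for illustration, not part of this commit; it assumes an ELM checkpoint has been downloaded locally):

    from elm.infer_elm_for_demo_app import generate_elm_responses

    # "path/to/elm-checkpoint" is a hypothetical local checkpoint directory.
    responses = generate_elm_responses(
        "path/to/elm-checkpoint",
        ["[INST]Write a one-line product description for a smart mug.[/INST]"],
        eval_batch_size=1,
        verbose=True,
    )
    # Each entry in `responses` is the decoded text after the final "[/INST]" marker.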
elm/model.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, SliceX AI, Inc. All Rights Reserved.
+# Copyright (c) 2024, SliceX AI, Inc.
 
 import copy
 import inspect
@@ -100,15 +100,12 @@ class ELM(torch.nn.Module):
         else:
             x = self.slice_transformer.drop(tok_emb)
 
-        tlayer_id = 0
         ignore_index_id = -100
         loss = torch.zeros(1).to(device)
         loss_denom = 0
 
         for tlayer in self.slice_transformer.h:
             x = tlayer(x, attention_mask=attention_mask)
-
-            tlayer_id += 1
 
         x = self.slice_transformer.ln_f(x)
 
@@ -133,9 +130,8 @@ class ELM(torch.nn.Module):
     def get_num_params(self, non_embedding=True):
         """
         Return the number of parameters in the model.
-        For non-embedding count (default), the position embeddings get subtracted.
-        This assumes parameter tying between input and final layer embeddings. Oherwise
-        If there is no parameter sharing , set the flag to False to include parameters for both layers.
+        For non-embedding count (default), subtract position embeddings if parameter tying applies.
+        If there is no parameter sharing, set the flag to False to include parameters for both input/output layers.
         """
         n_params = sum(p.numel() for p in self.parameters())
         if non_embedding and not self.model_args.use_rotary_embeddings:
@@ -342,6 +338,8 @@ def init_elm_model(model_args=ModelArgs(), device="cuda", model_config_dict=None
         model_args = ModelArgs(**model_config_dict)
 
     dtype = torch.bfloat16 if device=="cuda" and torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
+    if not torch.cuda.is_available():
+        dtype = torch.bfloat16
 
     model = ELM(model_args=model_args).to(dtype=dtype)
 
@@ -415,4 +413,4 @@ def sample_top_p(probs, threshold):
     next_token = torch.multinomial(probs_sort, num_samples=1)
    next_token = torch.gather(probs_idx, -1, next_token)
 
-    return next_token
+    return next_token
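The init_elm_model hunk adds a CPU fallback so that runs without CUDA get bfloat16 instead of float16. A self-contained restatement of the resulting dtype selection (illustrative sketch only, not code from the repository):

    import torch

    def select_elm_dtype(device: str) -> torch.dtype:
        # Prefer bfloat16 on CUDA when the hardware supports it, else float16.
        dtype = (
            torch.bfloat16
            if device == "cuda" and torch.cuda.is_available() and torch.cuda.is_bf16_supported()
            else torch.float16
        )
        # New in this commit: with no GPU at all, force bfloat16
        # (presumably because float16 kernel coverage on CPU is limited).
        if not torch.cuda.is_available():
            dtype = torch.bfloat16
        return dtype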
elm/positional_embeddings.py CHANGED
@@ -9,8 +9,6 @@ def rotate_half(x):
 
 @torch.jit.script
 def apply_rotary_pos_emb(x, cos, sin):
-    # NOTE: This could probably be moved to Triton
-
     # Handle a possible sequence length mismatch in between q and k
     cos = cos[:, :, : x.shape[-2], :]
     sin = sin[:, :, : x.shape[-2], :]
elm/utils.py CHANGED
@@ -1,21 +1,16 @@
-# Copyright (c) 2024, SliceX AI, Inc. All Rights Reserved.
+# Copyright (c) 2024, SliceX AI, Inc.
 
-from prettytable import PrettyTable
 
 def count_parameters(model):
     """Count the number of parameters in the model."""
-    table = PrettyTable(["Modules", "Parameters"])
     total_params = 0
 
     for name, parameter in model.named_parameters():
         if not parameter.requires_grad: continue
         params = parameter.numel()
-        table.add_row([name, params])
         total_params+=params
 
-    print(table)
     print(f"Total Trainable Params: {total_params}")
-
     return total_params
 
 
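With this change count_parameters only reports the total trainable parameter count and the PrettyTable dependency is dropped. A quick sanity check on an arbitrary module (the Linear layer here is just an example, not from the repository):

    import torch
    from elm.utils import count_parameters

    layer = torch.nn.Linear(16, 4)    # 16*4 weights + 4 biases = 68 parameters
    total = count_parameters(layer)   # prints "Total Trainable Params: 68" and returns 68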