Spaces:

shangeth
/

multi-modal-speech-llm

Sleeping

shangeth committed on May 24

Commit

ec9a712

•

1 Parent(s): 101c04c

quantization added

Files changed (3) hide show

app.py CHANGED Viewed

@@ -34,7 +34,7 @@ def plot_mel_spectrogram(mel_spec):
 def get_or_load_model():
     if 'model' not in st.session_state or 'tokenizer' not in st.session_state or 'processor' not in st.session_state:
         ckpt_path = "checkpoints/pretrained_checkpoint.ckpt"
-        model = SpeechLLMLightning.load_from_checkpoint(ckpt_path)
         tokenizer = model.llm_tokenizer
         model.eval()
         model.freeze()

 def get_or_load_model():
     if 'model' not in st.session_state or 'tokenizer' not in st.session_state or 'processor' not in st.session_state:
         ckpt_path = "checkpoints/pretrained_checkpoint.ckpt"
+        model = SpeechLLMLightning.load_from_checkpoint(ckpt_path, quantize=True)
         tokenizer = model.llm_tokenizer
         model.eval()
         model.freeze()

model.py CHANGED Viewed

@@ -13,7 +13,7 @@ else:
 class HubertXCNNEnoder(nn.Module):
     def __init__(self, audio_enc_dim, llm_dim, finetune=False):
         super().__init__()
-        self.encoder = HubertModel.from_pretrained('facebook/hubert-xlarge-ll60k', device_map = device)
         for param in self.encoder.parameters():
             param.requires_grad = False

 class HubertXCNNEnoder(nn.Module):
     def __init__(self, audio_enc_dim, llm_dim, finetune=False):
         super().__init__()
+        self.encoder = HubertModel.from_pretrained('facebook/hubert-xlarge-ll60k').to(device)
         for param in self.encoder.parameters():
             param.requires_grad = False

trainer.py CHANGED Viewed

@@ -6,6 +6,9 @@ from peft import LoraConfig, get_peft_model, PeftModel
 import pytorch_lightning as pl
 from model import HubertXCNNEnoder
 if torch.cuda.is_available():
     # Set the device to CUDA
@@ -15,7 +18,7 @@ else:
     device = "cpu"
 class SpeechLLMLightning(pl.LightningModule):
-    def __init__(self, audio_enc_dim=512, llm_dim=2048, llm_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
         super().__init__()
         self.save_hyperparameters()
@@ -48,6 +51,12 @@ class SpeechLLMLightning(pl.LightningModule):
         self.audio_encoder.eval()
         self.llm_model.eval()
     def encode(self, mel, pre_tokenized_ids, post_tokenized_ids, output_tokenized_ids):
         batch_size = mel.shape[0]

 import pytorch_lightning as pl
 from model import HubertXCNNEnoder
+from torch.quantization import quantize_dynamic
+import torch.jit as jit
 if torch.cuda.is_available():
     # Set the device to CUDA
     device = "cpu"
 class SpeechLLMLightning(pl.LightningModule):
+    def __init__(self, audio_enc_dim=512, llm_dim=2048, llm_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0", quantize=True):
         super().__init__()
         self.save_hyperparameters()
         self.audio_encoder.eval()
         self.llm_model.eval()
+        if quantize:
+            self.llm_model = jit.script(self.llm_model)
+            self.llm_model = quantize_dynamic(
+                                self.llm_model, {nn.Linear}, dtype=torch.qint8
+            )
     def encode(self, mel, pre_tokenized_ids, post_tokenized_ids, output_tokenized_ids):
         batch_size = mel.shape[0]