Commit 5cfeca6 by XXXXRT666 · Parent(s): 7bdf3c3

Cache CUDA Graph

Files changed:
- AR/models/structs.py               +4  -6
- AR/models/t2s_model_abc.py         +33 -13
- AR/models/t2s_model_flash_attn.py  +62 -38
- README.md                          +1  -1
- inference_webui.py                 +6  -3
AR/models/structs.py CHANGED

@@ -1,3 +1,7 @@
+"""
+Modified From https://github.com/XXXXRT666/GPT-SoVITS
+"""
+
 from __future__ import annotations

 from dataclasses import dataclass

@@ -48,7 +52,6 @@ class T2SSession:
         self.y_len = y_len

         # Cache
-        self.kv_cache = decoder.init_cache(bsz)
         self.sampler = Sampler(bsz, decoder.vocab_size)

         # Forward args

@@ -62,11 +65,6 @@ class T2SSession:
         self.input_pos = torch.zeros_like(self.prefill_len)
         self.input_pos.add_(self.prefill_len)

-        # CUDA Graph
-        self.graph: Optional[torch.cuda.CUDAGraph] = None
-        self.xy_pos_ = torch.rand((bsz, 1, decoder.embedding_dim)).to(dtype)
-        self.xy_dec_ = torch.rand((bsz, 1, decoder.embedding_dim)).to(dtype)
-
         # EOS
         self.completed = torch.Tensor([False] * len(self.x)).bool().to(device)
         self.y_results: List[Tensor] = [None] * len(self.x)  # type: ignore
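These removals are the heart of the commit: the CUDA graph and its static buffers leave the per-request session. A captured torch.cuda.CUDAGraph replays against the exact tensor addresses it saw at capture time, so those buffers must outlive any single request; they reappear below as fields of the long-lived CUDAGraphRunner in t2s_model_flash_attn.py. A minimal sketch of that ownership rule (GraphOwner is a hypothetical name, not from the repo):

    from typing import Optional

    import torch

    class GraphOwner:
        # Illustrative sketch: a captured CUDA graph reads and writes the
        # exact tensor addresses it saw at capture time, so the graph and
        # its static buffers must share one lifetime -- a long-lived
        # runner, not a per-request session.
        def __init__(self, embedding_dim: int, dtype: torch.dtype = torch.float16):
            self.graph: Optional[torch.cuda.CUDAGraph] = None
            self.xy_pos_ = torch.rand((1, 1, embedding_dim), device="cuda").to(dtype)
            self.xy_dec_ = torch.rand((1, 1, embedding_dim), device="cuda").to(dtype)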
    	
AR/models/t2s_model_abc.py CHANGED

@@ -1,9 +1,14 @@
+"""
+Modified From https://github.com/XXXXRT666/GPT-SoVITS
+"""
+
 from __future__ import annotations

 import os
 from abc import ABC, abstractmethod
 from contextlib import nullcontext
 from typing import Any, Dict, List, MutableSequence, Optional, Tuple, Type
+import time

 import torch
 import torch._inductor.config

@@ -31,6 +36,7 @@ class Sampler(nn.Module):
         self.register_buffer("samples", torch.zeros((batch_size,), dtype=torch.int32), persistent=False)

         self.__CUDAGraph: Optional[CUDAGraph] = None
+

     def empty_cache(self):
         self.logits.zero_()

@@ -139,6 +145,7 @@ class Sampler(nn.Module):
         return idx_next

     def capture(self, temperature: float, top_k: int, top_p: float):
+        t1 = time.perf_counter()
         s = torch.cuda.Stream()
         s.wait_stream(torch.cuda.current_stream())

@@ -153,7 +160,9 @@
         with torch.cuda.graph(self.__CUDAGraph):
             self.samples = self.__sample_cuda_graph(logits, temperature, top_k, top_p)
         torch.cuda.synchronize()
+        print("Sample", time.perf_counter() - t1)

+    # @torch.jit.script
     def sample(
         self,
         logits: Tensor,
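For reference, the capture() being timed here follows PyTorch's standard graph-capture recipe: warm the operation up on a side stream so allocator setup is not recorded, then record the kernels under torch.cuda.graph. A minimal standalone sketch, with fn standing in for the module's private __sample_cuda_graph:

    import torch

    def capture(fn, static_logits: torch.Tensor):
        # Warm up on a side stream so one-time setup work (allocations,
        # autotuning) is not baked into the recording.
        s = torch.cuda.Stream()
        s.wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(s):
            for _ in range(3):
                fn(static_logits)
        torch.cuda.current_stream().wait_stream(s)

        # Record: kernels launched inside this block go into the graph
        # instead of executing eagerly.
        graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(graph):
            static_out = fn(static_logits)
        torch.cuda.synchronize()
        return graph, static_out  # later: graph.replay(); read static_out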
@@ -162,21 +171,32 @@
         top_k: int,
         top_p: float,
         repetition_penalty: float,
-        use_cuda_graph=False,
-        idx=-1,
     ) -> Tensor:
-        ...                       # (old method body truncated in this capture)
-        return samples
+        previous_tokens = previous_tokens.long()
+        score = torch.gather(logits, dim=1, index=previous_tokens)
+        score = torch.where(score < 0, score * repetition_penalty, score / repetition_penalty)
+        logits.scatter_(dim=1, index=previous_tokens, src=score)
+
+        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+        cum_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)
+        sorted_indices_to_remove = cum_probs > top_p
+        sorted_indices_to_remove[:, 0] = False  # keep at least one option
+        indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove)
+        logits = logits.masked_fill(indices_to_remove, -float("Inf"))
+
+        logits = logits / max(temperature, 1e-5)
+
+        v, _ = torch.topk(logits, top_k)
+        pivot = v[:, -1].unsqueeze(-1)
+        logits = torch.where(logits < pivot, -float("Inf"), logits)
+
+        probs = torch.nn.functional.softmax(logits, dim=-1)
+        q = torch.empty_like(probs).exponential_(1.0)
+        idx_next = torch.argmax(probs / q, dim=-1, keepdim=True).to(dtype=torch.int32)
+
+        return idx_next


 class KVCacheABC(ABC, nn.Module):
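The new sample() body ends with an exponential-race draw instead of torch.multinomial: dividing the probabilities by i.i.d. Exponential(1) noise and taking the argmax selects index i with probability probs[i] / probs.sum(), and it avoids the CPU synchronization multinomial incurs, which keeps the op capturable inside a CUDA graph. A small self-contained check of the equivalence:

    import torch

    probs = torch.tensor([0.1, 0.2, 0.7])

    # argmax(probs / q) with q ~ Exponential(1) is distributed like
    # torch.multinomial(probs, 1): index i wins the race with
    # probability probs[i] / probs.sum().
    counts = torch.zeros(3)
    for _ in range(10_000):
        q = torch.empty_like(probs).exponential_(1.0)
        counts[torch.argmax(probs / q)] += 1
    print(counts / counts.sum())  # approximately tensor([0.1, 0.2, 0.7])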
    	
AR/models/t2s_model_flash_attn.py CHANGED

@@ -1,8 +1,12 @@
-
+"""
+Modified From https://github.com/XXXXRT666/GPT-SoVITS
+"""
+
 import os
 import time
 import traceback
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Optional
+import gradio as gr

 import flash_attn  # type: ignore
 import torch

@@ -50,7 +54,7 @@ class Attention(AttentionABC):

         attn: Tensor = flash_attn.flash_attn_with_kvcache(
             q, kv_cache.k_cache, kv_cache.v_cache, k, v, cache_seqlens=input_pos - 1
-        )
+        )  # type: ignore

         attn = self.dropout.forward(attn)

@@ -215,57 +219,66 @@ class CUDAGraphRunner:

         self.decoder_path: os.PathLike
         self.decoder_model: T2SDecoderABC = decoder_model.to(self.device, self.dtype)
+
+        self.graph: Optional[torch.cuda.CUDAGraph] = None
+        self.xy_pos_ = torch.rand((1, 1, decoder_model.embedding_dim), device=device).to(dtype)
+        self.xy_dec_ = torch.rand((1, 1, decoder_model.embedding_dim), device=device).to(dtype)
+        self.kv_cache = decoder_model.init_cache(1)
+        self.input_pos = torch.tensor([10]).int().cuda()

     def _handle_request(self, request: T2SRequest) -> List[torch.Tensor]:
         with self.device:
+            for i in self.kv_cache:
+                i.empty()
+
             decoder = self.decoder_model
             session = T2SSession(decoder, request, device=self.device, dtype=self.dtype)
-
-            y = session.y
-            bsz = y.size(0)
+            self.input_pos.copy_(session.input_pos)
+
             t1 = 0.0
-
+            y = session.y
+            bsz = y.size(0)
             torch_profiler = TorchProfiler(request.debug)
-
             with torch_profiler.profiler():
                 for idx in tqdm(range(1500)):
                     if idx == 0:
-                        xy_dec = decoder.h.prefill(session.xy_pos, session.attn_mask_nested, session.kv_cache)
+                        xy_dec = decoder.h.prefill(session.xy_pos, session.attn_mask_nested, self.kv_cache)
                         xy_dec = torch.stack([t[[-1]] for t in xy_dec.unbind()])
                     else:
-                        if request.use_cuda_graph and session.graph is None and torch.cuda.is_available():
-                            session.xy_pos_.copy_(session.xy_pos)
+                        if request.use_cuda_graph and self.graph is None and torch.cuda.is_available():
+                            self.xy_pos_.copy_(session.xy_pos)
                             args, kwds = decoder.pre_forward(session)
-                            session.graph = decoder.capture(
-                                session.input_pos,
-                                session.xy_pos_,
-                                session.xy_dec_,
-                                kv_caches=session.kv_cache,
+                            self.graph = decoder.capture(
+                                self.input_pos,
+                                self.xy_pos_,
+                                self.xy_dec_,
+                                kv_caches=self.kv_cache,
                                 *args,
                                 **kwds,
                             )

                         with torch_profiler.record("AR"):
-                            if session.graph:
-                                session.xy_pos_.copy_(session.xy_pos)
-                                session.graph.replay()
-                                xy_dec = session.xy_dec_.clone()
+                            if self.graph:
+                                self.xy_pos_.copy_(session.xy_pos)
+                                self.graph.replay()
+                                xy_dec = self.xy_dec_.clone()
                             else:
                                 args, kwds = decoder.pre_forward(session)
                                 xy_dec = decoder.h.forward(
-                                    session.input_pos,
+                                    self.input_pos,
                                     session.xy_pos,
-                                    session.kv_cache,
+                                    self.kv_cache,
                                     *args,
                                     **kwds,
                                 )

                     decoder.post_forward(idx, session)
                     logits = decoder.ar_predict_layer(xy_dec[:, -1])
-                    session.input_pos.add_(1)
+                    self.input_pos.add_(1)

                     if idx == 0:
                         logits[:, -1] = float("-inf")

                     with torch_profiler.record("Sampling"):
                         samples = session.sampler.sample(
                             logits=logits,

@@ -274,27 +287,26 @@ class CUDAGraphRunner:
                             top_p=request.top_p,
                             repetition_penalty=request.repetition_penalty,
                             temperature=request.temperature,
-                            use_cuda_graph=request.use_cuda_graph,
-                            idx=idx,
                         )

                         session.y = torch.cat([session.y, samples], dim=1)

                     with torch_profiler.record("EOS"):
                         argmax_token = torch.argmax(logits, dim=-1)
                         sample_token = samples.squeeze(1)
                         EOS_mask = (argmax_token == decoder.EOS) | (sample_token == decoder.EOS)

                         newly_done_mask = EOS_mask & (~session.completed)
-                    with torch_profiler.record("EOS2"):
                         newly_done_indices = newly_done_mask.nonzero()

                         if newly_done_indices.numel() > 0:
                             session.y_results[newly_done_indices[0]] = session.y[
                                 newly_done_indices[0], session.y_len : -1
                             ].squeeze(0)
                             session.completed[newly_done_indices] = True

                         if torch.all(session.completed).item():
                             if session.y.size(1) == 0:
                                 session.y = torch.cat([session.y, torch.zeros_like(samples)], dim=1)

@@ -304,11 +316,12 @@ class CUDAGraphRunner:
                                     f"T2S Decoding EOS {session.prefill_len.tolist().__str__().strip('[]')} -> \n{[i.size(0) for i in session.y_results].__str__().strip('[]')}"
                                 )
                                 tqdm.write(f"Infer Speed: {(idx - 1) / (time.perf_counter() - t1):.2f} token/s")
+                                gr.Info(f"Infer Speed: {(idx - 1) / (time.perf_counter() - t1):.2f} token/s", duration=0.75)
                             break
-
                         if (
-                            request.early_stop_num != -1
-                            and (session.y.size(1) - session.y_len) > request.early_stop_num
+                            (request.early_stop_num != -1
+                            and (session.y.size(1) - session.y_len) > request.early_stop_num) or idx == 1499
                         ):
                             for i in range(bsz):
                                 if not session.completed[i].item():

@@ -318,14 +331,25 @@ class CUDAGraphRunner:

                     with torch_profiler.record("NextPos"):
                         y_emb = decoder.ar_audio_embedding(session.y[:, -1:])
-                        session.xy_pos = decoder.ar_audio_position.forward(session.input_pos - session.x_lens, y_emb)
+                        session.xy_pos = decoder.ar_audio_position.forward(self.input_pos - session.x_lens, y_emb)

                     if idx == 2:
                         torch_profiler.start()
                         t1 = time.perf_counter()

-                    ...                   # (removed lines truncated in this capture)
+                    if idx == 51:
+                        torch_profiler.end()
+
+                    if idx % 100 == 0:
+                        match session.device.type:
+                            case "cuda":
+                                torch.cuda.empty_cache()
+                            case "mps":
+                                torch.mps.empty_cache()
+                            case "xpu":
+                                torch.xpu.empty_cache()
+                            case "mtia":
+                                torch.mtia.empty_cache()

                 match session.device.type:
                     case "cuda":

@@ -336,7 +360,7 @@ class CUDAGraphRunner:
                         torch.xpu.empty_cache()
                     case "mtia":
                         torch.mtia.empty_cache()

                 torch_profiler.end()
                 return session.y_results[: request.valid_length]
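Taken together, the runner now follows the capture-once, replay-per-step pattern: copy live values into fixed-address buffers, replay the recorded kernels, and clone the static output before the next replay overwrites it. A self-contained sketch of the same flow on a toy model (illustrative names, not the repo's API):

    import torch

    model = torch.nn.Linear(8, 8).cuda().eval()
    static_in = torch.zeros(1, 8, device="cuda")

    # Warm up before capture so one-time setup is not recorded.
    with torch.no_grad():
        for _ in range(3):
            model(static_in)
    torch.cuda.synchronize()

    # Capture once: records the kernels and the fixed buffer addresses they use.
    graph = torch.cuda.CUDAGraph()
    with torch.no_grad(), torch.cuda.graph(graph):
        static_out = model(static_in)

    # Replay per step: refresh the static input in place, replay, then
    # snapshot the static output before the next replay mutates it.
    for step_in in torch.randn(5, 1, 8, device="cuda"):
        static_in.copy_(step_in)
        graph.replay()
        result = static_out.clone()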
    	
README.md CHANGED

@@ -4,7 +4,7 @@ emoji: 🤖
 colorFrom: indigo
 colorTo: red
 sdk: gradio
-sdk_version:
+sdk_version: 5.20.0
 app_file: inference_webui.py
 pinned: false
 license: mit
    	
inference_webui.py CHANGED

@@ -57,6 +57,10 @@ import LangSegment
 import spaces
 import torch

+import threading
+
+lock = threading.Lock()
+
 version = "v2"  # os.environ.get("version","v2")
 cnhubert_base_path = os.environ.get("cnhubert_base_path", "pretrained_models/chinese-hubert-base")
 bert_path = os.environ.get("bert_path", "pretrained_models/chinese-roberta-wwm-ext-large")

@@ -540,7 +544,7 @@ def get_tts_wav(
         if i_text in cache and if_freeze == True:
             pred_semantic = cache[i_text]
         else:
-            with torch.no_grad():
+            with torch.no_grad(), lock:
                 t2s_request = T2SRequest(
                     [all_phoneme_ids.squeeze(0)],
                     all_phoneme_len,

@@ -552,7 +556,7 @@ def get_tts_wav(
                     temperature=temperature,
                     early_stop_num=1500,
                     use_cuda_graph=True,
-                    debug=True,
+                    # debug=True,
                 )
                 t2s_result = t2s_model.generate(t2s_request)
                 pred_semantic = t2s_result.result

@@ -836,5 +840,4 @@ if __name__ == "__main__":
         server_name="0.0.0.0",
         inbrowser=True,
         show_api=False,
-        server_port=1111,
     )
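With the KV cache, position counter, static buffers, and captured graph all hanging off one shared runner, two concurrent Gradio requests must not interleave, which is what the new module-level lock enforces. The pattern in isolation (run_t2s is a hypothetical wrapper; t2s_model and T2SRequest are the source's objects):

    import threading

    import torch

    lock = threading.Lock()

    def run_t2s(t2s_request):
        # Serialize requests: a second thread replaying the shared graph
        # would clobber the runner's KV cache and static in/out buffers.
        with torch.no_grad(), lock:
            t2s_result = t2s_model.generate(t2s_request)  # t2s_model as in the source
        return t2s_result.result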