Spaces:

r3gm
/

SoniTranslate_translate_audio_of_a_video_content

Running on Zero

SoniTranslate_translate_audio_of_a_video_content

File size: 18,242 Bytes

84e44a0

import torch
from lib.infer_pack.models import (
    SynthesizerTrnMs256NSFsid,
    SynthesizerTrnMs256NSFsid_nono,
    SynthesizerTrnMs768NSFsid,
    SynthesizerTrnMs768NSFsid_nono,
)
from vc_infer_pipeline import VC
import traceback, pdb
from lib.audio import load_audio
import numpy as np
import os
from fairseq import checkpoint_utils
import soundfile as sf
from gtts import gTTS
import edge_tts
import asyncio
import nest_asyncio

# model load
def _build_synthesizer(checkpoint):
    """Instantiate the synthesizer class selected by a checkpoint's metadata.

    The class is chosen from ``checkpoint["version"]`` (default ``"v1"``) and
    ``checkpoint["f0"]`` (default ``1``); ``checkpoint["config"]`` is unpacked
    as the constructor's positional arguments.

    Returns:
        The new, weight-less model, or ``None`` when the version is neither
        ``"v1"`` nor ``"v2"``.
    """
    uses_f0 = checkpoint.get("f0", 1) == 1
    version_tag = checkpoint.get("version", "v1")
    if version_tag == "v1":
        if uses_f0:
            return SynthesizerTrnMs256NSFsid(
                *checkpoint["config"], is_half=config.is_half
            )
        return SynthesizerTrnMs256NSFsid_nono(*checkpoint["config"])
    if version_tag == "v2":
        if uses_f0:
            return SynthesizerTrnMs768NSFsid(
                *checkpoint["config"], is_half=config.is_half
            )
        return SynthesizerTrnMs768NSFsid_nono(*checkpoint["config"])
    return None


def get_vc(sid, to_return_protect0, to_return_protect1):
    """Load the voice model ``sid`` into module globals, or unload the current one.

    Args:
        sid: File name of the checkpoint inside ``weight_root``. An empty
            string or empty list requests an unload instead of a load.
        to_return_protect0: Value echoed back in the first "protect" update.
        to_return_protect1: Value echoed back in the second "protect" update.

    Returns:
        When unloading: a single ``{"visible": False, "__type__": "update"}``
        dict. When loading: a 3-tuple of update dicts (speaker selector with
        ``maximum`` set to the speaker count, then the two protect updates,
        which are hidden with value 0.5 when the checkpoint has ``f0 == 0``).

    Raises:
        ValueError: If the checkpoint's ``version`` is neither "v1" nor "v2".

    Side effects:
        Rebinds the globals ``cpt``, ``tgt_sr``, ``version``, ``net_g``,
        ``vc`` and ``n_spk`` (and ``hubert_model`` when unloading).
    """
    global n_spk, tgt_sr, net_g, vc, cpt, version, hubert_model
    if sid == "" or sid == []:
        if hubert_model is not None:  # something is loaded: release it
            print("clean_empty_cache")
            # Rebinding to None drops the references just like `del` would,
            # but does not fail when one of the globals was never bound.
            hubert_model = net_g = n_spk = vc = tgt_sr = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            # Workaround kept from the original code: rebuild the model from
            # the last checkpoint and delete it again before the second
            # cache flush. Only possible when a checkpoint was ever loaded.
            if "cpt" in globals():
                version = cpt.get("version", "v1")
                net_g = _build_synthesizer(cpt)
                del net_g, cpt
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        return {"visible": False, "__type__": "update"}

    person = "%s/%s" % (weight_root, sid)
    print("loading %s" % person)
    # NOTE(review): torch.load unpickles the file; only load checkpoints from
    # trusted sources.
    cpt = torch.load(person, map_location="cpu")
    tgt_sr = cpt["config"][-1]
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
    if_f0 = cpt.get("f0", 1)
    if if_f0 == 0:
        to_return_protect0 = to_return_protect1 = {
            "visible": False,
            "value": 0.5,
            "__type__": "update",
        }
    else:
        to_return_protect0 = {
            "visible": True,
            "value": to_return_protect0,
            "__type__": "update",
        }
        to_return_protect1 = {
            "visible": True,
            "value": to_return_protect1,
            "__type__": "update",
        }
    version = cpt.get("version", "v1")
    model = _build_synthesizer(cpt)
    if model is None:
        # Previously this fell through and silently reused (or failed on) the
        # global model left over from an earlier call.
        raise ValueError(
            "Unsupported model version %r in %s; expected 'v1' or 'v2'"
            % (version, person)
        )
    net_g = model
    del net_g.enc_q
    print(net_g.load_state_dict(cpt["weight"], strict=False))
    net_g.eval().to(config.device)
    if config.is_half:
        net_g = net_g.half()
    else:
        net_g = net_g.float()
    vc = VC(tgt_sr, config)
    n_spk = cpt["config"][-3]
    return (
        {"visible": True, "maximum": n_spk, "__type__": "update"},
        to_return_protect0,
        to_return_protect1,
    )



# inference
def vc_single(
    sid,
    input_audio_path,
    f0_up_key,
    f0_file,
    f0_method,
    file_index,
    file_index2,
    # file_big_npy,
    index_rate,
    filter_radius,
    resample_sr,
    rms_mix_rate,
    protect,
):
    """Convert one audio file with the currently loaded voice model.

    The audio is loaded at 16 kHz, scaled down if its peak exceeds 0.95, and
    handed to ``vc.pipeline`` together with the global HuBERT model, the
    global synthesizer and the remaining arguments (passed through unchanged,
    except that ``f0_up_key`` is converted to ``int``).

    Args:
        sid: Speaker id forwarded to the pipeline.
        input_audio_path: Path of the audio to convert; ``None`` aborts early.
        f0_up_key: Pitch shift, converted with ``int()``.
        f0_file: Forwarded to the pipeline as ``f0_file``.
        f0_method: Forwarded to the pipeline.
        file_index: Index path typed by the user. Surrounding spaces, quotes
            and newlines are stripped and "trained" is replaced by "added".
        file_index2: Index path used instead when ``file_index`` is empty.
        index_rate, filter_radius, rms_mix_rate, protect: Forwarded as-is.
        resample_sr: Requested output rate; used as the returned sample rate
            when it is at least 16000 and differs from the model's rate.

    Returns:
        ``("You need to upload an audio", None)`` when no input path is given;
        ``(message, (sample_rate, audio))`` on success;
        ``(traceback_text, (None, None))`` when anything raises.
    """
    global tgt_sr, net_g, vc, hubert_model, version, cpt
    if input_audio_path is None:
        return "You need to upload an audio", None
    f0_up_key = int(f0_up_key)
    try:
        audio = load_audio(input_audio_path, 16000)
        audio_max = np.abs(audio).max() / 0.95
        if audio_max > 1:
            audio /= audio_max
        times = [0, 0, 0]
        if not hubert_model:
            load_hubert()
        if_f0 = cpt.get("f0", 1)
        file_index = (
            (
                file_index.strip(" ")
                .strip('"')
                .strip("\n")
                .strip('"')
                .strip(" ")
                .replace("trained", "added")
            )
            if file_index != ""
            else file_index2
        )  # fall back to the second index field when the first is empty
        audio_opt = vc.pipeline(
            hubert_model,
            net_g,
            sid,
            audio,
            input_audio_path,
            times,
            f0_up_key,
            f0_method,
            file_index,
            # file_big_npy,
            index_rate,
            if_f0,
            filter_radius,
            tgt_sr,
            resample_sr,
            rms_mix_rate,
            version,
            protect,
            f0_file=f0_file,
        )
        # Keep the result in a local: overwriting the global ``tgt_sr`` here
        # would feed the resample rate to the pipeline as the model's rate on
        # every later call.
        output_sr = resample_sr if tgt_sr != resample_sr >= 16000 else tgt_sr
        index_info = (
            "Using index:%s." % file_index
            if os.path.exists(file_index)
            else "Index not used."
        )
        return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
            index_info,
            times[0],
            times[1],
            times[2],
        ), (output_sr, audio_opt)
    except Exception:
        # Report the failure to the caller instead of raising; interrupts and
        # interpreter exit are intentionally not caught.
        info = traceback.format_exc()
        print(info)
        return info, (None, None)



# hubert model
def load_hubert():
    """Load ``hubert_base.pt`` into the global ``hubert_model``.

    The first model of the loaded ensemble is moved to ``config.device``,
    cast to half or single precision according to ``config.is_half`` and
    switched to evaluation mode.
    """
    global hubert_model
    ensemble, _saved_cfg, _task = checkpoint_utils.load_model_ensemble_and_task(
        ["hubert_base.pt"],
        suffix="",
    )
    hubert_model = ensemble[0]
    hubert_model = hubert_model.to(config.device)
    # Match the precision chosen for the rest of the pipeline.
    hubert_model = hubert_model.half() if config.is_half else hubert_model.float()
    hubert_model.eval()

# config cpu
def use_fp32_config():
    """Rewrite every ``true`` to ``false`` in the five JSON files under ``configs/``.

    Each file is read in full, every occurrence of the text "true" is
    replaced by "false", and the result is written back to the same path.
    """
    config_names = (
        "32k.json",
        "40k.json",
        "48k.json",
        "48k_v2.json",
        "32k_v2.json",
    )
    for config_name in config_names:
        config_path = f"configs/{config_name}"
        with open(config_path, "r") as reader:
            patched = reader.read().replace("true", "false")
        with open(config_path, "w") as writer:
            writer.write(patched)

# config device and torch type
class Config:
    """Device and precision settings used by the inference code.

    Constructing an instance probes the available hardware, may override the
    requested ``device`` / ``is_half``, and stores the four window parameters
    returned by :meth:`device_config` as ``x_pad``, ``x_query``, ``x_center``
    and ``x_max``.
    """

    def __init__(self, device, is_half):
        self.device = device
        self.is_half = is_half
        self.n_cpu = 2  # set cpu cores; 0 means "use every core"
        self.gpu_name = None
        self.gpu_mem = None
        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

    def _cuda_index(self):
        """Return the CUDA ordinal named by ``self.device``.

        "cuda:N" (or any string ending in ":N" / a bare integer) yields N, a
        bare "cuda" yields the current CUDA device, and any other device
        string (e.g. "cpu") yields ``None``.
        """
        device_name = str(self.device)
        try:
            return int(device_name.split(":")[-1])
        except ValueError:
            if device_name.startswith("cuda"):
                return torch.cuda.current_device()
            return None

    @staticmethod
    def _patch_preprocess_script():
        """Replace "3.7" with "3.0" in ``trainset_preprocess_pipeline_print.py``."""
        script_path = "trainset_preprocess_pipeline_print.py"
        with open(script_path, "r") as f:
            strr = f.read().replace("3.7", "3.0")
        with open(script_path, "w") as f:
            f.write(strr)

    def device_config(self) -> tuple:
        """Pick device/precision for this machine and return the window sizes.

        Returns:
            ``(x_pad, x_query, x_center, x_max)``.

        Side effects:
            May change ``self.device``, ``self.is_half``, ``self.gpu_name``,
            ``self.gpu_mem`` and ``self.n_cpu``, and may rewrite the JSON
            files under ``configs/`` and the preprocessing script on disk.
        """
        # A non-CUDA device string (e.g. "cpu") skips the GPU branch even when
        # CUDA is available, instead of crashing while parsing the index.
        i_device = self._cuda_index() if torch.cuda.is_available() else None
        if i_device is not None:
            self.gpu_name = torch.cuda.get_device_name(i_device)
            name_upper = self.gpu_name.upper()
            if (
                ("16" in self.gpu_name and "V100" not in name_upper)
                or "P40" in name_upper
                or "1060" in self.gpu_name
                or "1070" in self.gpu_name
                or "1080" in self.gpu_name
            ):
                print("16 series / 10 series graphics cards and P40 force single precision")
                self.is_half = False
                # Same rewrite as the CPU path, so the v2 config files are
                # patched as well.
                use_fp32_config()
                self._patch_preprocess_script()
            else:
                self.gpu_name = None
            # Total device memory in GiB, rounded (x.6 and above rounds up).
            self.gpu_mem = int(
                torch.cuda.get_device_properties(i_device).total_memory
                / 1024
                / 1024
                / 1024
                + 0.4
            )
            if self.gpu_mem <= 4:
                self._patch_preprocess_script()
        elif torch.backends.mps.is_available():
            print("Supported N-card not found, using MPS for inference")
            self.device = "mps"
        else:
            print("No supported N-card found, using CPU for inference")
            self.device = "cpu"
            self.is_half = False
            use_fp32_config()

        if self.n_cpu == 0:
            self.n_cpu = os.cpu_count() or 1

        if self.is_half:
            # 6GB VRAM configuration
            x_pad = 3
            x_query = 10
            x_center = 60
            x_max = 65
        else:
            # 5GB VRAM configuration
            x_pad = 1
            x_query = 6
            x_center = 38
            x_max = 41

        if self.gpu_mem is not None and self.gpu_mem <= 4:
            x_pad = 1
            x_query = 5
            x_center = 30
            x_max = 32

        print(self.device, self.is_half)

        return x_pad, x_query, x_center, x_max

# call inference
class ClassVoices:
    """Apply per-speaker voice conversion to a set of audio files.

    ``apply_conf`` stores one (model, transpose, index) triple per speaker
    slot; calling the instance groups audio files by speaker label and
    converts each group in place with the model configured for that slot.
    """

    # Suffixes of the "SPEAKER_xx" labels that have a configuration slot.
    _SPEAKER_SLOTS = ("00", "01", "02", "03", "04", "05", "99")

    def __init__(self):
        self.file_index = ""  # root

    def apply_conf(self, f0method,
                   model_voice_path00, transpose00, file_index2_00,
                   model_voice_path01, transpose01, file_index2_01,
                   model_voice_path02, transpose02, file_index2_02,
                   model_voice_path03, transpose03, file_index2_03,
                   model_voice_path04, transpose04, file_index2_04,
                   model_voice_path05, transpose05, file_index2_05,
                   model_voice_path99, transpose99, file_index2_99):
        """Store the f0 method and the per-speaker model settings.

        For every slot ``xx`` the values are saved as the attributes
        ``model_voice_pathxx``, ``transposexx`` and ``file_index2xx``.

        Returns:
            The string "CONFIGURATION APPLIED".
        """
        self.f0method = f0method  # pm

        slot_settings = (
            (model_voice_path00, transpose00, file_index2_00),
            (model_voice_path01, transpose01, file_index2_01),
            (model_voice_path02, transpose02, file_index2_02),
            (model_voice_path03, transpose03, file_index2_03),
            (model_voice_path04, transpose04, file_index2_04),
            (model_voice_path05, transpose05, file_index2_05),
            (model_voice_path99, transpose99, file_index2_99),
        )
        for slot, (model_path, transpose, index_path) in zip(
            self._SPEAKER_SLOTS, slot_settings
        ):
            setattr(self, f"model_voice_path{slot}", model_path)
            setattr(self, f"transpose{slot}", transpose)
            setattr(self, f"file_index2{slot}", index_path)
        return "CONFIGURATION APPLIED"

    def custom_voice(self,
        _values, # filter indices
        audio_files, # all audio files
        model_voice_path='',
        transpose=0,
        f0method='pm',
        file_index='',
        file_index2='',
        ):
        """Convert the selected audio files in place with one voice model.

        Args:
            _values: Indices into ``audio_files`` to convert. The special
                item "test" selects ``audio_files[0]`` and uses it as a full
                path; every other item is resolved as
                ``"audio2/" + audio_files[item]``.
            audio_files: File names (or, for "test", paths) of the audio.
            model_voice_path: Checkpoint name passed to ``get_vc``.
            transpose: Pitch shift passed to ``vc_single`` as ``f0_up_key``.
            f0method: Passed to ``vc_single`` as ``f0_method``.
            file_index: Passed to ``vc_single``.
            file_index2: Passed to ``vc_single``.

        Raises:
            RuntimeError: If ``vc_single`` reports a failed conversion.
        """
        # NOTE(review): an empty model_voice_path makes get_vc unload instead
        # of load; the conversion below then fails or reuses whatever model
        # is still in the globals - confirm the intended behaviour.
        get_vc(
            sid=model_voice_path,  # model path
            to_return_protect0=0.33,
            to_return_protect1=0.33
        )

        for _value_item in _values:
            if _value_item == "test":
                filename = audio_files[0]
            else:
                filename = "audio2/" + audio_files[_value_item]
                print(audio_files[_value_item], model_voice_path)

            info_, (sample_, audio_output_) = vc_single(
                sid=0,
                input_audio_path=filename,
                f0_up_key=transpose,  # transpose for m to f and reverse 0 12
                f0_file=None,
                f0_method=f0method,
                file_index=file_index,
                file_index2=file_index2,
                index_rate=float(0.66),
                filter_radius=int(3),
                resample_sr=int(0),
                rms_mix_rate=float(0.25),
                protect=float(0.33),
            )

            if audio_output_ is None:
                # vc_single signals failure with (traceback, (None, None));
                # surface that instead of an unrelated error from sf.write.
                raise RuntimeError(
                    "Voice conversion failed for %s:\n%s" % (filename, info_)
                )

            sf.write(
                file=filename,
                samplerate=sample_,
                data=audio_output_
            )

    def make_test(self,
        tts_text,
        tts_voice,
        model_path,
        index_path,
        transpose,
        f0_method,
        ):
        """Synthesize ``tts_text`` and convert it with a model, for preview.

        The speech is written to ``test/test.wav`` (edge-tts first, gTTS as
        fallback), copied to ``test/real_test.wav``, and the first file is
        then converted in place.

        Returns:
            ``("test/test.wav", "test/real_test.wav")``: converted and
            unconverted audio paths.
        """
        import shutil  # local import: used only by this method

        # Start from an empty "test" directory (portable, no shell).
        shutil.rmtree("test", ignore_errors=True)
        filename = "test/test.wav"

        if "SET_LIMIT" == os.getenv("DEMO"):
            if len(tts_text) > 60:
                tts_text = tts_text[:60]
                print("DEMO; limit to 60 characters")

        language = tts_voice[:2]
        os.makedirs("test", exist_ok=True)
        try:
            # The edge-tts voice name is tts_voice without its last
            # "-"-separated component.
            asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save(filename))
        except Exception:
            try:
                tts = gTTS(tts_text, lang=language)
                tts.save(filename)
                print(f'No audio was received. Please change the tts voice for {tts_voice}. USING gTTS.')
            except Exception:
                # Last resort: synthesize a placeholder so a file exists.
                tts = gTTS('a', lang=language)
                tts.save(filename)
                print('Error: Audio will be replaced.')

        shutil.copyfile("test/test.wav", "test/real_test.wav")

        self([], [])  # start modules

        self.custom_voice(
            ["test"],  # filter indices
            ["test/test.wav"],  # all audio files
            model_voice_path=model_path,
            transpose=transpose,
            f0method=f0_method,
            file_index='',
            file_index2=index_path,
        )
        return "test/test.wav", "test/real_test.wav"

    def __call__(self, speakers_list, audio_files):
        """Convert every audio file with the model configured for its speaker.

        Args:
            speakers_list: Speaker label per audio file (same order as
                ``audio_files``), e.g. "SPEAKER_00".
            audio_files: File names of the audio segments.

        Side effects:
            Sets the globals ``weight_root``, ``index_root``, ``config`` and
            resets ``hubert_model`` to ``None``. Labels without a
            configuration slot are skipped.
        """
        global weight_root, index_root, config, hubert_model

        # Group file positions by speaker label, keeping first-seen order.
        speakers_indices = {}
        for index, speak_ in enumerate(speakers_list):
            speakers_indices.setdefault(speak_, []).append(index)

        # find models and index
        weight_root = "weights"
        names = [name for name in os.listdir(weight_root) if name.endswith(".pth")]

        index_root = "logs"
        index_paths = [
            name for name in os.listdir(index_root) if name.endswith(".index")
        ]

        print(names, index_paths)
        # config machine
        hubert_model = None
        config = Config('cuda:0', is_half=True)  # config = Config('cpu', is_half=False) # cpu

        # filter by speaker
        slot_by_label = {f"SPEAKER_{slot}": slot for slot in self._SPEAKER_SLOTS}
        for _speak, _values in speakers_indices.items():
            slot = slot_by_label.get(_speak)
            if slot is None:
                continue
            self.custom_voice(
                _values,
                audio_files,
                model_voice_path=getattr(self, f"model_voice_path{slot}"),
                file_index2=getattr(self, f"file_index2{slot}"),
                transpose=getattr(self, f"transpose{slot}"),
                f0method=self.f0method,
                file_index=self.file_index,
            )