File size: 4,446 Bytes
a23d717
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from models.synthesizer.inference import Synthesizer
from models.encoder import inference as encoder
from models.vocoder.hifigan import inference as gan_vocoder
from pathlib import Path
import numpy as np
import soundfile as sf
import torch
import sys
import os
import re
import cn2an

vocoder = gan_vocoder

def gen_one_wav(synthesizer, in_fpath, embed, texts, file_name, seq):
    embeds = [embed] * len(texts)
    # If you know what the attention layer alignments are, you can retrieve them here by
    # passing return_alignments=True
    specs = synthesizer.synthesize_spectrograms(texts, embeds, style_idx=-1, min_stop_token=4, steps=400)
    #spec = specs[0]
    breaks = [spec.shape[1] for spec in specs]
    spec = np.concatenate(specs, axis=1)

    # If seed is specified, reset torch seed and reload vocoder
    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
    # spectrogram, the more time-efficient the vocoder.
    generated_wav, output_sample_rate = vocoder.infer_waveform(spec)
    
    # Add breaks
    b_ends = np.cumsum(np.array(breaks) * synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [generated_wav[start:end] for start, end, in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * synthesizer.sample_rate))] * len(breaks)
    generated_wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
    
    ## Post-generation
    # There's a bug with sounddevice that makes the audio cut one second earlier, so we
    # pad it.

    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
    generated_wav = encoder.preprocess_wav(generated_wav)
    generated_wav = generated_wav / np.abs(generated_wav).max() * 0.97
        
    # Save it on the disk
    model=os.path.basename(in_fpath)
    filename = "%s_%d_%s.wav" %(file_name, seq, model)
    sf.write(filename, generated_wav, synthesizer.sample_rate)

    print("\nSaved output as %s\n\n" % filename)
    
    
def generate_wav(enc_model_fpath, syn_model_fpath, voc_model_fpath, in_fpath, input_txt, file_name): 
    if torch.cuda.is_available():
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        ## Print some environment information (for debugging purposes)
        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
            "%.1fGb total memory.\n" % 
            (torch.cuda.device_count(),
            device_id,
            gpu_properties.name,
            gpu_properties.major,
            gpu_properties.minor,
            gpu_properties.total_memory / 1e9))
    else:
        print("Using CPU for inference.\n")

    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(enc_model_fpath)
    synthesizer = Synthesizer(syn_model_fpath)
    vocoder.load_model(voc_model_fpath)

    encoder_wav = synthesizer.load_preprocess_wav(in_fpath)
    embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    texts = input_txt.split("\n")
    seq=0
    each_num=1500
    
    punctuation = '๏ผ๏ผŒใ€‚ใ€,' # punctuate and split/clean text
    processed_texts = []
    cur_num = 0
    for text in texts:
      for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
        if processed_text:
            processed_texts.append(processed_text.strip())
            cur_num += len(processed_text.strip())
      if cur_num > each_num:
        seq = seq +1
        gen_one_wav(synthesizer, in_fpath, embed, processed_texts, file_name, seq)
        processed_texts = []
        cur_num = 0
    
    if len(processed_texts)>0:
      seq = seq +1
      gen_one_wav(synthesizer, in_fpath, embed, processed_texts, file_name, seq)

if (len(sys.argv)>=3):
    my_txt = ""
    print("reading from :", sys.argv[1])
    with open(sys.argv[1], "r") as f:
        for line in f.readlines():
            #line = line.strip('\n')
            my_txt += line
    txt_file_name = sys.argv[1]
    wav_file_name = sys.argv[2]

    output = cn2an.transform(my_txt, "an2cn")
    print(output)
    generate_wav(
    Path("encoder/saved_models/pretrained.pt"),
    Path("synthesizer/saved_models/mandarin.pt"),
    Path("vocoder/saved_models/pretrained/g_hifigan.pt"), wav_file_name, output, txt_file_name
    )

else:
    print("please input the file name")
    exit(1)