File size: 5,109 Bytes
35f6708 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
import argparse
import logging
import sys
import time
import wave
from pathlib import Path
from typing import Any, Dict
from . import PiperVoice
from .download import ensure_voice_exists, find_voice, get_voices
_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
parser.add_argument("-c", "--config", help="Path to model config file")
parser.add_argument(
"-f",
"--output-file",
"--output_file",
help="Path to output WAV file (default: stdout)",
)
parser.add_argument(
"-d",
"--output-dir",
"--output_dir",
help="Path to output directory (default: cwd)",
)
parser.add_argument(
"--output-raw",
"--output_raw",
action="store_true",
help="Stream raw audio to stdout",
)
#
parser.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
parser.add_argument(
"--length-scale", "--length_scale", type=float, help="Phoneme length"
)
parser.add_argument(
"--noise-scale", "--noise_scale", type=float, help="Generator noise"
)
parser.add_argument(
"--noise-w", "--noise_w", type=float, help="Phoneme width noise"
)
#
parser.add_argument("--cuda", action="store_true", help="Use GPU")
#
parser.add_argument(
"--sentence-silence",
"--sentence_silence",
type=float,
default=0.0,
help="Seconds of silence after each sentence",
)
#
parser.add_argument(
"--data-dir",
"--data_dir",
action="append",
default=[str(Path.cwd())],
help="Data directory to check for downloaded models (default: current directory)",
)
parser.add_argument(
"--download-dir",
"--download_dir",
help="Directory to download voices into (default: first data dir)",
)
#
parser.add_argument(
"--update-voices",
action="store_true",
help="Download latest voices.json during startup",
)
#
parser.add_argument(
"--debug", action="store_true", help="Print DEBUG messages to console"
)
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
_LOGGER.debug(args)
if not args.download_dir:
# Download to first data directory by default
args.download_dir = args.data_dir[0]
# Download voice if file doesn't exist
model_path = Path(args.model)
if not model_path.exists():
# Load voice info
voices_info = get_voices(args.download_dir, update_voices=args.update_voices)
# Resolve aliases for backwards compatibility with old voice names
aliases_info: Dict[str, Any] = {}
for voice_info in voices_info.values():
for voice_alias in voice_info.get("aliases", []):
aliases_info[voice_alias] = {"_is_alias": True, **voice_info}
voices_info.update(aliases_info)
ensure_voice_exists(args.model, args.data_dir, args.download_dir, voices_info)
args.model, args.config = find_voice(args.model, args.data_dir)
# Load voice
voice = PiperVoice.load(args.model, config_path=args.config, use_cuda=args.cuda)
synthesize_args = {
"speaker_id": args.speaker,
"length_scale": args.length_scale,
"noise_scale": args.noise_scale,
"noise_w": args.noise_w,
"sentence_silence": args.sentence_silence,
}
if args.output_raw:
# Read line-by-line
for line in sys.stdin:
line = line.strip()
if not line:
continue
# Write raw audio to stdout as its produced
audio_stream = voice.synthesize_stream_raw(line, **synthesize_args)
for audio_bytes in audio_stream:
sys.stdout.buffer.write(audio_bytes)
sys.stdout.buffer.flush()
elif args.output_dir:
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
# Read line-by-line
for line in sys.stdin:
line = line.strip()
if not line:
continue
wav_path = output_dir / f"{time.monotonic_ns()}.wav"
with wave.open(str(wav_path), "wb") as wav_file:
voice.synthesize(line, wav_file, **synthesize_args)
_LOGGER.info("Wrote %s", wav_path)
else:
# Read entire input
text = sys.stdin.read()
if (not args.output_file) or (args.output_file == "-"):
# Write to stdout
with wave.open(sys.stdout.buffer, "wb") as wav_file:
voice.synthesize(text, wav_file, **synthesize_args)
else:
# Write to file
with wave.open(args.output_file, "wb") as wav_file:
voice.synthesize(text, wav_file, **synthesize_args)
if __name__ == "__main__":
main()
|