JacobLinCool committed on
Commit
bd8dcd1
1 Parent(s): 64f96e7

feat: use docker space

Browse files
Files changed (5) hide show
  1. Dockerfile +28 -0
  2. README.md +1 -3
  3. app.py +17 -227
  4. build.py +17 -0
  5. requirements.txt +1 -1
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11
2
+
3
+ # By using XTTS you agree to CPML license https://coqui.ai/cpml
4
+ ENV COQUI_TOS_AGREED=1
5
+
6
+ # Set up a new user named "user" with user ID 1000
7
+ RUN useradd -m -u 1000 user
8
+
9
+ # Switch to the "user" user
10
+ USER user
11
+
12
+ # Set home to the user's home directory
13
+ ENV HOME=/home/user \
14
+ PATH=/home/user/.local/bin:$PATH
15
+
16
+ # Set the working directory to the user's home directory
17
+ WORKDIR $HOME/app
18
+
19
+ # Install dependencies
20
+ COPY --chown=user:user requirements.txt .
21
+ RUN pip install -r requirements.txt
22
+ RUN python -m unidic download
23
+
24
+ # Install model weights
25
+ COPY --chown=user:user . .
26
+ RUN python build.py
27
+
28
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -3,10 +3,8 @@ title: XTTS
3
  emoji: 🐸
4
  colorFrom: green
5
  colorTo: red
6
- sdk: gradio
7
- sdk_version: 3.48.0
8
- app_file: app.py
9
  pinned: false
 
10
  models:
11
  - coqui/XTTS-v2
12
  ---
 
3
  emoji: 🐸
4
  colorFrom: green
5
  colorTo: red
 
 
 
6
  pinned: false
7
+ sdk: docker
8
  models:
9
  - coqui/XTTS-v2
10
  ---
app.py CHANGED
@@ -1,38 +1,26 @@
1
- import sys
2
- import io, os, stat
3
  import subprocess
4
- import random
5
- from zipfile import ZipFile
6
  import uuid
7
  import time
8
  import torch
9
  import torchaudio
10
 
11
-
12
- #download for mecab
13
- os.system('python -m unidic download')
14
-
15
- # By using XTTS you agree to CPML license https://coqui.ai/cpml
16
- os.environ["COQUI_TOS_AGREED"] = "1"
17
-
18
  # langid is used to detect language for longer text
19
  # Most users expect text to be their own language, there is checkbox to disable it
20
  import langid
21
- import base64
22
  import csv
23
  from io import StringIO
24
  import datetime
25
  import re
26
 
27
  import gradio as gr
28
- from scipy.io.wavfile import write
29
- from pydub import AudioSegment
30
 
31
- from TTS.api import TTS
32
  from TTS.tts.configs.xtts_config import XttsConfig
33
  from TTS.tts.models.xtts import Xtts
34
  from TTS.utils.generic_utils import get_user_data_dir
35
 
 
 
36
  HF_TOKEN = os.environ.get("HF_TOKEN")
37
 
38
  from huggingface_hub import HfApi
@@ -41,21 +29,10 @@ from huggingface_hub import HfApi
41
  api = HfApi(token=HF_TOKEN)
42
  repo_id = "coqui/xtts"
43
 
44
- # Use never ffmpeg binary for Ubuntu20 to use denoising for microphone input
45
- print("Export newer ffmpeg binary for denoise filter")
46
- ZipFile("ffmpeg.zip").extractall()
47
- print("Make ffmpeg binary executable")
48
- st = os.stat("ffmpeg")
49
- os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
50
-
51
- # This will trigger downloading model
52
- print("Downloading if not downloaded Coqui XTTS V2")
53
- from TTS.utils.manage import ModelManager
54
 
55
  model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
56
- ModelManager().download_model(model_name)
57
  model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
58
- print("XTTS downloaded")
59
 
60
  config = XttsConfig()
61
  config.load_json(os.path.join(model_path, "config.json"))
@@ -66,9 +43,15 @@ model.load_checkpoint(
66
  checkpoint_path=os.path.join(model_path, "model.pth"),
67
  vocab_path=os.path.join(model_path, "vocab.json"),
68
  eval=True,
69
- use_deepspeed=True,
70
  )
71
- model.cuda()
 
 
 
 
 
 
72
 
73
  # This is for debugging purposes only
74
  DEVICE_ASSERT_DETECTED = 0
@@ -81,8 +64,6 @@ def predict(
81
  prompt,
82
  language,
83
  audio_file_pth,
84
- mic_file_path,
85
- use_mic,
86
  voice_cleanup,
87
  no_lang_auto_detect,
88
  agree,
@@ -130,22 +111,7 @@ def predict(
130
  None,
131
  )
132
 
133
- if use_mic == True:
134
- if mic_file_path is not None:
135
- speaker_wav = mic_file_path
136
- else:
137
- gr.Warning(
138
- "Please record your voice with Microphone, or uncheck Use Microphone to use reference audios"
139
- )
140
- return (
141
- None,
142
- None,
143
- None,
144
- None,
145
- )
146
-
147
- else:
148
- speaker_wav = audio_file_pth
149
 
150
  # Filtering for microphone input, as it has BG noise, maybe silence in beginning and end
151
  # This is fast filtering not perfect
@@ -328,8 +294,6 @@ def predict(
328
  prompt,
329
  language,
330
  audio_file_pth,
331
- mic_file_path,
332
- use_mic,
333
  voice_cleanup,
334
  no_lang_auto_detect,
335
  agree,
@@ -450,160 +414,6 @@ article = """
450
  <p>We collect data only for error cases for improvement.</p>
451
  </div>
452
  """
453
- examples = [
454
- [
455
- "Once when I was six years old I saw a magnificent picture",
456
- "en",
457
- "examples/female.wav",
458
- None,
459
- False,
460
- False,
461
- False,
462
- True,
463
- ],
464
- [
465
- "Lorsque j'avais six ans j'ai vu, une fois, une magnifique image",
466
- "fr",
467
- "examples/male.wav",
468
- None,
469
- False,
470
- False,
471
- False,
472
- True,
473
- ],
474
- [
475
- "Als ich sechs war, sah ich einmal ein wunderbares Bild",
476
- "de",
477
- "examples/female.wav",
478
- None,
479
- False,
480
- False,
481
- False,
482
- True,
483
- ],
484
- [
485
- "Cuando tenía seis años, vi una vez una imagen magnífica",
486
- "es",
487
- "examples/male.wav",
488
- None,
489
- False,
490
- False,
491
- False,
492
- True,
493
- ],
494
- [
495
- "Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica",
496
- "pt",
497
- "examples/female.wav",
498
- None,
499
- False,
500
- False,
501
- False,
502
- True,
503
- ],
504
- [
505
- "Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek",
506
- "pl",
507
- "examples/male.wav",
508
- None,
509
- False,
510
- False,
511
- False,
512
- True,
513
- ],
514
- [
515
- "Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno",
516
- "it",
517
- "examples/female.wav",
518
- None,
519
- False,
520
- False,
521
- False,
522
- True,
523
- ],
524
- [
525
- "Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm",
526
- "tr",
527
- "examples/female.wav",
528
- None,
529
- False,
530
- False,
531
- False,
532
- True,
533
- ],
534
- [
535
- "Когда мне было шесть лет, я увидел однажды удивительную картинку",
536
- "ru",
537
- "examples/female.wav",
538
- None,
539
- False,
540
- False,
541
- False,
542
- True,
543
- ],
544
- [
545
- "Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat",
546
- "nl",
547
- "examples/male.wav",
548
- None,
549
- False,
550
- False,
551
- False,
552
- True,
553
- ],
554
- [
555
- "Když mi bylo šest let, viděl jsem jednou nádherný obrázek",
556
- "cs",
557
- "examples/female.wav",
558
- None,
559
- False,
560
- False,
561
- False,
562
- True,
563
- ],
564
- [
565
- "当我还只有六岁的时候, 看到了一副精彩的插画",
566
- "zh-cn",
567
- "examples/female.wav",
568
- None,
569
- False,
570
- False,
571
- False,
572
- True,
573
- ],
574
- [
575
- "かつて 六歳のとき、素晴らしい絵を見ました",
576
- "ja",
577
- "examples/female.wav",
578
- None,
579
- False,
580
- True,
581
- False,
582
- True,
583
- ],
584
- [
585
- "한번은 내가 여섯 살이었을 때 멋진 그림을 보았습니다.",
586
- "ko",
587
- "examples/female.wav",
588
- None,
589
- False,
590
- True,
591
- False,
592
- True,
593
- ],
594
- [
595
- "Egyszer hat éves koromban láttam egy csodálatos képet",
596
- "hu",
597
- "examples/male.wav",
598
- None,
599
- False,
600
- True,
601
- False,
602
- True,
603
- ],
604
- ]
605
-
606
-
607
 
608
  with gr.Blocks(analytics_enabled=False) as demo:
609
  with gr.Row():
@@ -651,7 +461,6 @@ with gr.Blocks(analytics_enabled=False) as demo:
651
  "ko",
652
  "hu"
653
  ],
654
- max_choices=1,
655
  value="en",
656
  )
657
  ref_gr = gr.Audio(
@@ -660,17 +469,6 @@ with gr.Blocks(analytics_enabled=False) as demo:
660
  type="filepath",
661
  value="examples/female.wav",
662
  )
663
- mic_gr = gr.Audio(
664
- source="microphone",
665
- type="filepath",
666
- info="Use your microphone to record audio",
667
- label="Use Microphone for Reference",
668
- )
669
- use_mic_gr = gr.Checkbox(
670
- label="Use Microphone",
671
- value=False,
672
- info="Notice: Microphone input may not work properly under traffic",
673
- )
674
  clean_ref_gr = gr.Checkbox(
675
  label="Cleanup Reference Voice",
676
  value=False,
@@ -696,15 +494,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
696
  out_text_gr = gr.Text(label="Metrics")
697
  ref_audio_gr = gr.Audio(label="Reference Audio Used")
698
 
699
- with gr.Row():
700
- gr.Examples(examples,
701
- label="Examples",
702
- inputs=[input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr],
703
- outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr],
704
- fn=predict,
705
- cache_examples=False,)
706
-
707
- tts_button.click(predict, [input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr], outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr])
708
-
709
- demo.queue()
710
- demo.launch(debug=True, show_api=True)
 
1
+ import os
 
2
  import subprocess
 
 
3
  import uuid
4
  import time
5
  import torch
6
  import torchaudio
7
 
 
 
 
 
 
 
 
8
  # langid is used to detect language for longer text
9
  # Most users expect text to be their own language, there is checkbox to disable it
10
  import langid
 
11
  import csv
12
  from io import StringIO
13
  import datetime
14
  import re
15
 
16
  import gradio as gr
 
 
17
 
 
18
  from TTS.tts.configs.xtts_config import XttsConfig
19
  from TTS.tts.models.xtts import Xtts
20
  from TTS.utils.generic_utils import get_user_data_dir
21
 
22
+ print("application starting")
23
+
24
  HF_TOKEN = os.environ.get("HF_TOKEN")
25
 
26
  from huggingface_hub import HfApi
 
29
  api = HfApi(token=HF_TOKEN)
30
  repo_id = "coqui/xtts"
31
 
32
+ print("loading model")
 
 
 
 
 
 
 
 
 
33
 
34
  model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
 
35
  model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
 
36
 
37
  config = XttsConfig()
38
  config.load_json(os.path.join(model_path, "config.json"))
 
43
  checkpoint_path=os.path.join(model_path, "model.pth"),
44
  vocab_path=os.path.join(model_path, "vocab.json"),
45
  eval=True,
46
+ use_deepspeed=False,
47
  )
48
+
49
+ if torch.cuda.is_available():
50
+ model.cuda()
51
+ else:
52
+ model.cpu()
53
+
54
+ print("Model loaded")
55
 
56
  # This is for debugging purposes only
57
  DEVICE_ASSERT_DETECTED = 0
 
64
  prompt,
65
  language,
66
  audio_file_pth,
 
 
67
  voice_cleanup,
68
  no_lang_auto_detect,
69
  agree,
 
111
  None,
112
  )
113
 
114
+ speaker_wav = audio_file_pth
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  # Filtering for microphone input, as it has BG noise, maybe silence in beginning and end
117
  # This is fast filtering not perfect
 
294
  prompt,
295
  language,
296
  audio_file_pth,
 
 
297
  voice_cleanup,
298
  no_lang_auto_detect,
299
  agree,
 
414
  <p>We collect data only for error cases for improvement.</p>
415
  </div>
416
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
417
 
418
  with gr.Blocks(analytics_enabled=False) as demo:
419
  with gr.Row():
 
461
  "ko",
462
  "hu"
463
  ],
 
464
  value="en",
465
  )
466
  ref_gr = gr.Audio(
 
469
  type="filepath",
470
  value="examples/female.wav",
471
  )
 
 
 
 
 
 
 
 
 
 
 
472
  clean_ref_gr = gr.Checkbox(
473
  label="Cleanup Reference Voice",
474
  value=False,
 
494
  out_text_gr = gr.Text(label="Metrics")
495
  ref_audio_gr = gr.Audio(label="Reference Audio Used")
496
 
497
+ tts_button.click(predict, [input_text_gr, language_gr, ref_gr, clean_ref_gr, auto_det_lang_gr, tos_gr], outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr])
498
+
499
+ print("Starting server")
500
+ demo.queue().launch(debug=True, show_api=True)
 
 
 
 
 
 
 
 
build.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, stat
2
+ from zipfile import ZipFile
3
+
4
+ # Use newer ffmpeg binary for Ubuntu20 to use denoising for microphone input
5
+ print("Export newer ffmpeg binary for denoise filter")
6
+ ZipFile("ffmpeg.zip").extractall()
7
+ print("Make ffmpeg binary executable")
8
+ st = os.stat("ffmpeg")
9
+ os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
10
+
11
+ # This will trigger downloading model
12
+ print("Downloading if not downloaded Coqui XTTS V2")
13
+ from TTS.utils.manage import ModelManager
14
+
15
+ model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
16
+ ModelManager().download_model(model_name)
17
+ print("XTTS downloaded")
requirements.txt CHANGED
@@ -8,5 +8,5 @@ mecab-python3==1.0.6
8
  unidic-lite==1.0.8
9
  unidic==1.1.0
10
  langid
11
- deepspeed
12
  pydub
 
 
8
  unidic-lite==1.0.8
9
  unidic==1.1.0
10
  langid
 
11
  pydub
12
+ gradio