Serhiy Stetskovych committed on
Commit
4b1870b
β€’
0 Parent(s):

initial commit

Browse files
Files changed (4) hide show
  1. Dockerfile +21 -0
  2. README.md +10 -0
  3. app.py +60 -0
  4. requirements.txt +8 -0
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04

WORKDIR /app

# Update and install in ONE layer: a separate cached `apt-get update` layer can
# go stale and later installs then hit missing package versions. Clean the apt
# lists afterwards to keep the image small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends python3-pip git && \
    rm -rf /var/lib/apt/lists/*

# Run as a non-root user (required by Hugging Face Spaces).
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

COPY --chown=user ./requirements.txt requirements.txt
# --no-cache-dir everywhere: pip's wheel cache would otherwise be baked into
# each layer for nothing.
RUN pip3 install --no-cache-dir --upgrade pip wheel
# numpy/torch/packaging must be present BEFORE building flash-attn from source.
RUN pip install --no-cache-dir numpy==1.26.2 torch==2.3.0 packaging
RUN pip install --no-cache-dir -U flash-attn==2.5.8
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user app.py /app
CMD [ "python3", "app.py" ]
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Valle2 Demo
3
+ emoji: 🌍
4
+ colorFrom: yellow
5
+ colorTo: gray
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ ---
10
+
app.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import torch
import gradio as gr


# Run inference on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Vocoder: turns the model's acoustic output into a waveform.
vocoder = torch.hub.load(repo_or_dir='ex3ndr/supervoice-vocoder', model='bigvsan')
vocoder.to(device)
vocoder.eval()

# GPT phonemizer model (text -> phonemes).
gpt = torch.hub.load(repo_or_dir='ex3ndr/supervoice-gpt', model='phonemizer')
gpt.to(device)
gpt.eval()

# Main synthesis model, wired to the phonemizer and vocoder loaded above.
# NOTE(review): model='phonemizer' here looks like a copy-paste from the gpt
# load above — confirm the intended hub entrypoint for supervoice-voicebox.
model = torch.hub.load(repo_or_dir='ex3ndr/supervoice-voicebox', model='phonemizer', gpt=gpt, vocoder=vocoder)
model.to(device)
model.eval()


# Plain string literal: the original used an f-string with no placeholders
# (ruff F541); the resulting text is byte-identical.
description = '''
Voicebox demo
'''
32
def synthesise(text, voice):
    """Synthesize speech for *text* using the selected *voice*.

    Returns a ``(sample_rate, ndarray)`` tuple as expected by
    ``gr.Audio(type="numpy")``; the sample rate is fixed at 24 kHz.
    """
    # model.eval() alone does not disable autograd; inference_mode avoids
    # building a graph and holding activations for every request.
    with torch.inference_mode():
        output = model.synthesize(text, voice=voice, steps=8, alpha=0.1)
    # .numpy() raises on CUDA tensors — move the waveform to host memory first
    # (device is cuda whenever a GPU is available, see module setup).
    return (24000, output['wav'].cpu().numpy())
36
+
37
if __name__ == "__main__":
    # Build and serve the Gradio UI.
    i = gr.Interface(
        fn=synthesise,
        description=description,
        inputs=[
            gr.Text(label='Text:', lines=5, max_lines=10),
            gr.Dropdown(label="voice", choices=("voice_1", "voice_2"), value="voice_1"),
        ],
        outputs=[
            gr.Audio(
                label="Audio:",
                autoplay=False,
                streaming=False,
                type="numpy",
            ),
        ],
        allow_flagging='never',
        # Caching with an empty examples list is a no-op and can error on
        # recent gradio versions; keep it off until real examples exist.
        cache_examples=False,
        title='Something',
        examples=[],
    )
    # Queue incoming requests; up to 4 syntheses run concurrently.
    i.queue(max_size=20, default_concurrency_limit=4)
    # Bind to all interfaces so the port published by Docker is reachable.
    i.launch(share=False, server_name="0.0.0.0")
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ torch==2.3.0
2
+ gradio
3
+ torchaudio
4
+ vocos
5
+ encodec
6
+ sentencepiece
7
+ xformers
8
+ flash-attn