zzk1st commited on
Commit
88c0b9b
·
1 Parent(s): bd6788c

Major pull from github

Browse files

Added Docker header in README.md

Removed png assets

APIs.py CHANGED
@@ -9,25 +9,15 @@ from retrying import retry
9
 
10
 
11
  os.environ['OPENBLAS_NUM_THREADS'] = '1'
12
-
13
  SAMPLE_RATE = 32000
14
 
15
 
16
  with open('config.yaml', 'r') as file:
17
  config = yaml.safe_load(file)
18
- tts_port = config['Text-to-Speech']['service-port']
19
- ttm_port = config['Text-to-Music']['service-port']
20
- tta_port = config['Text-to-Audio']['service-port']
21
- sr_port = config['Speech-Restoration']['service-port']
22
- vp_port = config['Voice-Parser']['service-port']
23
  enable_sr = config['Speech-Restoration']['Enable']
24
 
25
 
26
- def IDLE(length=1.0, out_wav='out.wav', sr=SAMPLE_RATE):
27
- idle = np.zeros(int(length * sr))
28
- WRITE_AUDIO(idle, name=out_wav, sr=SAMPLE_RATE)
29
-
30
-
31
  def LOUDNESS_NORM(audio, sr=32000, volumn=-25):
32
  # peak normalize audio to -1 dB
33
  peak_normalized_audio = pyln.normalize.peak(audio, -10.0)
@@ -57,7 +47,7 @@ def WRITE_AUDIO(wav, name=None, sr=SAMPLE_RATE):
57
  if max_value > 1:
58
  wav *= 0.9 / max_value
59
 
60
- # print(f'WRITE_AUDIO to {name}')
61
  write(name, sr, np.round(wav*32767).astype(np.int16))
62
 
63
 
@@ -81,10 +71,6 @@ def MIX(wavs=[['1.wav', 0.], ['2.wav', 10.]], out_wav='out.wav', sr=SAMPLE_RATE)
81
  wavs:[[wav_name, absolute_offset], ...]
82
  """
83
 
84
- # last_name, last_offset = wavs[-1]
85
- # last_len = len(READ_AUDIO_NUMPY(last_name))
86
- # max_length = int(last_offset * sr + last_len)
87
-
88
  max_length = max([int(wav[1]*sr + len(READ_AUDIO_NUMPY(wav[0]))) for wav in wavs])
89
  template_wav = np.zeros(max_length)
90
 
@@ -125,7 +111,7 @@ def COMPUTE_LEN(wav):
125
 
126
  @retry(stop_max_attempt_number=5, wait_fixed=2000)
127
  def TTM(text, length=10, volume=-28, out_wav='out.wav'):
128
- url = f'http://127.0.0.1:{ttm_port}/generate_music'
129
  data = {
130
  'text': f'{text}',
131
  'length': f'{length}',
@@ -143,7 +129,7 @@ def TTM(text, length=10, volume=-28, out_wav='out.wav'):
143
 
144
  @retry(stop_max_attempt_number=5, wait_fixed=2000)
145
  def TTA(text, length=5, volume=-35, out_wav='out.wav'):
146
- url = f'http://127.0.0.1:{tta_port}/generate_audio'
147
  data = {
148
  'text': f'{text}',
149
  'length': f'{length}',
@@ -162,7 +148,7 @@ def TTA(text, length=5, volume=-35, out_wav='out.wav'):
162
 
163
  @retry(stop_max_attempt_number=5, wait_fixed=2000)
164
  def TTS(text, speaker='news_anchor', volume=-20, out_wav='out.wav', enhanced=enable_sr, speaker_id='', speaker_npz=''):
165
- url = f'http://127.0.0.1:{tts_port}/generate_speech'
166
  data = {
167
  'text': f'{text}',
168
  'speaker_id': f'{speaker_id}',
@@ -185,7 +171,7 @@ def TTS(text, speaker='news_anchor', volume=-20, out_wav='out.wav', enhanced=ena
185
 
186
  @retry(stop_max_attempt_number=5, wait_fixed=2000)
187
  def SR(processfile):
188
- url = f'http://127.0.0.1:{sr_port}/fix_audio'
189
  data = {'processfile': f'{processfile}'}
190
 
191
  response = requests.post(url, json=data)
@@ -199,7 +185,7 @@ def SR(processfile):
199
 
200
  @retry(stop_max_attempt_number=5, wait_fixed=2000)
201
  def VP(wav_path, out_dir):
202
- url = f'http://127.0.0.1:{vp_port}/parse_voice'
203
  data = {
204
  'wav_path': f'{wav_path}',
205
  'out_dir':f'{out_dir}'
 
9
 
10
 
11
  os.environ['OPENBLAS_NUM_THREADS'] = '1'
 
12
  SAMPLE_RATE = 32000
13
 
14
 
15
  with open('config.yaml', 'r') as file:
16
  config = yaml.safe_load(file)
17
+ service_port = config['Service-Port']
 
 
 
 
18
  enable_sr = config['Speech-Restoration']['Enable']
19
 
20
 
 
 
 
 
 
21
  def LOUDNESS_NORM(audio, sr=32000, volumn=-25):
22
  # peak normalize audio to -1 dB
23
  peak_normalized_audio = pyln.normalize.peak(audio, -10.0)
 
47
  if max_value > 1:
48
  wav *= 0.9 / max_value
49
 
50
+ # write audio
51
  write(name, sr, np.round(wav*32767).astype(np.int16))
52
 
53
 
 
71
  wavs:[[wav_name, absolute_offset], ...]
72
  """
73
 
 
 
 
 
74
  max_length = max([int(wav[1]*sr + len(READ_AUDIO_NUMPY(wav[0]))) for wav in wavs])
75
  template_wav = np.zeros(max_length)
76
 
 
111
 
112
  @retry(stop_max_attempt_number=5, wait_fixed=2000)
113
  def TTM(text, length=10, volume=-28, out_wav='out.wav'):
114
+ url = f'http://127.0.0.1:{service_port}/generate_music'
115
  data = {
116
  'text': f'{text}',
117
  'length': f'{length}',
 
129
 
130
  @retry(stop_max_attempt_number=5, wait_fixed=2000)
131
  def TTA(text, length=5, volume=-35, out_wav='out.wav'):
132
+ url = f'http://127.0.0.1:{service_port}/generate_audio'
133
  data = {
134
  'text': f'{text}',
135
  'length': f'{length}',
 
148
 
149
  @retry(stop_max_attempt_number=5, wait_fixed=2000)
150
  def TTS(text, speaker='news_anchor', volume=-20, out_wav='out.wav', enhanced=enable_sr, speaker_id='', speaker_npz=''):
151
+ url = f'http://127.0.0.1:{service_port}/generate_speech'
152
  data = {
153
  'text': f'{text}',
154
  'speaker_id': f'{speaker_id}',
 
171
 
172
  @retry(stop_max_attempt_number=5, wait_fixed=2000)
173
  def SR(processfile):
174
+ url = f'http://127.0.0.1:{service_port}/fix_audio'
175
  data = {'processfile': f'{processfile}'}
176
 
177
  response = requests.post(url, json=data)
 
185
 
186
  @retry(stop_max_attempt_number=5, wait_fixed=2000)
187
  def VP(wav_path, out_dir):
188
+ url = f'http://127.0.0.1:{service_port}/parse_voice'
189
  data = {
190
  'wav_path': f'{wav_path}',
191
  'out_dir':f'{out_dir}'
AudioCraft/app.py DELETED
@@ -1,110 +0,0 @@
1
- import sys
2
- sys.path.append('../AudioJourney')
3
- import os
4
- import yaml
5
- import logging
6
- import torchaudio
7
- from APIs import WRITE_AUDIO, LOUDNESS_NORM
8
- from utils import fade
9
- from flask import Flask, request, jsonify
10
-
11
- with open('config.yaml', 'r') as file:
12
- config = yaml.safe_load(file)
13
-
14
- # Configure the logging format and level
15
- logging.basicConfig(
16
- level=logging.INFO,
17
- format='%(asctime)s - %(levelname)s - %(message)s'
18
- )
19
-
20
- # Create a FileHandler for the log file
21
- os.makedirs('services_logs', exist_ok=True)
22
- log_filename = 'services_logs/Text-to-Audio-Music.log'
23
- file_handler = logging.FileHandler(log_filename, mode='w')
24
- file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
25
-
26
- # Add the FileHandler to the root logger
27
- logging.getLogger('').addHandler(file_handler)
28
-
29
-
30
- # Initialize the model here
31
- from audiocraft.models import AudioGen, MusicGen
32
- tta_model = AudioGen.get_pretrained('facebook/audiogen-medium')
33
- logging.info('AudioGen is loaded ...')
34
-
35
- model_size = config['Text-to-Music']['model_size']
36
- ttm_model = MusicGen.get_pretrained(f'facebook/musicgen-{model_size}')
37
- logging.info(f'MusicGen ({model_size}) is loaded ...')
38
-
39
- app = Flask(__name__)
40
-
41
- @app.route('/generate_audio', methods=['POST'])
42
- def generate_audio():
43
- # Receive the text from the POST request
44
- data = request.json
45
- text = data['text']
46
- length = float(data.get('length', 5.0))
47
- volume = float(data.get('volume', -35))
48
- output_wav = data.get('output_wav', 'out.wav')
49
-
50
- logging.info(f'TTA (AudioGen): Prompt: {text}, length: {length} seconds, volume: {volume} dB')
51
-
52
- try:
53
- tta_model.set_generation_params(duration=length)
54
- wav = tta_model.generate([text])
55
- wav = torchaudio.functional.resample(wav, orig_freq=16000, new_freq=32000)
56
-
57
- wav = wav.squeeze().cpu().detach().numpy()
58
- wav = fade(LOUDNESS_NORM(wav, volumn=volume))
59
- WRITE_AUDIO(wav, name=output_wav)
60
-
61
- # Return success message and the filename of the generated audio
62
- return jsonify({'message': f'Text-to-Audio generated successfully | {text}', 'file': output_wav})
63
-
64
- except Exception as e:
65
- return jsonify({'API error': str(e)}), 500
66
-
67
-
68
- @app.route('/generate_music', methods=['POST'])
69
- def generate_music():
70
- # Receive the text from the POST request
71
- data = request.json
72
- text = data['text']
73
- length = float(data.get('length', 5.0))
74
- volume = float(data.get('volume', -35))
75
- output_wav = data.get('output_wav', 'out.wav')
76
-
77
- logging.info(f'TTM (MusicGen): Prompt: {text}, length: {length} seconds, volume: {volume} dB')
78
-
79
-
80
- try:
81
- ttm_model.set_generation_params(duration=length)
82
- wav = ttm_model.generate([text])
83
- wav = wav[0][0].cpu().detach().numpy()
84
- wav = fade(LOUDNESS_NORM(wav, volumn=volume))
85
- WRITE_AUDIO(wav, name=output_wav)
86
-
87
- # Return success message and the filename of the generated audio
88
- return jsonify({'message': f'Text-to-Music generated successfully | {text}', 'file': output_wav})
89
-
90
- except Exception as e:
91
- # Return error message if something goes wrong
92
- return jsonify({'API error': str(e)}), 500
93
-
94
-
95
- if __name__ == '__main__':
96
- import yaml
97
- with open('config.yaml', 'r') as file:
98
- config = yaml.safe_load(file)
99
-
100
- tta_service_port = config['Text-to-Audio']['service-port']
101
- ttm_service_port = config['Text-to-Audio']['service-port']
102
-
103
- if tta_service_port != ttm_service_port:
104
- msg = 'Ports of TTA and TTM should be same if you are using Audiocraft ...'
105
- logging.info(msg)
106
- raise ValueError(msg)
107
-
108
- app.run(debug=False, port=tta_service_port)
109
-
110
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Bark/__init__.py DELETED
File without changes
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11
2
+
3
+ # Install miniconda
4
+ RUN apt-get install -y wget && rm -rf /var/lib/apt/lists/*
5
+
6
+ RUN wget \
7
+ https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
8
+ && bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/miniconda3 \
9
+ && rm -f Miniconda3-latest-Linux-x86_64.sh
10
+
11
+ # Add conda binary to PATH variable
12
+ ENV HOME=/home/user \
13
+ PATH=/opt/miniconda3/bin:/home/user/.local/bin:$PATH \
14
+ CONDA_PREFIX=/opt/miniconda3/envs
15
+
16
+ # Setup conda envs
17
+ WORKDIR $HOME/app
18
+ COPY . .
19
+
20
+ # Conda envs setup
21
+ RUN bash ./scripts/EnvsSetup.sh
22
+
23
+ # pre-download all models
24
+ RUN conda run --live-stream -n WavJourney python scripts/download_models.py
25
+ RUN mkdir $HOME/app/services_logs
26
+
27
+ # entrypoint
28
+ ENTRYPOINT bash /home/user/app/scripts/start_service_and_ui.sh
Envs/AudioCraft.yml CHANGED
@@ -1,4 +1,4 @@
1
- name: AudioCraft
2
  channels:
3
  - nvidia/label/cuda-11.8.0
4
  - conda-forge
 
1
+ name: WavJourney
2
  channels:
3
  - nvidia/label/cuda-11.8.0
4
  - conda-forge
Envs/Bark.yml CHANGED
@@ -1,4 +1,4 @@
1
- name: Bark
2
  channels:
3
  - conda-forge
4
  - defaults
@@ -177,4 +177,3 @@ dependencies:
177
  - xxhash==3.3.0
178
  - yarl==1.9.2
179
  - zipp==3.16.1
180
- prefix: /home/zzk/Workspace/miniconda3/envs/Bark
 
1
+ name: WavJourney
2
  channels:
3
  - conda-forge
4
  - defaults
 
177
  - xxhash==3.3.0
178
  - yarl==1.9.2
179
  - zipp==3.16.1
 
EnvsSetup/AudioCraft.sh DELETED
@@ -1,16 +0,0 @@
1
- conda env create -f Envs/AudioCraft.yml
2
- conda run --live-stream -n AudioCraft pip install -U git+https://git@github.com/facebookresearch/audiocraft@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft
3
- # Could not load library libcudnn_cnn_infer.so.8.
4
- # Error: libnvrtc.so: cannot open shared object file: No such file or directory
5
- CONDAENV=AudioCraft
6
- source activate ${CONDAENV}
7
- conda install -c "nvidia/label/cuda-11.8.0" cuda-toolkit
8
- python3 -m pip install nvidia-cudnn-cu11==8.5.0.96
9
- source deactivate
10
- mkdir -p $CONDA_PREFIX/envs/${CONDAENV}/etc/conda/activate.d
11
- echo 'CUDNN_PATH=$(dirname $(python -c "import nvidia.cudnn;print(nvidia.cudnn.__file__)"))' >> $CONDA_PREFIX/envs/${CONDAENV}/etc/conda/activate.d/env_vars.sh
12
- echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/:$CUDNN_PATH/lib' >> $CONDA_PREFIX/envs/${CONDAENV}/etc/conda/activate.d/env_vars.sh
13
- source $CONDA_PREFIX/envs/${CONDAENV}/etc/conda/activate.d/env_vars.sh
14
-
15
- # If you're using WSL2, you can add the following into ~/.bashrc
16
- # export LD_LIBRARY_PATH=/usr/lib/wsl/lib:$LD_LIBRARY_PATH
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
EnvsSetup/Bark.sh DELETED
@@ -1 +0,0 @@
1
- conda env create -f Envs/Bark.yml
 
 
EnvsSetup/VoiceFixer.sh DELETED
@@ -1 +0,0 @@
1
- conda env create -f Envs/VoiceFixer.yml
 
 
EnvsSetup/WavJourney.sh DELETED
@@ -1 +0,0 @@
1
- conda env create -f Envs/WavJourney.yml
 
 
LICENSE ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International Public License
2
+
3
+ By exercising the Licensed Rights (defined below), You accept and agree to be
4
+ bound by the terms and conditions of this Creative Commons
5
+ Attribution-NonCommercial-NoDerivatives 4.0 International Public License
6
+ ("Public License"). To the extent this Public License may be interpreted as a
7
+ contract, You are granted the Licensed Rights in consideration of Your
8
+ acceptance of these terms and conditions, and the Licensor grants You such
9
+ rights in consideration of benefits the Licensor receives from making the
10
+ Licensed Material available under these terms and conditions.
11
+
12
+ Section 1 – Definitions.
13
+
14
+ a. Adapted Material means material subject to Copyright and Similar Rights
15
+ that is derived from or based upon the Licensed Material and in which
16
+ the Licensed Material is translated, altered, arranged, transformed, or
17
+ otherwise modified in a manner requiring permission under the Copyright
18
+ and Similar Rights held by the Licensor. For purposes of this Public
19
+ License, where the Licensed Material is a musical work, performance, or
20
+ sound recording, Adapted Material is always produced where the Licensed
21
+ Material is synched in timed relation with a moving image.
22
+ b. Copyright and Similar Rights means copyright and/or similar rights
23
+ closely related to copyright including, without limitation,
24
+ performance, broadcast, sound recording, and Sui Generis Database
25
+ Rights, without regard to how the rights are labeled or categorized.
26
+ For purposes of this Public License, the rights specified in Section
27
+ 2(b)(1)-(2) are not Copyright and Similar Rights.
28
+ c. Effective Technological Measures means those measures that, in the
29
+ absence of proper authority, may not be circumvented under laws
30
+ fulfilling obligations under Article 11 of the WIPO Copyright Treaty
31
+ adopted on December 20, 1996, and/or similar international agreements.
32
+ d. Exceptions and Limitations means fair use, fair dealing, and/or any
33
+ other exception or limitation to Copyright and Similar Rights that
34
+ applies to Your use of the Licensed Material.
35
+ e. Licensed Material means the artistic or literary work, database, or
36
+ other material to which the Licensor applied this Public License.
37
+ f. Licensed Rights means the rights granted to You subject to the terms
38
+ and conditions of this Public License, which are limited to all
39
+ Copyright and Similar Rights that apply to Your use of the Licensed
40
+ Material and that the Licensor has authority to license.
41
+ g. Licensor means the individual(s) or entity(ies) granting rights under
42
+ this Public License.
43
+ h. NonCommercial means not primarily intended for or directed towards
44
+ commercial advantage or monetary compensation. For purposes of this
45
+ Public License, the exchange of the Licensed Material for other
46
+ material subject to Copyright and Similar Rights by digital
47
+ file-sharing or similar means is NonCommercial provided there is no
48
+ payment of monetary compensation in connection with the exchange.
49
+ i. Share means to provide material to the public by any means or process
50
+ that requires permission under the Licensed Rights, such as
51
+ reproduction, public display, public performance, distribution,
52
+ dissemination, communication, or importation, and to make material
53
+ available to the public including in ways that members of the public
54
+ may access the material from a place and at a time individually chosen
55
+ by them.
56
+ j. Sui Generis Database Rights means rights other than copyright resulting
57
+ from Directive 96/9/EC of the European Parliament and of the Council of
58
+ 11 March 1996 on the legal protection of databases, as amended and/or
59
+ succeeded, as well as other essentially equivalent rights anywhere in
60
+ the world.
61
+ k. You means the individual or entity exercising the Licensed Rights under
62
+ this Public License. Your has a corresponding meaning.
63
+
64
+ Section 2 – Scope.
65
+
66
+ a. License grant.
67
+ 1. Subject to the terms and conditions of this Public License, the
68
+ Licensor hereby grants You a worldwide, royalty-free,
69
+ non-sublicensable, non-exclusive, irrevocable license to exercise
70
+ the Licensed Rights in the Licensed Material to:
71
+ A. reproduce and Share the Licensed Material, in whole or in part,
72
+ for NonCommercial purposes only; and
73
+ B. produce and reproduce, but not Share, Adapted Material for
74
+ NonCommercial purposes only.
75
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
76
+ Exceptions and Limitations apply to Your use, this Public License
77
+ does not apply, and You do not need to comply with its terms and
78
+ conditions.
79
+ 3. Term. The term of this Public License is specified in Section 6(a).
80
+ 4. Media and formats; technical modifications allowed. The Licensor
81
+ authorizes You to exercise the Licensed Rights in all media and
82
+ formats whether now known or hereafter created, and to make
83
+ technical modifications necessary to do so. The Licensor waives
84
+ and/or agrees not to assert any right or authority to forbid You
85
+ from making technical modifications necessary to exercise the
86
+ Licensed Rights, including technical modifications necessary to
87
+ circumvent Effective Technological Measures. For purposes of this
88
+ Public License, simply making modifications authorized by this
89
+ Section 2(a)(4) never produces Adapted Material.
90
+ 5. Downstream recipients.
91
+ A. Offer from the Licensor – Licensed Material. Every recipient of
92
+ the Licensed Material automatically receives an offer from the
93
+ Licensor to exercise the Licensed Rights under the terms and
94
+ conditions of this Public License.
95
+ B. No downstream restrictions. You may not offer or impose any
96
+ additional or different terms or conditions on, or apply any
97
+ Effective Technological Measures to, the Licensed Material if
98
+ doing so restricts exercise of the Licensed Rights by any
99
+ recipient of the Licensed Material.
100
+ 6. No endorsement. Nothing in this Public License constitutes or may
101
+ be construed as permission to assert or imply that You are, or that
102
+ Your use of the Licensed Material is, connected with, or sponsored,
103
+ endorsed, or granted official status by, the Licensor or others
104
+ designated to receive attribution as provided in Section
105
+ 3(a)(1)(A)(i).
106
+
107
+ b. Other rights.
108
+ 1. Moral rights, such as the right of integrity, are not licensed
109
+ under this Public License, nor are publicity, privacy, and/or other
110
+ similar personality rights; however, to the extent possible, the
111
+ Licensor waives and/or agrees not to assert any such rights held by
112
+ the Licensor to the limited extent necessary to allow You to
113
+ exercise the Licensed Rights, but not otherwise.
114
+ 2. Patent and trademark rights are not licensed under this Public
115
+ License.
116
+ 3. To the extent possible, the Licensor waives any right to collect
117
+ royalties from You for the exercise of the Licensed Rights, whether
118
+ directly or through a collecting society under any voluntary or
119
+ waivable statutory or compulsory licensing scheme. In all other
120
+ cases the Licensor expressly reserves any right to collect such
121
+ royalties, including when the Licensed Material is used other than
122
+ for NonCommercial purposes.
123
+
124
+ Section 3 – License Conditions.
125
+
126
+ Your exercise of the Licensed Rights is expressly made subject to the following conditions.
127
+
128
+ a. Attribution.
129
+
130
+ 1. If You Share the Licensed Material, You must:
131
+ A. retain the following if it is supplied by the Licensor with the
132
+ Licensed Material:
133
+ i. identification of the creator(s) of the Licensed Material
134
+ and any others designated to receive attribution, in any
135
+ reasonable manner requested by the Licensor (including by
136
+ pseudonym if designated);
137
+ ii. a copyright notice;
138
+ iii. a notice that refers to this Public License;
139
+ iv. a notice that refers to the disclaimer of warranties;
140
+ v. a URI or hyperlink to the Licensed Material to the extent
141
+ reasonably practicable;
142
+ B. indicate if You modified the Licensed Material and retain an
143
+ indication of any previous modifications; and
144
+ C. indicate the Licensed Material is licensed under this Public
145
+ License, and include the text of, or the URI or hyperlink to,
146
+ this Public License.
147
+
148
+ For the avoidance of doubt, You do not have permission under this
149
+ Public License to Share Adapted Material.
150
+
151
+ 2. You may satisfy the conditions in Section 3(a)(1) in any reasonable
152
+ manner based on the medium, means, and context in which You Share
153
+ the Licensed Material. For example, it may be reasonable to satisfy
154
+ the conditions by providing a URI or hyperlink to a resource that
155
+ includes the required information.
156
+ 3. If requested by the Licensor, You must remove any of the
157
+ information required by Section 3(a)(1)(A) to the extent reasonably
158
+ practicable.
159
+
160
+ Section 4 – Sui Generis Database Rights.
161
+
162
+ Where the Licensed Rights include Sui Generis Database Rights that apply to
163
+ Your use of the Licensed Material:
164
+
165
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right to
166
+ extract, reuse, reproduce, and Share all or a substantial portion of
167
+ the contents of the database for NonCommercial purposes only and
168
+ provided You do not Share Adapted Material;
169
+ b. if You include all or a substantial portion of the database contents in
170
+ a database in which You have Sui Generis Database Rights, then the
171
+ database in which You have Sui Generis Database Rights (but not its
172
+ individual contents) is Adapted Material; and
173
+ c. You must comply with the conditions in Section 3(a) if You Share all or
174
+ a substantial portion of the contents of the database.
175
+
176
+ For the avoidance of doubt, this Section 4 supplements and does not replace
177
+ Your obligations under this Public License where the Licensed Rights include
178
+ other Copyright and Similar Rights.
179
+
180
+ Section 5 – Disclaimer of Warranties and Limitation of Liability.
181
+
182
+ a. Unless otherwise separately undertaken by the Licensor, to the extent
183
+ possible, the Licensor offers the Licensed Material as-is and
184
+ as-available, and makes no representations or warranties of any kind
185
+ concerning the Licensed Material, whether express, implied, statutory,
186
+ or other. This includes, without limitation, warranties of title,
187
+ merchantability, fitness for a particular purpose, non-infringement,
188
+ absence of latent or other defects, accuracy, or the presence or
189
+ absence of errors, whether or not known or discoverable. Where
190
+ disclaimers of warranties are not allowed in full or in part, this
191
+ disclaimer may not apply to You.
192
+ b. To the extent possible, in no event will the Licensor be liable to You
193
+ on any legal theory (including, without limitation, negligence) or
194
+ otherwise for any direct, special, indirect, incidental, consequential,
195
+ punitive, exemplary, or other losses, costs, expenses, or damages
196
+ arising out of this Public License or use of the Licensed Material,
197
+ even if the Licensor has been advised of the possibility of such
198
+ losses, costs, expenses, or damages. Where a limitation of liability is
199
+ not allowed in full or in part, this limitation may not apply to You.
200
+ c. The disclaimer of warranties and limitation of liability provided above
201
+ shall be interpreted in a manner that, to the extent possible, most
202
+ closely approximates an absolute disclaimer and waiver of all
203
+ liability.
204
+
205
+ Section 6 – Term and Termination.
206
+
207
+ a. This Public License applies for the term of the Copyright and Similar
208
+ Rights licensed here. However, if You fail to comply with this Public
209
+ License, then Your rights under this Public License terminate
210
+ automatically.
211
+ b. Where Your right to use the Licensed Material has terminated under
212
+ Section 6(a), it reinstates:
213
+ 1. automatically as of the date the violation is cured, provided it is
214
+ cured within 30 days of Your discovery of the violation; or
215
+ 2. upon express reinstatement by the Licensor.
216
+
217
+ For the avoidance of doubt, this Section 6(b) does not affect any right
218
+ the Licensor may have to seek remedies for Your violations of this
219
+ Public License.
220
+
221
+ c. For the avoidance of doubt, the Licensor may also offer the Licensed
222
+ Material under separate terms or conditions or stop distributing the
223
+ Licensed Material at any time; however, doing so will not terminate
224
+ this Public License.
225
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License.
226
+
227
+ Section 7 – Other Terms and Conditions.
228
+
229
+ a. The Licensor shall not be bound by any additional or different terms or
230
+ conditions communicated by You unless expressly agreed.
231
+ b. Any arrangements, understandings, or agreements regarding the Licensed
232
+ Material not stated herein are separate from and independent of the
233
+ terms and conditions of this Public License.
234
+
235
+ Section 8 – Interpretation.
236
+
237
+ a. For the avoidance of doubt, this Public License does not, and shall not
238
+ be interpreted to, reduce, limit, restrict, or impose conditions on any
239
+ use of the Licensed Material that could lawfully be made without
240
+ permission under this Public License.
241
+ b. To the extent possible, if any provision of this Public License is
242
+ deemed unenforceable, it shall be automatically reformed to the minimum
243
+ extent necessary to make it enforceable. If the provision cannot be
244
+ reformed, it shall be severed from this Public License without
245
+ affecting the enforceability of the remaining terms and conditions.
246
+ c. No term or condition of this Public License will be waived and no
247
+ failure to comply consented to unless expressly agreed to by the Licensor.
248
+ d. Nothing in this Public License constitutes or may be interpreted as a
249
+ limitation upon, or waiver of, any privileges and immunities that apply
250
+ to the Licensor or You, including from the legal processes of any
251
+ jurisdiction or authority.
README.md CHANGED
@@ -1,61 +1,85 @@
1
- ---
2
- title: WavJourney
3
- emoji: 🔥
4
- colorFrom: blue
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 3.40.1
8
- app_file: app.py
9
- pinned: false
10
- license: cc-by-nc-nd-4.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
-
15
- # How to run WavJourney?
16
- 1. Install environment by following the bash scripts in `EnvsSetup/`
17
- 2. Start API services; The service logs are in the folder of `logs/`
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  ```bash
19
- python scripts/start_services.py
20
  ```
21
- 3. Run AudioJourney client; The results of scripts and audio are in the folder of `output/[datetime]_[instruction text]/`
 
22
  ```bash
23
- conda activate AudioJourney
24
- python audiojourney_cli.py -f --instruction "News channel BBC broadcast about Trump playing street fighter 6 against Biden"
25
  ```
26
- 4. Kill the API services
27
- ```bash
28
- python scripts/kill_services.py
29
- ```
30
 
31
- 5. Start the UI
 
 
32
  ```bash
33
- sh scripts/start_ui.sh
34
  ```
35
-
36
 
37
- # Voice Presets
38
- You can add voice presets to WavJourney to customize the voice actors. Simply provide the voice id, the description and a sample wav file, and WavJourney will pick the voice automatically based on the audio script.
39
-
40
- Predefined system voice presets are in `data/voice_presets`, whereas session voice presets are in each session's individual folder. See the example below:
41
-
42
- - 📂 **project_folder**
43
- - 📂 **data**
44
- - 📂 **voice_presets** <-- system voice presets
45
- - 📄 **metadata.json** <-- system voice preset metadata
46
- - 📂 **npz**
47
- - 📂 **output**
48
- - 📂 **sessions**
49
- - 📂 **session_1**
50
- - 📂 **voice_presets** <-- session voice presets
51
- - 📄 **metadata.json** <-- session voice preset metadata
52
- - 📂 **npz**
53
- - 📂 **session_2**
54
- - **...**
55
-
56
- ## Add voice to system voice presets via command line
57
- It's recommended to manage voice presets via UI. However if you want to add voice to voice presets via command line. Run the script below:
58
  ```bash
59
- python add_voice_preset.py --id "id" --desc "description" --wav-path path/to/wav --session-id session-id
60
  ```
61
- if `session-id` is set to '', then you are adding to system voice presets
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # <span style="color: blue;">🎵</span> WavJourney: Compositional Audio Creation with LLMs
2
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2307.14335) [![GitHub Stars](https://img.shields.io/github/stars/Audio-AGI/WavJourney?style=social)](https://github.com/Audio-AGI/WavJourney/) [![githubio](https://img.shields.io/badge/GitHub.io-Demo_Page-blue?logo=Github&style=flat-square)](https://audio-agi.github.io/WavJourney_demopage/)
3
+
4
+
5
+ This repository contains the official implementation of ["WavJourney: Compositional Audio Creation with Large Language Models"](https://audio-agi.github.io/WavJourney_demopage/WavJourney_arXiv.pdf).
6
+
7
+ Starting with a text prompt, WavJourney can create audio content with engaging storylines encompassing personalized speakers, lifelike speech in context, emotionally resonant music compositions, and impactful sound effects that enhance the auditory experience. Check the audio examples in the [Project Page](https://audio-agi.github.io/WavJourney_demopage/)!
8
+
9
+ <!-- <p align="center">
10
+ <img align="middle" width="800" src="assets/WavJourney.png"/>
11
+ </p> -->
12
+
13
+ <hr>
14
+
15
+
16
+ ## Preliminaries
17
+ 1. Install the environment:
18
+ ```bash
19
+ bash ./scripts/EnvsSetup.sh
20
+ ```
21
+ 2. Activate the conda environment:
22
+ ```bash
23
+ conda activate WavJourney
24
+ ```
25
+
26
+ 3. Set your `OpenAI-Key` in `config.yaml` for accessing [GPT-4 API](https://platform.openai.com/account/api-keys) [[Guidance](https://help.openai.com/en/articles/7102672-how-can-i-access-gpt-4)]. Please make sure the 'Service-Port' is not occupied. You can also modify the configuration; see the details described in the configuration file.
27
+
28
+ 4. Pre-download the models (might take some time):
29
+ ```bash
30
+ python scripts/download_models.py
31
+ ```
32
+
33
+ 5. Start Python API services (e.g., Text-to-Speech, Text-to-Audio)
34
+ ```bash
35
+ bash scripts/start_services.sh
36
+ ```
37
+
38
+ ## Web APP
39
  ```bash
40
+ bash scripts/start_ui.sh
41
  ```
42
+
43
+ ## Commandline Usage
44
  ```bash
45
+ python wavjourney_cli.py -f --input-text "Generate a one-minute introduction to quantum mechanics"
 
46
  ```
 
 
 
 
47
 
48
+
49
+ ## Kill the services
50
+ You can kill the running services via this command:
51
  ```bash
52
+ python scripts/kill_services.py
53
  ```
 
54
 
55
+ ## (Advanced features) Speaker customization
56
+ You can add voice presets to WavJourney to customize the voice actors. Simply provide the voice id, the description and a sample wav file, and WavJourney will pick the voice automatically based on the audio script. Predefined system voice presets are in `data/voice_presets`.
57
+
58
+ You can manage voice presets via the UI. Alternatively, to add a voice to the voice presets from the command line, run the script below:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  ```bash
60
+ python add_voice_preset.py --id "id" --desc "description" --wav-path path/to/wav --session-id ''
61
  ```
62
+ What makes for a good voice prompt? See detailed instructions <a href="https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer">here</a>.
63
+ ## Hardware requirement
64
+ - The VRAM of the GPU in the default configuration should be greater than 16 GB.
65
+ - Operating system: Linux.
66
+
67
+ ## Citation
68
+ If you find this work useful, you can cite the paper below:
69
+
70
+ @article{liu2023wavjourney,
71
+ title = {WavJourney: Compositional Audio Creation with Large Language Models},
72
+ author = {Liu, Xubo and Zhu, Zhongkai and Liu, Haohe and Yuan, Yi and Huang, Qiushi and Liang, Jinhua and Cao, Yin and Kong, Qiuqiang and Plumbley, Mark D and Wang, Wenwu},
73
+ journal = {arXiv preprint arXiv:2307.14335},
74
+ year = {2023}
75
+ }
76
+
77
+ [!["Buy Me A Coffee"](https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png)](https://www.buymeacoffee.com/liuxubo)
78
+
79
+ ## Appreciation
80
+ - [Bark](https://github.com/suno-ai/bark) for a zero-shot text-to-speech synthesis model.
81
+ - [AudioCraft](https://github.com/facebookresearch/audiocraft) for state-of-the-art audio generation models.
82
+
83
+ ## Disclaimer
84
+ We are not responsible for audio generated using semantics created by this model. Just don't use it for illegal purposes.
85
+
VoiceFixer/app.py DELETED
@@ -1,55 +0,0 @@
1
- from genericpath import exists
2
- import os
3
- import os.path
4
- import logging
5
- from voicefixer import VoiceFixer
6
- from flask import Flask, request, jsonify
7
-
8
- # Configure the logging format and level
9
- logging.basicConfig(
10
- level=logging.INFO,
11
- format='%(asctime)s - %(levelname)s - %(message)s'
12
- )
13
-
14
- # Create a FileHandler for the log file
15
- os.makedirs('services_logs', exist_ok=True)
16
- log_filename = 'services_logs/Speech-Restoration.log'
17
- file_handler = logging.FileHandler(log_filename, mode='w')
18
- file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
19
-
20
- # Add the FileHandler to the root logger
21
- logging.getLogger('').addHandler(file_handler)
22
-
23
- # Initialize the model here
24
- vf = VoiceFixer()
25
- logging.info('VoiceFixer is loaded ...')
26
-
27
- app = Flask(__name__)
28
-
29
- @app.route('/fix_audio', methods=['POST'])
30
- def fix_audio():
31
- # Receive the text from the POST request
32
- data = request.json
33
- processfile = data['processfile']
34
-
35
- logging.info(f'Fixing {processfile} ...')
36
-
37
- try:
38
- vf.restore(input=processfile, output=processfile, cuda=True, mode=0)
39
-
40
- # Return success message and the filename of the generated audio
41
- return jsonify({'message': 'Speech restored successfully', 'file': processfile})
42
-
43
- except Exception as e:
44
- # Return error message if something goes wrong
45
- return jsonify({'API error': str(e)}), 500
46
-
47
-
48
- if __name__ == '__main__':
49
- import yaml
50
- with open('config.yaml', 'r') as file:
51
- config = yaml.safe_load(file)
52
-
53
- service_port = config['Speech-Restoration']['service-port']
54
- app.run(debug=False, port=service_port)
55
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
VoiceParser/app.py DELETED
@@ -1,58 +0,0 @@
1
- from genericpath import exists
2
- import os
3
- import os.path
4
- import logging
5
- import yaml
6
- from model import VoiceParser
7
- from flask import Flask, request, jsonify
8
-
9
- with open('config.yaml', 'r') as file:
10
- config = yaml.safe_load(file)
11
-
12
- service_port = config['Voice-Parser']['service-port']
13
- vp_device = config['Voice-Parser']['device']
14
-
15
- # Configure the logging format and level
16
- logging.basicConfig(
17
- level=logging.INFO,
18
- format='%(asctime)s - %(levelname)s - %(message)s'
19
- )
20
-
21
- # Create a FileHandler for the log file
22
- os.makedirs('services_logs', exist_ok=True)
23
- log_filename = 'services_logs/Voice-Parser.log'
24
- file_handler = logging.FileHandler(log_filename, mode='w')
25
- file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
26
-
27
- # Add the FileHandler to the root logger
28
- logging.getLogger('').addHandler(file_handler)
29
-
30
- # Initialize the model here
31
- vp = VoiceParser(device=vp_device)
32
- logging.info('VoiceParser is loaded ...')
33
-
34
- app = Flask(__name__)
35
-
36
- @app.route('/parse_voice', methods=['POST'])
37
- def parse_voice():
38
- # Receive the text from the POST request
39
- data = request.json
40
- wav_path = data['wav_path']
41
- out_dir = data['out_dir']
42
-
43
- logging.info(f'Parsing {wav_path} ...')
44
-
45
- try:
46
- vp.extract_acoustic_embed(wav_path, out_dir)
47
-
48
- # Return success message and the filename of the generated audio
49
- return jsonify({'message': f'Sucessfully parsed {wav_path}'})
50
-
51
- except Exception as e:
52
- # Return error message if something goes wrong
53
- return jsonify({'API error': str(e)}), 500
54
-
55
-
56
- if __name__ == '__main__':
57
- app.run(debug=False, port=service_port)
58
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
VoiceParser/model.py CHANGED
@@ -8,9 +8,9 @@ torchaudio.set_audio_backend("soundfile") # Use 'soundfile' backend
8
 
9
  from encodec import EncodecModel
10
  from encodec.utils import convert_audio
11
- from hubert_manager import HuBERTManager
12
- from pre_kmeans_hubert import CustomHubert
13
- from customtokenizer import CustomTokenizer
14
 
15
  class VoiceParser():
16
  def __init__(self, device='cpu'):
 
8
 
9
  from encodec import EncodecModel
10
  from encodec.utils import convert_audio
11
+ from .hubert_manager import HuBERTManager
12
+ from .pre_kmeans_hubert import CustomHubert
13
+ from .customtokenizer import CustomTokenizer
14
 
15
  class VoiceParser():
16
  def __init__(self, device='cpu'):
pipeline.py CHANGED
@@ -1,11 +1,8 @@
1
- import argparse
2
  import datetime
3
  import os
4
- import subprocess
5
  from string import Template
6
  import openai
7
  import re
8
- from pathlib import Path
9
  import glob
10
  from utils import get_key
11
  import pickle
@@ -13,7 +10,6 @@ import time
13
  import json5
14
  from retrying import retry
15
  from code_generator import check_json_script, collect_and_check_audio_data
16
- from tabulate import tabulate
17
  import random
18
  import string
19
 
@@ -21,7 +17,8 @@ import utils
21
  import voice_presets
22
  from code_generator import AudioCodeGenerator
23
 
24
- USE_OPENAI_CACHE = True
 
25
  openai_cache = []
26
  if USE_OPENAI_CACHE:
27
  os.makedirs('cache', exist_ok=True)
@@ -203,7 +200,7 @@ def generate_audio(session_id, json_script):
203
  voices = voice_presets.get_merged_voice_presets(session_id)
204
 
205
  # Step 2
206
- json_script_to_char_voice_map(json_script, voices, output_path)
207
  # Step 3
208
  json_script_filename = output_path / 'audio_script.json'
209
  char_voice_map_filename = output_path / 'character_voice_map.json'
@@ -214,22 +211,9 @@ def generate_audio(session_id, json_script):
214
 
215
  result_wav_filename = output_audio_path / f'{result_wav_basename}.wav'
216
  print(f'Done all processes, result: {result_wav_filename}')
217
- return result_wav_filename
218
 
219
  # Convenient function call used by wavjourney_cli
220
  def full_steps(session_id, input_text):
221
  json_script = generate_json_file(session_id, input_text)
222
  return generate_audio(session_id, json_script)
223
-
224
- def convert_json_to_md(audio_script_response):
225
- audio_json_data = json5.loads(audio_script_response)
226
- table = [[node.get(field, 'N/A') for field in ["audio_type", "layout", "id", "character", "action", 'vol']] +
227
- [node.get("desc", "N/A") if node.get("audio_type") != "speech" else node.get("text", "N/A")] +
228
- [node.get("len", "Auto") if "len" in node else "Auto"]
229
- for i, node in enumerate(audio_json_data)]
230
-
231
- headers = ["Audio Type", "Layout", "ID", "Character", "Action", 'Volume', "Description", "Length" ]
232
-
233
- # Tabulate
234
- table_txt = tabulate(table, headers, tablefmt="github")
235
- return table_txt
 
 
1
  import datetime
2
  import os
 
3
  from string import Template
4
  import openai
5
  import re
 
6
  import glob
7
  from utils import get_key
8
  import pickle
 
10
  import json5
11
  from retrying import retry
12
  from code_generator import check_json_script, collect_and_check_audio_data
 
13
  import random
14
  import string
15
 
 
17
  import voice_presets
18
  from code_generator import AudioCodeGenerator
19
 
20
+ # Enable this for debugging
21
+ USE_OPENAI_CACHE = False
22
  openai_cache = []
23
  if USE_OPENAI_CACHE:
24
  os.makedirs('cache', exist_ok=True)
 
200
  voices = voice_presets.get_merged_voice_presets(session_id)
201
 
202
  # Step 2
203
+ char_voice_map = json_script_to_char_voice_map(json_script, voices, output_path)
204
  # Step 3
205
  json_script_filename = output_path / 'audio_script.json'
206
  char_voice_map_filename = output_path / 'character_voice_map.json'
 
211
 
212
  result_wav_filename = output_audio_path / f'{result_wav_basename}.wav'
213
  print(f'Done all processes, result: {result_wav_filename}')
214
+ return result_wav_filename, char_voice_map
215
 
216
  # Convenient function call used by wavjourney_cli
217
  def full_steps(session_id, input_text):
218
  json_script = generate_json_file(session_id, input_text)
219
  return generate_audio(session_id, json_script)
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/EnvsSetup.sh ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ conda env create -f Envs/WavJourney.yml && \
2
+ conda env update -f Envs/Bark.yml && \
3
+ conda env update -f Envs/AudioCraft.yml && \
4
+ conda run --live-stream -n WavJourney pip install -U git+https://git@github.com/facebookresearch/audiocraft@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft && \
5
+ conda run --live-stream -n WavJourney pip install -U --no-deps voicefixer==0.1.2 && \
6
+ conda run --live-stream -n WavJourney pip install -U --no-deps numpy==1.21 && \
7
+ conda run --live-stream -n WavJourney pip install -U --no-deps librosa==0.8.1
scripts/download_models.py CHANGED
@@ -6,26 +6,24 @@ with open('config.yaml', 'r') as file:
6
  config = yaml.safe_load(file)
7
 
8
  # Extract values for each application
9
- tts_env = config['Text-to-Speech']['env']
 
10
 
11
- ttm_env = config['Text-to-Music']['env']
12
- ttm_model_size = config['Text-to-Music']['model_size']
13
-
14
- tta_env = config['Text-to-Audio']['env']
15
-
16
- sr_env = config['Speech-Restoration']['env']
17
 
18
  # Downloading the TTS models
19
  print('Step 1: Downloading TTS model ...')
20
- os.system(f'conda run --live-stream -n {tts_env} python -c \'from transformers import BarkModel; BarkModel.from_pretrained("suno/bark")\'')
21
 
22
  print('Step 2: Downloading TTA model ...')
23
- os.system(f'conda run --live-stream -n {tta_env} python -c \'from audiocraft.models import AudioGen; tta_model = AudioGen.get_pretrained("facebook/audiogen-medium")\'')
24
 
25
  print('Step 3: Downloading TTM model ...')
26
- os.system(f'conda run --live-stream -n {ttm_env} python -c \'from audiocraft.models import MusicGen; tta_model = MusicGen.get_pretrained("facebook/musicgen-{ttm_model_size}")\'')
27
 
28
  print('Step 4: Downloading SR model ...')
29
- os.system(f'conda run --live-stream -n {sr_env} python -c \'from voicefixer import VoiceFixer; vf = VoiceFixer()\'')
 
 
 
30
 
31
  print('All models successfully downloaded!')
 
6
  config = yaml.safe_load(file)
7
 
8
  # Extract values for each application
9
+ ttm_model_size = config['AudioCraft']['ttm_model_size']
10
+ tta_model_size = config['AudioCraft']['tta_model_size']
11
 
 
 
 
 
 
 
12
 
13
  # Downloading the TTS models
14
  print('Step 1: Downloading TTS model ...')
15
+ os.system(f'conda run --live-stream -n WavJourney python -c \'from transformers import BarkModel; BarkModel.from_pretrained("suno/bark")\'')
16
 
17
  print('Step 2: Downloading TTA model ...')
18
+ os.system(f'conda run --live-stream -n WavJourney python -c \'from audiocraft.models import AudioGen; tta_model = AudioGen.get_pretrained("facebook/audiogen-{tta_model_size}")\'')
19
 
20
  print('Step 3: Downloading TTM model ...')
21
+ os.system(f'conda run --live-stream -n WavJourney python -c \'from audiocraft.models import MusicGen; tta_model = MusicGen.get_pretrained("facebook/musicgen-{ttm_model_size}")\'')
22
 
23
  print('Step 4: Downloading SR model ...')
24
+ os.system(f'conda run --live-stream -n WavJourney python -c \'from voicefixer import VoiceFixer; vf = VoiceFixer()\'')
25
+
26
+ print('Step 5: Downloading VP model ...')
27
+ os.system(f'conda run --live-stream -n WavJourney python -c \'from VoiceParser.model import VoiceParser; vp = VoiceParser(device="cpu")\'')
28
 
29
  print('All models successfully downloaded!')
scripts/kill_services.py CHANGED
@@ -6,23 +6,11 @@ with open('config.yaml', 'r') as file:
6
  config = yaml.safe_load(file)
7
 
8
  # Extract values for each application
9
- tts_port = config['Text-to-Speech']['service-port']
10
-
11
- ttm_port = config['Text-to-Music']['service-port']
12
-
13
- tta_port = config['Text-to-Audio']['service-port']
14
-
15
- sr_port = config['Speech-Restoration']['service-port']
16
-
17
- vp_port = config['Voice-Parser']['service-port']
18
-
19
 
20
  # Execute the commands
21
- os.system(f'kill $(lsof -t -i :{tts_port})')
22
- os.system(f'kill $(lsof -t -i :{tta_port})')
23
- os.system(f'kill $(lsof -t -i :{ttm_port})')
24
- os.system(f'kill $(lsof -t -i :{sr_port})')
25
- os.system(f'kill $(lsof -t -i :{vp_port})')
26
 
27
 
28
 
 
6
  config = yaml.safe_load(file)
7
 
8
  # Extract values for each application
9
+ service_port = config['Service-Port']
 
 
 
 
 
 
 
 
 
10
 
11
  # Execute the commands
12
+ os.system(f'kill $(lsof -t -i :{service_port})')
13
+
 
 
 
14
 
15
 
16
 
scripts/restart_services.sh DELETED
@@ -1,2 +0,0 @@
1
- python scripts/kill_services.py
2
- python scripts/start_services.py
 
 
 
scripts/start_service_and_ui.sh ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ nohup conda run --live-stream -n WavJourney python services.py > services_logs/service.out 2>&1 &
2
+ conda run --live-stream -n WavJourney python -u ui_client.py 2>&1 | stdbuf -oL tee services_logs/wavejourney.out
scripts/start_services.py DELETED
@@ -1,41 +0,0 @@
1
- import yaml
2
- import os
3
-
4
- # Read the YAML file
5
- with open('config.yaml', 'r') as file:
6
- config = yaml.safe_load(file)
7
-
8
- os.makedirs('services_logs', exist_ok=True)
9
-
10
- # Extract values for each application
11
- tts_model = config['Text-to-Speech']['model']
12
- tts_env = config['Text-to-Speech']['env']
13
-
14
- ttm_model = config['Text-to-Music']['model']
15
- ttm_env = config['Text-to-Music']['env']
16
-
17
- tta_model = config['Text-to-Audio']['model']
18
- tta_env = config['Text-to-Audio']['env']
19
-
20
- sr_model = config['Speech-Restoration']['model']
21
- sr_env = config['Speech-Restoration']['env']
22
- enable_sr = config['Speech-Restoration']['Enable']
23
-
24
- vp_model = config['Voice-Parser']['model']
25
- vp_env = config['Voice-Parser']['env']
26
-
27
- # Execute the commands
28
- os.system(f'nohup conda run --live-stream -n {tts_env} python {tts_model}/app.py > services_logs/meta_tts.out 2>&1 &')
29
- os.system(f'nohup conda run --live-stream -n {vp_env} python {vp_model}/app.py > services_logs/meta_vp.out 2>&1 &')
30
-
31
- if enable_sr:
32
- os.system(f'nohup conda run --live-stream -n {sr_env} python {sr_model}/app.py > services_logs/meta_sr.out 2>&1 &')
33
-
34
- # Using AudioCraft for TTA & TTM
35
- if tta_env == ttm_env:
36
- os.system(f'nohup conda run --live-stream -n {ttm_env} python {ttm_model}/app.py > services_logs/meta_tta_ttm.out 2>&1 &')
37
-
38
- # Using AudioLDM for TTA, MusicGen for TTM
39
- if tta_env != ttm_env:
40
- os.system(f'nohup conda run --live-stream -n {tta_env} python {tta_model}/app.py > services_logs/meta_tta.out 2>&1 &')
41
- os.system(f'nohup conda run --live-stream -n {ttm_env} python {ttm_model}/app.py > services_logs/meta_ttm.out 2>&1 &')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/start_services.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ nohup conda run --live-stream -n WavJourney python services.py > services_logs/service.out 2>&1 &
scripts/start_ui.sh CHANGED
@@ -1 +1 @@
1
- conda run --live-stream -n WavJourney gradio ui_client.py
 
1
+ conda run --live-stream -n WavJourney python -u ui_client.py 2>&1 | stdbuf -oL tee services_logs/wavejourney.out
Bark/app.py → services.py RENAMED
@@ -1,17 +1,13 @@
1
  import os
2
- import sys
3
- sys.path.append('../AudioJourney')
4
- import logging
5
  import yaml
6
- import numpy as np
 
7
  import torch
8
  import torchaudio
9
  from torchaudio.transforms import SpeedPerturbation
10
- import nltk
11
  from APIs import WRITE_AUDIO, LOUDNESS_NORM
 
12
  from flask import Flask, request, jsonify
13
- from transformers import BarkModel, AutoProcessor
14
-
15
 
16
  with open('config.yaml', 'r') as file:
17
  config = yaml.safe_load(file)
@@ -24,32 +20,119 @@ logging.basicConfig(
24
 
25
  # Create a FileHandler for the log file
26
  os.makedirs('services_logs', exist_ok=True)
27
- log_filename = 'services_logs/Text-to-Speech.log'
28
  file_handler = logging.FileHandler(log_filename, mode='w')
29
  file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
30
 
31
  # Add the FileHandler to the root logger
32
  logging.getLogger('').addHandler(file_handler)
33
 
34
- # Initialize the model here
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  SPEED = float(config['Text-to-Speech']['speed'])
36
  speed_perturb = SpeedPerturbation(32000, [SPEED])
37
-
38
- logging.info('Loading Bark model ...')
39
- # TODO: fp16?
40
- model = BarkModel.from_pretrained("suno/bark")
41
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
42
- model = model.to(device)
43
- model = model.to_bettertransformer() # Flash attention
44
- SAMPLE_RATE = model.generation_config.sample_rate
45
  SEMANTIC_TEMPERATURE = 0.9
46
  COARSE_TEMPERATURE = 0.5
47
  FINE_TEMPERATURE = 0.5
48
-
49
  processor = AutoProcessor.from_pretrained("suno/bark")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
  app = Flask(__name__)
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  @app.route('/generate_speech', methods=['POST'])
54
  def generate_speech():
55
  # Receive the text from the POST request
@@ -77,7 +160,7 @@ def generate_speech():
77
 
78
  with torch.inference_mode():
79
  # TODO: min_eos_p?
80
- output = model.generate(
81
  **inputs,
82
  do_sample = True,
83
  semantic_temperature = SEMANTIC_TEMPERATURE,
@@ -99,11 +182,49 @@ def generate_speech():
99
  return jsonify({'message': f'Text-to-Speech generated successfully | {speaker_id}: {text}', 'file': output_wav})
100
 
101
  except Exception as e:
102
- raise e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  # Return error message if something goes wrong
104
  return jsonify({'API error': str(e)}), 500
105
 
106
 
107
  if __name__ == '__main__':
108
- service_port = config['Text-to-Speech']['service-port']
109
  app.run(debug=False, port=service_port)
 
1
  import os
 
 
 
2
  import yaml
3
+ import logging
4
+ import nltk
5
  import torch
6
  import torchaudio
7
  from torchaudio.transforms import SpeedPerturbation
 
8
  from APIs import WRITE_AUDIO, LOUDNESS_NORM
9
+ from utils import fade
10
  from flask import Flask, request, jsonify
 
 
11
 
12
  with open('config.yaml', 'r') as file:
13
  config = yaml.safe_load(file)
 
20
 
21
  # Create a FileHandler for the log file
22
  os.makedirs('services_logs', exist_ok=True)
23
+ log_filename = 'services_logs/Wav-API.log'
24
  file_handler = logging.FileHandler(log_filename, mode='w')
25
  file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
26
 
27
  # Add the FileHandler to the root logger
28
  logging.getLogger('').addHandler(file_handler)
29
 
30
+
31
+ """
32
+ Initialize the AudioCraft models here
33
+ """
34
+ from audiocraft.models import AudioGen, MusicGen
35
+ tta_model_size = config['AudioCraft']['tta_model_size']
36
+ tta_model = AudioGen.get_pretrained(f'facebook/audiogen-{tta_model_size}')
37
+ logging.info(f'AudioGen ({tta_model_size}) is loaded ...')
38
+
39
+ ttm_model_size = config['AudioCraft']['ttm_model_size']
40
+ ttm_model = MusicGen.get_pretrained(f'facebook/musicgen-{ttm_model_size}')
41
+ logging.info(f'MusicGen ({ttm_model_size}) is loaded ...')
42
+
43
+
44
+ """
45
+ Initialize the BarkModel here
46
+ """
47
+ from transformers import BarkModel, AutoProcessor
48
  SPEED = float(config['Text-to-Speech']['speed'])
49
  speed_perturb = SpeedPerturbation(32000, [SPEED])
50
+ tts_model = BarkModel.from_pretrained("suno/bark")
 
 
 
51
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
52
+ tts_model = tts_model.to(device)
53
+ tts_model = tts_model.to_bettertransformer() # Flash attention
54
+ SAMPLE_RATE = tts_model.generation_config.sample_rate
55
  SEMANTIC_TEMPERATURE = 0.9
56
  COARSE_TEMPERATURE = 0.5
57
  FINE_TEMPERATURE = 0.5
 
58
  processor = AutoProcessor.from_pretrained("suno/bark")
59
+ logging.info('Bark model is loaded ...')
60
+
61
+
62
+ """
63
+ Initialize the VoiceFixer model here
64
+ """
65
+ from voicefixer import VoiceFixer
66
+ vf = VoiceFixer()
67
+ logging.info('VoiceFixer is loaded ...')
68
+
69
+
70
+ """
71
+ Initialize the VoiceParser model here
72
+ """
73
+ from VoiceParser.model import VoiceParser
74
+ vp_device = config['Voice-Parser']['device']
75
+ vp = VoiceParser(device=vp_device)
76
+ logging.info('VoiceParser is loaded ...')
77
+
78
 
79
  app = Flask(__name__)
80
 
81
+
82
+ @app.route('/generate_audio', methods=['POST'])
83
+ def generate_audio():
84
+ # Receive the text from the POST request
85
+ data = request.json
86
+ text = data['text']
87
+ length = float(data.get('length', 5.0))
88
+ volume = float(data.get('volume', -35))
89
+ output_wav = data.get('output_wav', 'out.wav')
90
+
91
+ logging.info(f'TTA (AudioGen): Prompt: {text}, length: {length} seconds, volume: {volume} dB')
92
+
93
+ try:
94
+ tta_model.set_generation_params(duration=length)
95
+ wav = tta_model.generate([text])
96
+ wav = torchaudio.functional.resample(wav, orig_freq=16000, new_freq=32000)
97
+
98
+ wav = wav.squeeze().cpu().detach().numpy()
99
+ wav = fade(LOUDNESS_NORM(wav, volumn=volume))
100
+ WRITE_AUDIO(wav, name=output_wav)
101
+
102
+ # Return success message and the filename of the generated audio
103
+ return jsonify({'message': f'Text-to-Audio generated successfully | {text}', 'file': output_wav})
104
+
105
+ except Exception as e:
106
+ return jsonify({'API error': str(e)}), 500
107
+
108
+
109
+ @app.route('/generate_music', methods=['POST'])
110
+ def generate_music():
111
+ # Receive the text from the POST request
112
+ data = request.json
113
+ text = data['text']
114
+ length = float(data.get('length', 5.0))
115
+ volume = float(data.get('volume', -35))
116
+ output_wav = data.get('output_wav', 'out.wav')
117
+
118
+ logging.info(f'TTM (MusicGen): Prompt: {text}, length: {length} seconds, volume: {volume} dB')
119
+
120
+
121
+ try:
122
+ ttm_model.set_generation_params(duration=length)
123
+ wav = ttm_model.generate([text])
124
+ wav = wav[0][0].cpu().detach().numpy()
125
+ wav = fade(LOUDNESS_NORM(wav, volumn=volume))
126
+ WRITE_AUDIO(wav, name=output_wav)
127
+
128
+ # Return success message and the filename of the generated audio
129
+ return jsonify({'message': f'Text-to-Music generated successfully | {text}', 'file': output_wav})
130
+
131
+ except Exception as e:
132
+ # Return error message if something goes wrong
133
+ return jsonify({'API error': str(e)}), 500
134
+
135
+
136
  @app.route('/generate_speech', methods=['POST'])
137
  def generate_speech():
138
  # Receive the text from the POST request
 
160
 
161
  with torch.inference_mode():
162
  # TODO: min_eos_p?
163
+ output = tts_model.generate(
164
  **inputs,
165
  do_sample = True,
166
  semantic_temperature = SEMANTIC_TEMPERATURE,
 
182
  return jsonify({'message': f'Text-to-Speech generated successfully | {speaker_id}: {text}', 'file': output_wav})
183
 
184
  except Exception as e:
185
+ # Return error message if something goes wrong
186
+ return jsonify({'API error': str(e)}), 500
187
+
188
+
189
+ @app.route('/fix_audio', methods=['POST'])
190
+ def fix_audio():
191
+ # Receive the path of the audio file to restore from the POST request
192
+ data = request.json
193
+ processfile = data['processfile']
194
+
195
+ logging.info(f'Fixing {processfile} ...')
196
+
197
+ try:
198
+ vf.restore(input=processfile, output=processfile, cuda=True, mode=0)
199
+
200
+ # Return success message and the filename of the generated audio
201
+ return jsonify({'message': 'Speech restored successfully', 'file': processfile})
202
+
203
+ except Exception as e:
204
+ # Return error message if something goes wrong
205
+ return jsonify({'API error': str(e)}), 500
206
+
207
+
208
+ @app.route('/parse_voice', methods=['POST'])
209
+ def parse_voice():
210
+ # Receive the input wav path and the output directory from the POST request
211
+ data = request.json
212
+ wav_path = data['wav_path']
213
+ out_dir = data['out_dir']
214
+
215
+ logging.info(f'Parsing {wav_path} ...')
216
+
217
+ try:
218
+ vp.extract_acoustic_embed(wav_path, out_dir)
219
+
220
+ # Return success message
221
+ return jsonify({'message': f'Sucessfully parsed {wav_path}'})
222
+
223
+ except Exception as e:
224
  # Return error message if something goes wrong
225
  return jsonify({'API error': str(e)}), 500
226
 
227
 
228
  if __name__ == '__main__':
229
+ service_port = config['Service-Port']
230
  app.run(debug=False, port=service_port)
share_btn.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ community_icon_html = """<svg id="share-btn-share-icon" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32">
2
+ <path d="M20.6081 3C21.7684 3 22.8053 3.49196 23.5284 4.38415C23.9756 4.93678 24.4428 5.82749 24.4808 7.16133C24.9674 7.01707 25.4353 6.93643 25.8725 6.93643C26.9833 6.93643 27.9865 7.37587 28.696 8.17411C29.6075 9.19872 30.0124 10.4579 29.8361 11.7177C29.7523 12.3177 29.5581 12.8555 29.2678 13.3534C29.8798 13.8646 30.3306 14.5763 30.5485 15.4322C30.719 16.1032 30.8939 17.5006 29.9808 18.9403C30.0389 19.0342 30.0934 19.1319 30.1442 19.2318C30.6932 20.3074 30.7283 21.5229 30.2439 22.6548C29.5093 24.3704 27.6841 25.7219 24.1397 27.1727C21.9347 28.0753 19.9174 28.6523 19.8994 28.6575C16.9842 29.4379 14.3477 29.8345 12.0653 29.8345C7.87017 29.8345 4.8668 28.508 3.13831 25.8921C0.356375 21.6797 0.754104 17.8269 4.35369 14.1131C6.34591 12.058 7.67023 9.02782 7.94613 8.36275C8.50224 6.39343 9.97271 4.20438 12.4172 4.20438H12.4179C12.6236 4.20438 12.8314 4.2214 13.0364 4.25468C14.107 4.42854 15.0428 5.06476 15.7115 6.02205C16.4331 5.09583 17.134 4.359 17.7682 3.94323C18.7242 3.31737 19.6794 3 20.6081 3ZM20.6081 5.95917C20.2427 5.95917 19.7963 6.1197 19.3039 6.44225C17.7754 7.44319 14.8258 12.6772 13.7458 14.7131C13.3839 15.3952 12.7655 15.6837 12.2086 15.6837C11.1036 15.6837 10.2408 14.5497 12.1076 13.1085C14.9146 10.9402 13.9299 7.39584 12.5898 7.1776C12.5311 7.16799 12.4731 7.16355 12.4172 7.16355C11.1989 7.16355 10.6615 9.33114 10.6615 9.33114C10.6615 9.33114 9.0863 13.4148 6.38031 16.206C3.67434 18.998 3.5346 21.2388 5.50675 24.2246C6.85185 26.2606 9.42666 26.8753 12.0653 26.8753C14.8021 26.8753 17.6077 26.2139 19.1799 25.793C19.2574 25.7723 28.8193 22.984 27.6081 20.6107C27.4046 20.212 27.0693 20.0522 26.6471 20.0522C24.9416 20.0522 21.8393 22.6726 20.5057 22.6726C20.2076 22.6726 19.9976 22.5416 19.9116 22.222C19.3433 20.1173 28.552 19.2325 27.7758 16.1839C27.639 15.6445 27.2677 15.4256 26.746 15.4263C24.4923 15.4263 19.4358 19.5181 18.3759 19.5181C18.2949 19.5181 18.2368 19.4937 18.2053 19.4419C17.6743 18.557 17.9653 17.9394 21.7082 15.6009C25.4511 13.2617 28.0783 
11.8545 26.5841 10.1752C26.4121 9.98141 26.1684 9.8956 25.8725 9.8956C23.6001 9.89634 18.2311 14.9403 18.2311 14.9403C18.2311 14.9403 16.7821 16.496 15.9057 16.496C15.7043 16.496 15.533 16.4139 15.4169 16.2112C14.7956 15.1296 21.1879 10.1286 21.5484 8.06535C21.7928 6.66715 21.3771 5.95917 20.6081 5.95917Z" fill="#FF9D00"></path>
3
+ <path d="M5.50686 24.2246C3.53472 21.2387 3.67446 18.9979 6.38043 16.206C9.08641 13.4147 10.6615 9.33111 10.6615 9.33111C10.6615 9.33111 11.2499 6.95933 12.59 7.17757C13.93 7.39581 14.9139 10.9401 12.1069 13.1084C9.29997 15.276 12.6659 16.7489 13.7459 14.713C14.8258 12.6772 17.7747 7.44316 19.304 6.44221C20.8326 5.44128 21.9089 6.00204 21.5484 8.06532C21.188 10.1286 14.795 15.1295 15.4171 16.2118C16.0391 17.2934 18.2312 14.9402 18.2312 14.9402C18.2312 14.9402 25.0907 8.49588 26.5842 10.1752C28.0776 11.8545 25.4512 13.2616 21.7082 15.6008C17.9646 17.9393 17.6744 18.557 18.2054 19.4418C18.7372 20.3266 26.9998 13.1351 27.7759 16.1838C28.5513 19.2324 19.3434 20.1173 19.9117 22.2219C20.48 24.3274 26.3979 18.2382 27.6082 20.6107C28.8193 22.9839 19.2574 25.7722 19.18 25.7929C16.0914 26.62 8.24723 28.3726 5.50686 24.2246Z" fill="#FFD21E"></path>
4
+ </svg>"""
5
+
6
+ loading_icon_html = """<svg id="share-btn-loading-icon" style="display:none;" class="animate-spin"
7
+ style="color: #ffffff;
8
+ "
9
+ xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" fill="none" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 24 24"><circle style="opacity: 0.25;" cx="12" cy="12" r="10" stroke="white" stroke-width="4"></circle><path style="opacity: 0.75;" fill="white" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path></svg>"""
10
+
11
+ share_js = """async () => {
12
+ async function uploadFile(file){
13
+ const UPLOAD_URL = 'https://huggingface.co/uploads';
14
+ const response = await fetch(UPLOAD_URL, {
15
+ method: 'POST',
16
+ headers: {
17
+ 'Content-Type': file.type,
18
+ 'X-Requested-With': 'XMLHttpRequest',
19
+ },
20
+ body: file, /// <- File inherits from Blob
21
+ });
22
+ const url = await response.text();
23
+ return url;
24
+ }
25
+ async function getInputVideoFile(videoEl){
26
+ const res = await fetch(videoEl.src);
27
+ const blob = await res.blob();
28
+ const videoId = Date.now() % 200;
29
+ const fileName = `sd-perception-${{videoId}}.mp4`;
30
+ return new File([blob], fileName, { type: 'video/mp4' });
31
+ }
32
+
33
+ async function audioToBase64(audioFile) {
34
+ return new Promise((resolve, reject) => {
35
+ let reader = new FileReader();
36
+ reader.readAsDataURL(audioFile);
37
+ reader.onload = () => resolve(reader.result);
38
+ reader.onerror = error => reject(error);
39
+
40
+ });
41
+ }
42
+ const gradioEl = document.querySelector("gradio-app").shadowRoot || document.querySelector('body > gradio-app');
43
+ const inputPromptEl = gradioEl.querySelector('#prompt-in input').value;
44
+ const outputVideoEl = gradioEl.querySelector('#output-video video');
45
+
46
+ let titleTxt = `WavJourney: ${inputPromptEl}`;
47
+
48
+ const shareBtnEl = gradioEl.querySelector('#share-btn');
49
+ const shareIconEl = gradioEl.querySelector('#share-btn-share-icon');
50
+ const loadingIconEl = gradioEl.querySelector('#share-btn-loading-icon');
51
+ if(!outputVideoEl){
52
+ return;
53
+ };
54
+ shareBtnEl.style.pointerEvents = 'none';
55
+ shareIconEl.style.display = 'none';
56
+ loadingIconEl.style.removeProperty('display');
57
+ const outputVideo = await getInputVideoFile(outputVideoEl);
58
+ const urlOutputVideo = await uploadFile(outputVideo);
59
+
60
+ const descriptionMd = `
61
+ ##### ${inputPromptEl}
62
+
63
+ ${urlOutputVideo}
64
+ `;
65
+ const params = new URLSearchParams({
66
+ title: titleTxt,
67
+ description: descriptionMd,
68
+ });
69
+ const paramsStr = params.toString();
70
+ window.open(`https://huggingface.co/spaces/Audio-AGI/WavJourney/discussions/new?${paramsStr}`, '_blank');
71
+ shareBtnEl.style.removeProperty('pointer-events');
72
+ shareIconEl.style.removeProperty('display');
73
+ loadingIconEl.style.display = 'none';
74
+ }"""
ui_client.py CHANGED
@@ -1,30 +1,63 @@
1
- import pdb
2
  import shutil
 
3
 
 
4
  import gradio as gr
 
5
 
6
- import pipeline
7
  import utils
 
8
  from pipeline import generate_json_file, generate_audio
9
  from voice_presets import load_voice_presets_metadata, add_session_voice_preset, \
10
  remove_session_voice_preset
 
 
11
 
12
- import openai
13
 
14
  VOICE_PRESETS_HEADERS = ['ID', 'Description']
15
  DELETE_FILE_WHEN_DO_CLEAR = False
16
  DEBUG = False
17
 
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  def generate_script_fn(instruction, _state: gr.State):
20
  try:
21
  session_id = _state['session_id']
22
  json_script = generate_json_file(session_id, instruction)
23
- table_text = pipeline.convert_json_to_md(json_script)
24
  except Exception as e:
25
  gr.Warning(str(e))
26
  print(f"Generating script error: {str(e)}")
27
- return [None, gr.Button.update(interactive=False), _state, gr.Button.update(interactive=True)]
 
 
 
 
 
 
 
 
28
  _state = {
29
  **_state,
30
  'session_id': session_id,
@@ -43,8 +76,11 @@ def generate_script_fn(instruction, _state: gr.State):
43
  def generate_audio_fn(state):
44
  btn_state = gr.Button.update(interactive=True)
45
  try:
46
- audio_path = generate_audio(**state)
 
 
47
  return [
 
48
  gr.make_waveform(str(audio_path)),
49
  btn_state,
50
  btn_state,
@@ -54,7 +90,11 @@ def generate_audio_fn(state):
54
  except Exception as e:
55
  print(f"Generation audio error: {str(e)}")
56
  gr.Warning(str(e))
 
 
 
57
  return [
 
58
  None,
59
  btn_state,
60
  btn_state,
@@ -164,40 +204,262 @@ def add_voice_preset(vp_id, vp_desc, file, ui_state, added_voice_preset):
164
  df_visible, del_visible]
165
 
166
 
167
- with gr.Blocks() as interface:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  system_voice_presets = get_system_voice_presets()
169
  # State
170
  ui_state = gr.State(value={'session_id': pipeline.init_session()})
171
  selected_voice_presets = gr.State(value={'selected_voice_preset': None})
172
  added_voice_preset_state = gr.State(value={'added_file': None, 'count': 0})
173
  # UI Component
174
- key_text_input = gr.Textbox(label='Please Enter OPENAI Key for acessing GPT4', lines=1, placeholder="Input instruction here.",
175
- value='')
176
- text_input_value = '' if DEBUG is False else "News channel BBC broadcast about Trump playing street fighter 6 against Biden"
177
- text_input = gr.Textbox(label='Input', lines=2, placeholder="Input instruction here.",
178
- value=text_input_value)
179
- markdown_output = gr.Markdown(label='Audio Script', lines=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  generate_script_btn = gr.Button(value='Generate Script', interactive=False)
181
- audio_output = gr.Video(type='filepath')
 
 
 
 
 
 
 
 
 
182
  generate_audio_btn = gr.Button(value='Generate Audio', interactive=False)
183
- clear_btn = gr.ClearButton(value='Clear Inputs')
 
 
 
 
 
 
 
 
184
  # System Voice Presets
185
  gr.Markdown(label='System Voice Presets', value='# System Voice Presets')
186
  system_markdown_voice_presets = gr.Dataframe(label='System Voice Presets', headers=VOICE_PRESETS_HEADERS,
187
  value=system_voice_presets)
188
  # User Voice Preset Related
189
- gr.Markdown(label='User Voice Presets', value='# User Voice Presets')
190
- get_voice_preset_to_list(ui_state)
191
- voice_presets_df = gr.Dataframe(headers=VOICE_PRESETS_HEADERS, col_count=len(VOICE_PRESETS_HEADERS),
 
 
192
  value=get_voice_preset_to_list(ui_state), interactive=False, visible=False)
193
  # voice_presets_ds = gr.Dataset(components=[gr.Dataframe(visible=True)], samples=get_voice_preset_to_list(ui_state))
194
- del_voice_btn = gr.Button(value='Delete Selected Voice Preset', visible=False)
195
- gr.Markdown(label='Add Voice Preset', value='## Add Voice Preset')
196
- vp_text_id = gr.Textbox(label='Id', lines=1, placeholder="Input voice preset id here.")
197
- vp_text_desc = gr.Textbox(label='Desc', lines=1, placeholder="Input description here.")
198
- vp_file = gr.File(label='Wav File', type='file', description='Upload your wav file here.', file_types=['.wav'],
199
- interactive=True)
200
- vp_submit = gr.Button(label='Upload Voice Preset', value="Upload Voice Preset")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  # events
202
  key_text_input.change(fn=set_openai_key, inputs=[key_text_input], outputs=[key_text_input])
203
  text_input.change(fn=textbox_listener, inputs=[text_input], outputs=[generate_script_btn])
@@ -205,6 +467,7 @@ with gr.Blocks() as interface:
205
  fn=generate_audio_fn,
206
  inputs=[ui_state],
207
  outputs=[
 
208
  audio_output,
209
  generate_audio_btn,
210
  generate_script_btn,
@@ -214,7 +477,7 @@ with gr.Blocks() as interface:
214
  api_name='audio_journey',
215
  )
216
  generate_audio_btn.click(
217
- fn=lambda _: [
218
  gr.Button.update(interactive=False),
219
  gr.Button.update(interactive=False),
220
  gr.Button.update(interactive=False),
@@ -228,13 +491,13 @@ with gr.Blocks() as interface:
228
  ]
229
  )
230
  clear_btn.click(fn=clear_fn, inputs=ui_state,
231
- outputs=[text_input, audio_output, markdown_output, generate_audio_btn, generate_script_btn,
232
  ui_state, voice_presets_df, del_voice_btn,
233
  vp_text_id, vp_text_desc, vp_file])
234
  generate_script_btn.click(
235
  fn=generate_script_fn, inputs=[text_input, ui_state],
236
  outputs=[
237
- markdown_output,
238
  ui_state,
239
  generate_audio_btn,
240
  generate_script_btn,
@@ -243,7 +506,7 @@ with gr.Blocks() as interface:
243
  ]
244
  )
245
  generate_script_btn.click(
246
- fn=lambda _: [
247
  gr.Button.update(interactive=False),
248
  gr.Button.update(interactive=False),
249
  gr.Button.update(interactive=False),
@@ -266,6 +529,10 @@ with gr.Blocks() as interface:
266
  vp_submit,
267
  voice_presets_df, del_voice_btn])
268
  vp_submit.click(lambda _: gr.Button.update(interactive=False), inputs=[vp_submit])
 
 
 
 
269
  # debug only
270
  # print_state_btn = gr.Button(value='Print State')
271
  # print_state_btn.click(fn=lambda state, state2: print(state, state2), inputs=[ui_state, selected_voice_presets])
 
 
1
  import shutil
2
+ import json5
3
 
4
+ import openai
5
  import gradio as gr
6
+ from tabulate import tabulate
7
 
 
8
  import utils
9
+ import pipeline
10
  from pipeline import generate_json_file, generate_audio
11
  from voice_presets import load_voice_presets_metadata, add_session_voice_preset, \
12
  remove_session_voice_preset
13
+ from share_btn import community_icon_html, loading_icon_html, share_js
14
+
15
 
 
16
 
17
  VOICE_PRESETS_HEADERS = ['ID', 'Description']
18
  DELETE_FILE_WHEN_DO_CLEAR = False
19
  DEBUG = False
20
 
21
 
22
def convert_json_to_md(audio_script_response):
    """Render a JSON5 audio script as a GitHub-style markdown table.

    Args:
        audio_script_response: JSON5 text; it is parsed with ``json5.loads``
            and iterated, each item being queried with ``.get`` (so each
            item is presumably a dict describing one audio node).

    Returns:
        The table text produced by ``tabulate`` with ``tablefmt="github"``.
    """
    headers = ["Audio Type", "Layout", "ID", "Character", "Action", 'Volume', "Description", "Length"]
    # Keys copied verbatim into the first six columns, in header order.
    plain_fields = ("audio_type", "layout", "id", "character", "action", "vol")

    rows = []
    for node in json5.loads(audio_script_response):
        # Speech nodes are described by their "text"; every other node by "desc".
        detail_key = "text" if node.get("audio_type") == "speech" else "desc"
        row = [node.get(field, 'N/A') for field in plain_fields]
        row.append(node.get(detail_key, "N/A"))
        # A missing "len" is shown as "Auto".
        row.append(node.get("len", "Auto"))
        rows.append(row)

    return tabulate(rows, headers, tablefmt="github")
34
+
35
+
36
def convert_char_voice_map_to_md(char_voice_map):
    """Render the character-to-voice mapping as a GitHub-style markdown table.

    Args:
        char_voice_map: mapping from character name to a voice entry; only
            the entry's ``"id"`` key is read here.

    Returns:
        The two-column ("Character", "Voice") table text produced by
        ``tabulate`` with ``tablefmt="github"``.
    """
    rows = [
        [character, voice["id"]]
        for character, voice in char_voice_map.items()
    ]
    return tabulate(rows, ["Character", "Voice"], tablefmt="github")
42
+
43
+
44
  def generate_script_fn(instruction, _state: gr.State):
45
  try:
46
  session_id = _state['session_id']
47
  json_script = generate_json_file(session_id, instruction)
48
+ table_text = convert_json_to_md(json_script)
49
  except Exception as e:
50
  gr.Warning(str(e))
51
  print(f"Generating script error: {str(e)}")
52
+ return [
53
+ None,
54
+ _state,
55
+ gr.Button.update(interactive=False),
56
+ gr.Button.update(interactive=True),
57
+ gr.Button.update(interactive=False),
58
+ gr.Button.update(interactive=False),
59
+ ]
60
+
61
  _state = {
62
  **_state,
63
  'session_id': session_id,
 
76
  def generate_audio_fn(state):
77
  btn_state = gr.Button.update(interactive=True)
78
  try:
79
+ audio_path, char_voice_map = generate_audio(**state)
80
+ table_text = convert_char_voice_map_to_md(char_voice_map)
81
+ # TODO: output char_voice_map to a table
82
  return [
83
+ table_text,
84
  gr.make_waveform(str(audio_path)),
85
  btn_state,
86
  btn_state,
 
90
  except Exception as e:
91
  print(f"Generation audio error: {str(e)}")
92
  gr.Warning(str(e))
93
+ # For debugging, uncomment the line below
94
+ #raise e
95
+
96
  return [
97
+ None,
98
  None,
99
  btn_state,
100
  btn_state,
 
204
  df_visible, del_visible]
205
 
206
 
207
+ css = """
208
+ a {
209
+ color: inherit;
210
+ text-decoration: underline;
211
+ }
212
+ .gradio-container {
213
+ font-family: 'IBM Plex Sans', sans-serif;
214
+ }
215
+ .gr-button {
216
+ color: white;
217
+ border-color: #000000;
218
+ background: #000000;
219
+ }
220
+ input[type='range'] {
221
+ accent-color: #000000;
222
+ }
223
+ .dark input[type='range'] {
224
+ accent-color: #dfdfdf;
225
+ }
226
+ .container {
227
+ max-width: 730px;
228
+ margin: auto;
229
+ padding-top: 1.5rem;
230
+ }
231
+ #gallery {
232
+ min-height: 22rem;
233
+ margin-bottom: 15px;
234
+ margin-left: auto;
235
+ margin-right: auto;
236
+ border-bottom-right-radius: .5rem !important;
237
+ border-bottom-left-radius: .5rem !important;
238
+ }
239
+ #gallery>div>.h-full {
240
+ min-height: 20rem;
241
+ }
242
+ .details:hover {
243
+ text-decoration: underline;
244
+ }
245
+ .gr-button {
246
+ white-space: nowrap;
247
+ }
248
+ .gr-button:focus {
249
+ border-color: rgb(147 197 253 / var(--tw-border-opacity));
250
+ outline: none;
251
+ box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
252
+ --tw-border-opacity: 1;
253
+ --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
254
+ --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px var(--tw-ring-offset-width)) var(--tw-ring-color);
255
+ --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
256
+ --tw-ring-opacity: .5;
257
+ }
258
+ #advanced-btn {
259
+ font-size: .7rem !important;
260
+ line-height: 19px;
261
+ margin-top: 12px;
262
+ margin-bottom: 12px;
263
+ padding: 2px 8px;
264
+ border-radius: 14px !important;
265
+ }
266
+ #advanced-options {
267
+ margin-bottom: 20px;
268
+ }
269
+ .footer {
270
+ margin-bottom: 45px;
271
+ margin-top: 35px;
272
+ text-align: center;
273
+ border-bottom: 1px solid #e5e5e5;
274
+ }
275
+ .footer>p {
276
+ font-size: .8rem;
277
+ display: inline-block;
278
+ padding: 0 10px;
279
+ transform: translateY(10px);
280
+ background: white;
281
+ }
282
+ .dark .footer {
283
+ border-color: #303030;
284
+ }
285
+ .dark .footer>p {
286
+ background: #0b0f19;
287
+ }
288
+ .acknowledgments h4{
289
+ margin: 1.25em 0 .25em 0;
290
+ font-weight: bold;
291
+ font-size: 115%;
292
+ }
293
+ #container-advanced-btns{
294
+ display: flex;
295
+ flex-wrap: wrap;
296
+ justify-content: space-between;
297
+ align-items: center;
298
+ }
299
+ .animate-spin {
300
+ animation: spin 1s linear infinite;
301
+ }
302
+ @keyframes spin {
303
+ from {
304
+ transform: rotate(0deg);
305
+ }
306
+ to {
307
+ transform: rotate(360deg);
308
+ }
309
+ }
310
+ #share-btn-container {
311
+ display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
312
+ margin-top: 10px;
313
+ margin-left: auto;
314
+ }
315
+ #share-btn {
316
+ all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important;right:0;
317
+ }
318
+ #share-btn * {
319
+ all: unset;
320
+ }
321
+ #share-btn-container div:nth-child(-n+2){
322
+ width: auto !important;
323
+ min-height: 0px !important;
324
+ }
325
+ #share-btn-container .wrap {
326
+ display: none !important;
327
+ }
328
+ .gr-form{
329
+ flex: 1 1 50%; border-top-right-radius: 0; border-bottom-right-radius: 0;
330
+ }
331
+ #prompt-container{
332
+ gap: 0;
333
+ }
334
+ #generated_id{
335
+ min-height: 700px
336
+ }
337
+ #setting_id{
338
+ margin-bottom: 12px;
339
+ text-align: center;
340
+ font-weight: 900;
341
+ }
342
+ """
343
+
344
+ with gr.Blocks(css=css) as interface:
345
+
346
+ gr.HTML(
347
+ """
348
+ <div style="text-align: center; max-width: 700px; margin: 0 auto;">
349
+ <div
350
+ style="
351
+ display: inline-flex;
352
+ align-items: center;
353
+ gap: 0.8rem;
354
+ font-size: 1.75rem;
355
+ "
356
+ >
357
+ <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
358
+ WavJourney: Compositional Audio Creation with LLMs
359
+ </h1>
360
+ </div>
361
+ <p style="margin-bottom: 10px; margin-top: 10px; font-size: 94%">
362
+ <a href="https://arxiv.org/abs/2307.14335">[Paper]</a> <a href="https://audio-agi.github.io/WavJourney_demopage/">[Demo Page]</a> <a href="https://github.com/Audio-AGI/WavJourney">[GitHub]</a> <a href="https://discord.com/invite/5Hqu9NmA8V">[Join Discord]</a>
363
+ </p>
364
+ </div>
365
+ """
366
+ )
367
+
368
+ gr.HTML(
369
+ """
370
+ <p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU (VRAM>16G) in settings.
371
+ <br>
372
+ <a href="https://huggingface.co/spaces/Audio-AGI/WavJourney?duplicate=true">
373
+ <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
374
+ <p/>
375
+ """
376
+ )
377
+
378
+
379
+
380
  system_voice_presets = get_system_voice_presets()
381
  # State
382
  ui_state = gr.State(value={'session_id': pipeline.init_session()})
383
  selected_voice_presets = gr.State(value={'selected_voice_preset': None})
384
  added_voice_preset_state = gr.State(value={'added_file': None, 'count': 0})
385
  # UI Component
386
+ gr.Markdown(
387
+ """
388
+ How can I access GPT-4? <a href="https://platform.openai.com/account/api-keys">[Guidence1]</a><a href="https://help.openai.com/en/articles/7102672-how-can-i-access-gpt-4">[Guidence2]</a>
389
+ """
390
+ )
391
+ key_text_input = gr.Textbox(label='Please Enter OPENAI Key for accessing GPT-4 API', lines=1, placeholder="OPENAI Key here.",
392
+ value=utils.get_key())
393
+ text_input_value = '' if DEBUG is False else "Generate a one-minute introduction to quantum mechanics"
394
+
395
+ text_input = gr.Textbox(
396
+ label='Input Text Instruction',
397
+ lines=2,
398
+ placeholder="Input instruction here (e.g., Generate a one-minute introduction to quantum mechanics).",
399
+ value=text_input_value,
400
+ elem_id="prompt-in",)
401
+
402
+ gr.Markdown(
403
+ """
404
+ Clicking 'Generate Script' button, the generated audio script will be displayed below.
405
+ """
406
+ )
407
+ audio_script_markdown = gr.Markdown(label='Audio Script')
408
  generate_script_btn = gr.Button(value='Generate Script', interactive=False)
409
+
410
+ gr.Markdown(
411
+ """
412
+ Clicking 'Generate Audio' button, the voice mapping results & generated audio will be displayed below.
413
+ """
414
+ )
415
+ char_voice_map_markdown = gr.Markdown(label='Character-to-voice Map')
416
+
417
+ audio_output = gr.Video(elem_id="output-video")
418
+
419
  generate_audio_btn = gr.Button(value='Generate Audio', interactive=False)
420
+
421
+ clear_btn = gr.ClearButton(value='Clear All')
422
+
423
+ # share to community
424
+ with gr.Group(elem_id="share-btn-container", visible=False):
425
+ community_icon = gr.HTML(community_icon_html)
426
+ loading_icon = gr.HTML(loading_icon_html)
427
+ share_button = gr.Button(value="Share to community", elem_id="share-btn")
428
+
429
  # System Voice Presets
430
  gr.Markdown(label='System Voice Presets', value='# System Voice Presets')
431
  system_markdown_voice_presets = gr.Dataframe(label='System Voice Presets', headers=VOICE_PRESETS_HEADERS,
432
  value=system_voice_presets)
433
  # User Voice Preset Related
434
+ gr.Markdown('# (Optional) Speaker Customization ')
435
+ with gr.Accordion("Click to add speakers", open=False):
436
+ gr.Markdown(label='User Voice Presets', value='## User Voice Presets')
437
+ get_voice_preset_to_list(ui_state)
438
+ voice_presets_df = gr.Dataframe(headers=VOICE_PRESETS_HEADERS, col_count=len(VOICE_PRESETS_HEADERS),
439
  value=get_voice_preset_to_list(ui_state), interactive=False, visible=False)
440
  # voice_presets_ds = gr.Dataset(components=[gr.Dataframe(visible=True)], samples=get_voice_preset_to_list(ui_state))
441
+ del_voice_btn = gr.Button(value='Delete Selected Voice Preset', visible=False)
442
+ gr.Markdown(label='Add Voice Preset', value='## Add Voice Preset')
443
+ gr.Markdown(
444
+ """
445
+
446
+ What makes for good voice prompt? See detailed instructions <a href="https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer">here</a>.
447
+ """
448
+ )
449
+ vp_text_id = gr.Textbox(label='Id', lines=1, placeholder="Input voice preset id here.")
450
+ vp_text_desc = gr.Textbox(label='Desc', lines=1, placeholder="Input description here.")
451
+ vp_file = gr.File(label='Wav File', type='file', file_types=['.wav'],
452
+ interactive=True)
453
+ vp_submit = gr.Button(label='Upload Voice Preset', value="Upload Voice Preset")
454
+
455
+ # disclaimer
456
+ gr.Markdown(
457
+ """
458
+ # Disclaimer
459
+ We are not responsible for audio generated using semantics created by WavJourney. Just don't use it for illegal purposes.
460
+ """
461
+ )
462
+
463
  # events
464
  key_text_input.change(fn=set_openai_key, inputs=[key_text_input], outputs=[key_text_input])
465
  text_input.change(fn=textbox_listener, inputs=[text_input], outputs=[generate_script_btn])
 
467
  fn=generate_audio_fn,
468
  inputs=[ui_state],
469
  outputs=[
470
+ char_voice_map_markdown,
471
  audio_output,
472
  generate_audio_btn,
473
  generate_script_btn,
 
477
  api_name='audio_journey',
478
  )
479
  generate_audio_btn.click(
480
+ fn=lambda: [
481
  gr.Button.update(interactive=False),
482
  gr.Button.update(interactive=False),
483
  gr.Button.update(interactive=False),
 
491
  ]
492
  )
493
  clear_btn.click(fn=clear_fn, inputs=ui_state,
494
+ outputs=[text_input, audio_output, audio_script_markdown, generate_audio_btn, generate_script_btn,
495
  ui_state, voice_presets_df, del_voice_btn,
496
  vp_text_id, vp_text_desc, vp_file])
497
  generate_script_btn.click(
498
  fn=generate_script_fn, inputs=[text_input, ui_state],
499
  outputs=[
500
+ audio_script_markdown,
501
  ui_state,
502
  generate_audio_btn,
503
  generate_script_btn,
 
506
  ]
507
  )
508
  generate_script_btn.click(
509
+ fn=lambda: [
510
  gr.Button.update(interactive=False),
511
  gr.Button.update(interactive=False),
512
  gr.Button.update(interactive=False),
 
529
  vp_submit,
530
  voice_presets_df, del_voice_btn])
531
  vp_submit.click(lambda _: gr.Button.update(interactive=False), inputs=[vp_submit])
532
+
533
+ # share to HF community
534
+ share_button.click(None, [], [], _js=share_js)
535
+
536
  # debug only
537
  # print_state_btn = gr.Button(value='Print State')
538
  # print_state_btn.click(fn=lambda state, state2: print(state, state2), inputs=[ui_state, selected_voice_presets])
webapp/app.prompt DELETED
@@ -1,18 +0,0 @@
1
- write a web app in python and flask and bootstrap.
2
-
3
- The UI:
4
- - input textbox named "InputTextbox" on top
5
- - "Generate All" button named GenerateAllButton at the same row as input textbox 1
6
- - a button "Text -> Script" called TextToScriptButton
7
- - A big textbox named "ScriptTextbox". The textbox should be set to wrap-word mode
8
- - A split line
9
- - a button "Script -> HAML" called ScriptToHAMLButton
10
- - A big textbox named "HAMLTextbox". The textbox should be set to wrap-word mode and display text with HTML syntax format.
11
- - A split line
12
- - a button "HAML -> Python Code" called HAMLToPythonCodeButton
13
- - A big textbox named "PythonCodeTextbox". The textbox should be set to wrap-word mode and display text with python syntax format.
14
-
15
- Behaviors:
16
- - When the user clicks TextToScriptButton, it will call the ChatGPT API: concatenate the prompt read from "prompts/text_to_audio_script.prompt" with the content of InputTextbox, send the result to ChatGPT, and output ChatGPT's response to ScriptTextbox.
17
- - When the user clicks ScriptToHAMLButton, it will call the ChatGPT API: concatenate the prompt read from "prompts/audio_script_to_HAML.prompt" with the content of ScriptTextbox, send the result to ChatGPT, and output ChatGPT's response to HAMLTextbox.
18
- - When the user clicks HAMLToPythonCodeButton, it will get the content of HAMLTextbox, pipe it to the Python script convert_haml_to_py_code.py, and write the script's output to PythonCodeTextbox.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
webapp/app.py DELETED
@@ -1,43 +0,0 @@
1
- from flask import Flask, request, render_template
2
- import os
3
- import subprocess
4
-
5
- app = Flask(__name__)
6
-
7
- def call_chatgpt(prompt_file, input_text):
8
- # Your actual function to call the ChatGPT API will go here
9
- # For now, return a placeholder string
10
- with open(prompt_file, 'r') as file:
11
- prompt = file.read()
12
- return f"Prompt: {prompt}\nInput: {input_text}"
13
-
14
- def call_convert_script(input_text):
15
- # Your actual function to call the script will go here
16
- # For now, return a placeholder string
17
- # Run the script and capture the output
18
- process = subprocess.Popen(['python', '../convert_haml_to_py_code.py'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
19
- output, error = process.communicate(input=input_text)
20
- return output + error
21
-
22
- @app.route('/', methods=['GET', 'POST'])
23
- def index():
24
- if request.method == 'POST':
25
- input_text = request.form.get('InputTextbox', '')
26
- script_text = request.form.get('ScriptTextbox', '')
27
- haml_text = request.form.get('HAMLTextbox', '')
28
- python_code_text = request.form.get('PythonCodeTextbox', '')
29
- if 'TextToScriptButton' in request.form:
30
- script_text = call_chatgpt('../prompts/text_to_audio_script.prompt', input_text)
31
-
32
- elif 'ScriptToHAMLButton' in request.form:
33
- haml_text = call_chatgpt('../prompts/audio_script_to_HAML.prompt', script_text)
34
-
35
- elif 'HAMLToPythonCodeButton' in request.form:
36
- python_code_text = call_convert_script(haml_text)
37
-
38
- return render_template('index.html', haml_text=haml_text, python_code_text=python_code_text, script_text=script_text, input_text=input_text)
39
-
40
- return render_template('index.html')
41
-
42
- if __name__ == '__main__':
43
- app.run(debug=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
webapp/templates/index.html DELETED
@@ -1,30 +0,0 @@
1
- <!doctype html>
2
- <html lang="en">
3
- <head>
4
- <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.5.0/css/bootstrap.min.css" rel="stylesheet">
5
- </head>
6
- <body>
7
- <div class="container">
8
- <form method="POST">
9
- <div class="form-group">
10
- <textarea type="text" class="form-control" name="InputTextbox" placeholder="Enter text" style="word-wrap: break-word;"> {{ input_text }}</textarea>
11
- <button type="submit" class="btn btn-primary" name="TextToScriptButton">Text -> Script</button>
12
- </div>
13
- <div class="form-group">
14
- <textarea class="form-control" rows="5" name="ScriptTextbox" placeholder="Script Output" style="word-wrap: break-word;">{{ script_text }}</textarea>
15
- <button type="submit" class="btn btn-primary" name="ScriptToHAMLButton">Script -> HAML</button>
16
- </div>
17
- <hr>
18
- <div class="form-group">
19
- <textarea class="form-control" rows="5" name="HAMLTextbox" placeholder="HAML Output" style="word-wrap: break-word;">{{ haml_text }}</textarea>
20
- <button type="submit" class="btn btn-primary" name="HAMLToPythonCodeButton">HAML -> Python Code</button>
21
- </div>
22
- <hr>
23
- <div class="form-group">
24
- <textarea class="form-control" rows="5" name="PythonCodeTextbox" placeholder="Python Code Output" style="word-wrap: break-word;">{{ python_code_text }}</textarea>
25
- </div>
26
- </form>
27
- </div>
28
- <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.5.0/js/bootstrap.min.js"></script>
29
- </body>
30
- </html>