Taino committed
Commit 2f8f70d
1 Parent(s): 08767d3

Update app.py

Files changed (1)
  1. app.py +29 -226
app.py CHANGED
@@ -1,233 +1,36 @@
- import os, sys, re, json
- import argparse
- import shutil
- import warnings
- import whisper_timestamped as wt
- from pdb import set_trace as b
- from pprint import pprint as pp
- from profanity_check import predict, predict_prob
- from pydub import AudioSegment
- from pydub.playback import play
- from subprocess import Popen, PIPE
  import gradio as gr
-
- def parse_args():
-     """
-     """
-     parser = argparse.ArgumentParser(
-         description=('Tool to mute profanities in a song (source separation -> speech recognition -> profanity detection -> mask profanities -> re-mix)'),
-         usage=('see <py main.py --help> or run as local web app with streamlit: <streamlit run main.py>')
-     )
-
-     parser.add_argument(
-         '-i',
-         '--input',
-         default=None,
-         nargs='?',
-         #required=True,
-         help=("path to a mp3")
-     )
-     parser.add_argument(
-         '-m',
-         '--model',
-         default='small',
-         nargs='?',
-         help=("model used by whisper for speech recognition: tiny, small (default) or medium")
      )
-     parser.add_argument(
-         '-p',
-         '--play',
-         default=False,
-         action='store_true',
-         help=("play output audio at the end")
-     )
-     parser.add_argument(
-         '-v',
-         '--verbose',
-         default=True,
-         action='store_true',
-         help=("print transcribed text and detected profanities to screen")
-     )
-     return parser.parse_args()
-
-
- def main(args, input_file=None, model_size=None, verbose=False, play_output=False, skip_ss=False):
-     """
-     """
-     if not input_file:
-         input_file = args.input
-
-     if not model_size:
-         model_size = args.model
-
-     if not verbose:
-         verbose = args.verbose
-
-     if not play_output:
-         play_output = args.play
-
-     # exit if input file not found
-     if len(sys.argv)>1 and not os.path.isfile(input_file):
-         print('Error: --input file not found')
-         raise Exception
-
-     print(f'\nProcessing input file: {input_file}')
-
-     if not skip_ss:
-         # split audio into vocals + accompaniment
-         print('Running source separation')
-         stems_dir = source_separation(input_file, use_demucs=False, use_spleeter=True)
-         vocal_stem = os.path.join(stems_dir, 'vocals.wav')
-         #instr_stem = os.path.join(stems_dir, 'no_vocals.wav') # demucs
-         instr_stem = os.path.join(stems_dir, 'accompaniment.wav') # spleeter
-         print(f'Vocal stem written to: {vocal_stem}')
-     else:
-         vocal_stem = input_file
-         instr_stem = None
-
-     audio = wt.load_audio(vocal_stem)
-     model = wt.load_model(model_size, device='cpu')
-     text = wt.transcribe(model, audio, language='en')
-
-     if verbose:
-         print('\nTranscribed text:')
-         print(text['text']+'\n')
-
-     # checking for profanities in text
-     print('Run profanity detection on text')
-     profanities = profanity_detection(text)
-     if not profanities:
-         print(f'No profanities found in {input_file} - exiting')
-         return 'No profanities found', None, None
-
-     if verbose:
-         print('profanities found in text:')
-         pp(profanities)
-
-     # masking
-     print('Mask profanities in vocal stem')
-     vocals = mask_profanities(vocal_stem, profanities)
-
-     # re-mixing
-     print('Merge instrumentals stem and masked vocals stem')
-     if not skip_ss:
-         mix = AudioSegment.from_wav(instr_stem).overlay(vocals)
-     else:
-         mix = vocals
-
-     # write mix to file
-     outpath = input_file.replace('.mp3', '_masked.mp3').replace('.wav', '_masked.wav')
-     if input_file.endswith('.wav'):
-         mix.export(outpath, format="wav")
-     elif input_file.endswith('.mp3'):
-         mix.export(outpath, format="mp3")
-     print(f'Mixed file written to: {outpath}')
-
-     # play output
-     if play_output:
-         print('\nPlaying output...')
-         play(mix)
-
-     return outpath, vocal_stem, instr_stem
-
-
- def source_separation(inpath, use_demucs=False, use_spleeter=True):
-     """
-     Execute shell command to run demucs and pipe stdout/stderr back to python
-     """
-     infile = os.path.basename(inpath)
-
-     if use_demucs:
-         cmd = f'demucs --two-stems=vocals --jobs 8 "{inpath}"'
-         #stems_dir = os.path.join(re.findall('/.*', stdout)[0], infile.replace('.mp3','').replace('.wav',''))
-     elif use_spleeter:
-         outdir = 'audio/separated'
-         cmd = f'spleeter separate {inpath} -p spleeter:2stems -o {outdir}'
-         stems_dir = os.path.join(outdir, os.path.splitext(infile)[0])
-
-     stdout, stderr = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True, executable='/bin/bash').communicate()
-     stdout = stdout.decode('utf8')
-
-     # exit if lib error'd out
-     if stderr:
-         stderr = stderr.decode('utf-8').lower()
-         if 'error' in stderr or 'not exist' in stderr:
-             print(stderr.decode('utf8').split('\n')[0])
-             raise Exception
-
-     # parse stems directory path from stdout and return it if successful
-     if not os.path.isdir(stems_dir):
-         print(f'Error: output stem directory "{stems_dir}" not found')
-         raise Exception
-
-     return stems_dir
-
-
- def profanity_detection(text):
-     """
-     """
-     # detect profanities in text
-     profs = []
-     for segment in text['segments']:
-         for word in segment['words']:
-             #if word['confidence']<.25:
-             #    print(word)
-             text = word['text'].replace('.','').replace(',','').lower()
-
-             # skip false positives
-             if text in ['cancer','hell','junk','die','lame','freak','freaky','white','stink','shut','spit','mouth','orders','eat','clouds','ugly','dirty','wet']:
-                 continue
-
-             # assume anything returned by whisper with more than 1 * is profanity e.g n***a
-             if '**' in text:
-                 profs.append(word)
-                 continue
-
-             # add true negatives
-             if text in ['bitchy', 'puss']:
-                 profs.append(word)
-                 continue
-
-             # run profanity detection - returns 1 (True) or 0 (False)
-             if predict([word['text']])[0]:
-                 profs.append(word)
-
-     return profs
-
 
- def mask_profanities(vocal_stem, profanities):
-     """
-     """
-     # load vocal stem and mask profanities
-     vocals = AudioSegment.from_wav(vocal_stem)
-     for prof in profanities:
-         mask = vocals[prof['start']*1000:prof['end']*1000] # pydub works in milliseconds
-         mask -= 50 # reduce lvl by some dB (enough to ~mute it)
-         #mask = mask.silent(len(mask))
-         #mask = mask.fade_in(100).fade_out(100) # it prepends/appends fades so end up with longer mask
-         start = vocals[:prof['start']*1000]
-         end = vocals[prof['end']*1000:]
-         #print(f"masking {prof['text']} from {prof['start']} to {prof['end']}")
-         vocals = start + mask + end
 
-     return vocals
 
 
- def process_audio(input_file, model_size):
-     args = parse_args()
-     inpath = os.path.abspath(input_file.name)
-     outpath, vocal_stem, instr_stem = main(args, input_file=inpath, model_size=model_size)
-     return outpath
-
- if __name__ == "__main__":
-     args = parse_args()
-
-     if len(sys.argv)>1:
-         main(args, skip_ss=False)
-     else:
-         iface = gr.Interface(
-             fn=process_audio,
-             inputs=[gr.Audio(source="upload"), gr.Radio(['tiny','small','medium'])],
-             outputs='audio'
-         )
-         iface.launch()
  import gradio as gr
+ import librosa
+ import pydub
+ import profanity_check
+ import openai
+
+ def clean_song(file_path):
+     # Load the audio file and isolate the acapella.
+     audio_file = librosa.load(file_path)
+     acapella = librosa.effects.trim(audio_file, top_db=60)
+
+     # Transcribe the acapella with the OpenAI Whisper model.
+     transcript = openai.engine("text-davinci-002").generate(
+         text="What is the acapella of this song?",
+         prompt="Listen to this audio file: " + acapella.to_wav().hex(),
+         temperature=0.7,
+         max_tokens=200,
      )
+
+     # Find the timestamps of the profane words in the transcript.
+     profane_words = profanity_check.get_profanity(transcript)
+     timestamps = [
+         (m.start(), m.end()) for m in profanity_check.match_all(transcript)
+     ]
+
+     # Mute the profane words in the audio file.
+     audio = pydub.AudioSegment.from_wav(file_path)
+     for start, end in timestamps:
+         audio[start:end].set_volume(0)
+
+     # Save the clean audio file.
+     audio.export("clean_song.wav", format="wav")
+
+     return "Clean audio file saved as clean_song.wav"
+
+ gr.Interface(clean_song, inputs="file", outputs="text").launch()
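
As committed, clean_song reads more like pseudocode than runnable Python: librosa.load returns a (samples, sample_rate) tuple rather than an object with to_wav(), librosa.effects.trim only strips leading and trailing silence instead of isolating an acapella, the openai package does not expose an engine(...).generate(...) call (and a text-completion model cannot transcribe audio), profanity_check exposes predict and predict_prob rather than get_profanity or match_all, and pydub slices are copies with no set_volume method, so the muting loop never changes the audio. The following is a minimal sketch of the same pipeline that sticks to calls the previous revision already relied on (whisper_timestamped for word-level timestamps, profanity_check.predict for flagging, pydub for masking); the model size, output filename, and Gradio component arguments are assumptions, not part of this commit.

    # Sketch of a working clean_song, assuming whisper_timestamped and
    # profanity-check (both used by the previous revision) are installed.
    import gradio as gr
    import whisper_timestamped as wt
    from profanity_check import predict
    from pydub import AudioSegment


    def clean_song(file_path):
        # Transcribe with word-level timestamps. The 'small' model size is an
        # assumption; 'tiny' or 'medium' work the same way.
        model = wt.load_model('small', device='cpu')
        audio = wt.load_audio(file_path)
        result = wt.transcribe(model, audio, language='en')

        # Collect (start, end) in seconds for every word flagged as profane.
        spans = [
            (word['start'], word['end'])
            for segment in result['segments']
            for word in segment['words']
            if predict([word['text']])[0]
        ]

        # Attenuate each flagged span. pydub slices are copies and indexed in
        # milliseconds, so the segment is rebuilt around the quietened piece.
        song = AudioSegment.from_file(file_path)
        for start, end in spans:
            start_ms, end_ms = int(start * 1000), int(end * 1000)
            muted = song[start_ms:end_ms] - 60  # -60 dB is effectively silence
            song = song[:start_ms] + muted + song[end_ms:]

        # Write the cleaned mix and hand the path back to Gradio.
        out_path = 'clean_song.wav'
        song.export(out_path, format='wav')
        return out_path


    # Gradio wiring: source=/type= follow the Gradio 3.x API used by the
    # previous revision; newer Gradio versions use sources=['upload'] instead.
    iface = gr.Interface(
        fn=clean_song,
        inputs=gr.Audio(source='upload', type='filepath'),
        outputs='audio',
    )

    if __name__ == '__main__':
        iface.launch()

Swapping predict for predict_prob with a tunable threshold would give finer control over which words get muted, and the spleeter/demucs source-separation step from the previous revision could be reinserted ahead of transcription if masking the full mix proves too blunt.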