Spaces:
Sleeping
Sleeping
Update src/UltraSinger.py
Browse files- src/UltraSinger.py +984 -130
src/UltraSinger.py
CHANGED
@@ -1,140 +1,994 @@
|
|
1 |
-
|
2 |
-
import subprocess
|
3 |
-
|
4 |
-
def run_ultrasinger(opt_i, youtube_link, opt_o, mode, whisper_model, language, crepe_model, extra, device):
    """Build an UltraSinger command line from the UI inputs and run it.

    Args:
        opt_i: Uploaded input file; only its ``name`` attribute is read.
        youtube_link: Link used as input when no file was uploaded.
        opt_o: Output folder, or a falsy value to omit ``-o``.
        mode: ``"default"`` or one of the keys of ``mode_flags`` below.
        whisper_model: Value passed to ``--whisper`` (omitted when falsy).
        language: Display name of a language listed in ``language_codes``
            (omitted when falsy).
        crepe_model: Value passed to ``--crepe``.
        extra: Whitespace-separated extra command-line arguments.
        device: Whitespace-separated device arguments.

    Returns:
        ``(stdout, stderr)`` of the finished process. When neither an input
        file nor a link is given, an error message and an empty string. When
        starting the process raises, the exception text and a fixed error
        string.

    Raises:
        KeyError: If ``mode`` or ``language`` is not one of the known names.
    """
    cmd = ["python", "UltraSinger.py"]

    # The command is executed as an argv list (no shell), so values must NOT
    # be wrapped in literal quote characters: the quotes would become part of
    # the path or URL that UltraSinger receives.
    if opt_i:
        cmd.extend(["-i", opt_i.name])
    elif youtube_link:
        cmd.extend(["-i", youtube_link])
    else:
        return "Error: No input file or YouTube link provided", ""

    # Output folder
    if opt_o:
        cmd.extend(["-o", opt_o])

    # Mode flag
    if mode != "default":
        mode_flags = {
            "Create Ultrastar txt file": "-u",
            "Create MIDI file": "-m",
            "Create sheet file": "-s",
        }
        cmd.append(mode_flags[mode])

    # Transcription options
    if whisper_model:
        cmd.extend(["--whisper", whisper_model])
    if language:
        language_codes = {
            "English": "en", "French": "fr", "German": "de", "Spanish": "es",
            "Italian": "it", "Japanese": "ja", "Chinese": "zh", "Dutch": "nl",
            "Ukrainian": "uk", "Portuguese": "pt",
        }
        cmd.extend(["--language", language_codes[language]])

    # Pitcher options
    cmd.extend(["--crepe", crepe_model])

    # Extra and device options arrive as single strings from the UI
    if extra:
        cmd.extend(extra.split())
    if device:
        cmd.extend(device.split())

    # Debug: print the command to check that it is constructed correctly
    print("Running command:", ' '.join(cmd))

    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
        return result.stdout, result.stderr
    except Exception as e:
        return str(e), "Error occurred during execution"
|
60 |
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
|
|
|
|
79 |
)
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
|
|
87 |
)
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
)
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
|
|
100 |
)
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
choices=[
|
105 |
-
"", "--force_cpu True", "--force_cpu False",
|
106 |
-
"--force_whisper_cpu True", "--force_whisper_cpu False",
|
107 |
-
"--force_crepe_cpu True", "--force_crepe_cpu False"
|
108 |
-
],
|
109 |
-
value=""
|
110 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
|
112 |
-
|
113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
|
115 |
-
# Gradio interface that passes the UI widgets to run_ultrasinger and shows
# its two return values in output_text and error_text.
ultrasinger_tab = gr.Interface(
    fn=run_ultrasinger,
    inputs=[opt_i, youtube_link, opt_o, mode, whisper_model, language, crepe_model, extra, device],
    outputs=[output_text, error_text],
    title="UltraSinger UI",
    description="Upload an Ultrastar.txt or an audio file, set the options, and run UltraSinger."
)

# Text shown on the "Info" and "FOR USDB USERS" tabs, loaded from files
tab1_content = load_text_file("info.txt")
tab2_content = load_text_file("usdb.txt")

# Assemble the tabbed application: the UltraSinger form plus two Markdown tabs
with gr.Blocks(theme="soft") as demo:
    with gr.Tabs():
        with gr.TabItem("UltraSinger"):
            ultrasinger_tab.render()
        with gr.TabItem("Info"):
            gr.Markdown(tab1_content)
        with gr.TabItem("FOR USDB USERS"):
            gr.Markdown(tab2_content)
|
137 |
-
|
138 |
-
# Launch the app
|
139 |
if __name__ == "__main__":
|
140 |
-
|
|
|
1 |
+
"""UltraSinger uses AI to automatically create UltraStar song files"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
+
import copy
|
4 |
+
import getopt
|
5 |
+
import os
|
6 |
+
import sys
|
7 |
+
import re
|
8 |
+
|
9 |
+
import Levenshtein
|
10 |
+
import librosa
|
11 |
+
|
12 |
+
from tqdm import tqdm
|
13 |
+
from packaging import version
|
14 |
+
|
15 |
+
import soundfile as sf
|
16 |
+
|
17 |
+
from modules import os_helper
|
18 |
+
from modules.Audio.denoise import ffmpeg_reduce_noise
|
19 |
+
from modules.Audio.separation import separate_audio
|
20 |
+
from modules.Audio.vocal_chunks import (
|
21 |
+
export_chunks_from_transcribed_data,
|
22 |
+
export_chunks_from_ultrastar_data,
|
23 |
)
|
24 |
+
from modules.Audio.silence_processing import remove_silence_from_transcription_data, get_silence_sections
|
25 |
+
from modules.csv_handler import export_transcribed_data_to_csv
|
26 |
+
from modules.Audio.convert_audio import convert_audio_to_mono_wav, convert_wav_to_mp3
|
27 |
+
from modules.Audio.youtube import (
|
28 |
+
download_youtube_audio,
|
29 |
+
download_youtube_thumbnail,
|
30 |
+
download_youtube_video,
|
31 |
+
get_youtube_title,
|
32 |
)
|
33 |
+
from modules.DeviceDetection.device_detection import check_gpu_support
|
34 |
+
from modules.console_colors import (
|
35 |
+
ULTRASINGER_HEAD,
|
36 |
+
blue_highlighted,
|
37 |
+
gold_highlighted,
|
38 |
+
light_blue_highlighted,
|
39 |
+
red_highlighted,
|
40 |
)
|
41 |
+
from modules.Midi import midi_creator
|
42 |
+
from modules.Midi.midi_creator import (
|
43 |
+
convert_frequencies_to_notes,
|
44 |
+
create_midi_notes_from_pitched_data,
|
45 |
+
most_frequent,
|
46 |
)
|
47 |
+
from modules.Pitcher.pitcher import (
|
48 |
+
get_frequencies_with_high_confidence,
|
49 |
+
get_pitch_with_crepe_file,
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
)
|
51 |
+
from modules.Pitcher.pitched_data import PitchedData
|
52 |
+
from modules.Speech_Recognition.hyphenation import hyphenation, language_check, create_hyphenator
|
53 |
+
from modules.Speech_Recognition.Whisper import transcribe_with_whisper
|
54 |
+
from modules.Ultrastar import ultrastar_score_calculator, ultrastar_writer, ultrastar_converter, ultrastar_parser
|
55 |
+
from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue
|
56 |
+
from Settings import Settings
|
57 |
+
from modules.Speech_Recognition.TranscribedData import TranscribedData
|
58 |
+
from modules.plot import plot, plot_spectrogram
|
59 |
+
from modules.musicbrainz_client import get_music_infos
|
60 |
|
61 |
+
# Module-wide settings object; the functions below read their options from it
# and run() also writes settings.processing_audio_path.
settings = Settings()
|
62 |
+
|
63 |
+
|
64 |
+
def convert_midi_notes_to_ultrastar_notes(midi_notes: list[str]) -> list[int]:
    """Translate note names into Ultrastar pitch numbers.

    Each entry of ``midi_notes`` is converted to a MIDI number with
    ``librosa.note_to_midi`` and then mapped through
    ``ultrastar_converter.midi_note_to_ultrastar_note``. The result has one
    entry per input note, in the same order.
    """
    print(f"{ULTRASINGER_HEAD} Creating Ultrastar notes from midi data")

    # TODO: report progress?
    return [
        ultrastar_converter.midi_note_to_ultrastar_note(
            librosa.note_to_midi(note_name)
        )
        for note_name in midi_notes
    ]
|
81 |
+
|
82 |
+
|
83 |
+
def pitch_each_chunk_with_crepe(directory: str) -> list[str]:
    """Pitch every ``.wav`` chunk in ``directory`` and return one note per chunk.

    Files are processed in ascending order of the integer found between the
    first and second underscore of the file name. For each file the crepe
    pitcher is run with the model capacity, step size and device taken from
    ``settings``; the high-confidence frequencies are converted to notes and
    the most frequent note is kept.
    """
    print(
        f"{ULTRASINGER_HEAD} Pitching each chunk with {blue_highlighted('crepe')}"
    )

    wav_files = [name for name in os.listdir(directory) if name.endswith(".wav")]
    wav_files.sort(key=lambda name: int(name.split("_")[1]))

    midi_notes = []
    for wav_file in wav_files:
        # TODO: stepsize = duration? A chunk shorter than the step size should
        # use its duration, otherwise more notes are produced.
        pitched_data = get_pitch_with_crepe_file(
            os.path.join(directory, wav_file),
            settings.crepe_model_capacity,
            settings.crepe_step_size,
            settings.tensorflow_device,
        )
        confident_frequencies = get_frequencies_with_high_confidence(
            pitched_data.frequencies, pitched_data.confidence
        )
        chunk_notes = convert_frequencies_to_notes(confident_frequencies)
        # TODO: report progress?
        midi_notes.append(most_frequent(chunk_notes)[0][0])

    return midi_notes
|
114 |
+
|
115 |
+
|
116 |
+
def add_hyphen_to_data(transcribed_data: list[TranscribedData], hyphen_words: list[list[str]]):
    """Split transcribed words into their hyphenated parts and return a new list.

    ``hyphen_words[i]`` holds the parts for ``transcribed_data[i]``. An entry
    with a falsy value is passed through unchanged (the same object). For
    every other entry one shallow copy per part is emitted; the word's time
    span is divided evenly between the parts, each copy is flagged with
    ``is_hyphen = True`` and only the last part gets ``is_word_end = True``.
    """
    new_data = []

    for i, data in enumerate(transcribed_data):
        parts = hyphen_words[i]
        if not parts:
            new_data.append(data)
            continue

        part_count = len(parts)
        chunk_duration = (data.end - data.start) / part_count

        next_start = data.start
        for part_index, part in enumerate(parts):
            dup = copy.copy(data)
            dup.start = next_start
            # Computed backwards from the word end so the last part finishes
            # exactly at data.end.
            next_start = data.end - chunk_duration * (part_count - 1 - part_index)
            dup.end = next_start
            dup.word = part
            dup.is_hyphen = True
            dup.is_word_end = part_index == part_count - 1
            new_data.append(dup)

    return new_data
|
145 |
+
|
146 |
+
|
147 |
+
def get_bpm_from_data(data, sampling_rate):
    """Estimate the tempo of an audio signal, print it and return it.

    Args:
        data: Audio samples, passed to librosa as ``y``.
        sampling_rate: Sampling rate of ``data``, passed to librosa as ``sr``.

    Returns:
        The first element of the tempo estimate returned by librosa.
    """
    onset_env = librosa.onset.onset_strength(y=data, sr=sampling_rate)

    # librosa 0.10 moved the tempo estimator to ``librosa.feature.tempo`` and
    # deprecated ``librosa.beat.tempo``; use the new location when available
    # and fall back to the old one on earlier releases.
    estimate_tempo = getattr(librosa.feature, "tempo", None) or librosa.beat.tempo
    wav_tempo = estimate_tempo(onset_envelope=onset_env, sr=sampling_rate)

    print(
        f"{ULTRASINGER_HEAD} BPM is {blue_highlighted(str(round(wav_tempo[0], 2)))}"
    )
    return wav_tempo[0]
|
156 |
+
|
157 |
+
|
158 |
+
def get_bpm_from_file(wav_file: str) -> float:
    """Load ``wav_file`` at its native sampling rate and return its estimated BPM."""
    samples, native_rate = librosa.load(wav_file, sr=None)
    bpm = get_bpm_from_data(samples, native_rate)
    return bpm
|
162 |
+
|
163 |
+
|
164 |
+
def correct_words(recognized_words, word_list_file):
    """Replace unknown recognized words with the closest word from a word list.

    Args:
        recognized_words: Objects with a mutable ``word`` attribute; they are
            modified in place.
        word_list_file: Path of a UTF-8 text file whose whitespace-separated
            tokens form the list of accepted words.

    Returns:
        The same ``recognized_words`` list. Words already in the list are left
        untouched; every other word is replaced by the accepted word with the
        smallest Levenshtein distance (the first one wins on ties) and the
        replacement is printed.
    """
    with open(word_list_file, "r", encoding="utf-8") as file:
        word_list = file.read().split()

    # Nothing to match against: leave the words untouched instead of letting
    # ``min()`` fail on an empty sequence.
    if not word_list:
        return recognized_words

    known_words = set(word_list)  # constant-time membership test per word

    for rec_word in recognized_words:
        if rec_word.word in known_words:
            continue

        # Search the ordered list (not the set) so ties resolve deterministically.
        closest_word = min(
            word_list, key=lambda candidate: Levenshtein.distance(rec_word.word, candidate)
        )
        print(rec_word.word + " - " + closest_word)
        rec_word.word = closest_word
    return recognized_words
|
180 |
+
|
181 |
+
|
182 |
+
def print_help() -> None:
    """Print the command-line usage text.

    The text lists the option groups [opt], [mode], [transcription],
    [pitcher], [extra] and [device] together with their defaults.
    """
    # NOTE(review): column alignment inside this string was reconstructed;
    # the wording follows the original, with typos corrected.
    help_string = """
    UltraSinger.py [opt] [mode] [transcription] [pitcher] [extra]

    [opt]
    -h      This help text.
    -i      Ultrastar.txt
            audio like .mp3, .wav, youtube link
    -o      Output folder

    [mode]
    ## INPUT is audio ##
    default  Creates all

    # Single file creation selection is in progress, you are currently getting all!
    (-u      Create ultrastar txt file) # In Progress
    (-m      Create midi file) # In Progress
    (-s      Create sheet file) # In Progress

    ## INPUT is ultrastar.txt ##
    default  Creates all

    # Single selection is in progress, you are currently getting all!
    (-r      repitch Ultrastar.txt (input has to be audio)) # In Progress
    (-p      Check pitch of Ultrastar.txt input) # In Progress
    (-m      Create midi file) # In Progress

    [transcription]
    # Default is whisper
    --whisper               Multilingual model > tiny|base|small|medium|large-v1|large-v2  >> ((default) is large-v2)
                            English-only model > tiny.en|base.en|small.en|medium.en
    --whisper_align_model   Use other languages model for Whisper provided from huggingface.co
    --language              Override the language detected by whisper, does not affect transcription but steps after transcription
    --whisper_batch_size    Reduce if low on GPU mem >> ((default) is 16)
    --whisper_compute_type  Change to "int8" if low on GPU mem (may reduce accuracy) >> ((default) is "float16" for cuda devices, "int8" for cpu)

    [pitcher]
    # Default is crepe
    --crepe            tiny|full >> ((default) is full)
    --crepe_step_size  unit is milliseconds >> ((default) is 10)

    [extra]
    --hyphenation           True|False >> ((default) is True)
    --disable_separation    True|False >> ((default) is False)
    --disable_karaoke       True|False >> ((default) is False)
    --create_audio_chunks   True|False >> ((default) is False)
    --keep_cache            True|False >> ((default) is False)
    --plot                  True|False >> ((default) is False)
    --format_version        0.3.0|1.0.0|1.1.0 >> ((default) is 1.0.0)

    [device]
    --force_cpu             True|False >> ((default) is False)  All steps will be forced to cpu
    --force_whisper_cpu     True|False >> ((default) is False)  Only whisper will be forced to cpu
    --force_crepe_cpu       True|False >> ((default) is False)  Only crepe will be forced to cpu
    """
    print(help_string)
|
239 |
+
|
240 |
+
|
241 |
+
def remove_unecessary_punctuations(transcribed_data: list[TranscribedData]) -> None:
    """Remove every period and comma from the word of each entry, in place.

    Args:
        transcribed_data: Entries whose ``word`` attribute is rewritten.
    """
    # Build the deletion table once instead of on every iteration.
    strip_table = str.maketrans("", "", ".,")
    for data in transcribed_data:
        data.word = data.word.translate(strip_table)
|
248 |
+
|
249 |
+
|
250 |
+
def hyphenate_each_word(language: str, transcribed_data: list[TranscribedData]) -> list[list[str]] | None:
    """Hyphenate the word of every transcribed entry.

    Args:
        language: Language passed to ``language_check`` to pick the
            hyphenation region.
        transcribed_data: Entries whose ``word`` attribute is hyphenated.

    Returns:
        One ``hyphenation()`` result per entry, in input order, or ``None``
        (after printing an error) when ``language_check`` returns ``None`` or
        when creating the hyphenator or hyphenating a word raises.
    """

    def report_failure() -> None:
        print(
            f"{ULTRASINGER_HEAD} {red_highlighted('Error in hyphenation for language ')} {blue_highlighted(language)}{red_highlighted(', maybe you want to disable it?')}"
        )

    lang_region = language_check(language)
    if lang_region is None:
        report_failure()
        return None

    hyphenated_word = []
    try:
        hyphenator = create_hyphenator(lang_region)
        # Iterating the list directly lets tqdm show the total count.
        for data in tqdm(transcribed_data):
            hyphenated_word.append(hyphenation(data.word, hyphenator))
    except Exception:
        # Deliberately best-effort, but no longer swallows KeyboardInterrupt
        # or SystemExit the way a bare ``except:`` did.
        report_failure()
        return None

    return hyphenated_word
|
272 |
+
|
273 |
+
|
274 |
+
def print_support() -> None:
    """Print a blank line followed by the three-line support notice."""
    support_lines = (
        f"{ULTRASINGER_HEAD} {gold_highlighted('Do you like UltraSinger? Want it to be even better? Then help with your')} {light_blue_highlighted('support')}{gold_highlighted('!')}",
        f"{ULTRASINGER_HEAD} See project page -> https://github.com/rakuri255/UltraSinger",
        f"{ULTRASINGER_HEAD} {gold_highlighted('This will help a lot to keep this project alive and improved.')}",
    )
    print()
    for line in support_lines:
        print(line)
|
286 |
+
|
287 |
+
def print_version() -> None:
    """Print a blank line and the application version framed by two star rows."""
    star_row = f"{ULTRASINGER_HEAD} {gold_highlighted('*****************************')}"
    version_row = f"{ULTRASINGER_HEAD} {gold_highlighted('UltraSinger Version:')} {light_blue_highlighted(settings.APP_VERSION)}"

    print()
    for row in (star_row, version_row, star_row):
        print(row)
|
299 |
+
|
300 |
+
def run() -> None:
    """Process the configured input end to end.

    All options come from the module-level ``settings``. An input path that
    contains ".txt" is parsed as an Ultrastar file and re-pitched; any other
    input is treated as audio, and a path starting with "https:" is
    downloaded first. The steps then run in this order: vocal separation,
    mp3 export of karaoke/instrumental/vocal tracks depending on the format
    version, denoising, mono conversion, muting of silent sections,
    transcription with optional hyphenation (audio input only), optional
    audio chunk export, pitching, optional plots, writing the Ultrastar txt,
    score calculation, optional MIDI export and cache cleanup.
    """
    is_audio = ".txt" not in settings.input_file_path
    ultrastar_class = None
    real_bpm = None
    title = artist = year = genre = None

    if not is_audio:
        # Existing Ultrastar txt: parse it
        print(f"{ULTRASINGER_HEAD} {gold_highlighted('re-pitch mode')}")
        (
            basename_without_ext,
            real_bpm,
            song_output,
            ultrastar_audio_input_path,
            ultrastar_class,
        ) = parse_ultrastar_txt()
    elif settings.input_file_path.startswith("https:"):
        # Youtube link
        print(f"{ULTRASINGER_HEAD} {gold_highlighted('full automatic mode')}")
        (
            basename_without_ext,
            song_output,
            ultrastar_audio_input_path,
            (title, artist, year, genre),
        ) = download_from_youtube()
    else:
        # Local audio file
        print(f"{ULTRASINGER_HEAD} {gold_highlighted('full automatic mode')}")
        (
            basename_without_ext,
            song_output,
            ultrastar_audio_input_path,
            (title, artist, year, genre),
        ) = infos_from_audio_input_file()

    cache_path = os.path.join(song_output, "cache")

    def cache_file(suffix: str) -> str:
        """Path of an intermediate file for this song inside the cache folder."""
        return os.path.join(cache_path, basename_without_ext + suffix)

    def output_file(suffix: str) -> str:
        """Path of a result file for this song inside the song output folder."""
        return os.path.join(song_output, basename_without_ext + suffix)

    settings.processing_audio_path = cache_file(".wav")
    os_helper.create_folder(cache_path)

    # Separate vocal from audio
    audio_separation_path = separate_vocal_from_audio(
        basename_without_ext, cache_path, ultrastar_audio_input_path
    )
    vocals_path = os.path.join(audio_separation_path, "vocals.wav")
    instrumental_path = os.path.join(audio_separation_path, "no_vocals.wav")

    # Export instrumental and vocals; format 1.1.0 and later gets separate
    # instrumental and vocal files, earlier formats a single karaoke file.
    if settings.create_karaoke and version.parse(settings.format_version) < version.parse("1.1.0"):
        convert_wav_to_mp3(instrumental_path, output_file(" [Karaoke].mp3"))

    if version.parse(settings.format_version) >= version.parse("1.1.0"):
        convert_wav_to_mp3(instrumental_path, output_file(" [Instrumental].mp3"))
        convert_wav_to_mp3(vocals_path, output_file(" [Vocals].mp3"))

    input_path = vocals_path if settings.use_separated_vocal else ultrastar_audio_input_path

    # Denoise vocal audio
    denoised_output_path = cache_file("_denoised.wav")
    denoise_vocal_audio(input_path, denoised_output_path)

    # Convert to mono audio
    mono_output_path = cache_file("_mono.wav")
    convert_audio_to_mono_wav(denoised_output_path, mono_output_path)

    # Mute silence sections
    mute_output_path = cache_file("_mute.wav")
    mute_no_singing_parts(mono_output_path, mute_output_path)

    # From here on the muted mono file is the audio that gets processed
    settings.processing_audio_path = mute_output_path

    # Audio transcription
    transcribed_data = None
    language = settings.language
    if is_audio:
        detected_language, transcribed_data = transcribe_audio()
        if language is None:
            language = detected_language

        remove_unecessary_punctuations(transcribed_data)

        if settings.hyphenation:
            hyphen_words = hyphenate_each_word(language, transcribed_data)
            if hyphen_words is not None:
                transcribed_data = add_hyphen_to_data(transcribed_data, hyphen_words)

        transcribed_data = remove_silence_from_transcription_data(
            settings.processing_audio_path, transcribed_data
        )

        # TODO: do we need to correct words?

    # Create audio chunks
    if settings.create_audio_chunks:
        create_audio_chunks(
            cache_path,
            is_audio,
            transcribed_data,
            ultrastar_audio_input_path,
            ultrastar_class,
        )

    # Pitch the audio
    midi_notes, pitched_data, ultrastar_note_numbers = pitch_audio(
        is_audio, transcribed_data, ultrastar_class
    )

    # Create plot
    if settings.create_plot:
        plot_spectrogram(vocals_path, song_output, "vocals.wav")
        plot_spectrogram(settings.processing_audio_path, song_output, "processing audio")
        plot(pitched_data, song_output, transcribed_data, ultrastar_class, midi_notes)

    # Write Ultrastar txt
    if is_audio:
        real_bpm, ultrastar_file_output = create_ultrastar_txt_from_automation(
            basename_without_ext,
            song_output,
            transcribed_data,
            ultrastar_audio_input_path,
            ultrastar_note_numbers,
            language,
            title,
            artist,
            year,
            genre,
        )
    else:
        ultrastar_file_output = create_ultrastar_txt_from_ultrastar_data(
            song_output, ultrastar_class, ultrastar_note_numbers
        )

    # Calc points
    ultrastar_class, simple_score, accurate_score = calculate_score_points(
        is_audio, pitched_data, ultrastar_class, ultrastar_file_output
    )

    # Add calculated score to Ultrastar txt. TODO: missing karaoke
    ultrastar_writer.add_score_to_ultrastar_txt(ultrastar_file_output, simple_score)

    # Midi
    if settings.create_midi:
        create_midi_file(real_bpm, song_output, ultrastar_class, basename_without_ext)

    # Cleanup
    if not settings.keep_cache:
        remove_cache_folder(cache_path)

    print_support()
|
473 |
+
|
474 |
+
|
475 |
+
def mute_no_singing_parts(mono_output_path, mute_output_path):
    """Write a copy of the audio with its silence sections set to zero.

    The sections come from ``get_silence_sections(mono_output_path)``; the
    first two items of each section are used as start and end time in
    seconds. The audio is loaded at its native sampling rate and the result
    is written to ``mute_output_path``.
    """
    print(f"{ULTRASINGER_HEAD} Mute audio parts with no singing")

    silence_sections = get_silence_sections(mono_output_path)
    samples, sample_rate = librosa.load(mono_output_path, sr=None)

    for section in silence_sections:
        # Convert the section boundaries from seconds to sample indices
        first_sample = int(section[0] * sample_rate)
        last_sample = int(section[1] * sample_rate)
        samples[first_sample:last_sample] = 0

    sf.write(mute_output_path, samples, sample_rate)
|
494 |
+
|
495 |
+
|
496 |
+
def get_unused_song_output_dir(path: str) -> str:
    """Return ``path`` or the first unused ``"path (i)"`` variant.

    Args:
        path: Preferred output folder.

    Returns:
        ``path`` itself when no such folder exists, otherwise
        ``f"{path} ({i})"`` for the smallest ``i`` from 1 to 999 that does not
        exist yet.

    Exits the program with status 1 (after printing an error) when all 999
    numbered folders already exist.
    """
    if not os_helper.check_if_folder_exists(path):
        return path

    # Build every candidate from the untouched base path. Rewriting the
    # previous candidate with str.replace would also alter any "(n)" that is
    # already part of the folder name.
    for i in range(1, 1000):
        candidate = f"{path} ({i})"
        if not os_helper.check_if_folder_exists(candidate):
            return candidate

    print(
        f"{ULTRASINGER_HEAD} {red_highlighted('Error: Could not create output folder! (999) is the maximum number of tries.')}"
    )
    sys.exit(1)
|
514 |
+
|
515 |
+
|
516 |
+
def transcribe_audio() -> tuple[str, list[TranscribedData]]:
    """Transcribe ``settings.processing_audio_path`` with the configured transcriber.

    Returns:
        ``(detected_language, transcribed_data)`` as produced by
        ``transcribe_with_whisper``.

    Raises:
        NotImplementedError: If ``settings.transcriber`` is not ``"whisper"``.
    """
    if settings.transcriber != "whisper":
        raise NotImplementedError

    # Whisper can be forced onto the CPU independently of the other steps.
    device = "cpu" if settings.force_whisper_cpu else settings.pytorch_device
    transcribed_data, detected_language = transcribe_with_whisper(
        settings.processing_audio_path,
        settings.whisper_model,
        device,
        settings.whisper_align_model,
        settings.whisper_batch_size,
        settings.whisper_compute_type,
        settings.language,
    )
    return detected_language, transcribed_data
|
532 |
+
|
533 |
+
|
534 |
+
def separate_vocal_from_audio(
    basename_without_ext: str, cache_path: str, ultrastar_audio_input_path: str
) -> str:
    """Run the audio separation if it is needed and return its output folder.

    The separation is only executed when ``settings.use_separated_vocal`` or
    ``settings.create_karaoke`` is set. The returned path
    (``<cache_path>/separated/htdemucs/<basename_without_ext>``) is the same
    either way.
    """
    separation_needed = settings.use_separated_vocal or settings.create_karaoke
    if separation_needed:
        separate_audio(ultrastar_audio_input_path, cache_path, settings.pytorch_device)

    return os.path.join(cache_path, "separated", "htdemucs", basename_without_ext)
|
546 |
+
|
547 |
+
def calculate_score_points(
    is_audio: bool, pitched_data: PitchedData, ultrastar_class: UltrastarTxtValue, ultrastar_file_output: str
):
    """Score the written Ultrastar txt against the pitched data.

    For Ultrastar input (``is_audio`` is false) the score of the original
    ``ultrastar_class`` is calculated and printed first, for comparison. In
    both cases ``ultrastar_file_output`` is then parsed and scored.

    Returns:
        ``(ultrastar_class, simple_score, accurate_score)`` where
        ``ultrastar_class`` is the freshly parsed output file and the scores
        belong to that file.
    """

    def score_and_report(song):
        """Calculate both scores for ``song``, print them and return them."""
        simple, accurate = ultrastar_score_calculator.calculate_score(
            pitched_data, song
        )
        ultrastar_score_calculator.print_score_calculation(simple, accurate)
        return simple, accurate

    if not is_audio:
        print(
            f"{ULTRASINGER_HEAD} {blue_highlighted('Score of original Ultrastar txt')}"
        )
        score_and_report(ultrastar_class)
        print(
            f"{ULTRASINGER_HEAD} {blue_highlighted('Score of re-pitched Ultrastar txt')}"
        )

    ultrastar_class = ultrastar_parser.parse_ultrastar_txt(ultrastar_file_output)
    simple_score, accurate_score = score_and_report(ultrastar_class)
    return ultrastar_class, simple_score, accurate_score
|
593 |
+
|
594 |
+
|
595 |
+
def create_ultrastar_txt_from_ultrastar_data(
    song_output: str, ultrastar_class: UltrastarTxtValue, ultrastar_note_numbers: list[int]
) -> str:
    """Write a re-pitched copy of the input Ultrastar txt and return its path.

    The file is named after ``ultrastar_class.title`` and placed in
    ``song_output``. It is produced by the Ultrastar writer from
    ``settings.input_file_path`` and the given note numbers.
    """
    repitched_file = os.path.join(song_output, ultrastar_class.title + ".txt")
    ultrastar_writer.create_repitched_txt_from_ultrastar_data(
        settings.input_file_path,
        ultrastar_note_numbers,
        repitched_file,
    )
    return repitched_file
|
608 |
+
|
609 |
+
|
610 |
+
def create_ultrastar_txt_from_automation(
    basename_without_ext: str,
    song_output: str,
    transcribed_data: list[TranscribedData],
    ultrastar_audio_input_path: str,
    ultrastar_note_numbers: list[int],
    language: str,
    title: str,
    artist: str,
    year: str,
    genre: str
):
    """Build the Ultrastar header and write the generated txt file(s).

    The header defaults are derived from ``basename_without_ext``; ``title``,
    ``artist``, ``year`` and ``genre`` override or extend them when they are
    not ``None``. The main txt is written to
    ``<song_output>/<basename_without_ext>.txt``. When karaoke creation is
    enabled and the configured format version is older than 1.1.0, a second
    ``... [Karaoke].txt`` is written with an adjusted title and mp3 name.

    Returns:
        A ``(real_bpm, ultrastar_file_output)`` tuple: the BPM obtained from
        ``ultrastar_audio_input_path`` and the path of the main txt file.
    """
    mp3_name = basename_without_ext + ".mp3"
    cover_name = basename_without_ext + " [CO].jpg"

    header = UltrastarTxtValue()
    header.version = settings.format_version
    header.title = basename_without_ext
    header.artist = basename_without_ext
    header.mp3 = mp3_name
    header.audio = mp3_name
    header.vocals = basename_without_ext + " [Vocals].mp3"
    header.instrumental = basename_without_ext + " [Instrumental].mp3"
    header.video = basename_without_ext + ".mp4"
    header.language = language

    # Only reference the cover when the image is really present in the output folder.
    if os_helper.check_file_exists(os.path.join(song_output, cover_name)):
        header.cover = cover_name
    else:
        header.cover = None

    header.creator = f"{header.creator} {Settings.APP_VERSION}"
    header.comment = f"{header.comment} {Settings.APP_VERSION}"

    # Additional data
    if title is not None:
        header.title = title
    if artist is not None:
        header.artist = artist
    if year is not None:
        header.year = extract_year(year)
    if genre is not None:
        header.genre = format_separated_string(genre)

    real_bpm = get_bpm_from_file(ultrastar_audio_input_path)
    ultrastar_file_output = os.path.join(song_output, basename_without_ext + ".txt")
    ultrastar_writer.create_ultrastar_txt_from_automation(
        transcribed_data,
        ultrastar_note_numbers,
        ultrastar_file_output,
        header,
        real_bpm,
    )

    wants_separate_karaoke_txt = settings.create_karaoke and version.parse(
        settings.format_version
    ) < version.parse("1.1.0")
    if wants_separate_karaoke_txt:
        karaoke_title = basename_without_ext + " [Karaoke]"
        header.title = karaoke_title
        header.mp3 = karaoke_title + ".mp3"
        karaoke_txt_output_path = os.path.join(song_output, karaoke_title) + ".txt"
        ultrastar_writer.create_ultrastar_txt_from_automation(
            transcribed_data,
            ultrastar_note_numbers,
            karaoke_txt_output_path,
            header,
            real_bpm,
        )

    return real_bpm, ultrastar_file_output
|
677 |
+
|
678 |
+
def extract_year(date: str) -> str:
    """Return the first standalone four-digit number found in ``date``.

    If ``date`` contains no such number it is returned unchanged.
    """
    found = re.search(r'\b\d{4}\b', date)
    return found.group(0) if found else date
|
684 |
+
|
685 |
+
def format_separated_string(data: str) -> str:
    """Normalise a ``;``/``/``/``,`` separated list into ``"A, B, C"`` form.

    Blank entries are dropped. Every entry is stripped and capitalised; for
    entries containing ``-`` each dash-separated part is stripped and
    capitalised on its own and the parts are re-joined with ``-``.
    """
    entries = re.sub(r'[;/]', ',', data).split(',')

    formatted = []
    for entry in entries:
        if not entry.strip():
            continue
        # An entry without a dash is simply a single "part".
        parts = (part.strip().capitalize() for part in entry.split('-'))
        formatted.append('-'.join(parts))

    return ', '.join(formatted)
|
705 |
+
|
706 |
+
def infos_from_audio_input_file() -> tuple[str, str, str, tuple[str, str, str, str]]:
    """Prepare the song output folder for a local audio input file.

    The input file name (without extension) is used as the search string for
    ``get_music_infos``. If the name has the form ``"<artist> - <title>"`` it
    also provides fallback artist/title values. When both artist and title are
    known, the output base name becomes ``"<artist> - <title>"``, passed
    through ``sanitize_filename`` because looked-up names may contain
    characters that are not valid in file names (the YouTube path sanitises
    its base name the same way).

    The input file is copied into a fresh output folder and renamed to the
    resulting base name.

    Returns:
        ``(basename_without_ext, song_output, ultrastar_audio_input_path,
        (title, artist, year_info, genre_info))``. ``title`` and ``artist``
        are returned unsanitised and may be ``None``.
    """
    original_basename = os.path.basename(settings.input_file_path)
    basename_without_ext, extension = os.path.splitext(original_basename)
    basename = original_basename

    artist, title = None, None
    if " - " in basename_without_ext:
        artist, title = basename_without_ext.split(" - ", 1)

    # Get additional data for song
    (title_info, artist_info, year_info, genre_info) = get_music_infos(basename_without_ext)

    if title_info is not None:
        title = title_info
        artist = artist_info

    if artist is not None and title is not None:
        # Sanitise: e.g. "AC/DC" would otherwise create a nested/invalid path.
        basename_without_ext = sanitize_filename(f"{artist} - {title}")
        basename = f"{basename_without_ext}{extension}"

    song_output = os.path.join(settings.output_file_path, basename_without_ext)
    song_output = get_unused_song_output_dir(song_output)
    os_helper.create_folder(song_output)
    os_helper.copy(settings.input_file_path, song_output)
    os_helper.rename(
        os.path.join(song_output, original_basename),
        os.path.join(song_output, basename),
    )
    ultrastar_audio_input_path = os.path.join(song_output, basename)
    return basename_without_ext, song_output, ultrastar_audio_input_path, (title, artist, year_info, genre_info)
|
737 |
+
|
738 |
+
|
739 |
+
# Each entry maps every character of the first string to the second string.
FILENAME_REPLACEMENTS = (('?:"', ""), ("<", "("), (">", ")"), ("/\\|*", "-"))

# Single-pass translation table derived from FILENAME_REPLACEMENTS.
_FILENAME_TRANSLATION = str.maketrans(
    {char: new for old, new in FILENAME_REPLACEMENTS for char in old}
)


def sanitize_filename(fname: str) -> str:
    """Return ``fname`` with characters that are unsafe in file names replaced.

    Characters listed in ``FILENAME_REPLACEMENTS`` are substituted (or removed)
    and trailing spaces and periods are stripped, because Windows accepts
    neither at the end of a file or directory name.
    """
    fname = fname.translate(_FILENAME_TRANSLATION)
    # Strip unconditionally so a name ending in a space (not only a period) is fixed too.
    return fname.rstrip(" .")
|
750 |
+
|
751 |
+
|
752 |
+
def download_from_youtube() -> tuple[str, str, str, tuple[str, str, str, str]]:
    """Download audio, video and thumbnail for the YouTube input.

    Artist and title come from ``get_youtube_title`` and are replaced by the
    values from ``get_music_infos`` when that lookup returns a title. The
    sanitised ``"<artist> - <title>"`` string names both the output folder and
    the downloaded files.

    Returns:
        ``(basename_without_ext, song_output, ultrastar_audio_input_path,
        (title, artist, year_info, genre_info))``.
    """
    url = settings.input_file_path
    (artist, title) = get_youtube_title(url)

    # Get additional data for song
    (title_info, artist_info, year_info, genre_info) = get_music_infos(f"{artist} - {title}")
    if title_info is not None:
        title, artist = title_info, artist_info

    basename_without_ext = sanitize_filename(f"{artist} - {title}")
    song_output = get_unused_song_output_dir(
        os.path.join(settings.output_file_path, basename_without_ext)
    )
    os_helper.create_folder(song_output)

    # Same arguments for every asset; order is audio, video, thumbnail.
    for download in (
        download_youtube_audio,
        download_youtube_video,
        download_youtube_thumbnail,
    ):
        download(url, basename_without_ext, song_output)

    ultrastar_audio_input_path = os.path.join(song_output, basename_without_ext + ".mp3")
    return basename_without_ext, song_output, ultrastar_audio_input_path, (title, artist, year_info, genre_info)
|
779 |
+
|
780 |
+
|
781 |
+
def parse_ultrastar_txt() -> tuple[str, float, str, str, UltrastarTxtValue]:
    """Parse the input Ultrastar txt and create its output folder.

    The audio file named in the txt is expected next to the txt file. The
    output folder is ``<output path>/<artist> - <title>`` (both stripped),
    made unique via ``get_unused_song_output_dir``.

    Returns:
        ``(basename_without_ext, real_bpm, song_output,
        ultrastar_audio_input_path, ultrastar_class)``.
    """
    txt_path = settings.input_file_path
    ultrastar_class = ultrastar_parser.parse_ultrastar_txt(txt_path)

    # The BPM value may use a decimal comma; normalise before converting.
    ultrastar_bpm = float(ultrastar_class.bpm.replace(",", "."))
    real_bpm = ultrastar_converter.ultrastar_bpm_to_real_bpm(ultrastar_bpm)

    mp3_name = ultrastar_class.mp3
    basename_without_ext = os.path.splitext(mp3_name)[0]
    ultrastar_audio_input_path = os.path.join(os.path.dirname(txt_path), mp3_name)

    folder_name = f"{ultrastar_class.artist.strip()} - {ultrastar_class.title.strip()}"
    song_output = os.path.join(settings.output_file_path, folder_name)
    song_output = get_unused_song_output_dir(str(song_output))
    os_helper.create_folder(song_output)

    return (
        str(basename_without_ext),
        real_bpm,
        song_output,
        str(ultrastar_audio_input_path),
        ultrastar_class,
    )
|
807 |
+
|
808 |
+
|
809 |
+
def create_midi_file(real_bpm: float,
                     song_output: str,
                     ultrastar_class: UltrastarTxtValue,
                     basename_without_ext: str) -> None:
    """Convert the Ultrastar data to a single MIDI instrument and write
    ``<song_output>/<basename_without_ext>.mid``."""
    print(
        f"{ULTRASINGER_HEAD} Creating Midi with {blue_highlighted('pretty_midi')}"
    )

    midi_path = os.path.join(song_output, f"{basename_without_ext}.mid")
    voice = midi_creator.convert_ultrastar_to_midi_instrument(ultrastar_class)
    midi_creator.instruments_to_midi([voice], real_bpm, midi_path)
|
825 |
+
|
826 |
+
|
827 |
+
def pitch_audio(is_audio: bool, transcribed_data: list[TranscribedData], ultrastar_class: UltrastarTxtValue) -> tuple[
    list[str], PitchedData, list[int]]:
    """Pitch the processing audio and derive notes for each time segment.

    Segment boundaries come from ``transcribed_data`` when ``is_audio`` is
    true, otherwise from the start/end times stored in ``ultrastar_class``.

    Returns:
        ``(midi_notes, pitched_data, ultrastar_note_numbers)``.
    """
    # todo: chunk pitching as option?
    if settings.force_crepe_cpu:
        device = "cpu"
    else:
        device = settings.tensorflow_device

    pitched_data = get_pitch_with_crepe_file(
        settings.processing_audio_path,
        settings.crepe_model_capacity,
        settings.crepe_step_size,
        device,
    )

    if is_audio:
        start_times = [segment.start for segment in transcribed_data]
        end_times = [segment.end for segment in transcribed_data]
    else:
        start_times = ultrastar_class.startTimes
        end_times = ultrastar_class.endTimes

    midi_notes = create_midi_notes_from_pitched_data(
        start_times, end_times, pitched_data
    )
    ultrastar_note_numbers = convert_midi_notes_to_ultrastar_notes(midi_notes)
    return midi_notes, pitched_data, ultrastar_note_numbers
|
855 |
+
|
856 |
+
|
857 |
+
def create_audio_chunks(
    cache_path: str,
    is_audio: bool,
    transcribed_data: list[TranscribedData],
    ultrastar_audio_input_path: str,
    ultrastar_class: UltrastarTxtValue
) -> None:
    """Export audio chunks into the chunk folder below ``cache_path``.

    For audio input the chunks are cut from ``settings.processing_audio_path``
    using ``transcribed_data`` and the transcription is additionally exported
    to ``_chunks.csv``. Otherwise the chunks are cut from
    ``ultrastar_audio_input_path`` using ``ultrastar_class``.
    """
    chunk_dir = os.path.join(cache_path, settings.audio_chunk_folder_name)
    os_helper.create_folder(chunk_dir)

    if not is_audio:
        export_chunks_from_ultrastar_data(
            ultrastar_audio_input_path, ultrastar_class, chunk_dir
        )
        return

    export_chunks_from_transcribed_data(
        settings.processing_audio_path, transcribed_data, chunk_dir
    )
    export_transcribed_data_to_csv(
        transcribed_data, os.path.join(chunk_dir, "_chunks.csv")
    )
|
879 |
+
|
880 |
+
def denoise_vocal_audio(input_path: str, output_path: str) -> None:
    """Run ``ffmpeg_reduce_noise`` on ``input_path``, passing ``output_path``
    as the destination argument."""
    source, destination = input_path, output_path
    ffmpeg_reduce_noise(source, destination)
|
883 |
+
|
884 |
+
|
885 |
+
def main(argv: list[str]) -> None:
    """Command-line entry point.

    Prints the version, applies the options in ``argv`` (the arguments
    without the program name) to the settings, runs the pipeline and then
    exits the interpreter.
    """
    print_version()
    init_settings(argv)
    run()
    # Always terminate explicitly once the run has finished.
    sys.exit()
|
891 |
+
|
892 |
+
def remove_cache_folder(cache_path: str) -> None:
    """Delete the cache folder at ``cache_path`` via ``os_helper.remove_folder``."""
    folder_to_remove = cache_path
    os_helper.remove_folder(folder_to_remove)
|
895 |
+
|
896 |
+
_TRUE_STRINGS = frozenset({"true", "1", "yes", "on"})


def _parse_bool_arg(value: str) -> bool:
    """Interpret a command-line value as a boolean without using ``eval``.

    ``true``/``1``/``yes``/``on`` (case-insensitive) are true; anything else is false.
    """
    return value.strip().lower() in _TRUE_STRINGS


def init_settings(argv: list[str]) -> None:
    """Apply the command-line options in ``argv`` to the global ``settings``.

    Prints the help text and exits when no option (or ``-h``) is given, and
    exits with status 1 for an unsupported ``--format_version``. After the
    options are applied, a default output path (``<input dir>/output``, or
    ``<cwd>/output`` for ``https:`` inputs) is set if none was given, and GPU
    support is probed unless CPU use is forced.

    Option names are compared with ``==``; the previous ``opt in ("--name")``
    was a substring test on a plain string, so e.g. ``-m`` matched ``--midi``.
    Boolean values are parsed with ``_parse_bool_arg`` so that ``False`` is
    really false.

    Raises:
        getopt.GetoptError: for unknown options or missing option arguments.
        ValueError: if a numeric option value is not an integer.
    """
    long, short = arg_options()
    opts, _ = getopt.getopt(argv, short, long)
    if not opts:
        print_help()
        sys.exit()

    for opt, arg in opts:
        if opt == "-h":
            print_help()
            sys.exit()
        elif opt in ("-i", "--ifile"):
            settings.input_file_path = arg
        elif opt in ("-o", "--ofile"):
            settings.output_file_path = arg
        elif opt == "--whisper":
            settings.transcriber = "whisper"
            settings.whisper_model = arg
        elif opt == "--whisper_align_model":
            settings.whisper_align_model = arg
        elif opt == "--whisper_batch_size":
            settings.whisper_batch_size = int(arg)
        elif opt == "--whisper_compute_type":
            settings.whisper_compute_type = arg
        elif opt == "--language":
            settings.language = arg
        elif opt == "--crepe":
            settings.crepe_model_capacity = arg
        elif opt == "--crepe_step_size":
            settings.crepe_step_size = int(arg)
        elif opt == "--plot":
            settings.create_plot = _parse_bool_arg(arg)
        elif opt == "--midi":
            settings.create_midi = _parse_bool_arg(arg)
        elif opt == "--hyphenation":
            settings.hyphenation = _parse_bool_arg(arg)
        elif opt == "--disable_separation":
            # "disable" flag: the setting is the negation of the given value.
            settings.use_separated_vocal = not _parse_bool_arg(arg)
        elif opt == "--disable_karaoke":
            settings.create_karaoke = not _parse_bool_arg(arg)
        elif opt == "--create_audio_chunks":
            settings.create_audio_chunks = _parse_bool_arg(arg)
        elif opt == "--force_cpu":
            settings.force_cpu = _parse_bool_arg(arg)
            if settings.force_cpu:
                os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
        elif opt == "--force_whisper_cpu":
            settings.force_whisper_cpu = _parse_bool_arg(arg)
        elif opt == "--force_crepe_cpu":
            settings.force_crepe_cpu = _parse_bool_arg(arg)
        elif opt == "--format_version":
            if arg not in ("0.3.0", "1.0.0", "1.1.0"):
                print(
                    f"{ULTRASINGER_HEAD} {red_highlighted('Error: Format version')} {blue_highlighted(arg)} {red_highlighted('is not supported.')}"
                )
                sys.exit(1)
            settings.format_version = arg
        elif opt == "--keep_cache":
            # Declared as a bare flag, so getopt passes an empty value:
            # the presence of the flag alone means "keep the cache".
            settings.keep_cache = _parse_bool_arg(arg) if arg else True

    if settings.output_file_path == "":
        if settings.input_file_path.startswith("https:"):
            dirname = os.getcwd()
        else:
            dirname = os.path.dirname(settings.input_file_path)
        settings.output_file_path = os.path.join(dirname, "output")

    if not settings.force_cpu:
        settings.tensorflow_device, settings.pytorch_device = check_gpu_support()
|
964 |
+
|
965 |
+
|
966 |
+
def arg_options():
    """Return the ``(long, short)`` option specifications for ``getopt``.

    Every long option except ``keep_cache`` takes a value.
    """
    options_with_value = (
        "ifile",
        "ofile",
        "crepe",
        "crepe_step_size",
        "whisper",
        "whisper_align_model",
        "whisper_batch_size",
        "whisper_compute_type",
        "language",
        "plot",
        "midi",
        "hyphenation",
        "disable_separation",
        "disable_karaoke",
        "create_audio_chunks",
        "force_cpu",
        "force_whisper_cpu",
        "force_crepe_cpu",
        "format_version",
    )
    # A trailing "=" tells getopt that the option requires an argument.
    long = [f"{name}=" for name in options_with_value]
    long.append("keep_cache")
    short = "hi:o:amv:"
    return long, short
|
991 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
992 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
993 |
if __name__ == "__main__":
    # Hand over only the real arguments; sys.argv[0] is the script name.
    cli_arguments = sys.argv[1:]
    main(cli_arguments)
|