Spaces:
Running
on
Zero
Running
on
Zero
xieli
committed on
Commit
·
f21ec03
1
Parent(s):
6852edb
feat: change prompt
Browse files
- app.py +1 -1
- config/edit_config.py +3 -4
- config/prompts.py +7 -49
- tts.py +11 -9
app.py
CHANGED
|
@@ -389,7 +389,7 @@ class EditxTab:
|
|
| 389 |
""")
|
| 390 |
gr.Markdown("""
|
| 391 |
**Para-linguistic Description:**
|
| 392 |
-
- Supported tags include: [Breathing] [Laughter] [
|
| 393 |
- Example:
|
| 394 |
- Fill in "clone text" field: "Great, the weather is so nice today." Click the "CLONE" button to get audio.
|
| 395 |
- Change "clone text" field to: "Great[Laughter], the weather is so nice today[Surprise-ah]." Click the "EDIT" button to get para-linguistic audio.
|
|
|
|
| 389 |
""")
|
| 390 |
gr.Markdown("""
|
| 391 |
**Para-linguistic Description:**
|
| 392 |
+
- Supported tags include: [Breathing] [Laughter] [Surprise-oh] [Confirmation-en] [Uhm] [Surprise-ah] [Surprise-wa] [Sigh] [Question-ei] [Dissatisfaction-hnn]
|
| 393 |
- Example:
|
| 394 |
- Fill in "clone text" field: "Great, the weather is so nice today." Click the "CLONE" button to get audio.
|
| 395 |
- Change "clone text" field to: "Great[Laughter], the weather is so nice today[Surprise-ah]." Click the "EDIT" button to get para-linguistic audio.
|
config/edit_config.py
CHANGED
|
@@ -15,19 +15,18 @@ def get_supported_edit_types():
|
|
| 15 |
"emotion": [
|
| 16 |
'happy', 'angry', 'sad', 'humour', 'confusion', 'disgusted',
|
| 17 |
'empathy', 'embarrass', 'fear', 'surprised', 'excited',
|
| 18 |
-
'depressed', 'coldness', 'admiration'
|
| 19 |
],
|
| 20 |
"style": [
|
| 21 |
'serious', 'arrogant', 'child', 'older', 'girl', 'pure',
|
| 22 |
'sister', 'sweet', 'ethereal', 'whisper', 'gentle', 'recite',
|
| 23 |
'generous', 'act_coy', 'warm', 'shy', 'comfort', 'authority',
|
| 24 |
'chat', 'radio', 'soulful', 'story', 'vivid', 'program',
|
| 25 |
-
'news', 'advertising', 'roar', 'murmur', 'shout', 'deeply', 'loudly'
|
|
|
|
| 26 |
],
|
| 27 |
"vad": [],
|
| 28 |
-
"music": [],
|
| 29 |
"denoise": [],
|
| 30 |
"para-linguistic": [],
|
| 31 |
"speed": ["faster", "slower", "more faster", "more slower"],
|
| 32 |
-
"animal": [],
|
| 33 |
}
|
|
|
|
| 15 |
"emotion": [
|
| 16 |
'happy', 'angry', 'sad', 'humour', 'confusion', 'disgusted',
|
| 17 |
'empathy', 'embarrass', 'fear', 'surprised', 'excited',
|
| 18 |
+
'depressed', 'coldness', 'admiration', 'remove'
|
| 19 |
],
|
| 20 |
"style": [
|
| 21 |
'serious', 'arrogant', 'child', 'older', 'girl', 'pure',
|
| 22 |
'sister', 'sweet', 'ethereal', 'whisper', 'gentle', 'recite',
|
| 23 |
'generous', 'act_coy', 'warm', 'shy', 'comfort', 'authority',
|
| 24 |
'chat', 'radio', 'soulful', 'story', 'vivid', 'program',
|
| 25 |
+
'news', 'advertising', 'roar', 'murmur', 'shout', 'deeply', 'loudly',
|
| 26 |
+
'remove'
|
| 27 |
],
|
| 28 |
"vad": [],
|
|
|
|
| 29 |
"denoise": [],
|
| 30 |
"para-linguistic": [],
|
| 31 |
"speed": ["faster", "slower", "more faster", "more slower"],
|
|
|
|
| 32 |
}
|
config/prompts.py
CHANGED
|
@@ -11,52 +11,10 @@ TTS_SYSTEM_PROMPTS = {
|
|
| 11 |
"sys_prompt_with_spk": '请用{}的声音尽可能自然地说出下面这些话。',
|
| 12 |
}
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
#
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
- sad
|
| 22 |
-
- fear
|
| 23 |
-
- disgusted
|
| 24 |
-
- surprised
|
| 25 |
-
- excited
|
| 26 |
-
|
| 27 |
-
# Speech Style Transfer:
|
| 28 |
-
You can adapt vocal delivery to diverse styles including:
|
| 29 |
-
- Whisper
|
| 30 |
-
- Coquettish
|
| 31 |
-
- Gentle
|
| 32 |
-
- Sweet
|
| 33 |
-
- Arrogant
|
| 34 |
-
- Innocent
|
| 35 |
-
- Radio Host
|
| 36 |
-
- Childlike
|
| 37 |
-
- Bold and Unconstrained
|
| 38 |
-
- Serious
|
| 39 |
-
- Expressive and Vivid
|
| 40 |
-
- Ethereal
|
| 41 |
-
- Exaggerated
|
| 42 |
-
- Recitation
|
| 43 |
-
- Girlish
|
| 44 |
-
- News Broadcast
|
| 45 |
-
- Mature Female Voice
|
| 46 |
-
- Middle-Aged or Elderly
|
| 47 |
-
- Program Hosting
|
| 48 |
-
|
| 49 |
-
# Paralinguistic Adjustments:
|
| 50 |
-
You can fine-tune non-verbal speech elements such as:
|
| 51 |
-
- Laughter Enhancement
|
| 52 |
-
- Emphatic Stress
|
| 53 |
-
- Rhythm and Pace Modulation
|
| 54 |
-
|
| 55 |
-
# Audio Tuning & Editing:
|
| 56 |
-
Your technical proficiency includes:
|
| 57 |
-
- Noise Reduction
|
| 58 |
-
- Background Music Removal
|
| 59 |
-
- Silence Trimming
|
| 60 |
-
- Speaker Extraction
|
| 61 |
-
|
| 62 |
-
Note: Users will provide instructions in natural language. You are expected to accurately interpret their requirements and perform the most suitable audio edits and enhancements."""
|
|
|
|
| 11 |
"sys_prompt_with_spk": '请用{}的声音尽可能自然地说出下面这些话。',
|
| 12 |
}
|
| 13 |
|
| 14 |
+
AUDIO_EDIT_SYSTEM_PROMPT = """As a highly skilled audio editing and tuning specialist, you excel in interpreting user instructions and applying precise adjustments to meet their needs. Your expertise spans a wide range of enhancement capabilities, including but not limited to:
|
| 15 |
+
# Emotional Enhancement
|
| 16 |
+
# Speaking Style Transfer
|
| 17 |
+
# Non-linguistic Adjustments
|
| 18 |
+
# Audio Tuning & Editing
|
| 19 |
+
Note: You will receive instructions in natural language and are expected to accurately interpret and execute the most suitable audio edits and enhancements.
|
| 20 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tts.py
CHANGED
|
@@ -277,23 +277,25 @@ class StepAudioTTS:
|
|
| 277 |
"""
|
| 278 |
|
| 279 |
audio_text = audio_text.strip() if audio_text else ""
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
instruct_prefix = f"Make the following audio more {edit_info} style. The text corresponding to the audio is: {audio_text}\n"
|
| 284 |
else:
|
| 285 |
-
instruct_prefix
|
| 286 |
-
elif edit_type == "music":
|
| 287 |
-
instruct_prefix = f"Separate the vocals from the following audio. The lyric is: {audio_text}"
|
| 288 |
elif edit_type == "denoise":
|
| 289 |
instruct_prefix = f"Remove any noise from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all noise from the audio."
|
| 290 |
elif edit_type == "vad":
|
| 291 |
instruct_prefix = f"Remove any silent portions from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all silence from the audio."
|
| 292 |
elif edit_type == "bgm":
|
| 293 |
instruct_prefix = f"Remove any background music (BGM) from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all BGM from the audio."
|
| 294 |
-
elif edit_type == "
|
| 295 |
-
instruct_prefix = f"Make the following audio more like mimic animal calls. The text corresponding to the audio is: {audio_text}\n"
|
| 296 |
-
elif edit_type == "para-linguistic":
|
| 297 |
instruct_prefix = f"Add some non-verbal sounds to make the audio more natural, the new text is : {text}\n The text corresponding to the audio is: {audio_text}\n"
|
| 298 |
else:
|
| 299 |
raise HTTPException(
|
|
|
|
| 277 |
"""
|
| 278 |
|
| 279 |
audio_text = audio_text.strip() if audio_text else ""
|
| 280 |
+
if edit_type in {"emotion", "speed"}:
|
| 281 |
+
if edit_info == "remove":
|
| 282 |
+
instruct_prefix = f"Remove any emotion in the following audio and the reference text is: {audio_text}\n"
|
| 283 |
+
else:
|
| 284 |
+
instruct_prefix=f"Make the following audio more {edit_info}. The text corresponding to the audio is: {audio_text}\n"
|
| 285 |
+
elif edit_type == "style":
|
| 286 |
+
if edit_info == "remove":
|
| 287 |
+
instruct_prefix = f"Remove any speaking styles in the following audio and the reference text is: {audio_text}\n"
|
| 288 |
+
elif edit_info in {"exaggerated","ethereal","whisper","act_coy","older"}:
|
| 289 |
instruct_prefix = f"Make the following audio more {edit_info} style. The text corresponding to the audio is: {audio_text}\n"
|
| 290 |
else:
|
| 291 |
+
instruct_prefix=f"Make the following audio more {edit_info}. The text corresponding to the audio is: {audio_text}\n"
|
|
|
|
|
|
|
| 292 |
elif edit_type == "denoise":
|
| 293 |
instruct_prefix = f"Remove any noise from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all noise from the audio."
|
| 294 |
elif edit_type == "vad":
|
| 295 |
instruct_prefix = f"Remove any silent portions from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all silence from the audio."
|
| 296 |
elif edit_type == "bgm":
|
| 297 |
instruct_prefix = f"Remove any background music (BGM) from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all BGM from the audio."
|
| 298 |
+
elif edit_type == "paralinguistic":
|
|
|
|
|
|
|
| 299 |
instruct_prefix = f"Add some non-verbal sounds to make the audio more natural, the new text is : {text}\n The text corresponding to the audio is: {audio_text}\n"
|
| 300 |
else:
|
| 301 |
raise HTTPException(
|