xieli committed on
Commit
f21ec03
·
1 Parent(s): 6852edb

feat: change prompt

Browse files
Files changed (4) hide show
  1. app.py +1 -1
  2. config/edit_config.py +3 -4
  3. config/prompts.py +7 -49
  4. tts.py +11 -9
app.py CHANGED
@@ -389,7 +389,7 @@ class EditxTab:
389
  """)
390
  gr.Markdown("""
391
  **Para-linguistic Description:**
392
- - Supported tags include: [Breathing] [Laughter] [Cough] [Sigh] [Confirmation-en] [Question-en] [Question-ah] [Question-oh] [Surprise-ah] [Surprise-oh] [Dissatisfaction-hnn] [Uhm] [Shh] [Crying] [Surprise-wa] [Surprise-yo] [Question-ei] [Question-yi]
393
  - Example:
394
  - Fill in "clone text" field: "Great, the weather is so nice today." Click the "CLONE" button to get audio.
395
  - Change "clone text" field to: "Great[Laughter], the weather is so nice today[Surprise-ah]." Click the "EDIT" button to get para-linguistic audio.
 
389
  """)
390
  gr.Markdown("""
391
  **Para-linguistic Description:**
392
+ - Supported tags include: [Breathing] [Laughter] [Surprise-oh] [Confirmation-en] [Uhm] [Surprise-ah] [Surprise-wa] [Sigh] [Question-ei] [Dissatisfaction-hnn]
393
  - Example:
394
  - Fill in "clone text" field: "Great, the weather is so nice today." Click the "CLONE" button to get audio.
395
  - Change "clone text" field to: "Great[Laughter], the weather is so nice today[Surprise-ah]." Click the "EDIT" button to get para-linguistic audio.
config/edit_config.py CHANGED
@@ -15,19 +15,18 @@ def get_supported_edit_types():
15
  "emotion": [
16
  'happy', 'angry', 'sad', 'humour', 'confusion', 'disgusted',
17
  'empathy', 'embarrass', 'fear', 'surprised', 'excited',
18
- 'depressed', 'coldness', 'admiration'
19
  ],
20
  "style": [
21
  'serious', 'arrogant', 'child', 'older', 'girl', 'pure',
22
  'sister', 'sweet', 'ethereal', 'whisper', 'gentle', 'recite',
23
  'generous', 'act_coy', 'warm', 'shy', 'comfort', 'authority',
24
  'chat', 'radio', 'soulful', 'story', 'vivid', 'program',
25
- 'news', 'advertising', 'roar', 'murmur', 'shout', 'deeply', 'loudly'
 
26
  ],
27
  "vad": [],
28
- "music": [],
29
  "denoise": [],
30
  "para-linguistic": [],
31
  "speed": ["faster", "slower", "more faster", "more slower"],
32
- "animal": [],
33
  }
 
15
  "emotion": [
16
  'happy', 'angry', 'sad', 'humour', 'confusion', 'disgusted',
17
  'empathy', 'embarrass', 'fear', 'surprised', 'excited',
18
+ 'depressed', 'coldness', 'admiration', 'remove'
19
  ],
20
  "style": [
21
  'serious', 'arrogant', 'child', 'older', 'girl', 'pure',
22
  'sister', 'sweet', 'ethereal', 'whisper', 'gentle', 'recite',
23
  'generous', 'act_coy', 'warm', 'shy', 'comfort', 'authority',
24
  'chat', 'radio', 'soulful', 'story', 'vivid', 'program',
25
+ 'news', 'advertising', 'roar', 'murmur', 'shout', 'deeply', 'loudly',
26
+ 'remove'
27
  ],
28
  "vad": [],
 
29
  "denoise": [],
30
  "para-linguistic": [],
31
  "speed": ["faster", "slower", "more faster", "more slower"],
 
32
  }
config/prompts.py CHANGED
@@ -11,52 +11,10 @@ TTS_SYSTEM_PROMPTS = {
11
  "sys_prompt_with_spk": '请用{}的声音尽可能自然地说出下面这些话。',
12
  }
13
 
14
- # 音频编辑系统提示
15
- AUDIO_EDIT_SYSTEM_PROMPT = """As a highly skilled audio editing and tuning specialist, you excel at interpreting user instructions and applying precise adjustments to audio files according to their needs. Your expertise spans a wide range of audio enhancement capabilities, including but not limited to the following:
16
-
17
- # Emotional Enhancement of Speech:
18
- You are capable of infusing speech with various emotions such as:
19
- - happy
20
- - angry
21
- - sad
22
- - fear
23
- - disgusted
24
- - surprised
25
- - excited
26
-
27
- # Speech Style Transfer:
28
- You can adapt vocal delivery to diverse styles including:
29
- - Whisper
30
- - Coquettish
31
- - Gentle
32
- - Sweet
33
- - Arrogant
34
- - Innocent
35
- - Radio Host
36
- - Childlike
37
- - Bold and Unconstrained
38
- - Serious
39
- - Expressive and Vivid
40
- - Ethereal
41
- - Exaggerated
42
- - Recitation
43
- - Girlish
44
- - News Broadcast
45
- - Mature Female Voice
46
- - Middle-Aged or Elderly
47
- - Program Hosting
48
-
49
- # Paralinguistic Adjustments:
50
- You can fine-tune non-verbal speech elements such as:
51
- - Laughter Enhancement
52
- - Emphatic Stress
53
- - Rhythm and Pace Modulation
54
-
55
- # Audio Tuning & Editing:
56
- Your technical proficiency includes:
57
- - Noise Reduction
58
- - Background Music Removal
59
- - Silence Trimming
60
- - Speaker Extraction
61
-
62
- Note: Users will provide instructions in natural language. You are expected to accurately interpret their requirements and perform the most suitable audio edits and enhancements."""
 
11
  "sys_prompt_with_spk": '请用{}的声音尽可能自然地说出下面这些话。',
12
  }
13
 
14
+ AUDIO_EDIT_SYSTEM_PROMPT = """As a highly skilled audio editing and tuning specialist, you excel in interpreting user instructions and applying precise adjustments to meet their needs. Your expertise spans a wide range of enhancement capabilities, including but not limited to:
15
+ # Emotional Enhancement
16
+ # Speaking Style Transfer
17
+ # Non-linguistic Adjustments
18
+ # Audio Tuning & Editing
19
+ Note: You will receive instructions in natural language and are expected to accurately interpret and execute the most suitable audio edits and enhancements.
20
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tts.py CHANGED
@@ -277,23 +277,25 @@ class StepAudioTTS:
277
  """
278
 
279
  audio_text = audio_text.strip() if audio_text else ""
280
-
281
- if edit_type in {"emotion", "style", "speed"}:
282
- if edit_info in {"exaggerated", "ethereal", "whisper", "act_coy", "older"}:
 
 
 
 
 
 
283
  instruct_prefix = f"Make the following audio more {edit_info} style. The text corresponding to the audio is: {audio_text}\n"
284
  else:
285
- instruct_prefix = f"Make the following audio more {edit_info}. The text corresponding to the audio is: {audio_text}\n"
286
- elif edit_type == "music":
287
- instruct_prefix = f"Separate the vocals from the following audio. The lyric is: {audio_text}"
288
  elif edit_type == "denoise":
289
  instruct_prefix = f"Remove any noise from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all noise from the audio."
290
  elif edit_type == "vad":
291
  instruct_prefix = f"Remove any silent portions from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all silence from the audio."
292
  elif edit_type == "bgm":
293
  instruct_prefix = f"Remove any background music (BGM) from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all BGM from the audio."
294
- elif edit_type == "animal":
295
- instruct_prefix = f"Make the following audio more like mimic animal calls. The text corresponding to the audio is: {audio_text}\n"
296
- elif edit_type == "para-linguistic":
297
  instruct_prefix = f"Add some non-verbal sounds to make the audio more natural, the new text is : {text}\n The text corresponding to the audio is: {audio_text}\n"
298
  else:
299
  raise HTTPException(
 
277
  """
278
 
279
  audio_text = audio_text.strip() if audio_text else ""
280
+ if edit_type in {"emotion", "speed"}:
281
+ if edit_info == "remove":
282
+ instruct_prefix = f"Remove any emotion in the following audio and the reference text is: {audio_text}\n"
283
+ else:
284
+ instruct_prefix=f"Make the following audio more {edit_info}. The text corresponding to the audio is: {audio_text}\n"
285
+ elif edit_type == "style":
286
+ if edit_info == "remove":
287
+ instruct_prefix = f"Remove any speaking styles in the following audio and the reference text is: {audio_text}\n"
288
+ elif edit_info in {"exaggerated","ethereal","whisper","act_coy","older"}:
289
  instruct_prefix = f"Make the following audio more {edit_info} style. The text corresponding to the audio is: {audio_text}\n"
290
  else:
291
+ instruct_prefix=f"Make the following audio more {edit_info}. The text corresponding to the audio is: {audio_text}\n"
 
 
292
  elif edit_type == "denoise":
293
  instruct_prefix = f"Remove any noise from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all noise from the audio."
294
  elif edit_type == "vad":
295
  instruct_prefix = f"Remove any silent portions from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all silence from the audio."
296
  elif edit_type == "bgm":
297
  instruct_prefix = f"Remove any background music (BGM) from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all BGM from the audio."
298
+ elif edit_type == "paralinguistic":
 
 
299
  instruct_prefix = f"Add some non-verbal sounds to make the audio more natural, the new text is : {text}\n The text corresponding to the audio is: {audio_text}\n"
300
  else:
301
  raise HTTPException(