Spaces:

stepfun-ai
/

Step-Audio-EditX

Running on Zero

App Files Community

xieli committed 27 days ago

Commit

f21ec03

1 Parent(s): 6852edb

feat: change prompt

Browse files

Files changed (4) hide show

app.py +1 -1
config/edit_config.py +3 -4
config/prompts.py +7 -49
tts.py +11 -9

app.py CHANGED Viewed

@@ -389,7 +389,7 @@ class EditxTab:
                 """)
             gr.Markdown("""
                 **Para-linguistic Description:**
-                - Supported tags include: [Breathing] [Laughter] [Cough] [Sigh] [Confirmation-en] [Question-en] [Question-ah] [Question-oh] [Surprise-ah] [Surprise-oh] [Dissatisfaction-hnn] [Uhm] [Shh] [Crying] [Surprise-wa] [Surprise-yo] [Question-ei] [Question-yi]
                 - Example:
                     - Fill in "clone text" field: "Great, the weather is so nice today." Click the "CLONE" button to get audio.
                     - Change "clone text" field to: "Great[Laughter], the weather is so nice today[Surprise-ah]." Click the "EDIT" button to get para-linguistic audio.

                 """)
             gr.Markdown("""
                 **Para-linguistic Description:**
+                - Supported tags include: [Breathing] [Laughter] [Surprise-oh] [Confirmation-en] [Uhm] [Surprise-ah] [Surprise-wa] [Sigh] [Question-ei] [Dissatisfaction-hnn]
                 - Example:
                     - Fill in "clone text" field: "Great, the weather is so nice today." Click the "CLONE" button to get audio.
                     - Change "clone text" field to: "Great[Laughter], the weather is so nice today[Surprise-ah]." Click the "EDIT" button to get para-linguistic audio.

config/edit_config.py CHANGED Viewed

@@ -15,19 +15,18 @@ def get_supported_edit_types():
         "emotion": [
             'happy', 'angry', 'sad', 'humour', 'confusion', 'disgusted',
             'empathy', 'embarrass', 'fear', 'surprised', 'excited',
-            'depressed', 'coldness', 'admiration'
         ],
         "style": [
             'serious', 'arrogant', 'child', 'older', 'girl', 'pure',
             'sister', 'sweet', 'ethereal', 'whisper', 'gentle', 'recite',
             'generous', 'act_coy', 'warm', 'shy', 'comfort', 'authority',
             'chat', 'radio', 'soulful', 'story', 'vivid', 'program',
-            'news', 'advertising', 'roar', 'murmur', 'shout', 'deeply', 'loudly'
         ],
         "vad": [],
-        "music": [],
         "denoise": [],
         "para-linguistic": [],
         "speed": ["faster", "slower", "more faster", "more slower"],
-        "animal": [],
     }

         "emotion": [
             'happy', 'angry', 'sad', 'humour', 'confusion', 'disgusted',
             'empathy', 'embarrass', 'fear', 'surprised', 'excited',
+            'depressed', 'coldness', 'admiration', 'remove'
         ],
         "style": [
             'serious', 'arrogant', 'child', 'older', 'girl', 'pure',
             'sister', 'sweet', 'ethereal', 'whisper', 'gentle', 'recite',
             'generous', 'act_coy', 'warm', 'shy', 'comfort', 'authority',
             'chat', 'radio', 'soulful', 'story', 'vivid', 'program',
+            'news', 'advertising', 'roar', 'murmur', 'shout', 'deeply', 'loudly',
+            'remove'
         ],
         "vad": [],
         "denoise": [],
         "para-linguistic": [],
         "speed": ["faster", "slower", "more faster", "more slower"],
     }

config/prompts.py CHANGED Viewed

@@ -11,52 +11,10 @@ TTS_SYSTEM_PROMPTS = {
     "sys_prompt_with_spk": '请用{}的声音尽可能自然地说出下面这些话。',
 }
-# 音频编辑系统提示
-AUDIO_EDIT_SYSTEM_PROMPT = """As a highly skilled audio editing and tuning specialist, you excel at interpreting user instructions and applying precise adjustments to audio files according to their needs. Your expertise spans a wide range of audio enhancement capabilities, including but not limited to the following:
-# Emotional Enhancement of Speech:
-You are capable of infusing speech with various emotions such as:
-- happy
-- angry
-- sad
-- fear
-- disgusted
-- surprised
-- excited
-# Speech Style Transfer:
-You can adapt vocal delivery to diverse styles including:
-- Whisper
-- Coquettish
-- Gentle
-- Sweet
-- Arrogant
-- Innocent
-- Radio Host
-- Childlike
-- Bold and Unconstrained
-- Serious
-- Expressive and Vivid
-- Ethereal
-- Exaggerated
-- Recitation
-- Girlish
-- News Broadcast
-- Mature Female Voice
-- Middle-Aged or Elderly
-- Program Hosting
-# Paralinguistic Adjustments:
-You can fine-tune non-verbal speech elements such as:
-- Laughter Enhancement
-- Emphatic Stress
-- Rhythm and Pace Modulation
-# Audio Tuning & Editing:
-Your technical proficiency includes:
-- Noise Reduction
-- Background Music Removal
-- Silence Trimming
-- Speaker Extraction
-Note: Users will provide instructions in natural language. You are expected to accurately interpret their requirements and perform the most suitable audio edits and enhancements."""

     "sys_prompt_with_spk": '请用{}的声音尽可能自然地说出下面这些话。',
 }
+AUDIO_EDIT_SYSTEM_PROMPT = """As a highly skilled audio editing and tuning specialist, you excel in interpreting user instructions and applying precise adjustments to meet their needs. Your expertise spans a wide range of enhancement capabilities, including but not limited to:
+# Emotional Enhancement
+# Speaking Style Transfer
+# Non-linguistic Adjustments
+# Audio Tuning & Editing
+Note: You will receive instructions in natural language and are expected to accurately interpret and execute the most suitable audio edits and enhancements.
+"""

tts.py CHANGED Viewed

@@ -277,23 +277,25 @@ class StepAudioTTS:
         """
         audio_text = audio_text.strip() if audio_text else ""
-        if edit_type in {"emotion", "style", "speed"}:
-            if edit_info in {"exaggerated", "ethereal", "whisper", "act_coy", "older"}:
                 instruct_prefix = f"Make the following audio more {edit_info} style. The text corresponding to the audio is: {audio_text}\n"
             else:
-                instruct_prefix = f"Make the following audio more {edit_info}. The text corresponding to the audio is: {audio_text}\n"
-        elif edit_type == "music":
-            instruct_prefix = f"Separate the vocals from the following audio. The lyric is: {audio_text}"
         elif edit_type == "denoise":
             instruct_prefix = f"Remove any noise from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all noise from the audio."
         elif edit_type == "vad":
             instruct_prefix = f"Remove any silent portions from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all silence from the audio."
         elif edit_type == "bgm":
             instruct_prefix = f"Remove any background music (BGM) from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all BGM from the audio."
-        elif edit_type == "animal":
-            instruct_prefix = f"Make the following audio more like mimic animal calls. The text corresponding to the audio is: {audio_text}\n"
-        elif edit_type == "para-linguistic":
             instruct_prefix = f"Add some non-verbal sounds to make the audio more natural, the new text is : {text}\n  The text corresponding to the audio is: {audio_text}\n"
         else:
             raise HTTPException(

         """
         audio_text = audio_text.strip() if audio_text else ""
+        if edit_type in {"emotion", "speed"}:
+            if edit_info == "remove":
+                instruct_prefix = f"Remove any emotion in the following audio and the reference text is: {audio_text}\n"
+            else:
+                instruct_prefix=f"Make the following audio more {edit_info}. The text corresponding to the audio is: {audio_text}\n"
+        elif edit_type == "style":
+            if edit_info == "remove":
+                instruct_prefix = f"Remove any speaking styles in the following audio and the reference text is: {audio_text}\n"
+            elif edit_info in {"exaggerated","ethereal","whisper","act_coy","older"}:
                 instruct_prefix = f"Make the following audio more {edit_info} style. The text corresponding to the audio is: {audio_text}\n"
             else:
+                instruct_prefix=f"Make the following audio more {edit_info}. The text corresponding to the audio is: {audio_text}\n"
         elif edit_type == "denoise":
             instruct_prefix = f"Remove any noise from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all noise from the audio."
         elif edit_type == "vad":
             instruct_prefix = f"Remove any silent portions from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all silence from the audio."
         elif edit_type == "bgm":
             instruct_prefix = f"Remove any background music (BGM) from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all BGM from the audio."
+        elif edit_type == "paralinguistic":
             instruct_prefix = f"Add some non-verbal sounds to make the audio more natural, the new text is : {text}\n  The text corresponding to the audio is: {audio_text}\n"
         else:
             raise HTTPException(