Spaces: audio edit (Running on Zero)
Commit: remove useless file

Commit history:
feat: add spaces, change edit_app name
feat: change readme, dockerfile
feat: change readme
feat: add default config
feat: remove useless file
feat: change readme
feat: change readme
feat: change requirements version
feat: change requirements
feat: remove dockerfile
feat: change pkg version
feat: support hf model source
feat: fix model loader
feat: test
feat: fix model loader
feat: fix model cache path
feat: add log
feat: fix download
feat: fix tokenizer
feat: fix tokenizer
feat: fix download
feat: add hf login
feat: add log
feat: remove useless log
feat: fix model loader
feat: fix model loader
feat: add log
feat: fix model loader
feat: rollback code
feat: fix
feat: fix model loader
feat: fix model path
feat: zerogpu
feat: fix
feat: fix app
feat: optimize download
feat: optimize download
feat: change app desc
feat: add log
feat: add log

Files changed:
- .gitattributes +4 -4
- .gitignore +2 -0
- README.md +13 -1
- __init__.py +0 -0
- app.py +499 -0
- config/__init__.py +12 -0
- config/edit_config.py +33 -0
- config/prompts.py +62 -0
- funasr_detach/__init__.py +38 -0
- funasr_detach/auto/__init__.py +0 -0
- funasr_detach/auto/auto_frontend.py +90 -0
- funasr_detach/auto/auto_model.py +575 -0
- funasr_detach/auto/auto_tokenizer.py +7 -0
- funasr_detach/bin/__init__.py +0 -0
- funasr_detach/bin/compute_audio_cmvn.py +152 -0
- funasr_detach/bin/inference.py +33 -0
- funasr_detach/bin/tokenize_text.py +281 -0
- funasr_detach/bin/train.py +227 -0
- funasr_detach/datasets/__init__.py +0 -0
- funasr_detach/datasets/audio_datasets/__init__.py +0 -0
- funasr_detach/datasets/audio_datasets/datasets.py +112 -0
- funasr_detach/datasets/audio_datasets/index_ds.py +150 -0
- funasr_detach/datasets/audio_datasets/preprocessor.py +55 -0
- funasr_detach/datasets/audio_datasets/samplers.py +306 -0
- funasr_detach/datasets/audio_datasets/scp2jsonl.py +116 -0
- funasr_detach/download/__init__.py +0 -0
- funasr_detach/download/download_dataset_from_hub.py +19 -0
- funasr_detach/download/download_from_hub.py +231 -0
- funasr_detach/download/file.py +335 -0
- funasr_detach/download/name_maps_from_hub.py +13 -0
- funasr_detach/download/runtime_sdk_download_tool.py +60 -0
- funasr_detach/frontends/__init__.py +0 -0
- funasr_detach/frontends/default.py +347 -0
- funasr_detach/frontends/eend_ola_feature.py +49 -0
- funasr_detach/frontends/fused.py +144 -0
- funasr_detach/frontends/s3prl.py +139 -0
- funasr_detach/frontends/utils/__init__.py +1 -0
- funasr_detach/frontends/utils/beamformer.py +84 -0
- funasr_detach/frontends/utils/complex_utils.py +194 -0
- funasr_detach/frontends/utils/dnn_beamformer.py +173 -0
- funasr_detach/frontends/utils/dnn_wpe.py +93 -0
- funasr_detach/frontends/utils/feature_transform.py +263 -0
- funasr_detach/frontends/utils/frontend.py +151 -0
- funasr_detach/frontends/utils/log_mel.py +83 -0
- funasr_detach/frontends/utils/mask_estimator.py +77 -0
- funasr_detach/frontends/utils/stft.py +239 -0
- funasr_detach/frontends/wav_frontend.py +556 -0
- funasr_detach/frontends/windowing.py +74 -0
- funasr_detach/losses/__init__.py +0 -0
- funasr_detach/losses/label_smoothing_loss.py +125 -0

Diffs:

.gitattributes
@@ -1,4 +1,4 @@
-
-
-
-
+examples filter=lfs diff=lfs merge=lfs -text
+speakers/nezha_prompt.wav filter=lfs diff=lfs merge=lfs -text
+speakers/nezhaRAP_prompt.wav filter=lfs diff=lfs merge=lfs -text
+speakers/nezha哼唱_prompt.wav filter=lfs diff=lfs merge=lfs -text

.gitignore
@@ -0,0 +1,2 @@
+__pycache__/
+output/

README.md
@@ -1 +1,13 @@
-
+---
+title: Step-Audio-EditX
+emoji: 🚀
+colorFrom: red
+colorTo: red
+sdk: gradio
+sdk_version: 5.49.1
+app_file: app.py
+pinned: true
+short_description: Try out Step-Audio-EditX
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
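For local runs outside the Hub, the same `app_file` entry point can be started directly. A minimal sketch, with the flag names and defaults taken from the argparse block in app.py later in this diff (assumption: the Space's Python dependencies are already installed):

import subprocess

# Launch the Gradio demo the way the Hub's `app_file: app.py` setting would,
# spelling out the defaults from app.py's argument parser.
subprocess.run([
    "python", "app.py",
    "--model-source", "huggingface",
    "--server-name", "0.0.0.0",
    "--server-port", "7860",
])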

__init__.py
File without changes

app.py
@@ -0,0 +1,499 @@
+import gradio as gr
+import os
+import argparse
+import torch
+import logging
+import threading
+from datetime import datetime
+import torchaudio
+import librosa
+import soundfile as sf
+
+# ZeroGPU support
+try:
+    import spaces
+    ZEROGPU_AVAILABLE = True
+except ImportError:
+    ZEROGPU_AVAILABLE = False
+    # Create a dummy decorator for non-ZeroGPU environments
+    class spaces:
+        @staticmethod
+        def GPU(duration=10):
+            def decorator(func):
+                return func
+            return decorator
+
+# Project imports
+from tokenizer import StepAudioTokenizer
+from tts import StepAudioTTS
+from model_loader import ModelSource
+from config.edit_config import get_supported_edit_types
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# Global variables for ZeroGPU-optimized loading
+encoder = None
+common_tts_engine = None
+args_global = None
+_model_lock = threading.Lock()  # Thread lock for model initialization
+
+def initialize_models():
+    """Initialize models on first GPU call (ZeroGPU optimization: load inside GPU context)"""
+    global encoder, common_tts_engine, args_global
+
+    # Fast path: check if already initialized (without lock)
+    if common_tts_engine is not None:
+        return  # Already initialized
+
+    # Slow path: acquire lock and double-check
+    with _model_lock:
+        # Double-check pattern: another thread might have initialized while waiting for lock
+        if common_tts_engine is not None:
+            return  # Already initialized by another thread
+
+        if args_global is None:
+            raise RuntimeError("Global args not set. Cannot initialize models.")
+
+        try:
+            logger.info("🚀 Initializing models inside GPU context (first call)...")
+
+            # Determine model source
+            source_mapping = {
+                "auto": ModelSource.AUTO,
+                "local": ModelSource.LOCAL,
+                "modelscope": ModelSource.MODELSCOPE,
+                "huggingface": ModelSource.HUGGINGFACE
+            }
+            model_source = source_mapping[args_global.model_source]
+
+            # Load StepAudioTokenizer (avoid CUDA initialization in main process)
+            encoder = StepAudioTokenizer(
+                os.path.join(args_global.model_path, "Step-Audio-Tokenizer"),
+                model_source=model_source,
+                funasr_model_id=args_global.tokenizer_model_id
+            )
+            logger.info("✓ StepAudioTokenizer loaded")
+
+            # Initialize common TTS engine (avoid CUDA initialization in main process)
+            common_tts_engine = StepAudioTTS(
+                os.path.join(args_global.model_path, "Step-Audio-EditX"),
+                encoder,
+                model_source=model_source,
+                tts_model_id=args_global.tts_model_id
+            )
+            logger.info("✓ StepCommonAudioTTS loaded")
+            print("Models initialized inside GPU context.")
+
+            if ZEROGPU_AVAILABLE:
+                logger.info("💡 Models loaded inside GPU context - ready for inference")
+            else:
+                logger.info("💡 Models loaded - ready for inference")
+
+        except Exception as e:
+            logger.error(f"❌ Error loading models: {e}")
+            raise
+
+def get_model_config():
+    """Get model configuration without initializing GPU models"""
+    if args_global is None:
+        raise RuntimeError("Global args not set. Cannot get model config.")
+
+    return {
+        "encoder_path": os.path.join(args_global.model_path, "Step-Audio-Tokenizer"),
+        "tts_path": os.path.join(args_global.model_path, "Step-Audio-EditX"),
+        "model_source": args_global.model_source,
+        "tokenizer_model_id": args_global.tokenizer_model_id,
+        "tts_model_id": args_global.tts_model_id
+    }
+
+def get_gpu_duration(audio_input, text_input, target_text, task_type, task_info):
+    """Dynamic GPU duration based on whether models need initialization"""
+    global common_tts_engine
+
+    if common_tts_engine is None:
+        # First call - need time for model loading (up to 5 minutes)
+        return 300  # Maximum allowed duration for model initialization
+    else:
+        # Subsequent calls - only inference time needed
+        return 120  # Standard inference duration
+
+@spaces.GPU(duration=get_gpu_duration)  # Dynamic duration based on model state
+def process_audio_with_gpu(audio_input, text_input, target_text, task_type, task_info):
+    """Process audio using GPU (models are loaded inside GPU context to avoid main process errors)"""
+    global common_tts_engine
+
+    # Initialize models if not already loaded (inside GPU context to avoid main process errors)
+    if common_tts_engine is None:
+        print("Initializing common_tts_engine inside GPU context...")
+        logger.info("🎯 GPU allocated for 300s (first call with model loading)...")
+        initialize_models()
+        logger.info("✅ Models loaded successfully inside GPU context")
+    else:
+        print("common_tts_engine already initialized.")
+        logger.info("🎯 GPU allocated for 120s (inference with loaded models)...")
+
+    try:
+        # Use loaded models (first call may include loading time, subsequent calls are fast)
+        if task_type == "clone":
+            output_audio, sr = common_tts_engine.clone(audio_input, text_input, target_text)
+        else:
+            output_audio, sr = common_tts_engine.edit(audio_input, text_input, task_type, task_info, target_text)
+
+        logger.info("✅ Audio processing completed")
+        return output_audio, sr
+
+    except Exception as e:
+        logger.error(f"❌ Audio processing failed: {e}")
+        raise
+    # GPU automatically deallocated when function exits
+
+# Save audio to temporary directory
+def save_audio(audio_type, audio_data, sr, tmp_dir):
+    """Save audio data to a temporary file with timestamp"""
+    current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    save_path = os.path.join(tmp_dir, audio_type, f"{current_time}.wav")
+    os.makedirs(os.path.dirname(save_path), exist_ok=True)
+
+    try:
+        if isinstance(audio_data, torch.Tensor):
+            torchaudio.save(save_path, audio_data, sr)
+        else:
+            sf.write(save_path, audio_data, sr)
+        logger.debug(f"Audio saved to: {save_path}")
+        return save_path
+    except Exception as e:
+        logger.error(f"Failed to save audio: {e}")
+        raise
+
+
+class EditxTab:
+    """Audio editing and voice cloning interface tab"""
+
+    def __init__(self, args):
+        self.args = args
+        self.edit_type_list = list(get_supported_edit_types().keys())
+        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
+
+    def history_messages_to_show(self, messages):
+        """Convert message history to gradio chatbot format"""
+        show_msgs = []
+        for message in messages:
+            edit_type = message['edit_type']
+            edit_info = message['edit_info']
+            source_text = message['source_text']
+            target_text = message['target_text']
+            raw_audio_part = message['raw_wave']
+            edit_audio_part = message['edit_wave']
+            type_str = f"{edit_type}-{edit_info}" if edit_info is not None else f"{edit_type}"
+            show_msgs.extend([
+                {"role": "user", "content": f"Task type: {type_str}\nText: {source_text}"},
+                {"role": "user", "content": gr.Audio(value=raw_audio_part, interactive=False)},
+                {"role": "assistant", "content": f"Output audio:\nText: {target_text}"},
+                {"role": "assistant", "content": gr.Audio(value=edit_audio_part, interactive=False)}
+            ])
+        return show_msgs
+
+    def generate_clone(self, prompt_text_input, prompt_audio_input, generated_text, edit_type, edit_info, state):
+        """Generate cloned audio (models are loaded on first GPU call)"""
+        self.logger.info("Starting voice cloning process")
+        state['history_audio'] = []
+        state['history_messages'] = []
+
+        # Input validation
+        if not prompt_text_input or prompt_text_input.strip() == "":
+            error_msg = "[Error] Uploaded text cannot be empty."
+            self.logger.error(error_msg)
+            return [{"role": "user", "content": error_msg}], state
+        if not prompt_audio_input:
+            error_msg = "[Error] Uploaded audio cannot be empty."
+            self.logger.error(error_msg)
+            return [{"role": "user", "content": error_msg}], state
+        if not generated_text or generated_text.strip() == "":
+            error_msg = "[Error] Clone content cannot be empty."
+            self.logger.error(error_msg)
+            return [{"role": "user", "content": error_msg}], state
+        if edit_type != "clone":
+            error_msg = "[Error] CLONE button must use clone task."
+            self.logger.error(error_msg)
+            return [{"role": "user", "content": error_msg}], state
+
+        try:
+            # Use GPU inference with models loaded inside GPU context
+            output_audio, output_sr = process_audio_with_gpu(
+                prompt_audio_input, prompt_text_input, generated_text, "clone", edit_info
+            )
+
+            if output_audio is not None and output_sr is not None:
+                # Convert tensor to numpy if needed
+                if isinstance(output_audio, torch.Tensor):
+                    audio_numpy = output_audio.cpu().numpy().squeeze()
+                else:
+                    audio_numpy = output_audio
+
+                # Load original audio for comparison
+                input_audio_data_numpy, input_sample_rate = librosa.load(prompt_audio_input)
+
+                # Create message for history
+                cur_assistant_msg = {
+                    "edit_type": edit_type,
+                    "edit_info": edit_info,
+                    "source_text": prompt_text_input,
+                    "target_text": generated_text,
+                    "raw_wave": (input_sample_rate, input_audio_data_numpy),
+                    "edit_wave": (output_sr, audio_numpy),
+                }
+                state["history_audio"].append((output_sr, audio_numpy, generated_text))
+                state["history_messages"].append(cur_assistant_msg)
+
+                show_msgs = self.history_messages_to_show(state["history_messages"])
+                self.logger.info("Voice cloning completed successfully")
+                return show_msgs, state
+            else:
+                error_msg = "[Error] Clone failed"
+                self.logger.error(error_msg)
+                return [{"role": "user", "content": error_msg}], state
+
+        except Exception as e:
+            error_msg = f"[Error] Clone failed: {str(e)}"
+            self.logger.error(error_msg)
+            return [{"role": "user", "content": error_msg}], state
+
+    def generate_edit(self, prompt_text_input, prompt_audio_input, generated_text, edit_type, edit_info, state):
+        """Generate edited audio (models are loaded on first GPU call)"""
+        self.logger.info("Starting audio editing process")
+
+        # Input validation
+        if not prompt_text_input or prompt_text_input.strip() == "":
+            error_msg = "[Error] Uploaded text cannot be empty."
+            self.logger.error(error_msg)
+            return [{"role": "user", "content": error_msg}], state
+        if not prompt_audio_input:
+            error_msg = "[Error] Uploaded audio cannot be empty."
+            self.logger.error(error_msg)
+            return [{"role": "user", "content": error_msg}], state
+
+        try:
+            # Determine which audio to use
+            if len(state["history_audio"]) == 0:
+                # First edit - use uploaded audio
+                audio_to_edit = prompt_audio_input
+                text_to_use = prompt_text_input
+                self.logger.debug("Using prompt audio, no history found")
+            else:
+                # Use previous edited audio - save it to temp file first
+                sample_rate, audio_numpy, previous_text = state["history_audio"][-1]
+                temp_path = save_audio("temp", audio_numpy, sample_rate, self.args.tmp_dir)
+                audio_to_edit = temp_path
+                text_to_use = previous_text
+                self.logger.debug(f"Using previous audio from history, count: {len(state['history_audio'])}")
+
+            # For para-linguistic, use generated_text; otherwise use source text
+            if edit_type not in {"para-linguistic"}:
+                generated_text = text_to_use
+
+            # Use GPU inference with models loaded inside GPU context
+            output_audio, output_sr = process_audio_with_gpu(
+                audio_to_edit, text_to_use, generated_text, edit_type, edit_info
+            )
+
+            if output_audio is not None and output_sr is not None:
+                # Convert tensor to numpy if needed
+                if isinstance(output_audio, torch.Tensor):
+                    audio_numpy = output_audio.cpu().numpy().squeeze()
+                else:
+                    audio_numpy = output_audio
+
+                # Load original audio for comparison
+                if len(state["history_audio"]) == 0:
+                    input_audio_data_numpy, input_sample_rate = librosa.load(prompt_audio_input)
+                else:
+                    input_sample_rate, input_audio_data_numpy, _ = state["history_audio"][-1]
+
+                # Create message for history
+                cur_assistant_msg = {
+                    "edit_type": edit_type,
+                    "edit_info": edit_info,
+                    "source_text": text_to_use,
+                    "target_text": generated_text,
+                    "raw_wave": (input_sample_rate, input_audio_data_numpy),
+                    "edit_wave": (output_sr, audio_numpy),
+                }
+                state["history_audio"].append((output_sr, audio_numpy, generated_text))
+                state["history_messages"].append(cur_assistant_msg)
+
+                show_msgs = self.history_messages_to_show(state["history_messages"])
+                self.logger.info("Audio editing completed successfully")
+                return show_msgs, state
+            else:
+                error_msg = "[Error] Edit failed"
+                self.logger.error(error_msg)
+                return [{"role": "user", "content": error_msg}], state
+
+        except Exception as e:
+            error_msg = f"[Error] Edit failed: {str(e)}"
+            self.logger.error(error_msg)
+            return [{"role": "user", "content": error_msg}], state
+
+    def clear_history(self, state):
+        """Clear conversation history"""
+        state["history_messages"] = []
+        state["history_audio"] = []
+        return [], state
+
+    def init_state(self):
+        """Initialize conversation state"""
+        return {
+            "history_messages": [],
+            "history_audio": []
+        }
+
+    def register_components(self):
+        """Register gradio components - maintaining exact layout from original"""
+        with gr.Tab("Editx"):
+            with gr.Row():
+                with gr.Column():
+                    self.model_input = gr.Textbox(label="Model Name", value="Step-Audio-EditX", scale=1)
+                    self.prompt_text_input = gr.Textbox(label="Audio Text Content", value="", scale=1)
+                    self.prompt_audio_input = gr.Audio(
+                        sources=["upload", "microphone"],
+                        format="wav",
+                        type="filepath",
+                        label="Input Audio",
+                    )
+                    self.generated_text = gr.Textbox(label="Clone Text", lines=1, max_lines=200)
+                    with gr.Row():
+                        self.button_tts = gr.Button("CLONE")
+                        self.button_edit = gr.Button("EDIT")
+
+                with gr.Column():
+                    with gr.Row():
+                        self.edit_type = gr.Dropdown(label="Task", choices=self.edit_type_list, value="clone")
+                        self.edit_info = gr.Dropdown(label="Sub-task", choices=[], value=None)
+                    self.chat_box = gr.Chatbot(label="History", type="messages", height=480*1)
+                    self.clean_history_submit = gr.Button("Clear History")
+
+            gr.Markdown("---")
+            gr.Markdown("""
+            **Button Description:**
+            - CLONE: Synthesizes audio from the uploaded audio and text; only used for clone mode, and clears the history when used.
+            - EDIT: Edits the uploaded audio, or stacks further edit effects on the previous round of generated audio.
+            """)
+            gr.Markdown("""
+            **Operation Workflow:**
+            - Upload the audio to be edited on the left side and fill in the corresponding text content of the audio;
+            - If the task requires modifying text content (such as clone, para-linguistic), fill in the text to be synthesized in the "Clone Text" field. For all other tasks, keep the uploaded audio text content unchanged;
+            - Select tasks and sub-tasks on the right side (some tasks, such as vad, have no sub-tasks);
+            - Click the "CLONE" or "EDIT" button on the left side, and audio will be generated in the dialog box on the right side.
+            """)
+            gr.Markdown("""
+            **Para-linguistic Description:**
+            - Supported tags include: [Breathing] [Laughter] [Cough] [Sigh] [Confirmation-en] [Question-en] [Question-ah] [Question-oh] [Surprise-ah] [Surprise-oh] [Dissatisfaction-hnn] [Uhm] [Shh] [Crying] [Surprise-wa] [Surprise-yo] [Question-ei] [Question-yi]
+            - Example:
+              - Fill in the "Clone Text" field: "Great, the weather is so nice today." Click the "CLONE" button to get audio.
+              - Change the "Clone Text" field to: "Great[Laughter], the weather is so nice today[Surprise-ah]." Click the "EDIT" button to get para-linguistic audio.
+            """)
+
+    def register_events(self):
+        """Register event handlers"""
+        # Create independent state for each session
+        state = gr.State(self.init_state())
+
+        self.button_tts.click(self.generate_clone,
+                              inputs=[self.prompt_text_input, self.prompt_audio_input, self.generated_text, self.edit_type, self.edit_info, state],
+                              outputs=[self.chat_box, state])
+        self.button_edit.click(self.generate_edit,
+                               inputs=[self.prompt_text_input, self.prompt_audio_input, self.generated_text, self.edit_type, self.edit_info, state],
+                               outputs=[self.chat_box, state])
+
+        self.clean_history_submit.click(self.clear_history, inputs=[state], outputs=[self.chat_box, state])
+        self.edit_type.change(
+            fn=self.update_edit_info,
+            inputs=self.edit_type,
+            outputs=self.edit_info,
+        )
+
+    def update_edit_info(self, category):
+        """Update sub-task dropdown based on main task selection"""
+        category_items = get_supported_edit_types()
+        choices = category_items.get(category, [])
+        value = None if len(choices) == 0 else choices[0]
+        return gr.Dropdown(label="Sub-task", choices=choices, value=value)
+
+
+def launch_demo(args, editx_tab):
+    """Launch the gradio demo"""
+    with gr.Blocks(title="🎙️ Step-Audio-EditX") as demo:
+        gr.Markdown("## 🎙️ Step-Audio-EditX")
+        gr.Markdown("Audio editing and voice cloning using the Step-Audio-Edit model.")
+
+        # Register components
+        editx_tab.register_components()
+
+        # Register events
+        editx_tab.register_events()
+
+    # Launch demo
+    demo.queue().launch(
+        server_name=args.server_name,
+        server_port=args.server_port,
+        share=args.share if hasattr(args, 'share') else False
+    )
+
+
+if __name__ == "__main__":
+    # Parse command line arguments
+    parser = argparse.ArgumentParser(description="Step-Audio Edit Demo")
+    parser.add_argument("--model-path", type=str, default="stepfun-ai", help="Model path.")
+    parser.add_argument("--server-name", type=str, default="0.0.0.0", help="Demo server name.")
+    parser.add_argument("--server-port", type=int, default=7860, help="Demo server port.")
+    parser.add_argument("--tmp-dir", type=str, default="/tmp/gradio", help="Save path.")
+    parser.add_argument("--share", action="store_true", help="Share gradio app.")
+
+    # Multi-source loading support parameters
+    parser.add_argument(
+        "--model-source",
+        type=str,
+        default="huggingface",
+        choices=["auto", "local", "modelscope", "huggingface"],
+        help="Model source: auto (detect automatically), local, modelscope, or huggingface"
+    )
+    parser.add_argument(
+        "--tokenizer-model-id",
+        type=str,
+        default="dengcunqin/speech_paraformer-large_asr_nat-zh-cantonese-en-16k-vocab8501-online",
+        help="Tokenizer model ID for online loading"
+    )
+    parser.add_argument(
+        "--tts-model-id",
+        type=str,
+        default=None,
+        help="TTS model ID for online loading (if different from model-path)"
+    )
+
+    args = parser.parse_args()
+
+    # Store args globally for model configuration
+    args_global = args
+
+    logger.info("Configuration loaded:")
+    logger.info(f"Model source: {args.model_source}")
+    logger.info(f"Model path: {args.model_path}")
+    logger.info(f"Tokenizer model ID: {args.tokenizer_model_id}")
+    if args.tts_model_id:
+        logger.info(f"TTS model ID: {args.tts_model_id}")
+
+    # Models will be initialized on first GPU call to avoid ZeroGPU main process errors
+
+    if ZEROGPU_AVAILABLE:
+        logger.info("🎉 ZeroGPU detected - using dynamic GPU duration management!")
+        logger.info("💡 First call: 300s (model loading), subsequent calls: 120s (inference only)")
+    else:
+        logger.info("💻 Running in local mode - models will be loaded on first call")

+    # Create EditxTab instance
+    editx_tab = EditxTab(args)
+
+    # Launch demo
+    launch_demo(args, editx_tab)
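The serving pattern above is the load-bearing piece of this commit: a ZeroGPU import fallback, lazily initialized lock-guarded global models, and a callable GPU-time budget. A minimal standalone sketch of the same pattern, with a hypothetical load_model() standing in for the StepAudioTokenizer/StepAudioTTS loaders:

import threading

# ZeroGPU fallback: off the Hub the `spaces` package is absent, so a no-op
# decorator with the same call shape stands in for spaces.GPU.
try:
    import spaces
except ImportError:
    class spaces:
        @staticmethod
        def GPU(duration=10):
            def decorator(func):
                return func
            return decorator

_model = None
_lock = threading.Lock()

def load_model():
    # Hypothetical stand-in for the heavy StepAudioTokenizer/StepAudioTTS setup.
    return object()

def get_model():
    global _model
    if _model is not None:        # fast path: no lock once initialized
        return _model
    with _lock:                   # slow path: double-checked locking
        if _model is None:
            _model = load_model()
    return _model

def gpu_seconds(*args):
    # Larger budget for the first call, which pays the model-loading cost.
    return 300 if _model is None else 120

@spaces.GPU(duration=gpu_seconds)
def infer(x):
    return get_model(), x         # models materialize inside the GPU context

The double-checked lock keeps concurrent Gradio requests from loading the models twice, and the duration callable mirrors get_gpu_duration above: a long first allocation that covers loading, short allocations afterwards.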

config/__init__.py
@@ -0,0 +1,12 @@
+"""
+Configuration module for Step-Audio
+"""
+
+from .prompts import TTS_SYSTEM_PROMPTS, AUDIO_EDIT_SYSTEM_PROMPT
+from .edit_config import get_supported_edit_types
+
+__all__ = [
+    'TTS_SYSTEM_PROMPTS',
+    'AUDIO_EDIT_SYSTEM_PROMPT',
+    'get_supported_edit_types'
+]
config/edit_config.py
@@ -0,0 +1,33 @@
+"""
+Audio editing configuration module.
+Contains the supported edit types and related configuration.
+"""
+
+def get_supported_edit_types():
+    """
+    Get the supported edit types and their options.
+
+    Returns:
+        Dict[str, list]: Dictionary of edit types and their options
+    """
+    return {
+        "clone": [],
+        "emotion": [
+            'happy', 'angry', 'sad', 'humour', 'confusion', 'disgusted',
+            'empathy', 'embarrass', 'fear', 'surprised', 'excited',
+            'depressed', 'coldness', 'admiration'
+        ],
+        "style": [
+            'serious', 'arrogant', 'child', 'older', 'girl', 'pure',
+            'sister', 'sweet', 'ethereal', 'whisper', 'gentle', 'recite',
+            'generous', 'act_coy', 'warm', 'shy', 'comfort', 'authority',
+            'chat', 'radio', 'soulful', 'story', 'vivid', 'program',
+            'news', 'advertising', 'roar', 'murmur', 'shout', 'deeply', 'loudly'
+        ],
+        "vad": [],
+        "music": [],
+        "denoise": [],
+        "para-linguistic": [],
+        "speed": ["faster", "slower", "more faster", "more slower"],
+        "animal": [],
+    }
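A short usage sketch of how the UI consumes this mapping (mirroring update_edit_info in app.py; the printed values come straight from the dictionary above):

from config.edit_config import get_supported_edit_types

edit_types = get_supported_edit_types()

# Main "Task" dropdown choices:
print(list(edit_types.keys()))
# ['clone', 'emotion', 'style', 'vad', 'music', 'denoise',
#  'para-linguistic', 'speed', 'animal']

# "Sub-task" dropdown for a selected task; tasks like "vad" have no sub-tasks:
print(edit_types.get("speed", []))  # ['faster', 'slower', 'more faster', 'more slower']
print(edit_types.get("vad", []))    # []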
config/prompts.py
@@ -0,0 +1,62 @@
+"""
+System prompt configuration module.
+Contains all TTS- and editing-related system prompts.
+"""
+
+# TTS-related system prompts
+TTS_SYSTEM_PROMPTS = {
+    "sys_prompt_for_rap": "请参考对话历史里的音色,用RAP方式将文本内容大声说唱出来。",
+    "sys_prompt_for_vocal": "请参考对话历史里的音色,用哼唱的方式将文本内容大声唱出来。",
+    "sys_prompt_wo_spk": '以自然的语速读出下面的文字。',
+    "sys_prompt_with_spk": '请用{}的声音尽可能自然地说出下面这些话。',
+}
+
+# Audio editing system prompt
+AUDIO_EDIT_SYSTEM_PROMPT = """As a highly skilled audio editing and tuning specialist, you excel at interpreting user instructions and applying precise adjustments to audio files according to their needs. Your expertise spans a wide range of audio enhancement capabilities, including but not limited to the following:
+
+# Emotional Enhancement of Speech:
+You are capable of infusing speech with various emotions such as:
+- happy
+- angry
+- sad
+- fear
+- disgusted
+- surprised
+- excited
+
+# Speech Style Transfer:
+You can adapt vocal delivery to diverse styles including:
+- Whisper
+- Coquettish
+- Gentle
+- Sweet
+- Arrogant
+- Innocent
+- Radio Host
+- Childlike
+- Bold and Unconstrained
+- Serious
+- Expressive and Vivid
+- Ethereal
+- Exaggerated
+- Recitation
+- Girlish
+- News Broadcast
+- Mature Female Voice
+- Middle-Aged or Elderly
+- Program Hosting
+
+# Paralinguistic Adjustments:
+You can fine-tune non-verbal speech elements such as:
+- Laughter Enhancement
+- Emphatic Stress
+- Rhythm and Pace Modulation
+
+# Audio Tuning & Editing:
+Your technical proficiency includes:
+- Noise Reduction
+- Background Music Removal
+- Silence Trimming
+- Speaker Extraction
+
+Note: Users will provide instructions in natural language. You are expected to accurately interpret their requirements and perform the most suitable audio edits and enhancements."""
funasr_detach/__init__.py
@@ -0,0 +1,38 @@
+"""Initialize funasr package."""
+
+import os
+import pkgutil
+import importlib
+
+dirname = os.path.dirname(__file__)
+version_file = os.path.join(dirname, "version.txt")
+with open(version_file, "r") as f:
+    __version__ = f.read().strip()
+
+
+import importlib
+import pkgutil
+
+
+def import_submodules(package, recursive=True):
+    if isinstance(package, str):
+        package = importlib.import_module(package)
+    results = {}
+    for loader, name, is_pkg in pkgutil.walk_packages(
+        package.__path__, package.__name__ + "."
+    ):
+        try:
+            results[name] = importlib.import_module(name)
+        except Exception as e:
+            # To see details of import failures, uncomment the line below.
+            # print(f"Failed to import {name}: {e}")
+            pass
+        if recursive and is_pkg:
+            results.update(import_submodules(name))
+    return results
+
+
+import_submodules(__name__)
+
+from funasr_detach.auto.auto_model import AutoModel
+from funasr_detach.auto.auto_frontend import AutoFrontend
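A brief consumption sketch: the eager import_submodules(__name__) walk above imports every submodule so that model and frontend classes register themselves in funasr_detach.register.tables before the two re-exports at the bottom. The constructor arguments below are hypothetical placeholders:

# Importing the package runs import_submodules(__name__) as a side effect,
# which populates the registry tables that AutoModel looks classes up in.
from funasr_detach import AutoModel

# Hypothetical construction: AutoModel.__init__ asserts a "model" kwarg and
# resolves it locally or from a model hub via download_model.
# model = AutoModel(model="<model-id-or-local-path>", device="cpu")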

funasr_detach/auto/__init__.py
File without changes

funasr_detach/auto/auto_frontend.py
@@ -0,0 +1,90 @@
+import time
+import logging
+from tqdm import tqdm
+
+from funasr_detach.register import tables
+from funasr_detach.download.download_from_hub import download_model
+from funasr_detach.utils.load_utils import load_audio_text_image_video, extract_fbank
+from funasr_detach.auto.auto_model import prepare_data_iterator
+from funasr_detach.auto.auto_model import prepare_data_iterator
+
+
+class AutoFrontend:
+    def __init__(self, **kwargs):
+        assert "model" in kwargs
+        if "model_conf" not in kwargs:
+            logging.info(
+                "download models from model hub: {}".format(
+                    kwargs.get("model_hub", "ms")
+                )
+            )
+            kwargs = download_model(**kwargs)
+
+        # build frontend
+        frontend = kwargs.get("frontend", None)
+        if frontend is not None:
+            frontend_class = tables.frontend_classes.get(frontend)
+            frontend = frontend_class(**kwargs["frontend_conf"])
+
+        self.frontend = frontend
+        if "frontend" in kwargs:
+            del kwargs["frontend"]
+        self.kwargs = kwargs
+
+    def __call__(self, input, input_len=None, kwargs=None, **cfg):
+
+        kwargs = self.kwargs if kwargs is None else kwargs
+        kwargs.update(cfg)
+
+        key_list, data_list = prepare_data_iterator(input, input_len=input_len)
+        batch_size = kwargs.get("batch_size", 1)
+        device = kwargs.get("device", "cpu")
+        if device == "cpu":
+            batch_size = 1
+
+        meta_data = {}
+
+        result_list = []
+        num_samples = len(data_list)
+        pbar = tqdm(colour="blue", total=num_samples + 1, dynamic_ncols=True)
+
+        time0 = time.perf_counter()
+        for beg_idx in range(0, num_samples, batch_size):
+            end_idx = min(num_samples, beg_idx + batch_size)
+            data_batch = data_list[beg_idx:end_idx]
+            key_batch = key_list[beg_idx:end_idx]
+
+            # extract fbank feats
+            time1 = time.perf_counter()
+            audio_sample_list = load_audio_text_image_video(
+                data_batch, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000)
+            )
+            time2 = time.perf_counter()
+            meta_data["load_data"] = f"{time2 - time1:0.3f}"
+            speech, speech_lengths = extract_fbank(
+                audio_sample_list,
+                data_type=kwargs.get("data_type", "sound"),
+                frontend=self.frontend,
+                **kwargs,
+            )
+            time3 = time.perf_counter()
+            meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
+            meta_data["batch_data_time"] = (
+                speech_lengths.sum().item()
+                * self.frontend.frame_shift
+                * self.frontend.lfr_n
+                / 1000
+            )
+
+            speech.to(device=device), speech_lengths.to(device=device)
+            batch = {"input": speech, "input_len": speech_lengths, "key": key_batch}
+            result_list.append(batch)
+
+            pbar.update(1)
+            description = f"{meta_data}, "
+            pbar.set_description(description)
+
+        time_end = time.perf_counter()
+        pbar.set_description(f"time elapsed total: {time_end - time0:0.3f}")
+
+        return result_list
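One hedged note on the batch-assembly step above: Tensor.to() returns a new tensor rather than moving its operand in place, so the bare tuple expression `speech.to(device=device), speech_lengths.to(device=device)` discards both results, and the batch keeps whatever device the tensors were already on. A minimal corrected sketch of that step, with stand-in tensors:

import torch

# Stand-ins for one batch of fbank features and their lengths.
speech = torch.zeros(1, 100, 80)
speech_lengths = torch.tensor([100])
device = "cpu"

# Rebind to keep the moved tensors; without the assignment the move is lost.
speech = speech.to(device=device)
speech_lengths = speech_lengths.to(device=device)

batch = {"input": speech, "input_len": speech_lengths, "key": ["rand_key_example"]}
print(batch["input"].device)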

funasr_detach/auto/auto_model.py
@@ -0,0 +1,575 @@
+import json
+import time
+import copy
+import torch
+import random
+import string
+import logging
+import os.path
+import numpy as np
+from tqdm import tqdm
+
+from funasr_detach.register import tables
+from funasr_detach.utils.load_utils import load_bytes
+from funasr_detach.download.file import download_from_url
+from funasr_detach.download.download_from_hub import download_model
+from funasr_detach.utils.vad_utils import slice_padding_audio_samples
+from funasr_detach.train_utils.set_all_random_seed import set_all_random_seed
+from funasr_detach.train_utils.load_pretrained_model import load_pretrained_model
+from funasr_detach.utils.load_utils import load_audio_text_image_video
+from funasr_detach.utils.timestamp_tools import timestamp_sentence
+from funasr_detach.models.campplus.utils import sv_chunk, postprocess, distribute_spk
+
+try:
+    from funasr_detach.models.campplus.cluster_backend import ClusterBackend
+except:
+    print("If you want to use the speaker diarization, please `pip install hdbscan`")
+
+
+def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None):
+    """
+
+    :param input:
+    :param input_len:
+    :param data_type:
+    :param frontend:
+    :return:
+    """
+    data_list = []
+    key_list = []
+    filelist = [".scp", ".txt", ".json", ".jsonl"]
+
+    chars = string.ascii_letters + string.digits
+    if isinstance(data_in, str) and data_in.startswith("http"):  # url
+        data_in = download_from_url(data_in)
+    if isinstance(data_in, str) and os.path.exists(
+        data_in
+    ):  # wav_path; filelist: wav.scp, file.jsonl;text.txt;
+        _, file_extension = os.path.splitext(data_in)
+        file_extension = file_extension.lower()
+        if file_extension in filelist:  # filelist: wav.scp, file.jsonl;text.txt;
+            with open(data_in, encoding="utf-8") as fin:
+                for line in fin:
+                    key = "rand_key_" + "".join(random.choice(chars) for _ in range(13))
+                    if data_in.endswith(
+                        ".jsonl"
+                    ):  # file.jsonl: json.dumps({"source": data})
+                        lines = json.loads(line.strip())
+                        data = lines["source"]
+                        key = data["key"] if "key" in data else key
+                    else:  # filelist, wav.scp, text.txt: id \t data or data
+                        lines = line.strip().split(maxsplit=1)
+                        data = lines[1] if len(lines) > 1 else lines[0]
+                        key = lines[0] if len(lines) > 1 else key
+
+                    data_list.append(data)
+                    key_list.append(key)
+        else:
+            key = "rand_key_" + "".join(random.choice(chars) for _ in range(13))
+            data_list = [data_in]
+            key_list = [key]
+    elif isinstance(data_in, (list, tuple)):
+        if data_type is not None and isinstance(
+            data_type, (list, tuple)
+        ):  # multiple inputs
+            data_list_tmp = []
+            for data_in_i, data_type_i in zip(data_in, data_type):
+                key_list, data_list_i = prepare_data_iterator(
+                    data_in=data_in_i, data_type=data_type_i
+                )
+                data_list_tmp.append(data_list_i)
+            data_list = []
+            for item in zip(*data_list_tmp):
+                data_list.append(item)
+        else:
+            # [audio sample point, fbank, text]
+            data_list = data_in
+            key_list = [
+                "rand_key_" + "".join(random.choice(chars) for _ in range(13))
+                for _ in range(len(data_in))
+            ]
+    else:  # raw text; audio sample point, fbank; bytes
+        if isinstance(data_in, bytes):  # audio bytes
+            data_in = load_bytes(data_in)
+        if key is None:
+            key = "rand_key_" + "".join(random.choice(chars) for _ in range(13))
+        data_list = [data_in]
+        key_list = [key]
+
+    return key_list, data_list
+
+
+class AutoModel:
+
+    def __init__(self, **kwargs):
+        if not kwargs.get("disable_log", False):
+            tables.print()
+
+        model, kwargs = self.build_model(**kwargs)
+
+        # if vad_model is not None, build vad model else None
+        vad_model = kwargs.get("vad_model", None)
+        vad_kwargs = kwargs.get("vad_model_revision", None)
+        if vad_model is not None:
+            logging.info("Building VAD model.")
+            vad_kwargs = {
+                "model": vad_model,
+                "model_revision": vad_kwargs,
+                "device": kwargs["device"],
+            }
+            vad_model, vad_kwargs = self.build_model(**vad_kwargs)
+
+        # if punc_model is not None, build punc model else None
+        punc_model = kwargs.get("punc_model", None)
+        punc_kwargs = kwargs.get("punc_model_revision", None)
+        if punc_model is not None:
+            logging.info("Building punc model.")
+            punc_kwargs = {
+                "model": punc_model,
+                "model_revision": punc_kwargs,
+                "device": kwargs["device"],
+            }
+            punc_model, punc_kwargs = self.build_model(**punc_kwargs)
+
+        # if spk_model is not None, build spk model else None
+        spk_model = kwargs.get("spk_model", None)
+        spk_kwargs = kwargs.get("spk_model_revision", None)
+        if spk_model is not None:
+            logging.info("Building SPK model.")
+            spk_kwargs = {
+                "model": spk_model,
+                "model_revision": spk_kwargs,
+                "device": kwargs["device"],
+            }
+            spk_model, spk_kwargs = self.build_model(**spk_kwargs)
+            self.cb_model = ClusterBackend().to(kwargs["device"])
+            spk_mode = kwargs.get("spk_mode", "punc_segment")
+            if spk_mode not in ["default", "vad_segment", "punc_segment"]:
+                logging.error(
+                    "spk_mode should be one of default, vad_segment and punc_segment."
+                )
+            self.spk_mode = spk_mode
+
+        self.kwargs = kwargs
+        self.model = model
+        self.vad_model = vad_model
+        self.vad_kwargs = vad_kwargs
+        self.punc_model = punc_model
+        self.punc_kwargs = punc_kwargs
+        self.spk_model = spk_model
+        self.spk_kwargs = spk_kwargs
+        self.model_path = kwargs.get("model_path")
+        self.repo_path = kwargs.get("repo_path")
+
+
+    def build_model(self, **kwargs):
+        assert "model" in kwargs
+        if "model_conf" not in kwargs:
+            logging.info(
+                "download models from model hub: {}".format(
+                    kwargs.get("model_hub", "ms")
+                )
+            )
+            kwargs = download_model(**kwargs)
+
+        set_all_random_seed(kwargs.get("seed", 0))
+
+        device = kwargs.get("device", "cuda")
+        if not torch.cuda.is_available() or kwargs.get("ngpu", 1) == 0:
+            device = "cpu"
+            kwargs["batch_size"] = 1
+        kwargs["device"] = device
+
+        if kwargs.get("ncpu", None):
+            torch.set_num_threads(kwargs.get("ncpu"))
+
+        # build tokenizer
+        tokenizer = kwargs.get("tokenizer", None)
+        if tokenizer is not None:
+            tokenizer_class = tables.tokenizer_classes.get(tokenizer)
+            tokenizer = tokenizer_class(**kwargs["tokenizer_conf"])
+            kwargs["tokenizer"] = tokenizer
+            kwargs["token_list"] = tokenizer.token_list
+            vocab_size = len(tokenizer.token_list)
+        else:
+            vocab_size = -1
+
+        # build frontend
+        frontend = kwargs.get("frontend", None)
+        if frontend is not None:
+            frontend_class = tables.frontend_classes.get(frontend)
+            frontend = frontend_class(**kwargs["frontend_conf"])
+            kwargs["frontend"] = frontend
+            kwargs["input_size"] = frontend.output_size()
+
+        # build model
+        model_class = tables.model_classes.get(kwargs["model"])
+        model = model_class(**kwargs, **kwargs["model_conf"], vocab_size=vocab_size)
+
+        model.to(device)
+
+        # init_param
+        init_param = kwargs.get("init_param", None)
+        if init_param is not None:
+            logging.info(f"Loading pretrained params from {init_param}")
+            load_pretrained_model(
+                model=model,
+                path=init_param,
+                ignore_init_mismatch=kwargs.get("ignore_init_mismatch", False),
+                oss_bucket=kwargs.get("oss_bucket", None),
+                scope_map=kwargs.get("scope_map", None),
+                excludes=kwargs.get("excludes", None),
+            )
+
+        return model, kwargs
+
+    def __call__(self, *args, **cfg):
+        kwargs = self.kwargs
+        kwargs.update(cfg)
+        res = self.model(*args, **kwargs)
+        return res
+
+    def generate(self, input, input_len=None, **cfg):
+        if self.vad_model is None:
+            return self.inference(input, input_len=input_len, **cfg)
+
+        else:
+            return self.inference_with_vad(input, input_len=input_len, **cfg)
+
+    def inference(
+        self, input, input_len=None, model=None, kwargs=None, key=None, **cfg
+    ):
+        kwargs = self.kwargs if kwargs is None else kwargs
+        kwargs.update(cfg)
+        model = self.model if model is None else model
+        model = model.cuda()
+        model.eval()
+
+        batch_size = kwargs.get("batch_size", 1)
+        # if kwargs.get("device", "cpu") == "cpu":
+        #     batch_size = 1
+
+        key_list, data_list = prepare_data_iterator(
+            input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key
+        )
+
+        speed_stats = {}
+        asr_result_list = []
+        num_samples = len(data_list)
+        disable_pbar = kwargs.get("disable_pbar", False)
+        pbar = (
+            tqdm(colour="blue", total=num_samples, dynamic_ncols=True)
+            if not disable_pbar
+            else None
+        )
+        time_speech_total = 0.0
+        time_escape_total = 0.0
+        for beg_idx in range(0, num_samples, batch_size):
+            end_idx = min(num_samples, beg_idx + batch_size)
+            data_batch = data_list[beg_idx:end_idx]
+            key_batch = key_list[beg_idx:end_idx]
+            batch = {"data_in": data_batch, "key": key_batch}
+            if (end_idx - beg_idx) == 1 and kwargs.get(
+                "data_type", None
+            ) == "fbank":  # fbank
+                batch["data_in"] = data_batch[0]
+                batch["data_lengths"] = input_len
+
+            time1 = time.perf_counter()
+            with torch.no_grad():
+                results, meta_data = model.inference(**batch, **kwargs)
+            time2 = time.perf_counter()
+
+            asr_result_list.extend(results)
+
+            # batch_data_time = time_per_frame_s * data_batch_i["speech_lengths"].sum().item()
+            batch_data_time = meta_data.get("batch_data_time", -1)
+            time_escape = time2 - time1
+            speed_stats["load_data"] = meta_data.get("load_data", 0.0)
+            speed_stats["extract_feat"] = meta_data.get("extract_feat", 0.0)
+            speed_stats["forward"] = f"{time_escape:0.3f}"
+            speed_stats["batch_size"] = f"{len(results)}"
+            speed_stats["time_cost"] = f"{(time_escape)}"
+            speed_stats["rtf"] = f"{(time_escape) / batch_data_time:0.3f}"
+            description = f"{speed_stats}, "
+            if pbar:
+                pbar.update(1)
+                pbar.set_description(description)
+            time_speech_total += batch_data_time
+            time_escape_total += time_escape
+
+        if pbar:
+            # pbar.update(1)
+            pbar.set_description(f"rtf_avg: {time_escape_total/time_speech_total:0.3f}")
+        torch.cuda.empty_cache()
+        return asr_result_list
+
+    def inference_with_vad(self, input, input_len=None, **cfg):
+
+        # step.1: compute the vad model
+        self.vad_kwargs.update(cfg)
+        beg_vad = time.time()
+        res = self.inference(
+            input,
+            input_len=input_len,
+            model=self.vad_model,
+            kwargs=self.vad_kwargs,
+            **cfg,
+        )
+        end_vad = time.time()
+        print(f"time cost vad: {end_vad - beg_vad:0.3f}")
+
+        # step.2 compute asr model
+        model = self.model
+        kwargs = self.kwargs
+        kwargs.update(cfg)
+        batch_size = int(kwargs.get("batch_size_s", 300)) * 1000
+        batch_size_threshold_ms = int(kwargs.get("batch_size_threshold_s", 60)) * 1000
+        kwargs["batch_size"] = batch_size
+
+        key_list, data_list = prepare_data_iterator(
+            input, input_len=input_len, data_type=kwargs.get("data_type", None)
+        )
+        results_ret_list = []
+        time_speech_total_all_samples = 1e-6
+
+        beg_total = time.time()
+        pbar_total = tqdm(colour="red", total=len(res), dynamic_ncols=True)
+        for i in range(len(res)):
+            key = res[i]["key"]
+            vadsegments = res[i]["value"]
+            input_i = data_list[i]
+            speech = load_audio_text_image_video(
+                input_i, fs=kwargs["frontend"].fs, audio_fs=kwargs.get("fs", 16000)
+            )
+            speech_lengths = len(speech)
+            n = len(vadsegments)
+            data_with_index = [(vadsegments[i], i) for i in range(n)]
+            sorted_data = sorted(data_with_index, key=lambda x: x[0][1] - x[0][0])
+            results_sorted = []
+
+            if not len(sorted_data):
+                logging.info("decoding, utt: {}, empty speech".format(key))
+                continue
+
+            if len(sorted_data) > 0 and len(sorted_data[0]) > 0:
+                batch_size = max(
+                    batch_size, sorted_data[0][0][1] - sorted_data[0][0][0]
+                )
+
+            batch_size_ms_cum = 0
+            beg_idx = 0
+            beg_asr_total = time.time()
+            time_speech_total_per_sample = speech_lengths / 16000
+            time_speech_total_all_samples += time_speech_total_per_sample
+
+            all_segments = []
+            for j, _ in enumerate(range(0, n)):
+                # pbar_sample.update(1)
+                batch_size_ms_cum += sorted_data[j][0][1] - sorted_data[j][0][0]
+                if (
+                    j < n - 1
+                    and (
+                        batch_size_ms_cum
+                        + sorted_data[j + 1][0][1]
+                        - sorted_data[j + 1][0][0]
+                    )
+                    < batch_size
+                    and (sorted_data[j + 1][0][1] - sorted_data[j + 1][0][0])
+                    < batch_size_threshold_ms
+                ):
+                    continue
+                batch_size_ms_cum = 0
+                end_idx = j + 1
+                speech_j, speech_lengths_j = slice_padding_audio_samples(
+                    speech, speech_lengths, sorted_data[beg_idx:end_idx]
+                )
+                results = self.inference(
+                    speech_j,
+                    input_len=None,
+                    model=model,
+                    kwargs=kwargs,
+                    disable_pbar=True,
+                    **cfg,
+                )
+                if self.spk_model is not None:
+                    # compose vad segments: [[start_time_sec, end_time_sec, speech], [...]]
+                    for _b in range(len(speech_j)):
|
| 398 |
+
vad_segments = [
|
| 399 |
+
[
|
| 400 |
+
sorted_data[beg_idx:end_idx][_b][0][0] / 1000.0,
|
| 401 |
+
sorted_data[beg_idx:end_idx][_b][0][1] / 1000.0,
|
| 402 |
+
np.array(speech_j[_b]),
|
| 403 |
+
]
|
| 404 |
+
]
|
| 405 |
+
segments = sv_chunk(vad_segments)
|
| 406 |
+
all_segments.extend(segments)
|
| 407 |
+
speech_b = [i[2] for i in segments]
|
| 408 |
+
spk_res = self.inference(
|
| 409 |
+
speech_b,
|
| 410 |
+
input_len=None,
|
| 411 |
+
model=self.spk_model,
|
| 412 |
+
kwargs=kwargs,
|
| 413 |
+
disable_pbar=True,
|
| 414 |
+
**cfg,
|
| 415 |
+
)
|
| 416 |
+
results[_b]["spk_embedding"] = spk_res[0]["spk_embedding"]
|
| 417 |
+
beg_idx = end_idx
|
| 418 |
+
if len(results) < 1:
|
| 419 |
+
continue
|
| 420 |
+
results_sorted.extend(results)
|
| 421 |
+
|
| 422 |
+
restored_data = [0] * n
|
| 423 |
+
for j in range(n):
|
| 424 |
+
index = sorted_data[j][1]
|
| 425 |
+
restored_data[index] = results_sorted[j]
|
| 426 |
+
result = {}
|
| 427 |
+
|
| 428 |
+
# results combine for texts, timestamps, speaker embeddings and others
|
| 429 |
+
# TODO: rewrite for clean code
|
| 430 |
+
for j in range(n):
|
| 431 |
+
for k, v in restored_data[j].items():
|
| 432 |
+
if k.startswith("timestamp"):
|
| 433 |
+
if k not in result:
|
| 434 |
+
result[k] = []
|
| 435 |
+
for t in restored_data[j][k]:
|
| 436 |
+
t[0] += vadsegments[j][0]
|
| 437 |
+
t[1] += vadsegments[j][0]
|
| 438 |
+
result[k].extend(restored_data[j][k])
|
| 439 |
+
elif k == "spk_embedding":
|
| 440 |
+
if k not in result:
|
| 441 |
+
result[k] = restored_data[j][k]
|
| 442 |
+
else:
|
| 443 |
+
result[k] = torch.cat(
|
| 444 |
+
[result[k], restored_data[j][k]], dim=0
|
| 445 |
+
)
|
| 446 |
+
elif "text" in k:
|
| 447 |
+
if k not in result:
|
| 448 |
+
result[k] = restored_data[j][k]
|
| 449 |
+
else:
|
| 450 |
+
result[k] += " " + restored_data[j][k]
|
| 451 |
+
else:
|
| 452 |
+
if k not in result:
|
| 453 |
+
result[k] = restored_data[j][k]
|
| 454 |
+
else:
|
| 455 |
+
result[k] += restored_data[j][k]
|
| 456 |
+
|
| 457 |
+
return_raw_text = kwargs.get("return_raw_text", False)
|
| 458 |
+
# step.3 compute punc model
|
| 459 |
+
if self.punc_model is not None:
|
| 460 |
+
self.punc_kwargs.update(cfg)
|
| 461 |
+
punc_res = self.inference(
|
| 462 |
+
result["text"],
|
| 463 |
+
model=self.punc_model,
|
| 464 |
+
kwargs=self.punc_kwargs,
|
| 465 |
+
disable_pbar=True,
|
| 466 |
+
**cfg,
|
| 467 |
+
)
|
| 468 |
+
raw_text = copy.copy(result["text"])
|
| 469 |
+
if return_raw_text:
|
| 470 |
+
result["raw_text"] = raw_text
|
| 471 |
+
result["text"] = punc_res[0]["text"]
|
| 472 |
+
else:
|
| 473 |
+
raw_text = None
|
| 474 |
+
|
| 475 |
+
# speaker embedding cluster after resorted
|
| 476 |
+
if self.spk_model is not None and kwargs.get("return_spk_res", True):
|
| 477 |
+
if raw_text is None:
|
| 478 |
+
logging.error("Missing punc_model, which is required by spk_model.")
|
| 479 |
+
all_segments = sorted(all_segments, key=lambda x: x[0])
|
| 480 |
+
spk_embedding = result["spk_embedding"]
|
| 481 |
+
labels = self.cb_model(
|
| 482 |
+
spk_embedding.cpu(), oracle_num=kwargs.get("preset_spk_num", None)
|
| 483 |
+
)
|
| 484 |
+
# del result['spk_embedding']
|
| 485 |
+
sv_output = postprocess(all_segments, None, labels, spk_embedding.cpu())
|
| 486 |
+
if self.spk_mode == "vad_segment": # recover sentence_list
|
| 487 |
+
sentence_list = []
|
| 488 |
+
for res, vadsegment in zip(restored_data, vadsegments):
|
| 489 |
+
if "timestamp" not in res:
|
| 490 |
+
logging.error(
|
| 491 |
+
"Only 'iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch' \
|
| 492 |
+
and 'iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'\
|
| 493 |
+
can predict timestamp, and speaker diarization relies on timestamps."
|
| 494 |
+
)
|
| 495 |
+
sentence_list.append(
|
| 496 |
+
{
|
| 497 |
+
"start": vadsegment[0],
|
| 498 |
+
"end": vadsegment[1],
|
| 499 |
+
"sentence": res["text"],
|
| 500 |
+
"timestamp": res["timestamp"],
|
| 501 |
+
}
|
| 502 |
+
)
|
| 503 |
+
elif self.spk_mode == "punc_segment":
|
| 504 |
+
if "timestamp" not in result:
|
| 505 |
+
logging.error(
|
| 506 |
+
"Only 'iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch' \
|
| 507 |
+
and 'iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'\
|
| 508 |
+
can predict timestamp, and speaker diarization relies on timestamps."
|
| 509 |
+
)
|
| 510 |
+
sentence_list = timestamp_sentence(
|
| 511 |
+
punc_res[0]["punc_array"],
|
| 512 |
+
result["timestamp"],
|
| 513 |
+
raw_text,
|
| 514 |
+
return_raw_text=return_raw_text,
|
| 515 |
+
)
|
| 516 |
+
distribute_spk(sentence_list, sv_output)
|
| 517 |
+
result["sentence_info"] = sentence_list
|
| 518 |
+
elif kwargs.get("sentence_timestamp", False):
|
| 519 |
+
sentence_list = timestamp_sentence(
|
| 520 |
+
punc_res[0]["punc_array"],
|
| 521 |
+
result["timestamp"],
|
| 522 |
+
raw_text,
|
| 523 |
+
return_raw_text=return_raw_text,
|
| 524 |
+
)
|
| 525 |
+
result["sentence_info"] = sentence_list
|
| 526 |
+
if "spk_embedding" in result:
|
| 527 |
+
del result["spk_embedding"]
|
| 528 |
+
|
| 529 |
+
result["key"] = key
|
| 530 |
+
results_ret_list.append(result)
|
| 531 |
+
end_asr_total = time.time()
|
| 532 |
+
time_escape_total_per_sample = end_asr_total - beg_asr_total
|
| 533 |
+
pbar_total.update(1)
|
| 534 |
+
pbar_total.set_description(
|
| 535 |
+
f"rtf_avg: {time_escape_total_per_sample / time_speech_total_per_sample:0.3f}, "
|
| 536 |
+
f"time_speech: {time_speech_total_per_sample: 0.3f}, "
|
| 537 |
+
f"time_escape: {time_escape_total_per_sample:0.3f}"
|
| 538 |
+
)
|
| 539 |
+
|
| 540 |
+
return results_ret_list
|
| 541 |
+
|
| 542 |
+
def infer_encoder(
|
| 543 |
+
self, input, input_len=None, model=None, kwargs=None, key=None, **cfg
|
| 544 |
+
):
|
| 545 |
+
kwargs = self.kwargs if kwargs is None else kwargs
|
| 546 |
+
kwargs.update(cfg)
|
| 547 |
+
model = self.model if model is None else model
|
| 548 |
+
model = model.cuda()
|
| 549 |
+
model.eval()
|
| 550 |
+
|
| 551 |
+
batch_size = kwargs.get("batch_size", 1)
|
| 552 |
+
|
| 553 |
+
key_list, data_list = prepare_data_iterator(
|
| 554 |
+
input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key
|
| 555 |
+
)
|
| 556 |
+
|
| 557 |
+
asr_result_list = []
|
| 558 |
+
num_samples = len(data_list)
|
| 559 |
+
for beg_idx in range(0, num_samples, batch_size):
|
| 560 |
+
end_idx = min(num_samples, beg_idx + batch_size)
|
| 561 |
+
data_batch = data_list[beg_idx:end_idx]
|
| 562 |
+
key_batch = key_list[beg_idx:end_idx]
|
| 563 |
+
batch = {"data_in": data_batch, "key": key_batch}
|
| 564 |
+
if (end_idx - beg_idx) == 1 and kwargs.get(
|
| 565 |
+
"data_type", None
|
| 566 |
+
) == "fbank": # fbank
|
| 567 |
+
batch["data_in"] = data_batch[0]
|
| 568 |
+
batch["data_lengths"] = input_len
|
| 569 |
+
|
| 570 |
+
with torch.no_grad():
|
| 571 |
+
results, meta_data, cache = model.infer_encoder(**batch, **kwargs)
|
| 572 |
+
asr_result_list.extend(results)
|
| 573 |
+
|
| 574 |
+
torch.cuda.empty_cache()
|
| 575 |
+
return asr_result_list, cache
|
|
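For orientation, a minimal usage sketch of the AutoModel wrapper above, assuming its constructor accepts model identifiers the way upstream FunASR's AutoModel does; the model names and the wav path below are placeholders, not this Space's actual configuration:

# Minimal sketch, assuming an upstream-FunASR-style constructor.
from funasr_detach.auto.auto_model import AutoModel

model = AutoModel(
    model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
    vad_model="fsmn-vad",   # optional: routes generate() through inference_with_vad()
    punc_model="ct-punc",   # optional: step-3 punctuation restoration
)
res = model.generate(input="example.wav", batch_size_s=300)
print(res[0]["text"])

With a VAD model present, long audio is split into segments, the segments are decoded shortest-first in batches capped by batch_size_s (seconds) and batch_size_threshold_s, and the hypotheses are restored to the original segment order before punctuation and speaker post-processing.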
@@ -0,0 +1,7 @@
class AutoTokenizer:
    """
    Undo
    """

    def __init__(self):
        pass
File without changes
@@ -0,0 +1,152 @@
import os
import json
import numpy as np
import torch
import hydra
import logging
from omegaconf import DictConfig, OmegaConf

from funasr_detach.register import tables
from funasr_detach.download.download_from_hub import download_model
from funasr_detach.train_utils.set_all_random_seed import set_all_random_seed


@hydra.main(config_name=None, version_base=None)
def main_hydra(kwargs: DictConfig):
    if kwargs.get("debug", False):
        import pdb

        pdb.set_trace()

    assert "model" in kwargs
    if "model_conf" not in kwargs:
        logging.info(
            "download models from model hub: {}".format(kwargs.get("model_hub", "ms"))
        )
        kwargs = download_model(is_training=kwargs.get("is_training", True), **kwargs)

    main(**kwargs)


def main(**kwargs):
    print(kwargs)
    # set random seed
    tables.print()
    set_all_random_seed(kwargs.get("seed", 0))
    torch.backends.cudnn.enabled = kwargs.get(
        "cudnn_enabled", torch.backends.cudnn.enabled
    )
    torch.backends.cudnn.benchmark = kwargs.get(
        "cudnn_benchmark", torch.backends.cudnn.benchmark
    )
    torch.backends.cudnn.deterministic = kwargs.get("cudnn_deterministic", True)

    tokenizer = kwargs.get("tokenizer", None)

    # build frontend if frontend is not None
    frontend = kwargs.get("frontend", None)
    if frontend is not None:
        frontend_class = tables.frontend_classes.get(frontend)
        frontend = frontend_class(**kwargs["frontend_conf"])
        kwargs["frontend"] = frontend
        kwargs["input_size"] = frontend.output_size()

    # dataset
    dataset_class = tables.dataset_classes.get(kwargs.get("dataset", "AudioDataset"))
    dataset_train = dataset_class(
        kwargs.get("train_data_set_list"),
        frontend=frontend,
        tokenizer=None,
        is_training=False,
        **kwargs.get("dataset_conf")
    )

    # dataloader
    batch_sampler = kwargs["dataset_conf"].get(
        "batch_sampler", "DynamicBatchLocalShuffleSampler"
    )
    batch_sampler_train = None
    if batch_sampler is not None:
        batch_sampler_class = tables.batch_sampler_classes.get(batch_sampler)
        dataset_conf = kwargs.get("dataset_conf")
        dataset_conf["batch_type"] = "example"
        dataset_conf["batch_size"] = 1
        batch_sampler_train = batch_sampler_class(
            dataset_train, is_training=False, **dataset_conf
        )

    dataloader_train = torch.utils.data.DataLoader(
        dataset_train,
        collate_fn=dataset_train.collator,
        batch_sampler=batch_sampler_train,
        num_workers=int(kwargs.get("dataset_conf").get("num_workers", 4)),
        pin_memory=True,
    )

    iter_stop = int(kwargs.get("scale", 1.0) * len(dataloader_train))

    total_frames = 0
    for batch_idx, batch in enumerate(dataloader_train):
        if batch_idx >= iter_stop:
            break

        fbank = batch["speech"].numpy()[0, :, :]
        if total_frames == 0:
            mean_stats = np.sum(fbank, axis=0)
            var_stats = np.sum(np.square(fbank), axis=0)
        else:
            mean_stats += np.sum(fbank, axis=0)
            var_stats += np.sum(np.square(fbank), axis=0)
        total_frames += fbank.shape[0]

    cmvn_info = {
        "mean_stats": list(mean_stats.tolist()),
        "var_stats": list(var_stats.tolist()),
        "total_frames": total_frames,
    }
    cmvn_file = kwargs.get("cmvn_file", "cmvn.json")
    # import pdb;pdb.set_trace()
    with open(cmvn_file, "w") as fout:
        fout.write(json.dumps(cmvn_info))

    mean = -1.0 * mean_stats / total_frames
    var = 1.0 / np.sqrt(var_stats / total_frames - mean * mean)
    dims = mean.shape[0]
    am_mvn = os.path.dirname(cmvn_file) + "/am.mvn"
    with open(am_mvn, "w") as fout:
        fout.write(
            "<Nnet>"
            + "\n"
            + "<Splice> "
            + str(dims)
            + " "
            + str(dims)
            + "\n"
            + "[ 0 ]"
            + "\n"
            + "<AddShift> "
            + str(dims)
            + " "
            + str(dims)
            + "\n"
        )
        mean_str = (
            str(list(mean)).replace(",", "").replace("[", "[ ").replace("]", " ]")
        )
        fout.write("<LearnRateCoef> 0 " + mean_str + "\n")
        fout.write("<Rescale> " + str(dims) + " " + str(dims) + "\n")
        var_str = str(list(var)).replace(",", "").replace("[", "[ ").replace("]", " ]")
        fout.write("<LearnRateCoef> 0 " + var_str + "\n")
        fout.write("</Nnet>" + "\n")


"""
python funasr/bin/compute_audio_cmvn.py \
--config-path "/Users/zhifu/funasr1.0/examples/aishell/paraformer/conf" \
--config-name "train_asr_paraformer_conformer_12e_6d_2048_256.yaml" \
++train_data_set_list="/Users/zhifu/funasr1.0/data/list/audio_datasets.jsonl" \
++cmvn_file="/Users/zhifu/funasr1.0/data/list/cmvn.json" \
++dataset_conf.num_workers=0
"""
if __name__ == "__main__":
    main_hydra()
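To make the stats concrete, a sketch of how mean_stats/var_stats/total_frames from the cmvn.json written above become the shift and scale rows of am.mvn (file name follows the script's default):

# Minimal sketch: shift = -mean, scale = 1/std, mirroring the am.mvn math above.
import json

import numpy as np

with open("cmvn.json") as f:
    info = json.load(f)

mean_stats = np.asarray(info["mean_stats"])
var_stats = np.asarray(info["var_stats"])
n = info["total_frames"]

shift = -mean_stats / n                                        # "<AddShift>" row
scale = 1.0 / np.sqrt(var_stats / n - (mean_stats / n) ** 2)   # "<Rescale>" row

fbank = np.random.randn(100, mean_stats.shape[0])  # dummy [frames, dims] features
normalized = (fbank + shift) * scale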
@@ -0,0 +1,33 @@
import hydra
import logging
from omegaconf import DictConfig, OmegaConf, ListConfig

from funasr_detach.auto.auto_model import AutoModel


@hydra.main(config_name=None, version_base=None)
def main_hydra(cfg: DictConfig):
    def to_plain_list(cfg_item):
        if isinstance(cfg_item, ListConfig):
            return OmegaConf.to_container(cfg_item, resolve=True)
        elif isinstance(cfg_item, DictConfig):
            return {k: to_plain_list(v) for k, v in cfg_item.items()}
        else:
            return cfg_item

    kwargs = to_plain_list(cfg)
    log_level = getattr(logging, kwargs.get("log_level", "INFO").upper())

    logging.basicConfig(level=log_level)

    if kwargs.get("debug", False):
        import pdb

        pdb.set_trace()
    model = AutoModel(**kwargs)
    res = model.generate(input=kwargs["input"])
    print(res)


if __name__ == "__main__":
    main_hydra()
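An illustrative invocation, following the hydra ++key=value override style used elsewhere in this diff; the model name and wav path are placeholders:

python funasr_detach/bin/inference.py \
    ++model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
    ++input="example.wav" \
    ++log_level="INFO"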
@@ -0,0 +1,281 @@
#!/usr/bin/env python3
import argparse
from collections import Counter
import logging
from pathlib import Path
import sys
from typing import List
from typing import Optional


from funasr_detach.utils.cli_utils import get_commandline_args
from funasr_detach.tokenizer.build_tokenizer import build_tokenizer
from funasr_detach.tokenizer.cleaner import TextCleaner
from funasr_detach.tokenizer.phoneme_tokenizer import g2p_classes
from funasr_detach.utils.types import str2bool
from funasr_detach.utils.types import str_or_none


def field2slice(field: Optional[str]) -> slice:
    """Convert field string to slice

    Note that field string accepts 1-based integer.

    Examples:
        >>> field2slice("1-")
        slice(0, None, None)
        >>> field2slice("1-3")
        slice(0, 3, None)
        >>> field2slice("-3")
        slice(None, 3, None)
    """
    field = field.strip()
    try:
        if "-" in field:
            # e.g. "2-" or "2-5" or "-7"
            s1, s2 = field.split("-", maxsplit=1)
            if s1.strip() == "":
                s1 = None
            else:
                s1 = int(s1)
                if s1 == 0:
                    raise ValueError("1-based string")
            if s2.strip() == "":
                s2 = None
            else:
                s2 = int(s2)
        else:
            # e.g. "2"
            s1 = int(field)
            s2 = s1 + 1
            if s1 == 0:
                raise ValueError("must be 1 or more value")
    except ValueError:
        raise RuntimeError(f"Format error: e.g. '2-', '2-5', or '-5': {field}")

    if s1 is None:
        slic = slice(None, s2)
    else:
        # -1 because of 1-based integer following "cut" command
        # e.g "1-3" -> slice(0, 3)
        slic = slice(s1 - 1, s2)
    return slic


def tokenize(
    input: str,
    output: str,
    field: Optional[str],
    delimiter: Optional[str],
    token_type: str,
    space_symbol: str,
    non_linguistic_symbols: Optional[str],
    bpemodel: Optional[str],
    log_level: str,
    write_vocabulary: bool,
    vocabulary_size: int,
    remove_non_linguistic_symbols: bool,
    cutoff: int,
    add_symbol: List[str],
    cleaner: Optional[str],
    g2p: Optional[str],
):

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    if input == "-":
        fin = sys.stdin
    else:
        fin = Path(input).open("r", encoding="utf-8")
    if output == "-":
        fout = sys.stdout
    else:
        p = Path(output)
        p.parent.mkdir(parents=True, exist_ok=True)
        fout = p.open("w", encoding="utf-8")

    cleaner = TextCleaner(cleaner)
    tokenizer = build_tokenizer(
        token_type=token_type,
        bpemodel=bpemodel,
        delimiter=delimiter,
        space_symbol=space_symbol,
        non_linguistic_symbols=non_linguistic_symbols,
        remove_non_linguistic_symbols=remove_non_linguistic_symbols,
        g2p_type=g2p,
    )

    counter = Counter()
    if field is not None:
        field = field2slice(field)

    for line in fin:
        line = line.rstrip()
        if field is not None:
            # e.g. field="2-"
            # uttidA hello world!! -> hello world!!
            tokens = line.split(delimiter)
            tokens = tokens[field]
            if delimiter is None:
                line = " ".join(tokens)
            else:
                line = delimiter.join(tokens)

        line = cleaner(line)
        tokens = tokenizer.text2tokens(line)
        if not write_vocabulary:
            fout.write(" ".join(tokens) + "\n")
        else:
            for t in tokens:
                counter[t] += 1

    if not write_vocabulary:
        return

    ## FIXME
    ## del duplicate add_symbols in counter
    for symbol_and_id in add_symbol:
        # e.g symbol="<blank>:0"
        try:
            symbol, idx = symbol_and_id.split(":")
        except ValueError:
            raise RuntimeError(f"Format error: e.g. '<blank>:0': {symbol_and_id}")
        symbol = symbol.strip()
        if symbol in counter:
            del counter[symbol]

    # ======= write_vocabulary mode from here =======
    # Sort by the number of occurrences in descending order
    # and filter lower frequency words than cutoff value
    words_and_counts = list(
        filter(lambda x: x[1] > cutoff, sorted(counter.items(), key=lambda x: -x[1]))
    )
    # Restrict the vocabulary size
    if vocabulary_size > 0:
        if vocabulary_size < len(add_symbol):
            raise RuntimeError(f"vocabulary_size is too small: {vocabulary_size}")
        words_and_counts = words_and_counts[: vocabulary_size - len(add_symbol)]

    # Parse the values of --add_symbol
    for symbol_and_id in add_symbol:
        # e.g symbol="<blank>:0"
        try:
            symbol, idx = symbol_and_id.split(":")
            idx = int(idx)
        except ValueError:
            raise RuntimeError(f"Format error: e.g. '<blank>:0': {symbol_and_id}")
        symbol = symbol.strip()

        # e.g. idx=0 -> append as the first symbol
        # e.g. idx=-1 -> append as the last symbol
        if idx < 0:
            idx = len(words_and_counts) + 1 + idx
        words_and_counts.insert(idx, (symbol, None))

    # Write words
    for w, c in words_and_counts:
        fout.write(w + "\n")

    # Logging
    total_count = sum(counter.values())
    invocab_count = sum(c for w, c in words_and_counts if c is not None)
    logging.info(f"OOV rate = {(total_count - invocab_count) / total_count * 100} %")


def get_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description="Tokenize texts",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--log_level",
        type=lambda x: x.upper(),
        default="INFO",
        choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
        help="The verbose level of logging",
    )

    parser.add_argument(
        "--input", "-i", required=True, help="Input text. - indicates sys.stdin"
    )
    parser.add_argument(
        "--output", "-o", required=True, help="Output text. - indicates sys.stdout"
    )
    parser.add_argument(
        "--field",
        "-f",
        help="The target columns of the input text as 1-based integer. e.g 2-",
    )
    parser.add_argument(
        "--token_type",
        "-t",
        default="char",
        choices=["char", "bpe", "word", "phn"],
        help="Token type",
    )
    parser.add_argument("--delimiter", "-d", default=None, help="The delimiter")
    parser.add_argument("--space_symbol", default="<space>", help="The space symbol")
    parser.add_argument("--bpemodel", default=None, help="The bpemodel file path")
    parser.add_argument(
        "--non_linguistic_symbols",
        type=str_or_none,
        help="non_linguistic_symbols file path",
    )
    parser.add_argument(
        "--remove_non_linguistic_symbols",
        type=str2bool,
        default=False,
        help="Remove non-language-symbols from tokens",
    )
    parser.add_argument(
        "--cleaner",
        type=str_or_none,
        choices=[None, "tacotron", "jaconv", "vietnamese", "korean_cleaner"],
        default=None,
        help="Apply text cleaning",
    )
    parser.add_argument(
        "--g2p",
        type=str_or_none,
        choices=g2p_classes,
        default=None,
        help="Specify g2p method if --token_type=phn",
    )

    group = parser.add_argument_group("write_vocabulary mode related")
    group.add_argument(
        "--write_vocabulary",
        type=str2bool,
        default=False,
        help="Write tokens list instead of tokenized text per line",
    )
    group.add_argument("--vocabulary_size", type=int, default=0, help="Vocabulary size")
    group.add_argument(
        "--cutoff",
        default=0,
        type=int,
        help="cut-off frequency used for write-vocabulary mode",
    )
    group.add_argument(
        "--add_symbol",
        type=str,
        default=[],
        action="append",
        help="Append symbol e.g. --add_symbol '<blank>:0' --add_symbol '<unk>:1'",
    )

    return parser


def main(cmd=None):
    print(get_commandline_args(), file=sys.stderr)
    parser = get_parser()
    args = parser.parse_args(cmd)
    kwargs = vars(args)
    tokenize(**kwargs)


if __name__ == "__main__":
    main()
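Two illustrative invocations of the tokenizer CLI above (file names are placeholders): the first tokenizes the text field of a transcript, the second builds a vocabulary with reserved symbols:

python funasr_detach/bin/tokenize_text.py \
    --input text.trans --output tokens.txt --field 2- --token_type char

python funasr_detach/bin/tokenize_text.py \
    --input text.trans --output vocab.txt --field 2- --token_type char \
    --write_vocabulary true --add_symbol '<blank>:0' --add_symbol '<unk>:1'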
@@ -0,0 +1,227 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

import os
import sys
import torch
import hydra
import logging
import argparse
from io import BytesIO
import torch.distributed as dist
from collections.abc import Sequence
from omegaconf import DictConfig, OmegaConf
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

from funasr_detach.register import tables
from funasr_detach.optimizers import optim_classes
from funasr_detach.train_utils.trainer import Trainer
from funasr_detach.schedulers import scheduler_classes
from funasr_detach.train_utils.initialize import initialize
from funasr_detach.download.download_from_hub import download_model
from funasr_detach.models.lora.utils import mark_only_lora_as_trainable
from funasr_detach.train_utils.set_all_random_seed import set_all_random_seed
from funasr_detach.train_utils.load_pretrained_model import load_pretrained_model

# from funasr_detach.tokenizer.build_tokenizer import build_tokenizer
# from funasr_detach.tokenizer.token_id_converter import TokenIDConverter
# from funasr_detach.tokenizer.funtoken import build_tokenizer


@hydra.main(config_name=None, version_base=None)
def main_hydra(kwargs: DictConfig):
    if kwargs.get("debug", False):
        import pdb

        pdb.set_trace()

    assert "model" in kwargs
    if "model_conf" not in kwargs:
        logging.info(
            "download models from model hub: {}".format(kwargs.get("model_hub", "ms"))
        )
        kwargs = download_model(is_training=kwargs.get("is_training", True), **kwargs)

    main(**kwargs)


def main(**kwargs):
    print(kwargs)

    # set random seed
    set_all_random_seed(kwargs.get("seed", 0))
    torch.backends.cudnn.enabled = kwargs.get(
        "cudnn_enabled", torch.backends.cudnn.enabled
    )
    torch.backends.cudnn.benchmark = kwargs.get(
        "cudnn_benchmark", torch.backends.cudnn.benchmark
    )
    torch.backends.cudnn.deterministic = kwargs.get("cudnn_deterministic", True)

    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    if local_rank == 0:
        tables.print()
    # Check if we are using DDP or FSDP
    use_ddp = "WORLD_SIZE" in os.environ and int(os.environ["WORLD_SIZE"]) > 1
    use_fsdp = kwargs.get("use_fsdp", None)
    if use_ddp or use_fsdp:
        dist.init_process_group(
            backend=kwargs.get("backend", "nccl"), init_method="env://"
        )
        torch.cuda.set_device(local_rank)

    # save config.yaml
    if (
        (use_ddp or use_fsdp)
        and dist.get_rank() == 0
        or not (use_ddp or use_fsdp)
        and local_rank == 0
    ):
        os.makedirs(kwargs.get("output_dir", "./"), exist_ok=True)
        yaml_file = os.path.join(kwargs.get("output_dir", "./"), "config.yaml")
        OmegaConf.save(config=kwargs, f=yaml_file)
        logging.info("config.yaml is saved to: %s", yaml_file)

    tokenizer = kwargs.get("tokenizer", None)
    if tokenizer is not None:
        tokenizer_class = tables.tokenizer_classes.get(tokenizer)
        tokenizer = tokenizer_class(**kwargs["tokenizer_conf"])
        kwargs["tokenizer"] = tokenizer

    # build frontend if frontend is not None
    frontend = kwargs.get("frontend", None)
    if frontend is not None:
        frontend_class = tables.frontend_classes.get(frontend)
        frontend = frontend_class(**kwargs["frontend_conf"])
        kwargs["frontend"] = frontend
        kwargs["input_size"] = frontend.output_size()

    # build model
    model_class = tables.model_classes.get(kwargs["model"])
    model = model_class(
        **kwargs, **kwargs["model_conf"], vocab_size=len(tokenizer.token_list)
    )

    # init_param
    init_param = kwargs.get("init_param", None)
    if init_param is not None:
        if not isinstance(init_param, (list, tuple)):
            init_param = (init_param,)
        logging.info("init_param is not None: %s", init_param)
        for p in init_param:
            logging.info(f"Loading pretrained params from {p}")
            load_pretrained_model(
                model=model,
                path=p,
                ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True),
                oss_bucket=kwargs.get("oss_bucket", None),
                scope_map=kwargs.get("scope_map", None),
                excludes=kwargs.get("excludes", None),
            )
    else:
        initialize(model, kwargs.get("init", "kaiming_normal"))

    # freeze_param
    freeze_param = kwargs.get("freeze_param", None)
    if freeze_param is not None:
        freeze_param = eval(freeze_param)
        if not isinstance(freeze_param, (list, tuple)):
            freeze_param = (freeze_param,)
        logging.info("freeze_param is not None: %s", freeze_param)
        for t in freeze_param:
            for k, p in model.named_parameters():
                if k.startswith(t + ".") or k == t:
                    logging.info(f"Setting {k}.requires_grad = False")
                    p.requires_grad = False

    if use_ddp:
        model = model.cuda(local_rank)
        model = DDP(
            model,
            device_ids=[local_rank],
            find_unused_parameters=kwargs.get("train_conf", {}).get(
                "find_unused_parameters", False
            ),
        )
    elif use_fsdp:
        model = FSDP(model).cuda(local_rank)
    else:
        model = model.to(device=kwargs.get("device", "cuda"))

    # optim
    optim = kwargs.get("optim", "adam")
    assert optim in optim_classes
    optim_class = optim_classes.get(optim)
    optim = optim_class(model.parameters(), **kwargs.get("optim_conf"))

    # scheduler
    scheduler = kwargs.get("scheduler", "warmuplr")
    assert scheduler in scheduler_classes
    scheduler_class = scheduler_classes.get(scheduler)
    scheduler = scheduler_class(optim, **kwargs.get("scheduler_conf"))

    # dataset
    dataset_class = tables.dataset_classes.get(kwargs.get("dataset", "AudioDataset"))
    dataset_tr = dataset_class(
        kwargs.get("train_data_set_list"),
        frontend=frontend,
        tokenizer=tokenizer,
        is_training=True,
        **kwargs.get("dataset_conf"),
    )
    dataset_val = dataset_class(
        kwargs.get("valid_data_set_list"),
        frontend=frontend,
        tokenizer=tokenizer,
        is_training=False,
        **kwargs.get("dataset_conf"),
    )

    # dataloader
    batch_sampler = kwargs["dataset_conf"].get(
        "batch_sampler", "DynamicBatchLocalShuffleSampler"
    )
    batch_sampler_val = None
    if batch_sampler is not None:
        batch_sampler_class = tables.batch_sampler_classes.get(batch_sampler)
        batch_sampler = batch_sampler_class(dataset_tr, **kwargs.get("dataset_conf"))
        batch_sampler_val = batch_sampler_class(
            dataset_val, is_training=False, **kwargs.get("dataset_conf")
        )
    dataloader_tr = torch.utils.data.DataLoader(
        dataset_tr,
        collate_fn=dataset_tr.collator,
        batch_sampler=batch_sampler,
        num_workers=kwargs.get("dataset_conf").get("num_workers", 4),
        pin_memory=True,
    )

    dataloader_val = torch.utils.data.DataLoader(
        dataset_val,
        collate_fn=dataset_val.collator,
        batch_sampler=batch_sampler_val,
        num_workers=kwargs.get("dataset_conf").get("num_workers", 4),
        pin_memory=True,
    )
    trainer = Trainer(
        model=model,
        optim=optim,
        scheduler=scheduler,
        dataloader_train=dataloader_tr,
        dataloader_val=dataloader_val,
        local_rank=local_rank,
        use_ddp=use_ddp,
        use_fsdp=use_fsdp,
        output_dir=kwargs.get("output_dir", "./exp"),
        resume=kwargs.get("resume", True),
        **kwargs.get("train_conf"),
    )
    trainer.run()

    if use_ddp or use_fsdp:
        torch.distributed.destroy_process_group()


if __name__ == "__main__":
    main_hydra()
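An illustrative multi-GPU launch of the training entry point above; torchrun sets the WORLD_SIZE/LOCAL_RANK variables the script reads, and the config and data paths are placeholders:

torchrun --nproc_per_node=2 funasr_detach/bin/train.py \
    --config-path conf --config-name train_asr.yaml \
    ++train_data_set_list="data/train.jsonl" \
    ++valid_data_set_list="data/val.jsonl" \
    ++output_dir="./exp"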
File without changes
File without changes
@@ -0,0 +1,112 @@
import torch

from funasr_detach.register import tables
from funasr_detach.utils.load_utils import extract_fbank, load_audio_text_image_video


@tables.register("dataset_classes", "AudioDataset")
class AudioDataset(torch.utils.data.Dataset):
    """
    AudioDataset
    """

    def __init__(
        self,
        path,
        index_ds: str = None,
        frontend=None,
        tokenizer=None,
        int_pad_value: int = -1,
        float_pad_value: float = 0.0,
        **kwargs
    ):
        super().__init__()
        index_ds_class = tables.index_ds_classes.get(index_ds)
        self.index_ds = index_ds_class(path, **kwargs)
        preprocessor_speech = kwargs.get("preprocessor_speech", None)
        if preprocessor_speech:
            preprocessor_speech_class = tables.preprocessor_classes.get(
                preprocessor_speech
            )
            preprocessor_speech = preprocessor_speech_class(
                **kwargs.get("preprocessor_speech_conf")
            )
        self.preprocessor_speech = preprocessor_speech
        preprocessor_text = kwargs.get("preprocessor_text", None)
        if preprocessor_text:
            preprocessor_text_class = tables.preprocessor_classes.get(preprocessor_text)
            preprocessor_text = preprocessor_text_class(
                **kwargs.get("preprocessor_text_conf")
            )
        self.preprocessor_text = preprocessor_text

        self.frontend = frontend
        self.fs = 16000 if frontend is None else frontend.fs
        self.data_type = "sound"
        self.tokenizer = tokenizer

        self.int_pad_value = int_pad_value
        self.float_pad_value = float_pad_value

    def get_source_len(self, index):
        item = self.index_ds[index]
        return self.index_ds.get_source_len(item)

    def get_target_len(self, index):
        item = self.index_ds[index]
        return self.index_ds.get_target_len(item)

    def __len__(self):
        return len(self.index_ds)

    def __getitem__(self, index):
        item = self.index_ds[index]
        # import pdb;
        # pdb.set_trace()
        source = item["source"]
        data_src = load_audio_text_image_video(source, fs=self.fs)
        if self.preprocessor_speech:
            data_src = self.preprocessor_speech(data_src, fs=self.fs)
        speech, speech_lengths = extract_fbank(
            data_src, data_type=self.data_type, frontend=self.frontend, is_final=True
        )  # speech: [b, T, d]

        target = item["target"]
        if self.preprocessor_text:
            target = self.preprocessor_text(target)
        if self.tokenizer:
            ids = self.tokenizer.encode(target)
            text = torch.tensor(ids, dtype=torch.int64)
        else:
            ids = target
            text = ids
        ids_lengths = len(ids)
        text_lengths = torch.tensor([ids_lengths], dtype=torch.int32)

        return {
            "speech": speech[0, :, :],
            "speech_lengths": speech_lengths,
            "text": text,
            "text_lengths": text_lengths,
        }

    def collator(self, samples: list = None):
        outputs = {}
        for sample in samples:
            for key in sample.keys():
                if key not in outputs:
                    outputs[key] = []
                outputs[key].append(sample[key])

        for key, data_list in outputs.items():
            if isinstance(data_list[0], torch.Tensor):
                if data_list[0].dtype == torch.int64:
                    pad_value = self.int_pad_value
                else:
                    pad_value = self.float_pad_value

                outputs[key] = torch.nn.utils.rnn.pad_sequence(
                    data_list, batch_first=True, padding_value=pad_value
                )
        return outputs
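A toy illustration of the collator's padding rule above: int64 tensors are padded with int_pad_value (-1 by default), float tensors with float_pad_value (0.0):

# Minimal sketch of the padding behavior on a two-sample batch.
import torch

texts = [
    torch.tensor([5, 6, 7], dtype=torch.int64),
    torch.tensor([8], dtype=torch.int64),
]
padded = torch.nn.utils.rnn.pad_sequence(texts, batch_first=True, padding_value=-1)
# padded -> tensor([[ 5,  6,  7],
#                   [ 8, -1, -1]])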
@@ -0,0 +1,150 @@
import os
import json
import torch
import logging
import concurrent.futures
import librosa
import torch.distributed as dist

from funasr_detach.register import tables


@tables.register("index_ds_classes", "IndexDSJsonlRankSplit")
class IndexDSJsonlRankSplit(torch.utils.data.Dataset):

    def __init__(self, path):
        super().__init__()

        contents = []
        with open(path, encoding="utf-8") as fin:
            for line in fin:
                data = json.loads(line.strip())
                if "text" in data:  # for sft
                    contents.append(data["text"])
                if "source" in data:  # for speech lab pretrain
                    prompt = data["prompt"]
                    source = data["source"]
                    target = data["target"]
                    source_len = data["source_len"]
                    target_len = data["target_len"]

                    contents.append(
                        {
                            "source": source,
                            "prompt": prompt,
                            "target": target,
                            "source_len": source_len,
                            "target_len": target_len,
                        }
                    )

        self.contents = []
        total_num = len(contents)
        try:
            rank = dist.get_rank()
            world_size = dist.get_world_size()
        except:
            rank = 0
            world_size = 1
            logging.warning("distributed is not initialized, only single shard")
        num_per_rank = total_num // world_size

        # rank = 0
        # import ipdb; ipdb.set_trace()
        self.contents = contents[rank * num_per_rank : (rank + 1) * num_per_rank]

        logging.info(
            "in rank: {}, num of samplers: {}, total_num of samplers across ranks: {}".format(
                rank, len(self.contents), len(contents)
            )
        )

    def __len__(self):
        return len(self.contents)

    def __getitem__(self, index):
        try:
            data = self.contents[index]
        except:
            print(index)
            raise
        return data

    def get_source_len(self, data_dict):
        return data_dict["source_len"]

    def get_target_len(self, data_dict):

        return data_dict["target_len"] if "target_len" in data_dict else 0


@tables.register("index_ds_classes", "IndexDSJsonl")
@tables.register("index_ds_classes", "IndexDSJsonlRankFull")
class IndexDSJsonlRankFull(torch.utils.data.Dataset):

    def __init__(self, path: str, **kwargs):
        super().__init__()

        if isinstance(path, (list, tuple)):  # wav.scp, text.txt/text.trans
            from funasr_detach.datasets.audio_datasets.scp2jsonl import (
                gen_jsonl_from_wav_text_list,
            )

            jsonl_outdir = os.path.dirname(path[0])
            jsonl_name = (
                "datalist_train.jsonl"
                if kwargs.get("is_training", True)
                else "datalist_val.jsonl"
            )
            jsonl_file_out = os.path.join(jsonl_outdir, jsonl_name)
            if not os.path.exists(jsonl_file_out):
                print(f"datalist is: {path}, generate jsonl from it")
                gen_jsonl_from_wav_text_list(
                    path, jsonl_file_out=jsonl_file_out, **kwargs
                )
            path = jsonl_file_out

        contents = []
        with open(path, encoding="utf-8") as fin:
            for line in fin:
                data = json.loads(line.strip())
                if "text" in data:  # for sft
                    contents.append(data["text"])
                if "source" in data:  # for speech lab pretrain
                    prompt = data.get("prompt", "<ASR>")
                    source = data["source"]
                    target = data["target"]
                    source_len = data.get("source_len", 1)
                    target_len = data.get("target_len", 0)

                    contents.append(
                        {
                            "source": source,
                            "prompt": prompt,
                            "target": target,
                            "source_len": source_len,
                            "target_len": target_len,
                        }
                    )

        self.contents = contents

        logging.info(
            "total_num of samplers across ranks: {}".format(len(self.contents))
        )

    def __len__(self):
        return len(self.contents)

    def __getitem__(self, index):
        try:
            data = self.contents[index]
        except:
            print(index)
            raise
        return data

    def get_source_len(self, data_dict):
        return data_dict.get("source_len", 1)

    def get_target_len(self, data_dict):

        return data_dict.get("target_len", 0)
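For reference, one jsonl record in the shape these index datasets parse (field values are illustrative); IndexDSJsonlRankFull falls back to prompt="<ASR>", source_len=1, and target_len=0 when those fields are missing:

{"source": "wavs/utt1.wav", "source_len": 320, "target": "hello world", "target_len": 2, "prompt": "<ASR>"}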
@@ -0,0 +1,55 @@
import os
import json
import torch
import logging
import concurrent.futures
import librosa
import torch.distributed as dist
from typing import Collection
import torchaudio
from torch import nn
import random
import re
from funasr_detach.tokenizer.cleaner import TextCleaner
from funasr_detach.register import tables


@tables.register("preprocessor_classes", "SpeechPreprocessSpeedPerturb")
class SpeechPreprocessSpeedPerturb(nn.Module):
    def __init__(self, speed_perturb: list = None, **kwargs):
        super().__init__()
        self.speed_perturb = speed_perturb

    def forward(self, waveform, fs, **kwargs):
        if self.speed_perturb is None:
            return waveform
        speed = random.choice(self.speed_perturb)
        if speed != 1.0:
            if not isinstance(waveform, torch.Tensor):
                waveform = torch.tensor(waveform)
            waveform, _ = torchaudio.sox_effects.apply_effects_tensor(
                waveform.view(1, -1), fs, [["speed", str(speed)], ["rate", str(fs)]]
            )
            waveform = waveform.view(-1)

        return waveform


@tables.register("preprocessor_classes", "TextPreprocessSegDict")
class TextPreprocessSegDict(nn.Module):
    def __init__(
        self,
        seg_dict: str = None,
        text_cleaner: Collection[str] = None,
        split_with_space: bool = False,
        **kwargs
    ):
        super().__init__()

        self.text_cleaner = TextCleaner(text_cleaner)

    def forward(self, text, **kwargs):
        text = self.text_cleaner(text)

        return text
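A sketch of applying the speed-perturb preprocessor above to a raw waveform; the perturbation factors are illustrative, the import path is assumed from this diff's layout, and torchaudio's sox effects must be available at runtime:

# Minimal usage sketch under the assumptions stated above.
import torch

from funasr_detach.datasets.audio_datasets.preprocessor import (
    SpeechPreprocessSpeedPerturb,
)

perturb = SpeechPreprocessSpeedPerturb(speed_perturb=[0.9, 1.0, 1.1])
waveform = torch.randn(16000)        # one second of audio at 16 kHz
out = perturb(waveform, fs=16000)    # resampled back to fs; length may change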
@@ -0,0 +1,306 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import torch
import numpy as np
import logging
import torch.distributed as dist

from funasr_detach.register import tables


@tables.register("batch_sampler_classes", "DynamicBatchLocalShuffleSampler")
class BatchSampler(torch.utils.data.BatchSampler):

    def __init__(
        self,
        dataset,
        batch_type: str = "example",
        batch_size: int = 100,
        buffer_size: int = 30,
        drop_last: bool = False,
        shuffle: bool = True,
        is_training: bool = True,
        **kwargs
    ):

        self.drop_last = drop_last
        self.pre_idx = -1
        self.dataset = dataset
        self.total_samples = len(dataset)
        self.batch_type = batch_type
        self.batch_size = int(batch_size)
        self.buffer_size = buffer_size
        self.max_token_length = kwargs.get("max_token_length", 5000)
        self.shuffle_idx = np.arange(self.total_samples)
        self.shuffle = shuffle and is_training
        self.length_scale_source = kwargs.get("length_scale_source", 1.0)

    def __len__(self):
        return (self.total_samples - 1) // self.batch_size + 1

    def set_epoch(self, epoch):
        np.random.seed(epoch)

    def __iter__(self):

        if self.shuffle:
            np.random.shuffle(self.shuffle_idx)

        batch = []
        max_token = 0
        num_sample = 0

        iter_num = (self.total_samples - 1) // self.buffer_size + 1
        # print("iter_num: ", iter_num)
        for iter in range(self.pre_idx + 1, iter_num):
            datalen_with_index = []
            for i in range(self.buffer_size):
                idx = iter * self.buffer_size + i
                if idx >= self.total_samples:
                    continue

                idx_map = self.shuffle_idx[idx]
                # prompt = self.dataset.indexed_dataset[idx_map]["prompt"]
                target_len = (
                    self.dataset.get_target_len(idx_map)
                    if self.batch_type == "length"
                    else 0.0
                )
                source_len = (
                    self.dataset.get_source_len(idx_map) / self.length_scale_source
                )
                sample_len_cur = source_len + target_len

                datalen_with_index.append([idx, sample_len_cur])

            datalen_with_index_sort = sorted(datalen_with_index, key=lambda x: x[1])
            for item in datalen_with_index_sort:
                idx, sample_len_cur_raw = item
                if sample_len_cur_raw > self.max_token_length:
                    continue

                max_token_cur = max(max_token, sample_len_cur_raw)
                max_token_padding = 1 + num_sample
                if self.batch_type != "example":
                    max_token_padding *= max_token_cur
                if max_token_padding <= self.batch_size:
                    batch.append(idx)
                    max_token = max_token_cur
                    num_sample += 1
                else:
                    yield batch
                    batch = [idx]
                    max_token = sample_len_cur_raw
                    num_sample = 1
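A worked example of the budget check above in "length" mode (numbers are illustrative): the candidate cost is (1 + num_sample) * max_token_cur, i.e. samples-so-far-plus-one times the longest length seen, which models the padded batch size.

# batch_size = 400 tokens, sorted buffer lengths = [90, 100, 120, 150]
#   take 90  -> padded cost (1 + 0) * 90  = 90   <= 400
#   take 100 -> padded cost (1 + 1) * 100 = 200  <= 400
#   take 120 -> padded cost (1 + 2) * 120 = 360  <= 400
#   take 150 -> padded cost (1 + 3) * 150 = 600  >  400
#            -> yield the [90, 100, 120] batch and start a new one from 150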
@tables.register("batch_sampler_classes", "BatchSampler")
@tables.register("batch_sampler_classes", "RankFullLocalShuffleBatchSampler")
class RankFullLocalShuffleBatchSampler(torch.utils.data.BatchSampler):

    def __init__(
        self,
        dataset,
        batch_type: str = "example",
        batch_size: int = 100,
        buffer_size: int = 30,
        drop_last: bool = True,
        shuffle: bool = True,
        is_training: bool = True,
        **kwargs
    ):

        self.drop_last = drop_last
        self.pre_idx = -1
        self.dataset = dataset
        self.total_samples = len(dataset)
        self.batch_type = batch_type
        self.batch_size = int(batch_size)
        self.buffer_size = buffer_size
        self.max_token_length = kwargs.get("max_token_length", 1500)
        self.shuffle_idx = np.arange(self.total_samples)
        self.shuffle = shuffle and is_training
        self.length_scale_source = kwargs.get("length_scale_source", 1.0)

        try:
            rank = dist.get_rank()
            world_size = dist.get_world_size()
        except:
            rank = 0
            world_size = 1
        self.rank = rank
        self.world_size = world_size

    def __len__(self):
        return (self.total_samples - 1) // (self.batch_size * self.world_size) + 1

    def set_epoch(self, epoch):
        np.random.seed(epoch)

    def __iter__(self):

        batch_size_total = self.batch_size * self.world_size

        if self.shuffle:
            np.random.shuffle(self.shuffle_idx)

        batch = []
        max_token = 0
        num_sample = 0

        iter_num = (self.total_samples - 1) // self.buffer_size + 1
        # print("iter_num: ", iter_num)
        for iter in range(self.pre_idx + 1, iter_num):
            # if iter == iter_num - 1 and self.drop_last:
            #     continue
            datalen_with_index = []
            for i in range(self.buffer_size):
                idx = iter * self.buffer_size + i
                if idx >= self.total_samples:
                    continue

                idx_map = self.shuffle_idx[idx]
                # prompt = self.dataset.indexed_dataset[idx_map]["prompt"]

                source_len = (
                    self.dataset.get_source_len(idx_map) / self.length_scale_source
                )
                target_len = (
                    self.dataset.get_target_len(idx_map)
                    if self.batch_type == "length"
                    else 0.0
                )
                sample_len_cur = source_len + target_len

                datalen_with_index.append([idx, sample_len_cur])

            datalen_with_index_sort = sorted(datalen_with_index, key=lambda x: x[1])
            for item in datalen_with_index_sort:
                idx, sample_len_cur_raw = item
                if sample_len_cur_raw > self.max_token_length:
                    continue

                max_token_cur = max(max_token, sample_len_cur_raw)
                max_token_padding = 1 + num_sample
                # if self.batch_type != 'example':
                #     max_token_padding *= max_token_cur
                if max_token_padding <= batch_size_total:
                    batch.append(idx)
                    max_token = max_token_cur
                    num_sample += 1
                else:
                    batch_rank = batch[
                        self.rank * self.batch_size : (self.rank + 1) * self.batch_size
                    ]
                    yield batch_rank
                    batch = [idx]
                    max_token = sample_len_cur_raw
                    num_sample = 1
@tables.register("batch_sampler_classes", "RankFullLocalShuffleDynamicBatchSampler")
class RankFullLocalShuffleDynamicBatchSampler(torch.utils.data.BatchSampler):

    def __init__(
        self,
        dataset,
        batch_type: str = "example",
        batch_size: int = 100,
        buffer_size: int = 30,
        drop_last: bool = True,
        shuffle: bool = True,
        is_training: bool = True,
        **kwargs
    ):

        self.drop_last = drop_last
        self.pre_idx = -1
        self.dataset = dataset
        self.total_samples = len(dataset)
        self.batch_type = batch_type
        self.batch_size = int(batch_size)
        self.buffer_size = buffer_size
        self.max_token_length = kwargs.get("max_token_length", 1500)
        self.shuffle_idx = np.arange(self.total_samples)
        self.shuffle = shuffle and is_training
        self.length_scale_source = kwargs.get("length_scale_source", 1.0)

        try:
            rank = dist.get_rank()
            world_size = dist.get_world_size()
        except:
            rank = 0
            world_size = 1
        self.rank = rank
        self.world_size = world_size

    def __len__(self):
        return (self.total_samples - 1) // (self.batch_size * self.world_size) + 1

    def set_epoch(self, epoch):
        np.random.seed(epoch)

    def __iter__(self):

        batch_size_total = self.batch_size * self.world_size
        if self.shuffle:
            np.random.shuffle(self.shuffle_idx)

        batch_list_all_rank = []
        batch_list_cur = []
        max_token = 0
        num_sample = 0

        iter_num = (self.total_samples - 1) // self.buffer_size + 1
        # print("iter_num: ", iter_num)
        for iter in range(self.pre_idx + 1, iter_num):
            # if iter == iter_num - 1 and self.drop_last:
            #     continue
            datalen_with_index = []
            for i in range(self.buffer_size):
                idx = iter * self.buffer_size + i
                if idx >= self.total_samples:
                    continue

                idx_map = self.shuffle_idx[idx]
                # prompt = self.dataset.indexed_dataset[idx_map]["prompt"]

                source_len = (
                    self.dataset.get_source_len(idx_map) / self.length_scale_source
                )
                target_len = (
                    self.dataset.get_target_len(idx_map)
                    if self.batch_type == "length"
                    else 0.0
                )
                sample_len_cur = source_len + target_len

                datalen_with_index.append([idx, sample_len_cur])

            datalen_with_index_sort = sorted(datalen_with_index, key=lambda x: x[1])
            for ii, item in enumerate(datalen_with_index_sort):
                # flags the final item of the final buffer (currently unused)
                is_last_batch = iter == iter_num - 1 and ii == (
                    len(datalen_with_index_sort) - 1
                )
                idx, sample_len_cur_raw = item
                if sample_len_cur_raw > self.max_token_length:
                    continue

                max_token_cur = max(max_token, sample_len_cur_raw)
                max_token_padding = 1 + num_sample

                if self.batch_type != "example":
                    max_token_padding *= max_token_cur
                if len(batch_list_all_rank) < self.world_size:

                    if max_token_padding <= self.batch_size:
                        batch_list_cur.append(idx)
                        max_token = max_token_cur
                        num_sample += 1
                    else:
                        batch_list_all_rank.append(batch_list_cur)
                        batch_list_cur = []
                else:
                    batch_rank = batch_list_all_rank[self.rank]
                    yield batch_rank
                    # reset the per-rank batches and start a new per-rank batch
                    batch_list_all_rank = []
                    batch_list_cur = [idx]
                    max_token = sample_len_cur_raw
                    num_sample = 1
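A minimal wiring sketch (assumed API): `dataset`, `my_collate`, and `num_epochs` are hypothetical; the dataset only needs __len__, get_source_len, and get_target_len, which the samplers above call.

import torch

sampler = BatchSampler(
    dataset,
    batch_type="length",   # budget batches by padded source+target length
    batch_size=6000,       # max padded tokens per batch in "length" mode
    buffer_size=30,        # local shuffle window, sorted by length
    shuffle=True,
    is_training=True,
)
loader = torch.utils.data.DataLoader(
    dataset, batch_sampler=sampler, collate_fn=my_collate
)
for epoch in range(num_epochs):
    sampler.set_epoch(epoch)  # reseed the per-epoch shuffle
    for batch in loader:
        ...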
@@ -0,0 +1,116 @@
import os
import json
import torch
import logging
import hydra
from omegaconf import DictConfig, OmegaConf
import concurrent.futures
import librosa
import torch.distributed as dist


def gen_jsonl_from_wav_text_list(
    path, data_type_list=("source", "target"), jsonl_file_out: str = None, **kwargs
):
    try:
        rank = dist.get_rank()
        world_size = dist.get_world_size()
    except:
        rank = 0
        world_size = 1

    cpu_cores = os.cpu_count() or 1
    print(f"convert wav.scp text to jsonl, ncpu: {cpu_cores}")
    if rank == 0:
        json_dict = {}
        for data_type, data_file in zip(data_type_list, path):
            json_dict[data_type] = {}
            with open(data_file, "r") as f:

                data_file_lists = f.readlines()
                task_num = cpu_cores if len(data_file_lists) > cpu_cores else 1
                # chunk size must be derived from task_num, not cpu_cores,
                # so that a short file is still fully covered by one task
                lines_for_each_th = (len(data_file_lists) - 1) // task_num + 1
                with concurrent.futures.ThreadPoolExecutor(
                    max_workers=cpu_cores
                ) as executor:

                    futures = [
                        executor.submit(
                            parse_context_length,
                            data_file_lists[
                                i * lines_for_each_th : (i + 1) * lines_for_each_th
                            ],
                            data_type,
                        )
                        for i in range(task_num)
                    ]

                    for future in concurrent.futures.as_completed(futures):

                        json_dict[data_type].update(future.result())
        # print(json_dict)

        with open(jsonl_file_out, "w") as f:
            for key in json_dict[data_type_list[0]].keys():
                jsonl_line = {"key": key}
                for data_file in data_type_list:
                    jsonl_line.update(json_dict[data_file][key])
                jsonl_line = json.dumps(jsonl_line, ensure_ascii=False)
                f.write(jsonl_line + "\n")
                f.flush()

    else:
        pass

    if world_size > 1:
        dist.barrier()


def parse_context_length(data_list: list, data_type: str):

    res = {}
    for i, line in enumerate(data_list):
        key, line = line.strip().split(maxsplit=1)
        line = line.strip()
        if os.path.exists(line):
            waveform, _ = librosa.load(line, sr=16000)
            sample_num = len(waveform)
            context_len = int(sample_num // 16000 * 1000 / 10)
        else:
            context_len = len(line.split()) if " " in line else len(line)
        res[key] = {data_type: line, f"{data_type}_len": context_len}
    return res


@hydra.main(config_name=None, version_base=None)
def main_hydra(cfg: DictConfig):

    kwargs = OmegaConf.to_container(cfg, resolve=True)

    scp_file_list = kwargs.get(
        "scp_file_list",
        (
            "/Users/zhifu/funasr1.0/test_local/wav.scp",
            "/Users/zhifu/funasr1.0/test_local/text.txt",
        ),
    )
    if isinstance(scp_file_list, str):
        scp_file_list = eval(scp_file_list)
    data_type_list = kwargs.get("data_type_list", ("source", "target"))
    jsonl_file_out = kwargs.get(
        "jsonl_file_out", "/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl"
    )
    gen_jsonl_from_wav_text_list(
        scp_file_list, data_type_list=data_type_list, jsonl_file_out=jsonl_file_out
    )


"""
python -m funasr_detach.datasets.audio_datasets.scp2jsonl \
++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl
"""

if __name__ == "__main__":
    main_hydra()
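For orientation, the assumed input and output line formats (illustrative key and path; the lengths follow parse_context_length above):

# wav.scp : "utt1 /path/to/utt1.wav"
# text.txt: "utt1 hello world"
# resulting jsonl line for a one-second wav (source_len is in 10 ms frames,
# target_len in whitespace-separated tokens):
# {"key": "utt1", "source": "/path/to/utt1.wav", "source_len": 100,
#  "target": "hello world", "target_len": 2}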
File without changes
@@ -0,0 +1,19 @@
def download_dataset():
    pass


def download_dataset_from_ms(**kwargs):
    from modelscope.msdatasets import MsDataset

    dataset_name = kwargs.get(
        "dataset_name", "speech_asr/speech_asr_aishell1_trainsets"
    )
    subset_name = kwargs.get("subset_name", "default")
    split = kwargs.get("split", "train")
    data_dump_dir = kwargs.get("data_dump_dir", None)
    ds = MsDataset.load(
        dataset_name=dataset_name,
        subset_name=subset_name,
        split=split,
        cache_dir=data_dump_dir,
    )
    return ds
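A minimal invocation sketch (the cache directory is an illustrative path; the dataset name is the default shown above):

ds = download_dataset_from_ms(
    dataset_name="speech_asr/speech_asr_aishell1_trainsets",
    subset_name="default",
    split="train",
    data_dump_dir="./data_cache",  # hypothetical cache directory
)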
@@ -0,0 +1,231 @@
import os
import json
import threading
from omegaconf import OmegaConf

from funasr_detach.download.name_maps_from_hub import name_maps_ms, name_maps_hf

# Global cache for downloaded models to avoid repeated downloads
# Key: (repo_id, model_revision, model_hub)
# Value: repo_cache_dir
_model_cache = {}
_cache_lock = threading.Lock()


def download_model(**kwargs):
    model_hub = kwargs.get("model_hub", "ms")
    model_or_path = kwargs.get("model")
    repo_path = kwargs.get("repo_path", "")

    # Handle name mapping based on model_hub
    if model_hub == "ms" and model_or_path in name_maps_ms:
        model_or_path = name_maps_ms[model_or_path]
    elif model_hub == "hf" and model_or_path in name_maps_hf:
        model_or_path = name_maps_hf[model_or_path]

    model_revision = kwargs.get("model_revision")

    # Download model if it doesn't exist locally
    if not os.path.exists(model_or_path):
        if model_hub == "local":
            # For local models, the path should already exist
            raise FileNotFoundError(f"Local model path does not exist: {model_or_path}")
        elif model_hub in ["ms", "hf"]:
            repo_path, model_or_path = get_or_download_model_dir(
                model_or_path,
                model_revision,
                is_training=kwargs.get("is_training"),
                check_latest=kwargs.get("check_latest", True),
                model_hub=model_hub,
            )
        else:
            raise ValueError(f"Unsupported model_hub: {model_hub}")

    print(f"Using model path: {model_or_path}")
    kwargs["model_path"] = model_or_path
    kwargs["repo_path"] = repo_path

    # Common logic for processing configuration files (same for all model hubs)
    if os.path.exists(os.path.join(model_or_path, "configuration.json")):
        with open(
            os.path.join(model_or_path, "configuration.json"), "r", encoding="utf-8"
        ) as f:
            conf_json = json.load(f)
            cfg = {}
            add_file_root_path(model_or_path, conf_json["file_path_metas"], cfg)
            cfg.update(kwargs)
            config = OmegaConf.load(cfg["config"])
            kwargs = OmegaConf.merge(config, cfg)
            kwargs["model"] = config["model"]
    elif os.path.exists(os.path.join(model_or_path, "config.yaml")) and os.path.exists(
        os.path.join(model_or_path, "model.pt")
    ):
        config = OmegaConf.load(os.path.join(model_or_path, "config.yaml"))
        kwargs = OmegaConf.merge(config, kwargs)
        init_param = os.path.join(model_or_path, "model.pt")
        kwargs["init_param"] = init_param
        if os.path.exists(os.path.join(model_or_path, "tokens.txt")):
            kwargs["tokenizer_conf"]["token_list"] = os.path.join(
                model_or_path, "tokens.txt"
            )
        if os.path.exists(os.path.join(model_or_path, "tokens.json")):
            kwargs["tokenizer_conf"]["token_list"] = os.path.join(
                model_or_path, "tokens.json"
            )
        if os.path.exists(os.path.join(model_or_path, "seg_dict")):
            kwargs["tokenizer_conf"]["seg_dict"] = os.path.join(
                model_or_path, "seg_dict"
            )
        if os.path.exists(os.path.join(model_or_path, "bpe.model")):
            kwargs["tokenizer_conf"]["bpemodel"] = os.path.join(
                model_or_path, "bpe.model"
            )
        kwargs["model"] = config["model"]
        if os.path.exists(os.path.join(model_or_path, "am.mvn")):
            kwargs["frontend_conf"]["cmvn_file"] = os.path.join(model_or_path, "am.mvn")
        if os.path.exists(os.path.join(model_or_path, "jieba_usr_dict")):
            kwargs["jieba_usr_dict"] = os.path.join(model_or_path, "jieba_usr_dict")

    return OmegaConf.to_container(kwargs, resolve=True)


def add_file_root_path(model_or_path: str, file_path_metas: dict, cfg={}):

    if isinstance(file_path_metas, dict):
        for k, v in file_path_metas.items():
            if isinstance(v, str):
                p = os.path.join(model_or_path, v)
                if os.path.exists(p):
                    cfg[k] = p
            elif isinstance(v, dict):
                if k not in cfg:
                    cfg[k] = {}
                add_file_root_path(model_or_path, v, cfg[k])

    return cfg


def get_or_download_model_dir(
    model,
    model_revision=None,
    is_training=False,
    check_latest=True,
    model_hub="ms",
):
    """Get local model directory or download model if necessary.

    Args:
        model (str): model id or path to local model directory.
            For HF subfolders, use format: "repo_id/subfolder_path"
        model_revision (str, optional): model version number.
        is_training (bool): Whether this is for training
        check_latest (bool): Whether to check for latest version
        model_hub (str): Model hub type ("ms" for ModelScope, "hf" for HuggingFace)
    """
    # Extract repo_id for caching (handle subfolder case)
    if "/" in model and len(model.split("/")) > 2:
        parts = model.split("/")
        repo_id = "/".join(parts[:2])  # e.g., "organization/repo" or "stepfun-ai/Step-Audio-EditX"
        subfolder = "/".join(parts[2:])  # e.g., "subfolder/model"
    else:
        repo_id = model
        subfolder = None

    # Create cache key
    cache_key = (repo_id, model_revision, model_hub)

    # Check cache first
    with _cache_lock:
        if cache_key in _model_cache:
            cached_repo_dir = _model_cache[cache_key]
            print(f"Using cached model for {repo_id}: {cached_repo_dir}")

            # For subfolder case, construct the model_cache_dir from cached repo
            if subfolder:
                model_cache_dir = os.path.join(cached_repo_dir, subfolder)
                if not os.path.exists(model_cache_dir):
                    raise FileNotFoundError(
                        f"Subfolder {subfolder} not found in cached repo {repo_id}"
                    )
            else:
                model_cache_dir = cached_repo_dir

            return cached_repo_dir, model_cache_dir

    # Cache miss, need to download
    if model_hub == "ms":
        # ModelScope download
        from modelscope.hub.snapshot_download import snapshot_download
        from modelscope.utils.constant import Invoke, ThirdParty

        key = Invoke.LOCAL_TRAINER if is_training else Invoke.PIPELINE

        # Download the repo (use repo_id, not the full model path with subfolder)
        repo_cache_dir = snapshot_download(
            repo_id,
            revision=model_revision,
            user_agent={Invoke.KEY: key, ThirdParty.KEY: "funasr"},
        )
        repo_cache_dir = normalize_cache_path(repo_cache_dir)

        # Construct model_cache_dir
        if subfolder:
            model_cache_dir = os.path.join(repo_cache_dir, subfolder)
            if not os.path.exists(model_cache_dir):
                raise FileNotFoundError(
                    f"Subfolder {subfolder} not found in downloaded repo {repo_id}"
                )
        else:
            model_cache_dir = normalize_cache_path(repo_cache_dir)

    elif model_hub == "hf":
        # HuggingFace download
        try:
            from huggingface_hub import snapshot_download
        except ImportError:
            raise ImportError(
                "huggingface_hub is required for downloading from HuggingFace. "
                "Please install it with: pip install huggingface_hub"
            )

        # Download the repo (use repo_id, not the full model path with subfolder)
        repo_cache_dir = snapshot_download(
            repo_id=repo_id,
            revision=model_revision,
            allow_patterns=None,  # Download all files to ensure resource files are available
        )
        repo_cache_dir = normalize_cache_path(repo_cache_dir)

        # Construct model_cache_dir
        if subfolder:
            model_cache_dir = os.path.join(repo_cache_dir, subfolder)
            if not os.path.exists(model_cache_dir):
                raise FileNotFoundError(
                    f"Subfolder {subfolder} not found in downloaded repo {repo_id}"
                )
        else:
            model_cache_dir = normalize_cache_path(repo_cache_dir)
    else:
        raise ValueError(f"Unsupported model_hub: {model_hub}")

    # Cache the result before returning
    with _cache_lock:
        _model_cache[cache_key] = repo_cache_dir

    print(f"Model downloaded to: {model_cache_dir}")
    return repo_cache_dir, model_cache_dir


def normalize_cache_path(cache_path):
    """Normalize cache path to ensure consistent format with snapshots/{commit_id}."""
    # Check if the cache_path directory contains a snapshots folder
    snapshots_dir = os.path.join(cache_path, "snapshots")
    if os.path.exists(snapshots_dir) and os.path.isdir(snapshots_dir):
        # Find the commit_id subdirectory in snapshots
        try:
            snapshot_items = os.listdir(snapshots_dir)
            # Look for the first directory (should be the commit_id)
            for item in snapshot_items:
                item_path = os.path.join(snapshots_dir, item)
                if os.path.isdir(item_path):
                    # Found commit_id directory, return the full path
                    return os.path.join(cache_path, "snapshots", item)
        except OSError:
            pass

    # If no snapshots directory found or error occurred, return original path
    return cache_path
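A usage sketch for the resolver above; the "repo_id/subfolder" value is hypothetical, and the call assumes the resolved directory carries one of the config layouts handled in download_model:

kwargs = download_model(
    model="org/repo/subfolder",  # hypothetical "repo_id/subfolder" spec
    model_hub="hf",
    model_revision=None,
)
print(kwargs["model_path"])  # e.g. .../snapshots/<commit_id>/subfolder
print(kwargs["repo_path"])   # repo root under the HF cache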
@@ -0,0 +1,335 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import contextlib
import os
import tempfile
from abc import ABCMeta, abstractmethod
from pathlib import Path
from typing import Generator, Union

import requests
from urllib.parse import urlparse


def download_from_url(url):
    result = urlparse(url)
    file_path = None
    if result.scheme is not None and len(result.scheme) > 0:
        storage = HTTPStorage()
        # bytes
        data = storage.read(url)
        work_dir = tempfile.TemporaryDirectory().name
        if not os.path.exists(work_dir):
            os.makedirs(work_dir)
        file_path = os.path.join(work_dir, os.path.basename(url))
        with open(file_path, "wb") as fb:
            fb.write(data)
    assert file_path is not None, f"failed to download: {url}"
    return file_path


class Storage(metaclass=ABCMeta):
    """Abstract class of storage.

    All backends need to implement two apis: ``read()`` and ``read_text()``.
    ``read()`` reads the file as a byte stream and ``read_text()`` reads
    the file as texts.
    """

    @abstractmethod
    def read(self, filepath: str):
        pass

    @abstractmethod
    def read_text(self, filepath: str):
        pass

    @abstractmethod
    def write(self, obj: bytes, filepath: Union[str, Path]) -> None:
        pass

    @abstractmethod
    def write_text(
        self, obj: str, filepath: Union[str, Path], encoding: str = "utf-8"
    ) -> None:
        pass


class LocalStorage(Storage):
    """Local hard disk storage"""

    def read(self, filepath: Union[str, Path]) -> bytes:
        """Read data from a given ``filepath`` with 'rb' mode.

        Args:
            filepath (str or Path): Path to read data.

        Returns:
            bytes: Expected bytes object.
        """
        with open(filepath, "rb") as f:
            content = f.read()
        return content

    def read_text(self, filepath: Union[str, Path], encoding: str = "utf-8") -> str:
        """Read data from a given ``filepath`` with 'r' mode.

        Args:
            filepath (str or Path): Path to read data.
            encoding (str): The encoding format used to open the ``filepath``.
                Default: 'utf-8'.

        Returns:
            str: Expected text reading from ``filepath``.
        """
        with open(filepath, "r", encoding=encoding) as f:
            value_buf = f.read()
        return value_buf

    def write(self, obj: bytes, filepath: Union[str, Path]) -> None:
        """Write data to a given ``filepath`` with 'wb' mode.

        Note:
            ``write`` will create a directory if the directory of ``filepath``
            does not exist.

        Args:
            obj (bytes): Data to be written.
            filepath (str or Path): Path to write data.
        """
        dirname = os.path.dirname(filepath)
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname, exist_ok=True)

        with open(filepath, "wb") as f:
            f.write(obj)

    def write_text(
        self, obj: str, filepath: Union[str, Path], encoding: str = "utf-8"
    ) -> None:
        """Write data to a given ``filepath`` with 'w' mode.

        Note:
            ``write_text`` will create a directory if the directory of
            ``filepath`` does not exist.

        Args:
            obj (str): Data to be written.
            filepath (str or Path): Path to write data.
            encoding (str): The encoding format used to open the ``filepath``.
                Default: 'utf-8'.
        """
        dirname = os.path.dirname(filepath)
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname, exist_ok=True)

        with open(filepath, "w", encoding=encoding) as f:
            f.write(obj)

    @contextlib.contextmanager
    def as_local_path(
        self, filepath: Union[str, Path]
    ) -> Generator[Union[str, Path], None, None]:
        """Only for unified API and do nothing."""
        yield filepath


class HTTPStorage(Storage):
    """HTTP and HTTPS storage."""

    def read(self, url):
        # TODO @wenmeng.zwm add progress bar if file is too large
        r = requests.get(url)
        r.raise_for_status()
        return r.content

    def read_text(self, url):
        r = requests.get(url)
        r.raise_for_status()
        return r.text

    @contextlib.contextmanager
    def as_local_path(self, filepath: str) -> Generator[Union[str, Path], None, None]:
        """Download a file from ``filepath``.

        ``as_local_path`` is decorated by :meth:`contextlib.contextmanager`. It
        can be called with a ``with`` statement, and when the ``with`` statement
        exits, the temporary path will be released.

        Args:
            filepath (str): Download a file from ``filepath``.

        Examples:
            >>> storage = HTTPStorage()
            >>> # After exiting the ``with`` clause,
            >>> # the path will be removed
            >>> with storage.as_local_path('http://path/to/file') as path:
            ...     # do something here
        """
        try:
            f = tempfile.NamedTemporaryFile(delete=False)
            f.write(self.read(filepath))
            f.close()
            yield f.name
        finally:
            os.remove(f.name)

    def write(self, obj: bytes, url: Union[str, Path]) -> None:
        raise NotImplementedError("write is not supported by HTTP Storage")

    def write_text(
        self, obj: str, url: Union[str, Path], encoding: str = "utf-8"
    ) -> None:
        raise NotImplementedError("write_text is not supported by HTTP Storage")


class OSSStorage(Storage):
    """OSS storage."""

    def __init__(self, oss_config_file=None):
        # read from config file or env var
        raise NotImplementedError("OSSStorage.__init__ to be implemented in the future")

    def read(self, filepath):
        raise NotImplementedError("OSSStorage.read to be implemented in the future")

    def read_text(self, filepath, encoding="utf-8"):
        raise NotImplementedError(
            "OSSStorage.read_text to be implemented in the future"
        )

    @contextlib.contextmanager
    def as_local_path(self, filepath: str) -> Generator[Union[str, Path], None, None]:
        """Download a file from ``filepath``.

        ``as_local_path`` is decorated by :meth:`contextlib.contextmanager`. It
        can be called with a ``with`` statement, and when the ``with`` statement
        exits, the temporary path will be released.

        Args:
            filepath (str): Download a file from ``filepath``.

        Examples:
            >>> storage = OSSStorage()
            >>> # After exiting the ``with`` clause,
            >>> # the path will be removed
            >>> with storage.as_local_path('oss://path/to/file') as path:
            ...     # do something here
        """
        try:
            f = tempfile.NamedTemporaryFile(delete=False)
            f.write(self.read(filepath))
            f.close()
            yield f.name
        finally:
            os.remove(f.name)

    def write(self, obj: bytes, filepath: Union[str, Path]) -> None:
        raise NotImplementedError("OSSStorage.write to be implemented in the future")

    def write_text(
        self, obj: str, filepath: Union[str, Path], encoding: str = "utf-8"
    ) -> None:
        raise NotImplementedError(
            "OSSStorage.write_text to be implemented in the future"
        )


G_STORAGES = {}


class File(object):
    _prefix_to_storage: dict = {
        "oss": OSSStorage,
        "http": HTTPStorage,
        "https": HTTPStorage,
        "local": LocalStorage,
    }

    @staticmethod
    def _get_storage(uri):
        assert isinstance(uri, str), f"uri should be str type, but got {type(uri)}"

        if "://" not in uri:
            # local path
            storage_type = "local"
        else:
            prefix, _ = uri.split("://")
            storage_type = prefix

        assert storage_type in File._prefix_to_storage, (
            f"Unsupported uri {uri}, valid prefixes: "
            f"{list(File._prefix_to_storage.keys())}"
        )

        if storage_type not in G_STORAGES:
            G_STORAGES[storage_type] = File._prefix_to_storage[storage_type]()

        return G_STORAGES[storage_type]

    @staticmethod
    def read(uri: str) -> bytes:
        """Read data from a given ``filepath`` with 'rb' mode.

        Args:
            filepath (str or Path): Path to read data.

        Returns:
            bytes: Expected bytes object.
        """
        storage = File._get_storage(uri)
        return storage.read(uri)

    @staticmethod
    def read_text(uri: Union[str, Path], encoding: str = "utf-8") -> str:
        """Read data from a given ``filepath`` with 'r' mode.

        Args:
            filepath (str or Path): Path to read data.
            encoding (str): The encoding format used to open the ``filepath``.
                Default: 'utf-8'.

        Returns:
            str: Expected text reading from ``filepath``.
        """
        storage = File._get_storage(uri)
        return storage.read_text(uri)

    @staticmethod
    def write(obj: bytes, uri: Union[str, Path]) -> None:
        """Write data to a given ``filepath`` with 'wb' mode.

        Note:
            ``write`` will create a directory if the directory of ``filepath``
            does not exist.

        Args:
            obj (bytes): Data to be written.
            filepath (str or Path): Path to write data.
        """
        storage = File._get_storage(uri)
        return storage.write(obj, uri)

    @staticmethod
    def write_text(obj: str, uri: str, encoding: str = "utf-8") -> None:
        """Write data to a given ``filepath`` with 'w' mode.

        Note:
            ``write_text`` will create a directory if the directory of
            ``filepath`` does not exist.

        Args:
            obj (str): Data to be written.
            filepath (str or Path): Path to write data.
            encoding (str): The encoding format used to open the ``filepath``.
                Default: 'utf-8'.
        """
        storage = File._get_storage(uri)
        return storage.write_text(obj, uri)

    @staticmethod
    @contextlib.contextmanager
    def as_local_path(uri: str) -> Generator[Union[str, Path], None, None]:
        """Provide a local path for ``uri``, delegating to the matching backend."""
        storage = File._get_storage(uri)
        with storage.as_local_path(uri) as local_path:
            yield local_path
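A short usage sketch of the unified File API above (URL and paths are placeholders):

data = File.read("https://example.com/model.bin")   # dispatched to HTTPStorage
File.write(data, "/tmp/models/model.bin")           # LocalStorage; creates parent dirs
with File.as_local_path("https://example.com/a.txt") as p:
    print(p)  # temporary local copy, deleted when the block exits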
@@ -0,0 +1,13 @@
name_maps_ms = {
    "paraformer-zh": "damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
    "paraformer-en": "damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020",
    "paraformer-en-spk": "damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020",
    "paraformer-zh-streaming": "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online",
    "fsmn-vad": "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
    "ct-punc": "damo/punc_ct-transformer_cn-en-common-vocab471067-large",
    "ct-punc-c": "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
    "fa-zh": "damo/speech_timestamp_prediction-v1-16k-offline",
    "cam++": "damo/speech_campplus_sv_zh-cn_16k-common",
}

name_maps_hf = {}
@@ -0,0 +1,60 @@
import os
import argparse
from pathlib import Path

from funasr_detach.utils.types import str2bool


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-name", type=str, required=True)
    parser.add_argument("--export-dir", type=str, required=True)
    parser.add_argument(
        "--export", type=str2bool, default=True, help="whether to export model"
    )
    parser.add_argument("--type", type=str, default="onnx", help='["onnx", "torch"]')
    parser.add_argument("--device", type=str, default="cpu", help='["cpu", "cuda"]')
    parser.add_argument(
        "--quantize", type=str2bool, default=False, help="export quantized model"
    )
    parser.add_argument(
        "--fallback-num", type=int, default=0, help="amp fallback number"
    )
    parser.add_argument("--audio_in", type=str, default=None, help='["wav", "wav.scp"]')
    parser.add_argument(
        "--model_revision", type=str, default=None, help="model_revision"
    )
    parser.add_argument("--calib_num", type=int, default=200, help="calib max num")
    args = parser.parse_args()

    model_dir = args.model_name
    if not Path(args.model_name).exists():
        from modelscope.hub.snapshot_download import snapshot_download

        try:
            model_dir = snapshot_download(
                args.model_name, cache_dir=args.export_dir, revision=args.model_revision
            )
        except:
            raise ValueError(
                "model_dir must be a model_name in modelscope or a local path "
                "downloaded from modelscope, but is {}".format(model_dir)
            )
    if args.export:
        model_file = os.path.join(model_dir, "model.onnx")
        if args.quantize:
            model_file = os.path.join(model_dir, "model_quant.onnx")
        if not os.path.exists(model_file):
            print("model.onnx does not exist, beginning to export onnx")
            from funasr_detach.bin.export_model import ModelExport

            export_model = ModelExport(
                cache_dir=args.export_dir,
                onnx=True,
                device="cpu",
                quant=args.quantize,
            )
            export_model.export(model_dir)


if __name__ == "__main__":
    main()
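An illustrative invocation (the script path is hypothetical; the model id is one of the ModelScope names mapped earlier in this commit):

python path/to/this_script.py \
    --model-name damo/speech_fsmn_vad_zh-cn-16k-common-pytorch \
    --export-dir ./export \
    --quantize false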
File without changes
@@ -0,0 +1,347 @@
import copy
from typing import Optional
from typing import Tuple
from typing import Union
import logging
import humanfriendly
import numpy as np
import torch
import torch.nn as nn

try:
    from torch_complex.tensor import ComplexTensor
except:
    print("Please install torch_complex firstly")

from funasr_detach.frontends.utils.log_mel import LogMel
from funasr_detach.frontends.utils.stft import Stft
from funasr_detach.frontends.utils.frontend import Frontend
from funasr_detach.models.transformer.utils.nets_utils import make_pad_mask


class DefaultFrontend(nn.Module):
    """Conventional frontend structure for ASR.
    Stft -> WPE -> MVDR-Beamformer -> Power-spec -> Mel-Fbank -> CMVN
    """

    def __init__(
        self,
        fs: Union[int, str] = 16000,
        n_fft: int = 512,
        win_length: int = None,
        hop_length: int = 128,
        window: Optional[str] = "hann",
        center: bool = True,
        normalized: bool = False,
        onesided: bool = True,
        n_mels: int = 80,
        fmin: int = None,
        fmax: int = None,
        htk: bool = False,
        frontend_conf: Optional[dict] = None,
        apply_stft: bool = True,
        use_channel: int = None,
    ):
        super().__init__()
        if isinstance(fs, str):
            fs = humanfriendly.parse_size(fs)

        # Deepcopy (In general, dict shouldn't be used as default arg)
        frontend_conf = copy.deepcopy(frontend_conf)
        self.hop_length = hop_length

        if apply_stft:
            self.stft = Stft(
                n_fft=n_fft,
                win_length=win_length,
                hop_length=hop_length,
                center=center,
                window=window,
                normalized=normalized,
                onesided=onesided,
            )
        else:
            self.stft = None
        self.apply_stft = apply_stft

        if frontend_conf is not None:
            self.frontend = Frontend(idim=n_fft // 2 + 1, **frontend_conf)
        else:
            self.frontend = None

        self.logmel = LogMel(
            fs=fs,
            n_fft=n_fft,
            n_mels=n_mels,
            fmin=fmin,
            fmax=fmax,
            htk=htk,
        )
        self.n_mels = n_mels
        self.use_channel = use_channel
        self.frontend_type = "default"

    def output_size(self) -> int:
        return self.n_mels

    def forward(
        self, input: torch.Tensor, input_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # 1. Domain-conversion: e.g. Stft: time -> time-freq
        if self.stft is not None:
            input_stft, feats_lens = self._compute_stft(input, input_lengths)
        else:
            input_stft = ComplexTensor(input[..., 0], input[..., 1])
            feats_lens = input_lengths
        # 2. [Option] Speech enhancement
        if self.frontend is not None:
            assert isinstance(input_stft, ComplexTensor), type(input_stft)
            # input_stft: (Batch, Length, [Channel], Freq)
            input_stft, _, mask = self.frontend(input_stft, feats_lens)

        # 3. [Multi channel case]: Select a channel
        if input_stft.dim() == 4:
            # h: (B, T, C, F) -> h: (B, T, F)
            if self.training:
                if self.use_channel is not None:
                    input_stft = input_stft[:, :, self.use_channel, :]
                else:
                    # Select 1ch randomly
                    ch = np.random.randint(input_stft.size(2))
                    input_stft = input_stft[:, :, ch, :]
            else:
                # Use the first channel
                input_stft = input_stft[:, :, 0, :]

        # 4. STFT -> Power spectrum
        # h: ComplexTensor(B, T, F) -> torch.Tensor(B, T, F)
        input_power = input_stft.real**2 + input_stft.imag**2

        # 5. Feature transform e.g. Stft -> Log-Mel-Fbank
        # input_power: (Batch, [Channel,] Length, Freq)
        # -> input_feats: (Batch, Length, Dim)
        input_feats, _ = self.logmel(input_power, feats_lens)

        return input_feats, feats_lens

    def _compute_stft(
        self, input: torch.Tensor, input_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        input_stft, feats_lens = self.stft(input, input_lengths)

        assert input_stft.dim() >= 4, input_stft.shape
        # "2" refers to the real/imag parts of Complex
        assert input_stft.shape[-1] == 2, input_stft.shape

        # Change torch.Tensor to ComplexTensor
        # input_stft: (..., F, 2) -> (..., F)
        input_stft = ComplexTensor(input_stft[..., 0], input_stft[..., 1])
        return input_stft, feats_lens
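A minimal sketch of extracting 80-dim log-mel features with the frontend above (batch shapes and lengths are illustrative):

import torch

frontend = DefaultFrontend(fs=16000, n_fft=512, hop_length=128, n_mels=80)
speech = torch.randn(2, 16000)          # (batch, samples) of dummy audio
lengths = torch.tensor([16000, 12000])  # valid samples per utterance
feats, feats_lens = frontend(speech, lengths)
# feats: (batch, frames, 80); feats_lens: valid frame counts per utterance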
class MultiChannelFrontend(nn.Module):
    """Conventional frontend structure for ASR.
    Stft -> WPE -> MVDR-Beamformer -> Power-spec -> Mel-Fbank -> CMVN
    """

    def __init__(
        self,
        fs: Union[int, str] = 16000,
        n_fft: int = 512,
        win_length: int = None,
        hop_length: int = None,
        frame_length: int = None,
        frame_shift: int = None,
        window: Optional[str] = "hann",
        center: bool = True,
        normalized: bool = False,
        onesided: bool = True,
        n_mels: int = 80,
        fmin: int = None,
        fmax: int = None,
        htk: bool = False,
        frontend_conf: Optional[dict] = None,
        apply_stft: bool = True,
        use_channel: int = None,
        lfr_m: int = 1,
        lfr_n: int = 1,
        cmvn_file: str = None,
        mc: bool = True,
    ):
        super().__init__()
        if isinstance(fs, str):
            fs = humanfriendly.parse_size(fs)

        # Deepcopy (In general, dict shouldn't be used as default arg)
        frontend_conf = copy.deepcopy(frontend_conf)
        if win_length is None and hop_length is None:
            # frame_length / frame_shift are in ms; 16 samples per ms at 16 kHz
            self.win_length = frame_length * 16
            self.hop_length = frame_shift * 16
        elif frame_length is None and frame_shift is None:
            self.win_length = win_length
            self.hop_length = hop_length
        else:
            logging.error(
                "Only one of (win_length, hop_length) and (frame_length, frame_shift) "
                "can be set."
            )
            exit(1)

        if apply_stft:
            self.stft = Stft(
                n_fft=n_fft,
                win_length=self.win_length,
                hop_length=self.hop_length,
                center=center,
                window=window,
                normalized=normalized,
                onesided=onesided,
            )
        else:
            self.stft = None
        self.apply_stft = apply_stft

        if frontend_conf is not None:
            self.frontend = Frontend(idim=n_fft // 2 + 1, **frontend_conf)
        else:
            self.frontend = None

        self.logmel = LogMel(
            fs=fs,
            n_fft=n_fft,
            n_mels=n_mels,
            fmin=fmin,
            fmax=fmax,
            htk=htk,
        )
        self.n_mels = n_mels
        self.use_channel = use_channel
        self.mc = mc
        if not self.mc:
            if self.use_channel is not None:
                logging.info("use the channel %d" % (self.use_channel))
            else:
                logging.info("random select channel")
        self.cmvn_file = cmvn_file
        if self.cmvn_file is not None:
            mean, std = self._load_cmvn(self.cmvn_file)
            self.register_buffer("mean", torch.from_numpy(mean))
            self.register_buffer("std", torch.from_numpy(std))
        self.frontend_type = "multichannelfrontend"

    def output_size(self) -> int:
        return self.n_mels

    def forward(
        self, input: torch.Tensor, input_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # 1. Domain-conversion: e.g. Stft: time -> time-freq
        # import pdb;pdb.set_trace()
        if self.stft is not None:
            input_stft, feats_lens = self._compute_stft(input, input_lengths)
        else:
            input_stft = ComplexTensor(input[..., 0], input[..., 1])
            feats_lens = input_lengths
        # 2. [Option] Speech enhancement
        if self.frontend is not None:
            assert isinstance(input_stft, ComplexTensor), type(input_stft)
            # input_stft: (Batch, Length, [Channel], Freq)
            input_stft, _, mask = self.frontend(input_stft, feats_lens)

        # 3. [Multi channel case]: Select a channel (sa_asr)
        if input_stft.dim() == 4 and not self.mc:
            # h: (B, T, C, F) -> h: (B, T, F)
            if self.training:
                if self.use_channel is not None:
                    input_stft = input_stft[:, :, self.use_channel, :]
                else:
                    # Select 1ch randomly
                    ch = np.random.randint(input_stft.size(2))
                    input_stft = input_stft[:, :, ch, :]
            else:
                # Use the first channel
                input_stft = input_stft[:, :, 0, :]

        # 4. STFT -> Power spectrum
        # h: ComplexTensor(B, T, F) -> torch.Tensor(B, T, F)
        input_power = input_stft.real**2 + input_stft.imag**2

        # 5. Feature transform e.g. Stft -> Log-Mel-Fbank
        # input_power: (Batch, [Channel,] Length, Freq)
        # -> input_feats: (Batch, Length, Dim)
        input_feats, _ = self.logmel(input_power, feats_lens)
        if self.mc:
            # MFCCA
            if input_feats.dim() == 4:
                bt = input_feats.size(0)
                channel_size = input_feats.size(2)
                input_feats = (
                    input_feats.transpose(1, 2)
                    .reshape(bt * channel_size, -1, 80)
                    .contiguous()
                )
                feats_lens = feats_lens.repeat(1, channel_size).squeeze()
            else:
                channel_size = 1
            return input_feats, feats_lens, channel_size
        else:
            # 6. Apply CMVN
            if self.cmvn_file is not None:
                if feats_lens is None:
                    feats_lens = input_feats.new_full(
                        [input_feats.size(0)], input_feats.size(1)
                    )
                self.mean = self.mean.to(input_feats.device, input_feats.dtype)
                self.std = self.std.to(input_feats.device, input_feats.dtype)
                mask = make_pad_mask(feats_lens, input_feats, 1)

                if input_feats.requires_grad:
                    input_feats = input_feats + self.mean
                else:
                    input_feats += self.mean
                if input_feats.requires_grad:
                    input_feats = input_feats.masked_fill(mask, 0.0)
                else:
                    input_feats.masked_fill_(mask, 0.0)

                input_feats *= self.std

            return input_feats, feats_lens

    def _compute_stft(
        self, input: torch.Tensor, input_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        input_stft, feats_lens = self.stft(input, input_lengths)

        assert input_stft.dim() >= 4, input_stft.shape
        # "2" refers to the real/imag parts of Complex
        assert input_stft.shape[-1] == 2, input_stft.shape

        # Change torch.Tensor to ComplexTensor
        # input_stft: (..., F, 2) -> (..., F)
| 323 |
+
input_stft = ComplexTensor(input_stft[..., 0], input_stft[..., 1])
|
| 324 |
+
return input_stft, feats_lens
|
| 325 |
+
|
| 326 |
+
def _load_cmvn(self, cmvn_file):
|
| 327 |
+
with open(cmvn_file, "r", encoding="utf-8") as f:
|
| 328 |
+
lines = f.readlines()
|
| 329 |
+
means_list = []
|
| 330 |
+
vars_list = []
|
| 331 |
+
for i in range(len(lines)):
|
| 332 |
+
line_item = lines[i].split()
|
| 333 |
+
if line_item[0] == "<AddShift>":
|
| 334 |
+
line_item = lines[i + 1].split()
|
| 335 |
+
if line_item[0] == "<LearnRateCoef>":
|
| 336 |
+
add_shift_line = line_item[3 : (len(line_item) - 1)]
|
| 337 |
+
means_list = list(add_shift_line)
|
| 338 |
+
continue
|
| 339 |
+
elif line_item[0] == "<Rescale>":
|
| 340 |
+
line_item = lines[i + 1].split()
|
| 341 |
+
if line_item[0] == "<LearnRateCoef>":
|
| 342 |
+
rescale_line = line_item[3 : (len(line_item) - 1)]
|
| 343 |
+
vars_list = list(rescale_line)
|
| 344 |
+
continue
|
| 345 |
+
means = np.array(means_list).astype(np.float)
|
| 346 |
+
vars = np.array(vars_list).astype(np.float)
|
| 347 |
+
return means, vars
|
|
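
A minimal sketch (not part of the repo) of the Kaldi-style CMVN applied in step 6 above: the stats file stores the negative mean under `<AddShift>` and the inverse standard deviation under `<Rescale>`, so normalization reduces to `(x + neg_mean) * inv_std` with padded frames zeroed. The tensors below are toy stand-ins for the loaded buffers.

import torch

B, T, D = 2, 5, 4
feats = torch.randn(B, T, D)
feats_lens = torch.tensor([5, 3])

neg_mean = -feats.mean(dim=(0, 1))     # stand-in for the <AddShift> row
inv_std = 1.0 / feats.std(dim=(0, 1))  # stand-in for the <Rescale> row

# pad mask: True where frame index >= utterance length
mask = torch.arange(T)[None, :, None] >= feats_lens[:, None, None]

feats = (feats + neg_mean).masked_fill(mask, 0.0) * inv_std
print(feats.shape)  # torch.Size([2, 5, 4])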
@@ -0,0 +1,49 @@
# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita)
# Licensed under the MIT license.
#
# This module is for computing audio features

import librosa
import numpy as np


def transform(Y, dtype=np.float32):
    Y = np.abs(Y)
    n_fft = 2 * (Y.shape[1] - 1)
    sr = 8000
    n_mels = 23
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    Y = np.dot(Y**2, mel_basis.T)
    Y = np.log10(np.maximum(Y, 1e-10))
    mean = np.mean(Y, axis=0)
    Y = Y - mean
    return Y.astype(dtype)


def subsample(Y, T, subsampling=1):
    Y_ss = Y[::subsampling]
    T_ss = T[::subsampling]
    return Y_ss, T_ss


def splice(Y, context_size=0):
    Y_pad = np.pad(Y, [(context_size, context_size), (0, 0)], "constant")
    Y_spliced = np.lib.stride_tricks.as_strided(
        np.ascontiguousarray(Y_pad),
        (Y.shape[0], Y.shape[1] * (2 * context_size + 1)),
        (Y.itemsize * Y.shape[1], Y.itemsize),
        writeable=False,
    )
    return Y_spliced


def stft(data, frame_size=1024, frame_shift=256):
    fft_size = 1 << (frame_size - 1).bit_length()
    if len(data) % frame_shift == 0:
        return librosa.stft(
            data, n_fft=fft_size, win_length=frame_size, hop_length=frame_shift
        ).T[:-1]
    else:
        return librosa.stft(
            data, n_fft=fft_size, win_length=frame_size, hop_length=frame_shift
        ).T
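
A hypothetical end-to-end use of the helpers above on one 8 kHz waveform; the frame sizes mirror the defaults, and the waveform and label array are made up. Requires librosa installed alongside this module.

import numpy as np

wav = np.random.randn(8000 * 2).astype(np.float32)  # 2 s of noise
Y = stft(wav, frame_size=1024, frame_shift=256)     # (frames, 513) complex
feats = transform(Y)                                # (frames, 23) log-mel
feats = splice(feats, context_size=7)               # (frames, 23 * 15)
T = np.zeros(len(feats))                            # dummy frame labels
feats_ss, T_ss = subsample(feats, T, subsampling=10)
print(feats_ss.shape)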
@@ -0,0 +1,144 @@
from funasr_detach.frontends.default import DefaultFrontend
from funasr_detach.frontends.s3prl import S3prlFrontend
import numpy as np
import torch
import torch.nn as nn
from typing import Tuple


class FusedFrontends(nn.Module):
    def __init__(
        self, frontends=None, align_method="linear_projection", proj_dim=100, fs=16000
    ):

        super().__init__()
        self.align_method = (
            align_method  # fusing method: linear_projection only for now
        )
        self.proj_dim = proj_dim  # dim of the projection done on each frontend
        self.frontends = []  # list of the frontends to combine

        for i, frontend in enumerate(frontends):
            frontend_type = frontend["frontend_type"]
            if frontend_type == "default":
                n_mels, fs, n_fft, win_length, hop_length = (
                    frontend.get("n_mels", 80),
                    fs,
                    frontend.get("n_fft", 512),
                    frontend.get("win_length"),
                    frontend.get("hop_length", 128),
                )
                window, center, normalized, onesided = (
                    frontend.get("window", "hann"),
                    frontend.get("center", True),
                    frontend.get("normalized", False),
                    frontend.get("onesided", True),
                )
                fmin, fmax, htk, apply_stft = (
                    frontend.get("fmin", None),
                    frontend.get("fmax", None),
                    frontend.get("htk", False),
                    frontend.get("apply_stft", True),
                )

                self.frontends.append(
                    DefaultFrontend(
                        n_mels=n_mels,
                        n_fft=n_fft,
                        fs=fs,
                        win_length=win_length,
                        hop_length=hop_length,
                        window=window,
                        center=center,
                        normalized=normalized,
                        onesided=onesided,
                        fmin=fmin,
                        fmax=fmax,
                        htk=htk,
                        apply_stft=apply_stft,
                    )
                )
            elif frontend_type == "s3prl":
                frontend_conf, download_dir, multilayer_feature = (
                    frontend.get("frontend_conf"),
                    frontend.get("download_dir"),
                    frontend.get("multilayer_feature"),
                )
                self.frontends.append(
                    S3prlFrontend(
                        fs=fs,
                        frontend_conf=frontend_conf,
                        download_dir=download_dir,
                        multilayer_feature=multilayer_feature,
                    )
                )

            else:
                raise NotImplementedError  # frontends are only default or s3prl

        self.frontends = torch.nn.ModuleList(self.frontends)

        self.gcd = np.gcd.reduce([frontend.hop_length for frontend in self.frontends])
        self.factors = [frontend.hop_length // self.gcd for frontend in self.frontends]
        if torch.cuda.is_available():
            dev = "cuda"
        else:
            dev = "cpu"
        if self.align_method == "linear_projection":
            self.projection_layers = [
                torch.nn.Linear(
                    in_features=frontend.output_size(),
                    out_features=self.factors[i] * self.proj_dim,
                )
                for i, frontend in enumerate(self.frontends)
            ]
            self.projection_layers = torch.nn.ModuleList(self.projection_layers)
            self.projection_layers = self.projection_layers.to(torch.device(dev))

    def output_size(self) -> int:
        return len(self.frontends) * self.proj_dim

    def forward(
        self, input: torch.Tensor, input_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:

        # step 0: get all frontends features
        self.feats = []
        for frontend in self.frontends:
            with torch.no_grad():
                input_feats, feats_lens = frontend.forward(input, input_lengths)
            self.feats.append([input_feats, feats_lens])

        if (
            self.align_method == "linear_projection"
        ):  # TODO(Dan): to add other align methods

            # first step: projections
            self.feats_proj = []
            for i, frontend in enumerate(self.frontends):
                input_feats = self.feats[i][0]
                self.feats_proj.append(self.projection_layers[i](input_feats))

            # 2nd step: reshape
            self.feats_reshaped = []
            for i, frontend in enumerate(self.frontends):
                input_feats_proj = self.feats_proj[i]
                bs, nf, dim = input_feats_proj.shape
                input_feats_reshaped = torch.reshape(
                    input_feats_proj, (bs, nf * self.factors[i], dim // self.factors[i])
                )
                self.feats_reshaped.append(input_feats_reshaped)

            # 3rd step: drop the few last frames
            m = min([x.shape[1] for x in self.feats_reshaped])
            self.feats_final = [x[:, :m, :] for x in self.feats_reshaped]

            input_feats = torch.cat(
                self.feats_final, dim=-1
            )  # change the input size of the preencoder: proj_dim * n_frontends
            feats_lens = torch.ones_like(self.feats[0][1]) * m

        else:
            raise NotImplementedError

        return input_feats, feats_lens
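
A small sketch of the frame-rate alignment arithmetic used above: with toy hop lengths 128 and 160 the gcd is 32, so each stream is projected to factor * proj_dim features and reshaped to a common frame rate before concatenation.

import numpy as np

hop_lengths = [128, 160]                    # assumed per-frontend hops
gcd = np.gcd.reduce(hop_lengths)            # 32
factors = [h // gcd for h in hop_lengths]   # [4, 5]

proj_dim = 100
nf = 30  # frames produced by one frontend (toy value)
for h, f in zip(hop_lengths, factors):
    # (bs, nf, f * proj_dim) -> (bs, nf * f, proj_dim)
    print(f"hop {h}: {nf} frames -> {nf * f} frames of dim {proj_dim}")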
@@ -0,0 +1,139 @@
import copy
import logging
import os
from argparse import Namespace
from typing import Optional
from typing import Tuple
from typing import Union

import humanfriendly
import torch
import torch.nn as nn

from funasr_detach.frontends.utils.frontend import Frontend
from funasr_detach.models.transformer.utils.nets_utils import pad_list


def base_s3prl_setup(args):
    args.upstream_feature_selection = getattr(args, "upstream_feature_selection", None)
    args.upstream_model_config = getattr(args, "upstream_model_config", None)
    args.upstream_refresh = getattr(args, "upstream_refresh", False)
    args.upstream_ckpt = getattr(args, "upstream_ckpt", None)
    args.init_ckpt = getattr(args, "init_ckpt", None)
    args.verbose = getattr(args, "verbose", False)
    args.tile_factor = getattr(args, "tile_factor", 1)
    return args


class S3prlFrontend(nn.Module):
    """Speech Pretrained Representation frontend structure for ASR."""

    def __init__(
        self,
        fs: Union[int, str] = 16000,
        frontend_conf: Optional[dict] = None,
        download_dir: str = None,
        multilayer_feature: bool = False,
    ):
        super().__init__()
        if isinstance(fs, str):
            fs = humanfriendly.parse_size(fs)

        if download_dir is not None:
            torch.hub.set_dir(download_dir)

        self.multilayer_feature = multilayer_feature
        self.upstream, self.featurizer = self._get_upstream(frontend_conf)
        self.pretrained_params = copy.deepcopy(self.upstream.state_dict())
        self.output_dim = self.featurizer.output_dim
        self.frontend_type = "s3prl"
        self.hop_length = self.upstream.get_downsample_rates("key")

    def _get_upstream(self, frontend_conf):
        """Get S3PRL upstream model."""
        s3prl_args = base_s3prl_setup(
            Namespace(**frontend_conf, device="cpu"),
        )
        self.args = s3prl_args

        s3prl_path = None
        python_path_list = os.environ.get("PYTHONPATH", "(None)").split(":")
        for p in python_path_list:
            if p.endswith("s3prl"):
                s3prl_path = p
                break
        assert s3prl_path is not None

        s3prl_upstream = torch.hub.load(
            s3prl_path,
            s3prl_args.upstream,
            ckpt=s3prl_args.upstream_ckpt,
            model_config=s3prl_args.upstream_model_config,
            refresh=s3prl_args.upstream_refresh,
            source="local",
        ).to("cpu")

        if getattr(
            s3prl_upstream, "model", None
        ) is not None and s3prl_upstream.model.__class__.__name__ in [
            "Wav2Vec2Model",
            "HubertModel",
        ]:
            s3prl_upstream.model.encoder.layerdrop = 0.0

        from s3prl.upstream.interfaces import Featurizer

        if self.multilayer_feature is None:
            feature_selection = "last_hidden_state"
        else:
            feature_selection = "hidden_states"
        s3prl_featurizer = Featurizer(
            upstream=s3prl_upstream,
            feature_selection=feature_selection,
            upstream_device="cpu",
        )

        return s3prl_upstream, s3prl_featurizer

    def _tile_representations(self, feature):
        """Tile up the representations by `tile_factor`.

        Input - sequence of representations
            shape: (batch_size, seq_len, feature_dim)
        Output - sequence of tiled representations
            shape: (batch_size, seq_len * factor, feature_dim)
        """
        assert (
            len(feature.shape) == 3
        ), "Input argument `feature` has invalid shape: {}".format(feature.shape)
        tiled_feature = feature.repeat(1, 1, self.args.tile_factor)
        tiled_feature = tiled_feature.reshape(
            feature.size(0), feature.size(1) * self.args.tile_factor, feature.size(2)
        )
        return tiled_feature

    def output_size(self) -> int:
        return self.output_dim

    def forward(
        self, input: torch.Tensor, input_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        wavs = [wav[: input_lengths[i]] for i, wav in enumerate(input)]
        self.upstream.eval()
        with torch.no_grad():
            feats = self.upstream(wavs)
        feats = self.featurizer(wavs, feats)

        if self.args.tile_factor != 1:
            feats = self._tile_representations(feats)

        input_feats = pad_list(feats, 0.0)
        feats_lens = torch.tensor([f.shape[0] for f in feats], dtype=torch.long)

        # Saving CUDA Memory
        del feats

        return input_feats, feats_lens

    def reload_pretrained_parameters(self):
        self.upstream.load_state_dict(self.pretrained_params)
        logging.info("Pretrained S3PRL frontend model parameters reloaded!")
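
A standalone sketch of the tiling in `_tile_representations`: repeating along the feature axis and reshaping turns (B, T, D) into (B, T * factor, D), duplicating each frame `factor` times. Values here are illustrative only.

import torch

factor = 2
feature = torch.arange(12.0).reshape(1, 3, 4)  # (B=1, T=3, D=4)
tiled = feature.repeat(1, 1, factor).reshape(1, 3 * factor, 4)
print(tiled.shape)  # torch.Size([1, 6, 4]); frames appear twice in a row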
@@ -0,0 +1 @@
"""Initialize sub package."""
@@ -0,0 +1,84 @@
import torch
from torch_complex import functional as FC
from torch_complex.tensor import ComplexTensor


def get_power_spectral_density_matrix(
    xs: ComplexTensor, mask: torch.Tensor, normalization=True, eps: float = 1e-15
) -> ComplexTensor:
    """Return cross-channel power spectral density (PSD) matrix

    Args:
        xs (ComplexTensor): (..., F, C, T)
        mask (torch.Tensor): (..., F, C, T)
        normalization (bool):
        eps (float):
    Returns:
        psd (ComplexTensor): (..., F, C, C)

    """
    # outer product: (..., C_1, T) x (..., C_2, T) -> (..., T, C_1, C_2)
    psd_Y = FC.einsum("...ct,...et->...tce", [xs, xs.conj()])

    # Averaging mask along C: (..., C, T) -> (..., T)
    mask = mask.mean(dim=-2)

    # Normalized mask along T: (..., T)
    if normalization:
        # If assuming the tensor is padded with zero, the summation along
        # the time axis is same regardless of the padding length.
        mask = mask / (mask.sum(dim=-1, keepdim=True) + eps)

    # psd: (..., T, C, C)
    psd = psd_Y * mask[..., None, None]
    # (..., T, C, C) -> (..., C, C)
    psd = psd.sum(dim=-3)

    return psd


def get_mvdr_vector(
    psd_s: ComplexTensor,
    psd_n: ComplexTensor,
    reference_vector: torch.Tensor,
    eps: float = 1e-15,
) -> ComplexTensor:
    """Return the MVDR (Minimum Variance Distortionless Response) vector:

        h = (Npsd^-1 @ Spsd) / (Tr(Npsd^-1 @ Spsd)) @ u

    Reference:
        On optimal frequency-domain multichannel linear filtering
        for noise reduction; M. Souden et al., 2010;
        https://ieeexplore.ieee.org/document/5089420

    Args:
        psd_s (ComplexTensor): (..., F, C, C)
        psd_n (ComplexTensor): (..., F, C, C)
        reference_vector (torch.Tensor): (..., C)
        eps (float):
    Returns:
        beamform_vector (ComplexTensor): (..., F, C)
    """
    # Add eps
    C = psd_n.size(-1)
    eye = torch.eye(C, dtype=psd_n.dtype, device=psd_n.device)
    shape = [1 for _ in range(psd_n.dim() - 2)] + [C, C]
    eye = eye.view(*shape)
    psd_n += eps * eye

    # numerator: (..., C_1, C_2) x (..., C_2, C_3) -> (..., C_1, C_3)
    numerator = FC.einsum("...ec,...cd->...ed", [psd_n.inverse(), psd_s])
    # ws: (..., C, C) / (...,) -> (..., C, C)
    ws = numerator / (FC.trace(numerator)[..., None, None] + eps)
    # h: (..., F, C_1, C_2) x (..., C_2) -> (..., F, C_1)
    beamform_vector = FC.einsum("...fec,...c->...fe", [ws, reference_vector])
    return beamform_vector


def apply_beamforming_vector(
    beamform_vector: ComplexTensor, mix: ComplexTensor
) -> ComplexTensor:
    # (..., C) x (..., C, T) -> (..., T)
    es = FC.einsum("...c,...ct->...t", [beamform_vector.conj(), mix])
    return es
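
A toy run (not from the repo's tests) of the three helpers above: build speech and noise PSD matrices from random masks, pick microphone 0 as the reference, and check the beamformed output shape. This assumes torch_complex is installed; all sizes are invented.

import torch
from torch_complex.tensor import ComplexTensor

B, F_, C, T = 1, 4, 3, 10
xs = ComplexTensor(torch.randn(B, F_, C, T), torch.randn(B, F_, C, T))
mask_s = torch.rand(B, F_, C, T)
mask_n = 1.0 - mask_s

psd_s = get_power_spectral_density_matrix(xs, mask_s)  # (B, F, C, C)
psd_n = get_power_spectral_density_matrix(xs, mask_n)  # (B, F, C, C)

u = torch.zeros(B, C)
u[:, 0] = 1.0                                 # one-hot reference microphone

ws = get_mvdr_vector(psd_s, psd_n, u)         # (B, F, C)
enhanced = apply_beamforming_vector(ws, xs)   # (B, F, T)
print(enhanced.real.shape)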
@@ -0,0 +1,194 @@
"""Beamformer module."""

from distutils.version import LooseVersion
from typing import Sequence
from typing import Tuple
from typing import Union

import torch

try:
    from torch_complex import functional as FC
    from torch_complex.tensor import ComplexTensor
except ImportError:
    print("Please install torch_complex first")


EPS = torch.finfo(torch.double).eps
is_torch_1_8_plus = LooseVersion(torch.__version__) >= LooseVersion("1.8.0")
is_torch_1_9_plus = LooseVersion(torch.__version__) >= LooseVersion("1.9.0")


def new_complex_like(
    ref: Union[torch.Tensor, ComplexTensor],
    real_imag: Tuple[torch.Tensor, torch.Tensor],
):
    if isinstance(ref, ComplexTensor):
        return ComplexTensor(*real_imag)
    elif is_torch_complex_tensor(ref):
        return torch.complex(*real_imag)
    else:
        raise ValueError(
            "Please update your PyTorch version to 1.9+ for complex support."
        )


def is_torch_complex_tensor(c):
    return (
        not isinstance(c, ComplexTensor) and is_torch_1_9_plus and torch.is_complex(c)
    )


def is_complex(c):
    return isinstance(c, ComplexTensor) or is_torch_complex_tensor(c)


def to_double(c):
    if not isinstance(c, ComplexTensor) and is_torch_1_9_plus and torch.is_complex(c):
        return c.to(dtype=torch.complex128)
    else:
        return c.double()


def to_float(c):
    if not isinstance(c, ComplexTensor) and is_torch_1_9_plus and torch.is_complex(c):
        return c.to(dtype=torch.complex64)
    else:
        return c.float()


def cat(seq: Sequence[Union[ComplexTensor, torch.Tensor]], *args, **kwargs):
    if not isinstance(seq, (list, tuple)):
        raise TypeError(
            "cat(): argument 'tensors' (position 1) must be tuple of Tensors, "
            "not Tensor"
        )
    if isinstance(seq[0], ComplexTensor):
        return FC.cat(seq, *args, **kwargs)
    else:
        return torch.cat(seq, *args, **kwargs)


def complex_norm(
    c: Union[torch.Tensor, ComplexTensor], dim=-1, keepdim=False
) -> torch.Tensor:
    if not is_complex(c):
        raise TypeError("Input is not a complex tensor.")
    if is_torch_complex_tensor(c):
        return torch.norm(c, dim=dim, keepdim=keepdim)
    else:
        return torch.sqrt((c.real**2 + c.imag**2).sum(dim=dim, keepdim=keepdim) + EPS)


def einsum(equation, *operands):
    # NOTE: Do not mix ComplexTensor and torch.complex in the input!
    # NOTE (wangyou): Until PyTorch 1.9.0, torch.einsum does not support
    # mixed input with complex and real tensors.
    if len(operands) == 1:
        if isinstance(operands[0], (tuple, list)):
            operands = operands[0]
        complex_module = FC if isinstance(operands[0], ComplexTensor) else torch
        return complex_module.einsum(equation, *operands)
    elif len(operands) != 2:
        op0 = operands[0]
        same_type = all(op.dtype == op0.dtype for op in operands[1:])
        if same_type:
            _einsum = FC.einsum if isinstance(op0, ComplexTensor) else torch.einsum
            return _einsum(equation, *operands)
        else:
            raise ValueError("0 or more than 2 operands are not supported.")
    a, b = operands
    if isinstance(a, ComplexTensor) or isinstance(b, ComplexTensor):
        return FC.einsum(equation, a, b)
    elif is_torch_1_9_plus and (torch.is_complex(a) or torch.is_complex(b)):
        if not torch.is_complex(a):
            o_real = torch.einsum(equation, a, b.real)
            o_imag = torch.einsum(equation, a, b.imag)
            return torch.complex(o_real, o_imag)
        elif not torch.is_complex(b):
            o_real = torch.einsum(equation, a.real, b)
            o_imag = torch.einsum(equation, a.imag, b)
            return torch.complex(o_real, o_imag)
        else:
            return torch.einsum(equation, a, b)
    else:
        return torch.einsum(equation, a, b)


def inverse(
    c: Union[torch.Tensor, ComplexTensor],
) -> Union[torch.Tensor, ComplexTensor]:
    if isinstance(c, ComplexTensor):
        return c.inverse2()
    else:
        return c.inverse()


def matmul(
    a: Union[torch.Tensor, ComplexTensor], b: Union[torch.Tensor, ComplexTensor]
) -> Union[torch.Tensor, ComplexTensor]:
    # NOTE: Do not mix ComplexTensor and torch.complex in the input!
    # NOTE (wangyou): Until PyTorch 1.9.0, torch.matmul does not support
    # multiplication between complex and real tensors.
    if isinstance(a, ComplexTensor) or isinstance(b, ComplexTensor):
        return FC.matmul(a, b)
    elif is_torch_1_9_plus and (torch.is_complex(a) or torch.is_complex(b)):
        if not torch.is_complex(a):
            o_real = torch.matmul(a, b.real)
            o_imag = torch.matmul(a, b.imag)
            return torch.complex(o_real, o_imag)
        elif not torch.is_complex(b):
            o_real = torch.matmul(a.real, b)
            o_imag = torch.matmul(a.imag, b)
            return torch.complex(o_real, o_imag)
        else:
            return torch.matmul(a, b)
    else:
        return torch.matmul(a, b)


def trace(a: Union[torch.Tensor, ComplexTensor]):
    # NOTE (wangyou): until PyTorch 1.9.0, torch.trace does not
    # support batch processing. Use FC.trace() as fallback.
    return FC.trace(a)


def reverse(a: Union[torch.Tensor, ComplexTensor], dim=0):
    if isinstance(a, ComplexTensor):
        return FC.reverse(a, dim=dim)
    else:
        return torch.flip(a, dims=(dim,))


def solve(b: Union[torch.Tensor, ComplexTensor], a: Union[torch.Tensor, ComplexTensor]):
    """Solve the linear equation ax = b."""
    # NOTE: Do not mix ComplexTensor and torch.complex in the input!
    # NOTE (wangyou): Until PyTorch 1.9.0, torch.solve does not support
    # mixed input with complex and real tensors.
    if isinstance(a, ComplexTensor) or isinstance(b, ComplexTensor):
        if isinstance(a, ComplexTensor) and isinstance(b, ComplexTensor):
            return FC.solve(b, a, return_LU=False)
        else:
            return matmul(inverse(a), b)
    elif is_torch_1_9_plus and (torch.is_complex(a) or torch.is_complex(b)):
        if torch.is_complex(a) and torch.is_complex(b):
            return torch.linalg.solve(a, b)
        else:
            return matmul(inverse(a), b)
    else:
        if is_torch_1_8_plus:
            return torch.linalg.solve(a, b)
        else:
            return torch.solve(b, a)[0]


def stack(seq: Sequence[Union[ComplexTensor, torch.Tensor]], *args, **kwargs):
    if not isinstance(seq, (list, tuple)):
        raise TypeError(
            "stack(): argument 'tensors' (position 1) must be tuple of Tensors, "
            "not Tensor"
        )
    if isinstance(seq[0], ComplexTensor):
        return FC.stack(seq, *args, **kwargs)
    else:
        return torch.stack(seq, *args, **kwargs)
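
A quick check (illustrative only, assuming PyTorch 1.9+) that the `einsum` wrapper above handles a mixed real/complex pair by splitting the complex operand into real and imaginary parts.

import torch

a = torch.randn(2, 3)                      # real operand
b = torch.randn(3, 4, dtype=torch.cfloat)  # complex operand
out = einsum("ij,jk->ik", a, b)
ref = torch.complex(a @ b.real, a @ b.imag)
print(torch.allclose(out, ref))  # True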
@@ -0,0 +1,173 @@
"""DNN beamformer module."""

from typing import Tuple

import torch
from torch.nn import functional as F

from funasr_detach.frontends.utils.beamformer import apply_beamforming_vector
from funasr_detach.frontends.utils.beamformer import get_mvdr_vector
from funasr_detach.frontends.utils.beamformer import (
    get_power_spectral_density_matrix,  # noqa: H301
)
from funasr_detach.frontends.utils.mask_estimator import MaskEstimator
from torch_complex.tensor import ComplexTensor


class DNN_Beamformer(torch.nn.Module):
    """DNN mask based Beamformer

    Citation:
        Multichannel End-to-end Speech Recognition; T. Ochiai et al., 2017;
        https://arxiv.org/abs/1703.04783

    """

    def __init__(
        self,
        bidim,
        btype="blstmp",
        blayers=3,
        bunits=300,
        bprojs=320,
        bnmask=2,
        dropout_rate=0.0,
        badim=320,
        ref_channel: int = -1,
        beamformer_type="mvdr",
    ):
        super().__init__()
        self.mask = MaskEstimator(
            btype, bidim, blayers, bunits, bprojs, dropout_rate, nmask=bnmask
        )
        self.ref = AttentionReference(bidim, badim)
        self.ref_channel = ref_channel

        self.nmask = bnmask

        if beamformer_type != "mvdr":
            raise ValueError(
                "Not supporting beamformer_type={}".format(beamformer_type)
            )
        self.beamformer_type = beamformer_type

    def forward(
        self, data: ComplexTensor, ilens: torch.LongTensor
    ) -> Tuple[ComplexTensor, torch.LongTensor, ComplexTensor]:
        """The forward function

        Notation:
            B: Batch
            C: Channel
            T: Time or Sequence length
            F: Freq

        Args:
            data (ComplexTensor): (B, T, C, F)
            ilens (torch.Tensor): (B,)
        Returns:
            enhanced (ComplexTensor): (B, T, F)
            ilens (torch.Tensor): (B,)

        """

        def apply_beamforming(data, ilens, psd_speech, psd_noise):
            # u: (B, C)
            if self.ref_channel < 0:
                u, _ = self.ref(psd_speech, ilens)
            else:
                # (optional) Create onehot vector for fixed reference microphone
                u = torch.zeros(
                    *(data.size()[:-3] + (data.size(-2),)), device=data.device
                )
                u[..., self.ref_channel].fill_(1)

            ws = get_mvdr_vector(psd_speech, psd_noise, u)
            enhanced = apply_beamforming_vector(ws, data)

            return enhanced, ws

        # data (B, T, C, F) -> (B, F, C, T)
        data = data.permute(0, 3, 2, 1)

        # mask: (B, F, C, T)
        masks, _ = self.mask(data, ilens)
        assert self.nmask == len(masks)

        if self.nmask == 2:  # (mask_speech, mask_noise)
            mask_speech, mask_noise = masks

            psd_speech = get_power_spectral_density_matrix(data, mask_speech)
            psd_noise = get_power_spectral_density_matrix(data, mask_noise)

            enhanced, ws = apply_beamforming(data, ilens, psd_speech, psd_noise)

            # (..., F, T) -> (..., T, F)
            enhanced = enhanced.transpose(-1, -2)
            mask_speech = mask_speech.transpose(-1, -3)
        else:  # multi-speaker case: (mask_speech1, ..., mask_noise)
            mask_speech = list(masks[:-1])
            mask_noise = masks[-1]

            psd_speeches = [
                get_power_spectral_density_matrix(data, mask) for mask in mask_speech
            ]
            psd_noise = get_power_spectral_density_matrix(data, mask_noise)

            enhanced = []
            ws = []
            for i in range(self.nmask - 1):
                psd_speech = psd_speeches.pop(i)
                # treat all other speakers' psd_speech as noises
                enh, w = apply_beamforming(
                    data, ilens, psd_speech, sum(psd_speeches) + psd_noise
                )
                psd_speeches.insert(i, psd_speech)

                # (..., F, T) -> (..., T, F)
                enh = enh.transpose(-1, -2)
                mask_speech[i] = mask_speech[i].transpose(-1, -3)

                enhanced.append(enh)
                ws.append(w)

        return enhanced, ilens, mask_speech


class AttentionReference(torch.nn.Module):
    def __init__(self, bidim, att_dim):
        super().__init__()
        self.mlp_psd = torch.nn.Linear(bidim, att_dim)
        self.gvec = torch.nn.Linear(att_dim, 1)

    def forward(
        self, psd_in: ComplexTensor, ilens: torch.LongTensor, scaling: float = 2.0
    ) -> Tuple[torch.Tensor, torch.LongTensor]:
        """The forward function

        Args:
            psd_in (ComplexTensor): (B, F, C, C)
            ilens (torch.Tensor): (B,)
            scaling (float):
        Returns:
            u (torch.Tensor): (B, C)
            ilens (torch.Tensor): (B,)
        """
        B, _, C = psd_in.size()[:3]
        assert psd_in.size(2) == psd_in.size(3), psd_in.size()
        # psd_in: (B, F, C, C)
        psd = psd_in.masked_fill(
            torch.eye(C, dtype=torch.bool, device=psd_in.device), 0
        )
        # psd: (B, F, C, C) -> (B, C, F)
        psd = (psd.sum(dim=-1) / (C - 1)).transpose(-1, -2)

        # Calculate amplitude
        psd_feat = (psd.real**2 + psd.imag**2) ** 0.5

        # (B, C, F) -> (B, C, F2)
        mlp_psd = self.mlp_psd(psd_feat)
        # (B, C, F2) -> (B, C, 1) -> (B, C)
        e = self.gvec(torch.tanh(mlp_psd)).squeeze(-1)
        u = F.softmax(scaling * e, dim=-1)
        return u, ilens
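
A sketch of the fixed-reference branch in `apply_beamforming` above: when `ref_channel >= 0`, the reference vector u is a one-hot over channels, broadcast over the batch. Sizes here are toy values.

import torch

B, C = 2, 4          # batch and channel counts (assumed)
ref_channel = 1

u = torch.zeros(B, C)
u[..., ref_channel].fill_(1)
print(u)  # each row is [0., 1., 0., 0.]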
@@ -0,0 +1,93 @@
from typing import Tuple

from pytorch_wpe import wpe_one_iteration
import torch
from torch_complex.tensor import ComplexTensor

from funasr_detach.frontends.utils.mask_estimator import MaskEstimator
from funasr_detach.models.transformer.utils.nets_utils import make_pad_mask


class DNN_WPE(torch.nn.Module):
    def __init__(
        self,
        wtype: str = "blstmp",
        widim: int = 257,
        wlayers: int = 3,
        wunits: int = 300,
        wprojs: int = 320,
        dropout_rate: float = 0.0,
        taps: int = 5,
        delay: int = 3,
        use_dnn_mask: bool = True,
        iterations: int = 1,
        normalization: bool = False,
    ):
        super().__init__()
        self.iterations = iterations
        self.taps = taps
        self.delay = delay

        self.normalization = normalization
        self.use_dnn_mask = use_dnn_mask

        self.inverse_power = True

        if self.use_dnn_mask:
            self.mask_est = MaskEstimator(
                wtype, widim, wlayers, wunits, wprojs, dropout_rate, nmask=1
            )

    def forward(
        self, data: ComplexTensor, ilens: torch.LongTensor
    ) -> Tuple[ComplexTensor, torch.LongTensor, ComplexTensor]:
        """The forward function

        Notation:
            B: Batch
            C: Channel
            T: Time or Sequence length
            F: Freq or Some dimension of the feature vector

        Args:
            data: (B, T, C, F)
            ilens: (B,)
        Returns:
            data: (B, T, C, F)
            ilens: (B,)
        """
        # (B, T, C, F) -> (B, F, C, T)
        enhanced = data = data.permute(0, 3, 2, 1)
        mask = None

        for i in range(self.iterations):
            # Calculate power: (..., C, T)
            power = enhanced.real**2 + enhanced.imag**2
            if i == 0 and self.use_dnn_mask:
                # mask: (B, F, C, T)
                (mask,), _ = self.mask_est(enhanced, ilens)
                if self.normalization:
                    # Normalize along T
                    mask = mask / mask.sum(dim=-1)[..., None]
                # (..., C, T) * (..., C, T) -> (..., C, T)
                power = power * mask

            # Averaging along the channel axis: (..., C, T) -> (..., T)
            power = power.mean(dim=-2)

            # enhanced: (..., C, T) -> (..., C, T)
            enhanced = wpe_one_iteration(
                data.contiguous(),
                power,
                taps=self.taps,
                delay=self.delay,
                inverse_power=self.inverse_power,
            )

            enhanced.masked_fill_(make_pad_mask(ilens, enhanced.real), 0)

        # (B, F, C, T) -> (B, T, C, F)
        enhanced = enhanced.permute(0, 3, 2, 1)
        if mask is not None:
            mask = mask.transpose(-1, -3)
        return enhanced, ilens, mask
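
A toy illustration (shapes invented) of the power estimate that drives WPE above: element-wise mask weighting of the per-channel power followed by averaging over the channel axis.

import torch

B, F_, C, T = 1, 3, 2, 6
enhanced_real = torch.randn(B, F_, C, T)
enhanced_imag = torch.randn(B, F_, C, T)
mask = torch.rand(B, F_, C, T)  # stand-in for the DNN mask

power = enhanced_real**2 + enhanced_imag**2  # (..., C, T)
power = power * mask                         # mask weighting
power = power.mean(dim=-2)                   # (..., T)
print(power.shape)  # torch.Size([1, 3, 6])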
@@ -0,0 +1,263 @@
from typing import List
from typing import Tuple
from typing import Union

import librosa
import numpy as np
import torch
from torch_complex.tensor import ComplexTensor

from funasr_detach.models.transformer.utils.nets_utils import make_pad_mask


class FeatureTransform(torch.nn.Module):
    def __init__(
        self,
        # Mel options,
        fs: int = 16000,
        n_fft: int = 512,
        n_mels: int = 80,
        fmin: float = 0.0,
        fmax: float = None,
        # Normalization
        stats_file: str = None,
        apply_uttmvn: bool = True,
        uttmvn_norm_means: bool = True,
        uttmvn_norm_vars: bool = False,
    ):
        super().__init__()
        self.apply_uttmvn = apply_uttmvn

        self.logmel = LogMel(fs=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
        self.stats_file = stats_file
        if stats_file is not None:
            self.global_mvn = GlobalMVN(stats_file)
        else:
            self.global_mvn = None

        if self.apply_uttmvn is not None:
            self.uttmvn = UtteranceMVN(
                norm_means=uttmvn_norm_means, norm_vars=uttmvn_norm_vars
            )
        else:
            self.uttmvn = None

    def forward(
        self, x: ComplexTensor, ilens: Union[torch.LongTensor, np.ndarray, List[int]]
    ) -> Tuple[torch.Tensor, torch.LongTensor]:
        # (B, T, F) or (B, T, C, F)
        if x.dim() not in (3, 4):
            raise ValueError(f"Input dim must be 3 or 4: {x.dim()}")
        if not torch.is_tensor(ilens):
            ilens = torch.from_numpy(np.asarray(ilens)).to(x.device)

        if x.dim() == 4:
            # h: (B, T, C, F) -> h: (B, T, F)
            if self.training:
                # Select 1ch randomly
                ch = np.random.randint(x.size(2))
                h = x[:, :, ch, :]
            else:
                # Use the first channel
                h = x[:, :, 0, :]
        else:
            h = x

        # h: ComplexTensor(B, T, F) -> torch.Tensor(B, T, F)
        h = h.real**2 + h.imag**2

        h, _ = self.logmel(h, ilens)
        if self.stats_file is not None:
            h, _ = self.global_mvn(h, ilens)
        if self.apply_uttmvn:
            h, _ = self.uttmvn(h, ilens)

        return h, ilens


class LogMel(torch.nn.Module):
    """Convert STFT to fbank feats

    The arguments are the same as librosa.filters.mel

    Args:
        fs: number > 0 [scalar] sampling rate of the incoming signal
        n_fft: int > 0 [scalar] number of FFT components
        n_mels: int > 0 [scalar] number of Mel bands to generate
        fmin: float >= 0 [scalar] lowest frequency (in Hz)
        fmax: float >= 0 [scalar] highest frequency (in Hz).
            If `None`, use `fmax = fs / 2.0`
        htk: use HTK formula instead of Slaney
        norm: {None, 1, np.inf} [scalar]
            if 1, divide the triangular mel weights by the width of the mel band
            (area normalization). Otherwise, leave all the triangles aiming for
            a peak value of 1.0

    """

    def __init__(
        self,
        fs: int = 16000,
        n_fft: int = 512,
        n_mels: int = 80,
        fmin: float = 0.0,
        fmax: float = None,
        htk: bool = False,
        norm=1,
    ):
        super().__init__()

        _mel_options = dict(
            sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=htk, norm=norm
        )
        self.mel_options = _mel_options

        # Note(kamo): The mel matrix of librosa is different from kaldi.
        melmat = librosa.filters.mel(**_mel_options)
        # melmat: (D2, D1) -> (D1, D2)
        self.register_buffer("melmat", torch.from_numpy(melmat.T).float())

    def extra_repr(self):
        return ", ".join(f"{k}={v}" for k, v in self.mel_options.items())

    def forward(
        self, feat: torch.Tensor, ilens: torch.LongTensor
    ) -> Tuple[torch.Tensor, torch.LongTensor]:
        # feat: (B, T, D1) x melmat: (D1, D2) -> mel_feat: (B, T, D2)
        mel_feat = torch.matmul(feat, self.melmat)

        logmel_feat = (mel_feat + 1e-20).log()
        # Zero padding
        logmel_feat = logmel_feat.masked_fill(make_pad_mask(ilens, logmel_feat, 1), 0.0)
        return logmel_feat, ilens


class GlobalMVN(torch.nn.Module):
    """Apply global mean and variance normalization

    Args:
        stats_file(str): npy file of 1-dim array or text file.
            From the first element to
            the {(len(array) - 1) / 2}th element are treated as
            the sum of features,
            and the rest excluding the last element are
            treated as the sum of the square value of features,
            and the last element equals the number of samples.
        std_floor(float):
    """

    def __init__(
        self,
        stats_file: str,
        norm_means: bool = True,
        norm_vars: bool = True,
        eps: float = 1.0e-20,
    ):
        super().__init__()
        self.norm_means = norm_means
        self.norm_vars = norm_vars

        self.stats_file = stats_file
        stats = np.load(stats_file)

        stats = stats.astype(float)
        assert (len(stats) - 1) % 2 == 0, stats.shape

        count = stats.flatten()[-1]
        mean = stats[: (len(stats) - 1) // 2] / count
        var = stats[(len(stats) - 1) // 2 : -1] / count - mean * mean
        std = np.maximum(np.sqrt(var), eps)

        self.register_buffer("bias", torch.from_numpy(-mean.astype(np.float32)))
        self.register_buffer("scale", torch.from_numpy(1 / std.astype(np.float32)))

    def extra_repr(self):
        return (
            f"stats_file={self.stats_file}, "
            f"norm_means={self.norm_means}, norm_vars={self.norm_vars}"
        )

    def forward(
        self, x: torch.Tensor, ilens: torch.LongTensor
    ) -> Tuple[torch.Tensor, torch.LongTensor]:
        # feat: (B, T, D)
        if self.norm_means:
            x += self.bias.type_as(x)
            x.masked_fill_(make_pad_mask(ilens, x, 1), 0.0)

        if self.norm_vars:
            x *= self.scale.type_as(x)
        return x, ilens


class UtteranceMVN(torch.nn.Module):
    def __init__(
        self, norm_means: bool = True, norm_vars: bool = False, eps: float = 1.0e-20
    ):
        super().__init__()
        self.norm_means = norm_means
        self.norm_vars = norm_vars
        self.eps = eps

    def extra_repr(self):
        return f"norm_means={self.norm_means}, norm_vars={self.norm_vars}"

    def forward(
        self, x: torch.Tensor, ilens: torch.LongTensor
    ) -> Tuple[torch.Tensor, torch.LongTensor]:
        return utterance_mvn(
            x, ilens, norm_means=self.norm_means, norm_vars=self.norm_vars, eps=self.eps
        )


def utterance_mvn(
    x: torch.Tensor,
    ilens: torch.LongTensor,
    norm_means: bool = True,
    norm_vars: bool = False,
    eps: float = 1.0e-20,
) -> Tuple[torch.Tensor, torch.LongTensor]:
    """Apply utterance mean and variance normalization

    Args:
        x: (B, T, D), assumed zero padded
        ilens: (B,)
        norm_means:
        norm_vars:
        eps:

    """
    ilens_ = ilens.type_as(x)
    # mean: (B, D)
    mean = x.sum(dim=1) / ilens_[:, None]

    if norm_means:
        x -= mean[:, None, :]
        x_ = x
    else:
        x_ = x - mean[:, None, :]

    # Zero padding
    x_.masked_fill_(make_pad_mask(ilens, x_, 1), 0.0)
    if norm_vars:
        var = x_.pow(2).sum(dim=1) / ilens_[:, None]
        var = torch.clamp(var, min=eps)
        x /= var.sqrt()[:, None, :]
        x_ = x
    return x_, ilens
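
A minimal run (outside the repo, assuming this module's imports are available) of `utterance_mvn` on zero-padded input: the per-utterance mean uses the true lengths, not the padded frame count, so short utterances are normalized correctly.

import torch

x = torch.zeros(2, 5, 3)
x[0, :5] = torch.randn(5, 3)
x[1, :3] = torch.randn(3, 3)   # only 3 valid frames, rest is padding
ilens = torch.tensor([5, 3])

x_norm, _ = utterance_mvn(x.clone(), ilens, norm_means=True, norm_vars=False)
# the valid frames of each utterance are now (approximately) zero-mean
print(x_norm[1, :3].mean(dim=0))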
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
+import numpy
+import torch
+import torch.nn as nn
+from torch_complex.tensor import ComplexTensor
+
+from funasr_detach.frontends.utils.dnn_beamformer import DNN_Beamformer
+from funasr_detach.frontends.utils.dnn_wpe import DNN_WPE
+
+
+class Frontend(nn.Module):
+    def __init__(
+        self,
+        idim: int,
+        # WPE options
+        use_wpe: bool = False,
+        wtype: str = "blstmp",
+        wlayers: int = 3,
+        wunits: int = 300,
+        wprojs: int = 320,
+        wdropout_rate: float = 0.0,
+        taps: int = 5,
+        delay: int = 3,
+        use_dnn_mask_for_wpe: bool = True,
+        # Beamformer options
+        use_beamformer: bool = False,
+        btype: str = "blstmp",
+        blayers: int = 3,
+        bunits: int = 300,
+        bprojs: int = 320,
+        bnmask: int = 2,
+        badim: int = 320,
+        ref_channel: int = -1,
+        bdropout_rate=0.0,
+    ):
+        super().__init__()
+
+        self.use_beamformer = use_beamformer
+        self.use_wpe = use_wpe
+        self.use_dnn_mask_for_wpe = use_dnn_mask_for_wpe
+        # use frontend for all the data,
+        # e.g. in the case of multi-speaker speech separation
+        self.use_frontend_for_all = bnmask > 2
+
+        if self.use_wpe:
+            if self.use_dnn_mask_for_wpe:
+                # Use DNN for power estimation
+                # (Not observed significant gains)
+                iterations = 1
+            else:
+                # Performing as conventional WPE, without DNN Estimator
+                iterations = 2
+
+            self.wpe = DNN_WPE(
+                wtype=wtype,
+                widim=idim,
+                wunits=wunits,
+                wprojs=wprojs,
+                wlayers=wlayers,
+                taps=taps,
+                delay=delay,
+                dropout_rate=wdropout_rate,
+                iterations=iterations,
+                use_dnn_mask=use_dnn_mask_for_wpe,
+            )
+        else:
+            self.wpe = None
+
+        if self.use_beamformer:
+            self.beamformer = DNN_Beamformer(
+                btype=btype,
+                bidim=idim,
+                bunits=bunits,
+                bprojs=bprojs,
+                blayers=blayers,
+                bnmask=bnmask,
+                dropout_rate=bdropout_rate,
+                badim=badim,
+                ref_channel=ref_channel,
+            )
+        else:
+            self.beamformer = None
+
+    def forward(
+        self, x: ComplexTensor, ilens: Union[torch.LongTensor, numpy.ndarray, List[int]]
+    ) -> Tuple[ComplexTensor, torch.LongTensor, Optional[ComplexTensor]]:
+        assert len(x) == len(ilens), (len(x), len(ilens))
+        # (B, T, F) or (B, T, C, F)
+        if x.dim() not in (3, 4):
+            raise ValueError(f"Input dim must be 3 or 4: {x.dim()}")
+        if not torch.is_tensor(ilens):
+            ilens = torch.from_numpy(numpy.asarray(ilens)).to(x.device)
+
+        mask = None
+        h = x
+        if h.dim() == 4:
+            if self.training:
+                choices = [(False, False)] if not self.use_frontend_for_all else []
+                if self.use_wpe:
+                    choices.append((True, False))
+
+                if self.use_beamformer:
+                    choices.append((False, True))
+
+                use_wpe, use_beamformer = choices[numpy.random.randint(len(choices))]
+
+            else:
+                use_wpe = self.use_wpe
+                use_beamformer = self.use_beamformer
+
+            # 1. WPE
+            if use_wpe:
+                # h: (B, T, C, F) -> h: (B, T, C, F)
+                h, ilens, mask = self.wpe(h, ilens)
+
+            # 2. Beamformer
+            if use_beamformer:
+                # h: (B, T, C, F) -> h: (B, T, F)
+                h, ilens, mask = self.beamformer(h, ilens)
+
+        return h, ilens, mask
+
+
+def frontend_for(args, idim):
+    return Frontend(
+        idim=idim,
+        # WPE options
+        use_wpe=args.use_wpe,
+        wtype=args.wtype,
+        wlayers=args.wlayers,
+        wunits=args.wunits,
+        wprojs=args.wprojs,
+        wdropout_rate=args.wdropout_rate,
+        taps=args.wpe_taps,
+        delay=args.wpe_delay,
+        use_dnn_mask_for_wpe=args.use_dnn_mask_for_wpe,
+        # Beamformer options
+        use_beamformer=args.use_beamformer,
+        btype=args.btype,
+        blayers=args.blayers,
+        bunits=args.bunits,
+        bprojs=args.bprojs,
+        bnmask=args.bnmask,
+        badim=args.badim,
+        ref_channel=args.ref_channel,
+        bdropout_rate=args.bdropout_rate,
+    )
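A hedged sketch of driving this `Frontend` (not part of the diff; assumes `torch_complex` and the funasr_detach package are importable). With both stages disabled the module is a passthrough, which makes the shapes easy to check:

```python
import torch
from torch_complex.tensor import ComplexTensor

# Hypothetical multi-channel STFT input: batch=2, T=100 frames, C=4 mics, F=257 bins.
x = ComplexTensor(torch.randn(2, 100, 4, 257), torch.randn(2, 100, 4, 257))
ilens = torch.LongTensor([100, 80])

frontend = Frontend(idim=257, use_wpe=False, use_beamformer=False)
frontend.eval()
h, olens, mask = frontend(x, ilens)  # passthrough: h keeps (B, T, C, F), mask is None
```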
@@ -0,0 +1,83 @@
+import librosa
+import torch
+from typing import Tuple
+
+from funasr_detach.models.transformer.utils.nets_utils import make_pad_mask
+
+
+class LogMel(torch.nn.Module):
+    """Convert STFT to fbank feats
+
+    The arguments are the same as librosa.filters.mel
+
+    Args:
+        fs: number > 0 [scalar] sampling rate of the incoming signal
+        n_fft: int > 0 [scalar] number of FFT components
+        n_mels: int > 0 [scalar] number of Mel bands to generate
+        fmin: float >= 0 [scalar] lowest frequency (in Hz)
+        fmax: float >= 0 [scalar] highest frequency (in Hz).
+            If `None`, use `fmax = fs / 2.0`
+        htk: use HTK formula instead of Slaney
+    """
+
+    def __init__(
+        self,
+        fs: int = 16000,
+        n_fft: int = 512,
+        n_mels: int = 80,
+        fmin: float = None,
+        fmax: float = None,
+        htk: bool = False,
+        log_base: float = None,
+    ):
+        super().__init__()
+
+        fmin = 0 if fmin is None else fmin
+        fmax = fs / 2 if fmax is None else fmax
+        _mel_options = dict(
+            sr=fs,
+            n_fft=n_fft,
+            n_mels=n_mels,
+            fmin=fmin,
+            fmax=fmax,
+            htk=htk,
+        )
+        self.mel_options = _mel_options
+        self.log_base = log_base
+
+        # Note(kamo): The mel matrix of librosa is different from kaldi.
+        melmat = librosa.filters.mel(**_mel_options)
+        # melmat: (D2, D1) -> (D1, D2)
+        self.register_buffer("melmat", torch.from_numpy(melmat.T).float())
+
+    def extra_repr(self):
+        return ", ".join(f"{k}={v}" for k, v in self.mel_options.items())
+
+    def forward(
+        self,
+        feat: torch.Tensor,
+        ilens: torch.Tensor = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # feat: (B, T, D1) x melmat: (D1, D2) -> mel_feat: (B, T, D2)
+        mel_feat = torch.matmul(feat, self.melmat)
+        mel_feat = torch.clamp(mel_feat, min=1e-10)
+
+        if self.log_base is None:
+            logmel_feat = mel_feat.log()
+        elif self.log_base == 2.0:
+            logmel_feat = mel_feat.log2()
+        elif self.log_base == 10.0:
+            logmel_feat = mel_feat.log10()
+        else:
+            # log_base is a Python float; wrap it so torch.log accepts it
+            logmel_feat = mel_feat.log() / torch.log(torch.tensor(self.log_base))
+
+        # Zero padding
+        if ilens is not None:
+            logmel_feat = logmel_feat.masked_fill(
+                make_pad_mask(ilens, logmel_feat, 1), 0.0
+            )
+        else:
+            ilens = feat.new_full(
+                [feat.size(0)], fill_value=feat.size(1), dtype=torch.long
+            )
+        return logmel_feat, ilens
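A minimal sketch of `LogMel` on a power spectrogram (not part of the diff; the input is assumed to be a (B, T, n_fft // 2 + 1) power/magnitude tensor such as the one derived from the `Stft` module below):

```python
import torch

logmel = LogMel(fs=16000, n_fft=512, n_mels=80)
power = torch.rand(2, 100, 257)        # (B, T, n_fft // 2 + 1), illustrative values
ilens = torch.LongTensor([100, 60])
feats, olens = logmel(power, ilens)    # feats: (2, 100, 80); padded frames zeroed
```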
@@ -0,0 +1,77 @@
+from typing import Tuple
+
+import numpy as np
+import torch
+from torch.nn import functional as F
+from torch_complex.tensor import ComplexTensor
+
+from funasr_detach.models.transformer.utils.nets_utils import make_pad_mask
+from funasr_detach.models.language_model.rnn.encoders import RNN
+from funasr_detach.models.language_model.rnn.encoders import RNNP
+
+
+class MaskEstimator(torch.nn.Module):
+    def __init__(self, type, idim, layers, units, projs, dropout, nmask=1):
+        super().__init__()
+        subsample = np.ones(layers + 1, dtype=np.int32)
+
+        typ = type.lstrip("vgg").rstrip("p")
+        if type[-1] == "p":
+            self.brnn = RNNP(idim, layers, units, projs, subsample, dropout, typ=typ)
+        else:
+            self.brnn = RNN(idim, layers, units, projs, dropout, typ=typ)
+
+        self.type = type
+        self.nmask = nmask
+        self.linears = torch.nn.ModuleList(
+            [torch.nn.Linear(projs, idim) for _ in range(nmask)]
+        )
+
+    def forward(
+        self, xs: ComplexTensor, ilens: torch.LongTensor
+    ) -> Tuple[Tuple[torch.Tensor, ...], torch.LongTensor]:
+        """The forward function
+
+        Args:
+            xs: (B, F, C, T)
+            ilens: (B,)
+        Returns:
+            masks: A tuple of the masks, each (B, F, C, T)
+            ilens: (B,)
+        """
+        assert xs.size(0) == ilens.size(0), (xs.size(0), ilens.size(0))
+        _, _, C, input_length = xs.size()
+        # (B, F, C, T) -> (B, C, T, F)
+        xs = xs.permute(0, 2, 3, 1)
+
+        # Calculate amplitude: (B, C, T, F) -> (B, C, T, F)
+        xs = (xs.real**2 + xs.imag**2) ** 0.5
+        # xs: (B, C, T, F) -> xs: (B * C, T, F)
+        xs = xs.contiguous().view(-1, xs.size(-2), xs.size(-1))
+        # ilens: (B,) -> ilens_: (B * C)
+        ilens_ = ilens[:, None].expand(-1, C).contiguous().view(-1)
+
+        # xs: (B * C, T, F) -> xs: (B * C, T, D)
+        xs, _, _ = self.brnn(xs, ilens_)
+        # xs: (B * C, T, D) -> xs: (B, C, T, D)
+        xs = xs.view(-1, C, xs.size(-2), xs.size(-1))
+
+        masks = []
+        for linear in self.linears:
+            # xs: (B, C, T, D) -> mask: (B, C, T, F)
+            mask = linear(xs)
+
+            mask = torch.sigmoid(mask)
+            # Zero padding (in-place: a bare masked_fill() would discard its result)
+            mask.masked_fill_(make_pad_mask(ilens, mask, length_dim=2), 0)
+
+            # (B, C, T, F) -> (B, F, C, T)
+            mask = mask.permute(0, 3, 1, 2)
+
+            # Takes care of multi-GPU cases: if input_length > max(ilens)
+            if mask.size(-1) < input_length:
+                mask = F.pad(mask, [0, input_length - mask.size(-1)], value=0)
+            masks.append(mask)
+
+        return tuple(masks), ilens
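A hedged usage sketch (not part of the diff; assumes the funasr_detach RNN encoders construct with these arguments, and follows the shapes in the docstring above):

```python
import torch
from torch_complex.tensor import ComplexTensor

est = MaskEstimator(type="blstmp", idim=257, layers=3, units=300, projs=320,
                    dropout=0.0, nmask=2)
xs = ComplexTensor(torch.randn(2, 257, 4, 100), torch.randn(2, 257, 4, 100))
ilens = torch.LongTensor([100, 80])
(mask_speech, mask_noise), olens = est(xs, ilens)  # each mask: (B, F, C, T)
```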
@@ -0,0 +1,239 @@
+from distutils.version import LooseVersion
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
+import torch
+
+try:
+    from torch_complex.tensor import ComplexTensor
+except ImportError:
+    print("Please install torch_complex first")
+from funasr_detach.models.transformer.utils.nets_utils import make_pad_mask
+from funasr_detach.frontends.utils.complex_utils import is_complex
+
+import librosa
+import numpy as np
+
+is_torch_1_9_plus = LooseVersion(torch.__version__) >= LooseVersion("1.9.0")
+
+
+is_torch_1_7_plus = LooseVersion(torch.__version__) >= LooseVersion("1.7")
+
+
+class Stft(torch.nn.Module):
+    def __init__(
+        self,
+        n_fft: int = 512,
+        win_length: int = None,
+        hop_length: int = 128,
+        window: Optional[str] = "hann",
+        center: bool = True,
+        normalized: bool = False,
+        onesided: bool = True,
+    ):
+        super().__init__()
+        self.n_fft = n_fft
+        if win_length is None:
+            self.win_length = n_fft
+        else:
+            self.win_length = win_length
+        self.hop_length = hop_length
+        self.center = center
+        self.normalized = normalized
+        self.onesided = onesided
+        if window is not None and not hasattr(torch, f"{window}_window"):
+            if window.lower() != "povey":
+                raise ValueError(f"{window} window is not implemented")
+        self.window = window
+
+    def extra_repr(self):
+        return (
+            f"n_fft={self.n_fft}, "
+            f"win_length={self.win_length}, "
+            f"hop_length={self.hop_length}, "
+            f"center={self.center}, "
+            f"normalized={self.normalized}, "
+            f"onesided={self.onesided}"
+        )
+
+    def forward(
+        self, input: torch.Tensor, ilens: torch.Tensor = None
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """STFT forward function.
+
+        Args:
+            input: (Batch, Nsamples) or (Batch, Nsample, Channels)
+            ilens: (Batch)
+        Returns:
+            output: (Batch, Frames, Freq, 2) or (Batch, Frames, Channels, Freq, 2)
+
+        """
+        bs = input.size(0)
+        if input.dim() == 3:
+            multi_channel = True
+            # input: (Batch, Nsample, Channels) -> (Batch * Channels, Nsample)
+            input = input.transpose(1, 2).reshape(-1, input.size(1))
+        else:
+            multi_channel = False
+
+        # NOTE(kamo):
+        # The default behaviour of torch.stft is compatible with librosa.stft
+        # about padding and scaling.
+        # Note that it's different from scipy.signal.stft
+
+        # output: (Batch, Freq, Frames, 2=real_imag)
+        # or (Batch, Channel, Freq, Frames, 2=real_imag)
+        if self.window is not None:
+            if self.window.lower() == "povey":
+                window = torch.hann_window(
+                    self.win_length,
+                    periodic=False,
+                    device=input.device,
+                    dtype=input.dtype,
+                ).pow(0.85)
+            else:
+                window_func = getattr(torch, f"{self.window}_window")
+                window = window_func(
+                    self.win_length, dtype=input.dtype, device=input.device
+                )
+        else:
+            window = None
+
+        # For the compatibility of ARM devices, which do not support
+        # torch.stft() due to the lack of MKL.
+        if input.is_cuda or torch.backends.mkl.is_available():
+            stft_kwargs = dict(
+                n_fft=self.n_fft,
+                win_length=self.win_length,
+                hop_length=self.hop_length,
+                center=self.center,
+                window=window,
+                normalized=self.normalized,
+                onesided=self.onesided,
+            )
+            if is_torch_1_7_plus:
+                stft_kwargs["return_complex"] = False
+            output = torch.stft(input, **stft_kwargs)
+        else:
+            if self.training:
+                raise NotImplementedError(
+                    "stft is implemented with librosa on this device, which does not "
+                    "support the training mode."
+                )
+
+            # use stft_kwargs to flexibly control different PyTorch versions' kwargs
+            stft_kwargs = dict(
+                n_fft=self.n_fft,
+                win_length=self.win_length,
+                hop_length=self.hop_length,
+                center=self.center,
+                window=window,
+            )
+
+            if window is not None:
+                # pad the given window to n_fft
+                n_pad_left = (self.n_fft - window.shape[0]) // 2
+                n_pad_right = self.n_fft - window.shape[0] - n_pad_left
+                stft_kwargs["window"] = torch.cat(
+                    [torch.zeros(n_pad_left), window, torch.zeros(n_pad_right)], 0
+                ).numpy()
+            else:
+                win_length = (
+                    self.win_length if self.win_length is not None else self.n_fft
+                )
+                stft_kwargs["window"] = torch.ones(win_length)
+
+            output = []
+            # iterate over instances in a batch
+            for i, instance in enumerate(input):
+                stft = librosa.stft(input[i].numpy(), **stft_kwargs)
+                output.append(torch.tensor(np.stack([stft.real, stft.imag], -1)))
+            output = torch.stack(output, 0)
+            if not self.onesided:
+                len_conj = self.n_fft - output.shape[1]
+                conj = output[:, 1 : 1 + len_conj].flip(1)
+                conj[:, :, :, -1].data *= -1
+                output = torch.cat([output, conj], 1)
+            if self.normalized:
+                output = output * (stft_kwargs["window"].shape[0] ** (-0.5))
+
+        # output: (Batch, Freq, Frames, 2=real_imag)
+        # -> (Batch, Frames, Freq, 2=real_imag)
+        output = output.transpose(1, 2)
+        if multi_channel:
+            # output: (Batch * Channel, Frames, Freq, 2=real_imag)
+            # -> (Batch, Frame, Channel, Freq, 2=real_imag)
+            output = output.view(bs, -1, output.size(1), output.size(2), 2).transpose(
+                1, 2
+            )
+
+        if ilens is not None:
+            if self.center:
+                pad = self.n_fft // 2
+                ilens = ilens + 2 * pad
+
+            olens = (ilens - self.n_fft) // self.hop_length + 1
+            output.masked_fill_(make_pad_mask(olens, output, 1), 0.0)
+        else:
+            olens = None
+
+        return output, olens
+
+    def inverse(
+        self, input: Union[torch.Tensor, ComplexTensor], ilens: torch.Tensor = None
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Inverse STFT.
+
+        Args:
+            input: Tensor(batch, T, F, 2) or ComplexTensor(batch, T, F)
+            ilens: (batch,)
+        Returns:
+            wavs: (batch, samples)
+            ilens: (batch,)
+        """
+        if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
+            istft = torch.functional.istft
+        else:
+            try:
+                import torchaudio
+            except ImportError:
+                raise ImportError(
+                    "Please install torchaudio>=0.3.0 or use torch>=1.6.0"
+                )
+
+            if not hasattr(torchaudio.functional, "istft"):
+                raise ImportError(
+                    "Please install torchaudio>=0.3.0 or use torch>=1.6.0"
+                )
+            istft = torchaudio.functional.istft
+
+        if self.window is not None:
+            window_func = getattr(torch, f"{self.window}_window")
+            if is_complex(input):
+                datatype = input.real.dtype
+            else:
+                datatype = input.dtype
+            window = window_func(self.win_length, dtype=datatype, device=input.device)
+        else:
+            window = None
+
+        if is_complex(input):
+            input = torch.stack([input.real, input.imag], dim=-1)
+        elif input.shape[-1] != 2:
+            raise TypeError("Invalid input type")
+        input = input.transpose(1, 2)
+
+        wavs = istft(
+            input,
+            n_fft=self.n_fft,
+            hop_length=self.hop_length,
+            win_length=self.win_length,
+            window=window,
+            center=self.center,
+            normalized=self.normalized,
+            onesided=self.onesided,
+            length=ilens.max() if ilens is not None else ilens,
+        )
+
+        return wavs, ilens
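A hedged forward-pass sketch (not part of the diff). The module returns real/imag stacked on the last axis rather than a complex tensor; note that `return_complex=False` is deprecated on recent torch releases, so behavior on the newest versions is an assumption:

```python
import torch

stft = Stft(n_fft=512, hop_length=128, window="hann")
wav = torch.randn(1, 16000)        # 1 s at 16 kHz, illustrative values
ilens = torch.LongTensor([16000])
spec, olens = stft(wav, ilens)     # spec: (1, frames, 257, 2), olens: (1,)
power = spec.pow(2).sum(-1)        # (1, frames, 257), feedable to LogMel above
```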
@@ -0,0 +1,556 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Part of the implementation is borrowed from espnet/espnet.
+from typing import Tuple
+import copy
+import numpy as np
+import torch
+import torch.nn as nn
+import torchaudio.compliance.kaldi as kaldi
+from torch.nn.utils.rnn import pad_sequence
+
+import funasr_detach.frontends.eend_ola_feature as eend_ola_feature
+from funasr_detach.register import tables
+
+
+def load_cmvn(cmvn_file):
+    with open(cmvn_file, "r", encoding="utf-8") as f:
+        lines = f.readlines()
+    means_list = []
+    vars_list = []
+    for i in range(len(lines)):
+        line_item = lines[i].split()
+        if line_item[0] == "<AddShift>":
+            line_item = lines[i + 1].split()
+            if line_item[0] == "<LearnRateCoef>":
+                add_shift_line = line_item[3 : (len(line_item) - 1)]
+                means_list = list(add_shift_line)
+                continue
+        elif line_item[0] == "<Rescale>":
+            line_item = lines[i + 1].split()
+            if line_item[0] == "<LearnRateCoef>":
+                rescale_line = line_item[3 : (len(line_item) - 1)]
+                vars_list = list(rescale_line)
+                continue
+    means = np.array(means_list).astype(np.float32)
+    vars = np.array(vars_list).astype(np.float32)
+    cmvn = np.array([means, vars])
+    cmvn = torch.as_tensor(cmvn, dtype=torch.float32)
+    return cmvn
+
+
+def apply_cmvn(inputs, cmvn):  # noqa
+    """
+    Apply CMVN with mvn data
+    """
+
+    device = inputs.device
+    dtype = inputs.dtype
+    frame, dim = inputs.shape
+
+    means = cmvn[0:1, :dim]
+    vars = cmvn[1:2, :dim]
+    inputs += means.to(device)
+    inputs *= vars.to(device)
+
+    return inputs.type(torch.float32)
+
+
+def apply_lfr(inputs, lfr_m, lfr_n):
+    LFR_inputs = []
+    T = inputs.shape[0]
+    T_lfr = int(np.ceil(T / lfr_n))
+    left_padding = inputs[0].repeat((lfr_m - 1) // 2, 1)
+    inputs = torch.vstack((left_padding, inputs))
+    T = T + (lfr_m - 1) // 2
+    for i in range(T_lfr):
+        if lfr_m <= T - i * lfr_n:
+            LFR_inputs.append((inputs[i * lfr_n : i * lfr_n + lfr_m]).view(1, -1))
+        else:  # process last LFR frame
+            num_padding = lfr_m - (T - i * lfr_n)
+            frame = (inputs[i * lfr_n :]).view(-1)
+            for _ in range(num_padding):
+                frame = torch.hstack((frame, inputs[-1]))
+            LFR_inputs.append(frame)
+    LFR_outputs = torch.vstack(LFR_inputs)
+    return LFR_outputs.type(torch.float32)
+
+
+@tables.register("frontend_classes", "WavFrontend")
+class WavFrontend(nn.Module):
+    """Conventional frontend structure for ASR."""
+
+    def __init__(
+        self,
+        cmvn_file: str = None,
+        fs: int = 16000,
+        window: str = "hamming",
+        n_mels: int = 80,
+        frame_length: int = 25,
+        frame_shift: int = 10,
+        filter_length_min: int = -1,
+        filter_length_max: int = -1,
+        lfr_m: int = 1,
+        lfr_n: int = 1,
+        dither: float = 1.0,
+        snip_edges: bool = True,
+        upsacle_samples: bool = True,
+        **kwargs,
+    ):
+        super().__init__()
+        self.fs = fs
+        self.window = window
+        self.n_mels = n_mels
+        self.frame_length = frame_length
+        self.frame_shift = frame_shift
+        self.filter_length_min = filter_length_min
+        self.filter_length_max = filter_length_max
+        self.lfr_m = lfr_m
+        self.lfr_n = lfr_n
+        self.cmvn_file = cmvn_file
+        self.dither = dither
+        self.snip_edges = snip_edges
+        self.upsacle_samples = upsacle_samples
+        self.cmvn = None if self.cmvn_file is None else load_cmvn(self.cmvn_file)
+
+    def output_size(self) -> int:
+        return self.n_mels * self.lfr_m
+
+    def forward(
+        self,
+        input: torch.Tensor,
+        input_lengths,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        batch_size = input.size(0)
+        feats = []
+        feats_lens = []
+        for i in range(batch_size):
+            waveform_length = input_lengths[i]
+            waveform = input[i][:waveform_length]
+            if self.upsacle_samples:
+                waveform = waveform * (1 << 15)
+            waveform = waveform.unsqueeze(0)
+            mat = kaldi.fbank(
+                waveform,
+                num_mel_bins=self.n_mels,
+                frame_length=self.frame_length,
+                frame_shift=self.frame_shift,
+                dither=self.dither,
+                energy_floor=0.0,
+                window_type=self.window,
+                sample_frequency=self.fs,
+                snip_edges=self.snip_edges,
+            )
+
+            if self.lfr_m != 1 or self.lfr_n != 1:
+                mat = apply_lfr(mat, self.lfr_m, self.lfr_n)
+            if self.cmvn is not None:
+                mat = apply_cmvn(mat, self.cmvn)
+            feat_length = mat.size(0)
+            feats.append(mat)
+            feats_lens.append(feat_length)
+
+        feats_lens = torch.as_tensor(feats_lens)
+        if batch_size == 1:
+            feats_pad = feats[0][None, :, :]
+        else:
+            feats_pad = pad_sequence(feats, batch_first=True, padding_value=0.0)
+        return feats_pad, feats_lens
+
+    def forward_fbank(
+        self, input: torch.Tensor, input_lengths: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        batch_size = input.size(0)
+        feats = []
+        feats_lens = []
+        for i in range(batch_size):
+            waveform_length = input_lengths[i]
+            waveform = input[i][:waveform_length]
+            waveform = waveform * (1 << 15)
+            waveform = waveform.unsqueeze(0)
+            mat = kaldi.fbank(
+                waveform,
+                num_mel_bins=self.n_mels,
+                frame_length=self.frame_length,
+                frame_shift=self.frame_shift,
+                dither=self.dither,
+                energy_floor=0.0,
+                window_type=self.window,
+                sample_frequency=self.fs,
+            )
+
+            feat_length = mat.size(0)
+            feats.append(mat)
+            feats_lens.append(feat_length)
+
+        feats_lens = torch.as_tensor(feats_lens)
+        feats_pad = pad_sequence(feats, batch_first=True, padding_value=0.0)
+        return feats_pad, feats_lens
+
+    def forward_lfr_cmvn(
+        self, input: torch.Tensor, input_lengths: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        batch_size = input.size(0)
+        feats = []
+        feats_lens = []
+        for i in range(batch_size):
+            mat = input[i, : input_lengths[i], :]
+            if self.lfr_m != 1 or self.lfr_n != 1:
+                mat = apply_lfr(mat, self.lfr_m, self.lfr_n)
+            if self.cmvn is not None:
+                mat = apply_cmvn(mat, self.cmvn)
+            feat_length = mat.size(0)
+            feats.append(mat)
+            feats_lens.append(feat_length)
+
+        feats_lens = torch.as_tensor(feats_lens)
+        feats_pad = pad_sequence(feats, batch_first=True, padding_value=0.0)
+        return feats_pad, feats_lens
+
+
+@tables.register("frontend_classes", "WavFrontendOnline")
+class WavFrontendOnline(nn.Module):
+    """Conventional frontend structure for streaming ASR/VAD."""
+
+    def __init__(
+        self,
+        cmvn_file: str = None,
+        fs: int = 16000,
+        window: str = "hamming",
+        n_mels: int = 80,
+        frame_length: int = 25,
+        frame_shift: int = 10,
+        filter_length_min: int = -1,
+        filter_length_max: int = -1,
+        lfr_m: int = 1,
+        lfr_n: int = 1,
+        dither: float = 1.0,
+        snip_edges: bool = True,
+        upsacle_samples: bool = True,
+        **kwargs,
+    ):
+        super().__init__()
+        self.fs = fs
+        self.window = window
+        self.n_mels = n_mels
+        self.frame_length = frame_length
+        self.frame_shift = frame_shift
+        self.frame_sample_length = int(self.frame_length * self.fs / 1000)
+        self.frame_shift_sample_length = int(self.frame_shift * self.fs / 1000)
+        self.filter_length_min = filter_length_min
+        self.filter_length_max = filter_length_max
+        self.lfr_m = lfr_m
+        self.lfr_n = lfr_n
+        self.cmvn_file = cmvn_file
+        self.dither = dither
+        self.snip_edges = snip_edges
+        self.upsacle_samples = upsacle_samples
+        # self.waveforms = None
+        # self.reserve_waveforms = None
+        # self.fbanks = None
+        # self.fbanks_lens = None
+        self.cmvn = None if self.cmvn_file is None else load_cmvn(self.cmvn_file)
+        # self.input_cache = None
+        # self.lfr_splice_cache = []
+
+    def output_size(self) -> int:
+        return self.n_mels * self.lfr_m
+
+    @staticmethod
+    def apply_cmvn(inputs: torch.Tensor, cmvn: torch.Tensor) -> torch.Tensor:
+        """
+        Apply CMVN with mvn data
+        """
+
+        device = inputs.device
+        dtype = inputs.dtype
+        frame, dim = inputs.shape
+
+        means = np.tile(cmvn[0:1, :dim], (frame, 1))
+        vars = np.tile(cmvn[1:2, :dim], (frame, 1))
+        inputs += torch.from_numpy(means).type(dtype).to(device)
+        inputs *= torch.from_numpy(vars).type(dtype).to(device)
+
+        return inputs.type(torch.float32)
+
+    @staticmethod
+    def apply_lfr(
+        inputs: torch.Tensor, lfr_m: int, lfr_n: int, is_final: bool = False
+    ) -> Tuple[torch.Tensor, torch.Tensor, int]:
+        """
+        Apply lfr with data
+        """
+
+        LFR_inputs = []
+        # inputs = torch.vstack((inputs_lfr_cache, inputs))
+        T = inputs.shape[0]  # include the right context
+        T_lfr = int(
+            np.ceil((T - (lfr_m - 1) // 2) / lfr_n)
+        )  # minus the right context: (lfr_m - 1) // 2
+        splice_idx = T_lfr
+        for i in range(T_lfr):
+            if lfr_m <= T - i * lfr_n:
+                LFR_inputs.append((inputs[i * lfr_n : i * lfr_n + lfr_m]).view(1, -1))
+            else:  # process last LFR frame
+                if is_final:
+                    num_padding = lfr_m - (T - i * lfr_n)
+                    frame = (inputs[i * lfr_n :]).view(-1)
+                    for _ in range(num_padding):
+                        frame = torch.hstack((frame, inputs[-1]))
+                    LFR_inputs.append(frame)
+                else:
+                    # update splice_idx and break the loop
+                    splice_idx = i
+                    break
+        splice_idx = min(T - 1, splice_idx * lfr_n)
+        lfr_splice_cache = inputs[splice_idx:, :]
+        LFR_outputs = torch.vstack(LFR_inputs)
+        return LFR_outputs.type(torch.float32), lfr_splice_cache, splice_idx
+
+    @staticmethod
+    def compute_frame_num(
+        sample_length: int, frame_sample_length: int, frame_shift_sample_length: int
+    ) -> int:
+        frame_num = int(
+            (sample_length - frame_sample_length) / frame_shift_sample_length + 1
+        )
+        return (
+            frame_num if frame_num >= 1 and sample_length >= frame_sample_length else 0
+        )
+
+    def forward_fbank(
+        self,
+        input: torch.Tensor,
+        input_lengths: torch.Tensor,
+        cache: dict = {},
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        batch_size = input.size(0)
+        assert batch_size == 1
+        input = torch.cat((cache["input_cache"], input), dim=1)
+        frame_num = self.compute_frame_num(
+            input.shape[-1], self.frame_sample_length, self.frame_shift_sample_length
+        )
+        # update the input cache with the leftover samples
+        cache["input_cache"] = input[
+            :, -(input.shape[-1] - frame_num * self.frame_shift_sample_length) :
+        ]
+        waveforms = torch.empty(0)
+        feats_pad = torch.empty(0)
+        feats_lens = torch.empty(0)
+        if frame_num:
+            waveforms = []
+            feats = []
+            feats_lens = []
+            for i in range(batch_size):
+                waveform = input[i].cuda()
+                # keep exactly the wave samples that are used for fbank extraction
+                waveforms.append(
+                    waveform[
+                        : (
+                            (frame_num - 1) * self.frame_shift_sample_length
+                            + self.frame_sample_length
+                        )
+                    ]
+                )
+                waveform = waveform * (1 << 15)
+                waveform = waveform.unsqueeze(0)
+                mat = kaldi.fbank(
+                    waveform,
+                    num_mel_bins=self.n_mels,
+                    frame_length=self.frame_length,
+                    frame_shift=self.frame_shift,
+                    dither=self.dither,
+                    energy_floor=0.0,
+                    window_type=self.window,
+                    sample_frequency=self.fs,
+                )
+
+                feat_length = mat.size(0)
+                feats.append(mat)
+                feats_lens.append(feat_length)
+
+            waveforms = torch.stack(waveforms)
+            feats_lens = torch.as_tensor(feats_lens)
+            feats_pad = pad_sequence(feats, batch_first=True, padding_value=0.0)
+            cache["fbanks"] = feats_pad
+            cache["fbanks_lens"] = copy.deepcopy(feats_lens)
+        return waveforms, feats_pad, feats_lens
+
+    def forward_lfr_cmvn(
+        self,
+        input: torch.Tensor,
+        input_lengths: torch.Tensor,
+        is_final: bool = False,
+        cache: dict = {},
+        **kwargs,
+    ):
+        batch_size = input.size(0)
+        feats = []
+        feats_lens = []
+        lfr_splice_frame_idxs = []
+        for i in range(batch_size):
+            mat = input[i, : input_lengths[i], :]
+            lfr_splice_frame_idx = -1  # default when LFR is disabled
+            if self.lfr_m != 1 or self.lfr_n != 1:
+                # update self.lfr_splice_cache in self.apply_lfr
+                # mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n, self.lfr_splice_cache[i],
+                mat, cache["lfr_splice_cache"][i], lfr_splice_frame_idx = (
+                    self.apply_lfr(mat, self.lfr_m, self.lfr_n, is_final)
+                )
+            if self.cmvn_file is not None:
+                mat = self.apply_cmvn(mat, self.cmvn)
+            feat_length = mat.size(0)
+            feats.append(mat)
+            feats_lens.append(feat_length)
+            lfr_splice_frame_idxs.append(lfr_splice_frame_idx)
+        feats_lens = torch.as_tensor(feats_lens)
+        feats_pad = pad_sequence(feats, batch_first=True, padding_value=0.0)
+        lfr_splice_frame_idxs = torch.as_tensor(lfr_splice_frame_idxs)
+        return feats_pad, feats_lens, lfr_splice_frame_idxs
+
+    def forward(self, input: torch.Tensor, input_lengths: torch.Tensor, **kwargs):
+        is_final = kwargs.get("is_final", False)
+        cache = kwargs.get("cache", {})
+        if len(cache) == 0:
+            self.init_cache(cache)
+
+        batch_size = input.shape[0]
+        assert (
+            batch_size == 1
+        ), "online feature extraction currently supports batch_size == 1 only"
+
+        waveforms, feats, feats_lengths = self.forward_fbank(
+            input, input_lengths, cache=cache
+        )  # input shape: B T D
+
+        if feats.shape[0]:
+
+            cache["waveforms"] = torch.cat(
+                (cache["reserve_waveforms"], waveforms.cpu()), dim=1
+            )
+
+            if not cache["lfr_splice_cache"]:  # initialize the splice cache
+                for i in range(batch_size):
+                    cache["lfr_splice_cache"].append(
+                        feats[i][0, :].unsqueeze(dim=0).repeat((self.lfr_m - 1) // 2, 1)
+                    )
+            # the number of input frames plus self.lfr_splice_cache[0].shape[0] must reach self.lfr_m
+            if feats_lengths[0] + cache["lfr_splice_cache"][0].shape[0] >= self.lfr_m:
+                lfr_splice_cache_tensor = torch.stack(
+                    cache["lfr_splice_cache"]
+                )  # B T D
+                feats = torch.cat((lfr_splice_cache_tensor, feats), dim=1)
+
+                feats_lengths += lfr_splice_cache_tensor[0].shape[0]
+                frame_from_waveforms = int(
+                    (cache["waveforms"].shape[1] - self.frame_sample_length)
+                    / self.frame_shift_sample_length
+                    + 1
+                )
+                minus_frame = (
+                    (self.lfr_m - 1) // 2
+                    if cache["reserve_waveforms"].numel() == 0
+                    else 0
+                )
+                feats, feats_lengths, lfr_splice_frame_idxs = self.forward_lfr_cmvn(
+                    feats, feats_lengths, is_final, cache=cache
+                )
+                if self.lfr_m == 1:
+                    cache["reserve_waveforms"] = torch.empty(0)
+                else:
+                    reserve_frame_idx = lfr_splice_frame_idxs[0] - minus_frame
+                    # print('reserve_frame_idx: ' + str(reserve_frame_idx))
+                    # print('frame_frame: ' + str(frame_from_waveforms))
+                    cache["reserve_waveforms"] = cache["waveforms"][
+                        :,
+                        reserve_frame_idx
+                        * self.frame_shift_sample_length : frame_from_waveforms
+                        * self.frame_shift_sample_length,
+                    ]
+                    sample_length = (
+                        frame_from_waveforms - 1
+                    ) * self.frame_shift_sample_length + self.frame_sample_length
+                    cache["waveforms"] = cache["waveforms"][:, :sample_length]
+            else:
+                # update self.reserve_waveforms and self.lfr_splice_cache
+                cache["reserve_waveforms"] = cache["waveforms"][
+                    :, : -(self.frame_sample_length - self.frame_shift_sample_length)
+                ]
+                for i in range(batch_size):
+                    cache["lfr_splice_cache"][i] = torch.cat(
+                        (cache["lfr_splice_cache"][i], feats[i]), dim=0
+                    )
+                return torch.empty(0), feats_lengths
+        else:
+            if is_final:
+                cache["waveforms"] = (
+                    waveforms
+                    if cache["reserve_waveforms"].numel() == 0
+                    else cache["reserve_waveforms"]
+                )
+                feats = torch.stack(cache["lfr_splice_cache"])
+                feats_lengths = (
+                    torch.zeros(batch_size, dtype=torch.int) + feats.shape[1]
+                )
+                feats, feats_lengths, _ = self.forward_lfr_cmvn(
+                    feats, feats_lengths, is_final, cache=cache
+                )
+        # if is_final:
+        #     self.init_cache(cache)
+        return feats, feats_lengths
+
+    def init_cache(self, cache: dict = {}):
+        cache["reserve_waveforms"] = torch.empty(0)
+        cache["input_cache"] = torch.empty(0)
+        cache["lfr_splice_cache"] = []
+        cache["waveforms"] = None
+        cache["fbanks"] = None
+        cache["fbanks_lens"] = None
+        return cache
+
+
+class WavFrontendMel23(nn.Module):
+    """Conventional frontend structure for ASR."""
+
+    def __init__(
+        self,
+        fs: int = 16000,
+        frame_length: int = 25,
+        frame_shift: int = 10,
+        lfr_m: int = 1,
+        lfr_n: int = 1,
+        **kwargs,
+    ):
+        super().__init__()
+        self.fs = fs
+        self.frame_length = frame_length
+        self.frame_shift = frame_shift
+        self.lfr_m = lfr_m
+        self.lfr_n = lfr_n
+        self.n_mels = 23
+
+    def output_size(self) -> int:
+        return self.n_mels * (2 * self.lfr_m + 1)
+
+    def forward(
+        self, input: torch.Tensor, input_lengths: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        batch_size = input.size(0)
+        feats = []
+        feats_lens = []
+        for i in range(batch_size):
+            waveform_length = input_lengths[i]
+            waveform = input[i][:waveform_length]
+            waveform = waveform.numpy()
+            mat = eend_ola_feature.stft(waveform, self.frame_length, self.frame_shift)
+            mat = eend_ola_feature.transform(mat)
+            mat = eend_ola_feature.splice(mat, context_size=self.lfr_m)
+            mat = mat[:: self.lfr_n]
+            mat = torch.from_numpy(mat)
+            feat_length = mat.size(0)
+            feats.append(mat)
+            feats_lens.append(feat_length)
+
+        feats_lens = torch.as_tensor(feats_lens)
+        feats_pad = pad_sequence(feats, batch_first=True, padding_value=0.0)
+        return feats_pad, feats_lens
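A hedged offline-usage sketch for `WavFrontend` (not part of the diff). It assumes float waveforms normalized to [-1, 1], which `upsacle_samples` rescales to 16-bit range before `kaldi.fbank`; the LFR values are illustrative:

```python
import torch

frontend = WavFrontend(fs=16000, n_mels=80, lfr_m=7, lfr_n=6)  # hypothetical LFR setup
wav = torch.randn(1, 16000) * 0.1      # (B, samples), float in [-1, 1]
wav_lens = torch.LongTensor([16000])
feats, feats_lens = frontend(wav, wav_lens)
print(feats.shape, frontend.output_size())  # feature dim = n_mels * lfr_m = 560
```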
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+#  2020, Technische Universität München; Ludwig Kürzinger
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""Sliding Window for raw audio input data."""
+
+import torch
+import torch.nn as nn
+from typing import Tuple
+
+
+class SlidingWindow(nn.Module):
+    """Sliding Window.
+
+    Provides a sliding window over a batched continuous raw audio tensor.
+    Optionally, provides padding (Currently not implemented).
+    Combine this module with a pre-encoder compatible with raw audio data,
+    for example Sinc convolutions.
+
+    Known issues:
+        Output length is calculated incorrectly if audio shorter than win_length.
+        WARNING: trailing values are discarded - padding not implemented yet.
+        There is currently no additional window function applied to input values.
+    """
+
+    def __init__(
+        self,
+        win_length: int = 400,
+        hop_length: int = 160,
+        channels: int = 1,
+        padding: int = None,
+        fs=None,
+    ):
+        """Initialize.
+
+        Args:
+            win_length: Length of frame.
+            hop_length: Relative starting point of next frame.
+            channels: Number of input channels.
+            padding: Padding (placeholder, currently not implemented).
+            fs: Sampling rate (placeholder for compatibility, not used).
+        """
+        super().__init__()
+        self.fs = fs
+        self.win_length = win_length
+        self.hop_length = hop_length
+        self.channels = channels
+        self.padding = padding
+
+    def forward(
+        self, input: torch.Tensor, input_lengths: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Apply a sliding window on the input.
+
+        Args:
+            input: Input (B, T, C*D) or (B, T*C*D), with D=C=1.
+            input_lengths: Input lengths within batch.
+        Returns:
+            Tensor: Output with dimensions (B, T, C, D), with D=win_length.
+            Tensor: Output lengths within batch.
+        """
+        input_size = input.size()
+        B = input_size[0]
+        T = input_size[1]
+        C = self.channels
+        D = self.win_length
+        # (B, T, C) --> (T, B, C)
+        continuous = input.view(B, T, C).permute(1, 0, 2)
+        windowed = continuous.unfold(0, D, self.hop_length)
+        # (T, B, C, D) --> (B, T, C, D)
+        output = windowed.permute(1, 0, 2, 3).contiguous()
+        # After unfold(), windowed lengths change:
+        output_lengths = (input_lengths - self.win_length) // self.hop_length + 1
+        return output, output_lengths
+
+    def output_size(self) -> int:
+        """Return output length of feature dimension D, i.e. the window length."""
+        return self.win_length
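A minimal sketch of `SlidingWindow` framing raw audio (not part of the diff; values are illustrative):

```python
import torch

sw = SlidingWindow(win_length=400, hop_length=160)
wav = torch.randn(2, 16000, 1)          # (B, T, C), single channel
lens = torch.LongTensor([16000, 12000])
frames, frame_lens = sw(wav, lens)      # frames: (2, 98, 1, 400)
```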
File without changes
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Shigeki Karita
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""Label smoothing module."""
+
+import torch
+from torch import nn
+from funasr_detach.models.transformer.utils.nets_utils import make_pad_mask
+
+
+class LabelSmoothingLoss(nn.Module):
+    """Label-smoothing loss.
+
+    :param int size: the number of classes
+    :param int padding_idx: ignored class id
+    :param float smoothing: smoothing rate (0.0 means the conventional CE)
+    :param bool normalize_length: normalize loss by sequence length if True
+    :param torch.nn.Module criterion: loss function to be smoothed
+    """
+
+    def __init__(
+        self,
+        size,
+        padding_idx,
+        smoothing,
+        normalize_length=False,
+        criterion=nn.KLDivLoss(reduction="none"),
+    ):
+        """Construct a LabelSmoothingLoss object."""
+        super(LabelSmoothingLoss, self).__init__()
+        self.criterion = criterion
+        self.padding_idx = padding_idx
+        self.confidence = 1.0 - smoothing
+        self.smoothing = smoothing
+        self.size = size
+        self.true_dist = None
+        self.normalize_length = normalize_length
+
+    def forward(self, x, target):
+        """Compute loss between x and target.
+
+        :param torch.Tensor x: prediction (batch, seqlen, class)
+        :param torch.Tensor target:
+            target signal masked with self.padding_idx (batch, seqlen)
+        :return: scalar float value
+        :rtype: torch.Tensor
+        """
+        assert x.size(2) == self.size
+        batch_size = x.size(0)
+        x = x.view(-1, self.size)
+        target = target.view(-1)
+        with torch.no_grad():
+            true_dist = x.clone()
+            true_dist.fill_(self.smoothing / (self.size - 1))
+            ignore = target == self.padding_idx  # (B,)
+            total = len(target) - ignore.sum().item()
+            target = target.masked_fill(ignore, 0)  # avoid -1 index
+            true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
+        kl = self.criterion(torch.log_softmax(x, dim=1), true_dist)
+        denom = total if self.normalize_length else batch_size
+        return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom
+
+
+class SequenceBinaryCrossEntropy(nn.Module):
+    def __init__(
+        self, normalize_length=False, criterion=nn.BCEWithLogitsLoss(reduction="none")
+    ):
+        super().__init__()
+        self.normalize_length = normalize_length
+        self.criterion = criterion
+
+    def forward(self, pred, label, lengths):
+        pad_mask = make_pad_mask(lengths, maxlen=pred.shape[1]).to(pred.device)
+        loss = self.criterion(pred, label)
+        denom = (~pad_mask).sum() if self.normalize_length else pred.shape[0]
+        return loss.masked_fill(pad_mask.unsqueeze(-1), 0).sum() / denom
+
+
+class NllLoss(nn.Module):
+    """NLL loss.
+
+    :param int size: the number of classes
+    :param int padding_idx: ignored class id
+    :param bool normalize_length: normalize loss by sequence length if True
+    :param torch.nn.Module criterion: loss function
+    """
+
+    def __init__(
+        self,
+        size,
+        padding_idx,
+        normalize_length=False,
+        criterion=nn.NLLLoss(reduction="none"),
+    ):
+        """Construct an NllLoss object."""
+        super(NllLoss, self).__init__()
+        self.criterion = criterion
+        self.padding_idx = padding_idx
+        self.size = size
+        self.true_dist = None
+        self.normalize_length = normalize_length
+
+    def forward(self, x, target):
+        """Compute loss between x and target.
+
+        :param torch.Tensor x: prediction (batch, seqlen, class)
+        :param torch.Tensor target:
+            target signal masked with self.padding_idx (batch, seqlen)
+        :return: scalar float value
+        :rtype: torch.Tensor
+        """
+        assert x.size(2) == self.size
+        batch_size = x.size(0)
+        x = x.view(-1, self.size)
+        target = target.view(-1)
+        with torch.no_grad():
+            ignore = target == self.padding_idx  # (B,)
+            total = len(target) - ignore.sum().item()
+            target = target.masked_fill(ignore, 0)  # avoid -1 index
+        loss = self.criterion(x, target)
+        denom = total if self.normalize_length else batch_size
+        return loss.masked_fill(ignore, 0).sum() / denom
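A minimal sketch of `LabelSmoothingLoss` (not part of the diff; vocabulary size and sequence lengths are illustrative). Positions equal to `padding_idx` are excluded from the loss:

```python
import torch

crit = LabelSmoothingLoss(size=1000, padding_idx=-1, smoothing=0.1)
logits = torch.randn(4, 20, 1000, requires_grad=True)  # (batch, seqlen, class)
targets = torch.randint(0, 1000, (4, 20))
targets[:, 15:] = -1                                   # padded positions are ignored
loss = crit(logits, targets)
loss.backward()
```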