- inference.py +43 -0
- requirements.txt +13 -0
- test_prompt.json +31 -0
- utils.py +158 -0
inference.py
ADDED
@@ -0,0 +1,43 @@
import torch
from omegaconf import OmegaConf
from transformers import WhisperFeatureExtractor

from models.tinyoctopus import TINYOCTOPUS
from utils import prepare_one_sample

# Load model
# NOTE: the original snippet referenced an undefined `cfg`; loading it from a
# YAML file via OmegaConf is an assumption -- point this at your actual config.
cfg = OmegaConf.load("config.yaml")
device = "cuda" if torch.cuda.is_available() else "cpu"
model = TINYOCTOPUS.from_config(cfg.config.model)
model.to(device)
model.eval()

# Load processor
wav_processor = WhisperFeatureExtractor.from_pretrained("distil-whisper/distil-large-v3")

def transcribe(audio_path, task="dialect"):
    """
    Perform inference on an audio file.

    Args:
        audio_path (str): Path to the audio file.
        task (str): Task to perform. Options: "dialect", "asr", "translation".

    Returns:
        str: The generated text.
    """
    task_prompts = {
        "dialect": "What is the dialect of the speaker?",
        "asr": "تعرف على الكلام وأعطني النص.",  # "Recognize the speech and give me the text."
        "translation": "الرجاء ترجمة هذا المقطع الصوتي إلى اللغة الإنجليزية."  # "Please translate this audio clip into English."
    }

    if task not in task_prompts:
        raise ValueError("Invalid task. Choose from: 'dialect', 'asr', or 'translation'.")

    try:
        prompt = task_prompts[task]
        samples = prepare_one_sample(audio_path, wav_processor)
        prompt = [f"<Speech><SpeechHere></Speech> {prompt.strip()}"]
        generated_text = model.generate(samples, {"temperature": 0.7}, prompts=prompt)[0]
        return generated_text.replace('<s>', '').replace('</s>', '').strip()

    except Exception as e:
        return f"Error: {e}"
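For reference, a minimal usage sketch of `transcribe`; the audio path below is a placeholder:

    text = transcribe("samples/example.wav", task="asr")
    print(text)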
requirements.txt
ADDED
@@ -0,0 +1,13 @@
torch==2.0.1
torchaudio==2.0.2
peft==0.3.0
soundfile
librosa
transformers==4.28.0
sentencepiece==0.1.97
accelerate==0.20.3
bitsandbytes==0.35.0
gradio==3.23.0
safetensors
tensorboardX
jiwer
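The pinned versions above install in one step with `pip install -r requirements.txt`.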
test_prompt.json
ADDED
@@ -0,0 +1,31 @@
{
  "asr": "<Speech><SpeechHere></Speech> Recognize the speech and give me the transcription.",
  "gender_recognition": "<Speech><SpeechHere></Speech> What is the gender of the speaker?",
  "dialect_identification": "<Speech><SpeechHere></Speech> What is the dialect of the speaker?",
  "asr_zh": "<Speech><SpeechHere></Speech> 请将语音中的内容写下来。",
  "summarization": "<Speech><SpeechHere></Speech> Could you capture the main points of this audio in a short summary?",
  "translation_ae": "<Speech><SpeechHere></Speech> Listen to the speech and translate it into English.",
  "asr_de": "<Speech><SpeechHere></Speech> Hören Sie sich die Rede an und schreiben Sie ihren Inhalt auf.",
  "translation_ec": "<Speech><SpeechHere></Speech> Listen to the speech and translate it into Chinese.",
  "audiocaption": "<Speech><SpeechHere></Speech> Please describe the audio.",
  "audiocaption_v2": "<Speech><SpeechHere></Speech> Please write down what you hear in the audio.",
  "QA": "<Speech><SpeechHere></Speech> {}",
  "gender_QA": "<Speech><SpeechHere></Speech> {}",
  "phone_recognition": "<Speech><SpeechHere></Speech> Provide the phonetic transcription for the speech.",
  "speech_query": "<Speech><SpeechHere></Speech> Please answer the question in detail.",
  "emotion_recognition": "<Speech><SpeechHere></Speech> Describe the emotion of the speaker in one word.",
  "lyrics_recognition": "<Speech><SpeechHere></Speech> Listen to the song and write down its content.",
  "audio_speech_description": "<Speech><SpeechHere></Speech> Describe the speech and the background audio.",
  "speaker_verification": "<Speech><SpeechHere></Speech> Do you only hear the same person talking? Answer yes or no.",
  "fluent_speech_audio": "<Speech><SpeechHere></Speech> Describe the background audio and the speech in a fluent sentence.",
  "speech_separation": "<Speech><SpeechHere></Speech> Please write down what you hear each person say.",
  "audio_story_telling": "<Speech><SpeechHere></Speech> Based on the audio, write a story in detail. Your story should be highly related to the audio.",
  "speech_audio_query": "<Speech><SpeechHere></Speech> Please answer the speaker's question in detail based on the background sound.",
  "slot_filling": "<Speech><SpeechHere></Speech> According to the speech, what is the {}?",
  "music_description": "<Speech><SpeechHere></Speech> Listen to this music clip and describe the music.",
  "translation_en2ja": "<Speech><SpeechHere></Speech> Listen to the speech and translate it into Japanese.",
  "translation_en2de": "<Speech><SpeechHere></Speech> Listen to the speech and translate it into German.",
  "speech_audio_coreasoning": "<Speech><SpeechHere></Speech> Use your strong reasoning skills to answer the speaker's question in detail based on the background sound.",
  "keywords": "<Speech><SpeechHere></Speech> Give me only three keywords of the text.",
  "speaker_diarization_asr": "<Speech><SpeechHere></Speech> Please recognize each speaker and transcribe their speech content."
}
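Entries such as "QA" and "slot_filling" contain `{}` placeholders, which suggests they are filled at runtime with Python's str.format; a minimal sketch under that assumption (the slot name is hypothetical):

    import json

    with open("test_prompt.json") as f:
        prompts = json.load(f)

    # Hypothetical slot name -- real slot labels come from the dataset annotations.
    prompt = prompts["slot_filling"].format("departure city")
    # -> "<Speech><SpeechHere></Speech> According to the speech, what is the departure city?"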
utils.py
ADDED
@@ -0,0 +1,158 @@
# Copyright (2024) Tsinghua University, Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import time
from datetime import datetime

import torch
from torch.utils.data import DataLoader, DistributedSampler
import soundfile as sf
import numpy as np

from dist_utils import is_main_process, get_world_size, get_rank


def now():
    return datetime.now().strftime("%Y%m%d%H%M")


def setup_logger():
    logging.basicConfig(
        level=logging.INFO if is_main_process() else logging.WARN,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[logging.StreamHandler()],
    )


def get_dataloader(dataset, config, is_train=True, use_distributed=True):
    if use_distributed:
        sampler = DistributedSampler(
            dataset,
            shuffle=is_train,
            num_replicas=get_world_size(),
            rank=get_rank(),
        )
    else:
        sampler = None

    loader = DataLoader(
        dataset,
        batch_size=config.batch_size_train if is_train else config.batch_size_eval,
        num_workers=config.num_workers,
        pin_memory=True,
        sampler=sampler,
        shuffle=sampler is None and is_train,
        collate_fn=dataset.collater,
        drop_last=is_train,
    )

    if is_train:
        loader = IterLoader(loader, use_distributed=use_distributed)

    return loader


def apply_to_sample(f, sample):
    if len(sample) == 0:
        return {}

    def _apply(x):
        if torch.is_tensor(x):
            return f(x)
        elif isinstance(x, dict):
            return {key: _apply(value) for key, value in x.items()}
        elif isinstance(x, list):
            return [_apply(item) for item in x]
        else:
            return x

    return _apply(sample)


def move_to_cuda(sample):
    def _move_to_cuda(tensor):
        return tensor.cuda()

    return apply_to_sample(_move_to_cuda, sample)


def prepare_sample(samples, cuda_enabled=True):
    if cuda_enabled:
        samples = move_to_cuda(samples)

    # TODO fp16 support

    return samples


class IterLoader:
    """
    A wrapper that turns a DataLoader into an infinite iterator.

    Modified from:
        https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/iter_based_runner.py
    """

    def __init__(self, dataloader: DataLoader, use_distributed: bool = False):
        self._dataloader = dataloader
        self.iter_loader = iter(self._dataloader)
        self._use_distributed = use_distributed
        self._epoch = 0

    @property
    def epoch(self) -> int:
        return self._epoch

    def __next__(self):
        try:
            data = next(self.iter_loader)
        except StopIteration:
            self._epoch += 1
            if hasattr(self._dataloader.sampler, "set_epoch") and self._use_distributed:
                self._dataloader.sampler.set_epoch(self._epoch)
            time.sleep(2)  # Prevent possible deadlock during epoch transition
            self.iter_loader = iter(self._dataloader)
            data = next(self.iter_loader)

        return data

    def __iter__(self):
        return self

    def __len__(self):
        return len(self._dataloader)


def prepare_one_sample(wav_path, wav_processor, cuda_enabled=True):
    audio, sr = sf.read(wav_path)
    if len(audio.shape) == 2:  # stereo to mono: keep the first channel
        audio = audio[:, 0]
    if len(audio) < sr:  # pad audio to at least 1s
        sil = np.zeros(sr - len(audio), dtype=float)
        audio = np.concatenate((audio, sil), axis=0)
    audio = audio[: sr * 30]  # truncate audio to at most 30s

    spectrogram = wav_processor(audio, sampling_rate=sr, return_tensors="pt")["input_features"]

    samples = {
        "spectrogram": spectrogram,
        "raw_wav": torch.from_numpy(audio).unsqueeze(0),
        "padding_mask": torch.zeros(len(audio), dtype=torch.bool).unsqueeze(0),
    }
    if cuda_enabled:
        samples = move_to_cuda(samples)

    return samples
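A short sketch of how `prepare_one_sample` feeds the inference path, assuming a Whisper-compatible feature extractor (the audio path is a placeholder):

    from transformers import WhisperFeatureExtractor
    from utils import prepare_one_sample

    extractor = WhisperFeatureExtractor.from_pretrained("distil-whisper/distil-large-v3")
    samples = prepare_one_sample("samples/example.wav", extractor, cuda_enabled=False)
    print(samples["spectrogram"].shape)  # e.g. torch.Size([1, 128, 3000]) for large-v3 features
    print(samples["raw_wav"].shape)      # (1, num_samples)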