Synced repo using 'sync_with_huggingface' Github Action
- Dockerfile +69 -0
- LICENSE +201 -0
- configs/augmentation.json +43 -0
- dataset/test.mp3 +0 -0
- download.py +44 -0
- evaluation.py +96 -0
- finetune.py +160 -0
- infer_ct2.py +46 -0
- infer_server.py +143 -0
- infer_tfs.py +43 -0
- merge_lora.py +47 -0
- requirements.txt +21 -0
- run.sh +22 -0
- static/index.css +109 -0
- static/record.js +229 -0
- static/record.png +0 -0
- static/recording.gif +0 -0
- templates/index.html +167 -0
- utils/__init__.py +0 -0
- utils/binary.py +72 -0
- utils/callback.py +37 -0
- utils/data_utils.py +65 -0
- utils/model_utils.py +20 -0
- utils/pun_predictor.py +110 -0
- utils/reader.py +289 -0
- utils/utils.py +87 -0
Dockerfile
ADDED
@@ -0,0 +1,69 @@
FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu20.04

# Use Python 3.11 for better Python perf
# Update the package lists and install necessary dependencies
RUN apt-get update && apt-get install -y \
    software-properties-common \
    && add-apt-repository -y ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y python3.11 python3.11-dev

# Set Python 3.11 as the default version (for python3)
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1

# Download the get-pip.py script
RUN apt install curl -y
RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py

# Install pip for Python 3.11
RUN python3 get-pip.py

# Verify Python and pip versions
RUN python3 --version && pip3.11 --version

# Set pip3.11 as the default pip command
RUN update-alternatives --install /usr/bin/pip3 pip3 /usr/local/lib/python3.11/dist-packages/pip 1

ENV PYTHONUNBUFFERED=1

# Install necessary dependencies
# RUN apt-get update && \
#     apt-get install -y python3-pip

# Set the working directory. /app is mounted to the container with -v,
# but we want to have the right cwd for the uvicorn command below
RUN mkdir /app
# WORKDIR /app

# # Copy the app code and requirements file
# COPY . /app
# COPY requirements.txt .
# WORKDIR $PYSETUP_PATH
COPY ./requirements.txt /app

COPY ./utils /app/utils
COPY ./static /app/static
COPY ./templates /app/templates
COPY ./infer_server.py /app/infer_server.py
COPY ./download.py /app/download.py

WORKDIR /app

# Install the app dependencies
# RUN pip3 install -r requirements.txt

RUN --mount=type=cache,target=/root/.cache/pip \
    pip3 install -r requirements.txt

# Expose the FastAPI port
EXPOSE 5001

# Start the FastAPI app using the Uvicorn web server
# CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "14000", "--limit-concurrency", "1000"]
RUN python3 download.py

CMD ["python3", "infer_server.py", "--host=0.0.0.0", "--port=5001", "--model_path=models/sam2ai/whisper-odia-small-finetune-int8-ct2", "--num_workers=2"]
LICENSE
ADDED
@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

[Standard Apache License 2.0 text: Sections 1-9 (Definitions; Grant of Copyright License; Grant of Patent License; Redistribution; Submission of Contributions; Trademarks; Disclaimer of Warranty; Limitation of Liability; Accepting Warranty or Additional Liability), END OF TERMS AND CONDITIONS, and the Appendix with the "Copyright [yyyy] [name of copyright owner]" boilerplate notice.]
configs/augmentation.json
ADDED
@@ -0,0 +1,43 @@
[
  {
    "type": "resample",
    "params": {
      "new_sample_rates": [8000, 32000, 44100]
    },
    "prob": 0.0
  },
  {
    "type": "noise",
    "params": {
      "min_snr_dB": 10,
      "max_snr_dB": 50,
      "noise_dir": "dataset/noise"
    },
    "prob": 0.2
  },
  {
    "type": "speed",
    "params": {
      "min_speed_rate": 0.9,
      "max_speed_rate": 1.1,
      "num_rates": 3
    },
    "prob": 0.5
  },
  {
    "type": "shift",
    "params": {
      "min_shift_ms": -5,
      "max_shift_ms": 5
    },
    "prob": 0.0
  },
  {
    "type": "volume",
    "params": {
      "min_gain_dBFS": -15,
      "max_gain_dBFS": 15
    },
    "prob": 0.5
  }
]
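
For illustration only, a minimal sketch of how a config like this could be consumed: each entry is tried independently with probability "prob". The real logic lives in utils/reader.py (listed in the summary above but not shown in this diff), so the helper below is a hypothetical reading of the format, not the repo's implementation.

import json
import random

def load_augmentations(path="configs/augmentation.json"):
    # Each entry: {"type": ..., "params": {...}, "prob": float}
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def pick_augmentations(configs):
    # Independently keep each augmentation with its configured probability.
    chosen = []
    for cfg in configs:
        if random.random() < cfg["prob"]:
            chosen.append((cfg["type"], cfg["params"]))
    return chosen

if __name__ == "__main__":
    augs = load_augmentations()
    print(pick_augmentations(augs))  # e.g. [("speed", {...}), ("volume", {...})]
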
dataset/test.mp3
ADDED
Binary file (61.7 kB)
download.py
ADDED
@@ -0,0 +1,44 @@
import argparse
import requests
import os
from tqdm import tqdm

def download_file(url, path):
    response = requests.get(url, stream=True)
    total_size_in_bytes = int(response.headers.get('content-length', 0))
    block_size = 1024  # 1 Kbyte
    progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)

    with open(path, 'wb') as file:
        for data in response.iter_content(block_size):
            progress_bar.update(len(data))
            file.write(data)

    progress_bar.close()

def download_model(model_name, destination_folder="models"):
    # Define the base URL and headers for the Hugging Face API
    base_url = f"https://huggingface.co/{model_name}/resolve/main"
    headers = {"User-Agent": "Hugging Face Python"}

    # Send a GET request to the Hugging Face API to get a list of all files
    response = requests.get(f"https://huggingface.co/api/models/{model_name}", headers=headers)
    response.raise_for_status()

    # Extract the list of files from the response JSON
    files_to_download = [file["rfilename"] for file in response.json()["siblings"]]

    # Ensure the directory exists
    os.makedirs(f"{destination_folder}/{model_name}", exist_ok=True)

    # Download each file
    for file in files_to_download:
        print(f"Downloading {file}...")
        download_file(f"{base_url}/{file}", f"{destination_folder}/{model_name}/{file}")

if __name__ == "__main__":
    # parser = argparse.ArgumentParser()
    # parser.add_argument("model_name", type=str, default="sam2ai/whisper-odia-small-finetune-int8-ct2", help="Name of the model to download.")
    # args = parser.parse_args()

    download_model("sam2ai/whisper-odia-small-finetune-int8-ct2")
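
For illustration, a usage sketch of the helper above with a different repo id; the committed script hard-codes sam2ai/whisper-odia-small-finetune-int8-ct2 and leaves its argparse interface commented out, so the model name below is just an example.

from download import download_model

# Example only: mirror any public Hugging Face repo into models/<repo_id>/
download_model("openai/whisper-tiny", destination_folder="models")
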
evaluation.py
ADDED
@@ -0,0 +1,96 @@
import argparse
import functools
import gc
import os

import evaluate
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import WhisperForConditionalGeneration, WhisperProcessor

from utils.data_utils import DataCollatorSpeechSeq2SeqWithPadding, remove_punctuation, to_simple
from utils.reader import CustomDataset
from utils.utils import print_arguments, add_arguments

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg("test_data",   type=str, default="dataset/test.json",            help="Path to the test set")
add_arg("model_path",  type=str, default="models/whisper-tiny-finetune", help="Path to the merged model, or a model name on Hugging Face")
add_arg("batch_size",  type=int, default=16,    help="Evaluation batch size")
add_arg("num_workers", type=int, default=8,     help="Number of data-loading workers")
add_arg("language",    type=str, default="Chinese", help="Language, full name or abbreviation; if None, evaluate multilingually")
add_arg("remove_pun",  type=bool, default=True,  help="Whether to remove punctuation")
add_arg("to_simple",   type=bool, default=True,  help="Whether to convert to Simplified Chinese")
add_arg("timestamps",  type=bool, default=False, help="Whether to use timestamp data during evaluation")
add_arg("min_audio_len", type=float, default=0.5, help="Minimum audio length in seconds")
add_arg("max_audio_len", type=float, default=30,  help="Maximum audio length in seconds")
add_arg("local_files_only", type=bool, default=True, help="Only load the model locally, do not try to download")
add_arg("task",   type=str, default="transcribe", choices=['transcribe', 'translate'], help="Model task")
add_arg("metric", type=str, default="cer", choices=['cer', 'wer'], help="Evaluation metric")
args = parser.parse_args()
print_arguments(args)

# Check that the model path is valid
assert 'openai' == os.path.dirname(args.model_path) or os.path.exists(args.model_path), \
    f"Model {args.model_path} does not exist; check that the model was merged successfully, or that it exists on Hugging Face"
# Get the Whisper processor, which contains the feature extractor and tokenizer
processor = WhisperProcessor.from_pretrained(args.model_path,
                                             language=args.language,
                                             task=args.task,
                                             no_timestamps=not args.timestamps,
                                             local_files_only=args.local_files_only)
forced_decoder_ids = processor.get_decoder_prompt_ids()
# Load the model
model = WhisperForConditionalGeneration.from_pretrained(args.model_path,
                                                        device_map="auto",
                                                        local_files_only=args.local_files_only)
model.eval()

# Load the test data
test_dataset = CustomDataset(data_list_path=args.test_data,
                             processor=processor,
                             timestamps=args.timestamps,
                             min_duration=args.min_audio_len,
                             max_duration=args.max_audio_len)
print(f"Test samples: {len(test_dataset)}")

# Data padding collator
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
eval_dataloader = DataLoader(test_dataset, batch_size=args.batch_size,
                             num_workers=args.num_workers, collate_fn=data_collator)

# Get the evaluation metric
metric = evaluate.load(args.metric)

# Start evaluation
for step, batch in enumerate(tqdm(eval_dataloader)):
    with torch.cuda.amp.autocast():
        with torch.no_grad():
            generated_tokens = (
                model.generate(
                    input_features=batch["input_features"].cuda(),
                    decoder_input_ids=batch["labels"][:, :4].cuda(),
                    forced_decoder_ids=forced_decoder_ids,
                    max_new_tokens=255).cpu().numpy())
            labels = batch["labels"].cpu().numpy()
            labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)
            # Convert predicted and reference tokens to text
            decoded_preds = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)
            # Remove punctuation
            if args.remove_pun:
                decoded_preds = remove_punctuation(decoded_preds)
                decoded_labels = remove_punctuation(decoded_labels)
            # Convert Traditional Chinese to Simplified Chinese
            if args.to_simple:
                decoded_preds = to_simple(decoded_preds)
                decoded_labels = to_simple(decoded_labels)
            metric.add_batch(predictions=decoded_preds, references=decoded_labels)
    # Free the per-batch tensors
    del generated_tokens, labels, batch
    gc.collect()
# Compute the evaluation result
m = metric.compute()
print(f"Evaluation result: {args.metric}={round(m, 5)}")
finetune.py
ADDED
@@ -0,0 +1,160 @@
import argparse
import functools
import os
import platform

import torch
from peft import LoraConfig, get_peft_model, AdaLoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, WhisperForConditionalGeneration, WhisperProcessor

from utils.callback import SavePeftModelCallback
from utils.data_utils import DataCollatorSpeechSeq2SeqWithPadding
from utils.model_utils import load_from_checkpoint
from utils.reader import CustomDataset
from utils.utils import print_arguments, make_inputs_require_grad, add_arguments

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg("train_data",    type=str, default="dataset/train.json", help="")
add_arg("test_data",     type=str, default="dataset/test.json",  help="")
add_arg("base_model",    type=str, default="openai/whisper-tiny", help="Whisper")
add_arg("output_dir",    type=str, default="output/", help="")
add_arg("warmup_steps",  type=int, default=50,   help="")
add_arg("logging_steps", type=int, default=100,  help="")
add_arg("eval_steps",    type=int, default=1000, help="")
add_arg("save_steps",    type=int, default=1000, help="")
add_arg("num_workers",   type=int, default=8,    help="")
add_arg("learning_rate", type=float, default=1e-3, help="")
add_arg("min_audio_len", type=float, default=0.5,  help="")
add_arg("max_audio_len", type=float, default=30,   help="")
add_arg("use_adalora",   type=bool, default=True,  help="AdaLora/Lora")
add_arg("fp16",          type=bool, default=True,  help="fp16")
add_arg("use_8bit",      type=bool, default=False, help="8 bit")
add_arg("timestamps",    type=bool, default=False, help="")
add_arg("local_files_only", type=bool, default=False, help="")
add_arg("num_train_epochs", type=int, default=3, help="")
add_arg("language",      type=str, default="bn", help="")
add_arg("task",          type=str, default="transcribe", choices=['transcribe', 'translate'], help="Model task")
add_arg("augment_config_path",    type=str, default=None, help="")
add_arg("resume_from_checkpoint", type=str, default=None, help="")
add_arg("per_device_train_batch_size", type=int, default=8, help="batch size")
add_arg("per_device_eval_batch_size",  type=int, default=8, help="batch size")
add_arg("gradient_accumulation_steps", type=int, default=1, help="")

args = parser.parse_args()
print_arguments(args)


# Whisper tokenizer
processor = WhisperProcessor.from_pretrained(args.base_model,
                                             language=args.language,
                                             task=args.task,
                                             no_timestamps=not args.timestamps,
                                             local_files_only=args.local_files_only)

# Datasets
train_dataset = CustomDataset(data_list_path=args.train_data,
                              processor=processor,
                              language=args.language,
                              timestamps=args.timestamps,
                              min_duration=args.min_audio_len,
                              max_duration=args.max_audio_len,
                              augment_config_path=args.augment_config_path)
test_dataset = CustomDataset(data_list_path=args.test_data,
                             processor=processor,
                             language=args.language,
                             timestamps=args.timestamps,
                             min_duration=args.min_audio_len,
                             max_duration=args.max_audio_len)
print(f"len train - {len(train_dataset)} test len - {len(test_dataset)}")

# padding
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# Whisper
device_map = "auto"
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1
if ddp:
    device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}

model = WhisperForConditionalGeneration.from_pretrained(args.base_model,
                                                        load_in_8bit=args.use_8bit,
                                                        device_map=device_map,
                                                        local_files_only=args.local_files_only)
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
model = prepare_model_for_kbit_training(model)
# forward hook so that inputs require grad
model.model.encoder.conv1.register_forward_hook(make_inputs_require_grad)

print('Loading LoRA modules...')
if args.resume_from_checkpoint:
    print("Loading adapters from checkpoint.")
    model = PeftModel.from_pretrained(model, args.resume_from_checkpoint, is_trainable=True)
else:
    print(f'adding LoRA modules...')
    target_modules = ["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"]
    print(target_modules)
    if args.use_adalora:
        config = AdaLoraConfig(init_r=12, target_r=4, beta1=0.85, beta2=0.85, tinit=200, tfinal=1000, deltaT=10,
                               lora_alpha=32, lora_dropout=0.1, orth_reg_weight=0.5, target_modules=target_modules)
    else:
        config = LoraConfig(r=32, lora_alpha=64, target_modules=target_modules, lora_dropout=0.05, bias="none")
    model = get_peft_model(model, config)

output_dir = os.path.join(args.output_dir, os.path.basename(args.base_model))
training_args = \
    Seq2SeqTrainingArguments(output_dir=output_dir,                                         # Directory to save checkpoints
                             per_device_train_batch_size=args.per_device_train_batch_size,  # Training batch size
                             per_device_eval_batch_size=args.per_device_eval_batch_size,    # Eval batch size
                             gradient_accumulation_steps=args.gradient_accumulation_steps,  # Gradient accumulation steps
                             learning_rate=args.learning_rate,                              # Learning rate
                             warmup_steps=args.warmup_steps,                                # Warm-up steps
                             num_train_epochs=args.num_train_epochs,                        # Epochs
                             save_strategy="steps",
                             evaluation_strategy="steps",
                             load_best_model_at_end=True,
                             fp16=args.fp16,
                             report_to=["tensorboard"],                                     # tensorboard
                             save_steps=args.save_steps,
                             eval_steps=args.eval_steps,
                             save_total_limit=5,
                             optim='adamw_torch',
                             ddp_find_unused_parameters=False if ddp else None,
                             dataloader_num_workers=args.num_workers,
                             logging_steps=args.logging_steps,
                             remove_unused_columns=False,
                             label_names=["labels"])

if training_args.local_rank == 0 or training_args.local_rank == -1:
    print('=' * 90)
    model.print_trainable_parameters()
    print('=' * 90)

# PyTorch 2.0
if torch.__version__ >= "2" and platform.system().lower() == 'windows':
    model = torch.compile(model)

trainer = Seq2SeqTrainer(args=training_args,
                         model=model,
                         train_dataset=train_dataset,
                         eval_dataset=test_dataset,
                         data_collator=data_collator,
                         tokenizer=processor.feature_extractor,
                         callbacks=[SavePeftModelCallback])
model.config.use_cache = False
trainer._load_from_checkpoint = load_from_checkpoint

trainer.train(resume_from_checkpoint=args.resume_from_checkpoint)

trainer.save_state()
if training_args.local_rank == 0 or training_args.local_rank == -1:
    model.save_pretrained(os.path.join(output_dir, "checkpoint-final"))
infer_ct2.py
ADDED
@@ -0,0 +1,46 @@
import argparse
import functools
import os

from faster_whisper import WhisperModel

from utils.utils import print_arguments, add_arguments

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg("audio_path", type=str, default="dataset/test.wav", help="")
add_arg("model_path", type=str, default="models/whisper-tiny-finetune-ct2", help="")
add_arg("language",   type=str, default="zh",   help="")
add_arg("use_gpu",    type=bool, default=True,  help="")
add_arg("use_int8",   type=bool, default=False, help="int8")
add_arg("beam_size",  type=int, default=10,     help="")
add_arg("num_workers", type=int, default=1,     help="")
add_arg("vad_filter", type=bool, default=False, help="")
add_arg("local_files_only", type=bool, default=True, help="")
args = parser.parse_args()
print_arguments(args)

# Check that the model path exists
assert os.path.exists(args.model_path), f"{args.model_path}"
# Load the model
if args.use_gpu:
    if not args.use_int8:
        model = WhisperModel(args.model_path, device="cuda", compute_type="float16", num_workers=args.num_workers,
                             local_files_only=args.local_files_only)
    else:
        model = WhisperModel(args.model_path, device="cuda", compute_type="int8_float16", num_workers=args.num_workers,
                             local_files_only=args.local_files_only)
else:
    model = WhisperModel(args.model_path, device="cpu", compute_type="int8", num_workers=args.num_workers,
                         local_files_only=args.local_files_only)
# Warm-up transcription
_, _ = model.transcribe("dataset/test.wav", beam_size=5)


# Transcribe the target audio
segments, info = model.transcribe(args.audio_path, beam_size=args.beam_size, language=args.language,
                                  vad_filter=args.vad_filter)
for segment in segments:
    text = segment.text
    print(f"[{round(segment.start, 2)} - {round(segment.end, 2)}]:{text}\n")
infer_server.py
ADDED
@@ -0,0 +1,143 @@
import argparse
import asyncio
import functools
import json
import os
from io import BytesIO

import uvicorn
from fastapi import FastAPI, BackgroundTasks, File, Body, UploadFile, Request
from fastapi.responses import StreamingResponse
from faster_whisper import WhisperModel
from starlette.staticfiles import StaticFiles
from starlette.templating import Jinja2Templates
from zhconv import convert

from utils.data_utils import remove_punctuation
from utils.utils import add_arguments, print_arguments

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)

add_arg("host", type=str, default="0.0.0.0", help="")
add_arg("port", type=int, default=5000, help="")
add_arg("model_path", type=str, default="models/sam2ai/whisper-odia-small-finetune-int8-ct2", help="")
add_arg("use_gpu",    type=bool, default=False, help="")
add_arg("use_int8",   type=bool, default=True,  help="")
add_arg("beam_size",  type=int, default=10,     help="")
add_arg("num_workers", type=int, default=2,     help="")
add_arg("vad_filter", type=bool, default=True,  help="")
add_arg("local_files_only", type=bool, default=True, help="")
args = parser.parse_args()
print_arguments(args)

# Check that the model path exists
assert os.path.exists(args.model_path), f"{args.model_path}"
# Load the model
if args.use_gpu:
    if not args.use_int8:
        model = WhisperModel(args.model_path, device="cuda", compute_type="float16",
                             num_workers=args.num_workers, local_files_only=args.local_files_only)
    else:
        model = WhisperModel(args.model_path, device="cuda",
                             compute_type="int8_float16", num_workers=args.num_workers,
                             local_files_only=args.local_files_only)
else:
    model = WhisperModel(args.model_path, device="cpu",
                         compute_type="int8", num_workers=args.num_workers,
                         local_files_only=args.local_files_only)

# Warm-up transcription
# _, _ = model.transcribe("dataset/test.wav", beam_size=5)

app = FastAPI(title="")
app.mount('/static', StaticFiles(directory='static'), name='static')
templates = Jinja2Templates(directory="templates")
model_semaphore = None


def release_model_semaphore():
    model_semaphore.release()


def recognition(file: File, to_simple: int,
                remove_pun: int, language: str = "ory",
                task: str = "transcribe"
                ):
    segments, info = model.transcribe(file, beam_size=10, task=task, language=language, vad_filter=args.vad_filter)
    for segment in segments:
        text = segment.text
        if to_simple == 1:
            # text = convert(text, '')
            pass
        if remove_pun == 1:
            # text = remove_punctuation(text)
            pass
        ret = {"result": text, "start": round(segment.start, 2), "end": round(segment.end, 2)}
        # Stream each segment as a null-terminated JSON chunk
        yield json.dumps(ret).encode() + b"\0"


@app.post("/recognition_stream")
async def api_recognition_stream(
        to_simple: int = Body(1, description="", embed=True),
        remove_pun: int = Body(0, description="", embed=True),
        language: str = Body("ory", description="", embed=True),
        task: str = Body("transcribe", description="", embed=True),
        audio: UploadFile = File(..., description="")
):
    global model_semaphore
    if language == "None": language = None
    if model_semaphore is None:
        model_semaphore = asyncio.Semaphore(5)
    await model_semaphore.acquire()
    contents = await audio.read()
    data = BytesIO(contents)
    generator = recognition(
        file=data, to_simple=to_simple,
        remove_pun=remove_pun, language=language,
        task=task
    )
    background_tasks = BackgroundTasks()
    background_tasks.add_task(release_model_semaphore)
    return StreamingResponse(generator, background=background_tasks)


@app.post("/recognition")
async def api_recognition(
        to_simple: int = Body(1, description="", embed=True),
        remove_pun: int = Body(0, description="", embed=True),
        language: str = Body("ory", description="", embed=True),
        task: str = Body("transcribe", description="", embed=True),
        audio: UploadFile = File(..., description="")
):
    if language == "None": language = None
    contents = await audio.read()
    data = BytesIO(contents)
    generator = recognition(
        file=data, to_simple=to_simple,
        remove_pun=remove_pun, language=language,
        task=task
    )
    results = []
    for output in generator:
        output = json.loads(output[:-1].decode("utf-8"))
        results.append(output)
    ret = {"results": results, "code": 0}
    return ret


@app.get("/")
async def index(request: Request):
    return templates.TemplateResponse(
        "index.html", {"request": request, "id": id}
    )


if __name__ == '__main__':
    uvicorn.run(app, host=args.host, port=args.port)
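
For illustration, a minimal client sketch for the blocking /recognition endpoint above. It assumes the server is reachable at localhost:5001 (the port from the Dockerfile CMD); only the audio file is uploaded, so the other parameters fall back to the endpoint defaults rather than relying on how FastAPI maps Body fields alongside file uploads.

import requests

# Hypothetical local address; adjust to wherever infer_server.py is running.
url = "http://127.0.0.1:5001/recognition"

# Only the audio file is sent; to_simple, remove_pun, language and task keep
# their endpoint defaults (1, 0, "ory", "transcribe").
with open("dataset/test.mp3", "rb") as f:
    response = requests.post(url, files={"audio": ("test.mp3", f, "audio/mpeg")})

response.raise_for_status()
for seg in response.json()["results"]:
    print(f'[{seg["start"]} - {seg["end"]}]: {seg["result"]}')
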
infer_tfs.py
ADDED
@@ -0,0 +1,43 @@
import argparse
import functools

import librosa
from transformers import WhisperForConditionalGeneration, WhisperProcessor

from utils.utils import print_arguments, add_arguments

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg("audio_path", type=str, default="dataset/test.wav", help="")
add_arg("model_path", type=str, default="models/whisper-tiny-finetune", help="")
add_arg("language",   type=str, default="Oriya", help="")
add_arg("task",       type=str, default="transcribe", choices=['transcribe', 'translate'], help="")
add_arg("local_files_only", type=bool, default=True, help="")
args = parser.parse_args()
print_arguments(args)

# Whisper processor
processor = WhisperProcessor.from_pretrained(args.model_path,
                                             language=args.language,
                                             task=args.task,
                                             local_files_only=args.local_files_only)
forced_decoder_ids = processor.get_decoder_prompt_ids(language=args.language, task=args.task)

# Load the model
model = WhisperForConditionalGeneration.from_pretrained(args.model_path,
                                                        device_map="auto",
                                                        local_files_only=args.local_files_only).half()
model.eval()

# Load the audio
sample, sr = librosa.load(args.audio_path, sr=16000)
duration = sample.shape[-1] / sr
assert duration < 30, f"This program is only suitable for inferring audio shorter than 30 seconds; the current audio is {duration} seconds. Use another inference program!"

# Extract input features
input_features = processor(sample, sampling_rate=sr, return_tensors="pt", do_normalize=True).input_features.cuda().half()
# Generate
predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids, max_new_tokens=256)
# Decode to text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print(f"result: {transcription}")
merge_lora.py
ADDED
@@ -0,0 +1,47 @@
import argparse
import functools
import os

from transformers import WhisperForConditionalGeneration, WhisperFeatureExtractor, WhisperTokenizerFast,\
    WhisperProcessor
from peft import PeftModel, PeftConfig
from utils.utils import print_arguments, add_arguments

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg("lora_model", type=str, default="output/whisper-tiny/checkpoint-best/", help="")
add_arg('output_dir', type=str, default='models/', help="")
add_arg("local_files_only", type=bool, default=False, help="")
args = parser.parse_args()
print_arguments(args)

# Check that the LoRA checkpoint exists
assert os.path.exists(args.lora_model), f"{args.lora_model}"
# LoRA config
peft_config = PeftConfig.from_pretrained(args.lora_model)
# Base Whisper model
base_model = WhisperForConditionalGeneration.from_pretrained(peft_config.base_model_name_or_path, device_map={"": "cpu"},
                                                             local_files_only=args.local_files_only)
# Attach the LoRA adapters
model = PeftModel.from_pretrained(base_model, args.lora_model, local_files_only=args.local_files_only)
feature_extractor = WhisperFeatureExtractor.from_pretrained(peft_config.base_model_name_or_path,
                                                            local_files_only=args.local_files_only)
tokenizer = WhisperTokenizerFast.from_pretrained(peft_config.base_model_name_or_path,
                                                 local_files_only=args.local_files_only)
processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path,
                                             local_files_only=args.local_files_only)

# Merge the adapters into the base weights
model = model.merge_and_unload()
model.train(False)

# Output directory
save_directory = os.path.join(args.output_dir, f'{os.path.basename(peft_config.base_model_name_or_path)}-finetune')
os.makedirs(save_directory, exist_ok=True)

# Save the merged model and processor
model.save_pretrained(save_directory)
feature_extractor.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)
processor.save_pretrained(save_directory)
print(f'Model saved to: {save_directory}')
requirements.txt
ADDED
@@ -0,0 +1,21 @@
numpy>=1.23.1
soundfile>=0.12.1
librosa>=0.10.0
dataclasses>=0.6
transformers>=4.31.0
bitsandbytes>=0.41.0
soundfile>=0.12.1
datasets>=2.11.0
evaluate>=0.4.0
faster-whisper>=0.7.0
jiwer>=2.5.1
peft>=0.4.0
accelerate>=0.21.0
zhconv>=1.4.2
tqdm>=4.62.1
soundcard>=0.4.2
uvicorn>=0.21.1
fastapi>=0.95.1
starlette>=0.26.1
tensorboardX>=2.2
python-multipart
run.sh
ADDED
@@ -0,0 +1,22 @@
#!/bin/bash

CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 finetune.py --base_model=openai/whisper-tiny --use_8bit=False --per_device_train_batch_size=8 --per_device_eval_batch_size=8 --gradient_accumulation_steps=1
CUDA_VISIBLE_DEVICES=0 python merge_lora.py --lora_model=output/whisper-tiny/checkpoint-final

CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 finetune.py --base_model=openai/whisper-base --use_8bit=False --per_device_train_batch_size=8 --per_device_eval_batch_size=8 --gradient_accumulation_steps=1
CUDA_VISIBLE_DEVICES=0 python merge_lora.py --lora_model=output/whisper-base/checkpoint-final

CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 finetune.py --base_model=openai/whisper-small --use_8bit=True --per_device_train_batch_size=8 --per_device_eval_batch_size=8 --gradient_accumulation_steps=1
CUDA_VISIBLE_DEVICES=0 python merge_lora.py --lora_model=output/whisper-small/checkpoint-final

CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 finetune.py --base_model=openai/whisper-medium --use_8bit=True --per_device_train_batch_size=4 --per_device_eval_batch_size=2 --gradient_accumulation_steps=2
CUDA_VISIBLE_DEVICES=0 python merge_lora.py --lora_model=output/whisper-medium/checkpoint-final

CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 finetune.py --base_model=openai/whisper-large-v2 --use_8bit=True --per_device_train_batch_size=2 --per_device_eval_batch_size=2 --gradient_accumulation_steps=4
CUDA_VISIBLE_DEVICES=0 python merge_lora.py --lora_model=output/whisper-large-v2/checkpoint-final

CUDA_VISIBLE_DEVICES=0 python evaluation.py --model_path=models/whisper-tiny-finetune
CUDA_VISIBLE_DEVICES=0 python evaluation.py --model_path=models/whisper-base-finetune
CUDA_VISIBLE_DEVICES=0 python evaluation.py --model_path=models/whisper-small-finetune
CUDA_VISIBLE_DEVICES=0 python evaluation.py --model_path=models/whisper-medium-finetune
CUDA_VISIBLE_DEVICES=0 python evaluation.py --model_path=models/whisper-large-v2-finetune
static/index.css
ADDED
@@ -0,0 +1,109 @@
* {
    box-sizing: border-box;
}

body {
    font-family: "Helvetica Neue", "Roboto", sans-serif;
    background-color: #f2f2f2;
    margin: 0;
    padding: 0;
}

#header {
    background-color: #fff;
    color: #333;
    display: flex;
    justify-content: center;
    align-items: center;
    height: 80px;
}

h1 {
    font-size: 36px;
    margin: 0;
}

#content {
    background-color: #fff;
    border-radius: 10px;
    box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
    margin: 50px auto;
    max-width: 800px;
    padding: 20px;
}

#content div {
    display: flex;
    flex-wrap: wrap;
    justify-content: space-between;
    margin-bottom: 20px;
}

#content a {
    background-color: #fff;
    border-radius: 5px;
    box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
    color: #333;
    padding: 10px;
    text-align: center;
    text-decoration: none;
    transition: background-color 0.2s;
    width: 20%;
}

#content a:hover {
    background-color: #f2f2f2;
}

#content img {
    cursor: pointer;
    height: 50px;
    transition: transform 0.2s;
    width: 50px;
}

#content img:hover {
    transform: scale(1.1);
}

#result {
    background-color: #fff;
    border-radius: 5px;
    box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
    padding: 10px;
}

#result textarea {
    border: none;
    border-radius: 5px;
    font-size: 16px;
    height: 100px;
    margin-top: 10px;
    padding: 10px;
    resize: none;
    width: 100%;
}

/* #llm_result {
    background-color: #fff;
    border-radius: 5px;
    box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
    padding: 10px;
}

#llm_result textarea {
    border: none;
    border-radius: 5px;
    font-size: 16px;
    height: 100px;
    margin-top: 10px;
    padding: 10px;
    resize: none;
    width: 100%;
} */

@media only screen and (max-width: 600px) {
    #content a {
        width: 100%;
    }
}
static/record.js
ADDED
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//兼容
|
2 |
+
window.URL = window.URL || window.webkitURL;
|
3 |
+
//获取计算机的设备:摄像头或者录音设备
|
4 |
+
navigator.getUserMedia = navigator.getUserMedia || navigator.webkitGetUserMedia || navigator.mozGetUserMedia || navigator.msGetUserMedia;
|
5 |
+
|
6 |
+
var HZRecorder = function (stream, config) {
|
7 |
+
config = config || {};
|
8 |
+
config.sampleBits = config.sampleBits || 16; //采样数位 8, 16
|
9 |
+
config.sampleRate = config.sampleRate || 16000; //采样率 16000
|
10 |
+
|
11 |
+
//创建一个音频环境对象
|
12 |
+
var audioContext = window.AudioContext || window.webkitAudioContext;
|
13 |
+
var context = new audioContext();
|
14 |
+
var audioInput = context.createMediaStreamSource(stream);
|
15 |
+
// 第二个和第三个参数指的是输入和输出都是单声道,2是双声道。
|
16 |
+
var recorder = context.createScriptProcessor(4096, 2, 2);
|
17 |
+
|
18 |
+
var audioData = {
|
19 |
+
size: 0 //录音文件长度
|
20 |
+
, buffer: [] //录音缓存
|
21 |
+
, inputSampleRate: context.sampleRate //输入采样率
|
22 |
+
, inputSampleBits: 16 //输入采样数位 8, 16
|
23 |
+
, outputSampleRate: config.sampleRate //输出采样率
|
24 |
+
, outputSampleBits: config.sampleBits //输出采样数位 8, 16
|
25 |
+
, input: function (data) {
|
26 |
+
this.buffer.push(new Float32Array(data));
|
27 |
+
this.size += data.length;
|
28 |
+
}
|
29 |
+
, compress: function () { //合并压缩
|
30 |
+
//合并
|
31 |
+
var data = new Float32Array(this.size);
|
32 |
+
var offset = 0;
|
33 |
+
for (var i = 0; i < this.buffer.length; i++) {
|
34 |
+
data.set(this.buffer[i], offset);
|
35 |
+
offset += this.buffer[i].length;
|
36 |
+
}
|
37 |
+
//压缩
|
38 |
+
var compression = parseInt(this.inputSampleRate / this.outputSampleRate);
|
39 |
+
var length = data.length / compression;
|
40 |
+
var result = new Float32Array(length);
|
41 |
+
var index = 0, j = 0;
|
42 |
+
while (index < length) {
|
43 |
+
result[index] = data[j];
|
44 |
+
j += compression;
|
45 |
+
index++;
|
46 |
+
}
|
47 |
+
return result;
|
48 |
+
}
|
49 |
+
, encodeWAV: function () {
|
50 |
+
var sampleRate = Math.min(this.inputSampleRate, this.outputSampleRate);
|
51 |
+
var sampleBits = Math.min(this.inputSampleBits, this.outputSampleBits);
|
52 |
+
var bytes = this.compress();
|
53 |
+
var dataLength = bytes.length * (sampleBits / 8);
|
54 |
+
var buffer = new ArrayBuffer(44 + dataLength);
|
55 |
+
var data = new DataView(buffer);
|
56 |
+
|
57 |
+
var channelCount = 1;//单声道
|
58 |
+
var offset = 0;
|
59 |
+
|
60 |
+
var writeString = function (str) {
|
61 |
+
for (var i = 0; i < str.length; i++) {
|
62 |
+
data.setUint8(offset + i, str.charCodeAt(i));
|
63 |
+
}
|
64 |
+
}
|
65 |
+
|
66 |
+
// 资源交换文件标识符
|
67 |
+
writeString('RIFF');
|
68 |
+
offset += 4;
|
69 |
+
// 下个地址开始到文件尾总字节数,即文件大小-8
|
70 |
+
data.setUint32(offset, 36 + dataLength, true);
|
71 |
+
offset += 4;
|
72 |
+
// WAV文件标志
|
73 |
+
writeString('WAVE');
|
74 |
+
offset += 4;
|
75 |
+
// 波形格式标志
|
76 |
+
writeString('fmt ');
|
77 |
+
offset += 4;
|
78 |
+
// 过滤字节,一般为 0x10 = 16
|
79 |
+
data.setUint32(offset, 16, true);
|
80 |
+
offset += 4;
|
81 |
+
// 格式类别 (PCM形式采样数据)
|
82 |
+
data.setUint16(offset, 1, true);
|
83 |
+
offset += 2;
|
84 |
+
// 通道数
|
85 |
+
data.setUint16(offset, channelCount, true);
|
86 |
+
offset += 2;
|
87 |
+
// Sample rate: samples per second for each channel
|
88 |
+
data.setUint32(offset, sampleRate, true);
|
89 |
+
offset += 4;
|
90 |
+
// Byte rate (average bytes per second): channels x sample rate x bits per sample / 8
|
91 |
+
data.setUint32(offset, channelCount * sampleRate * (sampleBits / 8), true);
|
92 |
+
offset += 4;
|
93 |
+
// Block align (bytes per sample frame): channels x bits per sample / 8
|
94 |
+
data.setUint16(offset, channelCount * (sampleBits / 8), true);
|
95 |
+
offset += 2;
|
96 |
+
// Bits per sample
|
97 |
+
data.setUint16(offset, sampleBits, true);
|
98 |
+
offset += 2;
|
99 |
+
// data chunk identifier
|
100 |
+
writeString('data');
|
101 |
+
offset += 4;
|
102 |
+
// Size of the sample data, i.e. total size minus the 44-byte header
|
103 |
+
data.setUint32(offset, dataLength, true);
|
104 |
+
offset += 4;
|
105 |
+
// Write the sample data
|
106 |
+
if (sampleBits === 8) {
|
107 |
+
for (var i = 0; i < bytes.length; i++, offset++) {
|
108 |
+
var s = Math.max(-1, Math.min(1, bytes[i]));
|
109 |
+
var val = s < 0 ? s * 0x8000 : s * 0x7FFF;
|
110 |
+
val = parseInt(255 / (65535 / (val + 32768)));
|
111 |
+
data.setInt8(offset, val);
|
112 |
+
}
|
113 |
+
} else {
|
114 |
+
for (var i = 0; i < bytes.length; i++, offset += 2) {
|
115 |
+
var s = Math.max(-1, Math.min(1, bytes[i]));
|
116 |
+
data.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
|
117 |
+
}
|
118 |
+
}
|
119 |
+
|
120 |
+
return new Blob([data], {type: 'audio/wav'});
|
121 |
+
}
|
122 |
+
};
|
123 |
+
|
124 |
+
// Start recording
|
125 |
+
this.start = function () {
|
126 |
+
audioInput.connect(recorder);
|
127 |
+
recorder.connect(context.destination);
|
128 |
+
}
|
129 |
+
|
130 |
+
// Stop recording
|
131 |
+
this.stop = function () {
|
132 |
+
recorder.disconnect();
|
133 |
+
}
|
134 |
+
|
135 |
+
// Get the recorded audio as a Blob
|
136 |
+
this.getBlob = function () {
|
137 |
+
this.stop();
|
138 |
+
return audioData.encodeWAV();
|
139 |
+
}
|
140 |
+
|
141 |
+
// Playback
|
142 |
+
this.play = function (audio) {
|
143 |
+
audio.src = window.URL.createObjectURL(this.getBlob());
|
144 |
+
}
|
145 |
+
// Clear the buffer
|
146 |
+
this.clear = function () {
|
147 |
+
audioData.buffer = [];
|
148 |
+
audioData.size = 0;
|
149 |
+
}
|
150 |
+
|
151 |
+
// Upload
|
152 |
+
this.upload = function (url, callback) {
|
153 |
+
var fd = new FormData();
|
154 |
+
// Field name and data to upload
|
155 |
+
fd.append("audio", this.getBlob());
|
156 |
+
var xhr = new XMLHttpRequest();
|
157 |
+
xhr.timeout = 60000
|
158 |
+
if (callback) {
|
159 |
+
xhr.upload.addEventListener("progress", function (e) {
|
160 |
+
callback('uploading', e);
|
161 |
+
}, false);
|
162 |
+
xhr.addEventListener("load", function (e) {
|
163 |
+
callback('ok', e);
|
164 |
+
}, false);
|
165 |
+
xhr.addEventListener("error", function (e) {
|
166 |
+
callback('error', e);
|
167 |
+
}, false);
|
168 |
+
xhr.addEventListener("abort", function (e) {
|
169 |
+
callback('cancel', e);
|
170 |
+
}, false);
|
171 |
+
}
|
172 |
+
xhr.open("POST", url);
|
173 |
+
xhr.send(fd);
|
174 |
+
}
|
175 |
+
|
176 |
+
// Audio capture callback
|
177 |
+
recorder.onaudioprocess = function (e) {
|
178 |
+
audioData.input(e.inputBuffer.getChannelData(0));
|
179 |
+
//record(e.inputBuffer.getChannelData(0));
|
180 |
+
}
|
181 |
+
|
182 |
+
};
|
183 |
+
// Throw an error
|
184 |
+
HZRecorder.throwError = function (message) {
|
185 |
+
alert(message);
|
186 |
+
throw new function () {
|
187 |
+
this.toString = function () {
|
188 |
+
return message;
|
189 |
+
}
|
190 |
+
}
|
191 |
+
}
|
192 |
+
// Whether recording is supported
|
193 |
+
HZRecorder.canRecording = (navigator.getUserMedia != null);
|
194 |
+
// Get a recorder instance
|
195 |
+
HZRecorder.get = function (callback, config) {
|
196 |
+
if (callback) {
|
197 |
+
if (navigator.getUserMedia) {
|
198 |
+
navigator.getUserMedia(
|
199 |
+
{audio: true} // audio only
|
200 |
+
, function (stream) {
|
201 |
+
var rec = new HZRecorder(stream, config);
|
202 |
+
callback(rec);
|
203 |
+
}
|
204 |
+
, function (error) {
|
205 |
+
switch (error.code || error.name) {
|
206 |
+
case 'PERMISSION_DENIED':
|
207 |
+
case 'PermissionDeniedError':
|
208 |
+
HZRecorder.throwError('The user denied permission.');
|
209 |
+
break;
|
210 |
+
case 'NOT_SUPPORTED_ERROR':
|
211 |
+
case 'NotSupportedError':
|
212 |
+
HZRecorder.throwError('The browser does not support the hardware device.');
|
213 |
+
break;
|
214 |
+
case 'MANDATORY_UNSATISFIED_ERROR':
|
215 |
+
case 'MandatoryUnsatisfiedError':
|
216 |
+
HZRecorder.throwError('The specified hardware device could not be found.');
|
217 |
+
break;
|
218 |
+
default:
|
219 |
+
HZRecorder.throwError('Unable to open the microphone. Error: ' + (error.code || error.name));
|
220 |
+
break;
|
221 |
+
}
|
222 |
+
});
|
223 |
+
} else {
|
224 |
+
window.alert('Recording is only available over HTTPS or on a localhost address!')
|
225 |
+
HZRecorder.throwError('The current browser does not support recording.');
|
226 |
+
return;
|
227 |
+
}
|
228 |
+
}
|
229 |
+
};
|
static/record.png
ADDED
static/recording.gif
ADDED
templates/index.html
ADDED
@@ -0,0 +1,167 @@
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
<head>
|
4 |
+
<meta charset="UTF-8">
|
5 |
+
<title>OdiaGenAI Speech Recognition</title>
|
6 |
+
<script type="text/javascript" src="/static/record.js"></script>
|
7 |
+
<link href="/static/index.css" rel="stylesheet" type="text/css"/>
|
8 |
+
</head>
|
9 |
+
<body>
|
10 |
+
<div id="header">
|
11 |
+
<h1>OdiaGenAI Speech Recognition</h1>
|
12 |
+
</div>
|
13 |
+
<div id="content">
|
14 |
+
<div>
|
15 |
+
<a id="upload" onclick="uploadAudioFile()" class="file">select audio file</a>
|
16 |
+
<a id="play_btn" onclick="uploadRecordAudio()" class="file">predict audio file</a>
|
17 |
+
<audio controls autoplay></audio>
|
18 |
+
<img id="record_btn" onclick="record()" src="/static/record.png" alt="record"/>
|
19 |
+
</div>
|
20 |
+
<div id="result">
|
21 |
+
<label for="result_p"></label><textarea id="result_p"></textarea>
|
22 |
+
</div>
|
23 |
+
<!-- <div id="llm_result">
|
24 |
+
<a id="llm_predict" onclick="uploadAudioFile()" class="file">generate text</a>
|
25 |
+
<label for="result_llm"></label><textarea id="result_llm"></textarea>
|
26 |
+
</div> -->
|
27 |
+
</div>
|
28 |
+
<script>
|
29 |
+
let is_recording = false;
|
30 |
+
let is_playing = false;
|
31 |
+
let host = location.origin;
|
32 |
+
let recorder;
|
33 |
+
let audio = document.querySelector('audio');
|
34 |
+
let textarea = document.getElementById('result_p')
|
35 |
+
|
36 |
+
|
37 |
+
function record() {
|
38 |
+
if (is_recording) {
|
39 |
+
is_recording = false;
|
40 |
+
stopRecording()
|
41 |
+
document.getElementById('record_btn').src = '/static/record.png'
|
42 |
+
startPlay();
|
43 |
+
stopPlay();
|
44 |
+
} else {
|
45 |
+
is_recording = true;
|
46 |
+
startRecording()
|
47 |
+
document.getElementById('record_btn').src = '/static/recording.gif'
|
48 |
+
}
|
49 |
+
}
|
50 |
+
|
51 |
+
function play() {
|
52 |
+
if (is_playing) {
|
53 |
+
is_playing = false;
|
54 |
+
stopPlay()
|
55 |
+
document.getElementById('play_btn').innerText = 'play audio'
|
56 |
+
} else {
|
57 |
+
is_playing = true;
|
58 |
+
startPlay()
|
59 |
+
document.getElementById('play_btn').innerText = 'Stop play'
|
60 |
+
}
|
61 |
+
}
|
62 |
+
|
63 |
+
function startRecording() {
|
64 |
+
HZRecorder.get(function (rec) {
|
65 |
+
recorder = rec;
|
66 |
+
recorder.start();
|
67 |
+
});
|
68 |
+
}
|
69 |
+
|
70 |
+
function stopRecording() {
|
71 |
+
recorder.stop();
|
72 |
+
}
|
73 |
+
|
74 |
+
function startPlay() {
|
75 |
+
recorder.play(audio);
|
76 |
+
}
|
77 |
+
|
78 |
+
function stopPlay() {
|
79 |
+
audio.pause();
|
80 |
+
}
|
81 |
+
|
82 |
+
function cancelAudio() {
|
83 |
+
recorder.stop();
|
84 |
+
recorder.clear();
|
85 |
+
}
|
86 |
+
|
87 |
+
function uploadRecordAudio() {
|
88 |
+
recorder.upload(location.origin + "/recognition", function (state, e) {
|
89 |
+
switch (state) {
|
90 |
+
case 'uploading':
|
91 |
+
const percentComplete = Math.round(e.loaded * 100 / e.total) + '%';
|
92 |
+
console.log(percentComplete);
|
93 |
+
break;
|
94 |
+
case 'ok':
|
95 |
+
console.log(e.target.responseText)
|
96 |
+
document.getElementById('result_p').innerHTML = e.target.responseText
|
97 |
+
break;
|
98 |
+
case 'error':
|
99 |
+
alert("upload failed");
|
100 |
+
break;
|
101 |
+
case 'cancel':
|
102 |
+
alert("upload canceled");
|
103 |
+
break;
|
104 |
+
}
|
105 |
+
});
|
106 |
+
}
|
107 |
+
|
108 |
+
//
|
109 |
+
function uploadAudioFile() {
|
110 |
+
const input = document.createElement("input");
|
111 |
+
input.type = "file";
|
112 |
+
input.accept = "audio/*,video/*";
|
113 |
+
input.click();
|
114 |
+
input.onchange = function () {
|
115 |
+
const file = input.files[0];
|
116 |
+
console.log(file)
|
117 |
+
audio.src = window.URL.createObjectURL(file);
|
118 |
+
stopPlay();
|
119 |
+
upload_file(host + "/recognition", file, function (state, e) {
|
120 |
+
switch (state) {
|
121 |
+
case 'uploading':
|
122 |
+
const percentComplete = Math.round(e.loaded * 100 / e.total) + '%';
|
123 |
+
console.log(percentComplete);
|
124 |
+
break;
|
125 |
+
case 'ok':
|
126 |
+
console.log(e.target.responseText)
|
127 |
+
textarea.innerText = e.target.responseText
|
128 |
+
break;
|
129 |
+
case 'error':
|
130 |
+
alert("upload failed");
|
131 |
+
break;
|
132 |
+
case 'cancel':
|
133 |
+
alert("upload canceled");
|
134 |
+
break;
|
135 |
+
}
|
136 |
+
});
|
137 |
+
}
|
138 |
+
}
|
139 |
+
|
140 |
+
//
|
141 |
+
upload_file = function (url, file, callback) {
|
142 |
+
const fd = new FormData();
|
143 |
+
//
|
144 |
+
fd.append("audio", file);
|
145 |
+
const xhr = new XMLHttpRequest();
|
146 |
+
xhr.timeout = 60000
|
147 |
+
if (callback) {
|
148 |
+
xhr.upload.addEventListener("progress", function (e) {
|
149 |
+
callback('uploading', e);
|
150 |
+
}, false);
|
151 |
+
xhr.addEventListener("load", function (e) {
|
152 |
+
callback('ok', e);
|
153 |
+
}, false);
|
154 |
+
xhr.addEventListener("error", function (e) {
|
155 |
+
callback('error', e);
|
156 |
+
}, false);
|
157 |
+
xhr.addEventListener("abort", function (e) {
|
158 |
+
callback('cancel', e);
|
159 |
+
}, false);
|
160 |
+
}
|
161 |
+
xhr.open("POST", url);
|
162 |
+
xhr.send(fd);
|
163 |
+
}
|
164 |
+
</script>
|
165 |
+
|
166 |
+
</body>
|
167 |
+
</html>
|
utils/__init__.py
ADDED
File without changes
|
utils/binary.py
ADDED
@@ -0,0 +1,72 @@
1 |
+
import json
|
2 |
+
import mmap
|
3 |
+
|
4 |
+
import struct
|
5 |
+
|
6 |
+
from tqdm import tqdm
|
7 |
+
|
8 |
+
|
9 |
+
class DatasetWriter(object):
|
10 |
+
def __init__(self, prefix):
|
11 |
+
#
|
12 |
+
self.data_file = open(prefix + '.data', 'wb')
|
13 |
+
self.header_file = open(prefix + '.header', 'wb')
|
14 |
+
self.data_sum = 0
|
15 |
+
self.offset = 0
|
16 |
+
self.header = ''
|
17 |
+
|
18 |
+
def add_data(self, data):
|
19 |
+
key = str(self.data_sum)
|
20 |
+
data = bytes(data, encoding="utf8")
|
21 |
+
#
|
22 |
+
self.data_file.write(struct.pack('I', len(key)))
|
23 |
+
self.data_file.write(key.encode('ascii'))
|
24 |
+
self.data_file.write(struct.pack('I', len(data)))
|
25 |
+
self.data_file.write(data)
|
26 |
+
#
|
27 |
+
self.offset += 4 + len(key) + 4
|
28 |
+
self.header = key + '\t' + str(self.offset) + '\t' + str(len(data)) + '\n'
|
29 |
+
self.header_file.write(self.header.encode('ascii'))
|
30 |
+
self.offset += len(data)
|
31 |
+
self.data_sum += 1
|
32 |
+
|
33 |
+
def close(self):
|
34 |
+
self.data_file.close()
|
35 |
+
self.header_file.close()
|
36 |
+
|
37 |
+
|
38 |
+
class DatasetReader(object):
|
39 |
+
def __init__(self, data_header_path, min_duration=0, max_duration=30):
|
40 |
+
self.keys = []
|
41 |
+
self.offset_dict = {}
|
42 |
+
self.fp = open(data_header_path.replace('.header', '.data'), 'rb')
|
43 |
+
self.m = mmap.mmap(self.fp.fileno(), 0, access=mmap.ACCESS_READ)
|
44 |
+
for line in tqdm(open(data_header_path, 'rb'), desc='Reading data list'):
|
45 |
+
key, val_pos, val_len = line.split('\t'.encode('ascii'))
|
46 |
+
data = self.m[int(val_pos):int(val_pos) + int(val_len)]
|
47 |
+
data = str(data, encoding="utf-8")
|
48 |
+
data = json.loads(data)
|
49 |
+
#
|
50 |
+
if data["duration"] < min_duration:
|
51 |
+
continue
|
52 |
+
if max_duration != -1 and data["duration"] > max_duration:
|
53 |
+
continue
|
54 |
+
self.keys.append(key)
|
55 |
+
self.offset_dict[key] = (int(val_pos), int(val_len))
|
56 |
+
|
57 |
+
#
|
58 |
+
def get_data(self, key):
|
59 |
+
p = self.offset_dict.get(key, None)
|
60 |
+
if p is None:
|
61 |
+
return None
|
62 |
+
val_pos, val_len = p
|
63 |
+
data = self.m[val_pos:val_pos + val_len]
|
64 |
+
data = str(data, encoding="utf-8")
|
65 |
+
return json.loads(data)
|
66 |
+
|
67 |
+
#
|
68 |
+
def get_keys(self):
|
69 |
+
return self.keys
|
70 |
+
|
71 |
+
def __len__(self):
|
72 |
+
return len(self.keys)
|
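For orientation, a minimal usage sketch of the two classes above (not part of the diff; the file prefix and the record fields shown here are assumptions for illustration):

import json
from utils.binary import DatasetWriter, DatasetReader

# Write one JSON record per utterance into prefix.data / prefix.header.
writer = DatasetWriter('dataset/train')
writer.add_data(json.dumps({"audio": {"path": "dataset/test.mp3"},
                            "sentence": "example transcript",
                            "duration": 3.2}, ensure_ascii=False))
writer.close()

# Read the records back; the reader filters by the "duration" field exactly as above.
reader = DatasetReader('dataset/train.header', min_duration=0.5, max_duration=30)
first_key = reader.get_keys()[0]
print(reader.get_data(first_key)["sentence"])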
utils/callback.py
ADDED
@@ -0,0 +1,37 @@
1 |
+
import os
|
2 |
+
import os
|
3 |
+
import shutil
|
4 |
+
|
5 |
+
from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl
|
6 |
+
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
|
7 |
+
|
8 |
+
|
9 |
+
#
|
10 |
+
class SavePeftModelCallback(TrainerCallback):
|
11 |
+
def on_save(self,
|
12 |
+
args: TrainingArguments,
|
13 |
+
state: TrainerState,
|
14 |
+
control: TrainerControl,
|
15 |
+
**kwargs, ):
|
16 |
+
if args.local_rank == 0 or args.local_rank == -1:
|
17 |
+
#
|
18 |
+
checkpoint_folder = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")
|
19 |
+
peft_model_dir = os.path.join(checkpoint_folder, "adapter_model")
|
20 |
+
kwargs["model"].save_pretrained(peft_model_dir)
|
21 |
+
peft_config_path = os.path.join(checkpoint_folder, "adapter_model/adapter_config.json")
|
22 |
+
peft_model_path = os.path.join(checkpoint_folder, "adapter_model/adapter_model.bin")
|
23 |
+
if not os.path.exists(peft_config_path):
|
24 |
+
os.remove(peft_config_path)
|
25 |
+
if not os.path.exists(peft_model_path):
|
26 |
+
os.remove(peft_model_path)
|
27 |
+
if os.path.exists(peft_model_dir):
|
28 |
+
shutil.rmtree(peft_model_dir)
|
29 |
+
#
|
30 |
+
best_checkpoint_folder = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-best")
|
31 |
+
#
|
32 |
+
if os.path.exists(state.best_model_checkpoint):
|
33 |
+
if os.path.exists(best_checkpoint_folder):
|
34 |
+
shutil.rmtree(best_checkpoint_folder)
|
35 |
+
shutil.copytree(state.best_model_checkpoint, best_checkpoint_folder)
|
36 |
+
print(f"Best checkpoint: {state.best_model_checkpoint}, best metric: {state.best_metric}")
|
37 |
+
return control
|
utils/data_utils.py
ADDED
@@ -0,0 +1,65 @@
1 |
+
import re
|
2 |
+
from dataclasses import dataclass
|
3 |
+
from typing import Any, List, Dict, Union
|
4 |
+
|
5 |
+
import torch
|
6 |
+
from zhconv import convert
|
7 |
+
|
8 |
+
|
9 |
+
# Remove punctuation marks
|
10 |
+
def remove_punctuation(text: Union[str, List[str]]):
|
11 |
+
punctuation = '!,.;:?、!,。;:?'
|
12 |
+
if isinstance(text, str):
|
13 |
+
text = re.sub(r'[{}]+'.format(punctuation), '', text).strip()
|
14 |
+
return text
|
15 |
+
elif isinstance(text, list):
|
16 |
+
result_text = []
|
17 |
+
for t in text:
|
18 |
+
t = re.sub(r'[{}]+'.format(punctuation), '', t).strip()
|
19 |
+
result_text.append(t)
|
20 |
+
return result_text
|
21 |
+
else:
|
22 |
+
raise Exception(f'Unsupported type: {type(text)}')
|
23 |
+
|
24 |
+
|
25 |
+
# Convert traditional Chinese to simplified Chinese
|
26 |
+
def to_simple(text: Union[str, List[str]]):
|
27 |
+
if isinstance(text, str):
|
28 |
+
text = convert(text, 'zh-cn')
|
29 |
+
return text
|
30 |
+
elif isinstance(text, list):
|
31 |
+
result_text = []
|
32 |
+
for t in text:
|
33 |
+
t = convert(t, 'zh-cn')
|
34 |
+
result_text.append(t)
|
35 |
+
return result_text
|
36 |
+
else:
|
37 |
+
raise Exception(f'Unsupported type: {type(text)}')
|
38 |
+
|
39 |
+
|
40 |
+
@dataclass
|
41 |
+
class DataCollatorSpeechSeq2SeqWithPadding:
|
42 |
+
processor: Any
|
43 |
+
|
44 |
+
def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
|
45 |
+
# split inputs and labels since they have to be of different lengths and need different padding methods
|
46 |
+
# first treat the audio inputs by simply returning torch tensors
|
47 |
+
input_features = [{"input_features": feature["input_features"][0]} for feature in features]
|
48 |
+
batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
|
49 |
+
|
50 |
+
# get the tokenized label sequences
|
51 |
+
label_features = [{"input_ids": feature["labels"]} for feature in features]
|
52 |
+
# pad the labels to max length
|
53 |
+
labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
|
54 |
+
|
55 |
+
# replace padding with -100 to ignore loss correctly
|
56 |
+
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
|
57 |
+
|
58 |
+
# if bos token is appended in previous tokenization step,
|
59 |
+
# cut bos token here as it's append later anyways
|
60 |
+
if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
|
61 |
+
labels = labels[:, 1:]
|
62 |
+
|
63 |
+
batch["labels"] = labels
|
64 |
+
|
65 |
+
return batch
|
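A short sketch of how the collator defined above is typically plugged in; the checkpoint name is an assumption, not something this diff pins down:

from transformers import WhisperProcessor
from utils.data_utils import DataCollatorSpeechSeq2SeqWithPadding

processor = WhisperProcessor.from_pretrained("openai/whisper-small")
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
# Each dataset item is expected to carry "input_features" and "labels",
# which is what CustomDataset in utils/reader.py returns; the collator pads
# both and masks label padding with -100 so it is ignored by the loss.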
utils/model_utils.py
ADDED
@@ -0,0 +1,20 @@
1 |
+
import bitsandbytes as bnb
|
2 |
+
import torch
|
3 |
+
from transformers.trainer_pt_utils import LabelSmoother
|
4 |
+
|
5 |
+
IGNORE_TOKEN_ID = LabelSmoother.ignore_index
|
6 |
+
|
7 |
+
|
8 |
+
def find_all_linear_names(use_8bit, model):
|
9 |
+
cls = bnb.nn.Linear8bitLt if use_8bit else torch.nn.Linear
|
10 |
+
lora_module_names = set()
|
11 |
+
for name, module in model.named_modules():
|
12 |
+
if isinstance(module, cls):
|
13 |
+
names = name.split('.')
|
14 |
+
lora_module_names.add(names[0] if len(names) == 1 else names[-1])
|
15 |
+
target_modules = list(lora_module_names)
|
16 |
+
return target_modules
|
17 |
+
|
18 |
+
|
19 |
+
def load_from_checkpoint(resume_from_checkpoint, model=None):
|
20 |
+
pass
|
utils/pun_predictor.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import re
|
4 |
+
|
5 |
+
import numpy as np
|
6 |
+
import paddle.inference as paddle_infer
|
7 |
+
from paddlenlp.transformers import ErnieTokenizer
|
8 |
+
|
9 |
+
|
10 |
+
__all__ = ['PunctuationExecutor']
|
11 |
+
|
12 |
+
|
13 |
+
class PunctuationExecutor:
|
14 |
+
def __init__(self, model_dir, use_gpu=True, gpu_mem=500, num_threads=4):
|
15 |
+
# config
|
16 |
+
model_path = os.path.join(model_dir, 'model.pdmodel')
|
17 |
+
params_path = os.path.join(model_dir, 'model.pdiparams')
|
18 |
+
if not os.path.exists(model_path) or not os.path.exists(params_path):
|
19 |
+
raise Exception("Model file not found: {} or {}".format(model_path, params_path))
|
20 |
+
self.config = paddle_infer.Config(model_path, params_path)
|
21 |
+
#
|
22 |
+
pretrained_token = 'ernie-1.0'
|
23 |
+
if os.path.exists(os.path.join(model_dir, 'info.json')):
|
24 |
+
with open(os.path.join(model_dir, 'info.json'), 'r', encoding='utf-8') as f:
|
25 |
+
data = json.load(f)
|
26 |
+
pretrained_token = data['pretrained_token']
|
27 |
+
|
28 |
+
if use_gpu:
|
29 |
+
self.config.enable_use_gpu(gpu_mem, 0)
|
30 |
+
else:
|
31 |
+
self.config.disable_gpu()
|
32 |
+
self.config.set_cpu_math_library_num_threads(num_threads)
|
33 |
+
# enable memory optim
|
34 |
+
self.config.enable_memory_optim()
|
35 |
+
self.config.disable_glog_info()
|
36 |
+
|
37 |
+
# config predictor
|
38 |
+
self.predictor = paddle_infer.create_predictor(self.config)
|
39 |
+
|
40 |
+
#
|
41 |
+
self.input_ids_handle = self.predictor.get_input_handle('input_ids')
|
42 |
+
self.token_type_ids_handle = self.predictor.get_input_handle('token_type_ids')
|
43 |
+
|
44 |
+
#
|
45 |
+
self.output_names = self.predictor.get_output_names()
|
46 |
+
|
47 |
+
self._punc_list = []
|
48 |
+
if not os.path.exists(os.path.join(model_dir, 'vocab.txt')):
|
49 |
+
raise Exception("Vocabulary file not found: {}".format(os.path.join(model_dir, 'vocab.txt')))
|
50 |
+
with open(os.path.join(model_dir, 'vocab.txt'), 'r', encoding='utf-8') as f:
|
51 |
+
for line in f:
|
52 |
+
self._punc_list.append(line.strip())
|
53 |
+
|
54 |
+
self.tokenizer = ErnieTokenizer.from_pretrained(pretrained_token)
|
55 |
+
|
56 |
+
#
|
57 |
+
self('warm up')  # warm-up inference; the original warm-up text was stripped, any non-empty string works here
|
58 |
+
|
59 |
+
def _clean_text(self, text):
|
60 |
+
text = text.lower()
|
61 |
+
text = re.sub('[^A-Za-z0-9\u4e00-\u9fa5]', '', text)
|
62 |
+
text = re.sub(f'[{"".join([p for p in self._punc_list][1:])}]', '', text)
|
63 |
+
return text
|
64 |
+
|
65 |
+
#
|
66 |
+
def preprocess(self, text: str):
|
67 |
+
clean_text = self._clean_text(text)
|
68 |
+
if len(clean_text) == 0: return None
|
69 |
+
tokenized_input = self.tokenizer(list(clean_text), return_length=True, is_split_into_words=True)
|
70 |
+
input_ids = tokenized_input['input_ids']
|
71 |
+
seg_ids = tokenized_input['token_type_ids']
|
72 |
+
seq_len = tokenized_input['seq_len']
|
73 |
+
return input_ids, seg_ids, seq_len
|
74 |
+
|
75 |
+
def infer(self, input_ids: list, seg_ids: list):
|
76 |
+
#
|
77 |
+
self.input_ids_handle.reshape([1, len(input_ids)])
|
78 |
+
self.token_type_ids_handle.reshape([1, len(seg_ids)])
|
79 |
+
self.input_ids_handle.copy_from_cpu(np.array([input_ids]).astype('int64'))
|
80 |
+
self.token_type_ids_handle.copy_from_cpu(np.array([seg_ids]).astype('int64'))
|
81 |
+
|
82 |
+
# predictor
|
83 |
+
self.predictor.run()
|
84 |
+
|
85 |
+
#
|
86 |
+
output_handle = self.predictor.get_output_handle(self.output_names[0])
|
87 |
+
output_data = output_handle.copy_to_cpu()
|
88 |
+
return output_data
|
89 |
+
|
90 |
+
#
|
91 |
+
def postprocess(self, input_ids, seq_len, preds):
|
92 |
+
tokens = self.tokenizer.convert_ids_to_tokens(input_ids[1:seq_len - 1])
|
93 |
+
labels = preds[1:seq_len - 1].tolist()
|
94 |
+
assert len(tokens) == len(labels)
|
95 |
+
|
96 |
+
text = ''
|
97 |
+
for t, l in zip(tokens, labels):
|
98 |
+
text += t
|
99 |
+
if l != 0:
|
100 |
+
text += self._punc_list[l]
|
101 |
+
return text
|
102 |
+
|
103 |
+
def __call__(self, text: str) -> str:
|
104 |
+
#
|
105 |
+
input_ids, seg_ids, seq_len = self.preprocess(text)
|
106 |
+
preds = self.infer(input_ids=input_ids, seg_ids=seg_ids)
|
107 |
+
if len(preds.shape) == 2:
|
108 |
+
preds = preds[0]
|
109 |
+
text = self.postprocess(input_ids, seq_len, preds)
|
110 |
+
return text
|
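A hedged usage sketch for the executor above; the exported-model directory is an assumption, and it must contain the model.pdmodel, model.pdiparams and vocab.txt files checked in __init__:

from utils.pun_predictor import PunctuationExecutor

# Load the exported ERNIE punctuation model once, then call it per transcript.
pun_executor = PunctuationExecutor(model_dir='models/pun_models', use_gpu=False)
print(pun_executor('hello world how are you'))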
utils/reader.py
ADDED
@@ -0,0 +1,289 @@
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import random
|
4 |
+
import sys
|
5 |
+
from typing import List
|
6 |
+
|
7 |
+
import librosa
|
8 |
+
import numpy as np
|
9 |
+
import soundfile
|
10 |
+
from torch.utils.data import Dataset
|
11 |
+
from tqdm import tqdm
|
12 |
+
|
13 |
+
from utils.binary import DatasetReader
|
14 |
+
|
15 |
+
|
16 |
+
class CustomDataset(Dataset):
|
17 |
+
def __init__(self,
|
18 |
+
data_list_path,
|
19 |
+
processor,
|
20 |
+
mono=True,
|
21 |
+
language=None,
|
22 |
+
timestamps=False,
|
23 |
+
sample_rate=16000,
|
24 |
+
min_duration=0.5,
|
25 |
+
max_duration=30,
|
26 |
+
augment_config_path=None):
|
27 |
+
"""
|
28 |
+
Args:
|
29 |
+
data_list_path:
|
30 |
+
processor: Whisper
|
31 |
+
mono: True
|
32 |
+
language:
|
33 |
+
timestamps:
|
34 |
+
sample_rate: 16000
|
35 |
+
min_duration: 0.5s
|
36 |
+
max_duration: 30s
|
37 |
+
augment_config_path:
|
38 |
+
"""
|
39 |
+
super(CustomDataset, self).__init__()
|
40 |
+
assert min_duration >= 0.5, f"min_duration must be at least 0.5 s, got {min_duration}"
|
41 |
+
assert max_duration <= 30, f"max_duration must be at most 30 s, got {max_duration}"
|
42 |
+
self.data_list_path = data_list_path
|
43 |
+
self.processor = processor
|
44 |
+
self.data_list_path = data_list_path
|
45 |
+
self.sample_rate = sample_rate
|
46 |
+
self.mono = mono
|
47 |
+
self.language = language
|
48 |
+
self.timestamps = timestamps
|
49 |
+
self.min_duration = min_duration
|
50 |
+
self.max_duration = max_duration
|
51 |
+
self.vocab = self.processor.tokenizer.get_vocab()
|
52 |
+
self.timestamp_begin = self.vocab['<|notimestamps|>'] + 1
|
53 |
+
self.startoftranscript = self.vocab['<|startoftranscript|>']
|
54 |
+
self.endoftext = self.vocab['<|endoftext|>']
|
55 |
+
self.nocaptions = self.vocab['<|nocaptions|>']
|
56 |
+
self.data_list: List[dict] = []
|
57 |
+
#
|
58 |
+
self._load_data_list()
|
59 |
+
#
|
60 |
+
self.augment_configs = None
|
61 |
+
self.noises_path = None
|
62 |
+
self.speed_rates = None
|
63 |
+
if augment_config_path:
|
64 |
+
with open(augment_config_path, 'r', encoding='utf-8') as f:
|
65 |
+
self.augment_configs = json.load(f)
|
66 |
+
|
67 |
+
#
|
68 |
+
def _load_data_list(self):
|
69 |
+
if self.data_list_path.endswith(".header"):
|
70 |
+
#
|
71 |
+
self.dataset_reader = DatasetReader(data_header_path=self.data_list_path,
|
72 |
+
min_duration=self.min_duration,
|
73 |
+
max_duration=self.max_duration)
|
74 |
+
self.data_list = self.dataset_reader.get_keys()
|
75 |
+
else:
|
76 |
+
#
|
77 |
+
with open(self.data_list_path, 'r', encoding='utf-8') as f:
|
78 |
+
lines = f.readlines()
|
79 |
+
self.data_list = []
|
80 |
+
for line in tqdm(lines, desc='Reading data list'):
|
81 |
+
if isinstance(line, str):
|
82 |
+
line = json.loads(line)
|
83 |
+
if not isinstance(line, dict): continue
|
84 |
+
#
|
85 |
+
if line["duration"] < self.min_duration:
|
86 |
+
continue
|
87 |
+
if self.max_duration != -1 and line["duration"] > self.max_duration:
|
88 |
+
continue
|
89 |
+
self.data_list.append(dict(line))
|
90 |
+
|
91 |
+
#
|
92 |
+
def _get_list_data(self, idx):
|
93 |
+
if self.data_list_path.endswith(".header"):
|
94 |
+
data_list = self.dataset_reader.get_data(self.data_list[idx])
|
95 |
+
else:
|
96 |
+
data_list = self.data_list[idx]
|
97 |
+
#
|
98 |
+
audio_file = data_list["audio"]['path']
|
99 |
+
transcript = data_list["sentences"] if self.timestamps else data_list["sentence"]
|
100 |
+
language = data_list["language"] if 'language' in data_list.keys() else None
|
101 |
+
if 'start_time' not in data_list["audio"].keys():
|
102 |
+
sample, sample_rate = soundfile.read(audio_file, dtype='float32')
|
103 |
+
else:
|
104 |
+
start_time, end_time = data_list["audio"]["start_time"], data_list["audio"]["end_time"]
|
105 |
+
#
|
106 |
+
sample, sample_rate = self.slice_from_file(audio_file, start=start_time, end=end_time)
|
107 |
+
sample = sample.T
|
108 |
+
#
|
109 |
+
if self.mono:
|
110 |
+
sample = librosa.to_mono(sample)
|
111 |
+
#
|
112 |
+
if self.augment_configs:
|
113 |
+
sample, sample_rate = self.augment(sample, sample_rate)
|
114 |
+
#
|
115 |
+
if self.sample_rate != sample_rate:
|
116 |
+
sample = self.resample(sample, orig_sr=sample_rate, target_sr=self.sample_rate)
|
117 |
+
return sample, sample_rate, transcript, language
|
118 |
+
|
119 |
+
def _load_timestamps_transcript(self, transcript: List[dict]):
|
120 |
+
assert isinstance(transcript, list), f"transcript should be a list, got {type(transcript)}"
|
121 |
+
data = dict()
|
122 |
+
labels = self.processor.tokenizer.prefix_tokens[:3]
|
123 |
+
for t in transcript:
|
124 |
+
#
|
125 |
+
start = t['start'] if round(t['start'] * 100) % 2 == 0 else t['start'] + 0.01
|
126 |
+
start = self.timestamp_begin + round(start * 100) // 2
|
127 |
+
end = t['end'] if round(t['end'] * 100) % 2 == 0 else t['end'] - 0.01
|
128 |
+
end = self.timestamp_begin + round(end * 100) // 2
|
129 |
+
label = self.processor(text=t['text']).input_ids[4:-1]
|
130 |
+
labels.extend([start])
|
131 |
+
labels.extend(label)
|
132 |
+
labels.extend([end])
|
133 |
+
data['labels'] = labels + [self.endoftext]
|
134 |
+
return data
|
135 |
+
|
136 |
+
def __getitem__(self, idx):
|
137 |
+
try:
|
138 |
+
#
|
139 |
+
sample, sample_rate, transcript, language = self._get_list_data(idx=idx)
|
140 |
+
#
|
141 |
+
self.processor.tokenizer.set_prefix_tokens(language=language if language is not None else self.language)
|
142 |
+
if len(transcript) > 0:
|
143 |
+
#
|
144 |
+
if self.timestamps:
|
145 |
+
data = self._load_timestamps_transcript(transcript=transcript)
|
146 |
+
#
|
147 |
+
data["input_features"] = self.processor(audio=sample, sampling_rate=self.sample_rate).input_features
|
148 |
+
else:
|
149 |
+
#
|
150 |
+
data = self.processor(audio=sample, sampling_rate=self.sample_rate, text=transcript)
|
151 |
+
else:
|
152 |
+
#
|
153 |
+
data = self.processor(audio=sample, sampling_rate=self.sample_rate)
|
154 |
+
data['labels'] = [self.startoftranscript, self.nocaptions, self.endoftext]
|
155 |
+
return data
|
156 |
+
except Exception as e:
|
157 |
+
print(f'idx:{idx} error - {e}', file=sys.stderr)
|
158 |
+
return self.__getitem__(random.randint(0, self.__len__() - 1))
|
159 |
+
|
160 |
+
def __len__(self):
|
161 |
+
return len(self.data_list)
|
162 |
+
|
163 |
+
#
|
164 |
+
@staticmethod
|
165 |
+
def slice_from_file(file, start, end):
|
166 |
+
sndfile = soundfile.SoundFile(file)
|
167 |
+
sample_rate = sndfile.samplerate
|
168 |
+
duration = round(float(len(sndfile)) / sample_rate, 3)
|
169 |
+
start = round(start, 3)
|
170 |
+
end = round(end, 3)
|
171 |
+
#
|
172 |
+
if start < 0.0: start += duration
|
173 |
+
if end < 0.0: end += duration
|
174 |
+
#
|
175 |
+
if start < 0.0: start = 0.0
|
176 |
+
if end > duration: end = duration
|
177 |
+
if end < 0.0:
|
178 |
+
raise ValueError("Slice end position (%f s) is out of bounds" % end)
|
179 |
+
if start > end:
|
180 |
+
raise ValueError("Slice start (%f s) is later than slice end (%f s)" % (start, end))
|
181 |
+
start_frame = int(start * sample_rate)
|
182 |
+
end_frame = int(end * sample_rate)
|
183 |
+
sndfile.seek(start_frame)
|
184 |
+
sample = sndfile.read(frames=end_frame - start_frame, dtype='float32')
|
185 |
+
return sample, sample_rate
|
186 |
+
|
187 |
+
#
|
188 |
+
def augment(self, sample, sample_rate):
|
189 |
+
for config in self.augment_configs:
|
190 |
+
if config['type'] == 'speed' and random.random() < config['prob']:
|
191 |
+
if self.speed_rates is None:
|
192 |
+
min_speed_rate, max_speed_rate, num_rates = config['params']['min_speed_rate'], \
|
193 |
+
config['params']['max_speed_rate'], config['params']['num_rates']
|
194 |
+
self.speed_rates = np.linspace(min_speed_rate, max_speed_rate, num_rates, endpoint=True)
|
195 |
+
rate = random.choice(self.speed_rates)
|
196 |
+
sample = self.change_speed(sample, speed_rate=rate)
|
197 |
+
if config['type'] == 'shift' and random.random() < config['prob']:
|
198 |
+
min_shift_ms, max_shift_ms = config['params']['min_shift_ms'], config['params']['max_shift_ms']
|
199 |
+
shift_ms = random.randint(min_shift_ms, max_shift_ms)
|
200 |
+
sample = self.shift(sample, sample_rate, shift_ms=shift_ms)
|
201 |
+
if config['type'] == 'volume' and random.random() < config['prob']:
|
202 |
+
min_gain_dBFS, max_gain_dBFS = config['params']['min_gain_dBFS'], config['params']['max_gain_dBFS']
|
203 |
+
gain = random.randint(min_gain_dBFS, max_gain_dBFS)
|
204 |
+
sample = self.volume(sample, gain=gain)
|
205 |
+
if config['type'] == 'resample' and random.random() < config['prob']:
|
206 |
+
new_sample_rates = config['params']['new_sample_rates']
|
207 |
+
new_sample_rate = np.random.choice(new_sample_rates)
|
208 |
+
sample = self.resample(sample, orig_sr=sample_rate, target_sr=new_sample_rate)
|
209 |
+
sample_rate = new_sample_rate
|
210 |
+
if config['type'] == 'noise' and random.random() < config['prob']:
|
211 |
+
min_snr_dB, max_snr_dB = config['params']['min_snr_dB'], config['params']['max_snr_dB']
|
212 |
+
if self.noises_path is None:
|
213 |
+
self.noises_path = []
|
214 |
+
noise_dir = config['params']['noise_dir']
|
215 |
+
if os.path.exists(noise_dir):
|
216 |
+
for file in os.listdir(noise_dir):
|
217 |
+
self.noises_path.append(os.path.join(noise_dir, file))
|
218 |
+
noise_path = random.choice(self.noises_path)
|
219 |
+
snr_dB = random.randint(min_snr_dB, max_snr_dB)
|
220 |
+
sample = self.add_noise(sample, sample_rate, noise_path=noise_path, snr_dB=snr_dB)
|
221 |
+
return sample, sample_rate
|
222 |
+
|
223 |
+
#
|
224 |
+
@staticmethod
|
225 |
+
def change_speed(sample, speed_rate):
|
226 |
+
if speed_rate == 1.0:
|
227 |
+
return sample
|
228 |
+
if speed_rate <= 0:
|
229 |
+
raise ValueError("speed_rate must be greater than 0")
|
230 |
+
old_length = sample.shape[0]
|
231 |
+
new_length = int(old_length / speed_rate)
|
232 |
+
old_indices = np.arange(old_length)
|
233 |
+
new_indices = np.linspace(start=0, stop=old_length, num=new_length)
|
234 |
+
sample = np.interp(new_indices, old_indices, sample).astype(np.float32)
|
235 |
+
return sample
|
236 |
+
|
237 |
+
#
|
238 |
+
@staticmethod
|
239 |
+
def shift(sample, sample_rate, shift_ms):
|
240 |
+
duration = sample.shape[0] / sample_rate
|
241 |
+
if abs(shift_ms) / 1000.0 > duration:
|
242 |
+
raise ValueError("The absolute value of shift_ms must not exceed the audio duration")
|
243 |
+
shift_samples = int(shift_ms * sample_rate / 1000)
|
244 |
+
if shift_samples > 0:
|
245 |
+
sample[:-shift_samples] = sample[shift_samples:]
|
246 |
+
sample[-shift_samples:] = 0
|
247 |
+
elif shift_samples < 0:
|
248 |
+
sample[-shift_samples:] = sample[:shift_samples]
|
249 |
+
sample[:-shift_samples] = 0
|
250 |
+
return sample
|
251 |
+
|
252 |
+
#
|
253 |
+
@staticmethod
|
254 |
+
def volume(sample, gain):
|
255 |
+
sample *= 10.**(gain / 20.)
|
256 |
+
return sample
|
257 |
+
|
258 |
+
#
|
259 |
+
@staticmethod
|
260 |
+
def resample(sample, orig_sr, target_sr):
|
261 |
+
sample = librosa.resample(sample, orig_sr=orig_sr, target_sr=target_sr)
|
262 |
+
return sample
|
263 |
+
|
264 |
+
#
|
265 |
+
def add_noise(self, sample, sample_rate, noise_path, snr_dB, max_gain_db=300.0):
|
266 |
+
noise_sample, sr = librosa.load(noise_path, sr=sample_rate)
|
267 |
+
#
|
268 |
+
target_db = -20
|
269 |
+
gain = min(max_gain_db, target_db - self.rms_db(sample))
|
270 |
+
sample *= 10. ** (gain / 20.)
|
271 |
+
#
|
272 |
+
sample_rms_db, noise_rms_db = self.rms_db(sample), self.rms_db(noise_sample)
|
273 |
+
noise_gain_db = min(sample_rms_db - noise_rms_db - snr_dB, max_gain_db)
|
274 |
+
noise_sample *= 10. ** (noise_gain_db / 20.)
|
275 |
+
#
|
276 |
+
if noise_sample.shape[0] < sample.shape[0]:
|
277 |
+
diff_duration = sample.shape[0] - noise_sample.shape[0]
|
278 |
+
noise_sample = np.pad(noise_sample, (0, diff_duration), 'wrap')
|
279 |
+
elif noise_sample.shape[0] > sample.shape[0]:
|
280 |
+
start_frame = random.randint(0, noise_sample.shape[0] - sample.shape[0])
|
281 |
+
noise_sample = noise_sample[start_frame:sample.shape[0] + start_frame]
|
282 |
+
sample += noise_sample
|
283 |
+
return sample
|
284 |
+
|
285 |
+
@staticmethod
|
286 |
+
def rms_db(sample):
|
287 |
+
mean_square = np.mean(sample ** 2)
|
288 |
+
return 10 * np.log10(mean_square)
|
289 |
+
|
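For reference, a minimal sketch of constructing the dataset above for fine-tuning; the checkpoint name and data list path are assumptions, and each line of the list is expected to be a JSON object with "audio" (path), "sentence" and "duration" fields:

from transformers import WhisperProcessor
from utils.reader import CustomDataset

processor = WhisperProcessor.from_pretrained("openai/whisper-small")
train_dataset = CustomDataset(data_list_path="dataset/train.json",
                              processor=processor,
                              min_duration=0.5,
                              max_duration=30,
                              augment_config_path="configs/augmentation.json")
print(len(train_dataset))
print(train_dataset[0].keys())  # input_features and labels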
utils/utils.py
ADDED
@@ -0,0 +1,87 @@
1 |
+
import hashlib
|
2 |
+
import os
|
3 |
+
import tarfile
|
4 |
+
import urllib.request
|
5 |
+
|
6 |
+
from tqdm import tqdm
|
7 |
+
|
8 |
+
|
9 |
+
def print_arguments(args):
|
10 |
+
print("----------- Configuration Arguments -----------")
|
11 |
+
for arg, value in vars(args).items():
|
12 |
+
print("%s: %s" % (arg, value))
|
13 |
+
print("------------------------------------------------")
|
14 |
+
|
15 |
+
|
16 |
+
def strtobool(val):
|
17 |
+
val = val.lower()
|
18 |
+
if val in ('y', 'yes', 't', 'true', 'on', '1'):
|
19 |
+
return True
|
20 |
+
elif val in ('n', 'no', 'f', 'false', 'off', '0'):
|
21 |
+
return False
|
22 |
+
else:
|
23 |
+
raise ValueError("invalid truth value %r" % (val,))
|
24 |
+
|
25 |
+
|
26 |
+
def str_none(val):
|
27 |
+
if val == 'None':
|
28 |
+
return None
|
29 |
+
else:
|
30 |
+
return val
|
31 |
+
|
32 |
+
|
33 |
+
def add_arguments(argname, type, default, help, argparser, **kwargs):
|
34 |
+
type = strtobool if type == bool else type
|
35 |
+
type = str_none if type == str else type
|
36 |
+
argparser.add_argument("--" + argname,
|
37 |
+
default=default,
|
38 |
+
type=type,
|
39 |
+
help=help + ' Default: %(default)s.',
|
40 |
+
**kwargs)
|
41 |
+
|
42 |
+
|
43 |
+
def md5file(fname):
|
44 |
+
hash_md5 = hashlib.md5()
|
45 |
+
f = open(fname, "rb")
|
46 |
+
for chunk in iter(lambda: f.read(4096), b""):
|
47 |
+
hash_md5.update(chunk)
|
48 |
+
f.close()
|
49 |
+
return hash_md5.hexdigest()
|
50 |
+
|
51 |
+
|
52 |
+
def download(url, md5sum, target_dir):
|
53 |
+
"""Download file from url to target_dir, and check md5sum."""
|
54 |
+
if not os.path.exists(target_dir): os.makedirs(target_dir)
|
55 |
+
filepath = os.path.join(target_dir, url.split("/")[-1])
|
56 |
+
if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
|
57 |
+
print(f"Downloading {url} to {filepath} ...")
|
58 |
+
with urllib.request.urlopen(url) as source, open(filepath, "wb") as output:
|
59 |
+
with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True,
|
60 |
+
unit_divisor=1024) as loop:
|
61 |
+
while True:
|
62 |
+
buffer = source.read(8192)
|
63 |
+
if not buffer:
|
64 |
+
break
|
65 |
+
|
66 |
+
output.write(buffer)
|
67 |
+
loop.update(len(buffer))
|
68 |
+
print(f"\nMD5 Checksum {filepath} ...")
|
69 |
+
if not md5file(filepath) == md5sum:
|
70 |
+
raise RuntimeError("MD5 checksum failed.")
|
71 |
+
else:
|
72 |
+
print(f"File exists, skip downloading. ({filepath})")
|
73 |
+
return filepath
|
74 |
+
|
75 |
+
|
76 |
+
def unpack(filepath, target_dir, rm_tar=False):
|
77 |
+
"""Unpack the file to the target_dir."""
|
78 |
+
print("Unpacking %s ..." % filepath)
|
79 |
+
tar = tarfile.open(filepath)
|
80 |
+
tar.extractall(target_dir)
|
81 |
+
tar.close()
|
82 |
+
if rm_tar:
|
83 |
+
os.remove(filepath)
|
84 |
+
|
85 |
+
|
86 |
+
def make_inputs_require_grad(module, input, output):
|
87 |
+
output.requires_grad_(True)
|
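Finally, a short sketch of how the argument helpers above can be wired into an argparse-based script; the flag names and defaults below are placeholders, not values taken from this diff:

import argparse
import functools

from utils.utils import add_arguments, print_arguments

parser = argparse.ArgumentParser()
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg("model_path", type=str, default="models/whisper-small", help="Path to the model.")
add_arg("use_gpu", type=bool, default=True, help="Whether to run on the GPU.")
args = parser.parse_args()
print_arguments(args)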