whisper
.zshrc
CHANGED
@@ -8,6 +8,15 @@
 # - conda-env: Adds support for Conda environment management
 # 4. Set the custom theme for the shell prompt
 
+# The `export QT_QPA_PLATFORM=offscreen` command sets the `QT_QPA_PLATFORM`
+# environment variable to `offscreen`. This is particularly useful when running
+# Qt applications in a headless environment, such as a server or a CI/CD
+# pipeline, where no display server is available. With this variable set, Qt
+# applications render their graphical output offscreen, allowing them to run
+# without requiring a graphical user interface (GUI). This is commonly used for
+# automated testing, rendering, or other tasks that do not require user interaction.
+export QT_QPA_PLATFORM=offscreen
+
 # Load the custom git wrapper script
 source $HOME/toolkit/git-wrapper.zsh
 
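For reference, one quick way to confirm the variable takes effect in a headless shell is to start a minimal Qt application and print the platform it selected. This is only a sketch, assuming a Python Qt binding such as PySide6 happens to be installed; the binding choice and the platformName() check are illustrative and are not part of the commit.

# Minimal headless check (assumes PySide6; any Qt binding exposing QGuiApplication would do)
import os
os.environ.setdefault("QT_QPA_PLATFORM", "offscreen")  # same effect as the .zshrc export

from PySide6.QtGui import QGuiApplication

app = QGuiApplication([])
print(app.platformName())  # expected to print "offscreen" when the variable is in effect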
whisper
ADDED
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+This script uses the Whisper large-v3-turbo model from OpenAI for automatic speech recognition (ASR).
+The model is fine-tuned for faster performance with a minor quality trade-off. It leverages the Hugging Face
+Transformers library to load the model and processor, and performs transcription on an input audio file.
+
+Whisper is a state-of-the-art model for ASR and speech translation, proposed in the paper "Robust Speech
+Recognition via Large-Scale Weak Supervision" by Alec Radford et al. from OpenAI. Trained on over 5 million
+hours of labeled data, Whisper demonstrates a strong ability to generalize to many datasets and domains in
+a zero-shot setting.
+
+The script performs the following steps:
+1. Checks if a CUDA-enabled GPU is available and sets the appropriate device and data type.
+2. Loads the Whisper large-v3-turbo model and processor from the Hugging Face Hub.
+3. Initializes an ASR pipeline using the model and processor.
+4. Defines a function `transcribe_audio` that takes an audio file path as input, performs transcription,
+   and outputs the result to the terminal and a text file.
+5. Expects an audio file path as a command-line argument and calls the `transcribe_audio` function.
+
+Usage:
+    whisper <audio_file>
+
+Dependencies:
+    - torch
+    - transformers
+    - datasets
+    - accelerate
+
+Example:
+    whisper sample_audio.wav
+"""
+
+import torch
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+import sys
+import os
+
+# Use a CUDA GPU with float16 if available, otherwise fall back to CPU with float32
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+model_id = "openai/whisper-large-v3-turbo"
+
+model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+)
+model.to(device)
+
+processor = AutoProcessor.from_pretrained(model_id)
+
+pipe = pipeline(
+    "automatic-speech-recognition",
+    model=model,
+    tokenizer=processor.tokenizer,
+    feature_extractor=processor.feature_extractor,
+    torch_dtype=torch_dtype,
+    device=device,
+)
+
+
+def transcribe_audio(audio_path):
+    # Perform transcription; the ASR pipeline accepts a path to a local
+    # audio file and handles loading and resampling itself
+    result = pipe(audio_path)
+
+    # Derive the output text file path from the audio file path
+    base_filename = os.path.splitext(audio_path)[0]
+    output_text_path = base_filename + ".txt"
+
+    # Output the result to the terminal
+    print(result["text"])
+
+    # Save the result to a text file
+    with open(output_text_path, "w") as f:
+        f.write(result["text"])
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: whisper <audio_file>")
+        sys.exit(1)
+
+    audio_file = sys.argv[1]
+    transcribe_audio(audio_file)
+
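As in the docstring, running whisper sample_audio.wav prints the transcript and writes it next to the input as sample_audio.txt. For long recordings, the same pipeline call can also chunk the audio and return timestamps; a hedged variant of the call inside transcribe_audio, with illustrative parameter values that the script itself does not set, might look like:

# Sketch of a long-form variant of the pipe(...) call (parameter values are illustrative)
result = pipe(
    audio_path,
    chunk_length_s=30,       # split long audio into ~30 s chunks
    batch_size=8,            # transcribe several chunks per forward pass
    return_timestamps=True,  # also return segment timestamps in result["chunks"]
)
print(result["text"])        # the full transcript, as in the script above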