devasheeshG committed
Commit 7d5210d
1 Parent(s): 013bf1c

updated code

Files changed (2)
  1. README.md +1 -1
  2. __init__.py +40 -28
README.md CHANGED
@@ -266,7 +266,7 @@ language:
 | Original_Model (54 min) | 52.02 | 47.86 | 66.82 | 33.17 | 23.76 |
 | This_Model (38 min) | 54.97 | 47.86 | 66.83 | 33.16 | 30.23 |
 
-### Hindi to English (test.tsv) [Common Voice 14.0](https://commonvoice.mozilla.org/en/datasets)
+### Hindi to English (test.tsv) [Custom Dataset](https://huggingface.co/datasets/devasheeshG/common_voices_14_0_hi2en_hi2hi)
 
 **Test done on RTX 3060 on 1000 Samples**
 
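For context on how rows like these could be produced, below is a minimal evaluation sketch. The dataset column names (`audio`, `translation`), the use of `jiwer` for scoring, and the model repo id are assumptions, not part of this commit; `Model` and `pad_or_trim` come from the `__init__.py` shown below.

```python
# Hypothetical benchmark loop behind the table above -- a sketch, not the
# author's actual script. Dataset column names and the jiwer WER metric
# are assumptions; <model-repo-id> is a placeholder for this model's Hub id.
from datasets import load_dataset
import jiwer

model = Model(model_name_or_path="<model-repo-id>")

# "Test done on RTX 3060 on 1000 Samples" per the README
ds = load_dataset("devasheeshG/common_voices_14_0_hi2en_hi2hi", split="test")
ds = ds.select(range(1000))

refs, hyps = [], []
for sample in ds:
    waveform = pad_or_trim(sample["audio"]["array"])  # clamp to the 30 s window
    hyps.append(model.transcribe(waveform, language="english"))
    refs.append(sample["translation"])

print("WER:", jiwer.wer(refs, hyps))
```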
__init__.py CHANGED
@@ -1,5 +1,7 @@
 from transformers import (
-    WhisperForConditionalGeneration, WhisperProcessor, WhisperConfig,
+    WhisperForConditionalGeneration,
+    WhisperProcessor,
+    WhisperConfig,
 )
 import torch
 import ffmpeg
@@ -13,6 +15,7 @@ SAMPLE_RATE = 16000
 CHUNK_LENGTH = 30 # 30-second chunks
 N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000 samples in a 30-second chunk
 
+
 # audio = whisper.load_audio('test.wav')
 def load_audio(file: str, sr: int = SAMPLE_RATE, start_time: int = 0, dtype=np.float16):
     """
@@ -59,55 +62,64 @@ def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
 
     return array
 
+
 class Model:
-    def __init__(self,
-        model_name_or_path: str,
-        cuda_visible_device: str = "0",
-        device: str = 'cuda' # torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    ):
-
+    def __init__(
+        self,
+        model_name_or_path: str,
+        cuda_visible_device: str = "0",
+        device: str = "cuda",  # torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    ):
         os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_device
         self.DEVICE = device
-
+
         self.processor = WhisperProcessor.from_pretrained(model_name_or_path)
         self.tokenizer = self.processor.tokenizer
 
         self.config = WhisperConfig.from_pretrained(model_name_or_path)
 
         self.model = WhisperForConditionalGeneration(
-            config=self.config
-        ).from_pretrained(
-            pretrained_model_name_or_path = model_name_or_path,
-            torch_dtype = self.config.torch_dtype,
-            # device_map=DEVICE, # 'balanced', 'balanced_low_0', 'sequential', 'cuda', 'cpu'
-            low_cpu_mem_usage = True,
-        )
-
+            config=self.config
+        ).from_pretrained(
+            pretrained_model_name_or_path=model_name_or_path,
+            torch_dtype=self.config.torch_dtype,
+            # device_map=DEVICE, # 'balanced', 'balanced_low_0', 'sequential', 'cuda', 'cpu'
+            low_cpu_mem_usage=True,
+        )
+
         # Move model to GPU
         if self.model.device.type != self.DEVICE:
-            print(f'Moving model to {self.DEVICE}')
+            print(f"Moving model to {self.DEVICE}")
             self.model = self.model.to(self.DEVICE)
             self.model.eval()
 
         else:
-            print(f'Model is already on {self.DEVICE}')
+            print(f"Model is already on {self.DEVICE}")
             self.model.eval()
-
-        print('dtype of model acc to config: ', self.config.torch_dtype)
-        print('dtype of loaded model: ', self.model.dtype)
-
-    def transcribe(self, audio, language: str = "english", skip_special_tokens: bool = True) -> str:
-        input_features = self.processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt").input_features.half().to(self.DEVICE)
+
+        print("dtype of model acc to config: ", self.config.torch_dtype)
+        print("dtype of loaded model: ", self.model.dtype)
+
+    def transcribe(
+        self, audio, language: str = "english", skip_special_tokens: bool = True
+    ) -> str:
+        input_features = (
+            self.processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt")
+            .input_features.half()
+            .to(self.DEVICE)
+        )
         with torch.no_grad():
             predicted_ids = self.model.generate(
                 input_features,
-                num_beams = 1,
+                num_beams=1,
                 language=language,
                 task="transcribe",
                 use_cache=True,
                 is_multilingual=True,
                 return_timestamps=True,
             )
-
-        transcription = self.tokenizer.batch_decode(predicted_ids, skip_special_tokens=skip_special_tokens)[0]
-        return transcription.strip()
+
+        transcription = self.tokenizer.batch_decode(
+            predicted_ids, skip_special_tokens=skip_special_tokens
+        )[0]
+        return transcription.strip()
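
Taken together, the refactored module would be used roughly like this. This is a minimal sketch: the import path, repo id, and audio file name are placeholders, not part of the commit.

```python
# Minimal usage sketch of the updated module -- import path, repo id, and
# file name below are placeholders, not part of the commit.
from whisper_model import Model, load_audio, pad_or_trim  # hypothetical import path

model = Model(
    model_name_or_path="<model-repo-id>",  # this model's Hugging Face Hub id
    cuda_visible_device="0",
    device="cuda",
)

audio = load_audio("sample.wav")  # decodes to 16 kHz mono float16 via ffmpeg
audio = pad_or_trim(audio)        # pad/trim to 30 s (480000 samples)
print(model.transcribe(audio, language="english"))
```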