elsayedissa committed
Commit bb2e2b4
1 Parent(s): 558ea70

Update README.md

Files changed (1)
  1. README.md +13 -39
README.md CHANGED
@@ -5,7 +5,7 @@ tags:
  metrics:
  - wer
  model-index:
- - name: whisper-large-v2-arabic-5k-steps
+ - name: whisper-large-v2-english-2k-steps
    results: []
  datasets:
  - mozilla-foundation/common_voice_11_0
@@ -16,16 +16,13 @@ language:
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
  should probably proofread and complete it, then remove this comment. -->
 
- # whisper-large-v2-arabic-5k-steps
+ # whisper-large-v2-english-2k-steps
 
  This model is a fine-tuned version of [openai/whisper-large-v2](https://huggingface.co/openai/whisper-large-v2) on the Arabic CommonVoice dataset (v11).
- It achieves the following results on the evaluation set:
- - Loss: 0.3434
- - Wer: 0.4239
 
  ## Model description
 
- This model is fine-tuned for 5000 steps for research purposes, which means the transcriptions may not be fully satisfactory for users.
+ This model is fine-tuned for 2000 steps for research purposes, which means the transcriptions may not be fully satisfactory for users.
 
  ## Training and evaluation data
 
@@ -48,16 +45,6 @@ The following hyperparameters were used during training:
  - training_steps: 5000
  - mixed_precision_training: Native AMP
 
- ### Training results
- 
- | Training Loss | Epoch | Step | Validation Loss | Wer |
- |:-------------:|:-----:|:----:|:---------------:|:------:|
- | 0.1638 | 1.78 | 1000 | 0.2295 | 0.4410 |
- | 0.0587 | 3.57 | 2000 | 0.2337 | 0.4272 |
- | 0.0125 | 5.35 | 3000 | 0.2745 | 0.4208 |
- | 0.004 | 7.13 | 4000 | 0.3124 | 0.4252 |
- | 0.0016 | 8.91 | 5000 | 0.3434 | 0.4239 |
- 
  ### Transcription:
 
  ```python
@@ -69,12 +56,12 @@ from transformers import WhisperProcessor, WhisperForConditionalGeneration
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
  # load the model
- processor = WhisperProcessor.from_pretrained("clu-ling/whisper-large-v2-arabic-5k-steps")
- model = WhisperForConditionalGeneration.from_pretrained("clu-ling/whisper-large-v2-arabic-5k-steps").to(device)
- forced_decoder_ids = processor.get_decoder_prompt_ids(language="ar", task="transcribe")
+ processor = WhisperProcessor.from_pretrained("clu-ling/whisper-large-v2-english-2k-steps")
+ model = WhisperForConditionalGeneration.from_pretrained("clu-ling/whisper-large-v2-english-2k-steps").to(device)
+ forced_decoder_ids = processor.get_decoder_prompt_ids(language="en", task="transcribe")
 
  # load the dataset
- commonvoice_eval = load_dataset("mozilla-foundation/common_voice_11_0", "ar", split="validation", streaming=True)
+ commonvoice_eval = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="validation", streaming=True)
  commonvoice_eval = commonvoice_eval.cast_column("audio", Audio(sampling_rate=16000))
  sample = next(iter(commonvoice_eval))["audio"]
 
@@ -86,7 +73,6 @@ predicted_ids = model.generate(input_features.to(device), forced_decoder_ids=for
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
 
  print("Transcription:", transcription)
- Transcription: عمي هو أخو أبي.
  ```
 
  ### Evaluation:
@@ -94,7 +80,6 @@ Transcription: عمي هو أخو أبي.
  Evaluates this model on the `mozilla-foundation/common_voice_11_0` test split.
 
  ```python
- import pyarabic.araby as araby
  from transformers.models.whisper.english_normalizer import BasicTextNormalizer
  from datasets import load_dataset, Audio
  import evaluate
@@ -109,40 +94,29 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  wer_metric = evaluate.load("wer")
 
  # model
- processor = WhisperProcessor.from_pretrained("clu-ling/whisper-large-v2-arabic-5k-steps")
- model = WhisperForConditionalGeneration.from_pretrained("clu-ling/whisper-large-v2-arabic-5k-steps")
+ processor = WhisperProcessor.from_pretrained("clu-ling/whisper-large-v2-english-2k-steps")
+ model = WhisperForConditionalGeneration.from_pretrained("clu-ling/whisper-large-v2-english-2k-steps")
 
  # dataset
- dataset = load_dataset("mozilla-foundation/common_voice_11_0", "ar", split="test")  # cache_dir=args.cache_dir
+ dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="test")  # cache_dir=args.cache_dir
  dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
 
  # for debugging: it gets two examples
  #dataset = dataset.shard(num_shards=10000, index=0)
  #print(dataset)
- 
- def clean_text(text):
-     """Normalizes TRANSCRIPT"""
-     text = re.sub(r'[\,\?\.\!\-\;\:\"\“\%\٪\‘\”\�\«\»\،\.\:\؟\؛\*\>\<]', '', text) + " "  # special characters
-     text = re.sub(r'http\S+', '', text) + " "  # links
-     text = re.sub(r'[\[\]\(\)\-\/\{\}]', '', text) + " "  # brackets
-     text = re.sub(r'\s+', ' ', text) + " "  # extra white space
-     text = araby.strip_diacritics(text)  # remove diacritics
-     return text.strip()
 
  def normalize(batch):
-     """Normalizes GOLD"""
-     #batch["gold_text"] = whisper_norm(batch['sentence'])
-     batch["gold_text"] = clean_text(batch['sentence'])
+     batch["gold_text"] = whisper_norm(batch['sentence'])
      return batch
 
  def map_wer(batch):
      model.to(device)
-     forced_decoder_ids = processor.get_decoder_prompt_ids(language="ar", task="transcribe")
+     forced_decoder_ids = processor.get_decoder_prompt_ids(language="en", task="transcribe")
      inputs = processor(batch["audio"]["array"], sampling_rate=batch["audio"]["sampling_rate"], return_tensors="pt").input_features
      with torch.no_grad():
          generated_ids = model.generate(inputs=inputs.to(device), forced_decoder_ids=forced_decoder_ids)
      transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-     batch["predicted_text"] = clean_text(transcription)
+     batch["predicted_text"] = whisper_norm(transcription)
      return batch
 
  # process GOLD text
 
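The diff ends mid-script at `# process GOLD text`, so the commit does not show how the evaluation concludes. Below is a minimal sketch of a plausible ending, not part of the commit: it assumes `whisper_norm` is an instance of the imported `BasicTextNormalizer` (its instantiation falls outside the shown hunks) and reuses the `dataset`, `normalize`, `map_wer`, and `wer_metric` names defined above.

```python
# Hypothetical completion of the evaluation script; the diff stops at
# "# process GOLD text", so everything below is an assumption.
whisper_norm = BasicTextNormalizer()  # assumed: referenced by normalize() and map_wer()

# process GOLD text: add a normalized reference column
dataset = dataset.map(normalize)

# transcribe each example and add a normalized prediction column
dataset = dataset.map(map_wer)

# corpus-level word error rate over the test split
wer = wer_metric.compute(references=dataset["gold_text"], predictions=dataset["predicted_text"])
print("WER:", wer)
```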