birgermoell committed on
Commit
aa2d5b7
1 Parent(s): bbdd9bc

Updated files

Browse files
Files changed (2) hide show
  1. get_tokens.py +15 -0
  2. lm.py +1 -0
get_tokens.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import Wav2Vec2ProcessorWithLM
2
+ import torchaudio
3
+
4
+ import torch
5
+ from datasets import load_dataset
6
+ from transformers import AutoModelForCTC, AutoProcessor
7
+ import torchaudio.functional as F
8
+
9
+ model_id = "."
10
+ model = AutoModelForCTC.from_pretrained(model_id)
11
+ processor = AutoProcessor.from_pretrained(model_id)
12
+
13
+ vocab_dict = processor.tokenizer.get_vocab()
14
+ print(vocab_dict)
15
+ sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}
lm.py CHANGED
@@ -10,6 +10,7 @@ import torchaudio.functional as F
10
 
11
  model_id = "."
12
 
 
13
  sample_iter = iter(load_dataset("mozilla-foundation/common_voice_7_0", "sv-SE", split="test", streaming=True, use_auth_token=True))
14
 
15
  sample = next(sample_iter)
 
10
 
11
  model_id = "."
12
 
13
+
14
  sample_iter = iter(load_dataset("mozilla-foundation/common_voice_7_0", "sv-SE", split="test", streaming=True, use_auth_token=True))
15
 
16
  sample = next(sample_iter)