birgermoell
committed on
Commit
·
aa2d5b7
1
Parent(s):
bbdd9bc
Updated files
Browse files- get_tokens.py +15 -0
- lm.py +1 -0
get_tokens.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import Wav2Vec2ProcessorWithLM
|
2 |
+
import torchaudio
|
3 |
+
|
4 |
+
import torch
|
5 |
+
from datasets import load_dataset
|
6 |
+
from transformers import AutoModelForCTC, AutoProcessor
|
7 |
+
import torchaudio.functional as F
|
8 |
+
|
9 |
+
model_id = "."
|
10 |
+
model = AutoModelForCTC.from_pretrained(model_id)
|
11 |
+
processor = AutoProcessor.from_pretrained(model_id)
|
12 |
+
|
13 |
+
# Fetch the vocabulary mapping from the processor's tokenizer; the lines below print it and sort it by value.
vocab_dict = processor.tokenizer.get_vocab()
|
14 |
+
print(vocab_dict)
|
15 |
+
# Re-key the vocabulary with lower-cased tokens, inserting entries in ascending order of their mapped value.
sorted_vocab_dict = dict(
    (token.lower(), index)
    for token, index in sorted(vocab_dict.items(), key=lambda pair: pair[1])
)
|
lm.py
CHANGED
@@ -10,6 +10,7 @@ import torchaudio.functional as F
|
|
10 |
|
11 |
model_id = "."
|
12 |
|
|
|
13 |
sample_iter = iter(load_dataset("mozilla-foundation/common_voice_7_0", "sv-SE", split="test", streaming=True, use_auth_token=True))
|
14 |
|
15 |
sample = next(sample_iter)
|
|
|
10 |
|
11 |
model_id = "."
|
12 |
|
13 |
+
|
14 |
sample_iter = iter(load_dataset("mozilla-foundation/common_voice_7_0", "sv-SE", split="test", streaming=True, use_auth_token=True))
|
15 |
|
16 |
sample = next(sample_iter)
|