birgermoell committed
Commit · c498527
1 Parent(s): aa2d5b7

WIP updated lm
Browse files
- .gitattributes +2 -0
- .gitignore +1 -0
- language_model/5gram.bin → 5gram_correct.arpa +2 -2
- build_n_gram.py +9 -0
- end_token.py +12 -0
- get_tokens.py +18 -9
- language_model/attrs.json +0 -1
- language_model/unigrams.txt +0 -0
- preprocessor_config.json +1 -2
- requirements.txt +0 -0
- text.txt +3 -0
.gitattributes CHANGED
@@ -15,3 +15,5 @@
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.arpa filter=lfs diff=lfs merge=lfs -text
+*.txt filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+kenlm/
language_model/5gram.bin → 5gram_correct.arpa RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:3aab18b31980b9b9fdf85546c8763af2a4e2220d464ab5e3fab99cf19c3158dd
+size 4394946469
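Note: this swaps the old binary LM for the corrected 4.4 GB ARPA file. For reference, a hedged sketch of how an ARPA model is typically re-binarized with KenLM to cut disk usage (the binary path is an assumption based on a default KenLM build; pyctcdecode loads binary models too, so this is optional and not part of this commit):

import subprocess

# Assumed location: build_binary sits in kenlm/build/bin/ after a default build.
subprocess.run(
    ["kenlm/build/bin/build_binary", "5gram_correct.arpa", "5gram_correct.bin"],
    check=True,
)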
build_n_gram.py ADDED
@@ -0,0 +1,9 @@
+from datasets import load_dataset
+
+target_lang = "sv"  # change to your target lang
+username = "hf-test"  # change to your username
+
+dataset = load_dataset(f"{username}/{target_lang}_corpora_parliament_processed", split="train")
+
+with open("text.txt", "w") as file:
+    file.write(" ".join(dataset["text"]))
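build_n_gram.py only exports the training corpus; the n-gram itself is trained with KenLM (the kenlm/ entry added to .gitignore points at a local checkout). A minimal sketch of that step, assuming KenLM was built under kenlm/build/:

import subprocess

# Train a 5-gram ARPA model on the exported corpus; lmplz reads the corpus
# from stdin and writes the ARPA file to stdout.
with open("text.txt") as corpus, open("5gram.arpa", "w") as arpa:
    subprocess.run(["kenlm/build/bin/lmplz", "-o", "5"], stdin=corpus, stdout=arpa, check=True)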
end_token.py ADDED
@@ -0,0 +1,12 @@
+with open("5gram.arpa", "r") as read_file, open("5gram_correct.arpa", "w") as write_file:
+    has_added_eos = False
+    for line in read_file:
+        if not has_added_eos and "ngram 1=" in line:
+            count = line.strip().split("=")[-1]
+            write_file.write(line.replace(f"{count}", f"{int(count)+1}"))
+        elif not has_added_eos and "<s>" in line:
+            write_file.write(line)
+            write_file.write(line.replace("<s>", "</s>"))
+            has_added_eos = True
+        else:
+            write_file.write(line)
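Context for end_token.py: KenLM's ARPA output declares the begin-of-sentence token <s> but no matching end-of-sentence token </s>, which pyctcdecode expects. The script therefore bumps the declared 1-gram count by one and duplicates the <s> unigram entry as </s>, producing the 5gram_correct.arpa used elsewhere in this commit. A hedged sanity check on the result:

# Both sentence-boundary tokens should now appear near the top of the file.
with open("5gram_correct.arpa") as f:
    head = "".join(next(f) for _ in range(50))
assert "<s>" in head and "</s>" in head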
get_tokens.py CHANGED
@@ -1,15 +1,24 @@
-from transformers import Wav2Vec2ProcessorWithLM
-import torchaudio
-
-import torch
-from datasets import load_dataset
 from transformers import AutoModelForCTC, AutoProcessor
-import torchaudio.functional as F
-
 model_id = "."
 model = AutoModelForCTC.from_pretrained(model_id)
 processor = AutoProcessor.from_pretrained(model_id)
 
 vocab_dict = processor.tokenizer.get_vocab()
-print(vocab_dict)
-sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}
+# print(vocab_dict)
+sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}
+
+from pyctcdecode import build_ctcdecoder
+
+
+decoder = build_ctcdecoder(
+    labels=list(sorted_vocab_dict.keys()),
+    kenlm_model_path="5gram_correct.arpa",
+)
+
+from transformers import Wav2Vec2ProcessorWithLM
+
+processor_with_lm = Wav2Vec2ProcessorWithLM(
+    feature_extractor=processor.feature_extractor,
+    tokenizer=processor.tokenizer,
+    decoder=decoder
+)
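What usually follows (a sketch, not part of this commit): save the LM-boosted processor into the repo so AutoProcessor resolves it on the next load, then decode logits through the n-gram. The `audio` variable below is illustrative, not from the source:

import torch

# Persisting regenerates language_model/ (alphabet, unigrams, attrs) in the repo.
processor_with_lm.save_pretrained(".")

# `audio` is assumed to be a 16 kHz mono waveform (list or 1-D array).
inputs = processor_with_lm(audio, sampling_rate=16_000, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
transcription = processor_with_lm.batch_decode(logits.numpy()).text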
language_model/attrs.json DELETED
@@ -1 +0,0 @@
-{"alpha": 0.5, "beta": 1.5, "unk_score_offset": -10.0, "score_boundary": true}
language_model/unigrams.txt DELETED
The diff for this file is too large to render. See raw diff.
preprocessor_config.json CHANGED
@@ -5,8 +5,7 @@
   "padding_side": "right",
   "padding_value": 0,
   "return_attention_mask": true,
-  "sampling_rate": 16000
-  "processor_class": "Wav2Vec2ProcessorWithLM"
+  "sampling_rate": 16000
 }
requirements.txt ADDED
File without changes
text.txt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fba5ecc23b254f6c36c7f18e9052bb7db01c3dfe2fd5786dd105410b4b9e094f
+size 286673267