birgermoell committed on
Commit
c498527
1 Parent(s): aa2d5b7

WIP updated lm

Browse files
.gitattributes CHANGED
@@ -15,3 +15,5 @@
15
  *.pt filter=lfs diff=lfs merge=lfs -text
16
  *.pth filter=lfs diff=lfs merge=lfs -text
17
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
15
  *.pt filter=lfs diff=lfs merge=lfs -text
16
  *.pth filter=lfs diff=lfs merge=lfs -text
17
  *tfevents* filter=lfs diff=lfs merge=lfs -text
18
+ *.arpa filter=lfs diff=lfs merge=lfs -text
19
+ *.txt filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ kenlm/
language_model/5gram.bin → 5gram_correct.arpa RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c803936922612f71cf0abdb37763c18d24624e36bfa4abac20187cc17b88541d
3
- size 1981380707
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3aab18b31980b9b9fdf85546c8763af2a4e2220d464ab5e3fab99cf19c3158dd
3
+ size 4394946469
build_n_gram.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
from datasets import load_dataset

target_lang = "sv"  # change to your target lang
username = "hf-test"  # change to your username

dataset = load_dataset(f"{username}/{target_lang}_corpora_parliament_processed", split="train")

# Dump the corpus as a single space-separated line for KenLM training.
# Stream row by row instead of materializing one giant joined string, and
# pin UTF-8 so the output does not depend on the platform's default encoding.
with open("text.txt", "w", encoding="utf-8") as file:
    for i, text in enumerate(dataset["text"]):
        if i:
            file.write(" ")
        file.write(text)
end_token.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ with open("5gram.arpa", "r") as read_file, open("5gram_correct.arpa", "w") as write_file:
2
+ has_added_eos = False
3
+ for line in read_file:
4
+ if not has_added_eos and "ngram 1=" in line:
5
+ count=line.strip().split("=")[-1]
6
+ write_file.write(line.replace(f"{count}", f"{int(count)+1}"))
7
+ elif not has_added_eos and "<s>" in line:
8
+ write_file.write(line)
9
+ write_file.write(line.replace("<s>", "</s>"))
10
+ has_added_eos = True
11
+ else:
12
+ write_file.write(line)
get_tokens.py CHANGED
@@ -1,15 +1,24 @@
1
- from transformers import Wav2Vec2ProcessorWithLM
2
- import torchaudio
3
-
4
- import torch
5
- from datasets import load_dataset
6
  from transformers import AutoModelForCTC, AutoProcessor
7
- import torchaudio.functional as F
8
-
9
  model_id = "."
10
  model = AutoModelForCTC.from_pretrained(model_id)
11
  processor = AutoProcessor.from_pretrained(model_id)
12
 
13
  vocab_dict = processor.tokenizer.get_vocab()
14
- print(vocab_dict)
15
- sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from transformers import AutoModelForCTC, AutoProcessor, Wav2Vec2ProcessorWithLM
from pyctcdecode import build_ctcdecoder

# Load the fine-tuned CTC model and its processor from the repo root.
model_id = "."
model = AutoModelForCTC.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

# pyctcdecode expects the labels ordered by token id and lower-cased to
# match the lower-cased KenLM training corpus.
vocab_dict = processor.tokenizer.get_vocab()
sorted_vocab_dict = {
    token.lower(): token_id
    for token, token_id in sorted(vocab_dict.items(), key=lambda item: item[1])
}

# Beam-search decoder backed by the corrected 5-gram language model.
decoder = build_ctcdecoder(
    labels=list(sorted_vocab_dict.keys()),
    kenlm_model_path="5gram_correct.arpa",
)

# Bundle feature extractor, tokenizer and LM decoder into one processor.
processor_with_lm = Wav2Vec2ProcessorWithLM(
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    decoder=decoder,
)
language_model/attrs.json DELETED
@@ -1 +0,0 @@
1
- {"alpha": 0.5, "beta": 1.5, "unk_score_offset": -10.0, "score_boundary": true}
 
 
language_model/unigrams.txt DELETED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json CHANGED
@@ -5,8 +5,7 @@
5
  "padding_side": "right",
6
  "padding_value": 0,
7
  "return_attention_mask": true,
8
- "sampling_rate": 16000,
9
- "processor_class": "Wav2Vec2ProcessorWithLM"
10
  }
11
 
12
 
 
5
  "padding_side": "right",
6
  "padding_value": 0,
7
  "return_attention_mask": true,
8
+ "sampling_rate": 16000
 
9
  }
10
 
11
 
requirements.txt ADDED
File without changes
text.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fba5ecc23b254f6c36c7f18e9052bb7db01c3dfe2fd5786dd105410b4b9e094f
3
+ size 286673267