versae committed on
Commit
3ddfd5c
1 Parent(s): 013db08

Adding 5gram models

Browse files
.gitattributes CHANGED
@@ -25,3 +25,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ *.arpa filter=lfs diff=lfs merge=lfs -text
29
+ *.txt filter=lfs diff=lfs merge=lfs -text
30
+
all_5gram.arpa ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5fd8b07a8edcaed3fe0dbc7f30659230f797668b1a798e12ef73526bbf3f702
3
+ size 16086549819
all_5gram.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e48d3691208ab8219abd2a0349e88f0e9f88ddb56ab5d56c6f9570a013018d6
3
+ size 8035805872
bokmaal_5gram.arpa ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:880a096c854de3c459d182e0acb60f3d0eaf8e7495652f7c280aa6c78c5e4557
3
+ size 8671958280
bokmaal_5gram.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b41c24c63f2f0585bea83666369593f3b3e6d047f327a90f36ebca2c35ef0ff
3
+ size 4243671427
clean_texts.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import argparse
3
+
4
+ #chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]'
5
# Punctuation, digits and symbols that get replaced by a space.
# Written as a raw string so regex escapes such as \- \* \[ are handed to the
# regex engine untouched instead of being parsed as (invalid) Python string
# escapes, which newer interpreters warn about.
chars_to_ignore_regex = r'[\'=&()\*\/,?.!\-\;\:"“%‘”�—’…–<>#@0123456789°′»«\[\]]'
# Everything that is not a lowercase a-z, å, æ, ø or a plain space.
radical_regex = r'[^abcdefghijklmnopqrstuvwxyzåæø ]'


def extract_text(text, is_radical=False):
    """Lowercase *text* and strip unwanted characters.

    Every character matched by ``chars_to_ignore_regex`` is replaced by a
    space, a single space is appended, and finally every whitespace run
    (including newlines) is collapsed to one space.

    Args:
        text: The raw input string.
        is_radical: When true, additionally replace every character matched
            by ``radical_regex`` with a space.

    Returns:
        The cleaned string. Because a space is appended before collapsing,
        the result always ends with a single space (an empty input yields
        ``" "``).
    """
    text = re.sub(chars_to_ignore_regex, " ", text.lower()) + " "
    if is_radical:
        text = re.sub(radical_regex, " ", text)
    return re.sub(r"\s+", " ", text)
15
+
16
+
17
def main(args):
    """Clean the file at ``args.input_file`` and write it to ``args.output_file``.

    The whole input is read into memory, passed through ``extract_text`` and
    written back out in one go.

    Args:
        args: Object exposing ``input_file``, ``output_file`` and ``radical``
            attributes (as produced by ``parse_args``).
    """
    # Pin the encoding: the corpus contains å/æ/ø, and the platform default
    # encoding is not guaranteed to be UTF-8.
    with open(args.input_file, "r", encoding="utf-8") as file:
        data = file.read()

    data = extract_text(data, bool(args.radical))

    with open(args.output_file, "w", encoding="utf-8") as outputfile:
        outputfile.write(data)
25
+
26
+
27
def parse_args(argv=None):
    """Parse the command-line options of the cleaning script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads the arguments from ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with ``input_file``, ``output_file`` and the
        boolean ``radical`` flag.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', required=True, help='Path to input file.')
    parser.add_argument('--output_file', required=True, help='Path to output file.')
    parser.add_argument('--radical', dest='radical', action='store_true', help='Delete any non vocab character.')
    return parser.parse_args(argv)


if __name__ == "__main__":
    args = parse_args()
    main(args)
nynorsk_5gram.arpa ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18a84d547abe6bb8a5441916b296dbdf230e3efc76a953b14660720817bda62d
3
+ size 7718177308
nynorsk_5gram.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab572020f4871ec611ad95a92071040bafe77c4633de2d26166006749e964129
3
+ size 3981756555
prepare.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ import re
4
+ from tqdm import tqdm
5
+
6
+ from datasets import load_dataset, interleave_datasets, concatenate_datasets
7
+
8
TEXT_COLUMN_NAME = "text"
AUDIO_COLUMN_NAME = "audio"
# Characters deleted outright before any other normalisation. Raw string so
# the regex escapes are not parsed as (invalid) Python string escapes.
CHARS_TO_IGNORE_REGEX = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\–\_\\\+\#\/0-9]'

# Single-character folding applied after lowercasing. "ö" maps to "ø" and is
# deliberately NOT part of the "o" group, otherwise it would never reach its
# own rule.
_CHAR_FOLDS = (
    ("áàâ", "a"),
    ("ä", "æ"),
    ("éèëê", "e"),
    ("íìïî", "i"),
    ("óòô", "o"),
    ("ö", "ø"),
    ("ç", "c"),
    ("úùüû", "u"),
    ("\xa0", " "),
)
_FOLD_TABLE = str.maketrans(
    {src: dst for chars, dst in _CHAR_FOLDS for src in chars}
)

# Angle-bracket tags rewritten to plain letter sequences.
_TAG_REPLACEMENTS = (
    ("<ee>", "eee"),
    ("<qq>", "qqq"),
    ("<mm>", "mmm"),
    ("<inaudible>", "xxx"),
)


# Pre-processing dataset
def replace_hatted_characters(batch):
    """Normalise the text column of one dataset example.

    Steps, in order: delete every character matched by
    ``CHARS_TO_IGNORE_REGEX``, lowercase, append a space, fold accented
    characters (and the non-breaking space) via ``_FOLD_TABLE``, rewrite the
    known ``<...>`` tags, drop any remaining ``<`` / ``>``, and collapse
    whitespace runs to a single space.

    Args:
        batch: Mapping that holds the raw string under ``TEXT_COLUMN_NAME``.

    Returns:
        A dict with the normalised string under ``TEXT_COLUMN_NAME``.
    """
    text = batch[TEXT_COLUMN_NAME]
    text = re.sub(CHARS_TO_IGNORE_REGEX, '', text).lower() + ' '
    text = text.translate(_FOLD_TABLE)
    for tag, replacement in _TAG_REPLACEMENTS:
        text = text.replace(tag, replacement)
    text = re.sub('[<>]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return {TEXT_COLUMN_NAME: text}
32
+
33
+
34
def _write_corpus(texts, path, total=None):
    """Write all *texts* to *path*, separated by a single space."""
    with open(path, "w", encoding="utf-8") as text_file:
        for idx, text in tqdm(enumerate(texts), total=total, desc="Writing text"):
            # Separator goes before every item but the first, so the last
            # item is not followed by an extra space.
            if idx:
                text_file.write(" ")
            text_file.write(text)


def _add_eos_token(src_path, dst_path):
    """Copy an ARPA file, adding a ``</s>`` unigram and bumping the 1-gram count."""
    with open(src_path, "r", encoding="utf-8") as read_file, \
            open(dst_path, "w", encoding="utf-8") as write_file:
        has_added_eos = False
        for line in read_file:
            if not has_added_eos and "ngram 1=" in line:
                # Rebuild the header instead of str.replace(): replacing the
                # count textually would also hit the "1" of "ngram 1".
                head, _, count = line.rstrip().rpartition("=")
                write_file.write(f"{head}={int(count) + 1}\n")
            elif not has_added_eos and "<s>" in line:
                write_file.write(line)
                write_file.write(line.replace("<s>", "</s>"))
                has_added_eos = True
            else:
                write_file.write(line)


def main():
    """Build a 5-gram KenLM model from the NPSC and NCC text.

    Loads both datasets, normalises their text with
    ``replace_hatted_characters``, writes everything to ``text.txt``, runs
    ``~/bin/lmplz`` to produce ``5gram.arpa.orig``, writes a copy with an added
    ``</s>`` entry to ``5gram.arpa``, converts it to ``5gram.bin`` with
    ``~/bin/build_binary`` and finally deletes the ``5gram.arpa*`` files.

    Raises:
        subprocess.CalledProcessError: If one of the KenLM tools exits with a
            non-zero status.
    """
    # Local imports: only this entry point needs them.
    import glob
    import os
    import subprocess

    npsc = load_dataset(
        "NbAiLab/NPSC",
        "16K_mp3",
        split="train+validation",
        use_auth_token=True,
    )
    ncc = load_dataset(
        "NbAiLab/NCC",
        split="train+validation",
        use_auth_token=True
    )
    # NOTE(review): concatenate_datasets presumably requires both datasets to
    # share the same columns/features - confirm NPSC and NCC are compatible.
    dataset = concatenate_datasets([npsc, ncc])
    dataset = dataset.map(
        replace_hatted_characters,
        desc="replacing hesitations and homophones",
    )

    # Create file with all text together
    _write_corpus(dataset["text"], "text.txt", total=len(dataset))

    work_dir = os.getcwd()
    kenlm_bin = os.path.expanduser(os.path.join("~", "bin"))

    # Create KenLM model
    subprocess.run(
        [
            os.path.join(kenlm_bin, "lmplz"),
            "-o", "5",
            "--text", "text.txt",
            "--arpa", "5gram.arpa.orig",
            "-T", work_dir,
        ],
        check=True,
    )

    # Adjusting for Huggingface decoding
    _add_eos_token("5gram.arpa.orig", "5gram.arpa")

    # Compress as binary
    subprocess.run(
        [
            os.path.join(kenlm_bin, "build_binary"),
            "5gram.arpa",
            "5gram.bin",
            "-T", work_dir,
        ],
        check=True,
    )
    # Remove the intermediate ARPA files (5gram.arpa and 5gram.arpa.orig).
    for leftover in glob.glob("5gram.arpa*"):
        os.remove(leftover)


if __name__ == "__main__":
    main()
85
+
texts/all_text.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77bd024bc200949cf73266459d8e1ead26a8961b6962d415a7cadaf96af24e76
3
+ size 910152429
texts/all_text_clean.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:916471727e5cc2602a3339d94af1cf4e8bc0ea5ca7def6e68dbe53da0a19688a
3
+ size 908965879
texts/bokmaal_clean.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f26eccf2b2a3076237afc427c93449915053b9b8bd928705b4fbbc539740730
3
+ size 478844863
texts/bokmaal_text.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae57498e4b97f35f685b8bd30cf2e74f921d3a1fb734d4eaeb437c29aaef0cab
3
+ size 485346069
texts/bokmaal_text_clean.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:243e8924ffb667a7d854ba94e26393933a94f4fa4188ee87487639955eb1e938
3
+ size 484980197
texts/npsc_clean.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:432bc3fc0f5247fde26810f96e6ae4d26f91d87ad6ead2ab37ac934cbdb82a64
3
+ size 6501206
texts/nynorsk_clean.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53d9360849730d4d46a88d657dfbc0f0e36134e7862c70170f225334234c11fe
3
+ size 424806360
texts/nynorsk_text.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df6eb3db143c778bc1ac4ac308fb14d750b19860e25a5968d52ca4a7483b695e
3
+ size 431307566
texts/nynorsk_text_clean.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8621ec240641097cfaba360f814d6345da4a391b1b10735a02ae27996c6055f5
3
+ size 430486889