Yurii Paniv committed on
Commit
0f120d1
·
1 Parent(s): 72475af

Add instructions for scripts

Browse files
scripts/README.md CHANGED
@@ -10,4 +10,23 @@
10
  8. Put CV files into dataset files folder
11
  9. Put dev.csv and test.csv into folder
12
 
 
 
13
  You have a reproducible dataset!
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  8. Put CV files into dataset files folder
11
  9. Put dev.csv and test.csv into folder
12
 
13
+ Note: you can also specify several datasets by separating them with ",", e.g. dataset1/train.csv,dataset2/train.csv.
14
+
15
  You have a reproducible dataset!
16
+
17
+
18
+ # Scorer
19
+
20
+ 1. Refer to the DeepSpeech guide for further explanations.
21
+
22
+ 2. Generate scorer package.
23
+ ```
24
+ python3 generate_lm.py --input_txt ../../../voice-recognition-ua/data/all_text.txt --output_dir . \
25
+ --top_k 500000 --kenlm_bins ../../../voice-recognition-ua/kenlm/build/bin \
26
+ --arpa_order 5 --max_arpa_memory "85%" --arpa_prune "0|0|1" \
27
+ --binary_a_bits 255 --binary_q_bits 8 --binary_type trie
28
+ ```
29
+ 3. Run lm_optimizer to find the best scorer value.
30
+ 4. Rerun step 2 to generate a new scorer.
31
+
32
+ Caution: the scorer is very model-dependent, so you'll likely need to adjust it for each model.
scripts/extract_text_corpus.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  import nltk
3
  import re
@@ -32,10 +33,19 @@ for subdir, dirs, files in os.walk(FOLDER):
32
  text = text.strip()
33
 
34
  words = tokenizer.tokenize(text)
35
- words = [i for i in words if i.isalnum()]
36
  words = [i for i in words if not i.isdigit()]
37
- words = [i for i in words if len(i) > 1]
38
- if any([any(j not in allowed_chars for j in i) for i in words]):
 
 
 
 
 
 
 
 
 
 
39
  continue
40
  if len(words) == 0:
41
  continue
 
1
+ # this script imports random texts from a folder and converts them for the scorer
2
  import os
3
  import nltk
4
  import re
 
33
  text = text.strip()
34
 
35
  words = tokenizer.tokenize(text)
 
36
  words = [i for i in words if not i.isdigit()]
37
+ new_words = []
38
+ for word in words:
39
+ include = True
40
+ for letter in word:
41
+ if word.startswith("-"):
42
+ word = word[1:]
43
+ if letter not in allowed_chars:
44
+ include = False
45
+ if include:
46
+ new_words.append(word)
47
+ words = new_words
48
+ if all([len(i) <= 1 for i in words]):
49
  continue
50
  if len(words) == 0:
51
  continue
scripts/wiki_import.py CHANGED
@@ -1,12 +1,16 @@
 
1
  from wiki_dump_reader import Cleaner, iterate
2
  from os import remove
 
3
  import nltk
4
  import re
5
  nltk.download("punkt")
6
 
 
7
 
8
- remove("../data/wiki_text.txt")
9
- text_file = open("../data/wiki_text.txt", mode="a")
 
10
 
11
  tokenizer = nltk.SpaceTokenizer()
12
  paranthesis_regex = re.compile(r'\(.*\)')
@@ -14,6 +18,7 @@ allowed_chars = ["а", "б", "в", "г", "ґ", "д", "е", "є", "ж", "з", "и
14
  "м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ь", "ю", "я", "-", "'"]
15
 
16
  cleaner = Cleaner()
 
17
  for title, text in iterate('../data/ukwiki-20210320-pages-articles-multistream.xml'):
18
  text = cleaner.clean_text(text)
19
  cleaned_text, _ = cleaner.build_links(text)
@@ -34,10 +39,19 @@ for title, text in iterate('../data/ukwiki-20210320-pages-articles-multistream.x
34
  continue
35
 
36
  words = tokenizer.tokenize(text)
37
- words = [i for i in words if i.isalnum()]
38
  words = [i for i in words if not i.isdigit()]
39
- words = [i for i in words if len(i) > 1]
40
- if any([any(j not in allowed_chars for j in i) for i in words]):
 
 
 
 
 
 
 
 
 
 
41
  continue
42
  if len(words) == 0:
43
  continue
@@ -47,5 +61,8 @@ for title, text in iterate('../data/ukwiki-20210320-pages-articles-multistream.x
47
  if cleaned_text == "":
48
  continue
49
  text_file.write(cleaned_text + "\n")
 
 
 
50
 
51
  text_file.close()
 
1
+ # this script imports wiki text and converts it into the scorer format
2
  from wiki_dump_reader import Cleaner, iterate
3
  from os import remove
4
+ from os.path import exists
5
  import nltk
6
  import re
7
  nltk.download("punkt")
8
 
9
+ OUT_PATH = "../data/wiki_text.txt"
10
 
11
+ if exists(OUT_PATH):
12
+ remove(OUT_PATH)
13
+ text_file = open(OUT_PATH, mode="a")
14
 
15
  tokenizer = nltk.SpaceTokenizer()
16
  paranthesis_regex = re.compile(r'\(.*\)')
 
18
  "м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ь", "ю", "я", "-", "'"]
19
 
20
  cleaner = Cleaner()
21
+ # iter = 0
22
  for title, text in iterate('../data/ukwiki-20210320-pages-articles-multistream.xml'):
23
  text = cleaner.clean_text(text)
24
  cleaned_text, _ = cleaner.build_links(text)
 
39
  continue
40
 
41
  words = tokenizer.tokenize(text)
 
42
  words = [i for i in words if not i.isdigit()]
43
+ new_words = []
44
+ for word in words:
45
+ include = True
46
+ for letter in word:
47
+ if word.startswith("-"):
48
+ word = word[1:]
49
+ if letter not in allowed_chars:
50
+ include = False
51
+ if include:
52
+ new_words.append(word)
53
+ words = new_words
54
+ if all([len(i) <= 1 for i in words]):
55
  continue
56
  if len(words) == 0:
57
  continue
 
61
  if cleaned_text == "":
62
  continue
63
  text_file.write(cleaned_text + "\n")
64
+ # iter += 1
65
+ # if iter > 5:
66
+ # break
67
 
68
  text_file.close()