Spaces:
Sleeping
Sleeping
Vineel Pratap
committed on
Commit
•
6f27821
1
Parent(s):
78e8beb
autolm
Browse files- app.py +10 -5
- upload/english/gutenberg_27045.txt +3 -0
- {normalization → utils}/README.txt +0 -0
- {normalization → utils}/__init__.py +0 -0
- utils/lm.py +71 -0
- {normalization → utils}/norm_config.py +0 -0
- {normalization → utils}/punctuations.lst +0 -0
- {normalization → utils}/text_norm.py +1 -1
- zeroshot.py +31 -7
app.py
CHANGED
@@ -51,18 +51,22 @@ with gr.Blocks(css="style.css") as demo:
|
|
51 |
interactive=False,
|
52 |
label="Language Model Score",
|
53 |
)
|
|
|
|
|
|
|
|
|
54 |
btn = gr.Button("Submit", elem_id="submit")
|
55 |
|
56 |
@gr.on(
|
57 |
-
inputs=[wscore_usedefault, lmscore_usedefault, lm_file],
|
58 |
outputs=[wscore, lmscore],
|
59 |
)
|
60 |
-
def update_slider(ws, ls, lm):
|
61 |
|
62 |
ws_slider = gr.Slider(
|
63 |
minimum=-10.0,
|
64 |
maximum=10.0,
|
65 |
-
value=LM_SCORE_DEFAULT if lm is not None else 0,
|
66 |
step=0.1,
|
67 |
interactive=not ws,
|
68 |
label="Word Insertion Score",
|
@@ -71,7 +75,7 @@ with gr.Blocks(css="style.css") as demo:
|
|
71 |
minimum=-10.0,
|
72 |
maximum=10.0,
|
73 |
value=WORD_SCORE_DEFAULT_IF_NOLM
|
74 |
-
if lm is None
|
75 |
else WORD_SCORE_DEFAULT_IF_LM,
|
76 |
step=0.1,
|
77 |
interactive=not ls,
|
@@ -97,6 +101,7 @@ with gr.Blocks(css="style.css") as demo:
|
|
97 |
lmscore,
|
98 |
wscore_usedefault,
|
99 |
lmscore_usedefault,
|
|
|
100 |
reference,
|
101 |
],
|
102 |
outputs=[text, logs],
|
@@ -118,7 +123,7 @@ with gr.Blocks(css="style.css") as demo:
|
|
118 |
],
|
119 |
[
|
120 |
"upload/english/english.mp3",
|
121 |
-
"upload/english/
|
122 |
" This is going to look at the code that we have in our configuration that we've already exported and compare it to our database, and we want to import",
|
123 |
],
|
124 |
],
|
|
|
51 |
interactive=False,
|
52 |
label="Language Model Score",
|
53 |
)
|
54 |
+
with gr.Column():
|
55 |
+
autolm = gr.Checkbox(
|
56 |
+
label="Automatically create Unigram LM from text data", value=True
|
57 |
+
)
|
58 |
btn = gr.Button("Submit", elem_id="submit")
|
59 |
|
60 |
@gr.on(
|
61 |
+
inputs=[wscore_usedefault, lmscore_usedefault, lm_file, autolm],
|
62 |
outputs=[wscore, lmscore],
|
63 |
)
|
64 |
+
def update_slider(ws, ls, lm, alm):
|
65 |
|
66 |
ws_slider = gr.Slider(
|
67 |
minimum=-10.0,
|
68 |
maximum=10.0,
|
69 |
+
value=LM_SCORE_DEFAULT if (lm is not None or alm) else 0,
|
70 |
step=0.1,
|
71 |
interactive=not ws,
|
72 |
label="Word Insertion Score",
|
|
|
75 |
minimum=-10.0,
|
76 |
maximum=10.0,
|
77 |
value=WORD_SCORE_DEFAULT_IF_NOLM
|
78 |
+
if (lm is None and not alm)
|
79 |
else WORD_SCORE_DEFAULT_IF_LM,
|
80 |
step=0.1,
|
81 |
interactive=not ls,
|
|
|
101 |
lmscore,
|
102 |
wscore_usedefault,
|
103 |
lmscore_usedefault,
|
104 |
+
autolm,
|
105 |
reference,
|
106 |
],
|
107 |
outputs=[text, logs],
|
|
|
123 |
],
|
124 |
[
|
125 |
"upload/english/english.mp3",
|
126 |
+
"upload/english/gutenberg_27045.txt",
|
127 |
" This is going to look at the code that we have in our configuration that we've already exported and compare it to our database, and we want to import",
|
128 |
],
|
129 |
],
|
upload/english/gutenberg_27045.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a6cb4e9c754924333e37dde766098f862ddd079c81009c77454f377c96b9ac19
|
3 |
+
size 84138
|
{normalization → utils}/README.txt
RENAMED
File without changes
|
{normalization → utils}/__init__.py
RENAMED
File without changes
|
utils/lm.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Creates unigram LM following KenLM
|
2 |
+
import math
|
3 |
+
import shutil, tempfile
|
4 |
+
|
5 |
+
def calculate_log_probabilities(word_counts, num_sentences, n_smoothing=0.01):
    """
    Compute smoothed unigram log10 probabilities for a vocabulary.

    Args:
        word_counts: mapping of word -> occurrence count in the corpus.
        num_sentences: number of sentences; each one contributes one
            ``<s>`` and one ``</s>`` token to the total token count.
        n_smoothing: additive constant applied to every word count and to
            the sentence-boundary counts; the denominator is scaled by
            ``(1 + n_smoothing)``.

    Returns:
        dict mapping every word, plus the special tokens ``<unk>``,
        ``<s>`` and ``</s>``, to its base-10 log probability.
    """
    # Corpus tokens plus the sentence-boundary markers.
    token_total = sum(word_counts.values())
    token_total += 2 * num_sentences

    # Reserve one count for <unk>, then inflate by the smoothing factor.
    denominator = token_total + 1
    denominator = denominator + denominator * n_smoothing

    def _log_share(numerator):
        return math.log10(numerator / denominator)

    log_probs = {}
    for word, count in word_counts.items():
        log_probs[word] = _log_share(count + n_smoothing)

    # Special tokens are written last and override any same-named corpus word.
    log_probs["<unk>"] = _log_share(1)
    log_probs["<s>"] = _log_share(num_sentences + n_smoothing)
    log_probs["</s>"] = _log_share(num_sentences + n_smoothing)
    return log_probs
|
27 |
+
|
28 |
+
def maybe_generate_pseudo_bigram_arpa(arpa_fpath):
    """
    Turn a unigram-only ARPA file into a pseudo-bigram ARPA file, in place.

    The file is left untouched when it already declares a ``2-grams:``
    section, or when it cannot be decoded as UTF-8 text (presumably a
    binary language model rather than an ARPA file).

    Otherwise the file is rewritten with an ``ngram 2=1`` header line after
    the ``ngram 1=`` line and a single dummy ``</s> <s>`` bigram section
    inserted just before the ``\\end\\`` marker.

    Args:
        arpa_fpath: path of the language-model file to inspect and rewrite.
    """
    try:
        # Explicit UTF-8 so non-ASCII vocabulary does not depend on the locale.
        with open(arpa_fpath, "r", encoding="utf-8") as file:
            lines = file.readlines()
    except UnicodeDecodeError:
        # Not a text file: do not attempt to rewrite it.
        return

    # If the n-gram order is >= 2, do not modify.
    if any("2-grams:" in line for line in lines):
        return

    with open(arpa_fpath, "w", encoding="utf-8") as file:
        for line in lines:
            stripped = line.strip()

            if stripped.startswith("ngram 1="):
                file.write(line)
                file.write("ngram 2=1\n")  # Declare the single dummy bigram
                continue

            if stripped == "\\end\\":
                # Emit the dummy bigram section right before the end marker.
                file.write("\\2-grams:\n")
                file.write("-9.9999999\t</s> <s>\n\n")

            file.write(line)
|
48 |
+
|
49 |
+
def save_log_probabilities(log_probabilities, file_path):
    """
    Write unigram log probabilities to ``file_path`` in ARPA layout.

    The output consists of a ``\\data\\`` header, an ``ngram 1=<count>``
    line, a ``\\1-grams:`` section with one ``<log_prob>\\t<word>`` line per
    entry, and a closing ``\\end\\`` marker (no trailing newline).

    Args:
        log_probabilities: mapping of word -> log10 probability, written in
            the mapping's iteration order.
        file_path: destination path; an existing file is overwritten.
    """
    # Explicit UTF-8 so non-ASCII vocabulary does not depend on the locale.
    with open(file_path, "w", encoding="utf-8") as file:
        # Backslashes are escaped explicitly; "\d" and "\e" are invalid
        # escape sequences and trigger warnings on recent Python versions.
        file.write("\\data\\\n")
        file.write(f"ngram 1={len(log_probabilities)}\n\n")
        file.write("\\1-grams:\n")
        for word, log_prob in log_probabilities.items():
            # The <s> entry is always written with 0, whatever was computed.
            if word == "<s>":
                log_prob = 0
            file.write(f"{log_prob}\t{word}\n")
        file.write("\n\\end\\")
|
62 |
+
|
63 |
+
def create_unigram_lm(word_counts, num_sentences, file_path, n_smoothing=0.01):
    """
    Build a unigram language model from word counts and write it to disk.

    Args:
        word_counts: mapping of word -> occurrence count.
        num_sentences: number of sentences the counts were collected from.
        file_path: destination path of the ARPA file.
        n_smoothing: smoothing constant forwarded to
            ``calculate_log_probabilities``.
    """
    save_log_probabilities(
        calculate_log_probabilities(word_counts, num_sentences, n_smoothing),
        file_path,
    )
|
66 |
+
|
67 |
+
|
68 |
+
|
69 |
+
|
70 |
+
|
71 |
+
|
{normalization → utils}/norm_config.py
RENAMED
File without changes
|
{normalization → utils}/punctuations.lst
RENAMED
File without changes
|
{normalization → utils}/text_norm.py
RENAMED
@@ -2,7 +2,7 @@ import json
|
|
2 |
import re
|
3 |
import unicodedata
|
4 |
|
5 |
-
from
|
6 |
|
7 |
|
8 |
def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False):
|
|
|
2 |
import re
|
3 |
import unicodedata
|
4 |
|
5 |
+
from utils.norm_config import norm_config
|
6 |
|
7 |
|
8 |
def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False):
|
zeroshot.py
CHANGED
@@ -9,7 +9,8 @@ import numpy as np
|
|
9 |
from transformers import Wav2Vec2ForCTC, AutoProcessor
|
10 |
from huggingface_hub import hf_hub_download
|
11 |
from torchaudio.models.decoder import ctc_decoder
|
12 |
-
from
|
|
|
13 |
|
14 |
uroman_dir = "uroman"
|
15 |
assert os.path.exists(uroman_dir)
|
@@ -33,8 +34,8 @@ class MY_LOG:
|
|
33 |
def __init__(self):
|
34 |
self.text = "[START]"
|
35 |
|
36 |
-
def add(self, new_log):
|
37 |
-
self.text = self.text + "\n" + new_log
|
38 |
self.text = self.text.strip()
|
39 |
return self.text
|
40 |
|
@@ -92,15 +93,17 @@ def filter_lexicon(lexicon, word_counts):
|
|
92 |
|
93 |
def load_words(filepath):
|
94 |
words = {}
|
|
|
95 |
with open(filepath) as f:
|
96 |
for line in f:
|
97 |
line = line.strip().lower()
|
|
|
98 |
line = text_normalize(line, iso_code="xxx")
|
99 |
# ignore invalid words.
|
100 |
for w in line.split():
|
101 |
words.setdefault(w, 0)
|
102 |
words[w] += 1
|
103 |
-
return words
|
104 |
|
105 |
|
106 |
def process(
|
@@ -111,6 +114,7 @@ def process(
|
|
111 |
lmscore=None,
|
112 |
wscore_usedefault=True,
|
113 |
lmscore_usedefault=True,
|
|
|
114 |
reference=None,
|
115 |
):
|
116 |
transcription, logs = "", MY_LOG()
|
@@ -154,13 +158,13 @@ def process(
|
|
154 |
# Setup lexicon and decoder
|
155 |
yield transcription, logs.add(f"Loading words....")
|
156 |
try:
|
157 |
-
word_counts = load_words(words_file)
|
158 |
except Exception as e:
|
159 |
yield f"ERROR: Loading words failed '{str(e)}'", logs.text
|
160 |
return
|
161 |
|
162 |
yield transcription, logs.add(
|
163 |
-
f"Loaded {len(word_counts)} words.\nPreparing lexicon...."
|
164 |
)
|
165 |
|
166 |
try:
|
@@ -168,15 +172,35 @@ def process(
|
|
168 |
except Exception as e:
|
169 |
yield f"ERROR: Creating lexicon failed '{str(e)}'", logs.text
|
170 |
return
|
171 |
-
|
|
|
|
|
172 |
yield transcription, logs.add(f"Leixcon size: {len(lexicon)}")
|
173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
if lm_path is None:
|
175 |
yield transcription, logs.add(f"Filtering lexicon....")
|
176 |
lexicon = filter_lexicon(lexicon, word_counts)
|
177 |
yield transcription, logs.add(
|
178 |
f"Ok. Leixcon size after filtering: {len(lexicon)}"
|
179 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
# print(lexicon["the"], lexicon["\"(t)he"])
|
181 |
with tempfile.NamedTemporaryFile() as lexicon_file:
|
182 |
if lm_path is not None and not lm_path.strip():
|
|
|
9 |
from transformers import Wav2Vec2ForCTC, AutoProcessor
|
10 |
from huggingface_hub import hf_hub_download
|
11 |
from torchaudio.models.decoder import ctc_decoder
|
12 |
+
from utils.text_norm import text_normalize
|
13 |
+
from utils.lm import create_unigram_lm, maybe_generate_pseudo_bigram_arpa
|
14 |
|
15 |
uroman_dir = "uroman"
|
16 |
assert os.path.exists(uroman_dir)
|
|
|
34 |
def __init__(self):
|
35 |
self.text = "[START]"
|
36 |
|
37 |
+
def add(self, new_log, new_line=True):
    """
    Append ``new_log`` to the accumulated log text and return the result.

    Args:
        new_log: text to append.
        new_line: when true the entry starts on a new line; otherwise it is
            joined to the existing text with a single space.

    Returns:
        The full log text after appending, with surrounding whitespace
        stripped.
    """
    separator = "\n" if new_line else " "
    self.text = "".join((self.text, separator, new_log)).strip()
    return self.text
|
41 |
|
|
|
93 |
|
94 |
def load_words(filepath):
    """
    Count word occurrences in a text file.

    Each line is stripped, lower-cased and passed through ``text_normalize``
    (with ``iso_code="xxx"``) before being split on whitespace.

    Args:
        filepath: path of the text file to read.

    Returns:
        Tuple ``(words, num_sentences)``: a dict mapping each word to its
        count, and the number of lines read (blank lines included).
    """
    counts = {}
    line_total = 0
    with open(filepath) as handle:
        for raw_line in handle:
            line_total += 1
            normalized = text_normalize(raw_line.strip().lower(), iso_code="xxx")
            for token in normalized.split():
                counts[token] = counts.get(token, 0) + 1
    return counts, line_total
|
107 |
|
108 |
|
109 |
def process(
|
|
|
114 |
lmscore=None,
|
115 |
wscore_usedefault=True,
|
116 |
lmscore_usedefault=True,
|
117 |
+
autolm=True,
|
118 |
reference=None,
|
119 |
):
|
120 |
transcription, logs = "", MY_LOG()
|
|
|
158 |
# Setup lexicon and decoder
|
159 |
yield transcription, logs.add(f"Loading words....")
|
160 |
try:
|
161 |
+
word_counts, num_sentences = load_words(words_file)
|
162 |
except Exception as e:
|
163 |
yield f"ERROR: Loading words failed '{str(e)}'", logs.text
|
164 |
return
|
165 |
|
166 |
yield transcription, logs.add(
|
167 |
+
f"Loaded {len(word_counts)} words from {num_sentences} lines.\nPreparing lexicon...."
|
168 |
)
|
169 |
|
170 |
try:
|
|
|
172 |
except Exception as e:
|
173 |
yield f"ERROR: Creating lexicon failed '{str(e)}'", logs.text
|
174 |
return
|
175 |
+
# for k, v in lexicon.items():
|
176 |
+
# if len(v) < 5:
|
177 |
+
# print(k, v)
|
178 |
yield transcription, logs.add(f"Leixcon size: {len(lexicon)}")
|
179 |
|
180 |
+
# Input could be sentences OR a list of words. Check whether at least one word has a count > 2 to differentiate
|
181 |
+
tmp_file = tempfile.NamedTemporaryFile() # could be used for LM
|
182 |
+
if autolm and any([cnt > 2 for cnt in word_counts.values()]):
|
183 |
+
yield transcription, logs.add(f"Creating unigram LM...", False)
|
184 |
+
lm_path = tmp_file.name
|
185 |
+
create_unigram_lm(word_counts, num_sentences, lm_path)
|
186 |
+
yield transcription, logs.add(f"OK")
|
187 |
+
|
188 |
+
|
189 |
if lm_path is None:
|
190 |
yield transcription, logs.add(f"Filtering lexicon....")
|
191 |
lexicon = filter_lexicon(lexicon, word_counts)
|
192 |
yield transcription, logs.add(
|
193 |
f"Ok. Leixcon size after filtering: {len(lexicon)}"
|
194 |
)
|
195 |
+
else:
|
196 |
+
# kenlm throws an error if unigram LM is being used
|
197 |
+
# HACK: generate a bigram LM from unigram LM and a dummy bigram to trick it
|
198 |
+
maybe_generate_pseudo_bigram_arpa(lm_path)
|
199 |
+
|
200 |
+
# for k, v in lexicon.items():
|
201 |
+
# if len(v) < 5:
|
202 |
+
# print(k, v)
|
203 |
+
|
204 |
# print(lexicon["the"], lexicon["\"(t)he"])
|
205 |
with tempfile.NamedTemporaryFile() as lexicon_file:
|
206 |
if lm_path is not None and not lm_path.strip():
|