Lemswasabi
/

wav2vec2-large-xlsr-53-842h-luxembourgish-14h-with-lm

Automatic Speech Recognition

Generated from Trainer

Inference Endpoints

Model card Files Files and versions Metrics Training metrics Community

wav2vec2-large-xlsr-53-842h-luxembourgish-14h-with-lm / create_text_corpus.py

Lemswasabi's picture

add create lm scripts

98591ec about 2 years ago

981 Bytes

	#!/usr/bin/env python3
	#
	# Created by lemswasabi on 24/05/2022.
	# Copyright © 2022 letzspeak. All rights reserved.
	#

	import glob
	import re
	import textract


	# Regex character class listing the punctuation and symbol characters that
	# ignore_chars() removes from the text.
	chars_to_ignore_regex = '[,?.!;:"“%‘„”�—’…–]'

	def replace_chars(text, char, replace_char):
	    """Lower-case *text*, then substitute every match of *char*.

	    ``char`` is handed to ``re`` as a pattern (not a literal string) and
	    ``replace_char`` is used as the replacement for each match.
	    """
	    lowered = text.lower()
	    pattern = re.compile(char)
	    return pattern.sub(replace_char, lowered)

	def ignore_chars(sentence):
	    """Return *sentence* lower-cased with all ignored characters removed.

	    The characters to strip are those matched by the module-level
	    ``chars_to_ignore_regex`` pattern.
	    """
	    # Work on the argument itself rather than on any module-level variable,
	    # so the function gives the right result for every caller.
	    return re.sub(chars_to_ignore_regex, "", sentence.lower())

	# Cleaned text of every document that could be extracted.
	corpus = []

	# "**" is required for recursive=True to have any effect: it makes glob
	# descend into every sub-directory and pick up all *.doc files.
	for text_file in glob.glob("/home/lemswasabi/corpus/chamber_text_corpus/**/*.doc", recursive=True):
	    try:
	        text = textract.process(text_file).decode("utf-8")
	    except textract.exceptions.ShellError:
	        # Best effort: skip documents whose extraction fails.
	        continue
	    # Normalise typographic apostrophes to the ASCII one.
	    text = replace_chars(text, "’", "'")
	    text = replace_chars(text, "‘", "'")
	    # Hyphens and newlines become plain word separators.
	    text = replace_chars(text, "-", " ")
	    text = replace_chars(text, "\\n", " ")
	    text = ignore_chars(text)
	    corpus.append(text.strip())

	# Write with an explicit encoding so non-ASCII characters do not depend on
	# the locale's default encoding.
	with open("chamber_text.txt", "w", encoding="utf-8") as f:
	    f.write(" ".join(corpus))