import random
import re

import six
from six.moves import zip, xrange

from .lang_detect_exception import ErrorCode, LangDetectException
from .language import Language
from .utils.ngram import NGram
from .utils.unicode_block import unicode_block
class Detector(object):
    '''
    Detector class is to detect language from specified text.
    Its instance is able to be constructed via the factory class DetectorFactory.

    After appending a target text to the Detector instance with .append(string),
    the detector provides the language detection results for target text via .detect() or .get_probabilities().

    .detect() method returns a single language name which has the highest probability.
    .get_probabilities() methods returns a list of multiple languages and their probabilities.

    The detector has some parameters for language detection.
    See set_alpha(double), .set_max_text_length(int) .set_prior_map(dict).

    Example:

        from langdetect.detector_factory import DetectorFactory
        factory = DetectorFactory()
        factory.load_profile('/path/to/profile/directory')

        def detect(text):
            detector = factory.create()
            detector.append(text)
            return detector.detect()

        def detect_langs(text):
            detector = factory.create()
            detector.append(text)
            return detector.get_probabilities()
    '''

    ALPHA_DEFAULT = 0.5       # default additive-smoothing parameter for the Bayes update
    ALPHA_WIDTH = 0.05        # std-dev of the random per-trial perturbation of alpha
    ITERATION_LIMIT = 1000    # hard cap on update iterations within one trial
    PROB_THRESHOLD = 0.1      # languages below this probability are dropped from results
    CONV_THRESHOLD = 0.99999  # stop iterating once the top language exceeds this probability
    BASE_FREQ = 10000         # frequency scale the language profiles were built with
    UNKNOWN_LANG = 'unknown'

    # URLs and e-mail addresses are language-neutral noise; they are blanked out
    # before detection (see append()).
    URL_RE = re.compile(r'https?://[-_.?&~;+=/#0-9A-Za-z]{1,2076}')
    MAIL_RE = re.compile(r'[-_.0-9A-Za-z]{1,64}@[-_0-9A-Za-z]{1,255}[-_.0-9A-Za-z]{1,255}')

    def __init__(self, factory):
        # Shared, read-only data from the factory: per-ngram language
        # probability vectors and the (parallel) list of language codes.
        self.word_lang_prob_map = factory.word_lang_prob_map
        self.langlist = factory.langlist
        self.seed = factory.seed
        self.random = random.Random()
        self.text = ''
        self.langprob = None  # computed lazily by _detect_block()

        # Tunable detection parameters (see the setters below).
        self.alpha = self.ALPHA_DEFAULT
        self.n_trial = 7
        self.max_text_length = 10000
        self.prior_map = None
        self.verbose = False

    def set_verbose(self):
        self.verbose = True

    def set_alpha(self, alpha):
        self.alpha = alpha

    def set_prior_map(self, prior_map):
        '''Set prior information about language probabilities.'''
        self.prior_map = [0.0] * len(self.langlist)
        sump = 0.0
        for i in xrange(len(self.prior_map)):
            lang = self.langlist[i]
            if lang in prior_map:
                p = prior_map[lang]
                if p < 0:
                    raise LangDetectException(ErrorCode.InitParamError, 'Prior probability must be non-negative.')
                self.prior_map[i] = p
                sump += p
        if sump <= 0.0:
            raise LangDetectException(ErrorCode.InitParamError, 'More one of prior probability must be non-zero.')
        # Normalize the priors so they form a probability distribution.
        for i in xrange(len(self.prior_map)):
            self.prior_map[i] /= sump

    def set_max_text_length(self, max_text_length):
        '''Specify max size of target text to use for language detection.
        The default value is 10000(10KB).
        '''
        self.max_text_length = max_text_length

    def append(self, text):
        '''Append the target text for language detection.
        If the total size of target text exceeds the limit size specified by
        Detector.set_max_text_length(int), the rest is cut down.
        '''
        text = self.URL_RE.sub(' ', text)
        text = self.MAIL_RE.sub(' ', text)
        text = NGram.normalize_vi(text)
        # Copy up to max_text_length chars, collapsing runs of spaces.
        # Collect into a list and join once instead of quadratic str +=.
        pre = 0
        buf = []
        for i in xrange(min(len(text), self.max_text_length)):
            ch = text[i]
            if ch != ' ' or pre != ' ':
                buf.append(ch)
            pre = ch
        self.text += ''.join(buf)

    def cleaning_text(self):
        '''Cleaning text to detect
        (eliminate URL, e-mail address and Latin sentence if it is not written in Latin alphabet).
        '''
        latin_count, non_latin_count = 0, 0
        for ch in self.text:
            # NOTE: 'A' <= ch <= 'z' also matches [\]^_` -- kept as-is to match
            # the original profiles' behavior.
            if 'A' <= ch <= 'z':
                latin_count += 1
            elif ch >= six.u('\u0300') and unicode_block(ch) != 'Latin Extended Additional':
                non_latin_count += 1

        # Mostly non-Latin text: drop the (minority) Latin characters entirely.
        if latin_count * 2 < non_latin_count:
            text_without_latin = ''
            for ch in self.text:
                if ch < 'A' or 'z' < ch:
                    text_without_latin += ch
            self.text = text_without_latin

    def detect(self):
        '''Detect language of the target text and return the language name
        which has the highest probability.
        '''
        probabilities = self.get_probabilities()
        if probabilities:
            return probabilities[0].lang
        return self.UNKNOWN_LANG

    def get_probabilities(self):
        # Lazily run detection once; subsequent calls reuse the cached result.
        if self.langprob is None:
            self._detect_block()
        return self._sort_probability(self.langprob)

    def _detect_block(self):
        '''Run n_trial randomized naive-Bayes trials over the text's n-grams
        and average the resulting language probability vectors into langprob.
        Raises LangDetectException if the text yields no usable features.
        '''
        self.cleaning_text()
        ngrams = self._extract_ngrams()
        if not ngrams:
            raise LangDetectException(ErrorCode.CantDetectError, 'No features in text.')

        self.langprob = [0.0] * len(self.langlist)

        # Fixed seed (from the factory) keeps results reproducible.
        self.random.seed(self.seed)
        for t in xrange(self.n_trial):
            prob = self._init_probability()
            # Jitter alpha slightly per trial so the trials are not identical.
            alpha = self.alpha + self.random.gauss(0.0, 1.0) * self.ALPHA_WIDTH

            i = 0
            while True:
                self._update_lang_prob(prob, self.random.choice(ngrams), alpha)
                # Checking convergence is relatively costly; do it every 5 steps.
                if i % 5 == 0:
                    if self._normalize_prob(prob) > self.CONV_THRESHOLD or i >= self.ITERATION_LIMIT:
                        break
                    if self.verbose:
                        six.print_('>', self._sort_probability(prob))
                i += 1
            # Average this trial's distribution into the final result.
            for j in xrange(len(self.langprob)):
                self.langprob[j] += prob[j] / self.n_trial
            if self.verbose:
                six.print_('==>', self._sort_probability(prob))

    def _init_probability(self):
        '''Initialize the map of language probabilities.
        If there is the specified prior map, use it as initial map.
        '''
        if self.prior_map is not None:
            return list(self.prior_map)
        else:
            # Uniform distribution over all known languages.
            return [1.0 / len(self.langlist)] * len(self.langlist)

    def _extract_ngrams(self):
        '''Extract n-grams from target text.'''
        RANGE = list(xrange(1, NGram.N_GRAM + 1))

        result = []
        ngram = NGram()
        for ch in self.text:
            ngram.add_char(ch)
            if ngram.capitalword:
                continue
            for n in RANGE:
                # optimized w = ngram.get(n)
                if len(ngram.grams) < n:
                    break
                w = ngram.grams[-n:]
                if w and w != ' ' and w in self.word_lang_prob_map:
                    result.append(w)
        return result

    def _update_lang_prob(self, prob, word, alpha):
        '''Update language probabilities with N-gram string(N=1,2,3).'''
        if word is None or word not in self.word_lang_prob_map:
            return False

        lang_prob_map = self.word_lang_prob_map[word]
        if self.verbose:
            six.print_('%s(%s): %s' % (word, self._unicode_encode(word), self._word_prob_to_string(lang_prob_map)))

        # alpha/BASE_FREQ is the additive smoothing term for unseen ngrams.
        weight = alpha / self.BASE_FREQ
        for i in xrange(len(prob)):
            prob[i] *= weight + lang_prob_map[i]
        return True

    def _word_prob_to_string(self, prob):
        # Human-readable "lang:prob" pairs for verbose output; tiny values omitted.
        result = ''
        for j in xrange(len(prob)):
            p = prob[j]
            if p >= 0.00001:
                result += ' %s:%.5f' % (self.langlist[j], p)
        return result

    def _normalize_prob(self, prob):
        '''Normalize probabilities and check convergence by the maximun probability.
        '''
        maxp, sump = 0.0, sum(prob)
        for i in xrange(len(prob)):
            p = prob[i] / sump
            if maxp < p:
                maxp = p
            prob[i] = p
        return maxp

    def _sort_probability(self, prob):
        # Keep only languages above the threshold, best first.
        result = [Language(lang, p) for (lang, p) in zip(self.langlist, prob) if p > self.PROB_THRESHOLD]
        result.sort(reverse=True)
        return result

    def _unicode_encode(self, word):
        # Escape non-ASCII characters as \uXXXX for verbose/debug output.
        buf = ''
        for ch in word:
            if ch >= six.u('\u0080'):
                st = hex(0x10000 + ord(ch))[2:]
                while len(st) < 4:
                    st = '0' + st
                buf += r'\u' + st[1:5]
            else:
                buf += ch
        return buf