import random
import re

import six
from six.moves import zip, xrange

from .lang_detect_exception import ErrorCode, LangDetectException
from .language import Language
from .utils.ngram import NGram
from .utils.unicode_block import unicode_block
class Detector(object):
    '''
    Detector class is to detect language from specified text.
    Its instance is able to be constructed via the factory class DetectorFactory.

    After appending a target text to the Detector instance with .append(string),
    the detector provides the language detection results for target text via .detect() or .get_probabilities().

    .detect() method returns a single language name which has the highest probability.
    .get_probabilities() methods returns a list of multiple languages and their probabilities.

    The detector has some parameters for language detection.
    See set_alpha(double), .set_max_text_length(int) .set_prior_map(dict).

    Example:

        from langdetect.detector_factory import DetectorFactory
        factory = DetectorFactory()
        factory.load_profile('/path/to/profile/directory')

        def detect(text):
            detector = factory.create()
            detector.append(text)
            return detector.detect()

        def detect_langs(text):
            detector = factory.create()
            detector.append(text)
            return detector.get_probabilities()
    '''

    ALPHA_DEFAULT = 0.5       # default additive-smoothing parameter for the Bayes update
    ALPHA_WIDTH = 0.05        # std-dev of the random per-trial perturbation of alpha
    ITERATION_LIMIT = 1000    # hard cap on update iterations within one trial
    PROB_THRESHOLD = 0.1      # languages below this probability are dropped from results
    CONV_THRESHOLD = 0.99999  # stop iterating once the top language exceeds this probability
    BASE_FREQ = 10000         # frequency scale the language profiles were built with
    UNKNOWN_LANG = 'unknown'

    # URLs and e-mail addresses are language-neutral noise; they are blanked out
    # before detection (see append()).
    URL_RE = re.compile(r'https?://[-_.?&~;+=/#0-9A-Za-z]{1,2076}')
    MAIL_RE = re.compile(r'[-_.0-9A-Za-z]{1,64}@[-_0-9A-Za-z]{1,255}[-_.0-9A-Za-z]{1,255}')

    def __init__(self, factory):
        # Shared, read-only data from the factory: per-ngram language
        # probability vectors and the (parallel) list of language codes.
        self.word_lang_prob_map = factory.word_lang_prob_map
        self.langlist = factory.langlist
        self.seed = factory.seed
        self.random = random.Random()
        self.text = ''
        self.langprob = None  # computed lazily by _detect_block()

        # Tunable detection parameters (see the setters below).
        self.alpha = self.ALPHA_DEFAULT
        self.n_trial = 7
        self.max_text_length = 10000
        self.prior_map = None
        self.verbose = False

    def set_verbose(self):
        self.verbose = True

    def set_alpha(self, alpha):
        self.alpha = alpha

    def set_prior_map(self, prior_map):
        '''Set prior information about language probabilities.'''
        self.prior_map = [0.0] * len(self.langlist)
        sump = 0.0
        for i in xrange(len(self.prior_map)):
            lang = self.langlist[i]
            if lang in prior_map:
                p = prior_map[lang]
                if p < 0:
                    raise LangDetectException(ErrorCode.InitParamError, 'Prior probability must be non-negative.')
                self.prior_map[i] = p
                sump += p
        if sump <= 0.0:
            raise LangDetectException(ErrorCode.InitParamError, 'More one of prior probability must be non-zero.')
        # Normalize the priors so they form a probability distribution.
        for i in xrange(len(self.prior_map)):
            self.prior_map[i] /= sump

    def set_max_text_length(self, max_text_length):
        '''Specify max size of target text to use for language detection.
        The default value is 10000(10KB).
        '''
        self.max_text_length = max_text_length

    def append(self, text):
        '''Append the target text for language detection.
        If the total size of target text exceeds the limit size specified by
        Detector.set_max_text_length(int), the rest is cut down.
        '''
        text = self.URL_RE.sub(' ', text)
        text = self.MAIL_RE.sub(' ', text)
        text = NGram.normalize_vi(text)
        # Copy up to max_text_length chars, collapsing runs of spaces.
        # Collect into a list and join once instead of quadratic str +=.
        pre = 0
        buf = []
        for i in xrange(min(len(text), self.max_text_length)):
            ch = text[i]
            if ch != ' ' or pre != ' ':
                buf.append(ch)
            pre = ch
        self.text += ''.join(buf)

    def cleaning_text(self):
        '''Cleaning text to detect
        (eliminate URL, e-mail address and Latin sentence if it is not written in Latin alphabet).
        '''
        latin_count, non_latin_count = 0, 0
        for ch in self.text:
            # NOTE: 'A' <= ch <= 'z' also matches [\]^_` -- kept as-is to match
            # the original profiles' behavior.
            if 'A' <= ch <= 'z':
                latin_count += 1
            elif ch >= six.u('\u0300') and unicode_block(ch) != 'Latin Extended Additional':
                non_latin_count += 1

        # Mostly non-Latin text: drop the (minority) Latin characters entirely.
        if latin_count * 2 < non_latin_count:
            text_without_latin = ''
            for ch in self.text:
                if ch < 'A' or 'z' < ch:
                    text_without_latin += ch
            self.text = text_without_latin

    def detect(self):
        '''Detect language of the target text and return the language name
        which has the highest probability.
        '''
        probabilities = self.get_probabilities()
        if probabilities:
            return probabilities[0].lang
        return self.UNKNOWN_LANG

    def get_probabilities(self):
        # Lazily run detection once; subsequent calls reuse the cached result.
        if self.langprob is None:
            self._detect_block()
        return self._sort_probability(self.langprob)

    def _detect_block(self):
        '''Run n_trial randomized naive-Bayes trials over the text's n-grams
        and average the resulting language probability vectors into langprob.
        Raises LangDetectException if the text yields no usable features.
        '''
        self.cleaning_text()
        ngrams = self._extract_ngrams()
        if not ngrams:
            raise LangDetectException(ErrorCode.CantDetectError, 'No features in text.')

        self.langprob = [0.0] * len(self.langlist)

        # Fixed seed (from the factory) keeps results reproducible.
        self.random.seed(self.seed)
        for t in xrange(self.n_trial):
            prob = self._init_probability()
            # Jitter alpha slightly per trial so the trials are not identical.
            alpha = self.alpha + self.random.gauss(0.0, 1.0) * self.ALPHA_WIDTH

            i = 0
            while True:
                self._update_lang_prob(prob, self.random.choice(ngrams), alpha)
                # Checking convergence is relatively costly; do it every 5 steps.
                if i % 5 == 0:
                    if self._normalize_prob(prob) > self.CONV_THRESHOLD or i >= self.ITERATION_LIMIT:
                        break
                    if self.verbose:
                        six.print_('>', self._sort_probability(prob))
                i += 1
            # Average this trial's distribution into the final result.
            for j in xrange(len(self.langprob)):
                self.langprob[j] += prob[j] / self.n_trial
            if self.verbose:
                six.print_('==>', self._sort_probability(prob))

    def _init_probability(self):
        '''Initialize the map of language probabilities.
        If there is the specified prior map, use it as initial map.
        '''
        if self.prior_map is not None:
            return list(self.prior_map)
        else:
            # Uniform distribution over all known languages.
            return [1.0 / len(self.langlist)] * len(self.langlist)

    def _extract_ngrams(self):
        '''Extract n-grams from target text.'''
        RANGE = list(xrange(1, NGram.N_GRAM + 1))

        result = []
        ngram = NGram()
        for ch in self.text:
            ngram.add_char(ch)
            if ngram.capitalword:
                continue
            for n in RANGE:
                # optimized w = ngram.get(n)
                if len(ngram.grams) < n:
                    break
                w = ngram.grams[-n:]
                if w and w != ' ' and w in self.word_lang_prob_map:
                    result.append(w)
        return result

    def _update_lang_prob(self, prob, word, alpha):
        '''Update language probabilities with N-gram string(N=1,2,3).'''
        if word is None or word not in self.word_lang_prob_map:
            return False

        lang_prob_map = self.word_lang_prob_map[word]
        if self.verbose:
            six.print_('%s(%s): %s' % (word, self._unicode_encode(word), self._word_prob_to_string(lang_prob_map)))

        # alpha/BASE_FREQ is the additive smoothing term for unseen ngrams.
        weight = alpha / self.BASE_FREQ
        for i in xrange(len(prob)):
            prob[i] *= weight + lang_prob_map[i]
        return True

    def _word_prob_to_string(self, prob):
        # Human-readable "lang:prob" pairs for verbose output; tiny values omitted.
        result = ''
        for j in xrange(len(prob)):
            p = prob[j]
            if p >= 0.00001:
                result += ' %s:%.5f' % (self.langlist[j], p)
        return result

    def _normalize_prob(self, prob):
        '''Normalize probabilities and check convergence by the maximun probability.
        '''
        maxp, sump = 0.0, sum(prob)
        for i in xrange(len(prob)):
            p = prob[i] / sump
            if maxp < p:
                maxp = p
            prob[i] = p
        return maxp

    def _sort_probability(self, prob):
        # Keep only languages above the threshold, best first.
        result = [Language(lang, p) for (lang, p) in zip(self.langlist, prob) if p > self.PROB_THRESHOLD]
        result.sort(reverse=True)
        return result

    def _unicode_encode(self, word):
        # Escape non-ASCII characters as \uXXXX for verbose/debug output.
        buf = ''
        for ch in word:
            if ch >= six.u('\u0080'):
                st = hex(0x10000 + ord(ch))[2:]
                while len(st) < 4:
                    st = '0' + st
                buf += r'\u' + st[1:5]
            else:
                buf += ch
        return buf