Spaces:
Sleeping
Sleeping
File size: 4,309 Bytes
09b47fc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
import os
from os import path
import sys
import json
from .detector import Detector
from .lang_detect_exception import ErrorCode, LangDetectException
from .utils.lang_profile import LangProfile
class DetectorFactory(object):
'''
Language Detector Factory Class.
This class manages an initialization and constructions of Detector.
Before using language detection library,
load profiles with DetectorFactory.load_profile(str)
and set initialization parameters.
When the language detection,
construct Detector instance via DetectorFactory.create().
See also Detector's sample code.
'''
seed = None
def __init__(self):
self.word_lang_prob_map = {}
self.langlist = []
def load_profile(self, profile_directory):
list_files = os.listdir(profile_directory)
if not list_files:
raise LangDetectException(ErrorCode.NeedLoadProfileError, 'Not found profile: ' + profile_directory)
langsize, index = len(list_files), 0
for filename in list_files:
if filename.startswith('.'):
continue
filename = path.join(profile_directory, filename)
if not path.isfile(filename):
continue
f = None
try:
if sys.version_info[0] < 3:
f = open(filename, 'r')
else:
f = open(filename, 'r', encoding='utf-8')
json_data = json.load(f)
profile = LangProfile(**json_data)
self.add_profile(profile, index, langsize)
index += 1
except IOError:
raise LangDetectException(ErrorCode.FileLoadError, 'Cannot open "%s"' % filename)
except:
raise LangDetectException(ErrorCode.FormatError, 'Profile format error in "%s"' % filename)
finally:
if f:
f.close()
def load_json_profile(self, json_profiles):
langsize, index = len(json_profiles), 0
if langsize < 2:
raise LangDetectException(ErrorCode.NeedLoadProfileError, 'Need more than 2 profiles.')
for json_profile in json_profiles:
try:
json_data = json.loads(json_profile)
profile = LangProfile(**json_data)
self.add_profile(profile, index, langsize)
index += 1
except:
raise LangDetectException(ErrorCode.FormatError, 'Profile format error.')
def add_profile(self, profile, index, langsize):
lang = profile.name
if lang in self.langlist:
raise LangDetectException(ErrorCode.DuplicateLangError, 'Duplicate the same language profile.')
self.langlist.append(lang)
for word in profile.freq:
if word not in self.word_lang_prob_map:
self.word_lang_prob_map[word] = [0.0] * langsize
length = len(word)
if 1 <= length <= 3:
prob = 1.0 * profile.freq.get(word) / profile.n_words[length - 1]
self.word_lang_prob_map[word][index] = prob
def clear(self):
self.langlist = []
self.word_lang_prob_map = {}
def create(self, alpha=None):
'''Construct Detector instance with smoothing parameter.'''
detector = self._create_detector()
if alpha is not None:
detector.set_alpha(alpha)
return detector
def _create_detector(self):
if not self.langlist:
raise LangDetectException(ErrorCode.NeedLoadProfileError, 'Need to load profiles.')
return Detector(self)
def set_seed(self, seed):
self.seed = seed
def get_lang_list(self):
return list(self.langlist)
PROFILES_DIRECTORY = path.join(path.dirname(__file__), 'profiles')
_factory = None
def init_factory():
global _factory
if _factory is None:
_factory = DetectorFactory()
_factory.load_profile(PROFILES_DIRECTORY)
def detect(text):
init_factory()
detector = _factory.create()
detector.append(text)
return detector.detect()
def detect_langs(text):
init_factory()
detector = _factory.create()
detector.append(text)
return detector.get_probabilities()
|