| import epitran
|
| from tqdm import tqdm
|
| import pickle as pkl
|
|
|
| ''' 统计分析。分词,利用分词结果,做统计分析和构建音标词典 '''
|
|
|
|
|
def _load_pickle(path):
    """Return the object stored in the pickle file at *path*.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes.
    """
    # NOTE(review): unpickling can execute arbitrary code; only point this at
    # files written by this script itself.
    with open(path, 'rb') as fh:
        return pkl.load(fh)


def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, closing (and thereby flushing) the file."""
    with open(path, 'wb') as fh:
        pkl.dump(obj, fh)


def _keys_by_frequency(freq):
    """Return the digit-free keys of *freq*, highest count first.

    Keys that contain at least one digit character are dropped. Keys with
    equal counts keep their relative order from *freq* (the sort is stable).
    """
    kept = [(key, count) for key, count in freq.items()
            if not any(ch.isdigit() for ch in key)]
    kept.sort(key=lambda pair: pair[1], reverse=True)
    return [key for key, _ in kept]


def _compare_ipa_frequencies():
    """Write ``same_list``: the IPA strings present in both frequency tables.

    Reads the pickles ``IPA_lo_dict`` and ``IPA_th_dict`` from the current
    working directory and writes the pickle ``same_list`` next to them. Each
    entry is ``[ipa, lo_rank, th_rank, lo_count, th_count]`` where the ranks
    are 0-based positions in the respective frequency-sorted, digit-free key
    lists. Entries are ordered by ``lo_rank``.
    """
    IPA_lo_dict = _load_pickle('IPA_lo_dict')
    IPA_th_dict = _load_pickle('IPA_th_dict')

    # Each list is derived from its own table, so the rank columns line up
    # with the count columns of the same language.
    sorted_IPA_lo = _keys_by_frequency(IPA_lo_dict)
    sorted_IPA_th = _keys_by_frequency(IPA_th_dict)

    # A dict gives O(1) membership tests and rank lookups instead of scanning
    # the Thai list once per Lao key.
    th_rank = {ipa: rank for rank, ipa in enumerate(sorted_IPA_th)}

    same_list = [
        [ipa, lo_rank, th_rank[ipa], IPA_lo_dict[ipa], IPA_th_dict[ipa]]
        for lo_rank, ipa in enumerate(sorted_IPA_lo)
        if ipa in th_rank
    ]
    _dump_pickle(same_list, 'same_list')


def _plm_tokens(tokenizer, text):
    """Return the tokenizer's tokens for *text* minus the first two and the last."""
    encoded = tokenizer(text, max_length=512, padding=True, truncation=True,
                        return_tensors='pt')
    # NOTE(review): the [2:-1] slice presumably strips the start marker, a
    # leading word-boundary piece and the end marker -- confirm against the
    # E5 tokenizer's actual output.
    return encoded.encodings[0].tokens[2:-1]


def _add_ipa_counts(counts, transliterator, tokens):
    """Transliterate each token and increment its IPA string's count in *counts*."""
    for token in tokens:
        ipa = transliterator.transliterate(token)
        # Start from 0 so that a string seen once is recorded with count 1.
        counts[ipa] = counts.get(ipa, 0) + 1


def _build_ipa_frequencies(file_lo, file_th):
    """Count IPA transliterations of the tokens in two text files.

    Reads both files as UTF-8, walks them line by line in lockstep, and
    writes the resulting ``{ipa_string: count}`` dicts to the pickles
    ``IPA_lo_dict`` and ``IPA_th_dict`` in the current working directory.
    """
    # Imported here because only this branch needs the tokenizer; the
    # comparison branch then works without transformers installed.
    from transformers import AutoTokenizer

    plm_tokenizer = AutoTokenizer.from_pretrained(
        r'../foundation/E5')

    with open(file_lo, 'r', encoding='utf-8') as f:
        data_lo = f.readlines()
    with open(file_th, 'r', encoding='utf-8') as f:
        data_th = f.readlines()

    # zip() below stops at the shorter file, so the two printed line counts
    # show whether any lines will be ignored.
    print(len(data_lo))
    print(len(data_th))

    # The transliterators do not depend on the line being processed, so they
    # are built once rather than once per line.
    epi_lo = epitran.Epitran("lao-Laoo")
    epi_th = epitran.Epitran("tha-Thai")

    IPA_lo_dict = {}
    IPA_th_dict = {}
    line_pairs = zip(data_lo, data_th)
    for line_lo, line_th in tqdm(line_pairs, total=min(len(data_lo), len(data_th))):
        _add_ipa_counts(IPA_lo_dict, epi_lo, _plm_tokens(plm_tokenizer, line_lo))
        _add_ipa_counts(IPA_th_dict, epi_th, _plm_tokens(plm_tokenizer, line_th))

    _dump_pickle(IPA_lo_dict, 'IPA_lo_dict')
    _dump_pickle(IPA_th_dict, 'IPA_th_dict')


def analyse_by_IPA_statistic(file_lo, file_th, statistic_conclusion_exist=False):
    """Build, or compare, IPA frequency tables for a Lao/Thai text pair.

    Args:
        file_lo: Path of the UTF-8 text file transliterated with the
            ``lao-Laoo`` Epitran model. Only read when
            ``statistic_conclusion_exist`` is false.
        file_th: Path of the UTF-8 text file transliterated with the
            ``tha-Thai`` Epitran model. Only read when
            ``statistic_conclusion_exist`` is false.
        statistic_conclusion_exist: When false (the default), tokenize both
            files and write the pickles ``IPA_lo_dict`` and ``IPA_th_dict``.
            When true, load those two pickles and write the pickle
            ``same_list`` describing the IPA strings they share.

    Returns:
        None. All results are written as pickle files in the current working
        directory.
    """
    if statistic_conclusion_exist:
        _compare_ipa_frequencies()
        return
    _build_ipa_frequencies(file_lo, file_th)
|
|
|
|
|
def spliteKeyWord(in_str):
    """Return the set of distinct characters in *in_str*.

    Args:
        in_str: The string to split into characters.

    Returns:
        A ``set`` with one element per distinct character; empty for ``""``.
    """
    return set(in_str)
|
|
|
|
|
def minhash(str_a, str_b):
    """Return the Jaccard similarity of the character sets of two strings.

    Despite the name, no MinHash signature is computed: the score is the
    exact ratio ``|A & B| / |A | B|`` of the sets returned by
    ``spliteKeyWord`` for each argument.

    Args:
        str_a: First string.
        str_b: Second string.

    Returns:
        A float between 0.0 and 1.0. When both character sets are empty the
        ratio is undefined; in that case ``'ZeroDivisionError'`` is printed
        and 0.0 is returned.
    """
    chars_a = spliteKeyWord(str_a)
    chars_b = spliteKeyWord(str_b)
    shared = chars_a & chars_b
    combined = chars_a | chars_b

    # An empty union is the only way the division could fail.
    if len(combined) == 0:
        print('ZeroDivisionError')
        return 0.0

    return len(shared) / float(len(combined))
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: with statistic_conclusion_exist=False the call
    # tokenizes both corpus files and writes the IPA frequency pickles.
    lao_corpus = '../data/triple/data_lo.txt'
    thai_corpus = '../data/triple/data_th.txt'
    analyse_by_IPA_statistic(lao_corpus, thai_corpus, statistic_conclusion_exist=False)
|
|
|