# CLTMPSE / utils / IPA_sim_statistic_analysis.py
# KairongLiu's picture
# Upload 10 files
# 9d0d562 verified
import epitran
from tqdm import tqdm
import pickle as pkl
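''' Statistical analysis: tokenize the text, then use the tokens to compute frequency statistics and build an IPA (phonetic transcription) dictionary. '''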
def _load_pickle(path):
    """Load and return the object pickled at *path*, closing the file afterwards."""
    with open(path, 'rb') as fh:
        # NOTE(security): unpickling can execute arbitrary code; only load
        # files that this script produced itself.
        return pkl.load(fh)


def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, closing the file afterwards."""
    with open(path, 'wb') as fh:
        pkl.dump(obj, fh)


def _drop_digit_keys(freq):
    """Return a copy of *freq* without the keys that contain a digit character."""
    return {key: count for key, count in freq.items()
            if not any(ch.isdigit() for ch in key)}


def _keys_by_descending_count(freq):
    """Return the keys of *freq* ordered from highest to lowest count."""
    ordered = sorted(freq.items(), key=lambda item: item[1], reverse=True)
    return [key for key, _ in ordered]


def analyse_by_IPA_statistic(file_lo, file_th, statistic_conclusion_exist=False):
    """Build IPA frequency dictionaries for two corpora, or compare existing ones.

    With ``statistic_conclusion_exist=False`` the two text files are read line
    by line, each pair of lines is tokenized, every token is transliterated to
    IPA with Epitran ("lao-Laoo" for ``file_lo``, "tha-Thai" for ``file_th``)
    and the IPA strings are counted.  The two count dictionaries are pickled
    to ``IPA_lo_dict`` and ``IPA_th_dict`` in the current working directory.

    With ``statistic_conclusion_exist=True`` those two pickles are loaded
    instead (``file_lo`` / ``file_th`` are not read), keys containing digits
    are discarded, and every IPA string present in both dictionaries is
    written to the pickle ``same_list`` as
    ``[ipa, rank_lo, rank_th, count_lo, count_th]``, ordered by ``rank_lo``
    (rank 0 = most frequent).

    Args:
        file_lo: Path of the UTF-8 text file transliterated with "lao-Laoo".
        file_th: Path of the UTF-8 text file transliterated with "tha-Thai".
            Lines are paired with ``zip``, so surplus lines of the longer
            file are ignored.
        statistic_conclusion_exist: Whether the pickled count dictionaries
            already exist and only the comparison should be run.

    Returns:
        None. Results are written to pickle files.
    """
    if statistic_conclusion_exist:
        IPA_lo_dict = _load_pickle('IPA_lo_dict')
        IPA_th_dict = _load_pickle('IPA_th_dict')

        # Each ranking is derived from its own language's counts; they were
        # previously crossed, which swapped the two rank columns below.
        sorted_IPA_lo = _keys_by_descending_count(_drop_digit_keys(IPA_lo_dict))
        sorted_IPA_th = _keys_by_descending_count(_drop_digit_keys(IPA_th_dict))

        # Key -> rank lookup replaces a linear list.index() call per key.
        rank_th = {key: rank for rank, key in enumerate(sorted_IPA_th)}

        # When an IPA string occurs in both languages, record its ranks and counts.
        same_list = [
            [key, rank_lo, rank_th[key], IPA_lo_dict[key], IPA_th_dict[key]]
            for rank_lo, key in enumerate(sorted_IPA_lo)
            if key in rank_th
        ]
        _dump_pickle(same_list, 'same_list')
        return

    # Imported here so the comparison branch above does not need transformers.
    from transformers import AutoTokenizer

    plm_tokenizer = AutoTokenizer.from_pretrained(
        r'../foundation/E5')

    def tokens_of(line):
        encoded = plm_tokenizer(line, max_length=512, padding=True, truncation=True, return_tensors='pt')
        # Drops the first two tokens and the last one -- presumably special /
        # prefix tokens added by the tokenizer; TODO confirm for this model.
        return encoded.encodings[0].tokens[2:-1]

    with open(file_lo, 'r', encoding='utf-8') as f:
        data_lo = f.readlines()
    with open(file_th, 'r', encoding='utf-8') as f:
        data_th = f.readlines()
    print(len(data_lo))
    print(len(data_th))

    # The transliterators do not depend on the line, so build them once
    # instead of once per line pair.
    epi_lo = epitran.Epitran("lao-Laoo")
    epi_th = epitran.Epitran("tha-Thai")

    IPA_lo_dict = {}
    IPA_th_dict = {}
    pair_count = min(len(data_lo), len(data_th))
    for line_lo, line_th in tqdm(zip(data_lo, data_th), total=pair_count):
        for token in tokens_of(line_lo):
            ipa = epi_lo.transliterate(token)
            # Start from 0 so a first occurrence is counted as 1 (was 2).
            IPA_lo_dict[ipa] = IPA_lo_dict.get(ipa, 0) + 1
        for token in tokens_of(line_th):
            ipa = epi_th.transliterate(token)
            IPA_th_dict[ipa] = IPA_th_dict.get(ipa, 0) + 1

    _dump_pickle(IPA_lo_dict, 'IPA_lo_dict')
    _dump_pickle(IPA_th_dict, 'IPA_th_dict')
def spliteKeyWord(in_str):
    """Return the set of distinct characters contained in *in_str*.

    Digits are kept; no filtering is applied to the characters.
    """
    return set(in_str)
def minhash(str_a, str_b):
    """Return a similarity score in the range 0-1 for two strings.

    Despite the name, the score is the Jaccard ratio of the two sets produced
    by ``spliteKeyWord``: size of the intersection divided by size of the
    union.  When the union is empty the ratio is undefined; in that case
    'ZeroDivisionError' is printed and 0.0 is returned.
    """
    left = spliteKeyWord(str_a)
    right = spliteKeyWord(str_b)
    union_size = len(left | right)
    if union_size == 0:
        print('ZeroDivisionError')
        return 0.0
    return len(left & right) / float(union_size)
if __name__ == "__main__":
    # Script entry point: count IPA strings from the two corpus files
    # (the pickled statistics are assumed not to exist yet).
    analyse_by_IPA_statistic(
        file_lo='../data/triple/data_lo.txt',
        file_th='../data/triple/data_th.txt',
        statistic_conclusion_exist=False,
    )