KairongLiu
/

CLTMPSE

Model card Files Files and versions

CLTMPSE / utils /IPA_sim_statistic_analysis.py

KairongLiu's picture

Upload 10 files

9d0d562 verified over 1 year ago

history blame contribute delete

3.61 kB

	import epitran
	from tqdm import tqdm
	import pickle as pkl

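	''' Statistical analysis: tokenize the corpora, then use the tokens to compute frequency statistics and build the IPA (phonetic) dictionaries. '''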


	def _drop_keys_with_digits(counts):
	    """Return a copy of *counts* without the keys that contain a digit character."""
	    return {
	        key: value
	        for key, value in counts.items()
	        if not any(ch.isdigit() for ch in key)
	    }


	def _rank_by_count(counts):
	    """Return the keys of *counts* ordered from the highest to the lowest count."""
	    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
	    return [key for key, _ in ranked]


	def _count_ipa(tokens, transliterator, counts):
	    """Transliterate every token and add one to its IPA string's tally in *counts*."""
	    for token in tokens:
	        ipa = transliterator.transliterate(token)
	        # Start from 0 so that a string seen once is counted once.
	        counts[ipa] = counts.get(ipa, 0) + 1


	def analyse_by_IPA_statistic(file_lo, file_th, statistic_conclusion_exist=False):
	    """Build IPA frequency tables for a Lao/Thai corpus pair, or compare cached ones.

	    All pickle files are read from / written to the current working directory.

	    When ``statistic_conclusion_exist`` is true, the pickled tables
	    ``IPA_lo_dict`` and ``IPA_th_dict`` are loaded, keys containing a digit
	    are ignored, and every IPA string present in both tables is written to
	    the pickle ``same_list`` as
	    ``[ipa, rank_in_lo, rank_in_th, count_in_lo, count_in_th]``, ordered by
	    descending Lao count. ``file_lo`` and ``file_th`` are not read in this mode.

	    Otherwise both text files are read line by line, each line is tokenized,
	    every token is transliterated to IPA with epitran, and the per-language
	    tallies are pickled to ``IPA_lo_dict`` and ``IPA_th_dict``.

	    Args:
	        file_lo: Path to the UTF-8 Lao text file.
	        file_th: Path to the UTF-8 Thai text file; lines are paired with
	            ``file_lo`` by position, and surplus lines in the longer file
	            are ignored.
	        statistic_conclusion_exist: Set to True once the frequency tables
	            have already been produced by an earlier run.

	    Returns:
	        None. Results are written to pickle files.
	    """
	    if statistic_conclusion_exist:
	        # NOTE(review): pickle executes arbitrary code on load; only use
	        # tables produced by this script, never files from untrusted sources.
	        with open('IPA_lo_dict', 'rb') as f:
	            IPA_lo_dict = pkl.load(f)
	        with open('IPA_th_dict', 'rb') as f:
	            IPA_th_dict = pkl.load(f)

	        # Each ranking is built from its own language's table.
	        sorted_IPA_lo = _rank_by_count(_drop_keys_with_digits(IPA_lo_dict))
	        sorted_IPA_th = _rank_by_count(_drop_keys_with_digits(IPA_th_dict))

	        # Rank lookup table: avoids a linear scan of the Thai ranking per key.
	        rank_th = {ipa: idx for idx, ipa in enumerate(sorted_IPA_th)}

	        # Keep only IPA strings that occur in both languages.
	        same_list = [
	            [ipa, idx, rank_th[ipa], IPA_lo_dict[ipa], IPA_th_dict[ipa]]
	            for idx, ipa in enumerate(sorted_IPA_lo)
	            if ipa in rank_th
	        ]

	        with open('same_list', 'wb') as f:
	            pkl.dump(same_list, f)
	        return

	    # Imported lazily: only this branch needs the tokenizer.
	    from transformers import AutoTokenizer

	    plm_tokenizer = AutoTokenizer.from_pretrained(r'../foundation/E5')

	    with open(file_lo, 'r', encoding='utf-8') as f:
	        data_lo = f.readlines()
	    with open(file_th, 'r', encoding='utf-8') as f:
	        data_th = f.readlines()

	    IPA_lo_dict = {}
	    IPA_th_dict = {}
	    print(len(data_lo))
	    print(len(data_th))

	    # The transliterators do not depend on the current line: build them once.
	    epi_lo = epitran.Epitran("lao-Laoo")
	    epi_th = epitran.Epitran("tha-Thai")

	    for line_lo, line_th in tqdm(zip(data_lo, data_th)):
	        # [2:-1] drops the first two tokens and the last one
	        # (presumably special/prefix tokens -- TODO confirm for this tokenizer).
	        tked_lo = plm_tokenizer(
	            line_lo, max_length=512, padding=True, truncation=True,
	            return_tensors='pt').encodings[0].tokens[2:-1]
	        tked_th = plm_tokenizer(
	            line_th, max_length=512, padding=True, truncation=True,
	            return_tensors='pt').encodings[0].tokens[2:-1]

	        _count_ipa(tked_lo, epi_lo, IPA_lo_dict)
	        _count_ipa(tked_th, epi_th, IPA_th_dict)

	    with open('IPA_lo_dict', 'wb') as f:
	        pkl.dump(IPA_lo_dict, f)
	    with open('IPA_th_dict', 'wb') as f:
	        pkl.dump(IPA_th_dict, f)


	def spliteKeyWord(in_str):
	    """Return the set of distinct characters that occur in *in_str*.

	    An empty string yields an empty set.
	    """
	    return set(in_str)


	def minhash(str_a, str_b):
	    """Return the similarity of two strings as a score between 0 and 1.

	    Despite the name, no MinHash sketch is involved: the score is the exact
	    Jaccard index of the two strings' character sets, i.e.
	    ``len(A & B) / len(A | B)``.

	    When both strings are empty the index is undefined; in that case
	    ``'ZeroDivisionError'`` is printed and 0.0 is returned.
	    """
	    chars_a = spliteKeyWord(str_a)
	    chars_b = spliteKeyWord(str_b)
	    union = chars_a | chars_b
	    if not union:
	        # Both inputs are empty: report it and fall back to 0.0.
	        print('ZeroDivisionError')
	        return 0.0
	    return len(chars_a & chars_b) / float(len(union))


	if __name__ == "__main__":
	    # Script entry point: build the IPA frequency tables from the paired
	    # Lao/Thai corpus files (no cached statistics are assumed to exist yet).
	    analyse_by_IPA_statistic(
	        file_lo='../data/triple/data_lo.txt',
	        file_th='../data/triple/data_th.txt',
	        statistic_conclusion_exist=False,
	    )