from parsinorm import General_normalization
import re
def get_ne_from_iob_output(sentences, tags_conf):
    """Extract named-entity spans from IOB-tagged tokens and return them sorted by confidence."""
    sentences = sentences[0]
    tags = tags_conf[0][0]
    confs = tags_conf[1][0]
    seen_b = False
    keywords = {}
    new_token = []
    begin_index = 0
    for index, (tok, tag) in enumerate(zip(sentences, tags)):
        # Inside an entity: extend the current span.
        if tag[0] == 'I' and seen_b:
            new_token.append(tok)
        # Start of a new entity: flush the previous span, then open a new one.
        if tag[0] == 'B':
            if new_token:
                keywords[' '.join(new_token)] = confs[begin_index]
                new_token = []
            new_token.append(tok)
            begin_index = index
            seen_b = True
        # Outside any entity: flush the span collected so far.
        if tag[0] == 'O':
            if new_token:
                keywords[' '.join(new_token)] = confs[begin_index]
                new_token = []
            seen_b = False
    # Flush an entity that runs up to the end of the sentence.
    if new_token:
        keywords[' '.join(new_token)] = confs[begin_index]
    # print('keywords before sort: ', list(keywords.keys()))
    # Sort entities by the confidence of their first token, highest first.
    sorted_keywords = sorted(keywords.keys(), key=lambda kw: keywords[kw], reverse=True)
    print('keywords after sort: ', sorted_keywords)
    return sorted_keywords
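
# A minimal usage sketch (the batched shapes below are assumptions about the tagger's
# output, not taken from this repository): `sentences` is a batch of token lists and
# `tags_conf` pairs a batch of IOB tag lists with per-token confidence lists.
#
#   tokens = [['تهران', 'پایتخت', 'ایران', 'است']]
#   tags_conf = ([['B-LOC', 'O', 'B-LOC', 'O']], [[0.98, 0.1, 0.95, 0.1]])
#   get_ne_from_iob_output(tokens, tags_conf)  # -> ['تهران', 'ایران']
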
def fuzzy_subword_match(key, words):
    """Return the index of the first phrase in `words` that strictly contains `key`, or -1 if none does."""
    for index, w in enumerate(words):
        # A match requires `key` to be shorter (in words) than `w` and to appear inside it.
        if (len(key.split()) < len(w.split())) and key in w:
            return index
    return -1
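
# Hypothetical examples (the inputs are illustrative only):
#
#   fuzzy_subword_match('دانشگاه تهران', ['شهر مشهد', 'دانشگاه تهران مرکز'])  # -> 1
#   fuzzy_subword_match('مشهد', ['مشهد'])  # -> -1, equal length is not a sub-phrase match
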
# Normalize raw text with parsinorm's General_normalization chain.
def normalize(txt):
    general_normalization = General_normalization()
    txt = general_normalization.alphabet_correction(txt)
    txt = general_normalization.semi_space_correction(txt)
    txt = general_normalization.english_correction(txt)
    txt = general_normalization.html_correction(txt)
    txt = general_normalization.arabic_correction(txt)
    txt = general_normalization.punctuation_correction(txt)
    txt = general_normalization.specials_chars(txt)
    txt = general_normalization.remove_emojis(txt)
    txt = general_normalization.number_correction(txt)
    txt = general_normalization.remove_not_desired_chars(txt)
    txt = general_normalization.remove_repeated_punctuation(txt)
    # Collapse newlines, tabs, carriage returns and repeated spaces into single spaces.
    return ' '.join(txt.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').split())
def remove_puncs(txt):
    return re.sub(r'[!?،().]', '', txt)
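
# A hedged end-to-end sketch (the sample strings are illustrative, not from the project):
# clean raw text before it is fed to the tagger, then strip punctuation from an extracted phrase.
#
#   clean = normalize('این   متن\nتستی است!!')
#   remove_puncs('سلام، دنیا!')  # -> 'سلام دنیا'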