Spaces:

ofig
/

live-lm-critic

Runtime error

App Files Files Community

live-lm-critic / utils /text_utils.py

Olivia Figueira

Upload code with streamlit addition

b6e5241 about 2 years ago

raw

history blame contribute delete

No virus

2.08 kB

	import re
	from nltk import sent_tokenize, word_tokenize
	from nltk.tokenize.treebank import TreebankWordDetokenizer
	# Module-level Treebank detokenizer instance; detokenize_sent below calls
	# its detokenize() method on each sentence's whitespace-split tokens.
	detokenizer = TreebankWordDetokenizer()

	def handle_dounble_quote(sent):
	    """Normalize the spacing around double quotes in ``sent``.

	    Double quotes are treated as alternating opening/closing marks, starting
	    with an opening one. An opening quote is emitted as ``' "'`` and any
	    spaces that directly follow it are dropped; a closing quote is emitted
	    as ``'" '``. In both cases trailing whitespace already accumulated
	    before the quote is stripped first.

	    Args:
	        sent: The text to process.

	    Returns:
	        The processed text, stripped at both ends and with every run of
	        spaces collapsed to a single space.
	    """
	    out = ''
	    opening_next = True   # True when the next '"' is an opening quote
	    skip_spaces = False   # True right after an opening quote
	    for ch in sent:
	        if ch != '"':
	            if skip_spaces and ch == ' ':
	                # Drop spaces between an opening quote and the quoted text.
	                continue
	            skip_spaces = False
	            out += ch
	            continue

	        if opening_next:
	            out = out.rstrip() + ' "'
	        else:
	            out = out.rstrip() + '" '
	        # Only an opening quote swallows the spaces that follow it.
	        skip_spaces = opening_next
	        opening_next = not opening_next

	    return re.sub(r'[ ]+', ' ', out.strip())

	def postprocess_space(sent):
	    """Remove stray spaces around punctuation and detached clitics.

	    Runs of spaces before ``.``, ``,``, ``!``, ``?`` and ``)`` are removed,
	    as are runs of spaces after ``(``. A detached ``" 's"`` or ``"n 't"`` is
	    re-attached to the preceding text when it is followed by a space or by
	    one of ``.``, ``,``, ``!``, ``?``.

	    Args:
	        sent: The text to clean up.

	    Returns:
	        The text with the spacing fixes applied.
	    """
	    # Spaces directly before closing punctuation / after an opening paren.
	    sent = re.sub(r'[ ]+([.,!?)])', r'\1', sent)
	    sent = re.sub(r'\([ ]+', '(', sent)
	    # In Python's `re`, alternation is a bare `|`; an escaped `\|` is a
	    # literal pipe character and would never match ordinary text. A
	    # character class expresses the "space or punctuation" follower.
	    sent = re.sub(r" 's([ .,!?])", r"'s\1", sent)
	    sent = re.sub(r"n 't([ .,!?])", r"n't\1", sent)
	    return sent

	def detokenize_sent(sent):
	    """Turn whitespace-tokenized text back into naturally spaced text.

	    Steps, in order:
	      1. Rewrite ``"' s "`` as ``"'s "``.
	      2. If the number of standalone ``'`` tokens is even, replace each of
	         them with ``"``; then re-join the tokens with single spaces.
	      3. Split into sentences with ``sent_tokenize`` and, per sentence, run
	         the Treebank detokenizer followed by ``handle_dounble_quote``.
	      4. Join the sentences with single spaces and apply
	         ``postprocess_space``.

	    Args:
	        sent: The tokenized text.

	    Returns:
	        The detokenized text.
	    """
	    # Clean raw sent
	    sent = re.sub(r'\' s ', '\'s ', sent)
	    tokens = sent.split()
	    lone_quote_count = sum(1 for tok in tokens if tok == "'")
	    if lone_quote_count % 2 == 0:
	        # An even count is taken to mean the lone apostrophes pair up as
	        # quotation marks, so they are converted to double quotes.
	        tokens = [('"' if tok == "'" else tok) for tok in tokens]
	    sent = ' '.join(tokens)

	    pieces = []
	    for piece in sent_tokenize(sent):
	        piece = detokenizer.detokenize(piece.split())
	        fixed = handle_dounble_quote(piece)
	        if fixed == -1:
	            # Guard for a -1 sentinel; handle_dounble_quote as defined in
	            # this file always returns a string, so this is defensive only.
	            print ('unbalanced double quote')
	            print (piece)
	        else:
	            piece = fixed
	        pieces.append(piece)

	    return postprocess_space(' '.join(pieces))