Spaces:

hkunlp
/

Binder

Runtime error

Binder / utils /matcher.py

Timothyxxx

Init

f6f97d8 almost 3 years ago

2.71 kB

	from fuzzywuzzy import fuzz
	import pandas as pd
	import string

	from utils.normalizer import str_normalize


	class Matcher(object):
	    """Fuzzy-match sentence n-grams or whole phrases against DataFrame cells.

	    Every comparison is ``fuzz.ratio(text, str(cell))``; a cell counts as a
	    match when that score is greater than or equal to the caller's threshold.
	    """

	    def __init__(self):
	        pass

	    def match_sentence_with_table(self, sent: str, df: pd.DataFrame, fuzz_threshold=100):
	        """Find table cells that match n-grams (5 tokens down to 1) of a sentence.

	        The sentence is passed through ``str_normalize``, stripped of leading
	        and trailing punctuation and split on whitespace before n-grams are
	        built. Cells in a column named ``'row_id'`` are never considered.

	        Args:
	            sent: The sentence to search for.
	            df: The table whose cells are compared, as ``str(cell)``.
	            fuzz_threshold: Minimum ``fuzz.ratio`` score for a match.

	        Returns:
	            A dict mapping each matched phrase to a list of
	            ``(cell_text, fuzz_score, (row_id, col_id))`` tuples sorted by
	            descending score. ``row_id`` is the index label yielded by
	            ``df.iterrows()`` and ``col_id`` is the positional column index.
	            Phrases contained in a longer matched phrase are dropped.
	        """
	        sent = str_normalize(sent)
	        sent = sent.strip(string.punctuation)
	        tokens = sent.split()

	        # Stringify the candidate cells once; they do not depend on the
	        # n-gram size, so there is no need to rescan the frame per size.
	        candidate_cells = []
	        for row_id, row in df.iterrows():
	            for col_id, cell in enumerate(row):
	                if df.columns[col_id] == 'row_id':
	                    continue
	                candidate_cells.append((str(cell), (row_id, col_id)))

	        phrase2matched_cells = dict()
	        for ngram in range(5, 0, -1):
	            # De-duplicate while keeping order so that an n-gram repeated in
	            # the sentence does not record the same cell match twice.
	            ngram_phrases = list(dict.fromkeys(self._create_ngram_list(tokens, ngram)))
	            if not ngram_phrases:
	                continue
	            for cell, position in candidate_cells:
	                for ngram_phrase in ngram_phrases:
	                    fuzz_score = fuzz.ratio(ngram_phrase, cell)
	                    if fuzz_score >= fuzz_threshold:
	                        phrase2matched_cells.setdefault(ngram_phrase, []).append(
	                            (cell, fuzz_score, position)
	                        )

	        # Remove non-longest phrases.
	        # NOTE: containment is a plain substring test, so a phrase is also
	        # dropped when it occurs inside a longer phrase without lining up
	        # with token boundaries.
	        phrases = list(phrase2matched_cells)
	        for phrase in phrases:
	            if any(phrase != other_phrase and phrase in other_phrase for other_phrase in phrases):
	                del phrase2matched_cells[phrase]

	        # Sort by fuzzy score, best match first.
	        for matched_cells in phrase2matched_cells.values():
	            matched_cells.sort(key=lambda x: x[1], reverse=True)

	        return phrase2matched_cells

	    def match_phrase_with_table(self, phrase: str, df: pd.DataFrame, fuzz_threshold=70):
	        """Find table cells whose text fuzzily matches a single phrase.

	        The phrase is compared as given (no normalization is applied here)
	        and every column is scanned, including one named ``'row_id'``.

	        Args:
	            phrase: The text to look for.
	            df: The table whose cells are compared, as ``str(cell)``.
	            fuzz_threshold: Minimum ``fuzz.ratio`` score for a match.

	        Returns:
	            A list of ``(cell_text, fuzz_score, (row_id, col_id))`` tuples
	            sorted by descending score. ``row_id`` is the index label yielded
	            by ``df.iterrows()`` and ``col_id`` is the positional column index.
	        """
	        matched_cells = []
	        for row_id, row in df.iterrows():
	            for col_id, cell in enumerate(row):
	                cell = str(cell)
	                fuzz_score = fuzz.ratio(phrase, cell)
	                if fuzz_score >= fuzz_threshold:
	                    matched_cells.append((cell, fuzz_score, (row_id, col_id)))
	        # Sort by fuzzy score, best match first.
	        matched_cells.sort(key=lambda x: x[1], reverse=True)
	        return matched_cells

	    def _create_ngram_list(self, input_list, ngram_num):
	        """Return every contiguous ``ngram_num``-token window as a joined string.

	        Args:
	            input_list: Sequence of string tokens.
	            ngram_num: Number of tokens per n-gram.

	        Returns:
	            A list of space-joined n-grams in left-to-right order. The list is
	            empty when ``input_list`` has fewer than ``ngram_num`` tokens, and
	            holds exactly one entry when it has exactly ``ngram_num`` tokens.
	        """
	        # Zipping the list against its own shifted copies yields one tuple per
	        # window and stops at the shortest copy, so inputs shorter than
	        # ``ngram_num`` naturally produce no windows.
	        shifted_views = [input_list[offset:] for offset in range(ngram_num)]
	        return [" ".join(window) for window in zip(*shifted_views)]