import re


def generate_annotated_text(text, keyw_list):
    '''Render ``text`` as the source of an ``annotated_text(...)`` call.

    Every literal occurrence of a keyword becomes a ``("keyword", "")`` tuple
    and the text between occurrences becomes plain quoted strings. Keywords
    that directly follow each other, or are separated by exactly one space,
    are merged into a single tuple whose words are joined by one space.

    Keywords are matched literally (regex metacharacters have no special
    meaning). Empty keywords are ignored. When matches overlap, the earliest
    one wins, and at the same start position the longest keyword wins.

    :param text: str, the text to annotate
    :param keyw_list: list of str, keywords to highlight
    :return: str, the generated ``annotated_text(...)`` expression

    e.g.
        text = 'I like an apple. Do you like apples too?'
        keyw_list = ["like", "apple"]
        result = annotated_text("I ", ("like", ""), " an ", ("apple", ""), ". Do you ", ("like apple", ""), "s too?")
    '''
    def find_keyword_index(text, keyw_list):
        # Collect (start, keyword) for every literal occurrence of each keyword.
        found = []
        for keyword in keyw_list:
            if not keyword:
                # An empty pattern would match at every position of the text.
                continue
            # Escape so that characters such as '.', '+' or '(' match literally;
            # the slicing below relies on the match being len(keyword) long.
            pattern = re.escape(keyword)
            found.extend(
                (match.start(), keyword) for match in re.finditer(pattern, text)
            )
        return found

    # 1. Order matches by position; at the same position prefer the longest
    #    keyword so that "apple" is chosen over "app".
    sorted_indices = sorted(
        find_keyword_index(text, keyw_list),
        key=lambda item: (item[0], -len(item[1])),
    )

    # 2. Split the text around each keyword occurrence.
    parts = []
    last_index = 0
    for start_idx, word in sorted_indices:
        if start_idx < last_index:
            # Starts inside a span that is already annotated; emitting it
            # would duplicate part of the text.
            continue
        parts.append(f'"{text[last_index:start_idx]}", ("{word}", ""), ')
        last_index = start_idx + len(word)

    # 3. Append the remaining tail and close the call.
    parts.append(f'"{text[last_index:]}"' + ')')
    output = 'annotated_text(' + ''.join(parts)

    # 4. Merge consecutive keyword tuples: first those separated by a single
    #    space, then those with nothing between them.
    output = re.sub(r'", ""\), " ", \("', ' ', output)
    output = re.sub(r'", ""\), "", \("', ' ', output)
    return output