import glob
import json

import pandas as pd


def tag_document(keywords, tokens):
    """Tag `tokens` with BIO labels for every occurrence of each keyword."""
    # Initialize the tag list with all O's
    tags = ['O'] * len(tokens)
    # Loop over the keywords and tag each occurrence in the document
    for keyword in keywords:
        # Split the keyword into words
        keyword_words = keyword.split()
        # Skip empty keyphrases to avoid an IndexError below
        if not keyword_words:
            continue
        # Loop over the words in the document
        for i in range(len(tokens)):
            # If the current word matches the first word of the keyword...
            if tokens[i] == keyword_words[0]:
                match = True
                # ...check whether the rest of the keyword matches the following words
                for j in range(1, len(keyword_words)):
                    if i + j >= len(tokens) or tokens[i + j] != keyword_words[j]:
                        match = False
                        break
                # If the whole keyword matches, tag it as B-KEYWORD and I-KEYWORD
                if match:
                    tags[i] = 'B-KEYWORD'
                    for j in range(1, len(keyword_words)):
                        tags[i + j] = 'I-KEYWORD'
    return tags


def create_tner_dataset(all_tags, all_tokens, output_file_addr):
    """Append the tagged documents in CoNLL style: one token<TAB>tag per line,
    with a blank line between documents."""
    with open(output_file_addr, 'a+') as output_f:
        for tags, tokens in zip(all_tags, all_tokens):
            for tag, tok in zip(tags, tokens):
                output_f.write('\t'.join([tok, tag]))
                output_f.write('\n')
            output_f.write('\n')


if __name__ == '__main__':
    data_df = pd.read_csv('truncated_wiki_plus_shuffled_41203.csv')
    id2document = data_df.set_index('id')['truncated_text_300'].to_dict()

    # Tag documents!
    print('------------------ tag documents --------------------')
    all_tags = []
    all_tokens = []
    for tagged_data_addr in glob.iglob('./tagged_data*'):
        with open(tagged_data_addr) as tagged_f:
            for line in tagged_f:
                item = json.loads(line)
                if isinstance(item['keyphrases'], list):
                    keywords = item['keyphrases']
                    document = id2document[item['id']]
                    tokens = document.split()
                    tags = tag_document(keywords, tokens)
                    assert len(tokens) == len(tags)
                    all_tags.append(tags)
                    all_tokens.append(tokens)
                    print(len(keywords), len(tags), len(tokens),
                          len([t for t in tags if t[0] == 'B']))

    nerda_dataset = {'sentences': all_tokens, 'tags': all_tags}
    with open('nerda_dataset.json', 'w') as f:
        json.dump(nerda_dataset, f)

    # create_tner_dataset(all_tags, all_tokens, output_file_addr='./sample_train.conll')
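
# --- Quick sanity check for tag_document (a minimal sketch; the example tokens
# --- and keyphrase below are made up for illustration, not taken from the data).
if __name__ == '__main__':
    demo_tokens = 'we study machine learning every day'.split()
    demo_tags = tag_document(['machine learning'], demo_tokens)
    # The two-word keyphrase should come out as a B-KEYWORD / I-KEYWORD span.
    assert demo_tags == ['O', 'O', 'B-KEYWORD', 'I-KEYWORD', 'O', 'O']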