# Paste artifact from the original source (not Python code), kept for the record:
#   Spaces:
#   Runtime error
#   Runtime error
import glob
import json

import pandas as pd
def tag_document(keywords, tokens):
    """Produce BIO tags marking every occurrence of each keyphrase in a document.

    Parameters
    ----------
    keywords : iterable of str
        Keyphrases; each is whitespace-split and matched as a contiguous
        run of tokens.
    tokens : list of str
        The document's tokens (whitespace-split text).

    Returns
    -------
    list of str
        One tag per token: 'B-KEYWORD' on the first token of a match,
        'I-KEYWORD' on the remaining tokens of that match, 'O' elsewhere.
        Later keywords overwrite tags from earlier, overlapping matches.
    """
    # Start with every token untagged.
    tags = ['O'] * len(tokens)
    for keyword in keywords:
        keyword_words = keyword.split()
        # Guard against empty/whitespace-only keyphrases: the original code
        # indexed keyword_words[0] unconditionally and raised IndexError here.
        if not keyword_words:
            continue
        span = len(keyword_words)
        # Only positions where the whole phrase fits can match (the original
        # rejected overruns via an explicit i+j >= len(tokens) check).
        for i in range(len(tokens) - span + 1):
            if tokens[i:i + span] == keyword_words:
                tags[i] = 'B-KEYWORD'
                for j in range(1, span):
                    tags[i + j] = 'I-KEYWORD'
    return tags
def create_tner_dataset(all_tags, all_tokens, output_file_addr):
    """Append sentences to ``output_file_addr`` in CoNLL (token<TAB>tag) format.

    Parameters
    ----------
    all_tags : list of list of str
        BIO tags, one inner list per sentence.
    all_tokens : list of list of str
        Tokens, parallel to ``all_tags``.
    output_file_addr : str
        Path to the output file. Opened in 'a+' (append) mode, matching the
        original behavior, so repeated calls accumulate sentences.
    """
    # Context manager guarantees the handle is closed — the original opened
    # the file and never closed it (resource leak / possibly unflushed data).
    with open(output_file_addr, 'a+') as output_f:
        for tags, tokens in zip(all_tags, all_tokens):
            for tag, tok in zip(tags, tokens):
                output_f.write('\t'.join([tok, tag]))
                output_f.write('\n')
            # Blank line separates sentences in the CoNLL format.
            output_f.write('\n')
if __name__ == '__main__':
    # Load the corpus and build an id -> truncated-document lookup table.
    data_df = pd.read_csv('truncated_wiki_plus_shuffled_41203.csv')
    id2document = data_df.set_index('id')['truncated_text_300'].to_dict()
    # tag documents!
    print('------------------ tag documents --------------------')
    all_tags = []
    all_tokens = []
    for tagged_data_addr in glob.iglob('./tagged_data*'):
        # Each tagged_data* file is JSON-lines: one object per line with at
        # least 'id' and 'keyphrases' keys (as read by the code below).
        # 'with' closes the handle — the original left every file open.
        with open(tagged_data_addr) as tagged_f:
            for line in tagged_f:
                item = json.loads(line)
                # isinstance is the idiomatic type check (was type(...) == list);
                # entries whose keyphrases are not a list are skipped, as before.
                if not isinstance(item['keyphrases'], list):
                    continue
                keywords = item['keyphrases']
                document = id2document[item['id']]
                tokens = document.split()
                tags = tag_document(keywords, tokens)
                # Explicit raise instead of assert: asserts are stripped
                # under `python -O`, silently disabling this sanity check.
                if len(tokens) != len(tags):
                    raise ValueError(
                        'token/tag length mismatch for id %s' % item['id'])
                all_tags.append(tags)
                all_tokens.append(tokens)
                print(len(keywords), len(tags), len(document.split()),
                      len([t for t in tags if t[0] == 'B']))
    nerda_dataset = {'sentences': all_tokens, 'tags': all_tags}
    with open('nerda_dataset.json', 'w+') as f:
        json.dump(nerda_dataset, f)
    # create_tner_dataset(all_tags, all_tokens, output_file_addr='./sample_train.conll')