# persian-keyphrase-extraction / ner_data_construction.py
import glob
import json

import pandas as pd


def tag_document(keywords, tokens):
    """Tag `tokens` with BIO labels for every keyword that appears verbatim."""
    # Initialize the tags list with all O's
    tags = ['O'] * len(tokens)
    # Loop over the keywords and tag the document
    for keyword in keywords:
        # Split the keyword into words
        keyword_words = keyword.split()
        # Skip empty keywords, which would otherwise raise an IndexError below
        if not keyword_words:
            continue
        # Loop over the words in the document
        for i in range(len(tokens)):
            # If the current word matches the first word of the keyword
            if tokens[i] == keyword_words[0]:
                match = True
                # Check whether the remaining keyword words match the following tokens
                for j in range(1, len(keyword_words)):
                    if i + j >= len(tokens) or tokens[i + j] != keyword_words[j]:
                        match = False
                        break
                # On a full match, tag the span as B-KEYWORD / I-KEYWORD
                if match:
                    tags[i] = 'B-KEYWORD'
                    for j in range(1, len(keyword_words)):
                        tags[i + j] = 'I-KEYWORD'
    return tags
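
# Illustrative example (not part of the original pipeline): for a toy Persian
# document and a single two-word keyphrase, tag_document returns BIO labels
# aligned one-to-one with the whitespace tokens, e.g.
#
#     tag_document(['استخراج کلیدواژه'], 'استخراج کلیدواژه از متن'.split())
#     -> ['B-KEYWORD', 'I-KEYWORD', 'O', 'O']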


def create_tner_dataset(all_tags, all_tokens, output_file_addr):
    """Append token/tag pairs in CoNLL format, one blank line between documents."""
    with open(output_file_addr, 'a+') as output_f:
        for tags, tokens in zip(all_tags, all_tokens):
            for tag, tok in zip(tags, tokens):
                line = '\t'.join([tok, tag])
                output_f.write(line)
                output_f.write('\n')
            output_f.write('\n')
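
# Illustrative output layout (inferred from the writes above): each document
# becomes a block of token<TAB>tag lines, with a blank line between blocks, e.g.
#
#     استخراج\tB-KEYWORD
#     کلیدواژه\tI-KEYWORD
#     از\tO
#     متن\tO
#     (blank line)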


if __name__ == '__main__':
    data_df = pd.read_csv('truncated_wiki_plus_shuffled_41203.csv')
    id2document = data_df.set_index('id')['truncated_text_300'].to_dict()

    # Tag documents!
    print('------------------ tag documents --------------------')
    all_tags = []
    all_tokens = []
    for tagged_data_addr in glob.iglob('./tagged_data*'):
        with open(tagged_data_addr) as tagged_f:
            for line in tagged_f:
                item = json.loads(line)
                if isinstance(item['keyphrases'], list):
                    keywords = item['keyphrases']
                    document = id2document[item['id']]
                    tokens = document.split()
                    tags = tag_document(keywords, tokens)
                    assert len(tokens) == len(tags)
                    all_tags.append(tags)
                    all_tokens.append(tokens)
                    print(len(keywords), len(tags), len(tokens),
                          len([t for t in tags if t[0] == 'B']))

    nerda_dataset = {'sentences': all_tokens, 'tags': all_tags}
    with open('nerda_dataset.json', 'w+') as f:
        json.dump(nerda_dataset, f)
    # create_tner_dataset(all_tags, all_tokens, output_file_addr='./sample_train.conll')
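
# Sketch (assumption: the 'sentences'/'tags' layout targets the NERDA library's
# expected training-data format). The saved file can be reloaded later, e.g.:
#
#     with open('nerda_dataset.json') as f:
#         data = json.load(f)
#     # data['sentences'][0] is a token list, data['tags'][0] its BIO labels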