# persian-keyphrase-extraction / ner_data_construction.py
import glob
import json

import pandas as pd


def tag_document(keywords, tokens):
    """Tag `tokens` with BIO labels for every keyword that appears verbatim."""
    # Initialize the tags list with all O's
    tags = ['O'] * len(tokens)
    # Loop over the keywords and tag the document
    for keyword in keywords:
        # Split the keyword into words
        keyword_words = keyword.split()
        # Skip empty keywords, which would otherwise raise an IndexError below
        if not keyword_words:
            continue
        # Loop over the words in the document
        for i in range(len(tokens)):
            # If the current word matches the first word of the keyword
            if tokens[i] == keyword_words[0]:
                match = True
                # Check whether the remaining keyword words match the following tokens
                for j in range(1, len(keyword_words)):
                    if i + j >= len(tokens) or tokens[i + j] != keyword_words[j]:
                        match = False
                        break
                # On a full match, tag the span as B-KEYWORD / I-KEYWORD
                if match:
                    tags[i] = 'B-KEYWORD'
                    for j in range(1, len(keyword_words)):
                        tags[i + j] = 'I-KEYWORD'
    return tags
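
# Illustrative example (not part of the original pipeline): for a toy Persian
# document and a single two-word keyphrase, tag_document returns BIO labels
# aligned one-to-one with the whitespace tokens, e.g.
#
#     tag_document(['استخراج کلیدواژه'], 'استخراج کلیدواژه از متن'.split())
#     -> ['B-KEYWORD', 'I-KEYWORD', 'O', 'O']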


def create_tner_dataset(all_tags, all_tokens, output_file_addr):
    """Append token/tag pairs in CoNLL format, one blank line between documents."""
    with open(output_file_addr, 'a+') as output_f:
        for tags, tokens in zip(all_tags, all_tokens):
            for tag, tok in zip(tags, tokens):
                line = '\t'.join([tok, tag])
                output_f.write(line)
                output_f.write('\n')
            output_f.write('\n')
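
# Illustrative output layout (inferred from the writes above): each document
# becomes a block of token<TAB>tag lines, with a blank line between blocks, e.g.
#
#     استخراج\tB-KEYWORD
#     کلیدواژه\tI-KEYWORD
#     از\tO
#     متن\tO
#     (blank line)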


if __name__ == '__main__':
    data_df = pd.read_csv('truncated_wiki_plus_shuffled_41203.csv')
    id2document = data_df.set_index('id')['truncated_text_300'].to_dict()

    # Tag documents!
    print('------------------ tag documents --------------------')
    all_tags = []
    all_tokens = []
    for tagged_data_addr in glob.iglob('./tagged_data*'):
        with open(tagged_data_addr) as tagged_f:
            for line in tagged_f:
                item = json.loads(line)
                if isinstance(item['keyphrases'], list):
                    keywords = item['keyphrases']
                    document = id2document[item['id']]
                    tokens = document.split()
                    tags = tag_document(keywords, tokens)
                    assert len(tokens) == len(tags)
                    all_tags.append(tags)
                    all_tokens.append(tokens)
                    print(len(keywords), len(tags), len(tokens),
                          len([t for t in tags if t[0] == 'B']))

    nerda_dataset = {'sentences': all_tokens, 'tags': all_tags}
    with open('nerda_dataset.json', 'w+') as f:
        json.dump(nerda_dataset, f)
    # create_tner_dataset(all_tags, all_tokens, output_file_addr='./sample_train.conll')
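
# Sketch (assumption: the 'sentences'/'tags' layout targets the NERDA library's
# expected training-data format). The saved file can be reloaded later, e.g.:
#
#     with open('nerda_dataset.json') as f:
#         data = json.load(f)
#     # data['sentences'][0] is a token list, data['tags'][0] its BIO labels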