# LCM/utils/io_/remove_xx.py
def read_file(filename):
    """Read a tab-separated file of tagged tokens, dropping every sentence
    in which at least half of the tokens carry the 'XX' POS tag."""
    sentences = []
    sentence = []
    num_sentences_to_remove = 0
    num_sentences = 0
    num_tokens_to_remove = 0
    num_tokens = 0
    with open(filename, 'r') as file:
        lines = [line.strip() for line in file]
    # Sentinel blank line so the final sentence is flushed even when the
    # file does not end with one.
    lines.append('')
    for line in lines:
        if len(line) == 0:
            # Blank line: end of the current sentence. Skip if the buffer
            # is empty (e.g. consecutive blank lines).
            if len(sentence) == 0:
                continue
            xx_count = sum(1 for row in sentence if row[2] == 'XX')
            num_sentences += 1
            num_tokens += len(sentence)
            if xx_count / len(sentence) >= 0.5:
                # Drop the sentence: at least half of its POS tags are 'XX'.
                num_sentences_to_remove += 1
                num_tokens_to_remove += len(sentence)
            else:
                sentences.append(sentence)
            sentence = []
            continue
        # One token per line: index, word, POS tag, NER tag, dependency
        # head, and dependency arc label.
        idx, word, pos, ner, arc, arc_tag = line.split('\t')
        sentence.append((idx, word, pos, ner, arc, arc_tag))
    print("removed %d sentences out of %d sentences" % (num_sentences_to_remove, num_sentences))
    print("removed %d tokens out of %d tokens" % (num_tokens_to_remove, num_tokens))
    return sentences
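
# For reference, a sketch of the input format assumed by read_file, inferred
# from the parsing above (the column semantics are not documented in this
# file): six tab-separated columns per token -- index, word, POS tag, NER
# tag, dependency head, dependency arc label -- with a blank line between
# sentences. The token values here are invented for illustration:
#
#   1 <TAB> John  <TAB> NNP <TAB> B-PERSON <TAB> 2 <TAB> nsubj
#   2 <TAB> slept <TAB> XX  <TAB> O        <TAB> 0 <TAB> root
#
# Here 1 of 2 POS tags is 'XX' (50%), so read_file would drop this sentence.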
def write_file(filename, sentences):
    """Write sentences back out in the same tab-separated format, one token
    per line, with a blank line after each sentence."""
    with open(filename, 'w') as file:
        for sentence in sentences:
            for row in sentence:
                file.write('\t'.join(row) + '\n')
            file.write('\n')
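
# Note: write_file emits exactly the format read_file consumes (tab-joined
# columns, one token per line, blank line after each sentence), so the
# filtered output files can be re-read with read_file unchanged.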
# Map dataset names to the file-name prefixes used under data/.
dataset_dict = {'ontonotes': 'onto'}
datasets = ['ontonotes']
splits = ['test']
domains = ['all', 'wb']

for dataset in datasets:
    for domain in domains:
        for split in splits:
            print('dataset: %s, domain: %s, split: %s' % (dataset, domain, split))
            filename = 'data/' + dataset_dict[dataset] + '_pos_ner_dp_' + split + '_' + domain
            sentences = read_file(filename)
            write_filename = filename + '_without_xx'
            write_file(write_filename, sentences)
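
# Example: for dataset 'ontonotes', domain 'wb', split 'test', the loop above
# reads 'data/onto_pos_ner_dp_test_wb' and writes the filtered copy to
# 'data/onto_pos_ner_dp_test_wb_without_xx'.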