akhaliq's picture
akhaliq HF staff
add files
c80917c
raw
history blame
1.95 kB
import random
def repeat(text, n_max_gram=3, n_max_repeat=3):
"""repeat n-grams"""
tokens = text.split()
n_gram = random.randint(1, n_max_gram)
repeat_token_idx = random.randint(0, len(tokens) - n_gram)
repeated_tokens = tokens[repeat_token_idx:repeat_token_idx+n_gram]
n_repeat = random.randint(1, n_max_repeat)
for _ in range(n_repeat):
insert_idx = random.randint(0, len(tokens))
tokens = tokens[:insert_idx] + \
repeated_tokens + tokens[insert_idx:]
new_text = " ".join(tokens)
return new_text
def remove(text, n_max_gram=3):
"""remove n-grams"""
tokens = text.split()
n_gram = random.randint(1, n_max_gram)
remove_token_idx = random.randint(0, len(tokens) - n_gram)
tokens = tokens[:remove_token_idx] + tokens[remove_token_idx + n_gram:]
new_text = " ".join(tokens)
return new_text
def insert(text, vocab, n_max_tokens=3):
"""Insert tokens"""
tokens = text.split()
n_insert_token = random.randint(1, n_max_tokens)
for _ in range(n_insert_token):
insert_token_idx = random.randint(0, len(tokens) - 1)
insert_token = random.choice(vocab)
tokens = tokens[:insert_token_idx] + [insert_token] + tokens[insert_token_idx:]
new_text = " ".join(tokens)
return new_text
def swap(text, vocab, n_max_tokens=3):
"""Swap tokens"""
tokens = text.split()
n_swap_tokens = random.randint(1, n_max_tokens)
for _ in range(n_swap_tokens):
swap_token_idx = random.randint(0, len(tokens) - 1)
swap_token = random.choice(vocab)
while swap_token == tokens[swap_token_idx]:
swap_token = random.choice(vocab)
tokens[swap_token_idx] = swap_token
new_text = " ".join(tokens)
return new_text
def shuffle(text):
"""shuffle tokens"""
tokens = text.split()
random.shuffle(tokens)
new_text = " ".join(tokens)
return new_text