|
"""Code to augment the translated/untranslated passwords and create a dataset for the password translation task.""" |
|
|
|
import pandas as pd |
|
import random |
|
|
|
# Number of instruction rows (one prompt/answer example each) to generate.
N_SAMPLES = 10_000
|
|
|
def mutate_password_pair(pair):
    """Randomly apply the same formatting mutations to both passwords of a pair.

    Four mutations are considered in order, each independently with
    probability 0.2:

    1. Capitalise both passwords.
    2. Append the same random digit (0-9) to both.
    3. Append the same random symbol to both.
    4. Replace the first occurrence of one letter with its leetspeak digit
       in both passwords.

    Args:
        pair: A two-item sequence ``(input_password, output_password)`` of
            strings.

    Returns:
        A tuple with the two mutated passwords, or ``pair`` itself when no
        mutation was selected.
    """
    # Leetspeak candidates in priority order: (letter, replacement digit).
    leet_substitutions = (
        ("e", "3"), ("E", "3"),
        ("i", "1"), ("I", "1"),
        ("o", "0"), ("O", "0"),
        ("a", "4"), ("A", "4"),
        ("t", "7"), ("T", "7"),
    )

    if random.random() < 0.2:
        # NOTE: str.capitalize() also lower-cases every character after the first.
        pair = (pair[0].capitalize(), pair[1].capitalize())

    if random.random() < 0.2:
        digit = str(random.randint(0, 9))
        pair = (pair[0] + digit, pair[1] + digit)

    if random.random() < 0.2:
        symbol = random.choice(['!', '@', '#', '$', '%', '&', '*'])
        pair = (pair[0] + symbol, pair[1] + symbol)

    if random.random() < 0.2:
        for letter, digit in leet_substitutions:
            # Only substitute a letter that occurs in BOTH passwords;
            # otherwise the input would be mutated while the expected output
            # is not, and the pair would no longer share the same format.
            if letter in pair[0] and letter in pair[1]:
                pair = (
                    pair[0].replace(letter, digit, 1),
                    pair[1].replace(letter, digit, 1),
                )
                break

    return pair
|
|
|
def create_dataframes():
    """Build the instruction dataset for the password translation task.

    Reads three files from the current working directory:

    * ``original_train.txt`` (latin1) and ``translated_train.txt`` (utf-8),
      paired line by line as original/translated passwords;
    * ``untranslated.txt`` (latin1), passwords whose expected output is the
      password itself.

    For each of the ``N_SAMPLES`` rows, 8 translated pairs and 2 untranslated
    passwords are sampled, shuffled together, passed through
    ``mutate_password_pair`` and joined with newlines.

    Returns:
        A DataFrame with the columns ``instruction``, ``input`` and
        ``output`` and one row per generated sample.

    Raises:
        ValueError: Raised by pandas / ``random.sample`` when the two
            training files differ in line count, or when fewer than 8
            translated pairs or fewer than 2 untranslated passwords exist.
    """
    with open('original_train.txt', 'r', encoding='latin1') as file:
        original = [line.strip() for line in file]
    with open('translated_train.txt', 'r', encoding='utf-8') as file:
        translated = [line.strip() for line in file]
    with open('untranslated.txt', 'r', encoding='latin1') as file:
        untranslated_list = [line.strip() for line in file]

    # Parallel columns: row i holds line i of both training files.
    df_translated = pd.DataFrame({
        'original': original,
        'translated': translated,
    })

    # Collect plain dicts and build the DataFrame once at the end. Appending
    # to a DataFrame inside the loop copies it on every iteration and relied
    # on the private ``DataFrame._append`` method.
    rows = []
    for _ in range(N_SAMPLES):
        sampled_translated = df_translated.sample(8)
        pairs = list(zip(
            sampled_translated['original'].tolist(),
            sampled_translated['translated'].tolist(),
        ))

        # Untranslated passwords map to themselves.
        pairs.extend(
            (password, password)
            for password in random.sample(untranslated_list, 2)
        )

        random.shuffle(pairs)
        pairs = [mutate_password_pair(pair) for pair in pairs]
        shuffled_input, shuffled_output = zip(*pairs)

        rows.append({
            # The wording is part of the emitted dataset; keep it unchanged.
            'instruction': 'Translate this passwords while keeping the original format.',
            'input': "\n".join(shuffled_input),
            'output': "\n".join(shuffled_output),
        })

    return pd.DataFrame(rows, columns=['instruction', 'input', 'output'])
|
|
|
|
|
# Script body: generate the dataset, preview it, then persist it as CSV.
df_instructions = create_dataframes()

# Show the first five rows as a quick sanity check.
print(df_instructions.head(n=5))

# index=False keeps the row index out of the CSV file.
df_instructions.to_csv('password_translation_instructions.csv', index=False)
|
|