File size: 4,087 Bytes
00892f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
"""Code to augment the translated/untranslated passwords and create a dataset for the password translation task."""

import pandas as pd
import random

N_SAMPLES = 10000

def mutate_password_pair(pair):
    """Randomly apply the same password-style mutations to both items of a pair.

    Four mutations are tried in order, each independently with probability
    0.2:

    1. upper-case the first character (the rest of the string is untouched),
    2. append one random digit (0-9),
    3. append one random symbol from ``! @ # $ % & *``,
    4. replace the first occurrence of one letter with a look-alike digit
       (e/E -> 3, i/I -> 1, o/O -> 0, a/A -> 4, t/T -> 7).

    The digit, the symbol and the substituted letter are chosen once and
    applied to both strings so the two sides stay aligned.

    Args:
        pair: Two-item sequence of strings; ``pair[0]`` drives the choice of
            the letter to substitute in step 4.

    Returns:
        The mutated pair as a 2-tuple, or ``pair`` itself when no mutation
        was selected.
    """
    # Candidate (letter, look-alike digit) substitutions in priority order:
    # the first letter found in pair[0] is the one that gets replaced.
    leet_substitutions = (
        ("e", "3"), ("E", "3"),
        ("i", "1"), ("I", "1"),
        ("o", "0"), ("O", "0"),
        ("a", "4"), ("A", "4"),
        ("t", "7"), ("T", "7"),
    )

    # 20% of the time, capitalize the first letter.
    if random.random() < 0.2:
        # str.capitalize() would also lower-case every remaining character
        # and destroy the password's original casing, so only the first
        # character is upper-cased. Slicing keeps empty strings safe.
        pair = (
            pair[0][:1].upper() + pair[0][1:],
            pair[1][:1].upper() + pair[1][1:],
        )

    # 20% of the time, add a digit at the end.
    if random.random() < 0.2:
        digit = str(random.randint(0, 9))
        pair = (pair[0] + digit, pair[1] + digit)

    # 20% of the time, add a symbol at the end.
    if random.random() < 0.2:
        symbol = random.choice(['!', '@', '#', '$', '%', '&', '*'])
        pair = (pair[0] + symbol, pair[1] + symbol)

    # 20% of the time, replace a letter with a look-alike digit.
    if random.random() < 0.2:
        for letter, digit in leet_substitutions:
            if letter in pair[0]:
                # Replace only the first occurrence, in both strings. If
                # pair[1] does not contain the letter it is left unchanged.
                pair = (
                    pair[0].replace(letter, digit, 1),
                    pair[1].replace(letter, digit, 1),
                )
                break

    return pair

def create_dataframes():
    """Build the instruction dataset for the password translation task.

    Reads three line-oriented text files from the current working directory:

    * ``original_train.txt`` (latin1) and ``translated_train.txt`` (utf-8),
      which are paired line by line;
    * ``untranslated.txt`` (latin1), passwords whose expected output is the
      input itself.

    For each of the ``N_SAMPLES`` rows, 8 translated pairs and 2 untranslated
    passwords are drawn, shuffled together, passed through
    ``mutate_password_pair`` and joined with newlines into the ``input`` and
    ``output`` fields.

    Returns:
        pandas.DataFrame: one row per sample with the columns
        ``instruction``, ``input`` and ``output``.

    Raises:
        OSError: if one of the input files cannot be opened.
        ValueError: raised by pandas / ``random.sample`` when the two
            training files differ in length or there are fewer than 8 pairs
            or 2 untranslated passwords to draw from.
    """
    # Read the files, stripping surrounding whitespace from every line.
    with open('original_train.txt', 'r', encoding='latin1') as file:
        original = [line.strip() for line in file]
    with open('translated_train.txt', 'r', encoding='utf-8') as file:
        translated = [line.strip() for line in file]
    with open('untranslated.txt', 'r', encoding='latin1') as file:
        untranslated_list = [line.strip() for line in file]

    # Pair originals with translations; kept as a DataFrame so rows are drawn
    # with DataFrame.sample exactly as before.
    df_translated = pd.DataFrame({
        'original': original,
        'translated': translated,
    })

    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end: appending to a DataFrame inside the loop copies the whole
    # frame on every iteration and relied on the private ``_append`` method.
    rows = []

    # Generate N_SAMPLES instruction rows.
    for _ in range(N_SAMPLES):
        # Randomly pick 8 translated pairs.
        sampled_translated = df_translated.sample(8)
        original_samples = sampled_translated['original'].tolist()
        translated_samples = sampled_translated['translated'].tolist()

        # Randomly pick 2 untranslated passwords; they map to themselves.
        untranslated_samples = random.sample(untranslated_list, 2)

        # Zip inputs with outputs before shuffling so the pairing survives.
        combined_list = list(zip(
            original_samples + untranslated_samples,
            translated_samples + untranslated_samples,
        ))
        random.shuffle(combined_list)
        combined_list = [mutate_password_pair(pair) for pair in combined_list]

        rows.append({
            # NOTE: the prompt text is kept verbatim (including its grammar)
            # because it is part of the generated dataset.
            'instruction': 'Translate this passwords while keeping the original format.',
            'input': "\n".join(source for source, _ in combined_list),
            'output': "\n".join(target for _, target in combined_list),
        })

    # Explicit columns keep the schema stable even when no rows were produced.
    return pd.DataFrame(rows, columns=['instruction', 'input', 'output'])

# Build the instruction dataset from the input text files.
df_instructions = create_dataframes()

# Print the first five rows as a quick sanity check.
print(df_instructions.head(n=5))

# Persist the dataset as CSV, leaving out the index column.
df_instructions.to_csv(path_or_buf='password_translation_instructions.csv', index=False)