File size: 880 Bytes
0e87828
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/usr/bin/env python3
import os
import json
# Directory that is scanned for vocabulary files.
folder_path = "./vocabs"

# Collected vocabularies; starts empty and is filled in by the script below.
all_dict = dict()

def parse_file(filename):
    """Build a token-to-id mapping from a whitespace-delimited vocab file.

    The first whitespace-separated field of every non-blank line is taken
    as a token; any remaining fields on the line are ignored. Ids 0-3 are
    reserved for the special tokens ``<pad>``, ``<s>``, ``</s>`` and
    ``<unk>``. Tokens read from the file receive consecutive ids starting
    at 4, in order of first appearance.

    Args:
        filename: Path of the vocabulary file to read (decoded as UTF-8).

    Returns:
        A dict mapping each token to its integer id.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    dictionary = {
        "</s>": 2,
        "<pad>": 0,
        "<s>": 1,
        "<unk>": 3,
    }

    # Decode explicitly as UTF-8 so non-ASCII tokens are read the same way
    # regardless of the platform's default locale encoding.
    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line: nothing to register.
                continue
            key = fields[0]
            # First occurrence wins. This keeps the reserved special-token
            # ids intact when the file lists them itself, and keeps ids
            # contiguous when a token is repeated.
            if key not in dictionary:
                # Every entry so far holds a distinct id in 0..len-1, so the
                # current size is the next free id (4 for the first token).
                dictionary[key] = len(dictionary)

    return dictionary

# Parse every regular file found directly inside the vocab folder. Each
# vocabulary is stored under the part of the file name before its first dot.
for filename in os.listdir(folder_path):
    filepath = os.path.join(folder_path, filename)
    lang = filename.partition(".")[0]
    if not os.path.isfile(filepath):
        # Sub-directories and other non-file entries are ignored.
        continue
    all_dict[lang] = parse_file(filepath)


# Destination of the combined JSON document.
output_path = "vocab_1.json"

# Serialize all vocabularies at once, with sorted keys and 4-space indent.
with open(output_path, 'w') as output_file:
    serialized = json.dumps(all_dict, indent=4, sort_keys=True)
    output_file.write(serialized)