File size: 4,596 Bytes
75a65be
 
 
2e3bc39
75a65be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e3bc39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import os
import re 
import json
import pickle
import itertools

import pandas as pd
from simplemma import lemmatize


def load_json(path):
    """
    Load gazetteers (or any other JSON document) from a file
    :param path: path to the JSON file
    :return: the deserialized JSON content (a dict for gazetteer files)
    """
    # JSON is UTF-8 by specification; without an explicit encoding, open() falls
    # back to the locale default and garbles non-ASCII entries on some platforms.
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data


def save_json(data, path):
    """
    Save gazetteers (or any other JSON-serializable object) to a file
    :param data: the object to serialize, e.g. a dict of gazetteers
    :param path: path to the output file
    :raises TypeError: if data contains values JSON cannot represent (e.g. sets)
    """
    # Serialize first: if this raises, the target file has not been opened yet,
    # so an existing file is not truncated or left half-written.
    serialized = json.dumps(data, indent=4)
    with open(path, 'w', encoding='utf-8') as file:
        file.write(serialized)

def merge_gazetteers(*gazetteers):
    """
    Combine any number of gazetteer dictionaries into one.

    Args:
        *gazetteers: dictionaries mapping a key to a set of entries.

    Returns:
        dict: A new dictionary in which every key holds the union of the sets
        found under that key across all inputs. The input sets are not modified.
    """
    combined = {}
    for gazetteer in gazetteers:
        for label, entries in gazetteer.items():
            if label not in combined:
                # First sighting of this key: store a copy so later unions
                # never touch the caller's set.
                combined[label] = entries.copy()
                continue
            combined[label] |= entries
    return combined


####################################################################################################
### PREPROCESSING OF GAZETTEERS  ###################################################################
####################################################################################################

def remove_all_brackets(text):
    """
    Delete every bracketed span from text.

    A span starts at any of ``(``, ``{``, ``[`` and ends at the nearest following
    ``)``, ``}`` or ``]`` (the closing character need not match the opening one).
    Surrounding whitespace is left as is.
    """
    bracketed_span = re.compile(r'[\(\{\[].*?[\)\}\]]')
    return bracketed_span.sub('', text)


def lemmatizing(x):
    """
    Return the lemma of a single token using simplemma with lang="cs".

    The empty string is returned unchanged without calling the lemmatizer.
    """
    return "" if x == "" else lemmatize(x, lang="cs")


def multi_lemmatizing(x):
    """
    Lemmatize a phrase token by token.

    The phrase is split on single spaces, each piece is passed through
    ``lemmatizing``, and the results are re-joined with single spaces.
    Leading and trailing whitespace is stripped from the result.
    """
    lemmas = [lemmatizing(token) for token in x.split(" ")]
    return " ".join(lemmas).strip()


def build_reverse_dictionary(dictionary, apply_lemmatizing=False):
    """
    Invert a key -> values mapping into a value -> key lookup.

    :param dictionary: dict mapping each key to an iterable of values
    :param apply_lemmatizing: when true, the lemma of each value (if it differs
        from the value itself) is also mapped to the same key
    :return: dict mapping every value (and optionally its lemma) to its key;
        when a value occurs under several keys, the key processed last wins
    """
    lookup = {}
    for label, entries in dictionary.items():
        for entry in entries:
            lookup[entry] = label
            if not apply_lemmatizing:
                continue
            lemma = lemmatizing(entry)
            if lemma != entry:
                lookup[lemma] = label
    return lookup


def split_gazetteers_for_single_token_match(gazetteers):
    """
    Break every gazetteer entry into its space-separated tokens.

    :param gazetteers: dict mapping a key to an iterable of (multi-word) entries
    :return: new dict mapping each key to the set of tokens longer than two
        characters that occur in its entries
    """
    return {
        label: {
            token
            for entry in entries
            for token in entry.split(" ")
            if len(token) > 2
        }
        for label, entries in gazetteers.items()
    }


def preprocess_gazetteers(gazetteers, config):
    """
    Apply the preprocessing steps selected in config to a gazetteer dictionary.

    Steps, in order:
      1. config["remove_brackets"]: strip bracketed spans from every entry and
         drop entries that end up with two characters or fewer.
      2. config["split_person"]: add the individual tokens (longer than two
         characters) of the "per" entries to the "per" gazetteer. Skipped when
         there is no "per" key.
      3. config["techniq_for_matching"] == "single": replace every gazetteer by
         the set of its single tokens; with config["lemmatize"] also add each
         token's lemma. Otherwise, with config["lemmatize"], add the
         token-wise lemmatized form of every entry longer than two characters
         (shorter entries are dropped in that case).
      4. config["remove_numeric"]: drop purely numeric entries.

    :param gazetteers: dict mapping a key to an iterable (set or list) of entries;
        it is not modified
    :param config: dict providing the keys named above
    :return: new dict mapping each key to a list of preprocessed entries
    """
    # Shallow copy: every step below rebinds keys to freshly built collections,
    # so neither the caller's dict nor its value collections are mutated.
    processed = dict(gazetteers)

    if config["remove_brackets"]:
        for label, values in processed.items():
            # Clean each entry once, then filter on the cleaned form.
            cleaned = (remove_all_brackets(value).strip() for value in values)
            processed[label] = {value for value in cleaned if len(value) > 2}

    if config["split_person"] and "per" in processed:
        names = processed["per"]
        name_tokens = {
            token
            for name in names
            for token in name.split(" ")
            if len(token) > 2
        }
        # Build a new set instead of calling .update(): values loaded from JSON
        # are lists, which have no .update().
        processed["per"] = set(names) | name_tokens

    if config["techniq_for_matching"] == "single":
        processed = split_gazetteers_for_single_token_match(processed)
        if config["lemmatize"]:
            for label, values in processed.items():
                processed[label] = {
                    form
                    for value in values
                    if len(value) > 2
                    for form in (value, lemmatizing(value))
                }
    elif config["lemmatize"]:
        for label, values in processed.items():
            processed[label] = {
                form
                for value in values
                if len(value) > 2
                for form in (value, multi_lemmatizing(value))
            }

    if config["remove_numeric"]:
        for label, values in processed.items():
            processed[label] = {value for value in values if not value.isnumeric()}

    # Lists so that the result is directly JSON-serializable.
    for label, values in processed.items():
        processed[label] = list(values)
    return processed


def load_json_as_pickle(json_path, pickle_path):
    """
    Load data from a JSON file and save it as a pickle file.

    Parameters:
        json_path (str): Path to the JSON file (read as UTF-8).
        pickle_path (str): Path where the pickle file will be saved;
            an existing file at this path is overwritten.
    """
    # Load data from the JSON file. The encoding is explicit because the
    # locale default is not UTF-8 on every platform.
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Save data to a pickle file
    with open(pickle_path, 'wb') as file:
        pickle.dump(data, file)

    print(f"Data from {json_path} has been loaded and saved as a pickle file at {pickle_path}.")