# Spaces: bettystr / NerRoB-czech (status: Sleeping) - File size: 4,596 Bytes

import os
import re 
import json
import pickle
import itertools

import pandas as pd
from simplemma import lemmatize


def load_json(path):
    """
    Load a JSON document (typically a gazetteer dictionary) from a file.

    The file is always decoded as UTF-8 instead of the platform's locale
    encoding, so entries containing diacritics are read identically on
    every operating system.

    :param path: path to the JSON file
    :return: the deserialized JSON content (a dict for gazetteer files)
    """
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)


def save_json(data, path):
    """
    Serialize ``data`` to a JSON file, indented by four spaces.

    :param data: JSON-serializable object to store (e.g. a dict of gazetteers)
    :param path: path of the file to write; it is opened in 'w' mode
    """
    with open(path, 'w') as sink:
        json.dump(obj=data, fp=sink, indent=4)

def merge_gazetteers(*gazetteers):
    """
    Combine any number of gazetteer dictionaries into one dictionary.

    The first value seen for a key is copied (so the inputs are not
    modified); values found later under the same key are merged into that
    copy with the in-place union operator ``|=``.

    Returns:
        dict: A dictionary holding every key of the inputs, each mapped to
        the union of the values stored under that key.
    """
    combined = {}
    # Walk the (key, values) pairs of all inputs in argument order.
    all_pairs = itertools.chain.from_iterable(gaz.items() for gaz in gazetteers)
    for key, entries in all_pairs:
        if key not in combined:
            # First occurrence: keep a private copy of the caller's container.
            combined[key] = entries.copy()
            continue
        combined[key] |= entries
    return combined


####################################################################################################
### PREPROCESSING OF GAZETTEERS  ###################################################################
####################################################################################################

# Shortest span that opens with (, { or [ and closes with ), } or ].
# The opener and closer kinds are not required to match.
_BRACKETED_SPAN = re.compile(r'[\(\{\[].*?[\)\}\]]')


def remove_all_brackets(text):
    """
    Delete every bracketed span (brackets included) from ``text``.

    :param text: string to clean
    :return: ``text`` with each bracketed span removed; the whitespace
        around a removed span is left untouched
    """
    return _BRACKETED_SPAN.sub('', text)


def lemmatizing(x):
    """
    Lemmatize a single token with simplemma using the ``"cs"`` language code.

    :param x: token to lemmatize
    :return: the value returned by ``lemmatize``; an empty string is
        returned as-is without calling the lemmatizer
    """
    if x != "":
        return lemmatize(x, lang="cs")
    return ""


def multi_lemmatizing(x):
    """
    Lemmatize a phrase token by token.

    The phrase is split on single spaces, every piece is passed through
    ``lemmatizing`` and the results are re-joined with single spaces.

    :param x: phrase made of space-separated tokens
    :return: the lemmatized phrase with leading/trailing whitespace stripped
    """
    lemmas = (lemmatizing(token) for token in x.split(" "))
    return " ".join(lemmas).strip()


def build_reverse_dictionary(dictionary, apply_lemmatizing=False):
    """
    Invert a ``key -> values`` mapping into a ``value -> key`` lookup.

    When the same value appears under several keys, the key visited last
    wins.

    :param dictionary: mapping from a key to an iterable of values
    :param apply_lemmatizing: if truthy, the result of ``lemmatizing(value)``
        is also registered for the same key whenever it differs from the value
    :return: dict mapping every value (and optionally its lemma) to its key
    """
    lookup = {}
    for label, entries in dictionary.items():
        for entry in entries:
            lookup[entry] = label
            if not apply_lemmatizing:
                continue
            lemma = lemmatizing(entry)
            if lemma != entry:
                lookup[lemma] = label
    return lookup


def split_gazetteers_for_single_token_match(gazetteers):
    """
    Break every gazetteer entry into its individual tokens.

    Entries are split on single spaces and only tokens longer than two
    characters are kept.

    :param gazetteers: mapping from a label to an iterable of phrases
    :return: new dict mapping each label to the set of retained tokens
    """
    result = {}
    for label, phrases in gazetteers.items():
        tokens = set()
        for phrase in phrases:
            tokens.update(piece for piece in phrase.split(" ") if len(piece) > 2)
        result[label] = tokens
    return result


def preprocess_gazetteers(gazetteers, config):
    """
    Clean and expand gazetteers according to the switches in ``config``.

    The input mapping is never modified: every entry collection is first
    copied into a set, so the values may be sets or lists (e.g. freshly
    loaded from JSON) and duplicates are dropped.

    Steps, applied in this order:

    1. ``config["remove_brackets"]``: strip bracketed spans from every entry
       and keep only results longer than two characters.
    2. ``config["split_person"]``: add the individual space-separated parts
       (longer than two characters) of the ``"per"`` entries to ``"per"``.
    3. ``config["techniq_for_matching"] == "single"``: replace every entry by
       its tokens via ``split_gazetteers_for_single_token_match``.
    4. ``config["lemmatize"]``: keep entries longer than two characters and
       add their lemma (``lemmatizing`` in "single" mode, otherwise
       ``multi_lemmatizing``).
    5. ``config["remove_numeric"]``: drop entries for which
       ``str.isnumeric()`` is true.

    :param gazetteers: mapping from a label to an iterable of string entries
    :param config: mapping providing the keys listed above
    :return: new dict mapping each label to a list of processed entries
        (list order is unspecified because sets are used internally)
    :raises KeyError: if a required config key is missing, or if
        ``split_person`` is enabled and there is no ``"per"`` gazetteer
    """
    # Private set copies: protects the caller's data and guarantees that the
    # set operations below work even when the values were given as lists.
    processed = {label: set(entries) for label, entries in gazetteers.items()}

    if config["remove_brackets"]:
        for label, entries in processed.items():
            cleaned = set()
            for entry in entries:
                text = remove_all_brackets(entry).strip()
                if len(text) > 2:
                    cleaned.add(text)
            processed[label] = cleaned

    if config["split_person"]:
        persons = processed["per"]
        # Build the additions completely before updating, so the set is not
        # resized while it is being iterated.
        name_parts = {
            part
            for name in persons
            for part in name.split(" ")
            if len(part) > 2
        }
        persons.update(name_parts)

    single_token_mode = config["techniq_for_matching"] == "single"
    if single_token_mode:
        processed = split_gazetteers_for_single_token_match(processed)

    if config["lemmatize"]:
        # Tokens are lemmatized one by one; multi-word entries word by word.
        lemmatizer = lemmatizing if single_token_mode else multi_lemmatizing
        for label, entries in processed.items():
            expanded = set()
            for entry in entries:
                if len(entry) > 2:
                    expanded.add(entry)
                    expanded.add(lemmatizer(entry))
            processed[label] = expanded

    if config["remove_numeric"]:
        for label, entries in processed.items():
            processed[label] = {entry for entry in entries if not entry.isnumeric()}

    return {label: list(entries) for label, entries in processed.items()}


def load_json_as_pickle(json_path, pickle_path):
    """
    Convert a JSON file into a pickle file.

    The JSON document is read (decoded explicitly as UTF-8, so non-ASCII
    text is handled the same on every platform), the resulting object is
    pickled to ``pickle_path`` and a confirmation message is printed.
    Nothing is returned.

    Parameters:
        json_path (str): Path to the JSON file.
        pickle_path (str): Path where the pickle file will be saved.
    """
    # Load data from the JSON file
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Save data to a pickle file
    with open(pickle_path, 'wb') as file:
        pickle.dump(data, file)

    print(f"Data from {json_path} has been loaded and saved as a pickle file at {pickle_path}.")