NerRoB-czech / data_manipulation /creation_gazetteers.py
AlzbetaStrompova
change json to pkl
2e3bc39
raw
history blame
4.6 kB
import os
import re
import json
import pickle
import itertools
import pandas as pd
from simplemma import lemmatize
def load_json(path):
    """
    Load gazetteers from a JSON file.

    The file is always decoded as UTF-8 so that non-ASCII entries are read
    the same way regardless of the platform's locale encoding.

    :param path: path to the gazetteer file
    :return: the parsed JSON content (a dict of gazetteers)
    """
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)
def save_json(data, path):
    """
    Save gazetteers to a JSON file.

    The file is written with an explicit UTF-8 encoding and a 4-space indent.
    Any existing file at ``path`` is overwritten.

    :param data: JSON-serializable data to store (a dict of gazetteers)
    :param path: path to the gazetteer file
    """
    with open(path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)
def merge_gazetteers(*gazetteers):
    """
    Combine any number of gazetteer dictionaries into one.

    Parameters:
    *gazetteers (dict): Gazetteer dictionaries mapping a key to a set of entries.

    Returns:
    dict: A new dictionary holding every key of the inputs; entries of keys
    that occur in several inputs are unioned.
    """
    merged = {}
    for gazetteer in gazetteers:
        for label, entries in gazetteer.items():
            if label not in merged:
                # First occurrence: store a copy so the caller's set is never mutated.
                merged[label] = entries.copy()
                continue
            # Key seen before: fold the new entries into the accumulated set.
            merged[label] |= entries
    return merged
####################################################################################################
### PREPROCESSING OF GAZETTEERS ###################################################################
####################################################################################################
# An opening (, { or [ followed by the shortest run of characters up to any
# closing ), } or ]; the closing kind does not have to match the opening kind.
_BRACKETED_SPAN = re.compile(r'[\(\{\[].*?[\)\}\]]')


def remove_all_brackets(text):
    """
    Strip every bracketed span (brackets included) from a string.

    Surrounding whitespace is left in place, and a span never crosses a
    newline because ``.`` does not match line breaks here.

    :param text: the string to clean
    :return: the string without bracketed spans
    """
    return _BRACKETED_SPAN.sub('', text)
def lemmatizing(x):
    """
    Lemmatize a single token with simplemma using the Czech ("cs") language data.

    :param x: the token to lemmatize
    :return: the lemma, or an empty string when ``x`` is an empty string
    """
    # The empty string is returned as-is without calling the lemmatizer.
    return lemmatize(x, lang="cs") if x != "" else ""
def multi_lemmatizing(x):
    """
    Lemmatize a phrase token by token.

    The phrase is split on single spaces, each piece is passed to
    ``lemmatizing`` and the results are joined back with single spaces.

    :param x: the phrase to lemmatize
    :return: the lemmatized phrase with leading/trailing whitespace stripped
    """
    lemmas = [lemmatizing(token) for token in x.split(" ")]
    return " ".join(lemmas).strip()
def build_reverse_dictionary(dictionary, apply_lemmatizing=False):
    """
    Invert a ``key -> values`` mapping into a ``value -> key`` lookup.

    When a value occurs under several keys, the key iterated last wins.

    :param dictionary: mapping from a key to an iterable of values
    :param apply_lemmatizing: if true, the result of ``lemmatizing(value)``
        is registered under the same key whenever it differs from the value
    :return: a dict mapping each value (and optionally its lemma) to its key
    """
    lookup = {}
    for label, entries in dictionary.items():
        for entry in entries:
            lookup[entry] = label
            if not apply_lemmatizing:
                continue
            lemma = lemmatizing(entry)
            if lemma != entry:
                lookup[lemma] = label
    return lookup
def split_gazetteers_for_single_token_match(gazetteers):
    """
    Break every gazetteer entry into its space-separated tokens.

    :param gazetteers: mapping from a key to an iterable of entry strings
    :return: a new dict mapping each key to the set of tokens longer than
        two characters found in its entries
    """
    tokens_by_label = {}
    for label, phrases in gazetteers.items():
        tokens = set()
        for phrase in phrases:
            # Tokens of one or two characters are dropped.
            tokens.update(token for token in phrase.split(" ") if len(token) > 2)
        tokens_by_label[label] = tokens
    return tokens_by_label
def preprocess_gazetteers(gazetteers, config):
    """
    Clean and expand gazetteers according to a configuration.

    The steps run in this order, each one only when enabled in ``config``:

    1. ``remove_brackets``: strip bracketed spans and surrounding whitespace
       from every entry and drop entries that end up with two characters or less.
    2. ``split_person``: add the space-separated parts (longer than two
       characters) of every ``"per"`` entry to the ``"per"`` gazetteer.
    3. ``techniq_for_matching == "single"``: replace every entry by its
       individual tokens; with ``lemmatize`` each token's lemma is added too.
       Otherwise, with ``lemmatize``, the token-wise lemmatized form of each
       entry longer than two characters is added next to the entry.
    4. ``remove_numeric``: drop purely numeric entries.

    The ``gazetteers`` dict passed in is modified in place, except that the
    "single" matching mode continues on (and returns) a newly built dict.

    :param gazetteers: mapping from a key to an iterable of entry strings
    :param config: mapping with the keys ``remove_brackets``, ``split_person``,
        ``techniq_for_matching``, ``lemmatize`` and ``remove_numeric``
    :return: dict mapping each key to a list of processed entries
    :raises KeyError: if a config key is missing, or ``split_person`` is
        enabled and ``gazetteers`` has no ``"per"`` key
    """
    if config["remove_brackets"]:
        for k, values in gazetteers.items():
            # Clean each entry once, then filter on the cleaned text.
            cleaned = (remove_all_brackets(vv).strip() for vv in values)
            gazetteers[k] = {vv for vv in cleaned if len(vv) > 2}
    if config["split_person"]:
        persons = gazetteers["per"]
        if not isinstance(persons, set):
            # Entries loaded from JSON are lists, which have no ``update``.
            persons = gazetteers["per"] = set(persons)
        # Collect the parts first so the set is not changed while iterating it.
        name_parts = {
            part for name in persons for part in name.split(" ") if len(part) > 2
        }
        persons.update(name_parts)
    if config["techniq_for_matching"] == "single":
        gazetteers = split_gazetteers_for_single_token_match(gazetteers)
        if config["lemmatize"]:
            for k, values in gazetteers.items():
                gazetteers[k] = set(itertools.chain.from_iterable(
                    (vv, lemmatizing(vv)) for vv in values if len(vv) > 2
                ))
    elif config["lemmatize"]:
        for k, values in gazetteers.items():
            gazetteers[k] = set(itertools.chain.from_iterable(
                (value, multi_lemmatizing(value)) for value in values if len(value) > 2
            ))
    if config["remove_numeric"]:
        for k, values in gazetteers.items():
            gazetteers[k] = {vv for vv in values if not vv.isnumeric()}
    # Lists rather than sets are returned (sets are not JSON-serializable).
    for k, values in gazetteers.items():
        gazetteers[k] = list(values)
    return gazetteers
def load_json_as_pickle(json_path, pickle_path):
    """
    Load data from a JSON file and save it as a pickle file.

    The JSON file is decoded as UTF-8 so that non-ASCII entries are read the
    same way regardless of the platform's locale encoding. A confirmation
    message is printed once the pickle file has been written.

    Parameters:
    json_path (str): Path to the JSON file.
    pickle_path (str): Path where the pickle file will be saved.
    """
    # Load data from the JSON file
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    # Save data to a pickle file
    with open(pickle_path, 'wb') as file:
        pickle.dump(data, file)
    print(f"Data from {json_path} has been loaded and saved as a pickle file at {pickle_path}.")