Spaces:

bettystr
/

NerRoB-czech

Sleeping

NerRoB-czech / data_manipulation /creation_gazetteers.py

AlzbetaStrompova

change json to pkl

2e3bc39 7 months ago

4.6 kB

	import os
	import re
	import json
	import pickle
	import itertools

	import pandas as pd
	from simplemma import lemmatize


	def load_json(path):
	    """
	    Load gazetteers from a JSON file.

	    The file is always decoded as UTF-8 instead of the platform's default
	    locale encoding, so non-ASCII (e.g. Czech) entries load the same way
	    on every system.

	    :param path: path to the gazetteer file
	    :return: the deserialized JSON content (expected to be a dict of gazetteers)
	    """
	    with open(path, 'r', encoding='utf-8') as file:
	        return json.load(file)


	def save_json(data, path):
	    """
	    Serialize data to a JSON file, indented by four spaces.

	    An existing file at the given path is overwritten.

	    :param data: JSON-serializable object to store (e.g. a dict of gazetteers)
	    :param path: path to the output file
	    """
	    with open(path, mode='w') as handle:
	        json.dump(data, fp=handle, indent=4)

	def merge_gazetteers(*gazetteers):
	    """
	    Combine any number of gazetteer dictionaries into one.

	    Values stored under the same key are unioned with ``|=``; the first
	    value seen for a key is copied, so the inputs are left untouched.

	    Returns:
	    dict: A new dictionary holding every key of the inputs, each mapped to
	    the union of the values found under that key.
	    """
	    combined = {}
	    for gazetteer in gazetteers:
	        for label, entries in gazetteer.items():
	            if label not in combined:
	                # First occurrence: store a copy so later unions never
	                # touch the caller's collection.
	                combined[label] = entries.copy()
	                continue
	            # Label seen before: fold the new entries into the stored copy.
	            combined[label] |= entries
	    return combined


	####################################################################################################
	### PREPROCESSING OF GAZETTEERS ###################################################################
	####################################################################################################

	def remove_all_brackets(text):
	    """
	    Delete every bracketed span from the text.

	    A span starts at any of ``(``, ``{``, ``[`` and ends at the nearest
	    following ``)``, ``}`` or ``]`` on the same line; the opening and
	    closing characters do not have to be of the same kind. Whitespace
	    around a removed span is kept.

	    :param text: input string
	    :return: the string with all such spans removed
	    """
	    bracketed_span = r'[\(\{\[].*?[\)\}\]]'
	    return re.sub(pattern=bracketed_span, repl='', string=text)


	def lemmatizing(x):
	    """
	    Return the Czech lemma of a single token.

	    The empty string is returned as-is without calling the lemmatizer;
	    every other input is passed to ``lemmatize`` with ``lang="cs"``.

	    :param x: token to lemmatize
	    :return: the lemma, or "" for empty input
	    """
	    return "" if x == "" else lemmatize(x, lang="cs")


	def multi_lemmatizing(x):
	    """
	    Lemmatize a phrase token by token.

	    The phrase is split on single spaces, each piece goes through
	    ``lemmatizing`` and the results are joined with single spaces again.
	    Leading and trailing whitespace of the result is stripped.

	    :param x: phrase of space-separated tokens
	    :return: the phrase with every token replaced by its lemma
	    """
	    lemmas = (lemmatizing(token) for token in x.split(" "))
	    return " ".join(lemmas).strip()


	def build_reverse_dictionary(dictionary, apply_lemmatizing=False):
	    """
	    Invert a key -> values mapping into a value -> key lookup.

	    When a value occurs under several keys, the key iterated last wins.

	    :param dictionary: mapping from a key to an iterable of values
	    :param apply_lemmatizing: if true, the lemma of each value (from
	        ``lemmatizing``) is registered as well whenever it differs from
	        the value itself
	    :return: dict mapping each value (and optionally its lemma) to its key
	    """
	    lookup = {}
	    for label, entries in dictionary.items():
	        for entry in entries:
	            lookup[entry] = label
	            if not apply_lemmatizing:
	                continue
	            lemma = lemmatizing(entry)
	            if lemma != entry:
	                lookup[lemma] = label
	    return lookup


	def split_gazetteers_for_single_token_match(gazetteers):
	    """
	    Break every gazetteer entry into its space-separated tokens.

	    Tokens of two characters or fewer are discarded.

	    :param gazetteers: mapping from a label to an iterable of entries
	    :return: new dict mapping each label to the set of remaining tokens
	    """
	    return {
	        label: {
	            token
	            for entry in entries
	            for token in entry.split(" ")
	            if len(token) > 2
	        }
	        for label, entries in gazetteers.items()
	    }


	def preprocess_gazetteers(gazetteers, config):
	if config["remove_brackets"]:
	for k, values in gazetteers.items():
	gazetteers[k] = {remove_all_brackets(vv).strip() for vv in values if len(remove_all_brackets(vv).strip()) > 2}
	if config["split_person"]:
	gazetteers["per"].update(set([x for x in list(itertools.chain(*[v.split(" ") for v in gazetteers["per"]])) if len(x) > 2]))
	if config["techniq_for_matching"] == "single":
	gazetteers = split_gazetteers_for_single_token_match(gazetteers)
	if config["lemmatize"]:
	for k, values in gazetteers.items():
	gazetteers[k] = set(list(itertools.chain(*[(vv, lemmatizing(vv)) for vv in values if len(vv) > 2])))
	elif config["lemmatize"]:
	for k, values in gazetteers.items():
	gazetteers[k] = set(list(itertools.chain(*[(value, multi_lemmatizing(value)) for value in values if len(value) > 2])))

	if config["remove_numeric"]:
	for k, values in gazetteers.items():
	gazetteers[k] = {vv for vv in values if not vv.isnumeric()}
	for k, values in gazetteers.items():
	gazetteers[k] = list(values)
	return gazetteers


	def load_json_as_pickle(json_path, pickle_path):
	    """
	    Load data from a JSON file and save it as a pickle file.

	    The JSON file is decoded as UTF-8 regardless of the platform's default
	    locale encoding, so non-ASCII entries are read correctly everywhere.
	    A confirmation message is printed once the pickle has been written.

	    Parameters:
	    json_path (str): Path to the JSON file.
	    pickle_path (str): Path where the pickle file will be saved.
	    """
	    # Load data from the JSON file
	    with open(json_path, 'r', encoding='utf-8') as file:
	        data = json.load(file)

	    # Save data to a pickle file
	    with open(pickle_path, 'wb') as file:
	        pickle.dump(data, file)

	    print(f"Data from {json_path} has been loaded and saved as a pickle file at {pickle_path}.")