## built-in libraries
import json
import typing
import regex

## custom modules
from modules.common.file_ensurer import FileEnsurer

##-------------------start-of-GenderUtil---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

class GenderUtil:

    """

    Helpers for picking name-like words out of a text sample and pairing them with a gender / pronoun.

    Every method is static; the attributes below are class-level state shared by all of them.

    """

    ## parsed contents of the genders json file (see load_genders()); None until a method has loaded it
    genders:typing.Optional[dict] = None
    ## results of find_name_gender() keyed by the name passed in; load_genders() replaces it with an empty dict
    cache = {}

    ## when True, turns on the hard-coded "cote" special cases in discard_non_names() and find_name_gender()
    is_cote:bool = False

##-------------------start-of-find_english_words()---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

    @staticmethod
    def find_english_words(text:str) -> list[tuple[str, int]]:

        """

        Collects every unbroken run of Latin-script characters in the text.

        Parameters:
        text (str) : The text to scan.

        Returns:
        (list[tuple[str, int]]) : Each run paired with the index it starts at, in order of appearance.

        """

        words_with_positions = []

        for hit in regex.finditer(r'\p{Latin}+', text):
            words_with_positions.append((hit.group(), hit.start()))

        return words_with_positions
    
##-------------------start-of-is_potential_name()---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

    @staticmethod
    def is_potential_name(word:str) -> bool:
        
        """
        
        Assuming words are potential names and excluding full-width Latin characters, this function returns a boolean value indicating whether the word is a potential name.
        
        Parameters:
        word (str) : The word to be checked.

        Returns:
        (bool) : The result of the check.

        """

        return not any(0xFF00 <= ord(ch) <= 0xFFEF for ch in word)
    
##-------------------start-of-group_names()---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

    @staticmethod
    def group_names(text, names_with_positions: list[tuple[str, int]], max_distance: int = 10) -> list[str]:

        """

        Groups names together if they follow one another within a certain distance and are separated by spaces.

        Parameters:
        text (str) : The text to be searched.
        names_with_positions (list[tuple[str, int]]) : The names with their positions.
        max_distance (int) : The maximum distance between names.
        
        Returns:
        (list[str]) : The grouped names.

        """

        honorifics = [
            "chan",
            "dono",
            "kun",
            "kōhai",
            "paisen",
            "sama",
            "san",
            "senpai",
            "sensei",
            "shi",
            "ue"
        ]

        blacklist = [
            "contents",
        ]

        grouped_names = []
        i = 0
        skip_next = False
        length = len(names_with_positions)
        
        while i < length - 1:

            if(skip_next):
                skip_next = False

            else:
                current_name, current_pos = names_with_positions[i]
                next_name, next_pos = names_with_positions[i + 1]

                if(current_name in blacklist):
                    i += 1
                    continue
                
                ## Check if names are separated by spaces and are within the maximum distance.
                separator = text[current_pos + len(current_name):next_pos]

                if(GenderUtil.is_potential_name(next_name) and (separator.isspace()) and next_pos - current_pos <= max_distance):
                    grouped_names.append(current_name + " " + next_name)
                    skip_next = True
                else:
                    grouped_names.append(current_name)
            i += 1
        
        if(not skip_next and names_with_positions):
            grouped_names.append(names_with_positions[-1][0])

        ## merge honorifics with names
        for i, name in enumerate(grouped_names):
            if(i + 1 < len(grouped_names) and grouped_names[i + 1].lower() in honorifics):
                grouped_names[i] += "-" + grouped_names[i + 1]
                grouped_names.pop(i + 1)
        
        return grouped_names

##-------------------start-of-load_genders()---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

    @staticmethod
    def load_genders() -> dict:

        """
        
        Empties the gender cache and reads the genders json file located at FileEnsurer.config_translation_genders_path.

        Takes no parameters.

        Returns:
        (dict) : The parsed json.

        """

        ## cached lookups were computed from the previously loaded data, so they are dropped before reading
        GenderUtil.cache = {}

        genders_path = FileEnsurer.config_translation_genders_path

        with open(genders_path, 'r', encoding='utf-8') as genders_file:
            loaded_genders = json.load(genders_file)

        return loaded_genders
        
##-------------------start-of-discard_non_names()---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

    @staticmethod
    def discard_non_names(names: list[str]) -> list[str]:

        """
        
        Keeps only the names that share at least one space-separated part with a full name in the gender list.

        The gender list is reloaded through load_genders() on every call and any honorific is ignored for the comparison.
        When GenderUtil.is_cote is set, the entry "king" and single-character entries are removed as well.

        Parameters:
        names (list[str]) : The names to be filtered.

        Returns:
        new_names (list[str]) : The filtered names, in their original order.

        """

        GenderUtil.genders = GenderUtil.load_genders()

        ## every space-separated part of every listed full name, whatever gender it is filed under
        known_parts = set()

        for gender_names in GenderUtil.genders.values():
            for full_name in gender_names.keys():
                known_parts.update(full_name.split(' '))

        new_names = []

        for name in names:
            name_parts = GenderUtil.honorific_stripper(name).split(' ')

            if(any(part in known_parts for part in name_parts)):
                new_names.append(name)

        if(not GenderUtil.is_cote):
            return new_names

        ## known issues with cote
        return [name for name in new_names if name != "king" and len(name) > 1]
            
##-------------------start-of-honorific_stripper()---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

    @staticmethod
    def honorific_stripper(name:str) -> str:

        """

        Strips the honorific from the name.

        Parameters:
        name (str) : The name to be stripped.

        Returns:
        (str) : The stripped name.

        """

        if("-" in name):
            return name.split("-")[0]
        
        return name
    
##-------------------start-of-reverse_honorific_stripper()---------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    
    @staticmethod
    def reverse_honorific_stripper(name:str) -> str:

        """

        Removes the name from the honorific. (Gets the honorific)

        Parameters:
        name (str) : The name to be stripped.

        Returns:
        (str) : The stripped name.

        """

        if("-" in name):
            return name.split("-")[1]
        
        return ""

##-------------------start-of-discard_similar_names()---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

    @staticmethod
    def discard_similar_names(names: list[str]) -> list[str]:

        """
        
        Discards any names that are similar to each other.

        This totally didn't take me literally 2 hours because I'm a dipshit who overcomplicates things.

        Parameters:
        names (list[str]) : The names to be filtered.

        Returns:
        (list[str]) : The filtered names

        """

        seen = set()
        result = []

        # Sort names by length (shortest first)
        names.sort(key=len)

        for name in names:
            base_name = GenderUtil.honorific_stripper(name)
            if(not any(base_name in seen_name or seen_name in base_name for seen_name in seen)):
                result.append(name)
                seen.add(base_name)

        return result
        
##-------------------start-of-find_name_gender()---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

    @staticmethod
    def find_name_gender(name:str) -> list[str]:

        """

        Finds the gender associated to a name.

        Parameters:
        name (str) : The name to find

        Returns:
        result (list[str])

        """

        ## known names that are literally 95% this
        cote_predetermined: typing.Dict[typing.Tuple[str, str], str] = {
            ("Sakayanagi", "san"): "Female",
            ("Horikita", "san"): "Female",
            ("Horikita", ""): "Female",
            ("Sakayanagi", ""): "Female",
            ("Sakayanagi", "sama"): "Male",
            ("Sakayanagi", "sensei"): "Male",
            ("Kei", ""): "Female"
        }

        GenderUtil.genders = GenderUtil.load_genders()

        if(name in GenderUtil.cache):
            return GenderUtil.cache[name]
        
        honorific = GenderUtil.reverse_honorific_stripper(name)
        stripped_name = GenderUtil.honorific_stripper(name)

        ## check if the name is predetermined
        if((stripped_name, honorific) in cote_predetermined and GenderUtil.is_cote):
            result = [cote_predetermined[(stripped_name, honorific)]]
            GenderUtil.cache[name] = result
            return result

        ## this does an in operation
        ## so it could return too many (Kei for instance, will trigger Keisei and Kei)
        result = [gender for gender, names in GenderUtil.genders.items() for full_name in names if stripped_name in full_name]

        ## so we can go through it again and split the full name into first and last name, compare them to the stripped name and set the result
        if(len(result) > 1):
            for gender, names in GenderUtil.genders.items():
                for full_name in names:
                    first_name = full_name.split(" ")[0]
                    last_name = full_name.split(" ")[-1]

                    if((first_name == stripped_name or last_name == stripped_name) and gender in result):
                        ## need to readd, done because this can do for multiple genders
                        result.remove(gender)
                        result.append(gender)

        if(len(set(result)) > 1 or result in ["Undetermined", "Unknown"]):
            if(honorific == "kun"):
                result = ["Male"]
            elif(honorific == "chan"):
                result = ["Female"]
            
            else:
                result = ["Undetermined"]

        GenderUtil.cache[name] = result

        return result
    
##-------------------start-of-get_pronoun_assumption_for_system_prompt()---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

    @staticmethod
    def get_pronoun_assumption_for_system_prompt(sample:str) -> typing.List[str]:

        """

        Gets the pronoun assumptions for a text sample so it can be used in the system prompt.

        The sample is run through find_english_words(), is_potential_name(), group_names(), discard_non_names() and
        discard_similar_names(); each surviving name is then looked up with find_name_gender().
        A name gets "he" or "she" only when the lookup produced exactly one distinct label that maps to it; everything else gets "they".

        Parameters:
        sample (str) : The text to be analyzed.

        Returns:
        pronoun_assumptions (list[str]) : One "name : pronoun" line (newline terminated) per name.

        """

        gender_to_pronoun_map = {
            "Male": "he",
            "Female": "she",
            ## we used unknown in the json file, but we should use undetermined and no im not changing the json file
            "Undetermined": "they",
            "Unknown": "they"
        }

        names_with_positions = GenderUtil.find_english_words(sample)
        potential_names_with_positions = [(name, pos) for name, pos in names_with_positions if GenderUtil.is_potential_name(name)]
        grouped_names = GenderUtil.group_names(sample, potential_names_with_positions)
        actual_names = GenderUtil.discard_non_names(grouped_names)
        filtered_names = GenderUtil.discard_similar_names(actual_names)

        pronoun_assumptions = []

        for name in filtered_names:
            found_genders = GenderUtil.find_name_gender(name)

            ## find_name_gender() returns a list, so the label has to be taken from it before it can be compared or mapped
            if(found_genders and len(set(found_genders)) == 1):
                label = found_genders[0]
            else:
                label = "Undetermined"

            pronoun = gender_to_pronoun_map.get(label.strip(), "they")
            pronoun_assumptions.append("{} : {}\n".format(name.strip(), pronoun))

        return pronoun_assumptions
    
##----------------start-of-get_gender_assumption_for_system_prompt()---------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    
    @staticmethod
    def get_gender_assumption_for_system_prompt(sample:str) -> typing.List[str]:

        """

        Gets the gender assumptions for a text sample.

        The sample is run through find_english_words(), is_potential_name(), group_names(), discard_non_names() and
        discard_similar_names(); each surviving name is then looked up with find_name_gender().
        A name gets the label found for it only when the lookup produced exactly one distinct label; otherwise it is
        reported as "Undetermined". The json file's "Unknown" label is reported as "Undetermined" too.

        Parameters:
        sample (str) : The text to be analyzed.

        Returns:
        gender_assumptions (list[str]) : One "name : gender" line (newline terminated) per name.
        
        """

        names_with_positions = GenderUtil.find_english_words(sample)
        potential_names_with_positions = [(name, pos) for name, pos in names_with_positions if GenderUtil.is_potential_name(name)]
        grouped_names = GenderUtil.group_names(sample, potential_names_with_positions)
        actual_names = GenderUtil.discard_non_names(grouped_names)
        filtered_names = GenderUtil.discard_similar_names(actual_names)

        gender_assumptions = []

        for name in filtered_names:
            found_genders = GenderUtil.find_name_gender(name)

            ## find_name_gender() returns a list, so the label has to be taken from it before it can be compared or rewritten
            if(found_genders and len(set(found_genders)) == 1):
                label = found_genders[0]
            else:
                label = "Undetermined"

            ## the json file says "Unknown" where the prompt should say "Undetermined"
            label = label.strip().replace("Unknown", "Undetermined")
            gender_assumptions.append("{} : {}\n".format(name.strip(), label))

        return gender_assumptions