#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# txtファイルを関連する絵文字に変換する
# Convert txt files to related emojis
#
# This script converts text to related emojis, injecting context-relevant visual
# cues into datasets. It enables the enrichment of text data with corresponding
# emojis, enhancing its expressiveness and engagement.
#
# Before running this script, you need to download the required resources. Just open
# up `python` and type:
#
#     import nltk
#     nltk.download('punkt_tab')

import argparse
import functools
import gettext
import locale
import re
from pathlib import Path

import nltk
from nltk.tokenize import word_tokenize
from emoji import EMOJI_DATA

# Download required NLTK data (only needed once; quiet=True suppresses output)
nltk.download('punkt', quiet=True)


def setup_i18n():
    """Set up internationalization.

    Returns:
        callable: A gettext-style translation function for the current
        locale, or the identity translator (``gettext.gettext``) when no
        message catalogue can be loaded.
    """
    try:
        locale.setlocale(locale.LC_ALL, '')
        current_locale = locale.getlocale()[0]
        locale_path = Path(__file__).parent / 'locales'
        trans = gettext.translation('txt2emoji', locale_path,
                                    languages=[current_locale])
        trans.install()
        return trans.gettext
    except Exception:
        # Missing catalogue, unsupported locale, etc. — fall back to
        # untranslated strings rather than crashing at import time.
        # (Was a bare `except:`, which would also swallow KeyboardInterrupt.)
        return gettext.gettext


# Initialize translation
_ = setup_i18n()


@functools.lru_cache(maxsize=1)
def get_emoji_mapping():
    """Create a mapping of words to emojis.

    Returns:
        tuple: ``(emoji_map, emoji_variations)`` where ``emoji_map`` maps each
        lowercase word of an emoji's English description to the list of
        matching emoji characters, and ``emoji_variations`` maps a base emoji
        to the set of its variant forms.

    The result is cached: ``EMOJI_DATA`` is fixed at runtime and building the
    tables is comparatively expensive, yet the original code rebuilt them on
    every ``text_to_emojis`` call. Callers must treat the returned structures
    as read-only.
    """
    emoji_map = {}
    # Group emoji variations together under their base emoji.
    emoji_variations = {}
    for emoji_char, data in EMOJI_DATA.items():
        # Use the declared base emoji if present, else the emoji is its own base.
        base_emoji = data.get('base', emoji_char)
        if base_emoji not in emoji_variations:
            emoji_variations[base_emoji] = set()
        emoji_variations[base_emoji].add(emoji_char)
        if 'en' in data:  # emoji has an English description
            words = data['en'].lower().replace('_', ' ').split()
            for word in words:
                if word not in emoji_map:
                    emoji_map[word] = []
                emoji_map[word].append(emoji_char)
    return emoji_map, emoji_variations


def text_to_emojis(text):
    """Convert text to related emojis.

    Args:
        text (str): Arbitrary input text; it is lowercased and tokenized
            with NLTK's ``word_tokenize``.

    Returns:
        tuple: ``(emojis, explanations)`` — a space-separated string of the
        emojis found (empty string if none), and a list of human-readable
        strings explaining each token's outcome.
    """
    # Create emoji mapping and variations (cached — see get_emoji_mapping).
    emoji_map, emoji_variations = get_emoji_mapping()

    # Reverse lookup: emoji -> base emoji. Built once per call so that
    # is_emoji_usable is O(1) instead of scanning every variation group.
    base_of = {}
    for base, variations in emoji_variations.items():
        for variation in variations:
            base_of[variation] = base

    # Any token containing a digit is skipped entirely.
    number_pattern = re.compile(r'\d')

    # Emojis to exclude from output.
    # NOTE(review): 🔼 being excluded here makes the 'bigger'/'larger'
    # custom mappings below unreachable — confirm whether that is intended.
    excluded_emojis = {'🔶', '⭕', '🔷', '🔹', '🔸', '🔺', '🔻', '🔴', '🔵',
                       '🔼', '🔾', '🇵🇬', '🀄', '🔲', '✅'}

    # Words to exclude from emoji conversion.
    # NOTE(review): 'abdominal', 'armpit' and 'top' are listed here AND in
    # custom_mappings; exclusion is checked first, so those custom mappings
    # never fire — confirm which list should win.
    excluded_words = {
        '.', 'to', 'be', '(', ')', 'purple', 'abdominal', 'penetration',
        'feral', 'body', 'nude', 'anthro', 'big', 'small', 'the', 'a', 'an',
        'and', 'or', 'but', 'if', 'then', 'because', 'as', 'until', 'while',
        ',', 'hi', 'res', 'pussy', 'penetrated', 'equine', 'felid', 'feline',
        'equid', 'genital', 'genitals', 'penetrating', 'medial', 'ring',
        'inside', 'duo', 'solo', 'in', 'andromorph', 'from', 'behind',
        'position', 'pantherine', 'animal', 'brown', 'sub', 'dom', 'explicit',
        'black', 'bulge', 'dominant', 'kousen', 'rendan', 'genitalia', 'tan',
        'simple', 'media', 'vaginal', 'red', 'pecs', 'navel', 'background',
        'pubes', 'mammal', 'lore', 'gaping', 'balls', 'mustelid', 'white',
        'erection', 'blue', 'nipples', 'precum', 'bodily', 'text', 'english',
        'submissive', 'sperm', 'bottom', 'penile', 'humanoid', 'leash',
        'leashed', 'collar', 'tuft', 'backward', 'ovum', 'on', 'mane', 'long',
        'string', 'butt', 'tail', 'whiskers', 'yellow', 'kousenzephyr',
        'object', 'leaning', 'cell', 'blonde', 'anus', 'more', 'questionable',
        'signature', 'for', 'foreskin', 'facial', 'triceps', 'claws', 'toe',
        'view', 'three-quarter', 'sharp', 'spreading', '\'', 'pose', 'quads',
        'grey', 'green', 'top', 'primate', 'nonbinary', 'barazoku', 'jun',
        'enzo', 'scj', 'fully', 'elbow', 'focus', 'clitoris', 'hood', 'armpit',
        'jack-o', 'advised', 'contains', 'graphic', 'what', 'with', 'of',
        'looks', 'like', 'onto', 'by', 'patterns', 'lit', 'judging', 'another',
        'four', 'three', 'two', 'one', 'zero', 'five', 'six', 'seven', 'eight',
        'nine', 'ten', 'at', 'least', 'most', 'only',
    }

    # Track used emojis (an emoji — or any of its variations — is emitted at
    # most once per input text).
    used_emojis = set()

    # Additional manual mappings for common words; these take precedence over
    # the emoji-database mapping.
    custom_mappings = {
        'cushion': '🛏️', 'indoors': '🏠', 'outdoors': '🌳', 'dimly': '🌃',
        'she': '♀️', 'her': '♀️', 'he': '♂️', 'him': '♂️', 'his': '♂️',
        'viewer': '👀', 'safe': '🍼', 'mature': '🔞', 'abdominal': '🫃',
        'cellphone': '📱', 'breath': '😤', 'armpit': '🙋‍♂️', 'arm': '🙎‍♂️',
        'bottomwear': '👖', 'chest': '🧰', 'clock': '🕒', 'clothed': '👚',
        'clothing': '👚', 'electronics': '💻', 'flannel': '👕', 'denim': '👖',
        'drooling': '🤤', 'public': '📢', 'panting': '😛', 'shirt': '👕',
        'shorts': '🩳', 'pants': '👖', 'tenting': '🏕️', 'reaching': '🤏',
        'topwear': '👕', 'watch': '⌚', 'top': '🔝', 'tree': '🌳',
        'pirate': '🏴‍☠️', 'monkey': '🐒', 'fangs': '🦷', 'looking': '👀',
        'leg': '🦵', 'halloween': '🎃', 'gloves': '🧤', 'socks': '🧦',
        'holding': '🤝', 'absurd': '😶‍🌫️', 'chair': '🪑', 'begging': '🙏',
        'beard': '🧔', 'furniture': '🛋️', 'impregnation': '🥚', 'hair': '🪮',
        'oral': '👄', 'dialogue': '💬', 'bubble': '💬', 'sitting': '🪑',
        'lion': '🦁', 'otter': '🦦', 'markings': '🏷️', 'marking': '🏷️',
        'toes': '👣', 'teeth': '🦷', 'fingering': '👉', 'blush': '😊',
        'male': '♂️', 'tiger': '🐯', 'fluids': '💧', 'sweat': '💧',
        'saliva': '🤤', 'wolf': '🐺', 'dog': '🐶', 'female': '♀️',
        'intersex': '⚧️', 'muscular': '💪', 'wheelbarrow': '🚜', 'sex': '💑',
        'size': '📏', 'difference': '🔢', 'penis': '🔱', 'paws': '🐾',
        'pawpads': '🐾', 'hindpaw': '🐾', 'fur': '🧥', 'horse': '🐴',
        'ejaculation': '💦', 'cum': '💦', 'love': '❤️', 'smaller': '🔽',
        'bigger': '🔼', 'larger': '🔼', 'cats': '😺', 'cat': '😺',
        'dogs': '🐶', 'sun': '☀️', 'moon': '🌙', 'star': '⭐',
        'happy': '😊', 'sad': '😢', 'angry': '😠', 'food': '🍔',
        'heart': '❤️', 'fire': '🔥', 'hot': '🔥', 'cold': '❄️',
        'snow': '❄️', 'rain': '🌧️', 'smile': '😊', 'laugh': '😂',
        'cry': '😢',
    }

    # Tokenize the input text (lowercased so lookups are case-insensitive).
    tokens = word_tokenize(text.lower())

    # Store found emojis with their explanations.
    found_emojis = []
    explanations = []

    def is_emoji_usable(emoji):
        """Check if emoji or any of its variations have been used."""
        base_emoji = base_of.get(emoji)
        if base_emoji is not None:
            # Usable only if no variation of the same base has been emitted.
            return not any(variation in used_emojis
                           for variation in emoji_variations[base_emoji])
        return emoji not in used_emojis

    # Process each token.
    for token in tokens:
        # Skip excluded words and anything containing numbers.
        if token in excluded_words or number_pattern.search(token):
            continue

        # First check custom mappings; a custom hit consumes the token even
        # when its emoji is excluded or already used (no database fallback).
        if token in custom_mappings:
            emoji = custom_mappings[token]
            if emoji not in excluded_emojis and is_emoji_usable(emoji):
                found_emojis.append(emoji)
                used_emojis.add(emoji)
                explanations.append(f"'{token}' → {emoji} (custom mapping)")
            continue

        # Then check the emoji-database mapping: take the first candidate that
        # is neither excluded nor already used.
        if token in emoji_map:
            for emoji in emoji_map[token]:
                if emoji not in excluded_emojis and is_emoji_usable(emoji):
                    found_emojis.append(emoji)
                    used_emojis.add(emoji)
                    explanations.append(
                        f"'{token}' → {emoji} (from emoji database)")
                    break
        else:
            explanations.append(f"'{token}' → (no matching emoji found)")

    # Return emojis and explanations.
    return ' '.join(found_emojis) if found_emojis else '', explanations


def process_file(file_path):
    """Process a single text file and create corresponding emoji and explanation files.

    Args:
        file_path (Path): Path to a UTF-8 text file. Writes ``<stem>.emoji``
        (the emoji string) and ``<stem>.emoji.explain`` (one explanation per
        line) next to it. Errors are reported to stdout, not raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        emojis, explanations = text_to_emojis(text)

        # Create output filenames alongside the input file.
        emoji_file = file_path.with_suffix('.emoji')
        explanation_file = file_path.with_suffix('.emoji.explain')

        with open(emoji_file, 'w', encoding='utf-8') as f:
            f.write(emojis)
        with open(explanation_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(explanations))

        print(f"Processed: {file_path} → {emoji_file} and {explanation_file}")
    except Exception as e:
        # Best-effort batch processing: report and move on to the next file.
        print(f"Error processing {file_path}: {str(e)}")


def main():
    """Walk a directory tree and convert every eligible .txt file."""
    # Set up argument parser.
    parser = argparse.ArgumentParser(
        description='Convert text files to emoji representations')
    parser.add_argument('directory', nargs='?', default='.',
                        help='Directory to process (default: current directory)')
    args = parser.parse_args()

    # Convert to Path object and resolve to absolute path.
    base_dir = Path(args.directory).resolve()
    if not base_dir.exists():
        print(f"Error: Directory '{base_dir}' does not exist")
        return

    # Find all .txt files in directory and subdirectories, excluding
    # auxiliary files that are not captions.
    txt_files = [f for f in base_dir.rglob('*.txt')
                 if f.name not in ['sample-prompts.txt', 'wordfreq.txt']]
    if not txt_files:
        print(f"No .txt files found in {base_dir}")
        return

    print(f"Found {len(txt_files)} .txt files to process")

    # Process each file.
    for file_path in txt_files:
        process_file(file_path)


if __name__ == "__main__":
    main()