#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# txtファイルを関連する絵文字に変換する
# Convert txt files to related emojis
#
# This script converts text to related emojis, injecting context-relevant visual
# cues into datasets. It enables the enrichment of text data with corresponding
# emojis, enhancing its expressiveness and engagement.
#
# Before running this script, you need to download the required resources. Just open
# up `python` and type:
#
#     import nltk
#     nltk.download('punkt_tab')

import argparse
import functools
import gettext
import locale
import re
from pathlib import Path

import nltk
from nltk.tokenize import word_tokenize
from emoji import EMOJI_DATA

# Download required NLTK data (only needed once; quiet=True suppresses output)
nltk.download('punkt', quiet=True)


def setup_i18n():
    """Set up internationalization.

    Returns:
        callable: A gettext-style translation function for the current
        locale, or the identity translator (``gettext.gettext``) when no
        message catalogue can be loaded.
    """
    try:
        locale.setlocale(locale.LC_ALL, '')
        current_locale = locale.getlocale()[0]
        locale_path = Path(__file__).parent / 'locales'
        trans = gettext.translation('txt2emoji', locale_path,
                                    languages=[current_locale])
        trans.install()
        return trans.gettext
    except Exception:
        # Missing catalogue, unsupported locale, etc. — fall back to
        # untranslated strings rather than crashing at import time.
        # (Was a bare `except:`, which would also swallow KeyboardInterrupt.)
        return gettext.gettext


# Initialize translation
_ = setup_i18n()


@functools.lru_cache(maxsize=1)
def get_emoji_mapping():
    """Create a mapping of words to emojis.

    Returns:
        tuple: ``(emoji_map, emoji_variations)`` where ``emoji_map`` maps each
        lowercase word of an emoji's English description to the list of
        matching emoji characters, and ``emoji_variations`` maps a base emoji
        to the set of its variant forms.

    The result is cached: ``EMOJI_DATA`` is fixed at runtime and building the
    tables is comparatively expensive, yet the original code rebuilt them on
    every ``text_to_emojis`` call. Callers must treat the returned structures
    as read-only.
    """
    emoji_map = {}
    # Group emoji variations together under their base emoji.
    emoji_variations = {}
    for emoji_char, data in EMOJI_DATA.items():
        # Use the declared base emoji if present, else the emoji is its own base.
        base_emoji = data.get('base', emoji_char)
        if base_emoji not in emoji_variations:
            emoji_variations[base_emoji] = set()
        emoji_variations[base_emoji].add(emoji_char)
        if 'en' in data:  # emoji has an English description
            words = data['en'].lower().replace('_', ' ').split()
            for word in words:
                if word not in emoji_map:
                    emoji_map[word] = []
                emoji_map[word].append(emoji_char)
    return emoji_map, emoji_variations


def text_to_emojis(text):
    """Convert text to related emojis.

    Args:
        text (str): Arbitrary input text; it is lowercased and tokenized
            with NLTK's ``word_tokenize``.

    Returns:
        tuple: ``(emojis, explanations)`` — a space-separated string of the
        emojis found (empty string if none), and a list of human-readable
        strings explaining each token's outcome.
    """
    # Create emoji mapping and variations (cached — see get_emoji_mapping).
    emoji_map, emoji_variations = get_emoji_mapping()

    # Reverse lookup: emoji -> base emoji. Built once per call so that
    # is_emoji_usable is O(1) instead of scanning every variation group.
    base_of = {}
    for base, variations in emoji_variations.items():
        for variation in variations:
            base_of[variation] = base

    # Any token containing a digit is skipped entirely.
    number_pattern = re.compile(r'\d')

    # Emojis to exclude from output.
    # NOTE(review): 🔼 being excluded here makes the 'bigger'/'larger'
    # custom mappings below unreachable — confirm whether that is intended.
    excluded_emojis = {'🔶', '⭕', '🔷', '🔹', '🔸', '🔺', '🔻', '🔴', '🔵',
                       '🔼', '🔾', '🇵🇬', '🀄', '🔲', '✅'}

    # Words to exclude from emoji conversion.
    # NOTE(review): 'abdominal', 'armpit' and 'top' are listed here AND in
    # custom_mappings; exclusion is checked first, so those custom mappings
    # never fire — confirm which list should win.
    excluded_words = {
        '.', 'to', 'be', '(', ')', 'purple', 'abdominal', 'penetration',
        'feral', 'body', 'nude', 'anthro', 'big', 'small', 'the', 'a', 'an',
        'and', 'or', 'but', 'if', 'then', 'because', 'as', 'until', 'while',
        ',', 'hi', 'res', 'pussy', 'penetrated', 'equine', 'felid', 'feline',
        'equid', 'genital', 'genitals', 'penetrating', 'medial', 'ring',
        'inside', 'duo', 'solo', 'in', 'andromorph', 'from', 'behind',
        'position', 'pantherine', 'animal', 'brown', 'sub', 'dom', 'explicit',
        'black', 'bulge', 'dominant', 'kousen', 'rendan', 'genitalia', 'tan',
        'simple', 'media', 'vaginal', 'red', 'pecs', 'navel', 'background',
        'pubes', 'mammal', 'lore', 'gaping', 'balls', 'mustelid', 'white',
        'erection', 'blue', 'nipples', 'precum', 'bodily', 'text', 'english',
        'submissive', 'sperm', 'bottom', 'penile', 'humanoid', 'leash',
        'leashed', 'collar', 'tuft', 'backward', 'ovum', 'on', 'mane', 'long',
        'string', 'butt', 'tail', 'whiskers', 'yellow', 'kousenzephyr',
        'object', 'leaning', 'cell', 'blonde', 'anus', 'more', 'questionable',
        'signature', 'for', 'foreskin', 'facial', 'triceps', 'claws', 'toe',
        'view', 'three-quarter', 'sharp', 'spreading', '\'', 'pose', 'quads',
        'grey', 'green', 'top', 'primate', 'nonbinary', 'barazoku', 'jun',
        'enzo', 'scj', 'fully', 'elbow', 'focus', 'clitoris', 'hood', 'armpit',
        'jack-o', 'advised', 'contains', 'graphic', 'what', 'with', 'of',
        'looks', 'like', 'onto', 'by', 'patterns', 'lit', 'judging', 'another',
        'four', 'three', 'two', 'one', 'zero', 'five', 'six', 'seven', 'eight',
        'nine', 'ten', 'at', 'least', 'most', 'only',
    }

    # Track used emojis (an emoji — or any of its variations — is emitted at
    # most once per input text).
    used_emojis = set()

    # Additional manual mappings for common words; these take precedence over
    # the emoji-database mapping.
    custom_mappings = {
        'cushion': '🛏️', 'indoors': '🏠', 'outdoors': '🌳', 'dimly': '🌃',
        'she': '♀️', 'her': '♀️', 'he': '♂️', 'him': '♂️', 'his': '♂️',
        'viewer': '👀', 'safe': '🍼', 'mature': '🔞', 'abdominal': '🫃',
        'cellphone': '📱', 'breath': '😤', 'armpit': '🙋‍♂️', 'arm': '🙎‍♂️',
        'bottomwear': '👖', 'chest': '🧰', 'clock': '🕒', 'clothed': '👚',
        'clothing': '👚', 'electronics': '💻', 'flannel': '👕', 'denim': '👖',
        'drooling': '🤤', 'public': '📢', 'panting': '😛', 'shirt': '👕',
        'shorts': '🩳', 'pants': '👖', 'tenting': '🏕️', 'reaching': '🤏',
        'topwear': '👕', 'watch': '⌚', 'top': '🔝', 'tree': '🌳',
        'pirate': '🏴‍☠️', 'monkey': '🐒', 'fangs': '🦷', 'looking': '👀',
        'leg': '🦵', 'halloween': '🎃', 'gloves': '🧤', 'socks': '🧦',
        'holding': '🤝', 'absurd': '😶‍🌫️', 'chair': '🪑', 'begging': '🙏',
        'beard': '🧔', 'furniture': '🛋️', 'impregnation': '🥚', 'hair': '🪮',
        'oral': '👄', 'dialogue': '💬', 'bubble': '💬', 'sitting': '🪑',
        'lion': '🦁', 'otter': '🦦', 'markings': '🏷️', 'marking': '🏷️',
        'toes': '👣', 'teeth': '🦷', 'fingering': '👉', 'blush': '😊',
        'male': '♂️', 'tiger': '🐯', 'fluids': '💧', 'sweat': '💧',
        'saliva': '🤤', 'wolf': '🐺', 'dog': '🐶', 'female': '♀️',
        'intersex': '⚧️', 'muscular': '💪', 'wheelbarrow': '🚜', 'sex': '💑',
        'size': '📏', 'difference': '🔢', 'penis': '🔱', 'paws': '🐾',
        'pawpads': '🐾', 'hindpaw': '🐾', 'fur': '🧥', 'horse': '🐴',
        'ejaculation': '💦', 'cum': '💦', 'love': '❤️', 'smaller': '🔽',
        'bigger': '🔼', 'larger': '🔼', 'cats': '😺', 'cat': '😺',
        'dogs': '🐶', 'sun': '☀️', 'moon': '🌙', 'star': '⭐',
        'happy': '😊', 'sad': '😢', 'angry': '😠', 'food': '🍔',
        'heart': '❤️', 'fire': '🔥', 'hot': '🔥', 'cold': '❄️',
        'snow': '❄️', 'rain': '🌧️', 'smile': '😊', 'laugh': '😂',
        'cry': '😢',
    }

    # Tokenize the input text (lowercased so lookups are case-insensitive).
    tokens = word_tokenize(text.lower())

    # Store found emojis with their explanations.
    found_emojis = []
    explanations = []

    def is_emoji_usable(emoji):
        """Check if emoji or any of its variations have been used."""
        base_emoji = base_of.get(emoji)
        if base_emoji is not None:
            # Usable only if no variation of the same base has been emitted.
            return not any(variation in used_emojis
                           for variation in emoji_variations[base_emoji])
        return emoji not in used_emojis

    # Process each token.
    for token in tokens:
        # Skip excluded words and anything containing numbers.
        if token in excluded_words or number_pattern.search(token):
            continue

        # First check custom mappings; a custom hit consumes the token even
        # when its emoji is excluded or already used (no database fallback).
        if token in custom_mappings:
            emoji = custom_mappings[token]
            if emoji not in excluded_emojis and is_emoji_usable(emoji):
                found_emojis.append(emoji)
                used_emojis.add(emoji)
                explanations.append(f"'{token}' → {emoji} (custom mapping)")
            continue

        # Then check the emoji-database mapping: take the first candidate that
        # is neither excluded nor already used.
        if token in emoji_map:
            for emoji in emoji_map[token]:
                if emoji not in excluded_emojis and is_emoji_usable(emoji):
                    found_emojis.append(emoji)
                    used_emojis.add(emoji)
                    explanations.append(
                        f"'{token}' → {emoji} (from emoji database)")
                    break
        else:
            explanations.append(f"'{token}' → (no matching emoji found)")

    # Return emojis and explanations.
    return ' '.join(found_emojis) if found_emojis else '', explanations


def process_file(file_path):
    """Process a single text file and create corresponding emoji and explanation files.

    Args:
        file_path (Path): Path to a UTF-8 text file. Writes ``<stem>.emoji``
        (the emoji string) and ``<stem>.emoji.explain`` (one explanation per
        line) next to it. Errors are reported to stdout, not raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        emojis, explanations = text_to_emojis(text)

        # Create output filenames alongside the input file.
        emoji_file = file_path.with_suffix('.emoji')
        explanation_file = file_path.with_suffix('.emoji.explain')

        with open(emoji_file, 'w', encoding='utf-8') as f:
            f.write(emojis)
        with open(explanation_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(explanations))

        print(f"Processed: {file_path} → {emoji_file} and {explanation_file}")
    except Exception as e:
        # Best-effort batch processing: report and move on to the next file.
        print(f"Error processing {file_path}: {str(e)}")


def main():
    """Walk a directory tree and convert every eligible .txt file."""
    # Set up argument parser.
    parser = argparse.ArgumentParser(
        description='Convert text files to emoji representations')
    parser.add_argument('directory', nargs='?', default='.',
                        help='Directory to process (default: current directory)')
    args = parser.parse_args()

    # Convert to Path object and resolve to absolute path.
    base_dir = Path(args.directory).resolve()
    if not base_dir.exists():
        print(f"Error: Directory '{base_dir}' does not exist")
        return

    # Find all .txt files in directory and subdirectories, excluding
    # auxiliary files that are not captions.
    txt_files = [f for f in base_dir.rglob('*.txt')
                 if f.name not in ['sample-prompts.txt', 'wordfreq.txt']]
    if not txt_files:
        print(f"No .txt files found in {base_dir}")
        return

    print(f"Found {len(txt_files)} .txt files to process")

    # Process each file.
    for file_path in txt_files:
        process_file(file_path)


if __name__ == "__main__":
    main()