import os
import tempfile
import uuid

from huggingface_hub import InferenceClient

# Initialize the client. The free tier works for these specific models
# without a token when running locally; on Spaces, the environment's
# token is picked up automatically.
client = InferenceClient()
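
# NOTE (assumption): if you hit anonymous rate limits when running locally,
# you can pass a token explicitly; HF_TOKEN is a conventional env var name,
# not something this app defines:
#   client = InferenceClient(token=os.environ.get("HF_TOKEN"))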

# Define the models for each pipeline stage
TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-th-en"
IMAGE_MODEL = "stabilityai/stable-diffusion-xl-base-1.0"
AUDIO_MODEL = "facebook/mms-tts-eng"


def translate_text(text):
    """Translates Thai text to English."""
    try:
        if not text.strip():
            return ""
        # Call the translation endpoint
        result = client.translation(text, model=TRANSLATION_MODEL)
        # Depending on the client version, this is either a TranslationOutput
        # object or a raw [{'translation_text': '...'}] payload.
        if hasattr(result, "translation_text"):
            return result.translation_text
        return result[0]["translation_text"]
    except Exception as e:
        print(f"Translation Error: {e}")
        return f"Error translating: {text}"


def generate_image(prompt, style):
    """Generates an image from text."""
    try:
        # Enhance the prompt based on the selected style
        enhanced_prompt = prompt
        if style == "Cinematic":
            enhanced_prompt += ", cinematic lighting, highly detailed, photorealistic, 8k"
        elif style == "Anime":
            enhanced_prompt += ", anime style, japanese animation, vibrant colors"
        elif style == "3D Model":
            enhanced_prompt += ", 3d render, blender, unreal engine 5, isometric"
        elif style == "Oil Painting":
            enhanced_prompt += ", oil painting, textured, artistic, van gogh style"
        elif style == "Pixel Art":
            enhanced_prompt += ", pixel art, 16-bit, retro game style"
        image = client.text_to_image(
            enhanced_prompt,
            model=IMAGE_MODEL,
        )
        return image
    except Exception as e:
        print(f"Image Generation Error: {e}")
        return None
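
# Example (sketch): generate_image("a cat on a beach", "Anime") should return
# a PIL.Image.Image on success, or None if the API call fails.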


def generate_audio(text):
    """Generates audio from English text."""
    try:
        # Request raw audio bytes from the TTS endpoint
        audio_bytes = client.text_to_speech(
            text,
            model=AUDIO_MODEL,
        )
        # Save to a uniquely named temporary file
        temp_dir = tempfile.gettempdir()
        filename = f"{uuid.uuid4()}.flac"
        filepath = os.path.join(temp_dir, filename)
        with open(filepath, "wb") as f:
            f.write(audio_bytes)
        return filepath
    except Exception as e:
        print(f"Audio Generation Error: {e}")
        return None
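
# Example (sketch): generate_audio("Hello world") returns a path like
# <tempdir>/<uuid>.flac on success, or None on failure. The bytes are written
# as-is, so the .flac extension assumes the endpoint returns FLAC audio.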


def process_pipeline(thai_text, style):
    """Orchestrates the full flow: translate, then generate image and audio."""
    if not thai_text:
        return "Please enter text.", None, None
    print(f"Processing: {thai_text}")
    # Step 1: Translate Thai to English
    eng_text = translate_text(thai_text)
    # Steps 2 & 3: Generate the image and audio (these could ideally run in
    # parallel; they are kept sequential here for simplicity)
    image = generate_image(eng_text, style)
    audio_path = generate_audio(eng_text)
    return eng_text, image, audio_path
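

if __name__ == "__main__":
    # Minimal local smoke test (a sketch, not part of the Space itself):
    # assumes network access to the Inference API and, if you are rate-limited,
    # a token. The Thai prompt means "a cat sitting on the beach".
    eng, img, audio = process_pipeline("แมวนั่งอยู่บนชายหาด", "Cinematic")
    print("Translation:", eng)
    if img is not None:
        img.save("preview.png")  # text_to_image returns a PIL image
        print("Image saved to preview.png")
    print("Audio file:", audio)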