LyricsAnalyzerAgent / tools /image_generation_tools.py
tonko22's picture
hmmmm
d83c062
"""
Image generation tools for visualizing song analysis results.
"""
import os
from typing import Dict
from loguru import logger
from smolagents import Tool
from api_utils import make_api_call_with_retry
def caption_gen_tool(analysis_json: Dict, title: str, artist: str) -> str:
    """
    Generate a descriptive caption for image generation based on song analysis.

    Builds an instruction prompt from selected fields of the analysis and sends
    it to an LLM through ``make_api_call_with_retry``; the stripped model
    response is returned as the image prompt.

    Args:
        analysis_json: Dictionary containing the song analysis results. The keys
            read here are ``mood``, ``main_themes``, ``summary`` and
            ``conclusion``; each one falls back to a default when missing or
            empty. A non-dict value is treated as an empty analysis.
        title: Song title (required)
        artist: Song artist

    Returns:
        A descriptive caption suitable for image generation. May be an empty
        string if the model returned no text.
    """
    logger.info("Generating image caption from analysis results")
    logger.info("Using song: '{}' by '{}' for caption generation", title, artist)

    # The calling tool declares this input as type "any", so guard against a
    # value that has no ``.get`` instead of crashing with AttributeError.
    analysis = analysis_json if isinstance(analysis_json, dict) else {}

    mood = analysis.get("mood") or "emotional"

    # ``main_themes`` is expected to be a collection of strings. A bare string
    # must be wrapped, otherwise ``join`` would split it into single characters;
    # items are stringified so non-string entries cannot raise TypeError.
    raw_themes = analysis.get("main_themes") or ["music"]
    if not isinstance(raw_themes, (list, tuple, set, frozenset)):
        raw_themes = [raw_themes]
    themes = ", ".join(str(theme) for theme in raw_themes)

    summary = str(analysis.get("summary") or "")
    conclusion = str(analysis.get("conclusion") or "")

    # Create an API prompt to generate a high-quality image caption.
    # Summary and conclusion are truncated to 200 characters to keep it short.
    prompt = f"""Generate a detailed, vivid, and artistic image generation prompt based on the following song analysis.
This prompt will be used by an AI image generator to create a visual representation of the song's essence.
Song: {title} by {artist}
Mood: {mood}
Themes: {themes}
Summary: {summary[:200]}
Conclusion: {conclusion[:200]}
Your task is to create a single paragraph (approximately 100-150 words) that vividly describes a scene or abstract image
that captures the emotional essence and themes of this song. The description should be detailed, visual, and evocative.
DO NOT include any text, words, or lyrics in the image description. Focus on colors, composition, mood, symbols, and visuals only.
ONLY output the final image generation prompt with no additional text, explanations, or formatting.
"""

    # NOTE(review): the original comment says this is the same model used for
    # lyrics analysis; that cannot be confirmed from this file.
    model_to_use = "openrouter/google/gemini-2.0-flash-lite-preview-02-05:free"
    logger.info("Using {} for caption generation", model_to_use)

    # Call the API to generate a caption
    logger.info("Generating image caption for song: '{}' by '{}'", title, artist)
    response_text = make_api_call_with_retry(model_to_use, prompt)

    # Clean up the response; a missing response becomes an empty caption
    # rather than an AttributeError on ``None.strip()``.
    caption = "" if response_text is None else str(response_text).strip()
    if not caption:
        logger.warning("Caption generation returned an empty response")

    logger.debug("Generated image caption: {}...", caption[:100])
    return caption
class GenerateImageTool(Tool):
    """Tool for generating images based on song analysis.

    ``forward`` first asks ``caption_gen_tool`` for an image prompt and then
    passes that prompt to ``generate_with_gemini``. Both methods report
    failures as an HTML ``<p>`` error string instead of raising.
    """

    name = "generate_image"
    description = "Generates an image based on the song analysis results"
    inputs = {
        "analysis_json": {"type": "any", "description": "JSON dictionary containing the analysis results"},
        "title": {"type": "string", "description": "Title of the song"},
        "artist": {"type": "string", "description": "Artist of the song"}
    }
    output_type = "string"

    def generate_with_gemini(self, caption: str) -> str:
        """
        Generate image using Gemini API directly

        Args:
            caption: The prompt text for image generation

        Returns:
            HTML img tag with the image (embedded as a base64 data URI) or an
            HTML paragraph containing an error message
        """
        # Standard-library imports stay outside the ``try`` so they are always
        # bound when the exception handlers below run.
        import base64
        import html

        try:
            # Imported lazily so a missing SDK yields a readable error message
            # instead of breaking import of this module.
            from google import genai
            from google.genai import types

            # Get API key from environment variable
            api_key = os.environ.get("GEMINI_API_KEY")
            if not api_key:
                logger.error("GEMINI_API_KEY not found in environment variables")
                return "<p>Error: Gemini API key not found. Please set the GEMINI_API_KEY environment variable.</p>"

            logger.info("Initializing Gemini client")
            # Client-based configuration style of the google-genai SDK
            client = genai.Client(api_key=api_key)

            logger.info("Generating image with Gemini")
            response = client.models.generate_content(
                model="gemini-2.0-flash-exp-image-generation",
                contents=caption,
                config=types.GenerateContentConfig(
                    response_modalities=['Text', 'Image']
                )
            )

            # A response may carry no candidates, or a candidate without
            # content; treat both as "no image" instead of raising.
            candidates = getattr(response, "candidates", None) or []
            content = candidates[0].content if candidates else None
            parts = (getattr(content, "parts", None) or []) if content is not None else []

            # Process the response: text parts are only logged, the first
            # image part is returned.
            for part in parts:
                if part.text is not None:
                    logger.info(f"Gemini response text: {part.text[:100]}...")
                elif part.inline_data is not None:
                    # Extract the image payload
                    image_data = part.inline_data.data
                    # Raw bytes must be base64-encoded for a data URI
                    if isinstance(image_data, bytes):
                        image_b64 = base64.b64encode(image_data).decode('utf-8')
                    else:
                        # Otherwise the data is assumed to be base64 text already
                        image_b64 = image_data
                    # Use the MIME type reported with the image, falling back
                    # to PNG when it is absent.
                    mime_type = getattr(part.inline_data, "mime_type", None) or "image/png"
                    img_html = f'<img src="data:{mime_type};base64,{image_b64}" alt="Generated image based on song analysis" style="max-width:100%; border-radius:10px; box-shadow:0 4px 8px rgba(0,0,0,0.1);">'
                    return img_html

            return "<p>Error: No image generated by Gemini API.</p>"
        except ImportError:
            logger.error("Google GenAI package not installed")
            # ``from google import genai`` is provided by the google-genai package.
            return "<p>Error: Google GenAI package not installed. Install with 'pip install google-genai'</p>"
        except Exception as e:
            logger.error("Error generating image with Gemini: {}", str(e))
            # Escape the exception text so it cannot inject markup into the page.
            return f"<p>Error generating image with Gemini: {html.escape(str(e))}</p>"

    def forward(self, analysis_json: Dict, title: str, artist: str) -> str:
        """
        Generates an image based on the analysis results using Gemini API.

        Args:
            analysis_json: Dictionary containing the analysis results
            title: Song title
            artist: Song artist (required)

        Returns:
            HTML img tag with the image or error message
        """
        import html

        try:
            # Generate caption for the image
            caption = caption_gen_tool(analysis_json, title=title, artist=artist)
            logger.info("Caption generated successfully")

            # Gemini is the only image backend used here, so no fallback
            # warning is logged.
            return self.generate_with_gemini(caption)
        except Exception as e:
            logger.error("Error in image generation: {}", str(e))
            # Escape the exception text so it cannot inject markup into the page.
            return f"<p>Error in image generation: {html.escape(str(e))}</p>"