File size: 7,140 Bytes
26dfe2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d83c062
 
 
 
26dfe2c
 
 
 
 
 
 
 
d83c062
 
26dfe2c
 
 
 
 
 
 
 
 
 
 
 
 
 
d83c062
 
 
 
 
 
 
 
 
 
 
26dfe2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
"""
Image generation tools for visualizing song analysis results.
"""

import os
from typing import Dict
from loguru import logger
from smolagents import Tool

from api_utils import make_api_call_with_retry


def caption_gen_tool(analysis_json: Dict, title: str, artist: str) -> str:
    """
    Generate a descriptive caption for image generation based on song analysis.
    Uses LLM to create a high-quality image prompt based on the analysis.

    The keys read from ``analysis_json`` are ``mood``, ``main_themes``,
    ``summary`` and ``conclusion``. Missing or empty values fall back to
    ``"emotional"``, ``["music"]``, ``""`` and ``""`` respectively.

    Args:
        analysis_json: Dictionary containing the song analysis results.
            ``None`` is treated as an empty analysis.
        title: Song title
        artist: Song artist

    Returns:
        A descriptive caption suitable for image generation (the model
        response with surrounding whitespace stripped)
    """
    logger.info("Generating image caption from analysis results")
    logger.info("Using song: '{}' by '{}' for caption generation", title, artist)

    # Tolerate a missing analysis so the defaults below still apply.
    analysis = analysis_json or {}

    mood = analysis.get("mood") or "emotional"

    raw_themes = analysis.get("main_themes") or ["music"]
    if isinstance(raw_themes, str):
        # Joining a plain string would split it into single characters.
        themes = raw_themes
    else:
        # str() guards against non-string entries, which str.join rejects.
        themes = ", ".join(str(theme) for theme in raw_themes)

    # Coerce to str so the 200-character truncation below is always a slice
    # of text, whatever type the analysis step produced.
    summary = str(analysis.get("summary") or "")
    conclusion = str(analysis.get("conclusion") or "")

    # Create an API prompt to generate a high-quality image caption
    prompt = f"""Generate a detailed, vivid, and artistic image generation prompt based on the following song analysis. 
    This prompt will be used by an AI image generator to create a visual representation of the song's essence.
    
    Song: {title} by {artist}
    Mood: {mood}
    Themes: {themes}
    Summary: {summary[:200]}
    Conclusion: {conclusion[:200]}
    
    Your task is to create a single paragraph (approximately 100-150 words) that vividly describes a scene or abstract image 
    that captures the emotional essence and themes of this song. The description should be detailed, visual, and evocative. 
    DO NOT include any text, words, or lyrics in the image description. Focus on colors, composition, mood, symbols, and visuals only.
    
    ONLY output the final image generation prompt with no additional text, explanations, or formatting.
    """

    # Use the same model as in lyrics analysis
    model_to_use = "openrouter/google/gemini-2.0-flash-lite-preview-02-05:free"
    logger.info("Using {} for caption generation", model_to_use)

    # Call the API to generate a caption
    logger.info("Generating image caption for song: '{}' by '{}'", title, artist)
    response_text = make_api_call_with_retry(model_to_use, prompt)

    # Clean up the response if needed
    caption = response_text.strip()
    logger.debug("Generated image caption: {}...", caption[:100])

    return caption


class GenerateImageTool(Tool):
    """Tool for generating images based on song analysis.

    ``forward`` builds an image prompt with ``caption_gen_tool`` and passes it
    to ``generate_with_gemini``. Both methods return an HTML string: an
    ``<img>`` tag carrying the image as a base64 data URI on success, or a
    ``<p>`` element with an error message on failure.
    """

    name = "generate_image"
    description = "Generates an image based on the song analysis results"
    inputs = {
        "analysis_json": {"type": "any", "description": "JSON dictionary containing the analysis results"},
        "title": {"type": "string", "description": "Title of the song"},
        "artist": {"type": "string", "description": "Artist of the song"}
    }
    output_type = "string"

    def generate_with_gemini(self, caption: str) -> str:
        """
        Generate image using Gemini API directly

        Reads the API key from the ``GEMINI_API_KEY`` environment variable.
        Errors are logged and reported in the returned HTML rather than raised.

        Args:
            caption: The prompt text for image generation

        Returns:
            HTML img tag with the image or error message
        """
        import html  # stdlib; needed by the except handlers below

        try:
            # Imported lazily so that a missing SDK is reported as an HTML
            # error by the ImportError handler below.
            import base64
            from google import genai
            from google.genai import types

            # Get API key from environment variable
            api_key = os.environ.get("GEMINI_API_KEY")
            if not api_key:
                logger.error("GEMINI_API_KEY not found in environment variables")
                return "<p>Error: Gemini API key not found. Please set the GEMINI_API_KEY environment variable.</p>"

            logger.info("Initializing Gemini client")
            client = genai.Client(api_key=api_key)

            logger.info("Generating image with Gemini")
            response = client.models.generate_content(
                model="gemini-2.0-flash-exp-image-generation",
                contents=caption,
                config=types.GenerateContentConfig(
                    response_modalities=['Text', 'Image']
                )
            )

            # A response without candidates or content falls through to the
            # "No image generated" message instead of raising on indexing.
            candidates = response.candidates or []
            content = candidates[0].content if candidates else None
            parts = (content.parts if content is not None else None) or []

            # Process the response
            for part in parts:
                if part.text is not None:
                    logger.info("Gemini response text: {}...", part.text[:100])
                elif part.inline_data is not None:
                    # Extract the image payload
                    image_data = part.inline_data.data
                    if not image_data:
                        # Nothing to embed in this part; keep looking.
                        continue

                    # Convert raw bytes to base64 for the HTML data URI
                    if isinstance(image_data, bytes):
                        image_b64 = base64.b64encode(image_data).decode('utf-8')
                    else:
                        # Data is assumed to be base64-encoded text already
                        image_b64 = image_data

                    # Use the MIME type reported by the API when available;
                    # PNG is the fallback.
                    mime_type = html.escape(
                        getattr(part.inline_data, "mime_type", None) or "image/png",
                        quote=True,
                    )

                    return (
                        f'<img src="data:{mime_type};base64,{image_b64}" '
                        'alt="Generated image based on song analysis" '
                        'style="max-width:100%; border-radius:10px; '
                        'box-shadow:0 4px 8px rgba(0,0,0,0.1);">'
                    )

            return "<p>Error: No image generated by Gemini API.</p>"

        except ImportError:
            logger.error("Google GenAI package not installed")
            # `from google import genai` is provided by the google-genai package.
            return "<p>Error: Google GenAI package not installed. Install with 'pip install google-genai'</p>"
        except Exception as e:
            logger.error("Error generating image with Gemini: {}", e)
            # Escape the message: it is embedded in HTML and may contain markup.
            return f"<p>Error generating image with Gemini: {html.escape(str(e))}</p>"

    def forward(self, analysis_json: Dict, title: str, artist: str) -> str:
        """
        Generates an image based on the analysis results using Gemini API.

        Args:
            analysis_json: Dictionary containing the analysis results
            title: Song title
            artist: Song artist

        Returns:
            HTML img tag with the image or error message
        """
        import html  # stdlib; used to escape error text placed in HTML

        try:
            # Generate caption for the image
            caption = caption_gen_tool(analysis_json, title=title, artist=artist)
            logger.info("Caption generated successfully")

            # Gemini is the only image backend used here, so this is a plain
            # progress message rather than a fallback warning.
            logger.info("Generating image with Gemini API")
            return self.generate_with_gemini(caption)

        except Exception as e:
            logger.error("Error in image generation: {}", e)
            return f"<p>Error in image generation: {html.escape(str(e))}</p>"