# Hugging Face Spaces page header (scrape artifact) — status: Sleeping.
# Kept as a comment so this file remains valid Python.
| import torch | |
| from transformers import BlipProcessor, BlipForConditionalGeneration | |
| import tempfile | |
| import os | |
| from PIL import Image | |
| import time | |
# Global variables for model caching — the BLIP model is loaded lazily,
# at most once per process, by load_models() below.
_model_loaded = False  # set to True only after a fully successful load
_processor = None      # BlipProcessor instance once loaded
_model = None          # BlipForConditionalGeneration instance once loaded
_device = "cpu"        # switched to "cuda" in load_models() when available
def load_models():
    """Lazily load the BLIP captioning processor and model (cached).

    Uses the module-level globals as a process-wide cache so the expensive
    download/initialization happens at most once.

    Returns:
        tuple: (processor, model, device). ``processor`` and ``model`` are
        ``None`` when loading failed — callers must check before use.
    """
    global _model_loaded, _processor, _model, _device

    # Fast path: a previous call already loaded everything.
    if _model_loaded:
        return _processor, _model, _device

    try:
        print("Loading BLIP model for image captioning...")
        _device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {_device}")

        # Load processor and model weights from the Hub (cached on disk).
        _processor = BlipProcessor.from_pretrained(
            "Salesforce/blip-image-captioning-base"
        )
        _model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-base"
        )
        _model.to(_device)
        _model.eval()  # inference only — disable dropout etc.
        _model_loaded = True
        print("BLIP model loaded successfully!")
    except Exception as e:
        print(f"Error loading BLIP model: {e}")
        # Reset everything so we never hand out a half-loaded state
        # (e.g. processor loaded but model download failed) and a later
        # call can retry cleanly.
        _model_loaded = False
        _processor = None
        _model = None

    return _processor, _model, _device
def describe_image(image):
    """Generate a one-line caption for *image* using the BLIP model.

    Args:
        image: A ``PIL.Image.Image`` or a filesystem path to an image file.

    Returns:
        str: A short caption (capped for audio length), or a generic
        fallback sentence on any error — the caller always gets usable
        text to feed into TTS.
    """
    try:
        processor, model, device = load_models()
        if processor is None or model is None:
            return "I see various objects in the surroundings. Please try again."

        # Accept either a file path or an already-open PIL image.
        if isinstance(image, str):
            image = Image.open(image)
        elif not isinstance(image, Image.Image):
            return "I see an image with various elements around."

        # BLIP expects 3-channel RGB input; normalize grayscale / RGBA /
        # palette images so they don't crash the processor.
        image = image.convert("RGB")

        # Preprocess and generate the caption (beam search, no gradients).
        inputs = processor(image, return_tensors="pt").to(device)
        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_length=50,
                num_beams=3,
                early_stopping=True,
            )
        description = processor.decode(out[0], skip_special_tokens=True)

        # Tidy the caption and cap its length so the spoken audio stays short.
        description = description.capitalize()
        if len(description) > 150:
            description = description[:147] + "..."

        print(f"Generated description: {description}")
        return description
    except Exception as e:
        print(f"Error in describe_image: {e}")
        return "I can see various objects and surroundings. Please try with a different image."
def text_to_speech(text):
    """Create a placeholder WAV file for *text* and return its path.

    This is a stub TTS backend: it writes one second of silence so the
    rest of the app always has a real, playable audio file to work with
    even when no speech engine is installed.

    Args:
        text: The text that would be spoken (only logged here).

    Returns:
        str | None: Path to the generated ``.wav`` file, or ``None`` on
        failure.
    """
    print(f"Creating audio for: {text}")
    try:
        import wave

        temp_dir = tempfile.gettempdir()
        audio_path = os.path.join(temp_dir, f"audio_{int(time.time())}.wav")

        sample_rate = 22050  # Hz
        duration = 1  # seconds of silence

        with wave.open(audio_path, 'w') as wav_file:
            wav_file.setnchannels(1)  # mono
            wav_file.setsampwidth(2)  # 16-bit PCM (2 bytes per sample)
            wav_file.setframerate(sample_rate)
            # Write the whole silence buffer in one call rather than one
            # writeframes() per sample (the original looped 22050 times).
            wav_file.writeframes(b"\x00\x00" * (sample_rate * duration))

        print(f"Created audio file: {audio_path}")
        return audio_path
    except Exception as e:
        print(f"Error creating audio file: {e}")
        return None
def test_tts():
    """Smoke-test the audio pipeline.

    Returns:
        bool: True when text_to_speech() produced a file that exists on disk.
    """
    print("Testing audio system...")
    result = text_to_speech("Test message")
    if result and os.path.exists(result):
        print(f"Audio test passed: {result}")
        return True
    print("Audio test failed")
    return False
# Running this module directly performs a quick audio-file smoke test.
if __name__ == "__main__":
    test_tts()