import os
import tempfile
import time
import wave

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
# Global variables for model caching
_model_loaded = False
_processor = None
_model = None
_device = "cpu"
def load_models():
"""Load models only when needed"""
global _model_loaded, _processor, _model, _device
if _model_loaded:
return _processor, _model, _device
try:
print("πŸ“₯ Loading BLIP model for image captioning...")
_device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"πŸš€ Using device: {_device}")
# Load processor and model
_processor = BlipProcessor.from_pretrained(
"Salesforce/blip-image-captioning-base"
)
_model = BlipForConditionalGeneration.from_pretrained(
"Salesforce/blip-image-captioning-base"
)
_model.to(_device)
_model.eval()
_model_loaded = True
print("βœ… BLIP model loaded successfully!")
except Exception as e:
print(f"❌ Error loading BLIP model: {e}")
_model_loaded = False
return _processor, _model, _device
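
def warmup():
    """Optional eager load at startup, a minimal sketch (nothing in this
    module calls it by default). Running it once when the server boots
    avoids paying the model download/load cost on the first user request;
    later calls to load_models() stay cheap thanks to the cache above."""
    load_models()
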
def describe_image(image):
"""
Generate description for the image using BLIP model
Returns a one-line description
"""
try:
processor, model, device = load_models()
if processor is None or model is None:
return "I see various objects in the surroundings. Please try again."
        # Convert to a PIL RGB image if needed (BLIP expects 3-channel RGB,
        # so grayscale or RGBA inputs are normalized here)
        if isinstance(image, str):
            # It's a filepath
            image = Image.open(image).convert("RGB")
        elif isinstance(image, Image.Image):
            image = image.convert("RGB")
        else:
            return "I see an image with various elements around."
# Preprocess and generate caption
inputs = processor(image, return_tensors="pt").to(device)
with torch.no_grad():
out = model.generate(
**inputs,
max_length=50,
num_beams=3,
early_stopping=True
)
description = processor.decode(out[0], skip_special_tokens=True)
# Clean up and format description
description = description.capitalize()
# Ensure it's a reasonable length for audio
if len(description) > 150:
description = description[:147] + "..."
print(f"πŸ“ Generated description: {description}")
return description
except Exception as e:
print(f"❌ Error in describe_image: {e}")
return "I can see various objects and surroundings. Please try with a different image."
def text_to_speech(text):
"""
Simple TTS solution that creates a dummy audio file
This ensures the app works even without real TTS
"""
print(f"πŸ”Š Creating audio for: {text}")
try:
        # Create a silent placeholder WAV file (1 second of silence)
        temp_dir = tempfile.gettempdir()
        audio_path = os.path.join(temp_dir, f"audio_{int(time.time())}.wav")
        sample_rate = 22050
        duration = 1  # seconds
        with wave.open(audio_path, 'w') as wav_file:
            wav_file.setnchannels(1)  # mono
            wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit PCM)
            wav_file.setframerate(sample_rate)
            # All-zero 16-bit samples are silence; write them in one call
            # rather than packing frames one at a time
            wav_file.writeframes(b'\x00\x00' * int(sample_rate * duration))
        print(f"✅ Created audio file: {audio_path}")
return audio_path
except Exception as e:
print(f"❌ Error creating audio file: {e}")
return None
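
def text_to_speech_gtts(text):
    """Optional real-TTS path, a minimal sketch assuming the third-party
    gTTS package is installed (it is not a dependency of this Space).
    Falls back to the silent placeholder above when gTTS is unavailable."""
    try:
        from gtts import gTTS  # optional import; may not be installed
        audio_path = os.path.join(
            tempfile.gettempdir(), f"audio_{int(time.time())}.mp3"
        )
        # gTTS synthesizes speech via a remote service and saves an MP3
        gTTS(text).save(audio_path)
        print(f"✅ Created spoken audio file: {audio_path}")
        return audio_path
    except Exception as e:
        print(f"❌ gTTS unavailable ({e}); using silent placeholder")
        return text_to_speech(text)
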
def test_tts():
"""Test TTS functionality"""
print("πŸ§ͺ Testing audio system...")
result = text_to_speech("Test message")
if result and os.path.exists(result):
print(f"βœ… Audio test passed: {result}")
return True
else:
print("❌ Audio test failed")
return False
if __name__ == "__main__":
test_tts()
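    # Optional extra check (a sketch): pass a local image path on the
    # command line to also smoke-test captioning, e.g. `python utils.py photo.jpg`
    import sys
    if len(sys.argv) > 1:
        test_describe_image(sys.argv[1])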