import os
import tempfile
import time
import wave

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
# Global variables for model caching
_model_loaded = False
_processor = None
_model = None
_device = "cpu"
def load_models():
"""Load models only when needed"""
global _model_loaded, _processor, _model, _device
if _model_loaded:
return _processor, _model, _device
try:
print("πŸ“₯ Loading BLIP model for image captioning...")
_device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"πŸš€ Using device: {_device}")
# Load processor and model
_processor = BlipProcessor.from_pretrained(
"Salesforce/blip-image-captioning-base"
)
_model = BlipForConditionalGeneration.from_pretrained(
"Salesforce/blip-image-captioning-base"
)
_model.to(_device)
_model.eval()
_model_loaded = True
print("βœ… BLIP model loaded successfully!")
except Exception as e:
print(f"❌ Error loading BLIP model: {e}")
_model_loaded = False
return _processor, _model, _device
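
def warmup():
    """Optional eager load at startup, a minimal sketch (nothing in this
    module calls it by default). Running it once when the server boots
    avoids paying the model download/load cost on the first user request;
    later calls to load_models() stay cheap thanks to the cache above."""
    load_models()
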
def describe_image(image):
"""
Generate description for the image using BLIP model
Returns a one-line description
"""
try:
processor, model, device = load_models()
if processor is None or model is None:
return "I see various objects in the surroundings. Please try again."
        # Convert to a PIL RGB image if needed (BLIP expects 3-channel RGB,
        # so grayscale or RGBA inputs are normalized here)
        if isinstance(image, str):
            # It's a filepath
            image = Image.open(image).convert("RGB")
        elif isinstance(image, Image.Image):
            image = image.convert("RGB")
        else:
            return "I see an image with various elements around."
# Preprocess and generate caption
inputs = processor(image, return_tensors="pt").to(device)
with torch.no_grad():
out = model.generate(
**inputs,
max_length=50,
num_beams=3,
early_stopping=True
)
description = processor.decode(out[0], skip_special_tokens=True)
# Clean up and format description
description = description.capitalize()
# Ensure it's a reasonable length for audio
if len(description) > 150:
description = description[:147] + "..."
print(f"πŸ“ Generated description: {description}")
return description
except Exception as e:
print(f"❌ Error in describe_image: {e}")
return "I can see various objects and surroundings. Please try with a different image."
def text_to_speech(text):
"""
Simple TTS solution that creates a dummy audio file
This ensures the app works even without real TTS
"""
print(f"πŸ”Š Creating audio for: {text}")
try:
        # Create a silent placeholder WAV file (1 second of silence)
        temp_dir = tempfile.gettempdir()
        audio_path = os.path.join(temp_dir, f"audio_{int(time.time())}.wav")
        sample_rate = 22050
        duration = 1  # seconds
        with wave.open(audio_path, 'w') as wav_file:
            wav_file.setnchannels(1)  # mono
            wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit PCM)
            wav_file.setframerate(sample_rate)
            # All-zero 16-bit samples are silence; write them in one call
            # rather than packing frames one at a time
            wav_file.writeframes(b'\x00\x00' * int(sample_rate * duration))
        print(f"✅ Created audio file: {audio_path}")
return audio_path
except Exception as e:
print(f"❌ Error creating audio file: {e}")
return None
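
def text_to_speech_gtts(text):
    """Optional real-TTS path, a minimal sketch assuming the third-party
    gTTS package is installed (it is not a dependency of this Space).
    Falls back to the silent placeholder above when gTTS is unavailable."""
    try:
        from gtts import gTTS  # optional import; may not be installed
        audio_path = os.path.join(
            tempfile.gettempdir(), f"audio_{int(time.time())}.mp3"
        )
        # gTTS synthesizes speech via a remote service and saves an MP3
        gTTS(text).save(audio_path)
        print(f"✅ Created spoken audio file: {audio_path}")
        return audio_path
    except Exception as e:
        print(f"❌ gTTS unavailable ({e}); using silent placeholder")
        return text_to_speech(text)
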
def test_tts():
"""Test TTS functionality"""
print("πŸ§ͺ Testing audio system...")
result = text_to_speech("Test message")
if result and os.path.exists(result):
print(f"βœ… Audio test passed: {result}")
return True
else:
print("❌ Audio test failed")
return False
if __name__ == "__main__":
test_tts()
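    # Optional extra check (a sketch): pass a local image path on the
    # command line to also smoke-test captioning, e.g. `python utils.py photo.jpg`
    import sys
    if len(sys.argv) > 1:
        test_describe_image(sys.argv[1])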