import gradio as gr
from gtts import gTTS
import os
import speech_recognition as sr
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from PIL import Image
import cv2
from pydub import AudioSegment
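
# Runtime notes (assumptions about the deployment environment, not part of the
# original code): pydub needs an ffmpeg binary on PATH to decode MP3 uploads,
# recognize_google() sends audio to Google's free web API and therefore needs
# network access, and the BLIP weights are downloaded from the Hugging Face Hub
# on first use.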
# Text-to-Speech function
def text_to_speech(text):
    tts = gTTS(text=text, lang='en', slow=False)
    filename = "output.mp3"
    tts.save(filename)
    return filename
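
# Example (assumed local usage): text_to_speech("Hello world") returns the path
# "output.mp3". Because the filename is fixed, concurrent requests overwrite
# each other's audio; that is acceptable for a single-user demo.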
# Speech-to-Text function
def speech_to_text(audio):
    if audio is None:
        return "Please record or upload an audio file."
    recognizer = sr.Recognizer()
    # Check if the uploaded file is an MP3
    if audio.endswith('.mp3'):
        # Convert MP3 to WAV so SpeechRecognition can read it
        audio_segment = AudioSegment.from_mp3(audio)
        wav_file = "temp.wav"
        audio_segment.export(wav_file, format="wav")
        audio = wav_file  # Update audio to the converted file
    try:
        with sr.AudioFile(audio) as source:
            audio_data = recognizer.record(source)
            text = recognizer.recognize_google(audio_data)
            return text
    except sr.UnknownValueError:
        return "Sorry, I could not understand the audio."
    except sr.RequestError as e:
        return f"Could not request results; {e}"
    except Exception as e:
        return f"An error occurred: {e}"
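
# Example (assumed local usage): speech_to_text("sample.wav") returns the
# transcript string, or one of the error messages above. recognize_google()
# uses Google's web speech API, so transcription only works while online.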
# Image Description function
def generate_image_description(image):
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    inputs = processor(images=image, return_tensors="pt")
    out = model.generate(**inputs)
    description = processor.decode(out[0], skip_special_tokens=True)
    return description
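
# A minimal caching sketch (assumption: repeated captioning requests make the
# per-call model reload above a bottleneck). Loading the BLIP weights once at
# module level and reusing them inside generate_image_description would avoid
# reloading on every request, e.g.:
#
#     _blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
#     _blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
#
# The names _blip_processor / _blip_model are illustrative and not part of the
# original app.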
# Video Description function
def generate_video_description(video):
    if video is None:
        return "Please upload a video file."
    # gr.File may hand back a plain file path or an object with a .name
    # attribute, depending on the Gradio version; handle both.
    video_path = video if isinstance(video, str) else video.name
    cap = cv2.VideoCapture(video_path)
    descriptions = []
    if not cap.isOpened():
        return "Error opening video file."
    frame_count = 0
    while frame_count < 5:  # Limit to the first 5 frames for this example
        ret, frame = cap.read()
        if not ret:
            break
        # OpenCV yields BGR frames; convert to RGB before captioning
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        description = generate_image_description(image)
        descriptions.append(description)
        frame_count += 1
    cap.release()
    # Join into one string so the result renders cleanly in a Textbox
    return "\n".join(descriptions) if descriptions else "No frames to describe."
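
# A sampling sketch (assumption: evenly spaced frames are more informative than
# the first five, which are usually near-identical). One way, using only the
# cv2 API already imported above:
#
#     total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
#     for idx in range(0, total, max(total // 5, 1)):
#         cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
#         ret, frame = cap.read()
#
# The 5-frame budget and step size are illustrative choices, not part of the
# original app.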
# Gradio Interface
def main():
    with gr.Blocks() as app:
        gr.Markdown("<h1 style='text-align: center; color: #1e90ff;'>AI-Powered Accessibility Tools</h1>")

        # Text-to-Speech Section
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("<div style='background-color: #f0f8ff; padding: 20px; border-radius: 8px;'>"
                            "<h2>Text-to-Speech</h2>"
                            "<ul>"
                            "<li><strong>Core Idea:</strong> Create natural-sounding speech from text input.</li>"
                            "<li><strong>Functionality:</strong> Converts written text into spoken words, helping individuals with reading difficulties or visual impairments.</li>"
                            "<li><strong>Target Audience:</strong> People with visual impairments, reading disabilities, and those who prefer audio content.</li>"
                            "</ul>"
                            "<strong>Supported Input:</strong> Plain text. <br>"
                            "<strong>Output:</strong> MP3 audio file."
                            "</div>")
                text_input = gr.Textbox(label="Enter text for Text-to-Speech", placeholder="Type your text here...")
                tts_button = gr.Button("Convert to Speech")
                tts_output = gr.Audio(label="TTS Output")
                tts_button.click(fn=text_to_speech, inputs=text_input, outputs=tts_output)
            # Speech-to-Text Section
            with gr.Column(scale=1):
                gr.Markdown("<div style='background-color: #e6ffe6; padding: 20px; border-radius: 8px;'>"
                            "<h2>Speech-to-Text</h2>"
                            "<ul>"
                            "<li><strong>Core Idea:</strong> Convert spoken language into written text.</li>"
                            "<li><strong>Functionality:</strong> Allows users to dictate speech and have it transcribed into text, facilitating communication and documentation.</li>"
                            "<li><strong>Target Audience:</strong> Individuals with hearing impairments, those who prefer speaking over typing, and people with mobility challenges.</li>"
                            "</ul>"
                            "<strong>Supported Input:</strong> WAV, FLAC, AIFF, MP3 (converted to WAV). <br>"
                            "<strong>Output:</strong> Transcribed text."
                            "</div>")
                stt_input = gr.Audio(label="Record or Upload Audio", type="filepath")
                stt_button = gr.Button("Convert Speech to Text")
                stt_output = gr.Textbox(label="Speech-to-Text Output")
                stt_button.click(fn=speech_to_text, inputs=stt_input, outputs=stt_output)
        # Image Description Section
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("<div style='background-color: #ffe6e6; padding: 20px; border-radius: 8px;'>"
                            "<h2>Image Description</h2>"
                            "<ul>"
                            "<li><strong>Core Idea:</strong> Generate descriptive text for images.</li>"
                            "<li><strong>Functionality:</strong> Analyzes and describes the content of images, making visual information accessible to those who are visually impaired.</li>"
                            "<li><strong>Target Audience:</strong> Individuals with visual impairments and those needing assistance in understanding visual content.</li>"
                            "</ul>"
                            "<strong>Supported Input:</strong> JPEG, PNG, BMP, GIF. <br>"
                            "<strong>Output:</strong> Text description."
                            "</div>")
                image_input = gr.Image(label="Upload an Image")
                image_desc_output = gr.Textbox(label="Image Description")
                image_desc_button = gr.Button("Describe Image")
                image_desc_button.click(fn=generate_image_description, inputs=image_input, outputs=image_desc_output)
            # Video Description Section
            with gr.Column(scale=1):
                gr.Markdown("<div style='background-color: #fff3e6; padding: 20px; border-radius: 8px;'>"
                            "<h2>Video Description</h2>"
                            "<ul>"
                            "<li><strong>Core Idea:</strong> Describe video content through generated text.</li>"
                            "<li><strong>Functionality:</strong> Provides textual descriptions of video frames, aiding understanding for those who cannot see the video.</li>"
                            "<li><strong>Target Audience:</strong> Individuals with visual impairments and those needing assistance in interpreting video content.</li>"
                            "</ul>"
                            "<strong>Supported Input:</strong> MP4, AVI, MOV. <br>"
                            "<strong>Output:</strong> Text descriptions of the sampled frames (one per line)."
                            "</div>")
                video_input = gr.File(label="Upload a Video", file_types=[".mp4", ".avi", ".mov"])
                video_desc_output = gr.Textbox(label="Video Descriptions")
                video_desc_button = gr.Button("Describe Video")
                video_desc_button.click(fn=generate_video_description, inputs=video_input, outputs=video_desc_output)

    app.launch()

if __name__ == "__main__":
    main()
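
# Assumed dependencies (not listed in this file): gradio, gTTS,
# SpeechRecognition, transformers, torch, opencv-python, pydub, and Pillow,
# plus an ffmpeg binary for pydub's MP3 decoding. On Hugging Face Spaces these
# would typically be pinned in requirements.txt (and packages.txt for ffmpeg).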