|
|
import gradio as gr |
|
|
from gtts import gTTS |
|
|
import os |
|
|
import speech_recognition as sr |
|
|
from transformers import BlipProcessor, BlipForConditionalGeneration |
|
|
import torch |
|
|
from PIL import Image |
|
|
import cv2 |
|
|
from pydub import AudioSegment |
|
|
|
|
|
|
|
|
def text_to_speech(text):
    """Synthesise English speech for ``text`` and return the MP3 file path.

    Args:
        text: The text to speak.

    Returns:
        Path of the generated MP3 file, or ``None`` when ``text`` is empty
        or contains only whitespace (there is nothing to synthesise, and
        gTTS rejects empty input).
    """
    import tempfile

    if not text or not text.strip():
        return None

    tts = gTTS(text=text, lang='en', slow=False)

    # One file per request: a fixed "output.mp3" would be overwritten when
    # two requests are handled at the same time.
    fd, filename = tempfile.mkstemp(prefix="tts_", suffix=".mp3")
    os.close(fd)
    try:
        tts.save(filename)
    except Exception:
        # Do not leave an empty placeholder file behind on failure.
        os.remove(filename)
        raise
    return filename
|
|
|
|
|
|
|
|
def speech_to_text(audio):
    """Transcribe an audio file with the Google Web Speech recogniser.

    Args:
        audio: Path to the audio file. A path ending in ``.mp3`` (any
            letter case) is first converted to a temporary WAV file; any
            other path is handed to ``sr.AudioFile`` unchanged.

    Returns:
        The recognised text, or a human-readable message when no audio was
        supplied, the speech could not be understood, the recognition
        request failed, or any other error occurred.
    """
    import tempfile

    if not audio:
        return "No audio provided."

    recognizer = sr.Recognizer()
    temp_wav = None
    try:
        if audio.lower().endswith('.mp3'):
            # Unique temp file so simultaneous requests do not clobber a
            # shared "temp.wav"; it is deleted in the ``finally`` below.
            fd, temp_wav = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            AudioSegment.from_mp3(audio).export(temp_wav, format="wav")
            audio = temp_wav

        with sr.AudioFile(audio) as source:
            audio_data = recognizer.record(source)
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        return "Sorry, I could not understand the audio."
    except sr.RequestError as e:
        return f"Could not request results; {e}"
    except Exception as e:
        return f"An error occurred: {e}"
    finally:
        if temp_wav is not None and os.path.exists(temp_wav):
            os.remove(temp_wav)
|
|
|
|
|
|
|
|
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
_blip_cache = {}


def _load_blip():
    """Return the BLIP ``(processor, model)`` pair, loading it on first use."""
    if "pair" not in _blip_cache:
        _blip_cache["pair"] = (
            BlipProcessor.from_pretrained(_BLIP_CHECKPOINT),
            BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT),
        )
    return _blip_cache["pair"]


def generate_image_description(image):
    """Generate a caption for ``image`` with the BLIP captioning model.

    Args:
        image: The image to describe, in any form the BLIP processor
            accepts for its ``images`` argument.

    Returns:
        The decoded caption string, or ``"No image provided."`` when
        ``image`` is ``None``.
    """
    if image is None:
        return "No image provided."

    # Reuse the already-loaded processor/model; reloading them from the
    # checkpoint on every call dominates the cost of a request.
    processor, model = _load_blip()

    inputs = processor(images=image, return_tensors="pt")
    out = model.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True)
|
|
|
|
|
|
|
|
def generate_video_description(video, max_frames=5):
    """Caption the leading frames of a video.

    Args:
        video: Either a filesystem path string or an object whose ``name``
            attribute holds the path (which of the two the upload widget
            supplies is assumed to depend on the Gradio version/config).
        max_frames: Maximum number of frames, read from the start of the
            video, to caption. Defaults to 5.

    Returns:
        A list with one caption per frame read, or
        ``["No frames to describe."]`` when no frame could be read.
        Returns the string ``"No video provided."`` when ``video`` is
        ``None`` and ``"Error opening video file."`` when the file cannot
        be opened.
    """
    if video is None:
        return "No video provided."

    video_path = video if isinstance(video, str) else video.name
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return "Error opening video file."

        descriptions = []
        while len(descriptions) < max_frames:
            ret, frame = cap.read()
            if not ret:
                break
            # OpenCV frames are BGR; convert to RGB before building the
            # PIL image handed to the captioner.
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            descriptions.append(generate_image_description(image))
    finally:
        # Release the capture on every exit path, including errors raised
        # while captioning a frame.
        cap.release()

    return descriptions if descriptions else ["No frames to describe."]
|
|
|
|
|
|
|
|
def _feature_card_html(background, title, core_idea, functionality,
                       audience, supported_input, output):
    """Build the HTML info card shown above one tool's controls."""
    return (
        f"<div style='background-color: {background}; padding: 20px; border-radius: 8px;'>"
        f"<h2>{title}</h2>"
        "<ul>"
        f"<li><strong>Core Idea:</strong> {core_idea}</li>"
        f"<li><strong>Functionality:</strong> {functionality}</li>"
        f"<li><strong>Target Audience:</strong> {audience}</li>"
        "</ul>"
        f"<strong>Supported Input:</strong> {supported_input} <br>"
        f"<strong>Output:</strong> {output}"
        "</div>"
    )


def main():
    """Lay out the four accessibility tools in a 2x2 grid and launch the app."""
    with gr.Blocks() as demo:
        gr.Markdown("<h1 style='text-align: center; color: #1e90ff;'>AI-Powered Accessibility Tools</h1>")

        # First row: text-to-speech and speech-to-text.
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown(_feature_card_html(
                    "#f0f8ff",
                    "Text-to-Speech",
                    "Create natural-sounding speech from text input.",
                    "Converts written text into spoken words, helping individuals with reading difficulties or visual impairments.",
                    "People with visual impairments, reading disabilities, and those who prefer audio content.",
                    "Plain text.",
                    "MP3 audio file.",
                ))
                tts_text = gr.Textbox(
                    label="Enter text for Text-to-Speech",
                    placeholder="Type your text here...",
                )
                tts_trigger = gr.Button("Convert to Speech")
                tts_audio = gr.Audio(label="TTS Output")
                tts_trigger.click(fn=text_to_speech, inputs=tts_text, outputs=tts_audio)

            with gr.Column(scale=1):
                gr.Markdown(_feature_card_html(
                    "#e6ffe6",
                    "Speech-to-Text",
                    "Convert spoken language into written text.",
                    "Allows users to dictate speech and have it transcribed into text, facilitating communication and documentation.",
                    "Individuals with hearing impairments, those who prefer speaking over typing, and people with mobility challenges.",
                    "WAV, FLAC, AIFF, MP3 (converted to WAV).",
                    "Transcribed text.",
                ))
                stt_audio = gr.Audio(label="Record or Upload Audio", type="filepath")
                stt_trigger = gr.Button("Convert Speech to Text")
                stt_text = gr.Textbox(label="Speech-to-Text Output")
                stt_trigger.click(fn=speech_to_text, inputs=stt_audio, outputs=stt_text)

        # Second row: image and video description.
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown(_feature_card_html(
                    "#ffe6e6",
                    "Image Description",
                    "Generate descriptive text for images.",
                    "Analyzes and describes the content of images, making visual information accessible to those who are visually impaired.",
                    "Individuals with visual impairments and those needing assistance in understanding visual content.",
                    "JPEG, PNG, BMP, GIF.",
                    "Text description.",
                ))
                picture = gr.Image(label="Upload an Image")
                picture_caption = gr.Textbox(label="Image Description")
                picture_trigger = gr.Button("Describe Image")
                picture_trigger.click(
                    fn=generate_image_description,
                    inputs=picture,
                    outputs=picture_caption,
                )

            with gr.Column(scale=1):
                gr.Markdown(_feature_card_html(
                    "#fff3e6",
                    "Video Description",
                    "Describe video content through generated text.",
                    "Provides textual descriptions of video frames, aiding understanding for those who cannot see the video.",
                    "Individuals with visual impairments and those needing assistance in interpreting video content.",
                    "MP4, AVI, MOV.",
                    "List of text descriptions.",
                ))
                clip = gr.File(label="Upload a Video", file_types=[".mp4", ".avi", ".mov"])
                clip_captions = gr.Textbox(label="Video Descriptions")
                clip_trigger = gr.Button("Describe Video")
                clip_trigger.click(
                    fn=generate_video_description,
                    inputs=clip,
                    outputs=clip_captions,
                )

    # Start serving once the layout has been fully declared.
    demo.launch()
|
|
|
|
|
# Start the app only when this file is executed as a script, so importing
# the module (e.g. to reuse the helper functions) has no side effects.
if __name__ == "__main__":
    main()
|
|
|