import base64
import os
import tempfile
from io import BytesIO

import gradio as gr
import gtts as gt
import requests
from googletrans import Translator  # assumes googletrans==4.0.0rc1, whose translate() is synchronous
from PIL import Image

# Read the Hugging Face token from the environment rather than hardcoding a
# secret in source. The variable name HF_API_TOKEN is a project convention.
HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")


def trans(text, lang="ta"):
    """Translate text to the target language and synthesize it to an MP3 file."""
    translator = Translator()
    out = translator.translate(text, dest=lang)
    tts = gt.gTTS(text=out.text, lang=lang)
    # Save the audio as a temporary file; delete=False so Gradio can still
    # read the file after this function returns.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio_file:
        audio_path = temp_audio_file.name
    tts.save(audio_path)
    return audio_path


def object_recognition(image_array, lang):
    """Caption the image with BLIP via the Hugging Face Inference API."""
    API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large"
    headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
    # Encode the NumPy array as JPEG in memory; no temp file on disk needed.
    image = Image.fromarray(image_array)
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    response = requests.post(API_URL, headers=headers, data=buffered.getvalue())
    output = response.json()
    # A successful call returns a list of dicts; anything else (e.g. a
    # "model is loading" error dict) falls back to a spoken error message.
    if isinstance(output, list) and output and "generated_text" in output[0]:
        result = output[0]["generated_text"]
    else:
        result = "Object recognition failed for the captured image."
    return trans(result, lang)


def ocr_detection(image_array, lang):
    """Extract text from the image via the pragnakalp OCR Hugging Face Space."""
    image = Image.fromarray(image_array)
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    image_base64 = base64.b64encode(buffered.getvalue()).decode()
    response = requests.post(
        "https://pragnakalp-ocr-image-to-text.hf.space/run/predict",
        json={
            "data": [
                "PaddleOCR",
                f"data:image/png;base64,{image_base64}",
            ]
        },
    ).json()
    data = response.get("data", [])
    text = " ".join(str(item) for item in data) or "No text was detected in the image."
    return trans(text, lang)


def operator(image_array, value, lang):
    """Dispatch to object recognition ("1") or OCR ("2") and return an MP3 path."""
    if value == "1":
        return object_recognition(image_array, lang)
    if value == "2":
        return ocr_detection(image_array, lang)
    return trans("Sorry, I can't perform this operation.", lang)


# Create the Gradio interface: an image, an operation selector, and a target
# language code in; a playable audio file out.
iface = gr.Interface(
    fn=operator,
    inputs=[
        gr.Image(type="numpy", label="Image"),
        gr.Textbox(label='Operation ("1" = object recognition, "2" = OCR)'),
        gr.Textbox(label='Target language code, e.g. "ta"'),
    ],
    outputs=gr.Audio(type="filepath", label="Spoken result"),
)

if __name__ == "__main__":
    iface.launch(share=True)
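
# --- Optional direct invocation (a minimal sketch) ---------------------------
# A hedged convenience for exercising the pipeline without the web UI, e.g.
# from a REPL or a test after importing this module. The file name
# "sample.jpg" and the helper name demo_run are hypothetical, not part of any
# library. It sits below the __main__ guard on purpose: running the script
# blocks in iface.launch(), so this helper is only reachable via import.
def demo_run(image_path="sample.jpg", mode="2", lang="ta"):
    """Load an image from disk, run one pipeline branch, and print the MP3 path."""
    import numpy as np  # local import: only this demo needs NumPy

    arr = np.array(Image.open(image_path).convert("RGB"))
    audio_path = operator(arr, mode, lang)
    print("Generated audio:", audio_path)
    return audio_path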