"""Gradio voice assistant.

Records audio from the microphone, sends it to a speech-to-text endpoint
(``API_URL``), forwards the recognised text as a prompt to a text-generation
endpoint (``API_URL2``) and shows the generated answer as text.

Configuration is read from the environment variables ``API_URL``,
``API_KEY`` and ``API_URL2``.
"""

import base64
import io
import json
import os

import gradio as gr
import numpy as np
import requests
import soundfile as sf

API_URL = os.getenv("API_URL")
API_KEY = os.getenv("API_KEY")
API_URL2 = os.getenv("API_URL2")

# (connect, read) timeouts in seconds. Without a timeout ``requests`` waits
# forever, so one stalled endpoint would block the worker indefinitely.
REQUEST_TIMEOUT = (10, 300)


def _post_json(url, payload):
    """POST *payload* as JSON to *url* and return the successful response.

    Raises:
        RuntimeError: if *url* is empty (environment variable not set).
        requests.RequestException: on connection errors, timeouts or a
            non-2xx HTTP status.
    """
    if not url:
        raise RuntimeError(
            "API endpoint URL is not configured; "
            "set the API_URL and API_URL2 environment variables."
        )
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": API_KEY,
    }
    response = requests.post(
        url, json=payload, headers=headers, timeout=REQUEST_TIMEOUT
    )
    # Fail here with the HTTP status instead of later with an opaque
    # KeyError while digging through an error body.
    response.raise_for_status()
    return response


def _encode_wav_base64(sample_rate, samples):
    """Encode *samples* as a WAV file and return it as a base64 string."""
    # Encode in memory: a fixed on-disk file name would be shared (and
    # clobbered) by concurrent requests and leaked if an error occurred
    # before it was removed.
    buffer = io.BytesIO()
    sf.write(buffer, samples, sample_rate, format='wav')
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


def audio_to_base64(audio):
    """Run the full pipeline: audio -> transcription -> generated answer.

    Despite its name, this is the Gradio callback for the whole round trip.

    Args:
        audio: ``(sample_rate, samples)`` pair as delivered by the Gradio
            microphone input, or ``None`` when nothing was recorded.

    Returns:
        The text returned by :func:`second_api_call` for the transcription.

    Raises:
        gr.Error: if no audio was supplied, the transcription response has
            an unexpected shape, or it contains no segments.
        requests.RequestException: if either HTTP call fails.
    """
    if audio is None:
        raise gr.Error(
            "No audio received. Please record a message and submit again."
        )
    sr, data = audio
    base64_audio = _encode_wav_base64(sr, data)

    response_json = json.loads(send_to_api(base64_audio))
    try:
        segments = response_json["output"]["segments"]
        # Use every segment, not only the first, so that longer utterances
        # are not silently truncated. With a single segment the prompt is
        # exactly that segment's text.
        output_text = "".join(segment["text"] for segment in segments)
    except (KeyError, TypeError) as err:
        raise gr.Error(
            "The transcription service returned an unexpected response."
        ) from err
    if not segments:
        raise gr.Error("No speech was detected. Please try again.")

    # Second API call: turn the transcription into an answer.
    return second_api_call(output_text)


def send_to_api(base64_audio):
    """Send base64-encoded audio to the transcription endpoint.

    Args:
        base64_audio: WAV file content encoded as a base64 string.

    Returns:
        The raw response body (JSON text) of the endpoint at ``API_URL``.

    Raises:
        RuntimeError: if ``API_URL`` is not configured.
        requests.RequestException: on network errors or a non-2xx status.
    """
    payload = {
        "input": {
            "audio_base64": base64_audio,
            "model": "tiny",
            "transcription": "plain text",
            "translate": True,
            "language": "en",
            "temperature": 0,
            "best_of": 5,
            "beam_size": 5,
            "patience": 1,
            "suppress_tokens": "-1",
            "condition_on_previous_text": False,
            "temperature_increment_on_fallback": 0.2,
            "compression_ratio_threshold": 2.4,
            "logprob_threshold": -1,
            "no_speech_threshold": 0.6,
            "word_timestamps": False,
            "initial_prompt": "You are a voice assistant for Bhuvan Portal by ISRO",
        },
        "enable_vad": True,
    }
    return _post_json(API_URL, payload).text


def second_api_call(prompt_text):
    """Send *prompt_text* to the text-generation endpoint.

    Args:
        prompt_text: Prompt forwarded to the endpoint at ``API_URL2``.

    Returns:
        The first generated text, with literal ``\\n`` sequences (a
        backslash followed by ``n``) turned into real line breaks.

    Raises:
        RuntimeError: if ``API_URL2`` is not configured or the response
            does not have the expected ``output.text[0]`` shape.
        requests.RequestException: on network errors or a non-2xx status.
    """
    payload = {
        "input": {
            "prompt": prompt_text,
            "sampling_params": {
                "max_tokens": 2048,
                "n": 1,
                "best_of": None,
                "presence_penalty": 0,
                "frequency_penalty": 0,
                "temperature": 0.5,
                "top_p": 1,
                "top_k": -1,
                "use_beam_search": False,
                "stop": ["USER"],
                "ignore_eos": False,
                "logprobs": None,
            },
        },
    }
    response = _post_json(API_URL2, payload)
    response_json = json.loads(response.text)
    try:
        output_text = response_json["output"]["text"][0]  # Extract the "text" field
    except (KeyError, IndexError, TypeError) as err:
        raise RuntimeError(
            "The text-generation service returned an unexpected response."
        ) from err
    # Replace escaped "\n" sequences with actual new lines.
    return output_text.replace("\\n", "\n")


demo = gr.Interface(
    fn=audio_to_base64,
    inputs=["microphone"],
    outputs="text",
    title="Voice Assistant for SIF Hackathon *Our Vision* (Submit Again if error pops up)",
    description="Speak into the microphone and see the LLM response. (Faster Whisper tiny + llama13B)",
    theme='WeixuanYuan/Soft_dark',
)

if __name__ == "__main__":
    demo.launch()