Gandalf / app.py
AkashMnd's picture
Update app.py
d7e9e68
raw
history blame contribute delete
No virus
3.36 kB
import gradio as gr
import base64
import numpy as np
import soundfile as sf
import os
import requests
import json
# Endpoint and credential configuration, read from the environment once at
# import time. Each value is None when its variable is not set.
API_URL = os.environ.get("API_URL")    # URL posted to by send_to_api
API_KEY = os.environ.get("API_KEY")    # sent as the "authorization" header
API_URL2 = os.environ.get("API_URL2")  # URL posted to by second_api_call
def audio_to_base64(audio):
    """Transcribe a recording and return the language model's reply to it.

    The audio is encoded as WAV, base64-encoded, and passed to
    ``send_to_api``; the text of the first segment in that response is then
    passed to ``second_api_call``.

    Args:
        audio: A ``(sample_rate, samples)`` pair. It is unpacked in that
            order and handed to ``soundfile.write``.

    Returns:
        The value returned by ``second_api_call`` for the transcribed text.

    Raises:
        ValueError: If ``audio`` is ``None`` (e.g. nothing was recorded).
        KeyError, IndexError: If the transcription response does not contain
            ``["output"]["segments"][0]["text"]``.
    """
    import io  # function-scope import: the module does not import io itself

    if audio is None:
        # Fail with a clear message instead of an opaque tuple-unpacking error.
        raise ValueError("No audio was provided; please record and submit again.")
    sr, data = audio

    # Encode the WAV in memory. A shared on-disk "temp.wav" would be
    # overwritten or deleted by a concurrent request, and would be left
    # behind if encoding failed before the cleanup step.
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, data, sr, format='wav')
    base64_audio = base64.b64encode(wav_buffer.getvalue()).decode("utf-8")

    # First call: speech -> text.
    response_text = send_to_api(base64_audio)
    response_json = json.loads(response_text)
    output_text = response_json["output"]["segments"][0]["text"]

    # Second call: text -> model reply.
    return second_api_call(output_text)
def send_to_api(base64_audio, timeout=120):
    """POST base64-encoded audio to ``API_URL`` and return the raw response body.

    Args:
        base64_audio: Base64 text of the audio, sent as
            ``input.audio_base64``.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``.
            Without one, a stalled endpoint would block the caller forever.

    Returns:
        The response body as text (the caller parses it as JSON).

    Raises:
        requests.exceptions.HTTPError: If the endpoint answers with a 4xx/5xx
            status.
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    payload = {
        "input": {
            "audio_base64": base64_audio,
            "model": "tiny",
            "transcription": "plain text",
            "translate": True,
            "language": "en",
            "temperature": 0,
            "best_of": 5,
            "beam_size": 5,
            "patience": 1,
            "suppress_tokens": "-1",
            "condition_on_previous_text": False,
            "temperature_increment_on_fallback": 0.2,
            "compression_ratio_threshold": 2.4,
            "logprob_threshold": -1,
            "no_speech_threshold": 0.6,
            "word_timestamps": False,
            "initial_prompt": "You are a voice assistant for Bhuvan Portal by ISRO",
        },
        "enable_vad": True,
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": API_KEY,
    }
    response = requests.post(API_URL, json=payload, headers=headers, timeout=timeout)
    # Surface HTTP failures here rather than as a confusing KeyError or JSON
    # error when the caller tries to read the body.
    response.raise_for_status()
    return response.text
def second_api_call(prompt_text, timeout=120):
    """POST a prompt to ``API_URL2`` and return the first generated text.

    Args:
        prompt_text: Prompt string, sent as ``input.prompt``.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``.
            Without one, a stalled endpoint would block the caller forever.

    Returns:
        ``["output"]["text"][0]`` from the JSON response, with every literal
        backslash-n sequence replaced by a real newline.

    Raises:
        requests.exceptions.HTTPError: If the endpoint answers with a 4xx/5xx
            status.
        requests.exceptions.RequestException: On connection failure or
            timeout.
        KeyError, IndexError: If the response lacks ``["output"]["text"][0]``.
    """
    payload = {
        "input": {
            "prompt": prompt_text,
            "sampling_params": {
                "max_tokens": 2048,
                "n": 1,
                "best_of": None,
                "presence_penalty": 0,
                "frequency_penalty": 0,
                "temperature": 0.5,
                "top_p": 1,
                "top_k": -1,
                "use_beam_search": False,
                "stop": ["USER"],
                "ignore_eos": False,
                "logprobs": None,
            },
        },
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": API_KEY,
    }
    response = requests.post(API_URL2, json=payload, headers=headers, timeout=timeout)
    # Report HTTP failures directly instead of failing later on a missing key.
    response.raise_for_status()
    response_json = json.loads(response.text)
    output_text = response_json["output"]["text"][0]  # first generated text
    # Turn two-character backslash-n sequences that survived JSON decoding
    # into real line breaks.
    return output_text.replace("\\n", "\n")
# Gradio UI: one microphone input wired to audio_to_base64, with the returned
# string shown as text.
demo = gr.Interface(
    # Presentation
    title="Voice Assistant for SIF Hackathon *Our Vision* (Submit Again if error pops up)",
    description="Speak into the microphone and see the LLM response. (Faster Whisper tiny + llama13B)",
    theme='WeixuanYuan/Soft_dark',
    # Wiring
    fn=audio_to_base64,
    inputs=["microphone"],
    outputs="text",
)

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()