# NOTE: the "Spaces: Sleeping" banner that appeared here was Hugging Face
# Spaces UI residue captured when this file was exported; it is not code.
import speech_recognition as sr
from pydub import AudioSegment
import gradio as gr
from os import path
import requests
import openai
from openai import OpenAI
# Placeholder text for the question textbox (passed as `placeholder=` in the UI below).
prompt = "Type and press Enter"
def record_text(audio_file, api_key):
    """Transcribe an uploaded audio file with OpenAI Whisper.

    Parameters
    ----------
    audio_file : str
        Filesystem path to the uploaded audio (.wav or .mp3).
    api_key : str
        OpenAI API key used to authenticate the transcription request.

    Returns
    -------
    str
        Plain-text transcript from the ``whisper-1`` model
        (``response_format="text"`` makes the client return a string).
    """
    client = OpenAI(api_key=api_key)

    # Whisper accepts mp3 directly; only .wav uploads need converting.
    # The original unconditionally called AudioSegment.from_wav, which
    # crashed on .mp3 files even though the UI advertises both formats.
    if audio_file.lower().endswith(".wav"):
        converted = "converted_sound.mp3"
        AudioSegment.from_wav(audio_file).export(converted, format="mp3")
        audio_file = converted

    # `with` closes the handle even if the API call raises
    # (the original opened the file and never closed it).
    with open(audio_file, "rb") as audio_fh:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_fh,
            response_format="text",
        )
    return transcript
def api_calling(audio_file, prompt, api_key):
    """Transcribe the audio, then run `prompt` over the transcript via chat API.

    Parameters
    ----------
    audio_file : str
        Path to the uploaded audio file, forwarded to ``record_text``.
    prompt : str
        User question about the transcript. When empty, a default
        punctuation/casing clean-up prompt is applied instead.
    api_key : str
        OpenAI API key for both the transcription and the chat request.

    Returns
    -------
    str
        The chat model's reply.

    Raises
    ------
    requests.HTTPError
        If the chat-completions endpoint returns an error status.
    """
    audio_text = record_text(audio_file, api_key)

    # Bug fix: the original assigned this default prompt and then returned the
    # raw transcript, so the clean-up request was never actually sent.
    if not prompt:
        prompt = "Apply proper punctuations, upper case and lower case to the provided text."

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    payload = {
        "model": "gpt-3.5-turbo",
        # gpt-3.5-turbo expects a plain string message; prompt and transcript
        # are combined rather than sent as a multi-part content list.
        "messages": [
            {"role": "user", "content": f"{prompt}\n\n{audio_text}"}
        ],
        "max_tokens": 1000,
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=60,  # don't hang the Gradio UI on a stalled request
    )
    # Surface API errors explicitly instead of a confusing KeyError below.
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]
def message_and_history(audio_text, input, history, api_key):
    """Gradio callback: append one (question, answer) turn to the chat history.

    Parameters
    ----------
    audio_text : str
        Path of the uploaded audio file (from the ``gr.Audio`` component).
    input : str
        The user's question from the textbox; may be empty on first upload.
        (Name shadows the builtin but is kept for interface stability.)
    history : list[tuple[str, str]] | None
        Accumulated chat turns; ``None`` on the first invocation.
    api_key : str
        OpenAI API key forwarded to ``api_calling``.

    Returns
    -------
    tuple[list, list]
        The same history list twice: once for the chatbot display,
        once for the ``gr.State`` carrier.
    """
    history = history or []  # first call: gr.State starts out as None
    output_text = api_calling(audio_text, input, api_key)
    # Original had two byte-identical if/else branches; the only real intent
    # was to give empty questions a readable label in the chat log.
    label = input if input else "Speech from the video."
    history.append((label, output_text))
    return history, history
# --- Gradio UI wiring ------------------------------------------------------
block = gr.Blocks(theme=gr.themes.Monochrome(primary_hue="slate"))
with block:
    gr.Markdown("""<h1><center>Audio Recognition - Ask & Learn about an Audio</center></h1> """)
    with gr.Row():
        with gr.Column(scale=0.5):
            # Left column: audio upload, API-key entry, and the start button.
            aud_input = gr.Audio(type="filepath", label="Upload .mp3 or .wav file", sources="upload")
            api_input = gr.Textbox(label="Enter Api-key")
            upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
        with gr.Column():
            # Right column: chat display plus the free-form question box.
            chatbot = gr.Chatbot(label="Ask questions about the audio")
            message = gr.Textbox(label="User", placeholder=prompt)
            # Carries the (question, answer) history between callbacks.
            state = gr.State()
    # Both the button click and pressing Enter in the textbox run the same handler.
    upload_button.click(message_and_history, inputs=[aud_input, message, state, api_input], outputs=[chatbot, state])
    message.submit(message_and_history, inputs=[aud_input, message, state, api_input], outputs=[chatbot, state])
    # Second submit handler clears the textbox after the question is sent.
    message.submit(lambda: None, None, message, queue=False)
block.launch()