Baghdad99 committed
Commit 25fb027
Parent: cf0c6ae

Update app.py

Files changed (1): app.py (+44, -48)
app.py CHANGED
@@ -1,68 +1,64 @@
Old version (removed lines are prefixed with "-"):

  import gradio as gr
- import requests
  import numpy as np
- from pydub import AudioSegment
- import io
- from IPython.display import Audio

- # Define the Hugging Face Inference API URLs and headers
- ASR_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/saad-speech-recognition-hausa-audio-to-text"
- TTS_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/english_voice_tts"
- TRANSLATION_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/saad-hausa-text-to-english-text"
- headers = {"Authorization": "Bearer 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"}
-
- # Define the function to query the Hugging Face Inference API
- def query(api_url, payload=None, data=None):
-     if data is not None:
-         response = requests.post(api_url, headers=headers, data=data)
-     else:
-         response = requests.post(api_url, headers=headers, json=payload)
-     response_json = response.json()
-     if 'error' in response_json:
-         print(f"Error in query function: {response_json['error']}")
-         return None
-     return response_json

  # Define the function to translate speech
- def translate_speech(audio_file):
-     print(f"Type of audio: {type(audio_file)}, Value of audio: {audio_file}")  # Debug line

-     # Use the ASR pipeline to transcribe the audio
-     data = audio_file.read()
-     output = query(ASR_API_URL, data=data)
-     print(f"Output: {output}")  # Debug line

-     # Check if output is not None
-     if output is not None:
-         # Check if 'error' key exists in the output
-         if 'error' in output:
-             print(f"Error: {output['error']}")
-             return
-
-         # Check if 'text' key exists in the output
-         if 'text' in output:
-             transcription = output["text"]
-         else:
-             print("Key 'text' does not exist in the output.")
-             return
      else:
-         print("Output is None.")
          return

      # Use the translation pipeline to translate the transcription
-     translated_text = query(TRANSLATION_API_URL, {"inputs": transcription})

-     # Use the TTS pipeline to synthesize the translated text
-     response = requests.post(TTS_API_URL, headers=headers, json={"inputs": translated_text})
-     audio_bytes = response.content

-     # Display the audio output
-     return Audio(audio_bytes)

  # Define the Gradio interface
  iface = gr.Interface(
      fn=translate_speech,
-     inputs=gr.inputs.File(type="file"),  # Change this line
      outputs=gr.outputs.Audio(type="numpy"),
      title="Hausa to English Translation",
      description="Realtime demo for Hausa to English translation using speech recognition and text-to-speech synthesis."
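
One note on the removed version: it hard-codes a Hugging Face bearer token in `headers`, which should never be committed to a repo. Had the Inference API approach been kept, a minimal sketch of the safer pattern, assuming the token is supplied via a hypothetical HF_TOKEN environment variable, would be:

import os
import requests

# Read the API token from the environment instead of committing it to source.
headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}

def query(api_url, payload=None, data=None):
    # POST raw bytes when given, otherwise JSON, mirroring the removed helper.
    if data is not None:
        response = requests.post(api_url, headers=headers, data=data)
    else:
        response = requests.post(api_url, headers=headers, json=payload)
    return response.json()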
 
New version (added lines are prefixed with "+"):

  import gradio as gr
+ from transformers import pipeline, AutoTokenizer
  import numpy as np

+ # Load the pipeline for speech recognition and translation
+ pipe = pipeline(
+     "automatic-speech-recognition",
+     model="Baghdad99/saad-speech-recognition-hausa-audio-to-text",
+     tokenizer="Baghdad99/saad-speech-recognition-hausa-audio-to-text"
+ )
+ translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-to-english-text")
+ tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")

  # Define the function to translate speech
+ def translate_speech(audio):
+     # Separate the sample rate and the audio data
+     sample_rate, audio_data = audio

+     # Use the speech recognition pipeline to transcribe the audio
+     output = pipe(audio_data)
+     print(f"Output: {output}")  # Print the output to see what it contains

+     # Check if the output contains 'text'
+     if 'text' in output:
+         transcription = output["text"]
      else:
+         print("The output does not contain 'text'")
          return

      # Use the translation pipeline to translate the transcription
+     translated_text = translator(transcription, return_tensors="pt")
+     print(f"Translated text: {translated_text}")  # Print the translated text to see what it contains
+
+     # Check if the translated text contains 'generated_token_ids'
+     if 'generated_token_ids' in translated_text[0]:
+         # Decode the tokens into text
+         translated_text_str = translator.tokenizer.decode(translated_text[0]['generated_token_ids'])
+     else:
+         print("The translated text does not contain 'generated_token_ids'")
+         return
+
+     # Use the text-to-speech pipeline to synthesize the translated text
+     synthesised_speech = tts(translated_text_str)
+     print(f"Synthesised speech: {synthesised_speech}")  # Print the synthesised speech to see what it contains
+
+     # Check if the synthesised speech contains 'audio'
+     if 'audio' in synthesised_speech:
+         synthesised_speech_data = synthesised_speech['audio']
+     else:
+         print("The synthesised speech does not contain 'audio'")
+         return

+     # Scale the audio data to the range of int16 format
+     synthesised_speech = (synthesised_speech_data * 32767).astype(np.int16)

+     return 16000, synthesised_speech

  # Define the Gradio interface
  iface = gr.Interface(
      fn=translate_speech,
+     inputs=gr.inputs.Audio(source="microphone", type="numpy"),
      outputs=gr.outputs.Audio(type="numpy"),
      title="Hausa to English Translation",
      description="Realtime demo for Hausa to English translation using speech recognition and text-to-speech synthesis."
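
For quick local testing outside the Gradio UI, the new `translate_speech` can be exercised with a synthetic input. This is a minimal sketch, not part of the commit: it assumes the three Baghdad99 models load successfully and that the updated app.py is importable as `app` without launching the interface (any `iface.launch()` call sits outside this hunk), and it feeds one second of silence shaped like the `(sample_rate, samples)` tuple that `gr.inputs.Audio(type="numpy")` passes to the callback:

import numpy as np
from app import translate_speech  # assumes the updated app.py is on the import path

# One second of silence at 16 kHz, mimicking the microphone input tuple.
dummy_audio = (16000, np.zeros(16000, dtype=np.float32))

result = translate_speech(dummy_audio)
if result is None:
    print("One of the guarded steps bailed out; see the debug prints above.")
else:
    sample_rate, speech = result
    print(f"Synthesised speech: shape {speech.shape}, dtype {speech.dtype}, at {sample_rate} Hz")

Silence will not transcribe to meaningful Hausa, but it exercises the ASR, translation, and TTS calls end to end, including the final int16 scaling.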