import torch
import gradio as gr
from transformers import pipeline, AutoTokenizer, M2M100ForConditionalGeneration
from tokenization_small100 import SMALL100Tokenizer
import numpy as np
from pydub import AudioSegment

# Load the pipeline for speech recognition
pipe = pipeline(
    "automatic-speech-recognition",
    model="DrishtiSharma/whisper-large-v2-hausa",
    tokenizer="DrishtiSharma/whisper-large-v2-hausa"
)

# Load the new translation model and tokenizer
model_name = 'alirezamsh/small100'
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
tokenizer = SMALL100Tokenizer.from_pretrained(model_name)

tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")

# Define the function to translate speech
def translate_speech(audio_file):
    print(f"Type of audio: {type(audio_file)}, Value of audio: {audio_file}")  # Debug line

    # Load the audio file with pydub (from_file handles wav, mp3 and other formats)
    audio = AudioSegment.from_file(audio_file)

    # Convert the audio to mono and get the raw samples
    audio = audio.set_channels(1)
    audio_data = np.array(audio.get_array_of_samples())

    # Normalise the samples to float32 in [-1, 1], as expected by the ASR pipeline
    audio_data = audio_data.astype(np.float32) / float(1 << (8 * audio.sample_width - 1))

    # Transcribe the audio, passing the sampling rate so the pipeline can resample if needed
    output = pipe({"raw": audio_data, "sampling_rate": audio.frame_rate})

    print(f"Output: {output}")  # Print the output to see what it contains

    # Check if the output contains 'text'
    if 'text' in output:
        transcription = output["text"]
    else:
        print("The output does not contain 'text'")
        return

    # Translate the transcription with SMALL100; the model is steered by the
    # target language code, so no text prefix is needed
    tokenizer.tgt_lang = "en"
    encoded_text = tokenizer(transcription, return_tensors="pt")
    outputs = model.generate(**encoded_text)

    # Decode the tokens into text
    translated_text_str = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Use the text-to-speech pipeline to synthesize the translated text
    synthesised_speech = tts(translated_text_str)

    # The TTS pipeline returns a dict with the waveform and its sampling rate
    if 'audio' in synthesised_speech:
        synthesised_speech_data = synthesised_speech['audio']
        sampling_rate = synthesised_speech.get('sampling_rate', 16000)
    else:
        print("The synthesised speech does not contain 'audio'")
        return

    # Flatten the audio data
    synthesised_speech_data = synthesised_speech_data.flatten()

    # Scale the audio data to the int16 range expected by Gradio
    synthesised_speech_data = (synthesised_speech_data * 32767).astype(np.int16)

    return sampling_rate, synthesised_speech_data

# Define the Gradio interface (gr.inputs / gr.outputs are deprecated in favour of gr.Audio)
iface = gr.Interface(
    fn=translate_speech,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Audio(type="numpy"),
    title="Hausa to English Translation",
    description="Real-time demo for Hausa to English translation using speech recognition and text-to-speech synthesis."
)

iface.launch()
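
# A minimal local sanity check (a sketch; "sample_hausa.wav" is a hypothetical
# file path, not part of this repo). Left commented out so it does not run
# alongside the Gradio app:
#
#     rate, waveform = translate_speech("sample_hausa.wav")
#     print(f"Synthesised {waveform.shape[0]} samples at {rate} Hz")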