File size: 4,705 Bytes
4177df5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import requests
import base64
import io
import json
import gradio as gr
from gradio import Text
import base64
import numpy as np
from pydub import AudioSegment
# Text-to-speech endpoint that the request in this module is posted to
url = "https://ruslanmv-hf-llm-api-collection.hf.space/tts"

# Headers sent with the request: the body is JSON and a JSON reply is asked for
# (the "accept" value may need adjustment if the endpoint doesn't support JSON)
headers = {"Content-Type": "application/json", "accept": "application/json"}


def convert_text_to_base64(text, language="en", timeout=60):
    """Converts text to base64 encoded audio string using the provided API.

    Posts the text to the module-level ``url`` with the module-level
    ``headers``. Every failure (network error, non-200 status, error payload,
    missing or unusable audio) is printed and reported as ``None``; this
    function does not raise.

    Args:
        text (str): The text to convert to speech.
        language (str, optional): The language code for the speech (default: "en").
        timeout (float, optional): Seconds to wait for the API before giving up
            (default: 60). Without a timeout the request can block forever on
            an unresponsive server.

    Returns:
        str: The base64 encoded audio string on success, None on error.
    """

    # Top-level boundary: callers rely on "None on error", so nothing escapes.
    try:
        payload = {
            "input_text": text,
            "from_language": language,
        }

        response = requests.post(url, headers=headers, json=payload, timeout=timeout)

        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            return None

        try:
            response_data = response.json()
        except ValueError:
            # Body is not JSON, so treat it as raw binary audio. ValueError is
            # the common base of json.JSONDecodeError and of the decode errors
            # requests raises (including the simplejson-backed variant).
            return base64.b64encode(response.content).decode("utf-8")

        # A "detail" field in a JSON reply signals an error from the API.
        if "detail" in response_data:
            print(f"Error: {response_data['detail']}")
            return None

        audio_data = response_data.get("audio", None)
        if not audio_data:
            print("Error: Missing audio data in response.")
            return None

        if isinstance(audio_data, str):
            # JSON cannot carry raw bytes, so a textual "audio" field can only
            # be used if it is already base64.
            # NOTE(review): assumes the API base64-encodes this field - confirm.
            # Invalid base64 raises here and is reported as None below.
            base64.b64decode(audio_data, validate=True)
            return audio_data

        return base64.b64encode(audio_data).decode("utf-8")

    except Exception as e:
        print(f"Error: {e}")
        return None




def get_audio_properties(audio_data):
    """Decode encoded audio bytes and report their basic properties.

    The data is tried as WAV first and then as MP3; the first format that
    decodes wins.

    Args:
        audio_data (bytes): Contents of an encoded audio file.

    Returns:
        dict: With the keys
            "format" ("wav" or "mp3", whichever decoded),
            "duration" (length in seconds),
            "bitrate" (the segment's ``frame_rate``; the key name is kept for
            compatibility even though it is not a bit rate),
            "channels", "sample_width" (both as reported by the segment) and
            "audio_segment" (the decoded ``AudioSegment`` itself).

    Raises:
        ValueError: If the data cannot be read as either WAV or MP3.
    """
    last_error = None
    for audio_format in ("wav", "mp3"):
        try:
            audio_segment = AudioSegment.from_file(
                io.BytesIO(audio_data), format=audio_format
            )
        except Exception as e:
            # Remember why this format failed and fall through to the next one.
            last_error = e
            continue
        break
    else:
        # Neither format decoded; report the most recent failure.
        raise ValueError(f"Unknown audio format: {last_error}") from last_error

    return {
        "format": audio_format,
        "duration": len(audio_segment) / 1000.0,  # len() is in milliseconds
        "bitrate": audio_segment.frame_rate,
        "channels": audio_segment.channels,
        "sample_width": audio_segment.sample_width,
        "audio_segment": audio_segment,
    }

def play_audio(text):
    """Converts text to speech using the provided API and returns it for playback.

    Intended as the Gradio callback feeding a numpy-typed ``gr.Audio`` output.

    Args:
        text (str): The text to convert to speech.

    Returns:
        tuple: ``(sample_rate, samples)`` where ``sample_rate`` is the decoded
        audio's frame rate and ``samples`` is a numpy array of its samples,
        reshaped to ``(n, channels)`` when there is more than one channel.

    Raises:
        gr.Error: If the text could not be converted. A plain string returned
            to an audio output would be taken for a file path, so the failure
            is raised for Gradio to display instead.
        ValueError: Propagated from ``get_audio_properties`` when the returned
            audio cannot be decoded.
    """

    base64_encoded_audio = convert_text_to_base64(text)
    if not base64_encoded_audio:
        raise gr.Error("Error occurred during conversion.")

    # Decode the base64 string back to the encoded audio bytes
    audio_data = base64.b64decode(base64_encoded_audio)

    # Decode the audio and log what was received
    properties = get_audio_properties(audio_data)
    print("Audio Properties:", properties)

    # Convert audio segment to numpy array
    audio_segment = properties["audio_segment"]
    samples = np.array(audio_segment.get_array_of_samples())
    channels = audio_segment.channels
    if channels > 1:
        # Multi-channel samples arrive interleaved; give each channel a column.
        samples = samples.reshape((-1, channels))

    # Report the real sample rate so playback speed and pitch are correct.
    return audio_segment.frame_rate, samples
# Build the Gradio UI: a single text box in, a single audio player out
interface = gr.Interface(
    fn=play_audio,
    inputs=Text(label="Enter text to convert to speech"),
    outputs=gr.Audio(label="Generated audio", type="numpy"),
    title="Text to Speech API",
    description=(
        "Developed by Ruslan Magana, visit "
        "<a href='https://ruslanmv.com/' target='_blank'>ruslanmv.com</a>"
        " for more information."
    ),
)

# Start serving the interface
interface.launch()