#!pip install torch
#!pip install noisereduce
#!pip install scipy

import requests
import base64
import re
import struct
import textwrap

import numpy as np
from scipy.io.wavfile import read, write
# import noisereduce as nr
import nltk

# nltk.sent_tokenize needs the 'punkt' tokenizer models; download them once if missing
# (newer NLTK releases may also require the 'punkt_tab' package).
nltk.download("punkt", quiet=True)

test = False

# Maximum sentence length (in characters) before it is split into smaller chunks
SENTENCE_SPLIT_LENGTH = 400

##["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn","ja"]
def detect_language(sentence):
    url = "https://ruslanmv-hf-llm-api-collection.hf.space/detect"
    data = {"input_text": sentence}
    headers = {"Accept": "application/json", "Content-Type": "application/json"}
    response = requests.post(url, headers=headers, json=data)
    if response.status_code == 200:
        try:
            response_json = response.json()
            language = response_json.get("lang")  # Assuming "lang" is the key
            return language
        except JSONDecodeError:
            print("Error: Invalid JSON response from the language detection API.")
    else:
        print(f"Error: Language detection API call failed with status code {response.status_code}")

    return None  # Fallback if API calls fail
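
# Example usage (hedged sketch), gated behind the `test` flag so importing this
# module makes no network call; the sample sentence is illustrative only.
if test:
    print(detect_language("Hello, how are you today?"))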

def split_sentences(text, max_len):
    # Apply custom rules to enforce sentence breaks with double punctuation
    text = re.sub(r"(\s*\.{2})\s*", r".\1 ", text)  # for '..'
    text = re.sub(r"(\s*\!{2})\s*", r"!\1 ", text)  # for '!!'

    # Use NLTK to split into sentences
    sentences = nltk.sent_tokenize(text)

    # If a sentence exceeds max_len, wrap it into smaller chunks with textwrap
    sentence_list = []
    for sent in sentences:
        if len(sent) > max_len:
            wrapped = textwrap.wrap(sent, max_len, break_long_words=True)
            sentence_list.extend(wrapped)
        else:
            sentence_list.append(sent)
    
    return sentence_list
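
# Example usage (hedged): split a long paragraph into chunks of at most 60
# characters; requires the NLTK 'punkt' tokenizer data downloaded above.
if test:
    long_text = (
        "This is the first sentence. Here is a second, much longer sentence "
        "that will be wrapped because it exceeds the chunk limit."
    )
    print(split_sentences(long_text, 60))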


def get_voice_streaming2(sentence, language):
    """Makes a POST request to the text-to-speech API and returns the Response;
    iterating the Response yields the raw audio bytes in chunks."""
    url = "https://ruslanmv-hf-llm-api-collection.hf.space/tts"
    data = {"input_text": sentence, "from_language": language}
    headers = {"Accept": "application/json", "Content-Type": "application/json"}
    response = requests.post(url, headers=headers, json=data)
    return response
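
# Example usage (hedged): gated behind `test`. Iterating the Response yields the
# body in raw byte chunks, which are assumed to be 16-bit mono PCM at 24 kHz
# (matching the pcm_to_wav2 defaults below) rather than JSON, despite the Accept header.
if test:
    resp = get_voice_streaming2("Hello there.", "en")
    pcm_bytes = b"".join(chunk for chunk in resp if chunk)
    print("received", len(pcm_bytes), "bytes of audio")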


def pcm_to_wav2(pcm_data, sample_rate=24000, channels=1, bit_depth=16):
    """Wraps raw PCM bytes in a minimal RIFF/WAVE header; returns the data unchanged if it already starts with one."""
    if pcm_data.startswith(b"RIFF"):
        return pcm_data

    fmt_subchunk_size = 16
    data_subchunk_size = len(pcm_data)
    chunk_size = 4 + (8 + fmt_subchunk_size) + (8 + data_subchunk_size)

    wav_header = struct.pack('<4sI4s', b'RIFF', chunk_size, b'WAVE')
    fmt_subchunk = struct.pack('<4sIHHIIHH',
                               b'fmt ', fmt_subchunk_size, 1, channels,
                               sample_rate, sample_rate * channels * bit_depth // 8,
                               channels * bit_depth // 8, bit_depth)

    data_subchunk = struct.pack('<4sI', b'data', data_subchunk_size)
    return wav_header + fmt_subchunk + data_subchunk + pcm_data
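
# Example usage (hedged sketch): wrap 100 ms of silence (16-bit mono PCM at 24 kHz)
# in a WAV header and write it to disk; the output file name is illustrative.
if test:
    silence = b"\x00\x00" * 2400  # 2400 samples * 2 bytes/sample = 100 ms at 24 kHz
    with open("silence_test.wav", "wb") as f:
        f.write(pcm_to_wav2(silence))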

def generate_speech_from_history2(history, chatbot_role, sentence):
    """
    Generates speech audio from a given sentence, performing necessary preprocessing.

    Args:
        history (list): Conversation history.
        chatbot_role (str): Role of the chatbot.
        sentence (str): The sentence to be converted to speech.

    Returns:
        list: A list of dictionaries containing text and audio (base64 encoded) for each sentence fragment.
    """
    language = "autodetect"
    if len(sentence) == 0:
        print("EMPTY SENTENCE")
        return
    # Preprocessing steps:
    # - Remove special prompt token (</s>)
    sentence = sentence.replace("</s>", "")
    # - Remove code sections (enclosed in triple backticks)
    sentence = re.sub("`.*`", "", sentence, flags=re.DOTALL)
    # - Remove inline code fragments (backticks)
    sentence = re.sub("`.*`", "", sentence, flags=re.DOTALL)
    # - Remove content within parentheses
    sentence = re.sub("\(.*\)", "", sentence, flags=re.DOTALL)
    # - Remove remaining triple backticks
    sentence = sentence.replace("```", "")
    # - Replace ellipses with spaces
    sentence = sentence.replace("...", " ")
    # - Replace parentheses with spaces
    sentence = sentence.replace("(", " ")
    sentence = sentence.replace(")", " ")
    # - Remove assistant tag
    sentence = sentence.replace("<|assistant|>","")
    if len(sentence) == 0:
        print("EMPTY SENTENCE after processing")
        return
    # - Handle punctuation at the end of sentences
    sentence = re.sub("([^\x00-\x7F]|\w)([\.。?!]+)", r"\1 \2", sentence)
    print("Sentence for speech:", sentence)
    results = []

    try:
        if len(sentence) < SENTENCE_SPLIT_LENGTH:
            sentence_list = [sentence]
        else:
            # Split longer sentences (implement your preferred split method)
            sentence_list = split_sentences(sentence, SENTENCE_SPLIT_LENGTH)
            print("detected sentences:", sentence_list)

        for sentence in sentence_list:
            print("- sentence =", sentence)
            if any(c.isalnum() for c in sentence):
                if language == "autodetect":
                    language = detect_language(sentence)  # Detect language on first call
                    print("language",language)
                audio_stream = get_voice_streaming2(sentence, language)
                if audio_stream is not None:
                    sentence_wav_bytestream = b""
                    # Process audio chunks
                    for chunk in audio_stream:
                        if chunk is not None:
                            sentence_wav_bytestream += chunk
                    # Encode WAV to base64
                    base64_audio = base64.b64encode(pcm_to_wav2(sentence_wav_bytestream)).decode('utf8')
                    print("base64_audio",base64_audio[:10])
                    results.append({ "text": sentence, "audio": base64_audio })
                else:
                    # Handle the case where the audio stream is None (e.g., silent response)
                    results.append({ "text": sentence, "audio": "" })

    except RuntimeError as e:
        if "device-side assert" in str(e):
            # cannot do anything , need to restart
            print(
                f"Exit due to: Unrecoverable exception caused by prompt:{sentence}",
                flush=True,
            )
            #This error is unrecoverable need to restart space
            #api.restart_space(repo_id=repo_id)
        else:
            print("RuntimeError: non device-side assert error:", str(e))
            raise e

    return results

if test:
    # Example usage
    history = []
    chatbot_role = "assistant"
    sentence = "Hello, how can I help you?"
    result = generate_speech_from_history2(history, chatbot_role, sentence)
    print(result)