import { Buffer } from "buffer";
import { randomBytes } from "crypto";
import { Readable } from "stream";
// Modified according to https://github.com/Migushthe2nd/MsEdgeTTS

/**
 * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume
 */
export enum VOLUME {
  SILENT = "silent",
  X_SOFT = "x-soft",
  SOFT = "soft",
  MEDIUM = "medium",
  LOUD = "loud",
X_LOUD = "x-LOUD", | |
DEFAULT = "default", | |
} | |
/** | |
* https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking | |
*/ | |
export enum RATE { | |
X_SLOW = "x-slow", | |
SLOW = "slow", | |
MEDIUM = "medium", | |
FAST = "fast", | |
X_FAST = "x-fast", | |
DEFAULT = "default", | |
} | |
/** | |
* https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline | |
*/ | |
export enum PITCH { | |
X_LOW = "x-low", | |
LOW = "low", | |
MEDIUM = "medium", | |
HIGH = "high", | |
X_HIGH = "x-high", | |
DEFAULT = "default", | |
} | |
/** | |
* Only a few of the [possible formats](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs) are accepted. | |
*/ | |
export enum OUTPUT_FORMAT { | |
// Streaming ============================= | |
// AMR_WB_16000HZ = "amr-wb-16000hz", | |
// AUDIO_16KHZ_16BIT_32KBPS_MONO_OPUS = "audio-16khz-16bit-32kbps-mono-opus", | |
// AUDIO_16KHZ_32KBITRATE_MONO_MP3 = "audio-16khz-32kbitrate-mono-mp3", | |
// AUDIO_16KHZ_64KBITRATE_MONO_MP3 = "audio-16khz-64kbitrate-mono-mp3", | |
// AUDIO_16KHZ_128KBITRATE_MONO_MP3 = "audio-16khz-128kbitrate-mono-mp3", | |
// AUDIO_24KHZ_16BIT_24KBPS_MONO_OPUS = "audio-24khz-16bit-24kbps-mono-opus", | |
// AUDIO_24KHZ_16BIT_48KBPS_MONO_OPUS = "audio-24khz-16bit-48kbps-mono-opus", | |
AUDIO_24KHZ_48KBITRATE_MONO_MP3 = "audio-24khz-48kbitrate-mono-mp3", | |
AUDIO_24KHZ_96KBITRATE_MONO_MP3 = "audio-24khz-96kbitrate-mono-mp3", | |
// AUDIO_24KHZ_160KBITRATE_MONO_MP3 = "audio-24khz-160kbitrate-mono-mp3", | |
// AUDIO_48KHZ_96KBITRATE_MONO_MP3 = "audio-48khz-96kbitrate-mono-mp3", | |
// AUDIO_48KHZ_192KBITRATE_MONO_MP3 = "audio-48khz-192kbitrate-mono-mp3", | |
// OGG_16KHZ_16BIT_MONO_OPUS = "ogg-16khz-16bit-mono-opus", | |
// OGG_24KHZ_16BIT_MONO_OPUS = "ogg-24khz-16bit-mono-opus", | |
// OGG_48KHZ_16BIT_MONO_OPUS = "ogg-48khz-16bit-mono-opus", | |
// RAW_8KHZ_8BIT_MONO_ALAW = "raw-8khz-8bit-mono-alaw", | |
// RAW_8KHZ_8BIT_MONO_MULAW = "raw-8khz-8bit-mono-mulaw", | |
// RAW_8KHZ_16BIT_MONO_PCM = "raw-8khz-16bit-mono-pcm", | |
// RAW_16KHZ_16BIT_MONO_PCM = "raw-16khz-16bit-mono-pcm", | |
// RAW_16KHZ_16BIT_MONO_TRUESILK = "raw-16khz-16bit-mono-truesilk", | |
// RAW_22050HZ_16BIT_MONO_PCM = "raw-22050hz-16bit-mono-pcm", | |
// RAW_24KHZ_16BIT_MONO_PCM = "raw-24khz-16bit-mono-pcm", | |
// RAW_24KHZ_16BIT_MONO_TRUESILK = "raw-24khz-16bit-mono-truesilk", | |
// RAW_44100HZ_16BIT_MONO_PCM = "raw-44100hz-16bit-mono-pcm", | |
// RAW_48KHZ_16BIT_MONO_PCM = "raw-48khz-16bit-mono-pcm", | |
// WEBM_16KHZ_16BIT_MONO_OPUS = "webm-16khz-16bit-mono-opus", | |
// WEBM_24KHZ_16BIT_24KBPS_MONO_OPUS = "webm-24khz-16bit-24kbps-mono-opus", | |
WEBM_24KHZ_16BIT_MONO_OPUS = "webm-24khz-16bit-mono-opus", | |
// Non-streaming ============================= | |
// RIFF_8KHZ_8BIT_MONO_ALAW = "riff-8khz-8bit-mono-alaw", | |
// RIFF_8KHZ_8BIT_MONO_MULAW = "riff-8khz-8bit-mono-mulaw", | |
// RIFF_8KHZ_16BIT_MONO_PCM = "riff-8khz-16bit-mono-pcm", | |
// RIFF_22050HZ_16BIT_MONO_PCM = "riff-22050hz-16bit-mono-pcm", | |
// RIFF_24KHZ_16BIT_MONO_PCM = "riff-24khz-16bit-mono-pcm", | |
// RIFF_44100HZ_16BIT_MONO_PCM = "riff-44100hz-16bit-mono-pcm", | |
// RIFF_48KHZ_16BIT_MONO_PCM = "riff-48khz-16bit-mono-pcm", | |
} | |
export type Voice = { | |
Name: string; | |
ShortName: string; | |
Gender: string; | |
Locale: string; | |
SuggestedCodec: string; | |
FriendlyName: string; | |
Status: string; | |
}; | |
export class ProsodyOptions {
  /**
   * The pitch to use.
   * Can be any {@link PITCH}, or a relative frequency in Hz (+50Hz), a relative semitone (+2st), or a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline)
   */
  pitch?: PITCH | string = "+0Hz";
  /**
   * The rate to use.
   * Can be any {@link RATE}, or a relative number (0.5), or a string with a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking)
   */
  rate?: RATE | string | number = 1.0;
  /**
   * The volume to use.
   * Can be any {@link VOLUME}, an absolute number (0 to 100), a string with a relative number (+50), or a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume)
   */
  volume?: VOLUME | string | number = 100.0;
}
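
// Illustrative usage sketch (not part of the module): combining ProsodyOptions
// values for a request. `tts` is assumed to be an initialized MsEdgeTTS
// instance with metadata already set.
//
//   const options = new ProsodyOptions();
//   options.pitch = PITCH.HIGH;    // or a relative value such as "+20Hz"
//   options.rate = 1.5;            // 1.5x the default speaking rate
//   options.volume = VOLUME.LOUD;  // or an absolute value such as 80
//   const stream = tts.toStream("Hello world", options);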
export class MsEdgeTTS {
  static OUTPUT_FORMAT = OUTPUT_FORMAT;
  private static TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
  private static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
  private static SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
  private static BINARY_DELIM = "Path:audio\r\n";
  private static VOICE_LANG_REGEX = /\w{2}-\w{2}/;
  private readonly _enableLogger: boolean;
  private _ws: WebSocket | undefined;
  private _voice: any;
  private _voiceLocale: any;
  private _outputFormat: any;
  private _streams: { [key: string]: Readable } = {};
  private _startTime = 0;

  private _log(...o: any[]) {
    if (this._enableLogger) {
      console.log(...o);
    }
  }
  /**
   * Create a new `MsEdgeTTS` instance.
   *
   * @param enableLogger=false whether to enable the built-in logger. This logs connection inits, disconnects, and incoming data to the console.
   */
  public constructor(enableLogger: boolean = false) {
    this._enableLogger = enableLogger;
  }
  private async _send(message: any) {
    // Reconnect (up to 3 attempts) if the socket is not open yet.
    for (let i = 1; i <= 3 && this._ws!.readyState !== this._ws!.OPEN; i++) {
      if (i === 1) {
        this._startTime = Date.now();
      }
      this._log("connecting: ", i);
      await this._initClient();
    }
    this._ws!.send(message);
  }
  private _initClient() {
    this._ws = new WebSocket(MsEdgeTTS.SYNTH_URL);
    this._ws.binaryType = "arraybuffer";
    return new Promise((resolve, reject) => {
      this._ws!.onopen = () => {
        this._log(
          "Connected in",
          (Date.now() - this._startTime) / 1000,
          "seconds",
        );
        this._send(
          `Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n
          {
            "context": {
              "synthesis": {
                "audio": {
                  "metadataoptions": {
                    "sentenceBoundaryEnabled": "false",
                    "wordBoundaryEnabled": "false"
                  },
                  "outputFormat": "${this._outputFormat}"
                }
              }
            }
          }
        `,
        ).then(resolve);
      };
      this._ws!.onmessage = (m: any) => {
        const buffer = Buffer.from(m.data as ArrayBuffer);
        const message = buffer.toString();
        const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)![1];
        if (message.includes("Path:turn.start")) {
          // start of turn, ignore
        } else if (message.includes("Path:turn.end")) {
          // end of turn, close stream
          this._streams[requestId].push(null);
        } else if (message.includes("Path:response")) {
          // context response, ignore
        } else if (
          message.includes("Path:audio") &&
          m.data instanceof ArrayBuffer
        ) {
          this._pushAudioData(buffer, requestId);
        } else {
          this._log("UNKNOWN MESSAGE", message);
        }
      };
      this._ws!.onclose = () => {
        this._log(
          "disconnected after:",
          (Date.now() - this._startTime) / 1000,
          "seconds",
        );
        for (const requestId in this._streams) {
          this._streams[requestId].push(null);
        }
      };
      this._ws!.onerror = function (error: any) {
        reject("Connect Error: " + error);
      };
    });
  }
  private _pushAudioData(audioBuffer: Buffer, requestId: string) {
    // The audio payload starts right after the "Path:audio\r\n" header delimiter.
    const audioStartIndex =
      audioBuffer.indexOf(MsEdgeTTS.BINARY_DELIM) +
      MsEdgeTTS.BINARY_DELIM.length;
    const audioData = audioBuffer.subarray(audioStartIndex);
    this._streams[requestId].push(audioData);
    this._log("received audio chunk, size: ", audioData?.length);
  }
  private _SSMLTemplate(input: string, options: ProsodyOptions = {}): string {
    // in case future updates to the Edge API block these elements, we'll be concatenating strings
    options = { ...new ProsodyOptions(), ...options };
    return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this._voiceLocale}">
      <voice name="${this._voice}">
        <prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}">
          ${input}
        </prosody>
      </voice>
    </speak>`;
  }
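
  // Illustrative sketch of the template's output, assuming the voice
  // "en-US-AriaNeural" (an example name) and default prosody. A call like
  // _SSMLTemplate("Hello") would produce SSML along these lines:
  //
  //   <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
  //          xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
  //     <voice name="en-US-AriaNeural">
  //       <prosody pitch="+0Hz" rate="1" volume="100">
  //         Hello
  //       </prosody>
  //     </voice>
  //   </speak>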
  /**
   * Fetch the list of voices available in Microsoft Edge.
   * This is not the complete list, however: the full list of voices supported by this module [can be found here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support) (neural, standard, and preview).
   */
  getVoices(): Promise<Voice[]> {
    return fetch(MsEdgeTTS.VOICES_URL).then((response) => {
      if (!response.ok) {
        throw new Error(
          `Failed to fetch voices: ${response.status} ${response.statusText}`,
        );
      }
      return response.json() as Promise<Voice[]>;
    });
  }
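
  // Illustrative sketch: listing the ShortNames of all British English voices.
  // `tts` is assumed to be an MsEdgeTTS instance.
  //
  //   const voices = await tts.getVoices();
  //   const british = voices
  //     .filter((v) => v.Locale === "en-GB")
  //     .map((v) => v.ShortName);
  //   console.log(british);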
  /**
   * Sets the required information for the speech to be synthesised and inits a new WebSocket connection.
   * Must be called at least once before text can be synthesised.
   * The values are saved on this instance, and the method can be called again at any time to update them.
   *
   * @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices)
   * @param outputFormat any {@link OUTPUT_FORMAT}
   * @param voiceLocale (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName`
   */
  async setMetadata(
    voiceName: string,
    outputFormat: OUTPUT_FORMAT,
    voiceLocale?: string,
  ) {
    const oldVoice = this._voice;
    const oldVoiceLocale = this._voiceLocale;
    const oldOutputFormat = this._outputFormat;

    this._voice = voiceName;
    this._voiceLocale = voiceLocale;
    if (!this._voiceLocale) {
      // Infer the locale (e.g. "en-US") from the voice name.
      const voiceLangMatch = MsEdgeTTS.VOICE_LANG_REGEX.exec(this._voice);
      if (!voiceLangMatch)
        throw new Error("Could not infer voiceLocale from voiceName!");
      this._voiceLocale = voiceLangMatch[0];
    }
    this._outputFormat = outputFormat;

    const changed =
      oldVoice !== this._voice ||
      oldVoiceLocale !== this._voiceLocale ||
      oldOutputFormat !== this._outputFormat;
    // create a new client if the metadata changed or the socket is not open
    if (changed || !this._ws || this._ws.readyState !== this._ws.OPEN) {
      this._startTime = Date.now();
      await this._initClient();
    }
  }
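
  // Illustrative sketch: the locale can be omitted when it is encoded in the
  // voice name ("en-US" is inferred from the example name "en-US-AriaNeural"):
  //
  //   await tts.setMetadata(
  //     "en-US-AriaNeural",
  //     OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS,
  //   );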
  private _metadataCheck() {
    if (!this._ws)
      throw new Error(
        "Speech synthesis not configured yet. Run setMetadata before calling toStream or toFile.",
      );
  }
  /**
   * Close the WebSocket connection.
   */
  close() {
    // Optional chaining so closing a never-connected instance does not throw.
    this._ws?.close();
  }
  /**
   * Writes raw audio synthesised from text in real-time to a {@link Readable}. Uses a basic {@link _SSMLTemplate SSML template}.
   *
   * @param input the text to synthesise. Can include SSML elements.
   * @param options (optional) {@link ProsodyOptions}
   * @returns {Readable} - a `stream.Readable` with the audio data
   */
  toStream(input: string, options?: ProsodyOptions): Readable {
    const { stream } = this._rawSSMLRequest(this._SSMLTemplate(input, options));
    return stream;
  }
  toArrayBuffer(input: string, options?: ProsodyOptions): Promise<ArrayBuffer> {
    return new Promise((resolve, reject) => {
      const data: Uint8Array[] = [];
      const readable = this.toStream(input, options);
      readable.on("data", (chunk) => {
        data.push(chunk);
      });
      readable.on("end", () => {
        // Buffer.concat may return a view into Node's shared allocation pool,
        // so slice out exactly the audio bytes instead of exposing the whole
        // backing ArrayBuffer.
        const merged = Buffer.concat(data);
        resolve(
          merged.buffer.slice(
            merged.byteOffset,
            merged.byteOffset + merged.byteLength,
          ),
        );
      });
      readable.on("error", (err) => {
        reject(err);
      });
    });
  }
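
  // Illustrative sketch: collecting a whole utterance in memory, then wrapping
  // it in a Node Buffer (assumes metadata was already set on `tts`):
  //
  //   const arrayBuffer = await tts.toArrayBuffer("Hello world");
  //   const audio = Buffer.from(arrayBuffer);
  //   console.log("synthesized", audio.length, "bytes");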
  /**
   * Writes raw audio synthesised from a request in real-time to a {@link Readable}. Has no SSML template; the required SSML elements must be provided in the request.
   *
   * @param requestSSML the SSML to send
   * @returns {Readable} - a `stream.Readable` with the audio data
   */
  rawToStream(requestSSML: string): Readable {
    const { stream } = this._rawSSMLRequest(requestSSML);
    return stream;
  }
  private _rawSSMLRequest(requestSSML: string): {
    stream: Readable;
    requestId: string;
  } {
    this._metadataCheck();
    const requestId = randomBytes(16).toString("hex");
    const request =
      `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nPath:ssml\r\n\r\n` +
      requestSSML.trim();
    // https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup
    const self = this;
    const stream = new Readable({
      read() {},
      destroy(error: Error | null, callback: (error: Error | null) => void) {
        delete self._streams[requestId];
        callback(error);
      },
    });
    this._streams[requestId] = stream;
    // Fire-and-forget: audio arrives asynchronously via the onmessage handler.
    void this._send(request);
    return { stream, requestId };
  }
}
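
// End-to-end usage sketch (assumes a Node.js environment; the voice name is
// only an example, any ShortName returned by getVoices() works):
//
//   import { createWriteStream } from "fs";
//
//   const tts = new MsEdgeTTS();
//   await tts.setMetadata(
//     "en-US-AriaNeural",
//     OUTPUT_FORMAT.AUDIO_24KHZ_48KBITRATE_MONO_MP3,
//   );
//   tts.toStream("Hello, world!").pipe(createWriteStream("hello.mp3"));
//   // ...later, once all synthesis is done:
//   tts.close();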