import { Buffer } from "buffer";
import { randomBytes } from "crypto";
import { Readable } from "stream";
// Modified according to https://github.com/Migushthe2nd/MsEdgeTTS

/**
 * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume
 */
export enum VOLUME {
  SILENT = "silent",
  X_SOFT = "x-soft",
  SOFT = "soft",
  MEDIUM = "medium",
  LOUD = "loud",
X_LOUD = "x-LOUD", | |
DEFAULT = "default", | |
} | |
/** | |
* https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking | |
*/ | |
export enum RATE { | |
X_SLOW = "x-slow", | |
SLOW = "slow", | |
MEDIUM = "medium", | |
FAST = "fast", | |
X_FAST = "x-fast", | |
DEFAULT = "default", | |
} | |
/** | |
* https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline | |
*/ | |
export enum PITCH { | |
X_LOW = "x-low", | |
LOW = "low", | |
MEDIUM = "medium", | |
HIGH = "high", | |
X_HIGH = "x-high", | |
DEFAULT = "default", | |
} | |
/** | |
* Only a few of the [possible formats](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs) are accepted. | |
*/ | |
export enum OUTPUT_FORMAT { | |
// Streaming ============================= | |
// AMR_WB_16000HZ = "amr-wb-16000hz", | |
// AUDIO_16KHZ_16BIT_32KBPS_MONO_OPUS = "audio-16khz-16bit-32kbps-mono-opus", | |
// AUDIO_16KHZ_32KBITRATE_MONO_MP3 = "audio-16khz-32kbitrate-mono-mp3", | |
// AUDIO_16KHZ_64KBITRATE_MONO_MP3 = "audio-16khz-64kbitrate-mono-mp3", | |
// AUDIO_16KHZ_128KBITRATE_MONO_MP3 = "audio-16khz-128kbitrate-mono-mp3", | |
// AUDIO_24KHZ_16BIT_24KBPS_MONO_OPUS = "audio-24khz-16bit-24kbps-mono-opus", | |
// AUDIO_24KHZ_16BIT_48KBPS_MONO_OPUS = "audio-24khz-16bit-48kbps-mono-opus", | |
AUDIO_24KHZ_48KBITRATE_MONO_MP3 = "audio-24khz-48kbitrate-mono-mp3", | |
AUDIO_24KHZ_96KBITRATE_MONO_MP3 = "audio-24khz-96kbitrate-mono-mp3", | |
// AUDIO_24KHZ_160KBITRATE_MONO_MP3 = "audio-24khz-160kbitrate-mono-mp3", | |
// AUDIO_48KHZ_96KBITRATE_MONO_MP3 = "audio-48khz-96kbitrate-mono-mp3", | |
// AUDIO_48KHZ_192KBITRATE_MONO_MP3 = "audio-48khz-192kbitrate-mono-mp3", | |
// OGG_16KHZ_16BIT_MONO_OPUS = "ogg-16khz-16bit-mono-opus", | |
// OGG_24KHZ_16BIT_MONO_OPUS = "ogg-24khz-16bit-mono-opus", | |
// OGG_48KHZ_16BIT_MONO_OPUS = "ogg-48khz-16bit-mono-opus", | |
// RAW_8KHZ_8BIT_MONO_ALAW = "raw-8khz-8bit-mono-alaw", | |
// RAW_8KHZ_8BIT_MONO_MULAW = "raw-8khz-8bit-mono-mulaw", | |
// RAW_8KHZ_16BIT_MONO_PCM = "raw-8khz-16bit-mono-pcm", | |
// RAW_16KHZ_16BIT_MONO_PCM = "raw-16khz-16bit-mono-pcm", | |
// RAW_16KHZ_16BIT_MONO_TRUESILK = "raw-16khz-16bit-mono-truesilk", | |
// RAW_22050HZ_16BIT_MONO_PCM = "raw-22050hz-16bit-mono-pcm", | |
// RAW_24KHZ_16BIT_MONO_PCM = "raw-24khz-16bit-mono-pcm", | |
// RAW_24KHZ_16BIT_MONO_TRUESILK = "raw-24khz-16bit-mono-truesilk", | |
// RAW_44100HZ_16BIT_MONO_PCM = "raw-44100hz-16bit-mono-pcm", | |
// RAW_48KHZ_16BIT_MONO_PCM = "raw-48khz-16bit-mono-pcm", | |
// WEBM_16KHZ_16BIT_MONO_OPUS = "webm-16khz-16bit-mono-opus", | |
// WEBM_24KHZ_16BIT_24KBPS_MONO_OPUS = "webm-24khz-16bit-24kbps-mono-opus", | |
WEBM_24KHZ_16BIT_MONO_OPUS = "webm-24khz-16bit-mono-opus", | |
// Non-streaming ============================= | |
// RIFF_8KHZ_8BIT_MONO_ALAW = "riff-8khz-8bit-mono-alaw", | |
// RIFF_8KHZ_8BIT_MONO_MULAW = "riff-8khz-8bit-mono-mulaw", | |
// RIFF_8KHZ_16BIT_MONO_PCM = "riff-8khz-16bit-mono-pcm", | |
// RIFF_22050HZ_16BIT_MONO_PCM = "riff-22050hz-16bit-mono-pcm", | |
// RIFF_24KHZ_16BIT_MONO_PCM = "riff-24khz-16bit-mono-pcm", | |
// RIFF_44100HZ_16BIT_MONO_PCM = "riff-44100hz-16bit-mono-pcm", | |
// RIFF_48KHZ_16BIT_MONO_PCM = "riff-48khz-16bit-mono-pcm", | |
} | |
export type Voice = { | |
Name: string; | |
ShortName: string; | |
Gender: string; | |
Locale: string; | |
SuggestedCodec: string; | |
FriendlyName: string; | |
Status: string; | |
}; | |
export class ProsodyOptions {
  /**
   * The pitch to use.
   * Can be any {@link PITCH}, or a relative frequency in Hz (+50Hz), a relative semitone (+2st), or a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline)
   */
  pitch?: PITCH | string = "+0Hz";
  /**
   * The rate to use.
   * Can be any {@link RATE}, or a relative number (0.5), or a string with a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking)
   */
  rate?: RATE | string | number = 1.0;
  /**
   * The volume to use.
   * Can be any {@link VOLUME}, an absolute number (0 to 100), a string with a relative number (+50), or a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume)
   */
  volume?: VOLUME | string | number = 100.0;
}
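
// Illustrative usage sketch (not part of the module): combining ProsodyOptions
// values for a request. `tts` is assumed to be an initialized MsEdgeTTS
// instance with metadata already set.
//
//   const options = new ProsodyOptions();
//   options.pitch = PITCH.HIGH;    // or a relative value such as "+20Hz"
//   options.rate = 1.5;            // 1.5x the default speaking rate
//   options.volume = VOLUME.LOUD;  // or an absolute value such as 80
//   const stream = tts.toStream("Hello world", options);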
export class MsEdgeTTS {
  static OUTPUT_FORMAT = OUTPUT_FORMAT;
  private static TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
  private static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
  private static SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
  private static BINARY_DELIM = "Path:audio\r\n";
  private static VOICE_LANG_REGEX = /\w{2}-\w{2}/;
  private readonly _enableLogger: boolean;
  private _ws: WebSocket | undefined;
  private _voice: any;
  private _voiceLocale: any;
  private _outputFormat: any;
  private _streams: { [key: string]: Readable } = {};
  private _startTime = 0;

  private _log(...o: any[]) {
    if (this._enableLogger) {
      console.log(...o);
    }
  }
  /**
   * Create a new `MsEdgeTTS` instance.
   *
   * @param enableLogger=false whether to enable the built-in logger. This logs connection inits, disconnects, and incoming data to the console.
   */
  public constructor(enableLogger: boolean = false) {
    this._enableLogger = enableLogger;
  }
  private async _send(message: any) {
    // Reconnect (up to 3 attempts) if the socket is not open yet.
    for (let i = 1; i <= 3 && this._ws!.readyState !== this._ws!.OPEN; i++) {
      if (i === 1) {
        this._startTime = Date.now();
      }
      this._log("connecting: ", i);
      await this._initClient();
    }
    this._ws!.send(message);
  }
  private _initClient() {
    this._ws = new WebSocket(MsEdgeTTS.SYNTH_URL);
    this._ws.binaryType = "arraybuffer";
    return new Promise((resolve, reject) => {
      this._ws!.onopen = () => {
        this._log(
          "Connected in",
          (Date.now() - this._startTime) / 1000,
          "seconds",
        );
        this._send(
          `Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n
          {
            "context": {
              "synthesis": {
                "audio": {
                  "metadataoptions": {
                    "sentenceBoundaryEnabled": "false",
                    "wordBoundaryEnabled": "false"
                  },
                  "outputFormat": "${this._outputFormat}"
                }
              }
            }
          }
        `,
        ).then(resolve);
      };
      this._ws!.onmessage = (m: any) => {
        const buffer = Buffer.from(m.data as ArrayBuffer);
        const message = buffer.toString();
        const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)![1];
        if (message.includes("Path:turn.start")) {
          // start of turn, ignore
        } else if (message.includes("Path:turn.end")) {
          // end of turn, close stream
          this._streams[requestId].push(null);
        } else if (message.includes("Path:response")) {
          // context response, ignore
        } else if (
          message.includes("Path:audio") &&
          m.data instanceof ArrayBuffer
        ) {
          this._pushAudioData(buffer, requestId);
        } else {
          this._log("UNKNOWN MESSAGE", message);
        }
      };
      this._ws!.onclose = () => {
        this._log(
          "disconnected after:",
          (Date.now() - this._startTime) / 1000,
          "seconds",
        );
        for (const requestId in this._streams) {
          this._streams[requestId].push(null);
        }
      };
      this._ws!.onerror = function (error: any) {
        reject("Connect Error: " + error);
      };
    });
  }
  private _pushAudioData(audioBuffer: Buffer, requestId: string) {
    // The audio payload starts right after the "Path:audio\r\n" header delimiter.
    const audioStartIndex =
      audioBuffer.indexOf(MsEdgeTTS.BINARY_DELIM) +
      MsEdgeTTS.BINARY_DELIM.length;
    const audioData = audioBuffer.subarray(audioStartIndex);
    this._streams[requestId].push(audioData);
    this._log("received audio chunk, size: ", audioData?.length);
  }
  private _SSMLTemplate(input: string, options: ProsodyOptions = {}): string {
    // in case future updates to the Edge API block these elements, we'll be concatenating strings
    options = { ...new ProsodyOptions(), ...options };
    return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this._voiceLocale}">
      <voice name="${this._voice}">
        <prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}">
          ${input}
        </prosody>
      </voice>
    </speak>`;
  }
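
  // Illustrative sketch of the template's output, assuming the voice
  // "en-US-AriaNeural" (an example name) and default prosody. A call like
  // _SSMLTemplate("Hello") would produce SSML along these lines:
  //
  //   <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
  //          xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
  //     <voice name="en-US-AriaNeural">
  //       <prosody pitch="+0Hz" rate="1" volume="100">
  //         Hello
  //       </prosody>
  //     </voice>
  //   </speak>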
  /**
   * Fetch the list of voices available in Microsoft Edge.
   * This is not the complete list, however: the full list of voices supported by this module [can be found here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support) (neural, standard, and preview).
   */
  getVoices(): Promise<Voice[]> {
    return fetch(MsEdgeTTS.VOICES_URL).then((response) => {
      if (!response.ok) {
        throw new Error(
          `Failed to fetch voices: ${response.status} ${response.statusText}`,
        );
      }
      return response.json() as Promise<Voice[]>;
    });
  }
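
  // Illustrative sketch: listing the ShortNames of all British English voices.
  // `tts` is assumed to be an MsEdgeTTS instance.
  //
  //   const voices = await tts.getVoices();
  //   const british = voices
  //     .filter((v) => v.Locale === "en-GB")
  //     .map((v) => v.ShortName);
  //   console.log(british);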
  /**
   * Sets the required information for the speech to be synthesised and inits a new WebSocket connection.
   * Must be called at least once before text can be synthesised.
   * The values are saved on this instance, and the method can be called again at any time to update them.
   *
   * @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices)
   * @param outputFormat any {@link OUTPUT_FORMAT}
   * @param voiceLocale (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName`
   */
  async setMetadata(
    voiceName: string,
    outputFormat: OUTPUT_FORMAT,
    voiceLocale?: string,
  ) {
    const oldVoice = this._voice;
    const oldVoiceLocale = this._voiceLocale;
    const oldOutputFormat = this._outputFormat;

    this._voice = voiceName;
    this._voiceLocale = voiceLocale;
    if (!this._voiceLocale) {
      // Infer the locale (e.g. "en-US") from the voice name.
      const voiceLangMatch = MsEdgeTTS.VOICE_LANG_REGEX.exec(this._voice);
      if (!voiceLangMatch)
        throw new Error("Could not infer voiceLocale from voiceName!");
      this._voiceLocale = voiceLangMatch[0];
    }
    this._outputFormat = outputFormat;

    const changed =
      oldVoice !== this._voice ||
      oldVoiceLocale !== this._voiceLocale ||
      oldOutputFormat !== this._outputFormat;
    // create a new client if the metadata changed or the socket is not open
    if (changed || !this._ws || this._ws.readyState !== this._ws.OPEN) {
      this._startTime = Date.now();
      await this._initClient();
    }
  }
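
  // Illustrative sketch: the locale can be omitted when it is encoded in the
  // voice name ("en-US" is inferred from the example name "en-US-AriaNeural"):
  //
  //   await tts.setMetadata(
  //     "en-US-AriaNeural",
  //     OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS,
  //   );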
  private _metadataCheck() {
    if (!this._ws)
      throw new Error(
        "Speech synthesis not configured yet. Run setMetadata before calling toStream or toFile.",
      );
  }
  /**
   * Close the WebSocket connection.
   */
  close() {
    // Optional chaining so closing a never-connected instance does not throw.
    this._ws?.close();
  }
  /**
   * Writes raw audio synthesised from text in real-time to a {@link Readable}. Uses a basic {@link _SSMLTemplate SSML template}.
   *
   * @param input the text to synthesise. Can include SSML elements.
   * @param options (optional) {@link ProsodyOptions}
   * @returns {Readable} - a `stream.Readable` with the audio data
   */
  toStream(input: string, options?: ProsodyOptions): Readable {
    const { stream } = this._rawSSMLRequest(this._SSMLTemplate(input, options));
    return stream;
  }
  toArrayBuffer(input: string, options?: ProsodyOptions): Promise<ArrayBuffer> {
    return new Promise((resolve, reject) => {
      const data: Uint8Array[] = [];
      const readable = this.toStream(input, options);
      readable.on("data", (chunk) => {
        data.push(chunk);
      });
      readable.on("end", () => {
        // Buffer.concat may return a view into Node's shared allocation pool,
        // so slice out exactly the audio bytes instead of exposing the whole
        // backing ArrayBuffer.
        const merged = Buffer.concat(data);
        resolve(
          merged.buffer.slice(
            merged.byteOffset,
            merged.byteOffset + merged.byteLength,
          ),
        );
      });
      readable.on("error", (err) => {
        reject(err);
      });
    });
  }
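
  // Illustrative sketch: collecting a whole utterance in memory, then wrapping
  // it in a Node Buffer (assumes metadata was already set on `tts`):
  //
  //   const arrayBuffer = await tts.toArrayBuffer("Hello world");
  //   const audio = Buffer.from(arrayBuffer);
  //   console.log("synthesized", audio.length, "bytes");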
  /**
   * Writes raw audio synthesised from a request in real-time to a {@link Readable}. Has no SSML template; the required SSML elements must be provided in the request.
   *
   * @param requestSSML the SSML to send
   * @returns {Readable} - a `stream.Readable` with the audio data
   */
  rawToStream(requestSSML: string): Readable {
    const { stream } = this._rawSSMLRequest(requestSSML);
    return stream;
  }
  private _rawSSMLRequest(requestSSML: string): {
    stream: Readable;
    requestId: string;
  } {
    this._metadataCheck();
    const requestId = randomBytes(16).toString("hex");
    const request =
      `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nPath:ssml\r\n\r\n` +
      requestSSML.trim();
    // https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup
    const self = this;
    const stream = new Readable({
      read() {},
      destroy(error: Error | null, callback: (error: Error | null) => void) {
        delete self._streams[requestId];
        callback(error);
      },
    });
    this._streams[requestId] = stream;
    // Fire-and-forget: audio arrives asynchronously via the onmessage handler.
    void this._send(request);
    return { stream, requestId };
  }
}
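
// End-to-end usage sketch (assumes a Node.js environment; the voice name is
// only an example, any ShortName returned by getVoices() works):
//
//   import { createWriteStream } from "fs";
//
//   const tts = new MsEdgeTTS();
//   await tts.setMetadata(
//     "en-US-AriaNeural",
//     OUTPUT_FORMAT.AUDIO_24KHZ_48KBITRATE_MONO_MP3,
//   );
//   tts.toStream("Hello, world!").pipe(createWriteStream("hello.mp3"));
//   // ...later, once all synthesis is done:
//   tts.close();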