Spaces:
Paused
Paused
| import * as ort from 'onnxruntime-web'; | |
// Preset demo texts injected by the host page (if any); empty map otherwise.
const presetTexts = window.presetTexts || {};
// Inline SVG markup for the custom audio player transport buttons.
const PLAY_ICON_SVG = `<svg width="24" height="24" viewBox="0 0 24 24" fill="currentColor" aria-hidden="true" focusable="false"><path d="M8 5v14l11-7-11-7z"></path></svg>`;
const PAUSE_ICON_SVG = `<svg width="24" height="24" viewBox="0 0 24 24" fill="currentColor" aria-hidden="true" focusable="false"><path d="M8 6h3v12H8V6zm5 0h3v12h-3V6z"></path></svg>`;
const STOP_ICON_SVG = `<svg width="24" height="24" viewBox="0 0 24 24" fill="currentColor" aria-hidden="true" focusable="false"><path d="M7 7h10v10H7V7z"></path></svg>`;
// Lightning background parallax: randomly toggles a flicker class on <body>
// to simulate lightning flashes, rescheduling itself forever.
(function initLightningParallax() {
  if (typeof document === 'undefined') {
    return;
  }
  // Toggle `className` on <body> for a burst of 1-2 quick flashes,
  // then hand control back via onComplete.
  const runBlink = (className, onComplete) => {
    let flashesLeft = 1 + Math.round(Math.random());
    const flashOnce = () => {
      if (flashesLeft-- <= 0) {
        if (typeof onComplete === 'function') {
          onComplete();
        }
        return;
      }
      const holdMs = 20 + Math.random() * 80;
      document.body.classList.add(className);
      setTimeout(() => {
        document.body.classList.remove(className);
        setTimeout(flashOnce, holdMs);
      }, holdMs);
    };
    flashOnce();
  };
  // Re-arm the flicker at a random point within the next 10 seconds.
  const schedule = () => {
    setTimeout(() => runBlink('lightning-flicker', schedule), Math.random() * 10000);
  };
  schedule();
})();
/**
 * Escape the five HTML-special characters so untrusted text can be safely
 * interpolated into innerHTML strings.
 *
 * Bug fix: the previous switch returned each matched character unchanged
 * (the entity strings had been lost), making the function a no-op and
 * defeating its XSS-protection purpose.
 * @param {string} value - Raw text to escape.
 * @returns {string} Text with &, <, >, " and ' replaced by HTML entities.
 */
function escapeHtml(value) {
  return value.replace(/[&<>"']/g, (match) => {
    switch (match) {
      case '&': return '&amp;';
      case '<': return '&lt;';
      case '>': return '&gt;';
      case '"': return '&quot;';
      case "'": return '&#39;';
      default: return match;
    }
  });
}
/**
 * Render a stat value as HTML, attaching a unit suffix span to each numeric
 * segment. Values containing "/" are split and every side gets the suffix;
 * with `firstLabel`, the first segment also gets a "First" prefix label.
 * @param {*} value - Raw stat value (stringified for display).
 * @param {string} suffix - Unit label appended after each segment.
 * @param {{firstLabel?: boolean}} [options]
 * @returns {string} Escaped HTML markup ('' for null/undefined values).
 */
function formatStatValueWithSuffix(value, suffix, options = {}) {
  const { firstLabel = false } = options;
  if (value == null) {
    return '';
  }
  if (!suffix) {
    return escapeHtml(String(value));
  }
  const raw = String(value).trim();
  // Placeholder values are shown verbatim, without suffix decoration.
  const isPlaceholder = !raw || raw === '--' || raw === '-' || raw.toLowerCase() === 'error';
  if (isPlaceholder) {
    return escapeHtml(raw);
  }
  const renderSegment = (segment, includePrefix = false) => {
    const trimmed = segment.trim();
    if (!trimmed) {
      return '';
    }
    const withPrefix = includePrefix && firstLabel;
    const numberSpan = `<span class="stat-value-number">${escapeHtml(trimmed)}</span>`;
    const suffixSpan = `<span class="stat-label stat-suffix">${escapeHtml(suffix)}</span>`;
    const prefixSpan = withPrefix
      ? `<span class="stat-label stat-suffix stat-prefix">First</span>`
      : '';
    const segmentClass = withPrefix
      ? 'stat-value-segment has-prefix'
      : 'stat-value-segment';
    return `<span class="${segmentClass}">${prefixSpan}${numberSpan}${suffixSpan}</span>`;
  };
  if (!raw.includes('/')) {
    return renderSegment(raw);
  }
  return raw
    .split('/')
    .map((part, index) => renderSegment(part, index === 0))
    .join(' / ');
}
/**
 * Unicode text processor: converts preprocessed text into padded model-input
 * id matrices using a unicode-value -> vocabulary-index lookup table.
 */
export class UnicodeProcessor {
  /**
   * @param {Object<number, number>} indexer - Maps unicode values to model
   *   vocabulary indices; -1 / missing entries mean "unsupported character".
   */
  constructor(indexer) {
    this.indexer = indexer;
  }

  /**
   * Convert a batch of texts into zero-padded id rows plus a text mask.
   * @param {string[]} textList - Raw input texts.
   * @param {?string} lang - Optional language code forwarded to preprocessText.
   * @returns {{textIds: number[][], textMask: number[][][], unsupportedChars: string[]}}
   */
  call(textList, lang = null) {
    const processedTexts = textList.map((text) => preprocessText(text, lang));
    const textIdsLengths = processedTexts.map((text) => text.length);
    const maxLen = Math.max(...textIdsLengths);
    const unsupportedChars = new Set();
    const textIds = processedTexts.map((text) => {
      // Rows are zero-initialized, so unsupported characters fall back to 0.
      const row = new Array(maxLen).fill(0);
      const unicodeVals = textToUnicodeValues(text);
      unicodeVals.forEach((value, j) => {
        const indexValue = this.indexer[value];
        if (indexValue === undefined || indexValue === null || indexValue === -1) {
          // Remember unknown characters so the caller can warn the user.
          unsupportedChars.add(text[j]);
        } else {
          row[j] = indexValue;
        }
      });
      return row;
    });
    return {
      textIds,
      textMask: getTextMask(textIdsLengths),
      unsupportedChars: Array.from(unsupportedChars),
    };
  }
}
| const AVAILABLE_LANGS = ["en", "ko", "es", "pt", "fr"]; | |
| /** | |
| * Language detection based on character patterns and language-specific markers | |
| * Returns the detected language code or null if uncertain | |
| */ | |
/**
 * Language detection based on character patterns and language-specific markers.
 * Looks at (at most) the last 100 characters: Hangul short-circuits to 'ko';
 * otherwise Latin-based languages are scored via distinctive characters,
 * exclusive stopwords, n-grams, French contractions and article patterns.
 * @param {string} text - Text to analyze.
 * @returns {?string} Detected language code, or null if uncertain.
 */
export function detectLanguage(text) {
  if (!text || text.trim().length < 3) {
    return null;
  }
  // Only consider last 100 characters for efficiency
  const sampleText = text.length > 100 ? text.substring(text.length - 100) : text;
  const normalizedText = sampleText.normalize('NFC').toLowerCase();
  // Korean detection: Hangul characters (most reliable)
  const koreanRegex = /[\uAC00-\uD7AF\u1100-\u11FF\u3130-\u318F\uA960-\uA97F\uD7B0-\uD7FF]/g;
  if ((normalizedText.match(koreanRegex) || []).length >= 2) {
    return 'ko';
  }
  // Scoring system for Latin-based languages
  const scores = { en: 0, es: 0, fr: 0, pt: 0 };
  // 1. Highly distinctive characters (definitive markers) plus
  //    French-specific accent patterns.
  const markerRules = [
    [/ñ/, 'es', 15],
    [/[¿¡]/, 'es', 12],
    [/ã/, 'pt', 15],
    [/õ/, 'pt', 15],
    [/œ/, 'fr', 15],
    [/[ùû]/, 'fr', 10],
    [/[èêë]/, 'fr', 5],
    [/[àâ]/, 'fr', 3],
    [/[îï]/, 'fr', 4],
    [/ô/, 'fr', 3],
  ];
  for (const [pattern, lang, points] of markerRules) {
    if (pattern.test(normalizedText)) {
      scores[lang] += points;
    }
  }
  // ç is shared between French and Portuguese
  if (/ç/.test(normalizedText)) {
    scores.fr += 4;
    scores.pt += 4;
  }
  // 2. Exclusive stopwords (words unique to one language)
  const exclusiveWords = {
    en: ['the', 'is', 'are', 'was', 'were', 'have', 'has', 'been', 'will', 'would', 'could', 'should', 'this', 'that', 'with', 'from', 'they', 'what', 'which', 'there', 'their', 'about', 'these', 'other', 'into', 'just', 'your', 'some', 'than', 'them', 'then', 'only', 'being', 'through', 'after', 'before'],
    es: ['el', 'los', 'las', 'es', 'está', 'están', 'porque', 'pero', 'muy', 'también', 'más', 'este', 'esta', 'estos', 'estas', 'ese', 'esa', 'yo', 'tú', 'nosotros', 'ellos', 'ellas', 'hola', 'gracias', 'buenos', 'buenas', 'ahora', 'siempre', 'nunca', 'todo', 'nada', 'algo', 'alguien'],
    fr: ['le', 'les', 'est', 'sont', 'dans', 'ce', 'cette', 'ces', 'il', 'elle', 'ils', 'elles', 'je', 'tu', 'nous', 'vous', 'avec', 'sur', 'ne', 'pas', 'plus', 'tout', 'bien', 'fait', 'être', 'avoir', 'donc', 'car', 'ni', 'jamais', 'toujours', 'rien', 'quelque', 'encore', 'aussi', 'très', 'peu', 'ici'],
    pt: ['os', 'as', 'é', 'são', 'está', 'estão', 'não', 'na', 'no', 'da', 'do', 'das', 'dos', 'ao', 'aos', 'ele', 'ela', 'eles', 'elas', 'eu', 'nós', 'você', 'vocês', 'seu', 'sua', 'seus', 'suas', 'muito', 'também', 'já', 'foi', 'só', 'mesmo', 'ter', 'até', 'isso', 'olá', 'obrigado', 'obrigada', 'bom', 'boa', 'agora', 'sempre', 'nunca', 'tudo', 'nada', 'algo', 'alguém']
  };
  const words = normalizedText.match(/[a-záàâãäåçéèêëíìîïñóòôõöúùûüýÿœæ]+/g) || [];
  for (const word of words) {
    for (const [lang, wordList] of Object.entries(exclusiveWords)) {
      if (wordList.includes(word)) {
        scores[lang] += 3;
      }
    }
  }
  // 3. Common n-grams (character patterns)
  const ngramPatterns = {
    en: [/th/g, /ing/g, /tion/g, /ight/g, /ould/g],
    es: [/ción/g, /mente/g, /ado/g, /ido/g],
    fr: [/tion/g, /ment/g, /eau/g, /aux/g, /eux/g, /oir/g, /ais/g, /ait/g, /ont/g],
    pt: [/ção/g, /ões/g, /mente/g, /ado/g, /ido/g, /nh/g, /lh/g]
  };
  for (const [lang, patterns] of Object.entries(ngramPatterns)) {
    for (const pattern of patterns) {
      scores[lang] += (normalizedText.match(pattern) || []).length * 2;
    }
  }
  // 4. French contractions and apostrophes
  const frenchContractions = /[cdjlmnst]'[aeiouéèêàâîïôûù]/g;
  scores.fr += (normalizedText.match(frenchContractions) || []).length * 5;
  // 5. Article patterns that help distinguish
  // "the" is very English, "el/la" Spanish, "le/la" French, "o/a" Portuguese
  if (/\bthe\b/.test(normalizedText)) scores.en += 5;
  if (/\b(el|los)\b/.test(normalizedText)) scores.es += 4;
  if (/\b(le|les)\b/.test(normalizedText)) scores.fr += 4;
  if (/\b(o|os)\b/.test(normalizedText)) scores.pt += 3;
  // Pick the language with the highest score (ties keep the first winner).
  let detectedLang = null;
  let maxScore = 0;
  for (const [lang, score] of Object.entries(scores)) {
    if (score > maxScore) {
      maxScore = score;
      detectedLang = lang;
    }
  }
  // Only return if we have enough confidence (minimum threshold)
  return maxScore >= 4 ? detectedLang : null;
}
| // Language display names for toast notification | |
// Language display names for toast notification
// (keys mirror the supported language codes).
const LANGUAGE_NAMES = {
  'en': 'English',
  'ko': 'Korean',
  'es': 'Spanish',
  'pt': 'Portuguese',
  'fr': 'French'
};
/**
 * Normalize raw text for TTS: strips emojis, canonicalizes dashes/quotes,
 * expands known expressions, fixes spacing around punctuation, collapses
 * whitespace, guarantees terminal punctuation, and wraps the result in a
 * language tag.
 * @param {string} text - Raw input text.
 * @param {?string} lang - Language code from AVAILABLE_LANGS, or null to
 *   wrap with the <na> ("no explicit language") tag.
 * @returns {string} Cleaned text wrapped as `<lang>...</lang>` or `<na>...</na>`.
 * @throws {Error} If `lang` is provided but not in AVAILABLE_LANGS.
 */
export function preprocessText(text, lang = null) {
  // Normalize unicode characters
  let result = text.normalize('NFKD');
  // Remove emojis
  result = result.replace(/[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu, '');
  // Replace various dashes and symbols
  const replacements = {
    "–": "-",
    "‑": "-",
    "—": "-",
    "_": " ",
    "\u201C": '"', // "
    "\u201D": '"', // "
    "\u2018": "'", // '
    "\u2019": "'", // '
    "´": "'",
    "`": "'",
    "[": " ",
    "]": " ",
    "|": " ",
    "/": " ", // FIXME: `/` should be pronounced.
    "#": " ", // FIXME: `#` should be pronounced.
    "→": " ",
    "←": " ",
  };
  for (const [from, to] of Object.entries(replacements)) {
    result = result.replaceAll(from, to);
  }
  // Remove special symbols
  result = result.replace(/[♥☆♡©\\]/g, "");
  // Replace known expressions
  const exprReplacements = {
    "@": " at ",
    "e.g.,": "for example,",
    "i.e.,": "that is,",
  };
  for (const [from, to] of Object.entries(exprReplacements)) {
    result = result.replaceAll(from, to);
  }
  // Fix spacing around punctuation (drop one space before each mark)
  const spacingFixes = [
    [/ ,/g, ","],
    [/ \./g, "."],
    [/ !/g, "!"],
    [/ \?/g, "?"],
    [/ ;/g, ";"],
    [/ :/g, ":"],
    [/ '/g, "'"],
  ];
  for (const [pattern, replacement] of spacingFixes) {
    result = result.replace(pattern, replacement);
  }
  // Collapse runs of duplicated quote characters down to a single one
  while (result.includes('""')) {
    result = result.replace(/""/g, '"');
  }
  while (result.includes("''")) {
    result = result.replace(/''/g, "'");
  }
  while (result.includes("``")) {
    result = result.replace(/``/g, "`");
  }
  // Remove extra spaces
  result = result.replace(/\s+/g, " ").trim();
  // If text doesn't end with punctuation, quotes, or closing brackets, add a period
  if (!/[.!?;:,'"')\]}…。」』】〉》›»]$/.test(result)) {
    result += ".";
  }
  // Add language tags
  if (lang === null) {
    return `<na>` + result + `</na>`;
  }
  if (!AVAILABLE_LANGS.includes(lang)) {
    throw new Error(`Invalid language: ${lang}`);
  }
  return `<${lang}>` + result + `</${lang}>`;
}
/**
 * Convert a string into an array of Unicode code point values, one per
 * code point (surrogate pairs yield a single value).
 *
 * Bug fix: the previous version split by code point via Array.from but then
 * read charCodeAt(0), which returns only the high surrogate (e.g. 0xD834)
 * for astral characters; codePointAt(0) returns the full code point, so
 * distinct astral characters no longer collide in the indexer lookup.
 * @param {string} text - Input text.
 * @returns {number[]} Unicode code point values.
 */
export function textToUnicodeValues(text) {
  return Array.from(text).map(char => char.codePointAt(0));
}
/**
 * Build a [batch, 1, maxLen] binary mask from per-item lengths.
 * @param {number[]} lengths - Valid length of each batch item.
 * @param {?number} maxLen - Mask width; defaults to the largest length.
 * @returns {number[][][]} mask[b][0][t] is 1.0 for t < lengths[b], else 0.0.
 */
export function lengthToMask(lengths, maxLen = null) {
  const width = maxLen || Math.max(...lengths);
  return lengths.map((length) => [
    Array.from({ length: width }, (_, t) => (t < length ? 1.0 : 0.0)),
  ]);
}
/**
 * Build the text mask for a batch — a thin alias over lengthToMask.
 * @param {number[]} textIdsLengths - Per-item text lengths.
 * @returns {number[][][]} Mask shaped [batch, 1, maxLen].
 */
export function getTextMask(textIdsLengths) {
  return lengthToMask(textIdsLengths);
}
/**
 * Build the latent-space mask for a batch of waveform lengths. The latent
 * length is the waveform length divided by the latent chunk size
 * (ae.base_chunk_size * ttl.chunk_compress_factor), rounded up.
 * @param {number[]} wavLengths - Waveform sample counts per batch item.
 * @param {Object} cfgs - Model config providing the chunk-size fields above.
 * @returns {number[][][]} Mask shaped [batch, 1, maxLatentLen].
 */
export function getLatentMask(wavLengths, cfgs) {
  const latentSize = cfgs.ae.base_chunk_size * cfgs.ttl.chunk_compress_factor;
  // Ceiling division expressed with integer arithmetic.
  const latentLengths = wavLengths.map(
    (len) => Math.floor((len + latentSize - 1) / latentSize)
  );
  return lengthToMask(latentLengths);
}
/**
 * Sample standard-normal noise for the latent tensor and zero it beyond each
 * item's valid latent length.
 * @param {number[][][]} duration - Per-item duration in seconds, shaped [B][1][1].
 * @param {Object} cfgs - Model config (ae.sample_rate, ae.base_chunk_size,
 *   ttl.chunk_compress_factor, ttl.latent_dim).
 * @returns {{noisyLatent: number[][][], latentMask: number[][][]}}
 */
export function sampleNoisyLatent(duration, cfgs) {
  const sampleRate = cfgs.ae.sample_rate;
  const baseChunkSize = cfgs.ae.base_chunk_size;
  const chunkCompressFactor = cfgs.ttl.chunk_compress_factor;
  const ldim = cfgs.ttl.latent_dim;
  const wavLenMax = Math.max(...duration.map(d => d[0][0])) * sampleRate;
  const wavLengths = duration.map(d => Math.floor(d[0][0] * sampleRate));
  const chunkSize = baseChunkSize * chunkCompressFactor;
  // Ceiling division: number of latent frames covering the longest waveform.
  const latentLen = Math.floor((wavLenMax + chunkSize - 1) / chunkSize);
  const latentDim = ldim * chunkCompressFactor;
  // Box-Muller transform for N(0, 1) samples.
  // Bug fix: Math.random() can return exactly 0, making Math.log(u1) equal
  // -Infinity and poisoning the latent with Infinity/NaN. 1 - Math.random()
  // lies in (0, 1], which keeps the logarithm finite.
  const randNormal = () => {
    const u1 = 1 - Math.random();
    const u2 = Math.random();
    return Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2);
  };
  const noisyLatent = [];
  for (let b = 0; b < duration.length; b++) {
    const batch = [];
    for (let d = 0; d < latentDim; d++) {
      const row = [];
      for (let t = 0; t < latentLen; t++) {
        row.push(randNormal());
      }
      batch.push(row);
    }
    noisyLatent.push(batch);
  }
  // Zero out noise past each item's valid latent length.
  const latentMask = getLatentMask(wavLengths, cfgs);
  for (let b = 0; b < noisyLatent.length; b++) {
    for (let d = 0; d < noisyLatent[b].length; d++) {
      for (let t = 0; t < noisyLatent[b][d].length; t++) {
        noisyLatent[b][d][t] *= latentMask[b][0][t];
      }
    }
  }
  return { noisyLatent, latentMask };
}
/**
 * Create an ONNX Runtime inference session for a single model file.
 * @param {string} onnxPath - URL/path of the .onnx model file.
 * @param {Object} opts - ort.InferenceSession creation options.
 * @returns {Promise<Object>} The created inference session.
 */
export async function loadOnnx(onnxPath, opts) {
  return await ort.InferenceSession.create(onnxPath, opts);
}
/**
 * Load the four TTS ONNX models in parallel.
 * @param {string} basePath - Directory containing the .onnx files.
 * @param {Object} opts - Session options forwarded to loadOnnx.
 * @param {?function(string, number, number)} onProgress - Invoked after each
 *   model finishes loading with (modelName, loadedCount, totalCount).
 * @returns {Promise<Object>} Sessions keyed by dpOrt, textEncOrt,
 *   vectorEstOrt and vocoderOrt.
 */
export async function loadOnnxAll(basePath, opts, onProgress) {
  const models = [
    { name: 'Duration Predictor', path: `${basePath}/duration_predictor.onnx`, key: 'dpOrt' },
    { name: 'Text Encoder', path: `${basePath}/text_encoder.onnx`, key: 'textEncOrt' },
    { name: 'Vector Estimator', path: `${basePath}/vector_estimator.onnx`, key: 'vectorEstOrt' },
    { name: 'Vocoder', path: `${basePath}/vocoder.onnx`, key: 'vocoderOrt' }
  ];
  let loadedCount = 0;
  // Kick off every load at once; report progress as each one settles.
  const entries = await Promise.all(
    models.map(async ({ name, path, key }) => {
      const session = await loadOnnx(path, opts);
      loadedCount += 1;
      if (onProgress) {
        onProgress(name, loadedCount, models.length);
      }
      return [key, session];
    })
  );
  const result = Object.fromEntries(entries);
  try {
    // Download counting
    await fetch('https://huggingface.co/Supertone/supertonic-2/resolve/main/config.json');
  } catch (error) {
    console.warn('Failed to update download count:', error);
  }
  return result;
}
/**
 * Fetch the TTS model configuration (tts.json).
 * @param {string} basePath - Directory containing tts.json.
 * @returns {Promise<Object>} Parsed configuration object.
 * @throws {Error} If the HTTP request returns a non-2xx status.
 */
export async function loadCfgs(basePath) {
  const response = await fetch(`${basePath}/tts.json`);
  // fetch() resolves even on HTTP errors; fail fast instead of letting
  // response.json() throw a confusing SyntaxError on an error page.
  if (!response.ok) {
    throw new Error(`Failed to load tts.json: HTTP ${response.status}`);
  }
  return await response.json();
}
/**
 * Fetch the unicode indexer table and build the text processor.
 * @param {string} basePath - Directory containing unicode_indexer.json.
 * @returns {Promise<{textProcessor: UnicodeProcessor}>}
 * @throws {Error} If the HTTP request returns a non-2xx status.
 */
export async function loadProcessors(basePath) {
  const response = await fetch(`${basePath}/unicode_indexer.json`);
  // fetch() resolves even on HTTP errors; surface them explicitly.
  if (!response.ok) {
    throw new Error(`Failed to load unicode_indexer.json: HTTP ${response.status}`);
  }
  const unicodeIndexerData = await response.json();
  return { textProcessor: new UnicodeProcessor(unicodeIndexerData) };
}
/**
 * Parse a WAV (RIFF) file into mono Float32 samples in [-1, 1].
 * Multi-channel audio is downmixed by averaging channels. Supports 16-bit
 * PCM, 24-bit PCM, 32-bit PCM and 32-bit IEEE float data.
 *
 * Fixes over the previous version:
 * - RIFF chunks are word-aligned: an odd-sized chunk is followed by one pad
 *   byte, which must be skipped when walking the chunk list.
 * - 32-bit data was always read as float; 32-bit integer PCM
 *   (audioFormat 1) is now decoded as int32.
 * - Chunk headers are no longer read past the end of the buffer.
 * @param {ArrayBuffer} buffer - Raw WAV file contents.
 * @returns {{sampleRate: number, audioData: Float32Array}}
 * @throws {Error} On malformed headers or unsupported bit depths.
 */
function parseWavFile(buffer) {
  const view = new DataView(buffer);
  // Read a four-character ASCII chunk tag at the given offset.
  const readTag = (offset) => String.fromCharCode(
    view.getUint8(offset),
    view.getUint8(offset + 1),
    view.getUint8(offset + 2),
    view.getUint8(offset + 3)
  );
  // Check RIFF/WAVE container headers
  if (readTag(0) !== 'RIFF') {
    throw new Error('Not a valid WAV file');
  }
  if (readTag(8) !== 'WAVE') {
    throw new Error('Not a valid WAV file');
  }
  let offset = 12;
  let fmtChunk = null;
  let dataChunk = null;
  // Walk the chunk list until the data chunk is found; stop before reading
  // a chunk header that would run past the buffer.
  while (offset + 8 <= buffer.byteLength) {
    const chunkId = readTag(offset);
    const chunkSize = view.getUint32(offset + 4, true);
    if (chunkId === 'fmt ') {
      fmtChunk = {
        audioFormat: view.getUint16(offset + 8, true), // 1 = PCM, 3 = IEEE float
        numChannels: view.getUint16(offset + 10, true),
        sampleRate: view.getUint32(offset + 12, true),
        bitsPerSample: view.getUint16(offset + 22, true)
      };
    } else if (chunkId === 'data') {
      dataChunk = {
        offset: offset + 8,
        size: chunkSize
      };
      break;
    }
    // RIFF chunks are word-aligned: odd-sized chunks carry one pad byte.
    offset += 8 + chunkSize + (chunkSize % 2);
  }
  if (!fmtChunk || !dataChunk) {
    throw new Error('Invalid WAV file format');
  }
  const bytesPerSample = fmtChunk.bitsPerSample / 8;
  const numSamples = Math.floor(dataChunk.size / (bytesPerSample * fmtChunk.numChannels));
  const audioData = new Float32Array(numSamples);
  // Per-sample reader for the detected format, normalized to [-1, 1].
  let readSample;
  if (fmtChunk.bitsPerSample === 16) {
    readSample = (pos) => view.getInt16(pos, true) / 32768.0;
  } else if (fmtChunk.bitsPerSample === 24) {
    readSample = (pos) => {
      // Assemble the little-endian 24-bit value, then sign-extend it
      // (two's complement).
      let value = (view.getUint8(pos + 2) << 16) | (view.getUint8(pos + 1) << 8) | view.getUint8(pos);
      if (value & 0x800000) {
        value -= 0x1000000;
      }
      return value / 8388608.0; // 2^23
    };
  } else if (fmtChunk.bitsPerSample === 32) {
    // Distinguish 32-bit integer PCM (format 1) from IEEE float (format 3).
    readSample = fmtChunk.audioFormat === 1
      ? (pos) => view.getInt32(pos, true) / 2147483648.0 // 2^31
      : (pos) => view.getFloat32(pos, true);
  } else {
    throw new Error(`Unsupported bit depth: ${fmtChunk.bitsPerSample}. Supported formats: 16-bit, 24-bit, 32-bit`);
  }
  // Downmix all channels to mono by averaging.
  for (let i = 0; i < numSamples; i++) {
    let sample = 0;
    for (let ch = 0; ch < fmtChunk.numChannels; ch++) {
      sample += readSample(dataChunk.offset + (i * fmtChunk.numChannels + ch) * bytesPerSample);
    }
    audioData[i] = sample / fmtChunk.numChannels;
  }
  return {
    sampleRate: fmtChunk.sampleRate,
    audioData: audioData
  };
}
/**
 * Flatten a (nested) numeric array into a float32 ONNX tensor.
 * @param {Array} array - Arbitrarily nested numbers.
 * @param {number[]} dims - Tensor shape; its product must equal the flat length.
 * @returns {Object} ort.Tensor of type 'float32'.
 */
export function arrayToTensor(array, dims) {
  const flat = array.flat(Infinity);
  return new ort.Tensor('float32', Float32Array.from(flat), dims);
}
/**
 * Flatten a (nested) integer array into an int64 ONNX tensor.
 * @param {Array} array - Arbitrarily nested integers.
 * @param {number[]} dims - Tensor shape; its product must equal the flat length.
 * @returns {Object} ort.Tensor of type 'int64'.
 */
export function intArrayToTensor(array, dims) {
  const flat = array.flat(Infinity);
  return new ort.Tensor('int64', BigInt64Array.from(flat.map(x => BigInt(x))), dims);
}
/**
 * Encode mono samples as a 16-bit PCM WAV file.
 * Samples are clamped to [-1, 1] and quantized to int16.
 *
 * Fix: quantization now uses Math.round instead of Math.floor, which
 * introduced a consistent downward bias (e.g. 0.5 became 16383, not 16384).
 * @param {Float32Array|number[]} audioData - Mono samples in [-1, 1].
 * @param {number} sampleRate - Sample rate in Hz.
 * @returns {ArrayBuffer} Complete RIFF/WAVE file contents.
 */
export function writeWavFile(audioData, sampleRate) {
  const numChannels = 1;
  const bitsPerSample = 16;
  const byteRate = sampleRate * numChannels * bitsPerSample / 8;
  const blockAlign = numChannels * bitsPerSample / 8;
  const dataSize = audioData.length * bitsPerSample / 8;
  const buffer = new ArrayBuffer(44 + dataSize);
  const view = new DataView(buffer);
  // Write an ASCII four-character tag at the given offset.
  const writeTag = (offset, tagText) => {
    for (let i = 0; i < tagText.length; i++) {
      view.setUint8(offset + i, tagText.charCodeAt(i));
    }
  };
  // RIFF header
  writeTag(0, 'RIFF');
  view.setUint32(4, 36 + dataSize, true);
  writeTag(8, 'WAVE');
  // fmt chunk
  writeTag(12, 'fmt ');
  view.setUint32(16, 16, true);
  view.setUint16(20, 1, true); // PCM
  view.setUint16(22, numChannels, true);
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, byteRate, true);
  view.setUint16(32, blockAlign, true);
  view.setUint16(34, bitsPerSample, true);
  // data chunk
  writeTag(36, 'data');
  view.setUint32(40, dataSize, true);
  // Clamp and quantize each sample to int16.
  for (let i = 0; i < audioData.length; i++) {
    const sample = Math.max(-1, Math.min(1, audioData[i]));
    view.setInt16(44 + i * 2, Math.round(sample * 32767), true);
  }
  return buffer;
}
// Smooth scroll functionality
document.addEventListener('DOMContentLoaded', () => {
  // Smooth scroll for anchor links
  document.querySelectorAll('a[href^="#"]').forEach(anchor => {
    anchor.addEventListener('click', function (e) {
      e.preventDefault();
      const href = this.getAttribute('href');
      const target = document.querySelector(href);
      if (target) {
        // Update URL with anchor
        if (history.pushState) {
          history.pushState(null, null, href);
        }
        target.scrollIntoView({
          behavior: 'smooth',
          block: 'start'
        });
      }
    });
  });
  // Add scroll animation for sections
  const observerOptions = {
    threshold: 0.1,
    rootMargin: '0px 0px -100px 0px'
  };
  // NOTE(review): `observer` is created but observer.observe() is never
  // called anywhere in this handler, so the fade-in styles below never run —
  // confirm whether targets are registered elsewhere or this is dead code.
  const observer = new IntersectionObserver((entries) => {
    entries.forEach(entry => {
      if (entry.isIntersecting) {
        entry.target.style.opacity = '1';
        entry.target.style.transform = 'translateY(0)';
      }
    });
  }, observerOptions);
});
| // TTS Demo functionality | |
| (async function() { | |
| // Check if we're on a page with the TTS demo | |
| const demoTextInput = document.getElementById('demoTextInput'); | |
| if (!demoTextInput) return; | |
| // Configure ONNX Runtime for WebGPU support | |
| ort.env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.0/dist/'; | |
| ort.env.wasm.numThreads = 1; | |
| // Configuration | |
| const REF_EMBEDDING_PATHS = { | |
| 'F1': 'assets/voice_styles/F1.json', | |
| 'F2': 'assets/voice_styles/F2.json', | |
| 'F3': 'assets/voice_styles/F3.json', | |
| 'F4': 'assets/voice_styles/F4.json', | |
| 'F5': 'assets/voice_styles/F5.json', | |
| 'M1': 'assets/voice_styles/M1.json', | |
| 'M2': 'assets/voice_styles/M2.json', | |
| 'M3': 'assets/voice_styles/M3.json', | |
| 'M4': 'assets/voice_styles/M4.json', | |
| 'M5': 'assets/voice_styles/M5.json' | |
| }; | |
| // Voice descriptions | |
| const VOICE_DESCRIPTIONS = { | |
| 'F1': 'Sarah - A calm female voice with a slightly low tone; steady and composed.', | |
| 'F2': 'Lily - A bright, cheerful female voice; lively, playful, and youthful with spirited energy.', | |
| 'F3': 'Jessica - A clear, professional announcer-style female voice; articulate and broadcast-ready.', | |
| 'F4': 'Olivia - A crisp, confident female voice; distinct and expressive with strong delivery.', | |
| 'F5': 'Emily - A kind, gentle female voice; soft-spoken, calm, and naturally soothing.', | |
| 'M1': 'Alex - A lively, upbeat male voice with confident energy and a standard, clear tone.', | |
| 'M2': 'James - A deep, robust male voice; calm, composed, and serious with a grounded presence.', | |
| 'M3': 'Robert - A polished, authoritative male voice; confident and trustworthy with strong presentation quality.', | |
| 'M4': 'Sam - A soft, neutral-toned male voice; gentle and approachable with a youthful, friendly quality.', | |
| 'M5': 'Daniel - A warm, soft-spoken male voice; calm and soothing with a natural storytelling quality.' | |
| }; | |
| // Global state | |
| let models = null; | |
| let cfgs = null; | |
| let processors = null; | |
| let currentVoice = 'M3'; // Default to Robert voice | |
| // Detect browser language and set initial language | |
// Detect the browser UI language and map it onto a supported TTS language,
// defaulting to English for anything we cannot synthesize.
function detectBrowserLanguage() {
  const supportedLangs = ['en', 'es', 'pt', 'fr', 'ko'];
  // navigator.language covers all modern browsers; userLanguage is legacy IE.
  const browserLang = navigator.language || navigator.userLanguage || 'en';
  // 'en-US' -> 'en', 'ko-KR' -> 'ko'
  const [langCode] = browserLang.toLowerCase().split('-');
  return supportedLangs.includes(langCode) ? langCode : 'en';
}
| let currentLanguage = detectBrowserLanguage(); // Auto-detect from browser | |
| let refEmbeddingCache = {}; // Cache for embeddings | |
| let currentStyleTtlTensor = null; | |
| let currentStyleDpTensor = null; | |
| let modelsLoading = false; // Track if models are currently loading | |
| let modelsLoaded = false; // Track if models are fully loaded | |
| let modelsLoadPromise = null; // Promise for model loading | |
| // UI Elements | |
| const demoStatusBox = document.getElementById('demoStatusBox'); | |
| const demoStatusText = document.getElementById('demoStatusText'); | |
| const wasmWarningBanner = document.getElementById('wasmWarningBanner'); | |
| const demoGenerateBtn = document.getElementById('demoGenerateBtn'); | |
| const demoTotalSteps = document.getElementById('demoTotalSteps'); | |
| const demoSpeed = document.getElementById('demoSpeed'); | |
| const demoTotalStepsValue = document.getElementById('demoTotalStepsValue'); | |
| const demoSpeedValue = document.getElementById('demoSpeedValue'); | |
| const demoResults = document.getElementById('demoResults'); | |
| const demoError = document.getElementById('demoError'); | |
| const demoCharCount = document.getElementById('demoCharCount'); | |
| const demoCharCounter = document.getElementById('demoCharCounter'); | |
| const demoCharWarning = document.getElementById('demoCharWarning'); | |
| // Text validation constants | |
| const MIN_CHARS = 10; | |
| const MAX_CHUNK_LENGTH_DEFAULT = 300; // Maximum length for each chunk (default) | |
| const MAX_CHUNK_LENGTH_KO = 120; // Maximum length for Korean | |
// Per-chunk character cap: Korean uses the shorter limit, all other
// languages use the default.
function getMaxChunkLength() {
  return currentLanguage === 'ko' ? MAX_CHUNK_LENGTH_KO : MAX_CHUNK_LENGTH_DEFAULT;
}
| // Custom audio player state (shared across generations) | |
| let audioContext = null; | |
| let scheduledSources = []; | |
| let audioChunks = []; | |
| let totalDuration = 0; | |
| let startTime = 0; | |
| let pauseTime = 0; | |
| let isPaused = false; | |
| let isPlaying = false; | |
| let animationFrameId = null; | |
| let playPauseBtn = null; | |
| let progressBar = null; | |
| let currentTimeDisplay = null; | |
| let durationDisplay = null; | |
| let progressFill = null; | |
| let firstChunkGenerationTime = 0; // Processing time for first chunk | |
| let totalChunks = 0; | |
| let nextScheduledTime = 0; // Next time to schedule audio chunk | |
| let currentGenerationTextLength = 0; | |
| let supertonicPlayerRecord = null; // Supertonic player record for cross-player pause management | |
| let isGenerating = false; // Track if speech generation is in progress | |
| // Track all custom audio players | |
| let customAudioPlayers = []; | |
// True when the viewport matches the 768px mobile CSS breakpoint.
const isMobileViewport = () => window.matchMedia('(max-width: 768px)').matches;
// Check if device actually supports touch (not just viewport size)
const isTouchDevice = () => 'ontouchstart' in window || navigator.maxTouchPoints > 0;
// On mobile viewports, strip a trailing two-decimal fraction (e.g. "1.23"
// -> "1") to keep stat strings compact; pass other values through.
const trimDecimalsForMobile = (formatted) => {
  if (!formatted) return formatted;
  return isMobileViewport() ? formatted.replace(/\.\d{2}$/, '') : formatted;
};
// Pause every registered custom audio player other than `currentPlayer`,
// so only one player produces sound at a time.
function pauseAllPlayersExcept(currentPlayer) {
  for (const player of customAudioPlayers) {
    if (player === currentPlayer) continue;
    if (player && typeof player.pausePlayback === 'function') {
      player.pausePlayback();
    }
  }
}
/**
 * Chunk text into smaller pieces based on sentence boundaries.
 * Paragraphs are split first; sentences are then greedily packed into
 * chunks no longer than maxLen characters.
 * @param {string} text - The text to chunk
 * @param {number} maxLen - Maximum length for each chunk
 * @returns {Array<string>} - Array of text chunks
 */
function chunkText(text, maxLen = getMaxChunkLength()) {
  // Sentence splitter: break after ./!/? + whitespace, except after common
  // abbreviations (Mr., e.g., …) or single-capital initials (F.).
  const sentenceBoundary = /(?<!Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Sr\.|Jr\.|Ph\.D\.|etc\.|e\.g\.|i\.e\.|vs\.|Inc\.|Ltd\.|Co\.|Corp\.|St\.|Ave\.|Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+/;
  const chunks = [];
  // Split by paragraph (two or more newlines)
  for (const rawParagraph of text.trim().split(/\n\s*\n+/)) {
    const paragraph = rawParagraph.trim();
    if (!paragraph) continue;
    let currentChunk = "";
    for (const sentence of paragraph.split(sentenceBoundary)) {
      if (currentChunk.length + sentence.length + 1 <= maxLen) {
        // Still fits: append (with a separating space when non-empty).
        currentChunk += (currentChunk ? " " : "") + sentence;
      } else {
        if (currentChunk) {
          chunks.push(currentChunk.trim());
        }
        currentChunk = sentence;
      }
    }
    if (currentChunk) {
      chunks.push(currentChunk.trim());
    }
  }
  return chunks;
}
// Render a status message in the demo status box; an optional 0-100 progress
// value drives the --status-progress CSS variable and the 'complete' class.
function showDemoStatus(message, type = 'info', progress = null) {
  demoStatusText.innerHTML = message;
  demoStatusBox.className = 'demo-status-box';
  demoStatusBox.style.removeProperty('--status-progress');
  demoStatusBox.style.display = ''; // Show the status box
  if (type === 'success' || type === 'error') {
    demoStatusBox.classList.add(type);
  }
  const hasProgress = progress !== null && progress >= 0 && progress <= 100;
  if (hasProgress) {
    const clampedProgress = Math.max(0, Math.min(progress, 100));
    demoStatusBox.style.setProperty('--status-progress', `${clampedProgress}%`);
    demoStatusBox.classList.toggle('complete', clampedProgress >= 100);
  } else {
    // No (valid) progress: clear any previous progress state.
    demoStatusBox.style.removeProperty('--status-progress');
    demoStatusBox.classList.remove('complete');
  }
}
// Hide the status box entirely.
function hideDemoStatus() {
  demoStatusBox.style.display = 'none';
}
// Show an error message in the demo error element.
function showDemoError(message) {
  demoError.textContent = message;
  demoError.classList.add('active');
}
// Clear any visible demo error message.
function hideDemoError() {
  demoError.classList.remove('active');
}
// Language toast notification: a transient banner announcing auto-detected
// language changes (driven by showLanguageToast below).
const languageToast = document.getElementById('languageToast');
const languageToastMessage = document.getElementById('languageToastMessage');
// Timer id for the pending auto-hide; null when no hide is scheduled.
let languageToastTimeout = null;
// Show a transient toast announcing the auto-detected language.
// The toast auto-hides after 3 seconds; a new call resets the timer so
// back-to-back detections keep it visible.
// @param {string} fromLang - previous language code; kept for call-site
//        compatibility but not currently displayed
// @param {string} toLang - newly detected language code
function showLanguageToast(fromLang, toLang) {
    if (!languageToast || !languageToastMessage) return;
    // Prefer the human-readable name, falling back to the raw code
    const toName = LANGUAGE_NAMES[toLang] || toLang;
    languageToastMessage.innerHTML = `Language auto-detected: <strong>${toName}</strong>`;
    // Clear any existing timeout
    if (languageToastTimeout) {
        clearTimeout(languageToastTimeout);
    }
    // Show toast
    languageToast.classList.add('show');
    // Hide after 3 seconds
    languageToastTimeout = setTimeout(() => {
        languageToast.classList.remove('show');
    }, 3000);
}
// Reveal the WASM fallback warning banner, if present on the page.
function showWasmWarning() {
    if (!wasmWarningBanner) {
        return;
    }
    wasmWarningBanner.style.display = 'flex';
}
// Check whether every character in `text` is supported by the loaded text
// processor. Each unique character is preprocessed once (e.g. Korean '간'
// decomposes to 'ㄱㅏㄴ'), and unsupported processed characters are mapped
// back to the original characters the user typed.
// Returns { valid, unsupportedChars }. Errs on the side of validity when
// the processors are unavailable or validation itself throws.
function validateCharacters(text) {
    if (!processors || !processors.textProcessor) {
        return { valid: true, unsupportedChars: [] };
    }
    try {
        // Preprocess each unique character exactly once (fast for long texts).
        const processedByChar = new Map();
        const originalsByProcessed = new Map();
        for (const original of new Set(text)) {
            const processed = preprocessText(original);
            processedByChar.set(original, processed);
            // Remember which original produced each processed character
            for (const piece of processed) {
                let originals = originalsByProcessed.get(piece);
                if (!originals) {
                    originals = new Set();
                    originalsByProcessed.set(piece, originals);
                }
                originals.add(original);
            }
        }
        // Rebuild the full processed text from the per-character cache and
        // validate it with a single processor call.
        let fullProcessed = '';
        for (const ch of text) {
            fullProcessed += processedByChar.get(ch);
        }
        const { unsupportedChars } = processors.textProcessor.call([fullProcessed]);
        // Translate unsupported processed pieces back to original characters.
        const flagged = new Set();
        for (const piece of unsupportedChars || []) {
            const originals = originalsByProcessed.get(piece);
            if (originals) {
                originals.forEach(c => flagged.add(c));
            }
        }
        const flaggedList = Array.from(flagged);
        return {
            valid: flaggedList.length === 0,
            unsupportedChars: flaggedList
        };
    } catch (error) {
        // Best-effort: never block the user because validation itself crashed.
        return { valid: true, unsupportedChars: [] };
    }
}
// Update the character counter, scale the input's font size to fit, validate
// the text (length + supported characters), and enable/disable the Generate
// button accordingly. Called on input events and after model/voice changes.
function updateCharCounter() {
    const rawText = demoTextInput.textContent || demoTextInput.innerText || '';
    const text = rawText.replace(/\n$/g, ''); // Remove trailing newline that browsers may add
    const length = text.length;
    demoCharCount.textContent = length;
    // Get the actual width of the textarea
    const textareaWidth = demoTextInput.offsetWidth;
    // Max width reference: 1280px (container max-width) / 2 (grid column) - padding/gap ≈ 638px
    // Using 640px as reference for easier calculation
    const maxWidthRef = 640;
    // Calculate font size based on width ratio
    // Original rem values at max-width (640px):
    // 5rem = 80px @ 16px base → 80/640 = 12.5%
    // 4rem = 64px → 64/640 = 10%
    // 3rem = 48px → 48/640 = 7.5%
    // 2.5rem = 40px → 40/640 = 6.25%
    // 2rem = 32px → 32/640 = 5%
    // 1.5rem = 24px → 24/640 = 3.75%
    // 1rem = 16px → 16/640 = 2.5%
    // Check if mobile (572px or less) for 2x font size scaling
    const isMobile = window.innerWidth <= 572;
    const mobileMultiplier = isMobile ? 2 : 1;
    // NOTE(review): these tiers are not monotonic — 101-200 chars get ratio
    // 0.04 while 201-239 chars get the larger 0.053125, so slightly longer
    // text renders BIGGER. Looks like the <=200 and <240 tiers may be
    // swapped or mis-scaled; confirm intended design before changing.
    let fontSizeRatio;
    if (length <= 100) {
        fontSizeRatio = 0.055 * mobileMultiplier; // 5.5% of width
    } else if (length <= 200) {
        fontSizeRatio = 0.04 * mobileMultiplier; // 4% of width
    } else if (length < 240) {
        fontSizeRatio = 0.053125 * mobileMultiplier; // ~5.3125% of width (scaled from 2.5rem)
    } else if (length < 400) {
        fontSizeRatio = 0.0425 * mobileMultiplier; // ~4.25% of width (scaled from 2rem)
    } else if (length < 700) {
        fontSizeRatio = 0.031875 * mobileMultiplier; // ~3.1875% of width (scaled from 1.5rem)
    } else {
        fontSizeRatio = 0.025 * mobileMultiplier; // 2.5% of width (minimum stays the same)
    }
    // Calculate font size based on actual width
    const fontSize = textareaWidth * fontSizeRatio;
    demoTextInput.style.fontSize = `${fontSize}px`;
    // Remove all status classes
    demoCharCounter.classList.remove('error', 'warning', 'valid');
    // Check for unsupported characters first (only if models are loaded)
    let hasUnsupportedChars = false;
    if (models && processors && length > 0) {
        const validation = validateCharacters(text);
        if (!validation.valid && validation.unsupportedChars.length > 0) {
            hasUnsupportedChars = true;
            // Show at most 5 offending characters, then summarize the rest
            const charList = validation.unsupportedChars.slice(0, 5).map(c => `"${c}"`).join(', ');
            const moreChars = validation.unsupportedChars.length > 5 ? ` and ${validation.unsupportedChars.length - 5} more` : '';
            showDemoError(`Unsupported characters detected: ${charList}${moreChars}. Please remove them before generating speech.`);
        } else {
            hideDemoError();
        }
    }
    // Update status based on length and character validation
    if (length < MIN_CHARS) {
        demoCharCounter.classList.add('error');
        demoCharWarning.textContent = '(At least 10 characters)';
        demoGenerateBtn.disabled = true;
    } else if (hasUnsupportedChars) {
        demoCharCounter.classList.add('error');
        demoCharWarning.textContent = '(Unsupported characters)';
        demoGenerateBtn.disabled = true;
    } else {
        demoCharCounter.classList.add('valid');
        demoCharWarning.textContent = '';
        // Enable only if models are loaded AND not currently generating
        demoGenerateBtn.disabled = !models || isGenerating;
    }
}
// Validate the demo text before synthesis.
// Returns { valid: true } on success, or { valid: false, message } with a
// user-facing explanation of why the text was rejected.
function validateTextInput(text) {
    const isBlank = !text || text.trim().length === 0;
    if (isBlank) {
        return { valid: false, message: 'Please enter some text.' };
    }
    if (text.length < MIN_CHARS) {
        return {
            valid: false,
            message: `Text must be at least ${MIN_CHARS} characters long. (Currently ${text.length})`
        };
    }
    return { valid: true };
}
// Load pre-extracted style embeddings (text-to-latent and duration-predictor
// styles) for a voice from its JSON file and convert them to ONNX tensors.
// Results are memoized in refEmbeddingCache, so repeat calls are free.
// @param {string} voice - voice id; must have an entry in REF_EMBEDDING_PATHS
// @returns {Promise<{styleTtl: ort.Tensor, styleDp: ort.Tensor}>}
// @throws when the voice has no configured path, the fetch fails, or the
//         JSON payload is malformed (previously wrapped in a redundant
//         try/catch that only rethrew)
async function loadStyleEmbeddings(voice) {
    // Serve from cache when this voice was already loaded
    if (refEmbeddingCache[voice]) {
        return refEmbeddingCache[voice];
    }
    const embeddingPath = REF_EMBEDDING_PATHS[voice];
    if (!embeddingPath) {
        throw new Error(`No embedding path configured for voice: ${voice}`);
    }
    const response = await fetch(embeddingPath);
    if (!response.ok) {
        throw new Error(`Failed to fetch embedding: ${response.statusText}`);
    }
    const embeddingData = await response.json();
    // Build an ONNX tensor from one embedding entry. The JSON stores nested
    // arrays, so flatten fully before creating the Float32Array.
    const toTensor = (entry) => new ort.Tensor(
        entry.type || 'float32',
        Float32Array.from(entry.data.flat(Infinity)),
        entry.dims
    );
    const embeddings = {
        styleTtl: toTensor(embeddingData.style_ttl),
        styleDp: toTensor(embeddingData.style_dp)
    };
    // Cache the embeddings
    refEmbeddingCache[voice] = embeddings;
    return embeddings;
}
// Switch the active voice: load its style embeddings, update module state,
// refresh the speaker UI, and re-validate the current text.
// Shows an error message and rethrows when the embeddings cannot be loaded.
async function switchVoice(voice) {
    try {
        const { styleTtl, styleDp } = await loadStyleEmbeddings(voice);
        currentStyleTtlTensor = styleTtl;
        currentStyleDpTensor = styleDp;
        currentVoice = voice;
        // Reflect the new voice in the speaker list, if the page wired it up
        if (typeof window.updateActiveSpeaker === 'function') {
            window.updateActiveSpeaker(voice);
        }
        // Supported characters can differ per voice, so validate again
        updateCharCounter();
    } catch (error) {
        showDemoError(`Failed to load voice ${voice}: ${error.message}`);
        throw error;
    }
}
// Check WebGPU support more thoroughly than a bare `navigator.gpu` test:
// rules out iOS/Safari (incomplete implementations), then verifies that an
// adapter and a device can actually be created.
// @returns {Promise<{supported: boolean, reason?: string,
//          adapter?: GPUAdapter, device?: GPUDevice}>}
async function checkWebGPUSupport() {
    try {
        // Detect iOS (including iPadOS masquerading as MacIntel) and Safari
        const isIOS = /iPad|iPhone|iPod/.test(navigator.userAgent) ||
                      (navigator.platform === 'MacIntel' && navigator.maxTouchPoints > 1);
        const isSafari = /^((?!chrome|crios|android|edg|firefox).)*safari/i.test(navigator.userAgent);
        // iOS and Safari have incomplete WebGPU support
        if (isIOS) {
            return { supported: false, reason: 'iOS does not support the required WebGPU features' };
        }
        if (isSafari) {
            // Desktop Safari might work, but check carefully
            return { supported: false, reason: 'Safari does not support the required WebGPU features' };
        }
        // Check if WebGPU is available in the browser
        if (!navigator.gpu) {
            return { supported: false, reason: 'WebGPU not available in this browser' };
        }
        // Request adapter
        const adapter = await navigator.gpu.requestAdapter();
        if (!adapter) {
            return { supported: false, reason: 'No WebGPU adapter found' };
        }
        // Probe adapter info purely as a sanity check; the result is unused
        // and failures are harmless (previously bound to an unused variable).
        try {
            await adapter.requestAdapterInfo();
        } catch (infoError) {
            // Ignore adapter info errors
        }
        // Creating a device is the real test that WebGPU actually works
        const device = await adapter.requestDevice();
        if (!device) {
            return { supported: false, reason: 'Failed to create WebGPU device' };
        }
        return { supported: true, adapter, device };
    } catch (error) {
        // Handle specific iOS/Safari errors (e.g. missing subgroup support)
        const errorMsg = error.message || '';
        if (errorMsg.includes('subgroupMinSize') || errorMsg.includes('subgroup')) {
            return { supported: false, reason: 'iOS/Safari does not support required WebGPU features (subgroup operations)' };
        }
        return { supported: false, reason: error.message };
    }
}
// Warmup models with dummy inference (no audio playback, no UI updates).
// Runs the full pipeline once — duration prediction, text encoding,
// iterative denoising, vocoder — so the first user-triggered generation
// does not pay first-run initialization costs. Strictly best-effort:
// failures are logged and swallowed.
async function warmupModels() {
    try {
        const dummyText = 'Looking to integrate Supertonic into your product? We offer customized on-device SDK solutions tailored to your business needs. Our lightweight, high-performance TTS technology can be seamlessly integrated into mobile apps, IoT devices, automotive systems, and more. Try it now, and enjoy its speed.';
        const totalStep = 5; // Use minimal steps for faster warmup
        const durationFactor = 1.0;
        const textList = [dummyText];
        const bsz = 1; // batch size of one for warmup
        // Use pre-computed style embeddings
        const styleTtlTensor = currentStyleTtlTensor;
        const styleDpTensor = currentStyleDpTensor;
        // Step 1: Estimate duration
        const { textIds, textMask } = processors.textProcessor.call(textList, currentLanguage);
        const textIdsShape = [bsz, textIds[0].length];
        const textMaskShape = [bsz, 1, textMask[0][0].length];
        const textMaskTensor = arrayToTensor(textMask, textMaskShape);
        const dpResult = await models.dpOrt.run({
            text_ids: intArrayToTensor(textIds, textIdsShape),
            style_dp: styleDpTensor,
            text_mask: textMaskTensor
        });
        // Scale predicted durations by the speed factor (1.0 here → no-op)
        const durOnnx = Array.from(dpResult.duration.data);
        for (let i = 0; i < durOnnx.length; i++) {
            durOnnx[i] *= durationFactor;
        }
        // Reshape durations to [bsz][1][1] as expected by sampleNoisyLatent
        const durReshaped = [];
        for (let b = 0; b < bsz; b++) {
            durReshaped.push([[durOnnx[b]]]);
        }
        // Step 2: Encode text
        const textEncResult = await models.textEncOrt.run({
            text_ids: intArrayToTensor(textIds, textIdsShape),
            style_ttl: styleTtlTensor,
            text_mask: textMaskTensor
        });
        const textEmbTensor = textEncResult.text_emb;
        // Step 3: Denoising
        let { noisyLatent, latentMask } = sampleNoisyLatent(durReshaped, cfgs);
        const latentShape = [bsz, noisyLatent[0].length, noisyLatent[0][0].length];
        const latentMaskShape = [bsz, 1, latentMask[0][0].length];
        const latentMaskTensor = arrayToTensor(latentMask, latentMaskShape);
        const totalStepArray = new Array(bsz).fill(totalStep);
        const scalarShape = [bsz];
        const totalStepTensor = arrayToTensor(totalStepArray, scalarShape);
        // Each iteration feeds the previous denoised latent back in
        for (let step = 0; step < totalStep; step++) {
            const currentStepArray = new Array(bsz).fill(step);
            const vectorEstResult = await models.vectorEstOrt.run({
                noisy_latent: arrayToTensor(noisyLatent, latentShape),
                text_emb: textEmbTensor,
                style_ttl: styleTtlTensor,
                text_mask: textMaskTensor,
                latent_mask: latentMaskTensor,
                total_step: totalStepTensor,
                current_step: arrayToTensor(currentStepArray, scalarShape)
            });
            const denoisedLatent = Array.from(vectorEstResult.denoised_latent.data);
            // Update latent: copy the flat output back into the nested array
            let idx = 0;
            for (let b = 0; b < noisyLatent.length; b++) {
                for (let d = 0; d < noisyLatent[b].length; d++) {
                    for (let t = 0; t < noisyLatent[b][d].length; t++) {
                        noisyLatent[b][d][t] = denoisedLatent[idx++];
                    }
                }
            }
        }
        // Step 4: Generate waveform (result intentionally discarded)
        const vocoderResult = await models.vocoderOrt.run({
            latent: arrayToTensor(noisyLatent, latentShape)
        });
        // Warmup complete - no need to process the audio further
    } catch (error) {
        console.warn('Warmup failed (non-critical):', error.message);
        // Don't throw - warmup failure shouldn't prevent normal usage
    }
}
// Load models on page load.
// Idempotent: concurrent calls share one in-flight promise, and a call after
// success returns immediately. On failure the loading flag is reset so a
// later call can retry. While loading, speaker/language selection is disabled.
async function initializeModels() {
    // If models are already loading, return the existing promise
    if (modelsLoading && modelsLoadPromise) {
        return modelsLoadPromise;
    }
    // If models are already loaded, return immediately
    if (modelsLoaded && models) {
        return;
    }
    modelsLoading = true;
    // Disable speaker selection during model loading
    const speakerItemsForLoading = document.querySelectorAll('.speaker-item[data-voice]');
    speakerItemsForLoading.forEach(item => item.classList.add('disabled'));
    // Disable language selection during model loading
    const languageItemsForLoading = document.querySelectorAll('.speaker-item[data-language]');
    languageItemsForLoading.forEach(item => item.classList.add('disabled'));
    modelsLoadPromise = (async () => {
        try {
            showDemoStatus('<strong>Loading configuration...</strong>', 'info', 5);
            const basePath = 'assets/onnx';
            // Load config
            cfgs = await loadCfgs(basePath);
            // Check WebGPU support first
            showDemoStatus('<strong>Checking WebGPU support...</strong>', 'info', 8);
            const webgpuCheck = await checkWebGPUSupport();
            // Determine execution provider based on WebGPU support
            const useWebGPU = webgpuCheck.supported;
            const executionProvider = useWebGPU ? 'webgpu' : 'wasm';
            // If WebGPU is not supported, show subtle warning banner
            if (!useWebGPU) {
                showWasmWarning();
            }
            // Load models with appropriate backend
            const backendName = useWebGPU ? 'WebGPU' : 'WASM';
            showDemoStatus(`<strong>${backendName} detected! Loading models...</strong>`, 'info', 10);
            // Renamed from `modelsLoadPromise` to stop shadowing the
            // module-level promise assigned just above.
            const onnxLoadTask = loadOnnxAll(basePath, {
                executionProviders: [executionProvider],
                graphOptimizationLevel: 'all'
            }, (modelName, current, total) => {
                const progress = 10 + (current / total) * 70; // 10-80% for model loading
                showDemoStatus(`<strong>Loading models with ${backendName} (${current}/${total}):</strong> ${modelName}...`, 'info', progress);
            });
            // Load processors in parallel with models
            const [loadedModels, loadedProcessors] = await Promise.all([
                onnxLoadTask,
                loadProcessors(basePath)
            ]);
            models = loadedModels;
            processors = loadedProcessors;
            showDemoStatus('<strong>Loading reference embeddings...</strong>', 'info', 85);
            // Load pre-extracted embeddings for default voice
            const embeddings = await loadStyleEmbeddings(currentVoice);
            currentStyleTtlTensor = embeddings.styleTtl;
            currentStyleDpTensor = embeddings.styleDp;
            showDemoStatus('<strong>Warming up models...</strong>', 'info', 90);
            // Warmup step: run inference once in background with dummy text
            await warmupModels();
            hideDemoStatus();
            demoGenerateBtn.disabled = false;
            demoTotalSteps.disabled = false;
            demoSpeed.disabled = false;
            // Enable voice toggle buttons after models are loaded
            const voiceToggleTexts = document.querySelectorAll('.voice-toggle-text');
            voiceToggleTexts.forEach(text => text.classList.remove('disabled'));
            // Validate initial text now that models are loaded
            updateCharCounter();
            // Mark models as loaded
            modelsLoaded = true;
            modelsLoading = false;
            // Re-enable speaker selection after model loading
            speakerItemsForLoading.forEach(item => item.classList.remove('disabled'));
            // Re-enable language selection after model loading
            languageItemsForLoading.forEach(item => item.classList.remove('disabled'));
        } catch (error) {
            modelsLoading = false;
            // Re-enable speaker selection on error too
            speakerItemsForLoading.forEach(item => item.classList.remove('disabled'));
            // Re-enable language selection on error too
            languageItemsForLoading.forEach(item => item.classList.remove('disabled'));
            showDemoStatus(`<strong>Error:</strong> ${error.message}`, 'error');
            showDemoError(`Failed to initialize: ${error.message}. Check console for details.`);
            throw error;
        }
    })();
    return modelsLoadPromise;
}
// Supertonic synthesis function (extracted for parallel execution).
// Synthesizes one chunk of text via the four-stage pipeline: duration
// prediction -> text encoding -> iterative denoising -> vocoder.
// @param {string} text - text to synthesize (a single chunk)
// @param {number} totalStep - number of denoising iterations
// @param {number} durationFactor - multiplier applied to predicted duration
// @returns {Promise<object>} on success: { success: true, processingTime,
//          audioDuration, audioData (Float32Array), sampleRate, text };
//          on failure: { success: false, error, text } — errors are caught,
//          never thrown.
async function generateSupertonicSpeech(text, totalStep, durationFactor) {
    const supertonicStartTime = Date.now();
    try {
        const textList = [text];
        const bsz = 1; // single-item batch
        const sampleRate = cfgs.ae.sample_rate;
        // Use pre-computed style embeddings
        const styleTtlTensor = currentStyleTtlTensor;
        const styleDpTensor = currentStyleDpTensor;
        // Step 1: Estimate duration
        const { textIds, textMask, unsupportedChars } = processors.textProcessor.call(textList, currentLanguage);
        // Check for unsupported characters
        if (unsupportedChars && unsupportedChars.length > 0) {
            const charList = unsupportedChars.map(c => `"${c}"`).join(', ');
            throw new Error(`Unsupported characters: ${charList}`);
        }
        const textIdsShape = [bsz, textIds[0].length];
        const textMaskShape = [bsz, 1, textMask[0][0].length];
        const textMaskTensor = arrayToTensor(textMask, textMaskShape);
        const dpResult = await models.dpOrt.run({
            text_ids: intArrayToTensor(textIds, textIdsShape),
            style_dp: styleDpTensor,
            text_mask: textMaskTensor
        });
        const durOnnx = Array.from(dpResult.duration.data);
        // Apply duration factor to adjust speech length (once)
        for (let i = 0; i < durOnnx.length; i++) {
            durOnnx[i] *= durationFactor;
        }
        // Reshape durations to [bsz][1][1] as expected by sampleNoisyLatent
        const durReshaped = [];
        for (let b = 0; b < bsz; b++) {
            durReshaped.push([[durOnnx[b]]]);
        }
        // Step 2: Encode text
        const textEncResult = await models.textEncOrt.run({
            text_ids: intArrayToTensor(textIds, textIdsShape),
            style_ttl: styleTtlTensor,
            text_mask: textMaskTensor
        });
        const textEmbTensor = textEncResult.text_emb;
        // Step 3: Denoising
        let { noisyLatent, latentMask } = sampleNoisyLatent(durReshaped, cfgs);
        const latentDim = noisyLatent[0].length;
        const latentLen = noisyLatent[0][0].length;
        const latentShape = [bsz, latentDim, latentLen];
        const latentMaskShape = [bsz, 1, latentMask[0][0].length];
        const latentMaskTensor = arrayToTensor(latentMask, latentMaskShape);
        // Pre-allocate flat buffer for latent data to avoid repeated allocations
        const latentBufferSize = bsz * latentDim * latentLen;
        const latentBuffer = new Float32Array(latentBufferSize);
        // Initialize latent buffer from noisyLatent (row-major b, d, t order)
        let initIdx = 0;
        for (let b = 0; b < bsz; b++) {
            for (let d = 0; d < latentDim; d++) {
                for (let t = 0; t < latentLen; t++) {
                    latentBuffer[initIdx++] = noisyLatent[b][d][t];
                }
            }
        }
        // Prepare constant tensors
        const scalarShape = [bsz];
        const totalStepTensor = arrayToTensor(new Array(bsz).fill(totalStep), scalarShape);
        // Pre-create all step tensors to avoid repeated allocations
        const stepTensors = [];
        for (let step = 0; step < totalStep; step++) {
            stepTensors.push(arrayToTensor(new Array(bsz).fill(step), scalarShape));
        }
        // Each iteration refines latentBuffer in place via the estimator model
        for (let step = 0; step < totalStep; step++) {
            // Create tensor from pre-allocated buffer
            const noisyLatentTensor = new ort.Tensor('float32', latentBuffer, latentShape);
            const vectorEstResult = await models.vectorEstOrt.run({
                noisy_latent: noisyLatentTensor,
                text_emb: textEmbTensor,
                style_ttl: styleTtlTensor,
                text_mask: textMaskTensor,
                latent_mask: latentMaskTensor,
                total_step: totalStepTensor,
                current_step: stepTensors[step]
            });
            // Copy denoised result directly into pre-allocated buffer
            const denoisedData = vectorEstResult.denoised_latent.data;
            latentBuffer.set(denoisedData);
        }
        // Step 4: Generate waveform - use latentBuffer directly
        const vocoderResult = await models.vocoderOrt.run({
            latent: new ort.Tensor('float32', latentBuffer, latentShape)
        });
        const wavBatch = vocoderResult.wav_tts.data;
        // Trim the waveform to the predicted duration (output may be padded)
        const wavLen = Math.floor(sampleRate * durOnnx[0]);
        // Create a copy of the audio data (not a view) to prevent buffer reuse issues
        const audioData = wavBatch.slice(0, wavLen);
        // Calculate times for Supertonic
        const supertonicEndTime = Date.now();
        const supertonicProcessingTime = (supertonicEndTime - supertonicStartTime) / 1000;
        const audioDurationSec = durOnnx[0];
        return {
            success: true,
            processingTime: supertonicProcessingTime,
            audioDuration: audioDurationSec,
            audioData: audioData,
            sampleRate: sampleRate,
            text: text
        };
    } catch (error) {
        return {
            success: false,
            error: error.message,
            text: text
        };
    }
}
// Format a duration in seconds with centisecond precision:
//   under 60 s  -> "SS.cc"
//   under 60 m  -> "MM:SS.cc"
//   60 m and up -> "HH:MM:SS.cc"
function formatTimeDetailed(seconds) {
    const pad2 = (n) => n.toString().padStart(2, '0');
    const hours = Math.floor(seconds / 3600);
    const mins = Math.floor((seconds % 3600) / 60);
    const secs = seconds % 60;
    const wholeSecs = Math.floor(secs);
    const centis = Math.floor((secs % 1) * 100);
    const secPart = `${pad2(wholeSecs)}.${pad2(centis)}`;
    if (seconds < 60) {
        return secPart;
    }
    if (seconds < 3600) {
        return `${pad2(mins)}:${secPart}`;
    }
    return `${pad2(hours)}:${pad2(mins)}:${secPart}`;
}
// Generate Supertonic speech with chunking support and progressive playback.
// Splits `text` into sentence chunks, synthesizes each in order, and invokes
// `onFirstChunkReady` / `onChunkAdded` with raw Float32Array audio so playback
// can begin before the full text is finished. Returns the concatenated audio
// (chunks separated by 0.3 s of silence) as an object-URL WAV; errors are
// caught and reported via { success: false, error }.
async function generateSupertonicSpeechChunked(text, totalStep, durationFactor, onFirstChunkReady, onChunkAdded) {
    const supertonicStartTime = Date.now();
    const sampleRate = cfgs.ae.sample_rate;
    const silenceDuration = 0.3; // 0.3 seconds of silence between chunks
    try {
        // Split text into chunks
        const chunks = chunkText(text);
        const audioDataArrays = [];
        const durations = [];
        const silenceSamples = Math.floor(silenceDuration * sampleRate);
        let firstChunkEndTime = 0;
        let firstChunkTime = 0;
        // Generate speech for each chunk
        for (let i = 0; i < chunks.length; i++) {
            // Renamed from `chunkText` to avoid shadowing the module-level
            // chunkText() helper called above.
            const chunkStr = chunks[i];
            const result = await generateSupertonicSpeech(chunkStr, totalStep, durationFactor);
            if (!result.success) {
                throw new Error(`Failed to generate chunk ${i + 1}: ${result.error}`);
            }
            // Use raw Float32Array directly - no WAV encode/decode round-trip
            const audioData = result.audioData;
            audioDataArrays.push(audioData);
            durations.push(result.audioDuration);
            // Progressive playback: pass raw Float32Array directly to callbacks
            if (i === 0 && onFirstChunkReady) {
                // First chunk ready - send it immediately
                firstChunkEndTime = Date.now();
                firstChunkTime = (firstChunkEndTime - supertonicStartTime) / 1000;
                const totalDurationSoFar = result.audioDuration;
                const processedChars = chunks[0].length;
                // Pass raw audio data and sample rate directly
                onFirstChunkReady(audioData, sampleRate, totalDurationSoFar, text, chunks.length, firstChunkTime, processedChars);
            } else if (i > 0 && onChunkAdded) {
                // Subsequent chunks - send just the new chunk; running total
                // includes the i silence gaps inserted so far
                const totalDurationSoFar = durations.slice(0, i + 1).reduce((sum, dur) => sum + dur, 0) + silenceDuration * i;
                const currentProcessingTime = (Date.now() - supertonicStartTime) / 1000;
                const processedChars = chunks.slice(0, i + 1).reduce((sum, chunk) => sum + chunk.length, 0);
                // Pass raw audio data and sample rate directly
                onChunkAdded(audioData, sampleRate, totalDurationSoFar, i + 1, chunks.length, currentProcessingTime, processedChars);
            }
        }
        // Concatenate all audio chunks with silence for final result
        const totalDuration = durations.reduce((sum, dur) => sum + dur, 0) + silenceDuration * (chunks.length - 1);
        // Calculate total samples needed
        let totalSamples = 0;
        for (let i = 0; i < audioDataArrays.length; i++) {
            totalSamples += audioDataArrays[i].length;
            if (i < audioDataArrays.length - 1) {
                totalSamples += silenceSamples;
            }
        }
        const wavCat = new Float32Array(totalSamples);
        let currentIdx = 0;
        for (let i = 0; i < audioDataArrays.length; i++) {
            // Copy audio data
            const audioData = audioDataArrays[i];
            wavCat.set(audioData, currentIdx);
            currentIdx += audioData.length;
            // Add silence if not the last chunk
            if (i < audioDataArrays.length - 1) {
                // Silence is already zeros in Float32Array, just skip the indices
                currentIdx += silenceSamples;
            }
        }
        // Create final WAV file and expose it as an object URL for <audio>
        const wavBuffer = writeWavFile(wavCat, sampleRate);
        const blob = new Blob([wavBuffer], { type: 'audio/wav' });
        const url = URL.createObjectURL(blob);
        const supertonicEndTime = Date.now();
        const supertonicProcessingTime = (supertonicEndTime - supertonicStartTime) / 1000;
        return {
            success: true,
            processingTime: supertonicProcessingTime,
            audioDuration: totalDuration,
            url: url,
            text: text,
            firstChunkTime: firstChunkTime
        };
    } catch (error) {
        return {
            success: false,
            error: error.message,
            text: text
        };
    }
}
| // Main synthesis function | |
| async function generateSpeech() { | |
| let text = (demoTextInput.textContent || demoTextInput.innerText || '').trim(); | |
| // Validate text input | |
| const validation = validateTextInput(text); | |
| if (!validation.valid) { | |
| showDemoError(validation.message); | |
| return; | |
| } | |
| if (!models || !cfgs || !processors) { | |
| showDemoError('Models are still loading. Please wait.'); | |
| return; | |
| } | |
| if (!currentStyleTtlTensor || !currentStyleDpTensor) { | |
| showDemoError('Reference embeddings are not ready. Please wait.'); | |
| return; | |
| } | |
| // Validate characters before generation | |
| const charValidation = validateCharacters(text); | |
| if (!charValidation.valid && charValidation.unsupportedChars.length > 0) { | |
| const charList = charValidation.unsupportedChars.map(c => `"${c}"`).join(', '); | |
| showDemoError(`Cannot generate speech: Unsupported characters found: ${charList}`); | |
| return; | |
| } | |
| currentGenerationTextLength = text.length; | |
| try { | |
| isGenerating = true; | |
| demoGenerateBtn.disabled = true; | |
| // Disable speaker selection during generation | |
| const speakerItemsForGeneration = document.querySelectorAll('.speaker-item[data-voice]'); | |
| speakerItemsForGeneration.forEach(item => item.classList.add('disabled')); | |
| // Disable language selection during generation | |
| const languageItemsForGeneration = document.querySelectorAll('.speaker-item[data-language]'); | |
| languageItemsForGeneration.forEach(item => item.classList.add('disabled')); | |
| hideDemoError(); | |
| hideDemoStatus(); // Hide the status box when starting generation | |
| // Clean up previous audio playback | |
| if (audioContext) { | |
| // Stop all scheduled sources | |
| scheduledSources.forEach(source => { | |
| try { | |
| source.stop(); | |
| } catch (e) { | |
| // Already stopped | |
| } | |
| }); | |
| scheduledSources = []; | |
| // Close audio context | |
| if (audioContext.state !== 'closed') { | |
| audioContext.close(); | |
| } | |
| audioContext = null; | |
| } | |
| // Cancel animation frame | |
| if (animationFrameId) { | |
| cancelAnimationFrame(animationFrameId); | |
| animationFrameId = null; | |
| } | |
| // Clean up all custom audio players | |
| customAudioPlayers.forEach(player => { | |
| if (player.cleanup) { | |
| player.cleanup(); | |
| } | |
| }); | |
| customAudioPlayers = []; | |
| // Reset state | |
| audioChunks = []; | |
| totalDuration = 0; | |
| startTime = 0; | |
| pauseTime = 0; | |
| isPaused = false; | |
| isPlaying = false; | |
| firstChunkGenerationTime = 0; // Processing time for first chunk | |
| totalChunks = 0; | |
| nextScheduledTime = 0; // Next time to schedule audio chunk | |
| // Show result shell(s) immediately | |
// Build the initial "Generating speech..." result card for one synthesis
// system. `system` prefixes all element ids (e.g. "supertonic-time") so the
// stat cells can be updated later; the running-status badge is rendered only
// when `includeStatus` is true.
const createInitialResultItem = (system, titleMain, titleSub, titleColor, includeStatus) => {
  let titleStatus = '';
  if (includeStatus) {
    titleStatus = `<span class="title-status status-running" id="${system}-status">⏳ Running...</span>`;
  }
  return `
    <div class="demo-result-item ${system}-result-item generating" id="${system}-result" style="--result-progress: 0%;">
      <div class="demo-result-title">
        <span class="title-main" style="color: ${titleColor};">${titleMain}</span>
        <span class="title-sub">${titleSub}</span>
        ${titleStatus}
      </div>
      <div class="demo-result-info">
        <!--
        <div class="stat">
          <div class="stat-value" id="${system}-chars">--</div>
          <div class="stat-label">Processed Chars</div>
        </div>
        -->
        <div class="stat">
          <div class="stat-value" id="${system}-time">--</div>
          <div class="stat-label">Processing Time<span class="stat-arrow stat-arrow--down">↓</span></div>
        </div>
        <div class="stat">
          <div class="stat-value" id="${system}-cps">--</div>
          <div class="stat-label">Chars/sec<span class="stat-arrow stat-arrow--up">↑</span></div>
        </div>
        <div class="stat">
          <div class="stat-value" id="${system}-rtf">--</div>
          <div class="stat-label">RTF<span class="stat-arrow stat-arrow--down">↓</span></div>
        </div>
      </div>
      <div class="custom-audio-player">
        <div class="demo-placeholder-audio">Generating speech...</div>
      </div>
    </div>
  `;
};
// Render the Supertonic result shell immediately (no status badge) so the
// user sees feedback before the first audio chunk arrives.
const supertonicInitial = createInitialResultItem(
  'supertonic',
  'Supertonic',
  'On-Device',
  'var(--accent-yellow)',
  false
);
demoResults.style.display = 'flex';
demoResults.innerHTML = supertonicInitial;
// Read synthesis parameters from the UI sliders.
const totalStep = parseInt(demoTotalSteps.value);
const speed = parseFloat(demoSpeed.value);
const durationFactor = speedToDurationFactor(speed);
// Track which one finishes first
let latestSupertonicProcessedChars = 0;
| // Helper functions for custom player | |
// Format a seconds value as "m:ss.cc" (two-digit, zero-padded seconds with
// centiseconds). When `trimMobile` is set, the decimals are stripped for
// narrow screens via trimDecimalsForMobile.
const formatTime = (seconds, { trimMobile = false } = {}) => {
  const minutes = Math.floor(seconds / 60);
  const paddedSeconds = (seconds % 60).toFixed(2).padStart(5, '0');
  const base = `${minutes}:${paddedSeconds}`;
  return trimMobile ? trimDecimalsForMobile(base) : base;
};
// Animation-frame loop that keeps the progress bar and elapsed-time label in
// sync with the AudioContext clock. Reschedules itself until the elapsed time
// reaches totalDuration, then flips the play/pause button back to "play".
const updateProgress = () => {
  if (!isPlaying || !audioContext) return;
  // While paused, freeze the display at the captured pause position.
  const elapsed = isPaused ? pauseTime : (audioContext.currentTime - startTime);
  const percent = totalDuration > 0 ? (elapsed / totalDuration) * 100 : 0;
  if (progressFill) {
    progressFill.style.width = `${Math.min(percent, 100)}%`;
  }
  if (currentTimeDisplay) {
    currentTimeDisplay.textContent = formatTime(Math.min(elapsed, totalDuration), { trimMobile: true });
  }
  if (elapsed < totalDuration) {
    animationFrameId = requestAnimationFrame(updateProgress);
    return;
  }
  // Playback finished: reset flags and restore the play icon.
  isPlaying = false;
  isPaused = false;
  if (playPauseBtn) {
    playPauseBtn.innerHTML = PLAY_ICON_SVG;
  }
};
// Play/pause button handler with three states:
//   1. paused   -> reschedule every remaining chunk from the pause position
//   2. playing  -> suspend the AudioContext and remember the pause position
//   3. finished -> reschedule all chunks from the beginning
// Chunks are separated by a 0.3 s silence gap, so all time bookkeeping below
// must include that gap when mapping a timeline position to a chunk index.
const togglePlayPause = () => {
  if (!audioContext || audioChunks.length === 0) return;
  if (isPaused) {
    // Resume from paused position
    pauseAllPlayersExcept(supertonicPlayerRecord);
    const seekTime = pauseTime;
    // Find which chunk we should start from, and the offset inside it.
    let accumulatedTime = 0;
    let startChunkIndex = 0;
    let offsetInChunk = seekTime;
    for (let i = 0; i < audioChunks.length; i++) {
      const chunkDuration = audioChunks[i].buffer.duration;
      if (accumulatedTime + chunkDuration > seekTime) {
        startChunkIndex = i;
        offsetInChunk = seekTime - accumulatedTime;
        break;
      }
      accumulatedTime += chunkDuration + 0.3; // account for inter-chunk silence
    }
    // Stop any existing sources (BufferSources are one-shot; we must
    // recreate them to replay).
    scheduledSources.forEach(source => {
      try {
        source.stop();
      } catch (e) {
        // Already stopped
      }
    });
    scheduledSources = [];
    // Resume AudioContext if suspended
    if (audioContext.state === 'suspended') {
      audioContext.resume();
    }
    // Reschedule from the pause point; startTime is back-dated so that
    // (currentTime - startTime) equals the timeline position.
    startTime = audioContext.currentTime - seekTime;
    let nextStartTime = audioContext.currentTime;
    for (let i = startChunkIndex; i < audioChunks.length; i++) {
      const source = audioContext.createBufferSource();
      source.buffer = audioChunks[i].buffer;
      source.connect(audioContext.destination);
      if (i === startChunkIndex) {
        // First chunk starts mid-buffer at the computed offset.
        source.start(nextStartTime, offsetInChunk);
        nextStartTime += (audioChunks[i].buffer.duration - offsetInChunk);
      } else {
        source.start(nextStartTime);
        nextStartTime += audioChunks[i].buffer.duration;
      }
      if (i < audioChunks.length - 1) {
        nextStartTime += 0.3; // silence gap between chunks
      }
      scheduledSources.push(source);
    }
    nextScheduledTime = nextStartTime;
    isPaused = false;
    isPlaying = true;
    playPauseBtn.innerHTML = PAUSE_ICON_SVG;
    updateProgress();
  } else if (isPlaying) {
    // Pause playback: capture the timeline position, then freeze the clock.
    pauseTime = audioContext.currentTime - startTime;
    audioContext.suspend();
    isPaused = true;
    playPauseBtn.innerHTML = PLAY_ICON_SVG;
    if (animationFrameId) {
      cancelAnimationFrame(animationFrameId);
    }
  } else {
    // Was finished, restart from beginning
    pauseAllPlayersExcept(supertonicPlayerRecord);
    pauseTime = 0;
    // Resume AudioContext if suspended
    if (audioContext.state === 'suspended') {
      audioContext.resume();
    }
    // Stop any existing sources
    scheduledSources.forEach(source => {
      try {
        source.stop();
      } catch (e) {
        // Already stopped
      }
    });
    scheduledSources = [];
    // Restart from beginning
    startTime = audioContext.currentTime;
    let nextStartTime = audioContext.currentTime;
    for (let i = 0; i < audioChunks.length; i++) {
      const source = audioContext.createBufferSource();
      source.buffer = audioChunks[i].buffer;
      source.connect(audioContext.destination);
      source.start(nextStartTime);
      nextStartTime += audioChunks[i].buffer.duration;
      if (i < audioChunks.length - 1) {
        nextStartTime += 0.3; // silence gap between chunks
      }
      scheduledSources.push(source);
    }
    nextScheduledTime = nextStartTime;
    isPlaying = true;
    isPaused = false;
    playPauseBtn.innerHTML = PAUSE_ICON_SVG;
    updateProgress();
  }
};
// Jump playback to `percentage` (0-100) of the total duration. If playback
// was active, the remaining chunks are rescheduled from the seek point; if it
// was paused/finished, only the stored pause position and the UI are updated
// so that the next play press resumes from there.
const seekTo = (percentage) => {
  if (!audioContext || audioChunks.length === 0) return;
  const seekTime = (percentage / 100) * totalDuration;
  // Remember current playing state
  const wasPlaying = isPlaying;
  const wasPaused = isPaused;
  // Stop all current sources
  scheduledSources.forEach(source => {
    try {
      source.stop();
    } catch (e) {
      // Already stopped
    }
  });
  scheduledSources = [];
  // Cancel animation
  if (animationFrameId) {
    cancelAnimationFrame(animationFrameId);
  }
  // Find which chunk we should start from
  let accumulatedTime = 0;
  let startChunkIndex = 0;
  let offsetInChunk = seekTime;
  for (let i = 0; i < audioChunks.length; i++) {
    const chunkDuration = audioChunks[i].buffer.duration;
    if (accumulatedTime + chunkDuration > seekTime) {
      startChunkIndex = i;
      offsetInChunk = seekTime - accumulatedTime;
      break;
    }
    accumulatedTime += chunkDuration + 0.3; // Include silence
  }
  // If paused or finished, just update the pause position
  if (wasPaused || !wasPlaying) {
    pauseTime = seekTime;
    // Update UI
    if (progressFill) {
      const progress = (seekTime / totalDuration) * 100;
      progressFill.style.width = `${Math.min(progress, 100)}%`;
    }
    if (currentTimeDisplay) {
      currentTimeDisplay.textContent = formatTime(seekTime, { trimMobile: true });
    }
    // Set to paused state so play button will resume from seek position.
    // NOTE(review): isPaused and isPlaying are deliberately both true here —
    // togglePlayPause checks isPaused first, so this routes the next click
    // into its resume branch. Confirm no other code treats this combination
    // as invalid.
    isPaused = true;
    isPlaying = true; // Valid state for playback
    if (playPauseBtn) {
      playPauseBtn.innerHTML = PLAY_ICON_SVG;
    }
    return;
  }
  // Resume AudioContext if it was suspended
  if (audioContext.state === 'suspended') {
    audioContext.resume();
  }
  // Reschedule from the seek point; back-date startTime so the progress loop
  // reads the correct timeline position.
  startTime = audioContext.currentTime - seekTime;
  let nextStartTime = audioContext.currentTime;
  for (let i = startChunkIndex; i < audioChunks.length; i++) {
    const source = audioContext.createBufferSource();
    source.buffer = audioChunks[i].buffer;
    source.connect(audioContext.destination);
    if (i === startChunkIndex) {
      // Start from offset
      source.start(nextStartTime, offsetInChunk);
      nextStartTime += (audioChunks[i].buffer.duration - offsetInChunk);
    } else {
      source.start(nextStartTime);
      nextStartTime += audioChunks[i].buffer.duration;
    }
    // Add silence between chunks
    if (i < audioChunks.length - 1) {
      nextStartTime += 0.3;
    }
    scheduledSources.push(source);
  }
  // Update nextScheduledTime for any future chunks
  nextScheduledTime = nextStartTime;
  // Resume playing state
  isPlaying = true;
  isPaused = false;
  if (playPauseBtn) {
    playPauseBtn.innerHTML = PAUSE_ICON_SVG;
  }
  // Restart progress animation
  updateProgress();
};
| // Callback for first chunk ready - create custom player and start playback | |
| // Helper function to create AudioBuffer directly from Float32Array | |
// Wrap raw mono PCM samples in an AudioBuffer on the shared audioContext,
// skipping any WAV encode/decode round-trip.
const createAudioBufferFromFloat32 = (audioData, sampleRate) => {
  const monoBuffer = audioContext.createBuffer(1, audioData.length, sampleRate);
  monoBuffer.getChannelData(0).set(audioData);
  return monoBuffer;
};
// Called once when the first synthesized chunk is ready. Replaces the
// placeholder card with the real stats + custom player, creates the
// AudioContext, registers this player so other players can be paused, and
// starts playing the first chunk immediately while later chunks stream in.
const onFirstChunkReady = async (audioData, sampleRate, duration, text, numChunks, firstChunkTime, processedChars) => {
  totalChunks = numChunks;
  firstChunkGenerationTime = firstChunkTime;
  const container = document.getElementById('demoResults');
  // Prefer the length captured at generation start; fall back to the text arg.
  const textLength = currentGenerationTextLength > 0
    ? currentGenerationTextLength
    : (text ? text.length : 0);
  const isBatch = textLength >= getMaxChunkLength();
  // Batch mode shows "first-chunk / total" time; at this point both are the
  // first-chunk time.
  const processingTimeStr = isBatch && firstChunkTime
    ? `${formatTimeDetailed(firstChunkTime)} / ${formatTimeDetailed(firstChunkTime)}`
    : formatTimeDetailed(firstChunkTime);
  const safeInitialChars = typeof processedChars === 'number' ? processedChars : 0;
  const displayedInitialChars = textLength > 0 ? Math.min(safeInitialChars, textLength) : safeInitialChars;
  const charsPerSec = firstChunkTime > 0 && displayedInitialChars > 0
    ? (displayedInitialChars / firstChunkTime).toFixed(1)
    : '0.0';
  // RTF = processing time / audio duration (lower is better).
  const rtf = duration > 0 && firstChunkTime > 0 ? (firstChunkTime / duration).toFixed(3) : '-';
  const progressValue = textLength > 0 ? Math.min(100, (displayedInitialChars / textLength) * 100) : 0;
  const resultItemEl = document.getElementById('supertonic-result');
  if (!resultItemEl) {
    console.warn('Supertonic result container not found.');
    return;
  }
  resultItemEl.classList.remove('generating');
  resultItemEl.style.setProperty('--result-progress', `${progressValue}%`);
  const titleMainEl = resultItemEl.querySelector('.title-main');
  if (titleMainEl) {
    titleMainEl.textContent = 'Supertonic';
    titleMainEl.style.color = 'var(--accent-yellow)';
  }
  const titleSubEl = resultItemEl.querySelector('.title-sub');
  if (titleSubEl) {
    titleSubEl.textContent = 'On-Device';
  }
  const infoContainer = resultItemEl.querySelector('.demo-result-info');
  if (infoContainer) {
    infoContainer.classList.remove('error');
  }
  const timeElInitial = document.getElementById('supertonic-time');
  if (timeElInitial) {
    timeElInitial.innerHTML = formatStatValueWithSuffix(processingTimeStr, 's', { firstLabel: true });
  }
  const cpsElInitial = document.getElementById('supertonic-cps');
  if (cpsElInitial) {
    cpsElInitial.textContent = charsPerSec;
  }
  const rtfElInitial = document.getElementById('supertonic-rtf');
  if (rtfElInitial) {
    rtfElInitial.innerHTML = formatStatValueWithSuffix(rtf, 'x');
  }
  // Swap the "Generating speech..." placeholder for the real player markup.
  const playerContainer = resultItemEl.querySelector('.custom-audio-player');
  if (playerContainer) {
    playerContainer.style.display = '';
    playerContainer.innerHTML = `
      <button id="play-pause-btn" class="player-btn">${PAUSE_ICON_SVG}</button>
      <div class="time-display" id="current-time">0:00.00</div>
      <div class="progress-container" id="progress-container">
        <div class="progress-bar">
          <div class="progress-fill" id="progress-fill"></div>
        </div>
      </div>
      <div class="time-display" id="total-duration">${formatTime(duration, { trimMobile: true })}</div>
      <div class="demo-result-actions" style="display: none;">
        <button class="demo-download-btn" id="supertonic-download" aria-label="Download WAV" title="Download WAV">
          <svg width="16" height="16" fill="none" stroke="currentColor" stroke-width="2" viewBox="0 0 24 24">
            <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
            <polyline points="7 10 12 15 17 10"/>
            <line x1="12" y1="15" x2="12" y2="3"/>
          </svg>
        </button>
      </div>
    `;
  }
  container.style.display = 'flex';
  latestSupertonicProcessedChars = displayedInitialChars;
  // Get UI elements
  playPauseBtn = document.getElementById('play-pause-btn');
  progressBar = document.getElementById('progress-container');
  currentTimeDisplay = document.getElementById('current-time');
  durationDisplay = document.getElementById('total-duration');
  progressFill = document.getElementById('progress-fill');
  // Initialize Web Audio API (webkit prefix for older Safari)
  audioContext = new (window.AudioContext || window.webkitAudioContext)();
  startTime = audioContext.currentTime;
  totalDuration = duration;
  isPlaying = true;
  isPaused = false;
  // Create Supertonic player record and register it. pausePlayback is the
  // hook other players call (via pauseAllPlayersExcept) to silence this one.
  const pausePlayback = () => {
    if (!audioContext || audioContext.state === 'closed') return;
    if (isPlaying) {
      pauseTime = audioContext.currentTime - startTime;
      scheduledSources.forEach(source => {
        try {
          source.stop();
        } catch (e) {
          // Already stopped
        }
      });
      scheduledSources = [];
      audioContext.suspend();
      isPaused = true;
      isPlaying = false;
      if (playPauseBtn) {
        playPauseBtn.innerHTML = PLAY_ICON_SVG;
      }
      if (animationFrameId) {
        cancelAnimationFrame(animationFrameId);
      }
    }
  };
  supertonicPlayerRecord = {
    audioContext: audioContext,
    pausePlayback: pausePlayback
  };
  // Remove old Supertonic player if exists and add new one
  customAudioPlayers = customAudioPlayers.filter(p => p !== supertonicPlayerRecord && p.audioContext !== audioContext);
  customAudioPlayers.push(supertonicPlayerRecord);
  // Pause all other players before starting Supertonic
  pauseAllPlayersExcept(supertonicPlayerRecord);
  // Create AudioBuffer directly from Float32Array - no WAV encode/decode
  const audioBuffer = createAudioBufferFromFloat32(audioData, sampleRate);
  audioChunks.push({ buffer: audioBuffer, duration: audioBuffer.duration });
  // Play first chunk immediately
  const source = audioContext.createBufferSource();
  source.buffer = audioBuffer;
  source.connect(audioContext.destination);
  source.start(audioContext.currentTime);
  scheduledSources.push(source);
  // Set next scheduled time for additional chunks
  nextScheduledTime = audioContext.currentTime + audioBuffer.duration + 0.3; // Add silence gap
  // Setup player controls
  playPauseBtn.addEventListener('click', togglePlayPause);
  progressBar.addEventListener('click', (e) => {
    const rect = progressBar.getBoundingClientRect();
    const percentage = ((e.clientX - rect.left) / rect.width) * 100;
    seekTo(percentage);
  });
  // Start progress animation
  updateProgress();
};
// Callback for each additional chunk - schedule seamlessly
// Called for every chunk after the first: appends the buffer, schedules it at
// the pre-computed nextScheduledTime (with a 0.3 s gap), and refreshes the
// duration display and the time/cps/RTF stats.
const onChunkAdded = async (audioData, sampleRate, duration, chunkIndex, totalChunks, currentProcessingTime, processedChars) => {
  if (!audioContext) return;
  // Create AudioBuffer directly from Float32Array - no WAV encode/decode
  const audioBuffer = createAudioBufferFromFloat32(audioData, sampleRate);
  const chunkDuration = audioBuffer.duration;
  audioChunks.push({ buffer: audioBuffer, duration: chunkDuration });
  // Schedule the new chunk at the pre-calculated time
  const source = audioContext.createBufferSource();
  source.buffer = audioBuffer;
  source.connect(audioContext.destination);
  source.start(nextScheduledTime);
  scheduledSources.push(source);
  // Update next scheduled time for the next chunk
  nextScheduledTime = nextScheduledTime + audioBuffer.duration + 0.3; // Add silence gap
  // Update total duration (duration arg is the running total so far)
  totalDuration = duration;
  // Update duration display with smooth animation (brief white flash)
  if (durationDisplay) {
    durationDisplay.textContent = formatTime(duration, { trimMobile: true });
    durationDisplay.style.transition = 'color 0.3s';
    durationDisplay.style.color = '#ffffff';
    setTimeout(() => {
      durationDisplay.style.color = '';
    }, 300);
  }
  // Update info display
  const textLengthCandidate = currentGenerationTextLength > 0
    ? currentGenerationTextLength
    : (demoTextInput.textContent || demoTextInput.innerText || '').trim().length;
  const textLength = textLengthCandidate;
  const isBatch = textLength >= getMaxChunkLength();
  const timeEl = document.getElementById('supertonic-time');
  const durationEl = document.getElementById('supertonic-duration');
  const cpsEl = document.getElementById('supertonic-cps');
  const rtfEl = document.getElementById('supertonic-rtf');
  const effectiveProcessedChars = typeof processedChars === 'number' ? processedChars : latestSupertonicProcessedChars;
  // Ignore out-of-order updates that would move the progress backwards.
  if (effectiveProcessedChars < latestSupertonicProcessedChars) {
    return;
  }
  const clampedProcessedChars = textLength > 0 ? Math.min(effectiveProcessedChars, textLength) : effectiveProcessedChars;
  const progressValue = textLength > 0 ? Math.min(100, (clampedProcessedChars / textLength) * 100) : 0;
  if (durationEl) {
    durationEl.textContent = formatTimeDetailed(duration);
  }
  if (timeEl && isBatch && firstChunkGenerationTime > 0 && currentProcessingTime) {
    const timeDisplay = `${formatTimeDetailed(firstChunkGenerationTime)} / ${formatTimeDetailed(currentProcessingTime)}`;
    timeEl.innerHTML = formatStatValueWithSuffix(timeDisplay, 's', { firstLabel: true });
  }
  if (cpsEl && currentProcessingTime > 0 && clampedProcessedChars >= 0) {
    const charsPerSec = (clampedProcessedChars / currentProcessingTime).toFixed(1);
    cpsEl.textContent = charsPerSec;
  }
  if (rtfEl && duration > 0 && currentProcessingTime > 0) {
    const rtf = (currentProcessingTime / duration).toFixed(3);
    rtfEl.innerHTML = formatStatValueWithSuffix(rtf, 'x');
  }
  const resultItemEl = document.getElementById('supertonic-result');
  if (resultItemEl) {
    resultItemEl.style.setProperty('--result-progress', `${progressValue}%`);
  }
  latestSupertonicProcessedChars = clampedProcessedChars;
};
// Start all syntheses simultaneously
// Kick off chunked synthesis; the two callbacks above stream results into the
// player while this await runs.
const result = await generateSupertonicSpeechChunked(
  text,
  totalStep,
  durationFactor,
  onFirstChunkReady,
  onChunkAdded
);
if (result.success) {
  // Final stats pass: overwrite the streaming values with totals.
  const textLength = result.text ? result.text.length : 0;
  const isBatch = textLength >= getMaxChunkLength();
  const processingTimeStr = isBatch && firstChunkGenerationTime > 0
    ? `${formatTimeDetailed(firstChunkGenerationTime)} / ${formatTimeDetailed(result.processingTime)}`
    : formatTimeDetailed(result.processingTime);
  const charsPerSec = result.processingTime > 0 ? (textLength / result.processingTime).toFixed(1) : '0.0';
  const progressValue = textLength > 0 ? 100 : 0;
  const timeEl = document.getElementById('supertonic-time');
  const durationEl = document.getElementById('supertonic-duration');
  const cpsEl = document.getElementById('supertonic-cps');
  const rtfEl = document.getElementById('supertonic-rtf');
  if (timeEl) timeEl.innerHTML = formatStatValueWithSuffix(processingTimeStr, 's', { firstLabel: true });
  if (durationEl) durationEl.textContent = formatTimeDetailed(result.audioDuration);
  latestSupertonicProcessedChars = textLength;
  if (cpsEl) cpsEl.textContent = charsPerSec;
  if (rtfEl) {
    const rtf = result.audioDuration > 0 ? (result.processingTime / result.audioDuration).toFixed(3) : '-';
    rtfEl.innerHTML = formatStatValueWithSuffix(rtf, 'x');
  }
  const resultItemEl = document.getElementById('supertonic-result');
  if (resultItemEl) {
    resultItemEl.style.setProperty('--result-progress', `${progressValue}%`);
  }
  // Final duration update (if custom player was used)
  if (audioContext && audioChunks.length > 0) {
    totalDuration = result.audioDuration;
    if (durationDisplay) {
      durationDisplay.textContent = formatTime(result.audioDuration, { trimMobile: true });
    }
  }
  // Always show download button
  const downloadBtn = document.getElementById('supertonic-download');
  if (downloadBtn) {
    downloadBtn.parentElement.style.display = 'block';
    downloadBtn.onclick = () => downloadDemoAudio(result.url, 'supertonic_speech.wav');
  }
}
} catch (error) {
  // Surface the failure in both the status line and the error panel, then
  // restore the idle placeholder.
  showDemoStatus(`<strong>Error:</strong> ${error.message}`, 'error');
  showDemoError(`Error during synthesis: ${error.message}`);
  console.error('Synthesis error:', error);
  // Restore placeholder
  demoResults.style.display = 'none';
  demoResults.innerHTML = `
    <div class="demo-placeholder">
      <div class="demo-placeholder-icon">🎙️</div>
      <p>Your generated speech will appear here</p>
    </div>
  `;
} finally {
  // Whether synthesis succeeded or failed, unlock the generate button and
  // the speaker/language pickers.
  isGenerating = false;
  demoGenerateBtn.disabled = false;
  // Re-enable speaker selection after generation
  const speakerItemsForGeneration = document.querySelectorAll('.speaker-item[data-voice]');
  speakerItemsForGeneration.forEach(item => item.classList.remove('disabled'));
  // Re-enable language selection after generation
  const languageItemsForGeneration = document.querySelectorAll('.speaker-item[data-language]');
  languageItemsForGeneration.forEach(item => item.classList.remove('disabled'));
}
}
// Download handler (make it global)
// Triggers a browser download of `url` saved as `filename` via a synthetic
// anchor click.
window.downloadDemoAudio = function(url, filename) {
  const a = document.createElement('a');
  a.href = url;
  a.download = filename;
  // Fix: Firefox ignores programmatic clicks on anchors that are not attached
  // to the document, so attach the element for the click and remove it after.
  document.body.appendChild(a);
  a.click();
  a.remove();
};
// Helper function to convert speed to durationFactor
// Maps a playback-speed multiplier to the model's duration factor: faster
// speech means a proportionally shorter duration (inverse relation), with a
// small offset baked into the denominator.
function speedToDurationFactor(speed, offset = 0.05) {
  const adjustedSpeed = speed + offset;
  return 1 / adjustedSpeed;
}
// Update slider value displays
// Mirrors the step and speed sliders into their text labels
// (e.g. "4 Steps", "1.00x").
function updateSliderValues() {
  demoTotalStepsValue.textContent = `${demoTotalSteps.value} Steps`;
  // Display speed with 'x' suffix (e.g. 1.0x, 0.7x, 1.5x)
  const speedValue = parseFloat(demoSpeed.value);
  demoSpeedValue.textContent = `${speedValue.toFixed(2)}x`;
}
// Attach slider event listeners
demoTotalSteps.addEventListener('input', updateSliderValues);
demoSpeed.addEventListener('input', updateSliderValues);
// Initialize slider values
updateSliderValues();
// Attach generate function to button
demoGenerateBtn.addEventListener('click', generateSpeech);
// Preset text items (defined before input listener to share scope)
const presetItems = document.querySelectorAll('.preset-item[data-preset]');
const freeformBtn = document.getElementById('freeformBtn');
let currentPreset = 'quote'; // Initialize with quote
// currentLanguage is already declared above (line 902)
// Flag to track if text change is from preset button — guards the input
// listener from switching to freeform on programmatic text changes.
let isPresetChanging = false;
// Helper function to update active button state
// Highlights the preset item matching `presetType` (clearing all others),
// records it as the current preset, and syncs quote-mode styling.
function updateActiveButton(presetType) {
  // Clear the highlight from every preset item first.
  for (const item of presetItems) {
    item.classList.remove('active');
  }
  // Then highlight the requested one, if any.
  if (presetType) {
    const targetItem = document.querySelector(`.preset-item[data-preset="${presetType}"]`);
    if (targetItem) {
      targetItem.classList.add('active');
    }
  }
  currentPreset = presetType;
  updateQuoteModeState(presetType === 'quote');
}
// Toggle the "quote-mode" styling class on the results panel.
function updateQuoteModeState(isQuote) {
  if (!demoResults) {
    return;
  }
  demoResults.classList.toggle('quote-mode', Boolean(isQuote));
}
// Initialize quote button active state
updateActiveButton('quote');
// Seed the input with the quote preset for the current language, if defined.
if (presetTexts.quote && typeof presetTexts.quote === 'object' && presetTexts.quote[currentLanguage]) {
  demoTextInput.textContent = presetTexts.quote[currentLanguage];
  updateCharCounter();
}
// Wire every preset item: clicking "freeform" clears the input, any other
// preset fills in its text for the current language (or a legacy plain-string
// preset). Presets with no text for the current language are ignored.
presetItems.forEach(item => {
  item.addEventListener('click', () => {
    const presetType = item.getAttribute('data-preset');
    let nextText = null;
    if (presetType === 'freeform') {
      // Freeform item: clear text
      nextText = '';
    } else {
      const preset = presetTexts[presetType];
      if (preset && typeof preset === 'object' && preset[currentLanguage]) {
        nextText = preset[currentLanguage];
      } else if (preset && typeof preset === 'string') {
        // Fallback for old format (shouldn't happen, but just in case)
        nextText = preset;
      }
    }
    if (nextText === null) {
      return;
    }
    // Guard the input listener so this programmatic change does not flip the
    // active preset to freeform.
    isPresetChanging = true;
    demoTextInput.textContent = nextText;
    updateCharCounter();
    updateActiveButton(presetType);
    isPresetChanging = false;
  });
});
// Handle paste event to remove styles and paste only text
// Intercepts paste into the contenteditable input: inserts the clipboard's
// plain text at the caret (replacing any selection) and moves the caret to
// the end of the inserted text.
demoTextInput.addEventListener('paste', (e) => {
  e.preventDefault();
  const text = (e.clipboardData || window.clipboardData).getData('text/plain');
  const selection = window.getSelection();
  if (!selection.rangeCount) return;
  const range = selection.getRangeAt(0);
  range.deleteContents();
  const textNode = document.createTextNode(text);
  range.insertNode(textNode);
  range.setStartAfter(textNode);
  range.collapse(true);
  selection.removeAllRanges();
  selection.addRange(range);
  // Trigger input event to update character counter
  demoTextInput.dispatchEvent(new Event('input', { bubbles: true }));
});
// Update character counter on input
// Snapshot of the input text, used by the input listener to detect
// user-driven edits.
let previousTextValue = demoTextInput.textContent || demoTextInput.innerText || '';
// Update left border line height to match demo-input-section height
const demoInputSection = document.querySelector('.demo-input-section');
// Publish the input section's current pixel height as the
// --demo-text-input-height CSS variable (used by the left border line).
function updateLeftBorderHeight() {
  if (!demoInputSection) {
    return;
  }
  const sectionHeight = demoInputSection.offsetHeight;
  demoInputSection.style.setProperty('--demo-text-input-height', `${sectionHeight}px`);
}
// Initialize and observe height changes so the CSS variable stays in sync
// whenever the input section resizes.
updateLeftBorderHeight();
const resizeObserver = new ResizeObserver(() => {
  updateLeftBorderHeight();
});
if (demoInputSection) {
  resizeObserver.observe(demoInputSection);
}
// Auto-calculate text input height for screens wider than 768px
// Measures everything above/below the text input (paddings, header, controls,
// labels, preset row, output section, content gaps) and gives the input the
// remaining viewport height, clamped to a 200px minimum.
function calculateTextInputHeight() {
  if (window.innerWidth <= 768) {
    // Reset to default height for screens 768px and below
    demoTextInput.style.height = '';
    return;
  }
  const viewportHeight = window.innerHeight;
  const interactiveDemoEl = document.querySelector('.interactive-demo');
  const containerEl = document.querySelector('.container');
  const headerWrapperEl = document.querySelector('.demo-header-wrapper');
  const controlsEl = document.querySelector('.demo-controls');
  const inputLabelEl = document.querySelector('.demo-input-label');
  const presetRowEl = document.querySelector('#presetControlsRow');
  const outputSectionEl = document.querySelector('.demo-output-section');
  const contentEl = document.querySelector('.demo-content');
  // Get computed styles for gaps and paddings (body as a harmless fallback
  // when an element is missing).
  const interactiveDemoStyle = window.getComputedStyle(interactiveDemoEl || document.body);
  const containerStyle = window.getComputedStyle(containerEl || document.body);
  const contentStyle = window.getComputedStyle(contentEl || document.body);
  // Calculate total height of elements above and below text input
  let totalHeight = 0;
  // Interactive demo padding
  const interactiveDemoPaddingTop = parseFloat(interactiveDemoStyle.paddingTop) || 0;
  const interactiveDemoPaddingBottom = parseFloat(interactiveDemoStyle.paddingBottom) || 0;
  totalHeight += interactiveDemoPaddingTop + interactiveDemoPaddingBottom;
  // Container padding
  const containerPaddingTop = parseFloat(containerStyle.paddingTop) || 0;
  const containerPaddingBottom = parseFloat(containerStyle.paddingBottom) || 0;
  totalHeight += containerPaddingTop + containerPaddingBottom;
  // Header wrapper
  if (headerWrapperEl) {
    totalHeight += headerWrapperEl.offsetHeight;
  }
  // Demo controls
  if (controlsEl) {
    totalHeight += controlsEl.offsetHeight;
  }
  // Demo content gap (top)
  const contentGap = parseFloat(contentStyle.gap) || 0;
  totalHeight += contentGap;
  // Input label
  if (inputLabelEl) {
    totalHeight += inputLabelEl.offsetHeight;
  }
  // Preset controls row
  if (presetRowEl) {
    totalHeight += presetRowEl.offsetHeight;
  }
  // Demo content gap (bottom)
  totalHeight += contentGap;
  // Output section
  if (outputSectionEl) {
    totalHeight += outputSectionEl.offsetHeight;
  }
  // Calculate available height for text input
  // NOTE(review): the 275px constant presumably accounts for page chrome not
  // measured above (nav, margins) — confirm against the page layout.
  const availableHeight = viewportHeight - totalHeight - 275; // Subtract 275px
  // Set minimum height (e.g., 200px) and maximum height
  const minHeight = 200;
  const maxHeight = availableHeight - 20; // 20px buffer
  if (availableHeight > minHeight) {
    demoTextInput.style.height = `${Math.max(minHeight, maxHeight)}px`;
  } else {
    demoTextInput.style.height = `${minHeight}px`;
  }
}
// Calculate on load and resize
calculateTextInputHeight();
window.addEventListener('resize', calculateTextInputHeight);
// Observe elements that might change height, so the input height is
// recomputed whenever any of them resizes.
const heightObserver = new ResizeObserver(() => {
  calculateTextInputHeight();
});
const headerWrapperEl = document.querySelector('.demo-header-wrapper');
const controlsEl = document.querySelector('.demo-controls');
const presetRowEl = document.querySelector('#presetControlsRow');
const outputSectionEl = document.querySelector('.demo-output-section');
if (headerWrapperEl) heightObserver.observe(headerWrapperEl);
if (controlsEl) heightObserver.observe(controlsEl);
if (presetRowEl) heightObserver.observe(presetRowEl);
if (outputSectionEl) heightObserver.observe(outputSectionEl);
// Auto-hide scrollbar functionality: the "scrolling" class reveals the
// scrollbar while the user scrolls and is dropped after 1.5 s of inactivity.
let scrollbarTimeout;
demoTextInput.addEventListener('scroll', () => {
  // Add scrolling class to show scrollbar
  demoTextInput.classList.add('scrolling');
  // Clear existing timeout
  if (scrollbarTimeout) {
    clearTimeout(scrollbarTimeout);
  }
  // Hide scrollbar after 1.5 seconds of no scrolling
  scrollbarTimeout = setTimeout(() => {
    demoTextInput.classList.remove('scrolling');
  }, 1500);
});
// Input listener: keeps the char counter fresh, flips to the freeform preset
// on any user-originated edit, and auto-detects the typing language while in
// freeform mode.
demoTextInput.addEventListener('input', () => {
  updateCharCounter();
  // If text was modified by user (not from preset button), switch to freeform
  const currentText = demoTextInput.textContent || demoTextInput.innerText || '';
  if (!isPresetChanging && currentText !== previousTextValue) {
    updateActiveButton('freeform');
  }
  if (currentPreset === 'freeform') {
    // Auto-detect language when user is typing (not from preset)
    const detectedLang = detectLanguage(currentText);
    if (detectedLang && detectedLang !== currentLanguage) {
      const previousLang = currentLanguage;
      currentLanguage = detectedLang;
      window.updateActiveLanguage(currentLanguage);
      showLanguageToast(previousLang, detectedLang);
    }
  }
  previousTextValue = currentText;
});
// Recompute the responsive (width-based) font sizing when the window is
// resized, debounced so the work runs at most once per 100ms burst of events.
let resizeDebounce;
window.addEventListener('resize', () => {
    clearTimeout(resizeDebounce);
    resizeDebounce = setTimeout(updateCharCounter, 100);
});
// Initialize the character counter once on load.
updateCharCounter();
// Speaker list handler (replaces the old voice <select> dropdown).
const speakerList = document.getElementById('speakerList');
const speakerItems = speakerList ? speakerList.querySelectorAll('.speaker-item[data-voice]') : [];
const createVoiceBtn = document.getElementById('createVoiceBtn');
const comingSoonModal = document.getElementById('comingSoonModal');
const comingSoonCloseBtn = document.getElementById('comingSoonCloseBtn');
let voiceSelectDisabled = false;
// Highlight the speaker entry matching `voice`; exposed on window so
// switchVoice() elsewhere in the file can call it too.
window.updateActiveSpeaker = function(voice) {
    if (!speakerList || !speakerItems) return;
    speakerItems.forEach(item => {
        item.classList.toggle('active', item.dataset.voice === voice);
    });
};
// Mark the initially selected voice.
if (speakerList && speakerItems.length > 0) {
    window.updateActiveSpeaker(currentVoice);
}
// Handle speaker item clicks and hover tooltips.
// Desktop uses native click + mouse hover; mobile synthesizes its own click
// from touchend so the tooltip press-and-hold gesture and voice selection
// can coexist on the same element.
const speakerTooltip = document.getElementById('speakerTooltip');
if (speakerList) {
    speakerItems.forEach(item => {
        // Track if click was triggered by touch event (to prevent double
        // execution: touchend dispatches a synthetic click below).
        let clickFromTouch = false;
        // Click handler: selects a voice, or just regenerates speech when the
        // clicked voice is already active.
        item.addEventListener('click', async (e) => {
            // On touch devices with mobile viewport, ignore native click events (we'll trigger manually from touchend)
            // PC (even with narrow viewport) should always handle clicks
            if (isTouchDevice() && isMobileViewport() && !clickFromTouch) {
                return;
            }
            // Reset flag
            clickFromTouch = false;
            if (voiceSelectDisabled || modelsLoading || isGenerating) return;
            const selectedVoice = item.dataset.voice;
            // If already selected, just auto-generate and play
            if (selectedVoice === currentVoice) {
                const text = (demoTextInput.textContent || demoTextInput.innerText || '').trim();
                // 10-char minimum mirrors the rest of this file's generation guards.
                if (text.length >= 10 && !isGenerating && models && cfgs && processors) {
                    generateSpeech();
                }
                return;
            }
            // Disable all controls while loading the new voice.
            const wasDisabled = demoGenerateBtn.disabled;
            demoGenerateBtn.disabled = true;
            voiceSelectDisabled = true;
            // Update UI immediately (optimistic highlight; reverted in the catch below).
            window.updateActiveSpeaker(selectedVoice);
            try {
                await switchVoice(selectedVoice);
                // Re-enable only if models are loaded.
                if (models && cfgs && processors) {
                    demoGenerateBtn.disabled = false;
                    voiceSelectDisabled = false;
                    // Auto-generate and play after voice change
                    const text = (demoTextInput.textContent || demoTextInput.innerText || '').trim();
                    if (text.length >= 10 && !isGenerating) {
                        generateSpeech();
                    }
                }
            } catch (error) {
                console.error('Failed to switch voice:', error);
                // Revert selection on error
                window.updateActiveSpeaker(currentVoice);
                voiceSelectDisabled = false;
                if (!wasDisabled) demoGenerateBtn.disabled = false;
            }
        });
        // Hover handler for tooltip
        if (speakerTooltip) {
            // Desktop hover events: show/track/hide the voice description tooltip.
            item.addEventListener('mouseenter', (e) => {
                if (isTouchDevice() && isMobileViewport()) return; // Skip on touch devices with mobile viewport
                const voice = item.dataset.voice;
                if (voice && VOICE_DESCRIPTIONS[voice]) {
                    speakerTooltip.textContent = VOICE_DESCRIPTIONS[voice];
                    speakerTooltip.style.display = 'block';
                    updateTooltipPosition(e, speakerTooltip);
                }
            });
            item.addEventListener('mousemove', (e) => {
                if (isTouchDevice() && isMobileViewport()) return; // Skip on touch devices with mobile viewport
                // Follow the cursor while the tooltip is visible.
                if (speakerTooltip.style.display === 'block') {
                    updateTooltipPosition(e, speakerTooltip);
                }
            });
            item.addEventListener('mouseleave', () => {
                if (isTouchDevice() && isMobileViewport()) return; // Skip on touch devices with mobile viewport
                speakerTooltip.style.display = 'none';
            });
            // Mobile touch events: show the tooltip on press; a short, still tap
            // becomes a click (dispatched manually from touchend); long presses
            // and drags are suppressed.
            let touchStartTime = 0;
            let touchHandled = false;
            let touchStartY = 0;
            const TOUCH_MOVE_THRESHOLD = 10; // pixels of vertical drift before a tap counts as movement
            item.addEventListener('touchstart', (e) => {
                if (!isTouchDevice() || !isMobileViewport()) return;
                touchHandled = false;
                const touch = e.touches[0];
                touchStartTime = Date.now();
                touchStartY = touch.clientY;
                const voice = item.dataset.voice;
                if (voice && VOICE_DESCRIPTIONS[voice]) {
                    // Prevent default to block text selection
                    e.preventDefault();
                    // Show tooltip with mobile styling
                    speakerTooltip.textContent = VOICE_DESCRIPTIONS[voice];
                    speakerTooltip.style.display = 'block';
                    updateTooltipPositionMobile(speakerTooltip, touch.clientY);
                }
            }, { passive: false }); // passive:false so preventDefault() is honored
            item.addEventListener('touchmove', (e) => {
                if (!isTouchDevice() || !isMobileViewport()) return;
                const touch = e.touches[0];
                const deltaY = Math.abs(touch.clientY - touchStartY);
                // Check if touch moved significantly
                if (deltaY > TOUCH_MOVE_THRESHOLD) {
                    touchHandled = true; // moved too far — cancel the pending tap-click
                    // Hide tooltip if user moves finger
                    speakerTooltip.style.display = 'none';
                }
                // Prevent default to avoid scrolling while showing tooltip
                e.preventDefault();
            }, { passive: false });
            item.addEventListener('touchend', (e) => {
                if (!isTouchDevice() || !isMobileViewport()) return;
                const touchEndTime = Date.now();
                const touchDuration = touchEndTime - touchStartTime;
                // Hide tooltip
                speakerTooltip.style.display = 'none';
                // Always prevent default to avoid text selection
                e.preventDefault();
                // Only allow click if it was a short tap without movement
                if (!touchHandled && touchDuration < 500) {
                    // Short tap - trigger click event manually after a small delay
                    clickFromTouch = true; // lets the click handler above accept the synthetic click
                    setTimeout(() => {
                        const clickEvent = new MouseEvent('click', {
                            bubbles: true,
                            cancelable: true,
                            view: window
                        });
                        item.dispatchEvent(clickEvent);
                    }, 50);
                } else {
                    // Long press or moved - prevent click
                    touchHandled = true;
                    e.stopPropagation();
                }
            }, { passive: false });
            item.addEventListener('touchcancel', (e) => {
                if (!isTouchDevice() || !isMobileViewport()) return;
                // Hide tooltip
                speakerTooltip.style.display = 'none';
                touchHandled = true;
                // Prevent default
                e.preventDefault();
            }, { passive: false });
            // Prevent the context menu (long-press menu) from interrupting the press gesture.
            item.addEventListener('contextmenu', (e) => {
                if (isTouchDevice() && isMobileViewport()) {
                    e.preventDefault();
                    return false;
                }
            });
        }
    });
}
// Position `tooltip` 40px above the mouse pointer, then clamp it back inside
// the viewport (10px margins; flips below the pointer when it would poke
// above the top edge).
function updateTooltipPosition(event, tooltip) {
    // Anchor 40px above the cursor.
    tooltip.style.left = `${event.clientX}px`;
    tooltip.style.top = `${event.clientY - 40}px`;
    // Measure where that placed the tooltip and clamp each edge.
    const rect = tooltip.getBoundingClientRect();
    const viewportW = window.innerWidth;
    const viewportH = window.innerHeight;
    if (rect.right > viewportW) {
        tooltip.style.left = `${viewportW - rect.width - 10}px`;
    }
    if (rect.left < 0) {
        tooltip.style.left = '10px';
    }
    if (rect.top < 0) {
        // Not enough room above the cursor — flip below it instead.
        tooltip.style.top = `${event.clientY + 40}px`;
    }
    if (rect.bottom > viewportH) {
        tooltip.style.top = `${viewportH - rect.height - 10}px`;
    }
}
// Mobile tooltip placement: stretch the tooltip to 90% width, center it
// horizontally, and anchor it 75px above the touch point, clamped vertically
// to the viewport.
function updateTooltipPositionMobile(tooltip, touchY) {
    const viewportH = window.innerHeight;
    // Mobile-specific presentation overrides.
    tooltip.style.width = '90%';
    tooltip.style.left = '5%'; // centered: (100% - 90%) / 2 = 5%
    tooltip.style.right = 'auto';
    tooltip.style.marginLeft = '0';
    tooltip.style.marginRight = '0';
    tooltip.style.whiteSpace = 'normal';
    tooltip.style.textAlign = 'center';
    // Anchor 75px above the touch point (60px offset + 15px gap).
    tooltip.style.top = `${touchY - 75}px`;
    // Clamp vertically once the rendered size is known.
    const rect = tooltip.getBoundingClientRect();
    if (rect.top < 10) {
        // Would poke above the viewport — drop it below the finger instead.
        tooltip.style.top = `${touchY + 20}px`;
    }
    if (rect.bottom > viewportH - 10) {
        tooltip.style.top = `${viewportH - rect.height - 10}px`;
    }
}
// "Create your own voice" opens the coming-soon modal.
if (createVoiceBtn && comingSoonModal) {
    createVoiceBtn.addEventListener('click', () => comingSoonModal.classList.add('show'));
}
// Dismiss the modal via its explicit close button...
if (comingSoonCloseBtn && comingSoonModal) {
    comingSoonCloseBtn.addEventListener('click', () => comingSoonModal.classList.remove('show'));
}
// ...or by tapping the backdrop overlay.
if (comingSoonModal) {
    const overlay = comingSoonModal.querySelector('.coming-soon-modal-overlay');
    if (overlay) {
        overlay.addEventListener('click', () => comingSoonModal.classList.remove('show'));
    }
}
// Language selection handler.
const languageList = document.getElementById('languageList');
const languageItems = languageList ? languageList.querySelectorAll('.speaker-item[data-language]') : [];
// Highlight the language entry matching `language`; exposed on window so the
// auto-detection code elsewhere in the file can call it too.
window.updateActiveLanguage = function(language) {
    if (!languageList || !languageItems) return;
    languageItems.forEach(item => {
        item.classList.toggle('active', item.dataset.language === language);
    });
};
// Mark the initially selected language.
if (languageList && languageItems.length > 0) {
    window.updateActiveLanguage(currentLanguage);
}
// Language item clicks: switch the active language, swap in the preset's
// translation when applicable, then auto-generate speech.
if (languageList) {
    languageItems.forEach(item => {
        item.addEventListener('click', async () => {
            // Locked out while models load or speech is being generated.
            if (modelsLoading || isGenerating) return;
            const selectedLanguage = item.dataset.language;
            const readInput = () => (demoTextInput.textContent || demoTextInput.innerText || '').trim();
            // Re-clicking the already-active language just regenerates.
            if (selectedLanguage === currentLanguage) {
                if (readInput().length >= 10 && !isGenerating && models && cfgs && processors) {
                    generateSpeech();
                }
                return;
            }
            // Switch the active language and its UI highlight.
            currentLanguage = selectedLanguage;
            window.updateActiveLanguage(currentLanguage);
            // On a preset (non-freeform), swap in that preset's translation if one exists.
            if (currentPreset && currentPreset !== 'freeform' && presetTexts[currentPreset]) {
                const preset = presetTexts[currentPreset];
                if (preset && typeof preset === 'object' && preset[currentLanguage]) {
                    isPresetChanging = true; // suppress the input handler's freeform switch
                    demoTextInput.textContent = preset[currentLanguage];
                    updateCharCounter();
                    isPresetChanging = false;
                }
            }
            // Give the UI a moment to repaint before kicking off generation.
            await new Promise(resolve => setTimeout(resolve, 100));
            if (readInput().length >= 10 && !isGenerating && models && cfgs && processors) {
                generateSpeech();
            }
        });
    });
}
// Title animation setup: grab the two title halves and the output section.
const demoTitleLeft = document.querySelector('.demo-title-left');
const demoTitleRight = document.querySelector('.demo-title-right');
const demoOutputSection = document.querySelector('.demo-output-section');
// Wrap each character of the left title in a <span class="letter"> so letters
// can be animated individually; space characters are emitted as bare text.
// NOTE(review): characters are interpolated into innerHTML without escaping —
// fine for a static title, but would break if the title ever contained '<' or
// '&' (escapeHtml exists in this file); confirm the title text is trusted.
if (demoTitleLeft) {
    const text = demoTitleLeft.textContent.trim();
    demoTitleLeft.innerHTML = text.split('').map(char =>
        char === ' ' ? ' ' : `<span class="letter visible">${char}</span>`
    ).join('');
}
// Replay the letter-by-letter title animation whenever the input section is
// clicked: hide every letter, then reveal them one at a time, 62.5ms apart
// (total ~0.25s for a short title).
if (demoInputSection && demoTitleLeft) {
    demoInputSection.addEventListener('click', () => {
        const letters = demoTitleLeft.querySelectorAll('.letter');
        // Reset: hide all letters first.
        letters.forEach(letter => letter.classList.remove('visible'));
        // Staggered reveal.
        letters.forEach((letter, index) => {
            setTimeout(() => letter.classList.add('visible'), index * 62.5);
        });
    });
}
// Replay the "speech" animation when the output section is clicked, except
// when the click lands on the generate button (which has its own behavior).
if (demoOutputSection && demoTitleRight) {
    demoOutputSection.addEventListener('click', (event) => {
        if (event.target.closest('#demoGenerateBtn')) {
            return;
        }
        // Remove and re-add the class, forcing a reflow in between so the
        // CSS animation restarts from the beginning.
        demoTitleRight.classList.remove('animate-speech');
        void demoTitleRight.offsetWidth; // reading offsetWidth forces the reflow
        demoTitleRight.classList.add('animate-speech');
    });
}
// Kick off ONNX model loading (initializeModels is defined elsewhere in this file).
initializeModels();
| })(); | |