Omarrran's picture
TTS Dataset Collector for HF Spaces
88b6846
import { promises as fs } from 'fs';
import path from 'path';
/**
 * Resolves the base directory under which all dataset files are stored.
 *
 * Resolution order:
 * 1. The `DATA_DIR` environment variable, when set to a non-empty value
 *    (returned verbatim).
 * 2. `/data` when running on Hugging Face Spaces, where persistent storage
 *    is expected to be mounted.
 * 3. `<current working directory>/dataset` otherwise.
 *
 * The environment is inspected on every call instead of being cached,
 * because `/data` only exists at runtime, not at build time. The function
 * does not check that the returned directory actually exists.
 *
 * @returns The data directory path for the current environment.
 */
export function getDataDir(): string {
  const explicitDir = process.env.DATA_DIR;
  if (explicitDir) {
    return explicitDir;
  }

  if (isHuggingFaceSpaces()) {
    return '/data';
  }

  return path.join(process.cwd(), 'dataset');
}

/**
 * Reports whether the process is running on Hugging Face Spaces.
 *
 * Detection relies on the `SPACE_ID` environment variable, which the Spaces
 * runtime sets; an unset or empty value means "not on Spaces".
 */
function isHuggingFaceSpaces(): boolean {
  return Boolean(process.env.SPACE_ID);
}
/**
 * Builds a path located inside the data directory.
 *
 * @param subPaths - Segments appended, in order, to the directory returned
 * by `getDataDir()`. When omitted, the data directory itself is returned.
 * @returns The joined path, as produced by `path.join`.
 */
export function getDataPath(...subPaths: string[]): string {
  const segments = [getDataDir(), ...subPaths];
  return path.join(...segments);
}
/**
 * Returns the directory that holds audio recordings.
 *
 * @param speakerId - Optional speaker identifier. When provided and
 * non-empty, the speaker's own subdirectory of `audio` is returned;
 * otherwise the shared `audio` root is returned. The value is used as
 * given. NOTE(review): callers presumably pass an id already cleaned with
 * `sanitizePath` — confirm.
 * @returns Path to the audio directory inside the data directory.
 */
export function getAudioPath(speakerId?: string): string {
  return speakerId ? getDataPath('audio', speakerId) : getDataPath('audio');
}
/**
 * Returns the directory that holds transcription files.
 *
 * @param speakerId - Optional speaker identifier. When provided and
 * non-empty, the speaker's own subdirectory of `transcriptions` is
 * returned; otherwise the shared `transcriptions` root is returned. The
 * value is used as given. NOTE(review): callers presumably pass an id
 * already cleaned with `sanitizePath` — confirm.
 * @returns Path to the transcriptions directory inside the data directory.
 */
export function getTranscriptionsPath(speakerId?: string): string {
  const segments = ['transcriptions'];
  if (speakerId) {
    segments.push(speakerId);
  }
  return getDataPath(...segments);
}
/**
 * Returns the `metadata` directory inside the data directory.
 *
 * @returns Path to the metadata directory.
 */
export function getMetadataPath(): string {
  const metadataDir = getDataPath('metadata');
  return metadataDir;
}
/**
 * Returns the `fonts` directory inside the data directory.
 *
 * @returns Path to the fonts directory.
 */
export function getFontsPath(): string {
  const fontsDir = getDataPath('fonts');
  return fontsDir;
}
/**
 * Ensures that `dirPath` exists as a directory, creating it together with
 * any missing parent directories.
 *
 * Resolves silently when the directory is already present. Every other
 * failure is logged and rethrown — including rejections that are not
 * `Error` instances or carry no `code`, and the case where something that
 * is not a directory already occupies `dirPath`.
 *
 * @param dirPath - Directory to create.
 * @throws The underlying error when the directory cannot be created or
 * `dirPath` exists but is not a directory.
 */
export async function ensureDir(dirPath: string): Promise<void> {
  try {
    await fs.mkdir(dirPath, { recursive: true });
  } catch (error: unknown) {
    const code =
      error instanceof Error && 'code' in error
        ? (error as NodeJS.ErrnoException).code
        : undefined;

    // A recursive mkdir does not fail for an existing directory, so EEXIST
    // means some entry already occupies the path. Only treat it as success
    // after confirming that entry really is a directory.
    if (code === 'EEXIST' && (await isExistingDirectory(dirPath))) {
      return;
    }

    console.error(`Failed to create directory ${dirPath}:`, error);
    throw error;
  }
}

/** Returns true when `targetPath` exists and is a directory. */
async function isExistingDirectory(targetPath: string): Promise<boolean> {
  try {
    const stats = await fs.stat(targetPath);
    return stats.isDirectory();
  } catch {
    // A path that cannot be stat'ed is not a usable directory.
    return false;
  }
}
/**
 * Reduces arbitrary text to a token that is safe to use as a single path
 * segment, guarding against path traversal and invalid characters.
 *
 * Steps, in order: remove every `..` sequence, replace path-reserved
 * characters with `_`, replace anything outside `[a-zA-Z0-9_-]` with `_`,
 * cut the text to `maxLength` characters, then strip leading and trailing
 * underscores. Because truncation happens before trimming, the result can
 * be shorter than `maxLength`.
 *
 * @param input - Raw text. Empty or non-string values yield `'unknown'`.
 * @param maxLength - Number of characters kept before trimming (default 50).
 * @returns The sanitized token, or `'unknown'` when nothing usable remains.
 */
export function sanitizePath(input: string, maxLength: number = 50): string {
  if (typeof input !== 'string' || input === '') {
    return 'unknown';
  }

  // Prevent path traversal.
  const withoutTraversal = input.replace(/\.\./g, '');
  // Neutralize characters that are invalid in file paths.
  const withoutReserved = withoutTraversal.replace(/[\/\\:*?"<>|]/g, '_');
  // Keep only the safe character set.
  const safeCharsOnly = withoutReserved.replace(/[^a-zA-Z0-9_-]/g, '_');

  const truncated = safeCharsOnly.substring(0, maxLength);
  const trimmed = truncated.replace(/^_+|_+$/g, '');

  return trimmed === '' ? 'unknown' : trimmed;
}
/**
 * Creates the on-disk layout used by the application: the data directory
 * itself plus its audio, transcriptions, metadata and fonts subdirectories.
 *
 * All paths are resolved up front, then the directories are created one
 * after another in that order. Once every directory has been ensured, a
 * message naming the data directory is logged.
 *
 * @throws Whatever `ensureDir` throws for the first directory that cannot
 * be created; later directories are then not attempted.
 */
export async function initializeDataDirs(): Promise<void> {
  const resolvers: Array<() => string> = [
    getDataPath,
    getAudioPath,
    getTranscriptionsPath,
    getMetadataPath,
    getFontsPath,
  ];
  const requiredDirs = resolvers.map((resolve) => resolve());

  for (const requiredDir of requiredDirs) {
    await ensureDir(requiredDir);
  }

  console.log(`[DataPath] Initialized data directories at: ${getDataDir()}`);
}