ModalTranscriberMCP / src /models /transcription.py
richard-su's picture
Upload folder using huggingface_hub
b5df735 verified
"""
Transcription models
"""
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any
from enum import Enum
from .base import BaseRequest, BaseResponse, OperationStatus
class ModelSize(str, Enum):
    """Selectable Whisper model sizes.

    The ``str`` mixin makes every member compare equal to its lowercase
    string value, so ``ModelSize.TINY == "tiny"`` holds and a member can
    be looked up from its value with ``ModelSize("tiny")``.
    """

    TINY = "tiny"
    BASE = "base"
    SMALL = "small"
    MEDIUM = "medium"
    LARGE = "large"
    TURBO = "turbo"
class OutputFormat(str, Enum):
    """Supported transcription output formats.

    Members are ``str`` subclasses, so each one compares equal to its
    lowercase value (``"txt"``, ``"srt"`` or ``"json"``).
    """

    TXT = "txt"
    SRT = "srt"
    JSON = "json"
@dataclass
class TranscriptionRequest(BaseRequest):
    """Parameters describing a single transcription job.

    Extends ``BaseRequest``; any fields declared there come first in the
    generated ``__init__``.
    """

    # Path of the audio file to transcribe (required, no default).
    audio_file_path: str
    # Whisper model size to use; defaults to TURBO.
    model_size: ModelSize = ModelSize.TURBO
    # Language of the audio; None leaves it unspecified
    # (presumably auto-detected downstream -- confirm against the caller).
    language: Optional[str] = None
    # Format of the generated transcript; defaults to SRT.
    output_format: OutputFormat = OutputFormat.SRT
    # Whether speaker diarization is requested; off by default.
    enable_speaker_diarization: bool = False
@dataclass
class TranscriptionSegment:
    """One piece of transcribed text together with its time range."""

    # Segment boundaries (presumably seconds from the start of the audio
    # -- confirm against the producer of these values).
    start: float
    end: float
    # Transcribed text of the segment.
    text: str
    # Speaker label for the segment; None when no speaker is assigned.
    speaker: Optional[str] = None
    # Confidence value for the segment; None when not provided.
    confidence: Optional[float] = None
@dataclass
class SpeakerInfo:
    """Container for speaker diarization results.

    Every field has a default, so ``SpeakerInfo()`` yields a "disabled"
    record with zero speakers and empty dictionaries.
    """

    # True when diarization was enabled.
    enabled: bool = False
    # Number of speakers counted globally.
    global_speaker_count: int = 0
    # String-to-string speaker mapping; each instance gets its own dict.
    speaker_mapping: Dict[str, str] = field(default_factory=dict)
    # Free-form summary data keyed by string; each instance gets its own dict.
    speaker_summary: Dict[str, Any] = field(default_factory=dict)
@dataclass
class TranscriptionFiles:
    """Paths of the transcript files produced for one audio input.

    Each path is optional and stays None when that format was not written.
    """

    txt_file_path: Optional[str] = None
    srt_file_path: Optional[str] = None
    json_file_path: Optional[str] = None

    @property
    def all_files(self) -> List[str]:
        """Return every path that is set, in txt, srt, json order.

        Falsy entries are skipped, which drops None and empty strings alike.
        """
        candidates = (self.txt_file_path, self.srt_file_path, self.json_file_path)
        return [path for path in candidates if path]
@dataclass
class TranscriptionMetrics:
    """Figures recorded about a transcription run.

    All fields have defaults, so ``TranscriptionMetrics()`` is a valid
    empty record.
    """

    # Duration of the audio (unit not fixed here; presumably seconds).
    audio_duration: float = 0.0
    # Time spent processing (unit not fixed here; presumably seconds).
    processing_time: float = 0.0
    # Number of transcription segments.
    segment_count: int = 0
    # Name of the model that was used; empty string when not recorded.
    model_used: str = ""
    # Detected language; the literal "unknown" when not recorded.
    language_detected: str = "unknown"
@dataclass
class TranscriptionResponse(BaseResponse):
    """Result of a transcription job, layered on top of ``BaseResponse``.

    Use the ``success`` and ``failed`` constructors instead of filling in
    the fields by hand. The ``status``, ``message``, ``error_code`` and
    ``error_details`` keywords they pass are not declared here (presumably
    they are fields of ``BaseResponse`` -- confirm in ``.base``).
    """

    # Audio file the response refers to.
    audio_file: str = ""
    # Generated transcript files; a fresh empty record per instance.
    files: TranscriptionFiles = field(default_factory=TranscriptionFiles)
    # Transcribed segments; a fresh list per instance.
    segments: List[TranscriptionSegment] = field(default_factory=list)
    # Speaker diarization details; a fresh default record per instance.
    speaker_info: SpeakerInfo = field(default_factory=SpeakerInfo)
    # Processing metrics; a fresh default record per instance.
    metrics: TranscriptionMetrics = field(default_factory=TranscriptionMetrics)

    @classmethod
    def success(
        cls,
        audio_file: str,
        files: TranscriptionFiles,
        segments: List[TranscriptionSegment],
        metrics: TranscriptionMetrics,
        speaker_info: Optional[SpeakerInfo] = None,
        message: str = "转录完成"
    ) -> "TranscriptionResponse":
        """Build a response whose status is ``OperationStatus.SUCCESS``.

        Args:
            audio_file: Audio file that was transcribed.
            files: Transcript files that were generated.
            segments: Transcribed segments.
            metrics: Processing metrics for the run.
            speaker_info: Diarization details; a default ``SpeakerInfo()``
                is substituted when this is omitted (or otherwise falsy).
            message: Message stored on the response.

        Returns:
            The populated ``TranscriptionResponse``.
        """
        # Fall back to an empty diarization record when none was supplied.
        if not speaker_info:
            speaker_info = SpeakerInfo()

        payload = {
            "status": OperationStatus.SUCCESS,
            "message": message,
            "audio_file": audio_file,
            "files": files,
            "segments": segments,
            "speaker_info": speaker_info,
            "metrics": metrics,
        }
        return cls(**payload)

    @classmethod
    def failed(
        cls,
        audio_file: str,
        error_message: str,
        error_code: str = "TRANSCRIPTION_ERROR",
        error_details: Optional[Dict[str, Any]] = None
    ) -> "TranscriptionResponse":
        """Build a response whose status is ``OperationStatus.FAILED``.

        Args:
            audio_file: Audio file whose transcription failed.
            error_message: Text stored as the response ``message``.
            error_code: Error code stored on the response.
            error_details: Optional extra error data, passed through as-is.

        Returns:
            A ``TranscriptionResponse`` whose files, segments, speaker info
            and metrics keep their default values.
        """
        payload = {
            "status": OperationStatus.FAILED,
            "message": error_message,
            "error_code": error_code,
            "error_details": error_details,
            "audio_file": audio_file,
        }
        return cls(**payload)