Spaces:
Sleeping
Sleeping
| """文件验证模块 | |
| 提供音频文件格式验证、大小检查等功能。 | |
| """ | |
| import magic | |
| from pathlib import Path | |
| from typing import List, Optional, Tuple | |
| import mimetypes | |
| from ..core.config import get_config | |
| from ..utils.logger import get_task_logger | |
class FileValidator:
    """Validate media files before they are handed to the rest of the service.

    A file passes ``validate_file`` when it exists, is a regular file, is not
    empty, does not exceed ``config.app.max_file_size``, has an extension
    listed in ``SUPPORTED_EXTENSIONS`` and survives the content and
    readability probes.

    NOTE: the content probe is advisory only.  When libmagic does not report
    a supported MIME type the code falls back to ``mimetypes`` (which looks at
    the file *name*, not its bytes) and then to a header check that accepts
    unrecognised headers, so in practice a non-empty, readable file with a
    supported extension is accepted.
    """

    # File extensions (lower-case, with leading dot) that are accepted.
    SUPPORTED_EXTENSIONS = {
        '.aac', '.amr', '.avi', '.flac', '.flv', '.m4a', '.mkv',
        '.mov', '.mp3', '.mp4', '.mpeg', '.ogg', '.opus', '.wav',
        '.webm', '.wma', '.wmv'
    }

    # MIME types treated as a positive match by ``_check_mime_type``.
    # NOTE(review): there is no ``video/x-matroska``, ``video/mpeg`` or
    # ``audio/x-ms-wma`` entry, so .mkv/.mpeg/.wma files presumably pass only
    # through the lenient header fallback -- confirm whether that is intended.
    SUPPORTED_MIME_TYPES = {
        'audio/aac', 'audio/amr', 'audio/flac', 'audio/mp3', 'audio/mpeg',
        'audio/mp4', 'audio/ogg', 'audio/opus', 'audio/wav', 'audio/webm',
        'audio/x-wav', 'audio/x-flac', 'audio/x-m4a',
        'video/mp4', 'video/avi', 'video/x-flv', 'video/quicktime',
        'video/x-msvideo', 'video/webm', 'video/x-ms-wmv'
    }

    # Byte prefixes recognised by ``_check_file_header``:
    # ID3-tagged MP3, FLAC, Ogg, and two raw MPEG audio frame sync patterns.
    _HEADER_PREFIXES = (b'ID3', b'fLaC', b'OggS', b'\xff\xfb', b'\xff\xfa')

    def __init__(self):
        """Load the configuration, create the logger and try to set up libmagic.

        If libmagic cannot be initialised a warning is logged, ``self.magic``
        is set to ``None`` and the validator keeps working without it.
        """
        self.config = get_config()
        self.logger = get_task_logger(logger_name="transcript_service.validator")
        try:
            self.magic = magic.Magic(mime=True)
        except Exception as e:
            self.logger.warning(f"无法初始化libmagic: {e}, 将使用基础验证")
            self.magic = None

    def validate_file(self, file_path: Path) -> Tuple[bool, Optional[str]]:
        """Validate a single file.

        Args:
            file_path: Path of the file to check.  ``str`` / ``os.PathLike``
                values are accepted as well and converted to ``Path``.

        Returns:
            ``(True, None)`` when the file is accepted, otherwise
            ``(False, message)`` where ``message`` describes the first check
            that failed.  Unexpected errors raised by the checks are logged
            and reported the same way instead of propagating.
        """
        # Coerce first: the checks and the error handler below both use Path
        # attributes, so a plain string would otherwise raise from inside the
        # handler instead of producing an error result.
        file_path = Path(file_path)
        try:
            if not file_path.exists():
                return False, f"文件不存在: {file_path}"
            if not file_path.is_file():
                return False, f"不是有效的文件: {file_path}"

            # Size limits: reject empty files and files above the configured cap.
            file_size = file_path.stat().st_size
            if file_size == 0:
                return False, f"文件为空: {file_path.name}"
            max_size = self.config.app.max_file_size
            if file_size > max_size:
                size_mb = file_size / (1024 * 1024)
                max_size_mb = max_size / (1024 * 1024)
                return False, f"文件大小 {size_mb:.1f}MB 超过限制 {max_size_mb:.1f}MB: {file_path.name}"

            # Extension comparison is case-insensitive.
            file_ext = file_path.suffix.lower()
            if file_ext not in self.SUPPORTED_EXTENSIONS:
                return False, f"不支持的文件格式 {file_ext}: {file_path.name}"

            if not self._check_mime_type(file_path):
                return False, f"文件内容与扩展名不匹配: {file_path.name}"
            if not self._check_file_integrity(file_path):
                return False, f"文件可能损坏或不完整: {file_path.name}"

            self.logger.info(f"文件验证通过: {file_path.name}")
            return True, None
        except Exception as e:
            error_msg = f"验证文件时发生错误: {file_path.name}, 错误: {e}"
            self.logger.exception(error_msg)
            return False, error_msg

    def validate_multiple_files(self, file_paths: List[Path]) -> Tuple[List[Path], List[Tuple[Path, str]]]:
        """Validate a batch of files.

        Only the first ``config.app.max_files_count`` entries are validated;
        any further entries are skipped (a warning is logged) and appear in
        neither returned list.

        Args:
            file_paths: Paths of the files to check.

        Returns:
            ``(valid_files, invalid_files)`` where ``invalid_files`` holds
            ``(path, error message)`` pairs.  Paths are returned exactly as
            they were passed in.
        """
        limit = self.config.app.max_files_count
        total = len(file_paths)
        over_limit = total > limit
        if over_limit:
            self.logger.warning(f"文件数量 {total} 超过限制 {limit}")

        valid_files: List[Path] = []
        invalid_files: List[Tuple[Path, str]] = []
        for file_path in file_paths[:limit]:
            is_valid, error_msg = self.validate_file(file_path)
            if is_valid:
                valid_files.append(file_path)
            else:
                invalid_files.append((file_path, error_msg))

        if over_limit:
            self.logger.warning(f"跳过了 {total - limit} 个文件(超过批处理限制)")
        self.logger.info(f"文件验证完成: {len(valid_files)} 个有效文件, {len(invalid_files)} 个无效文件")
        return valid_files, invalid_files

    def _check_mime_type(self, file_path: Path) -> bool:
        """Check whether the file looks like a supported media type.

        The probes are tried in order: libmagic (when available), then
        ``mimetypes.guess_type`` (based on the file name), then
        ``_check_file_header``.

        Args:
            file_path: Path of the file to check.

        Returns:
            ``True`` when any probe accepts the file.  If a probe raises, a
            warning is logged and ``True`` is returned, because the extension
            has already been accepted by the caller.
        """
        try:
            if self.magic:
                detected = self.magic.from_file(str(file_path))
                if detected in self.SUPPORTED_MIME_TYPES:
                    return True

            # Fallback: guess from the file name.
            guessed, _ = mimetypes.guess_type(str(file_path))
            if guessed in self.SUPPORTED_MIME_TYPES:
                return True

            # Last resort: look at the leading bytes.
            return self._check_file_header(file_path)
        except Exception as e:
            self.logger.warning(f"检查MIME类型时发生错误: {file_path.name}, 错误: {e}")
            return True

    def _check_file_header(self, file_path: Path) -> bool:
        """Inspect the first 16 bytes of the file for known media signatures.

        Recognised signatures: ``ID3``, ``fLaC``, ``OggS``, ``FF FB`` / ``FF FA``,
        ``ftyp`` at offset 4, and ``RIFF`` followed by ``WAVE``.

        Args:
            file_path: Path of the file to check.

        Returns:
            ``False`` only when nothing could be read from the file.  An
            unrecognised header is still accepted (the extension has already
            been checked); that case is logged at debug level so it can be
            observed.  Read errors are logged as warnings and also yield
            ``True``.
        """
        try:
            with open(file_path, 'rb') as f:
                header = f.read(16)
            if not header:
                return False

            recognised = (
                header.startswith(self._HEADER_PREFIXES)
                or header[4:8] == b'ftyp'
                or (header.startswith(b'RIFF') and b'WAVE' in header)
            )
            if not recognised:
                # Deliberately lenient: not every supported container has a
                # signature listed here, so unknown headers are let through.
                self.logger.debug(f"无法识别文件头, 按扩展名放行: {file_path.name}")
            return True
        except Exception as e:
            self.logger.warning(f"检查文件头时发生错误: {file_path.name}, 错误: {e}")
            return True

    def _check_file_integrity(self, file_path: Path) -> bool:
        """Check that the start and the end of the file can be read.

        This is a basic readability probe (first 1 KiB and last 1 KiB); it
        does not parse or decode the media data.

        Args:
            file_path: Path of the file to check.

        Returns:
            ``True`` when both reads succeed, ``False`` (with a warning
            logged) when opening, seeking or reading fails.
        """
        try:
            with open(file_path, 'rb') as f:
                f.read(1024)
                # whence=2 seeks relative to the end; the returned offset is
                # the file size, so no second stat() call is needed.
                size = f.seek(0, 2)
                f.seek(max(size - 1024, 0))
                f.read()
            return True
        except Exception as e:
            self.logger.warning(f"检查文件完整性时发生错误: {file_path.name}, 错误: {e}")
            return False

    def get_file_info(self, file_path: Path) -> dict:
        """Collect descriptive information about a file.

        Args:
            file_path: Path of the file.  ``str`` / ``os.PathLike`` values are
                accepted as well and converted to ``Path``.

        Returns:
            A dict with the keys ``name``, ``size``, ``size_mb``,
            ``extension``, ``mime_type``, ``modified_time`` and
            ``is_supported``.  If the file cannot be inspected the error is
            logged and a dict with only ``name`` and ``error`` is returned.
        """
        # Coerce first so the error handler below can always use ``.name``.
        file_path = Path(file_path)
        try:
            stat = file_path.stat()

            mime_type = None
            if self.magic:
                try:
                    mime_type = self.magic.from_file(str(file_path))
                except Exception as e:
                    # Best effort: report the failure and fall back to the
                    # name-based guess below instead of hiding it.
                    self.logger.warning(f"libmagic 检测失败: {file_path.name}, 错误: {e}")
            if not mime_type:
                mime_type, _ = mimetypes.guess_type(str(file_path))

            extension = file_path.suffix.lower()
            return {
                'name': file_path.name,
                'size': stat.st_size,
                'size_mb': round(stat.st_size / (1024 * 1024), 2),
                'extension': extension,
                'mime_type': mime_type,
                'modified_time': stat.st_mtime,
                'is_supported': extension in self.SUPPORTED_EXTENSIONS
            }
        except Exception as e:
            self.logger.error(f"获取文件信息失败: {file_path.name}, 错误: {e}")
            return {
                'name': file_path.name,
                'error': str(e)
            }

    def get_supported_formats(self) -> dict:
        """Describe the accepted formats and the configured limits.

        Returns:
            A dict with the sorted ``extensions`` and ``mime_types`` lists,
            ``max_file_size_mb`` (the configured byte limit divided by
            1024 * 1024) and ``max_files_count``.
        """
        return {
            'extensions': sorted(self.SUPPORTED_EXTENSIONS),
            'mime_types': sorted(self.SUPPORTED_MIME_TYPES),
            'max_file_size_mb': self.config.app.max_file_size / (1024 * 1024),
            'max_files_count': self.config.app.max_files_count
        }
# Module-level FileValidator instance, constructed once when this module is imported.
file_validator = FileValidator()
def get_file_validator() -> FileValidator:
    """Return the module-level ``file_validator`` instance.

    Returns:
        The ``FileValidator`` object bound to the module global
        ``file_validator``.
    """
    return file_validator