Spaces:
Building
Building
File size: 7,212 Bytes
5d50ed0 165e2d0 5d50ed0 165e2d0 5d50ed0 165e2d0 5d50ed0 165e2d0 5d50ed0 165e2d0 5d50ed0 911913b 5d50ed0 911913b 5d50ed0 911913b 5d50ed0 911913b 5d50ed0 911913b 5d50ed0 911913b 5d50ed0 911913b 5d50ed0 911913b 5d50ed0 911913b 5d50ed0 911913b 5d50ed0 911913b 5d50ed0 911913b 5d50ed0 911913b c3db99d 165e2d0 5d50ed0 165e2d0 6472323 165e2d0 6472323 165e2d0 5d50ed0 165e2d0 5d50ed0 165e2d0 5d50ed0 165e2d0 5d50ed0 fa1c68b 5d50ed0 fa1c68b 5d50ed0 e90d3a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 |
"""
Deepgram Speech-to-Text Implementation - Simple Batch Mode
"""
from typing import Optional, List
from datetime import datetime
import io
import wave
import aiohttp
import json
from utils.logger import log_info, log_error, log_debug, log_warning
from .stt_interface import STTInterface, STTConfig, TranscriptionResult
class DeepgramSTT(STTInterface):
    """Speech-to-text provider that calls the Deepgram REST API in batch mode.

    Each call to :meth:`transcribe` wraps the supplied audio bytes in a WAV
    container and uploads them in a single HTTP POST to ``base_url``.
    """

    # Caller-supplied language code -> code sent to Deepgram. Defined once at
    # class level so it is not rebuilt on every lookup. Codes missing from
    # this table are forwarded unchanged (see _map_language_code).
    _LANGUAGE_MAP = {
        "tr": "tr",
        "tr-TR": "tr",
        "en": "en-US",
        "en-US": "en-US",
        "en-GB": "en-GB",
        "de": "de",
        "de-DE": "de",
        "fr": "fr",
        "fr-FR": "fr",
        "es": "es",
        "es-ES": "es",
        "it": "it",
        "it-IT": "it",
        "pt": "pt-BR",
        "pt-BR": "pt-BR",
        "ru": "ru",
        "ru-RU": "ru",
        "ja": "ja",
        "ja-JP": "ja",
        "ko": "ko",
        "ko-KR": "ko",
        "zh": "zh-CN",
        "zh-CN": "zh-CN",
        "ar": "ar",
        "ar-SA": "ar",
    }

    def __init__(self, api_key: str):
        """Store the credentials and endpoint used for later requests.

        Args:
            api_key: Deepgram API key, sent in the ``Authorization`` header
                of every transcription request.
        """
        self.api_key = api_key
        self.base_url = "https://api.deepgram.com/v1/listen"
        log_info("✅ Deepgram STT initialized in batch mode")

    def _map_language_code(self, language: str) -> str:
        """Return the Deepgram language code for ``language``.

        Codes that are not in the mapping table are returned unchanged.
        """
        return self._LANGUAGE_MAP.get(language, language)

    def _build_query_params(self, config: STTConfig, language: str) -> dict:
        """Build the query-string parameters for one transcription request.

        Args:
            config: Request settings; reads ``enable_punctuation``, ``model``,
                ``use_enhanced`` and ``enable_word_timestamps``.
            language: Language code already mapped to Deepgram's format.

        Returns:
            A dict whose values are all strings, ready to be URL-encoded.
        """
        params = {
            "language": language,
            "punctuate": str(config.enable_punctuation).lower(),
            # "latest_long" is replaced with "general" before sending.
            "model": config.model if config.model != "latest_long" else "general",
            "tier": "enhanced" if config.use_enhanced else "base",
        }
        if config.enable_word_timestamps:
            params["utterances"] = "true"
            params["words"] = "true"
        # Stringify every value so the HTTP client can encode them uniformly.
        return {key: str(value) for key, value in params.items()}

    def _parse_response(
        self, result: dict, config: STTConfig, language: str
    ) -> Optional[TranscriptionResult]:
        """Extract the first alternative of the first channel from a response.

        Args:
            result: Decoded JSON body of a successful API response.
            config: Request settings; ``enable_word_timestamps`` decides
                whether per-word timings are copied into the result.
            language: Language code that was sent with the request.

        Returns:
            A ``TranscriptionResult``, or ``None`` when the response has no
            results, channels or alternatives.
        """
        results = result.get("results")
        if not results:
            return None
        channels = results.get("channels")
        if not channels:
            return None
        alternatives = channels[0].get("alternatives")
        if not alternatives:
            return None
        alternative = alternatives[0]

        word_timestamps = None
        if config.enable_word_timestamps and alternative.get("words"):
            word_timestamps = [
                {
                    "word": word["word"],
                    "start_time": word["start"],
                    "end_time": word["end"],
                }
                for word in alternative["words"]
            ]

        return TranscriptionResult(
            text=alternative.get("transcript", ""),
            confidence=alternative.get("confidence", 0.0),
            timestamp=datetime.now().timestamp(),
            language=language,
            word_timestamps=word_timestamps,
        )

    async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
        """Transcribe one audio buffer with a single Deepgram API request.

        Args:
            audio_data: Audio samples to transcribe. They are written verbatim
                into a WAV container declared as mono 16-bit at
                ``config.sample_rate``; the format is not verified here.
            config: Language, model and feature settings for the request.

        Returns:
            The transcription, or ``None`` when no audio was supplied, the
            API answered with a non-200 status, the response held no
            transcription, or any exception occurred (errors are logged and
            never propagated to the caller).
        """
        try:
            if not audio_data:
                log_warning("⚠️ No audio data provided")
                return None

            log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")

            wav_audio = self._convert_to_wav(audio_data, config.sample_rate)
            language = self._map_language_code(config.language)
            params = self._build_query_params(config, language)
            headers = {
                "Authorization": f"Token {self.api_key}",
                "Content-Type": "audio/wav",
            }

            log_info("🔄 Sending audio to Deepgram API...")
            async with aiohttp.ClientSession() as session:
                # Passing `params` lets the HTTP client URL-encode the query
                # string instead of concatenating raw values into the URL.
                async with session.post(
                    self.base_url, params=params, headers=headers, data=wav_audio
                ) as response:
                    if response.status != 200:
                        error_text = await response.text()
                        log_error(f"❌ Deepgram API error: {response.status} - {error_text}")
                        return None
                    result = await response.json()

            transcription = self._parse_response(result, config, language)
            if transcription is None:
                log_warning("⚠️ No transcription in response")
                return None

            log_info(f"✅ Transcription: '{transcription.text}' (confidence: {transcription.confidence:.2f})")
            return transcription
        except Exception as e:
            # Boundary handler: callers get None rather than an exception.
            log_error(f"❌ Error during transcription: {str(e)}")
            import traceback
            log_error(f"Traceback: {traceback.format_exc()}")
            return None

    def _convert_to_wav(self, audio_data: bytes, sample_rate: int) -> bytes:
        """Wrap audio frames in an in-memory WAV container.

        Args:
            audio_data: Frame bytes written unchanged after the WAV header.
            sample_rate: Frame rate recorded in the WAV header.

        Returns:
            The complete WAV file (header plus frames) as bytes, declared as
            one channel with 2-byte samples.
        """
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wav_file:
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_data)
        # getvalue() returns the whole buffer regardless of stream position.
        return wav_buffer.getvalue()

    def get_supported_languages(self) -> List[str]:
        """Return the language codes this provider reports as supported."""
        # Language codes supported by Deepgram.
        # Source: https://developers.deepgram.com/docs/models-languages
        return [
            "en", "es", "fr", "de", "it", "pt", "ru", "tr", "pl", "nl",
            "sv", "no", "fi", "da", "ja", "ko", "zh", "hi", "id", "th",
            "uk", "cs", "el", "he", "ar", "fa", "ta", "tl",
        ]

    def get_provider_name(self) -> str:
        """Return the identifier of this STT provider."""
        return "deepgram"