pythonprincess committed on
Commit
4d0d018
·
verified ·
1 Parent(s): 1c7b654

Delete models/translation/translation_utils.py

Browse files
models/translation/translation_utils.py DELETED
@@ -1,621 +0,0 @@
1
- # models/translation/translation_utils.py
2
-
3
- """
4
- Translation Model Utilities for PENNY Project
5
- Handles multilingual translation using NLLB-200 for civic engagement accessibility.
6
- Provides async translation with structured error handling and language code normalization.
7
- """
8
-
9
- import asyncio
10
- import time
11
- from typing import Dict, Any, Optional, List
12
-
13
- # --- Logging Imports ---
14
- from app.logging_utils import log_interaction, sanitize_for_logging
15
-
16
- # --- Model Loader Import ---
17
- try:
18
- from app.model_loader import load_model_pipeline
19
- MODEL_LOADER_AVAILABLE = True
20
- except ImportError:
21
- MODEL_LOADER_AVAILABLE = False
22
- import logging
23
- logging.getLogger(__name__).warning("Could not import load_model_pipeline. Translation service unavailable.")
24
-
25
# Name passed to the model loader when the translation pipeline is requested.
AGENT_NAME = "penny-translate-agent"

# Module-wide cache for the loaded pipeline; remains None until a load succeeds.
TRANSLATION_PIPELINE: Optional[Any] = None

# Set to True on the first load attempt so that the load is tried only once.
INITIALIZATION_ATTEMPTED = False
29
-
30
# NLLB-200 codes paired with the everyday names and short codes that resolve to
# them (common languages for civic engagement). Aliases are stored lower-case.
_NLLB_ALIASES = (
    ("eng_Latn", ("english", "en")),
    ("spa_Latn", ("spanish", "es", "español")),
    ("fra_Latn", ("french", "fr", "français")),
    ("zho_Hans", ("chinese", "mandarin", "zh")),  # Mandarin Chinese
    ("arb_Arab", ("arabic", "ar")),
    ("hin_Deva", ("hindi", "hi")),
    ("por_Latn", ("portuguese", "pt")),
    ("rus_Cyrl", ("russian", "ru")),
    ("deu_Latn", ("german", "de")),
    ("vie_Latn", ("vietnamese", "vi")),
    ("tgl_Latn", ("tagalog", "tl")),
    ("urd_Arab", ("urdu", "ur")),
    ("swh_Latn", ("swahili", "sw")),
)

# Flat lookup table: alias -> NLLB-200 language code.
LANGUAGE_CODES = {
    alias: code
    for code, aliases in _NLLB_ALIASES
    for alias in aliases
}
87
-
88
# Ready-made wording for frequent civic questions, one table per language.
_ENGLISH_CIVIC_PHRASES = dict(
    voting_location="Where is my polling place?",
    voter_registration="How do I register to vote?",
    city_services="What city services are available?",
    report_issue="I want to report a problem.",
    contact_city="How do I contact city hall?",
)

_SPANISH_CIVIC_PHRASES = dict(
    voting_location="¿Dónde está mi lugar de votación?",
    voter_registration="¿Cómo me registro para votar?",
    city_services="¿Qué servicios de la ciudad están disponibles?",
    report_issue="Quiero reportar un problema.",
    contact_city="¿Cómo contacto al ayuntamiento?",
)

# Pre-translated civic phrases, keyed by NLLB-200 code and then by phrase key.
CIVIC_PHRASES = {
    "eng_Latn": _ENGLISH_CIVIC_PHRASES,
    "spa_Latn": _SPANISH_CIVIC_PHRASES,
}
105
-
106
-
107
def _initialize_translation_pipeline() -> bool:
    """
    Load the translation pipeline, making at most one attempt per process.

    The first call records that an attempt was made and then tries to load
    the pipeline named by AGENT_NAME. Later calls never retry; they only
    report whether a pipeline is currently cached. Every outcome is recorded
    through log_interaction under the "translation_initialization" intent.

    Returns:
        bool: True if a pipeline is loaded, False otherwise.
    """
    global TRANSLATION_PIPELINE, INITIALIZATION_ATTEMPTED

    def _report(success, **fields):
        # All log entries in this function share the same intent.
        log_interaction(
            intent="translation_initialization",
            success=success,
            **fields
        )

    if INITIALIZATION_ATTEMPTED:
        return TRANSLATION_PIPELINE is not None
    INITIALIZATION_ATTEMPTED = True

    if not MODEL_LOADER_AVAILABLE:
        _report(False, error="model_loader unavailable")
        return False

    try:
        _report(None, details=f"Loading {AGENT_NAME}")
        TRANSLATION_PIPELINE = load_model_pipeline(AGENT_NAME)

        loaded = TRANSLATION_PIPELINE is not None
        if loaded:
            _report(True, details=f"Model {AGENT_NAME} loaded successfully")
        else:
            _report(False, error="Pipeline returned None")
        return loaded

    except Exception as exc:
        # Any failure while loading (or logging) is reported, not raised.
        _report(False, error=str(exc))
        return False


# Eagerly try to load the pipeline when this module is imported.
_initialize_translation_pipeline()
164
-
165
-
166
def is_translation_available() -> bool:
    """
    Report whether a translation pipeline is currently cached.

    Returns:
        bool: True when TRANSLATION_PIPELINE holds a loaded pipeline,
        False while it is still None.
    """
    pipeline_missing = TRANSLATION_PIPELINE is None
    return not pipeline_missing
174
-
175
-
176
def normalize_language_code(lang: str) -> str:
    """
    Converts common language names/codes to NLLB-200 format.

    Values that already look like NLLB-200 codes (they contain an
    underscore) are returned in canonical casing, i.e. a lower-case language
    part and a capitalised script part ("ENG_LATN" -> "eng_Latn"). NLLB-200
    codes are case-sensitive, so lower-casing the whole value would yield a
    code that matches neither the model's vocabulary nor the keys of
    CIVIC_PHRASES.

    Args:
        lang: Language name or code (e.g., "spanish", "es", "español",
            "spa_Latn"). Surrounding whitespace and letter case are ignored.

    Returns:
        NLLB-200 language code (e.g., "spa_Latn"). Missing, blank or
        non-string input falls back to "eng_Latn". A name that is not in
        LANGUAGE_CODES is returned lower-cased and otherwise unchanged.
    """
    if not lang or not isinstance(lang, str):
        return "eng_Latn"  # Default to English

    cleaned = lang.strip()
    if not cleaned:
        # Whitespace-only input is treated like an empty string.
        return "eng_Latn"

    # Already in NLLB format (contains underscore): only repair the casing.
    if "_" in cleaned:
        language_part, _, script_part = cleaned.partition("_")
        return f"{language_part.lower()}_{script_part.capitalize()}"

    # Look up in mapping; aliases are stored lower-case.
    alias = cleaned.lower()
    return LANGUAGE_CODES.get(alias, alias)
197
-
198
-
199
def get_supported_languages() -> List[str]:
    """
    Get list of supported language codes.

    Returns:
        The distinct NLLB-200 language codes found in LANGUAGE_CODES, sorted
        alphabetically so that the result is the same on every call and in
        every process (plain set iteration order is not guaranteed).
    """
    return sorted(set(LANGUAGE_CODES.values()))
207
-
208
-
209
# Longest input, in characters, that translate_text will send to the model.
_MAX_TRANSLATION_CHARS = 5000

# Translations slower than this (milliseconds) get an extra "translation_slow" log entry.
_SLOW_TRANSLATION_MS = 5000


def _translation_result(
    original_text: Any,
    source_lang: str,
    target_lang: str,
    *,
    translated_text: Any,
    available: bool,
    **extra: Any
) -> Dict[str, Any]:
    """Assemble the response dictionary shared by every translate_text exit path."""
    result: Dict[str, Any] = {
        "translated_text": translated_text,
        "source_lang": source_lang,
        "target_lang": target_lang,
        "original_text": original_text,
        "available": available,
    }
    # Optional keys such as "error", "skipped" or "response_time_ms".
    result.update(extra)
    return result


async def translate_text(
    text: str,
    source_language: str = "eng_Latn",
    target_language: str = "spa_Latn",
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Translates text from source language to target language using the
    loaded translation pipeline.

    The pipeline call is blocking, so it runs in the event loop's default
    executor. Failures never raise (except cancellation, which is logged and
    re-raised); they are reported through the "error" key while
    "translated_text" falls back to the original text.

    Args:
        text: The text to translate (at most 5,000 characters).
        source_language: Source language code (e.g., "eng_Latn", "spanish", "es")
        target_language: Target language code (e.g., "spa_Latn", "french", "fr")
        tenant_id: Optional tenant identifier for logging.

    Returns:
        A dictionary containing:
            - translated_text (str): The translated text, or the original
              text when translation was skipped or failed ("" for invalid input)
            - source_lang (str): Normalized source language code (echoed as
              given when the request is rejected before normalization)
            - target_lang (str): Normalized target language code (same caveat)
            - original_text (str): The input text
            - available (bool): Whether the service was available
            - error (str, optional): Error message if translation failed
            - skipped (bool, optional): True when source and target match
            - response_time_ms (int, optional): Translation time in milliseconds

    Raises:
        asyncio.CancelledError: Re-raised after logging if the task is cancelled.
    """
    start_time = time.time()

    # Check availability
    if not is_translation_available():
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Translation pipeline not available",
            fallback_used=True
        )
        return _translation_result(
            text, source_language, target_language,
            translated_text=text,  # Return original text as fallback
            available=False,
            error="Translation service is temporarily unavailable."
        )

    # Validate input
    if not text or not isinstance(text, str):
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Invalid text input"
        )
        return _translation_result(
            text if isinstance(text, str) else "",
            source_language, target_language,
            translated_text="",
            available=True,
            error="Invalid text input provided."
        )

    # Check text length (prevent processing extremely long texts)
    if len(text) > _MAX_TRANSLATION_CHARS:
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error=f"Text too long: {len(text)} characters",
            text_preview=sanitize_for_logging(text[:100])
        )
        return _translation_result(
            text, source_language, target_language,
            translated_text=text,
            available=True,
            error="Text is too long for translation (max 5,000 characters)."
        )

    # Normalize language codes
    src_lang = normalize_language_code(source_language)
    tgt_lang = normalize_language_code(target_language)

    # Skip translation if source and target are the same
    if src_lang == tgt_lang:
        log_interaction(
            intent="translation_skipped",
            tenant_id=tenant_id,
            success=True,
            details="Source and target languages are identical"
        )
        return _translation_result(
            text, src_lang, tgt_lang,
            translated_text=text,
            available=True,
            skipped=True
        )

    try:
        # Inside a coroutine a loop is always running; get_running_loop()
        # is the documented way to obtain it.
        loop = asyncio.get_running_loop()

        # Run model inference in thread executor.
        # The pipeline is expected to return a dict shaped like:
        # {"translation": "...", "source_lang": "...", "target_lang": "...", "success": True}
        result_dict = await loop.run_in_executor(
            None,
            lambda: TRANSLATION_PIPELINE(
                text,
                source_lang=src_lang,
                target_lang=tgt_lang
            )
        )

        response_time_ms = int((time.time() - start_time) * 1000)

        # Validate results - the pipeline must hand back a non-empty dict
        if not result_dict or not isinstance(result_dict, dict):
            log_interaction(
                intent="translation",
                tenant_id=tenant_id,
                success=False,
                error="Empty or invalid model output",
                response_time_ms=response_time_ms,
                source_lang=src_lang,
                target_lang=tgt_lang
            )
            return _translation_result(
                text, src_lang, tgt_lang,
                translated_text=text,  # Fallback to original
                available=True,
                error="Translation returned unexpected format."
            )

        # Check for error in result
        if not result_dict.get("success", False) or "error" in result_dict:
            error_msg = result_dict.get("error", "Translation failed")
            log_interaction(
                intent="translation",
                tenant_id=tenant_id,
                success=False,
                error=error_msg,
                response_time_ms=response_time_ms,
                source_lang=src_lang,
                target_lang=tgt_lang
            )
            return _translation_result(
                text, src_lang, tgt_lang,
                translated_text=text,  # Fallback to original
                available=False,
                error=error_msg
            )

        # Extract the translation. A missing, None or non-string value is
        # treated as an empty result instead of failing on .strip().
        raw_translation = result_dict.get("translation")
        translated = raw_translation.strip() if isinstance(raw_translation, str) else ""

        if not translated:
            log_interaction(
                intent="translation",
                tenant_id=tenant_id,
                success=False,
                error="Empty translation result",
                response_time_ms=response_time_ms,
                source_lang=src_lang,
                target_lang=tgt_lang
            )
            return _translation_result(
                text, src_lang, tgt_lang,
                translated_text=text,  # Fallback to original
                available=True,
                error="Translation produced empty result."
            )

        # Log slow translations
        if response_time_ms > _SLOW_TRANSLATION_MS:
            log_interaction(
                intent="translation_slow",
                tenant_id=tenant_id,
                success=True,
                response_time_ms=response_time_ms,
                details="Slow translation detected",
                source_lang=src_lang,
                target_lang=tgt_lang,
                text_length=len(text)
            )

        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=True,
            response_time_ms=response_time_ms,
            source_lang=src_lang,
            target_lang=tgt_lang,
            text_length=len(text)
        )

        return _translation_result(
            text, src_lang, tgt_lang,
            translated_text=translated,
            available=True,
            response_time_ms=response_time_ms
        )

    except asyncio.CancelledError:
        # Record the cancellation, then let it propagate to the caller.
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Translation cancelled",
            source_lang=src_lang,
            target_lang=tgt_lang
        )
        raise

    except Exception as e:
        response_time_ms = int((time.time() - start_time) * 1000)

        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error=str(e),
            response_time_ms=response_time_ms,
            source_lang=src_lang,
            target_lang=tgt_lang,
            text_preview=sanitize_for_logging(text[:100]),
            fallback_used=True
        )

        return _translation_result(
            text, src_lang, tgt_lang,
            translated_text=text,  # Fallback to original
            available=False,
            error=str(e),
            response_time_ms=response_time_ms
        )
460
-
461
-
462
# Characters taken as a sign that the text is Spanish (compared lower-case).
_SPANISH_MARKERS = frozenset("¿¡ñáéíóú")

# Unicode ranges tried in order after the Spanish markers: (first, last, NLLB code).
_SCRIPT_RANGES = (
    ("\u4e00", "\u9fff", "zho_Hans"),  # Chinese characters
    ("\u0600", "\u06ff", "arb_Arab"),  # Arabic script
    ("\u0400", "\u04ff", "rus_Cyrl"),  # Cyrillic (Russian)
    ("\u0900", "\u097f", "hin_Deva"),  # Devanagari (Hindi)
)


def _guess_language(text: str) -> str:
    """Guess the NLLB-200 code of text from its characters; English by default."""
    # Lower-case first so that "Ñ", "Á", ... count as Spanish markers too.
    if not _SPANISH_MARKERS.isdisjoint(text.lower()):
        return "spa_Latn"

    for first, last, code in _SCRIPT_RANGES:
        if any(first <= char <= last for char in text):
            return code

    return "eng_Latn"  # Default assumption


async def detect_and_translate(
    text: str,
    target_language: str = "eng_Latn",
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Attempts to detect the source language and translate to target.

    Note: This is a simplified heuristic-based detection. For production,
    consider integrating a dedicated language detection model. Detection
    looks at characters only: Spanish punctuation and accented letters (in
    either case) first, then the Chinese, Arabic, Cyrillic and Devanagari
    ranges, and assumes English when nothing matches.

    Args:
        text: The text to translate
        target_language: Target language code
        tenant_id: Optional tenant identifier for logging

    Returns:
        The dictionary produced by translate_text with an added
        "detected_lang" key. For empty or non-string text an error
        dictionary with "detected_lang" set to "unknown" is returned and
        no translation is attempted.
    """
    if not text or not isinstance(text, str):
        return {
            "translated_text": "",
            "detected_lang": "unknown",
            "target_lang": target_language,
            "original_text": text if isinstance(text, str) else "",
            "available": True,
            "error": "Invalid text input."
        }

    detected_lang = _guess_language(text)

    log_interaction(
        intent="language_detection",
        tenant_id=tenant_id,
        success=True,
        detected_lang=detected_lang,
        text_preview=sanitize_for_logging(text[:50])
    )

    result = await translate_text(text, detected_lang, target_language, tenant_id)
    result["detected_lang"] = detected_lang

    return result
522
-
523
-
524
async def batch_translate(
    texts: List[str],
    source_language: str = "eng_Latn",
    target_language: str = "spa_Latn",
    tenant_id: Optional[str] = None
) -> List[Dict[str, Any]]:
    """
    Translate a list of texts one after another.

    Entries that are not strings or contain only whitespace are dropped, and
    at most the first 50 remaining entries are translated. Each kept entry
    is passed to translate_text in turn.

    Args:
        texts: List of strings to translate
        source_language: Source language code
        target_language: Target language code
        tenant_id: Optional tenant identifier for logging

    Returns:
        One translate_text result dictionary per translated entry, in input
        order; an empty list when the input is not a non-empty list or has
        no usable entries.
    """
    if not (texts and isinstance(texts, list)):
        log_interaction(
            intent="batch_translation",
            tenant_id=tenant_id,
            success=False,
            error="Invalid texts input"
        )
        return []

    # Keep only non-blank strings, then cap the batch size.
    usable = [item for item in texts if isinstance(item, str) and item.strip()]
    if len(usable) > 50:
        del usable[50:]
        log_interaction(
            intent="batch_translation",
            tenant_id=tenant_id,
            success=None,
            details="Batch size limited to 50 texts"
        )

    if not usable:
        log_interaction(
            intent="batch_translation",
            tenant_id=tenant_id,
            success=False,
            error="No valid texts in batch"
        )
        return []

    started = time.time()

    # Awaited inside the comprehension, so the translations run sequentially.
    translations = [
        await translate_text(item, source_language, target_language, tenant_id)
        for item in usable
    ]

    elapsed_ms = int((time.time() - started) * 1000)

    log_interaction(
        intent="batch_translation",
        tenant_id=tenant_id,
        success=True,
        response_time_ms=elapsed_ms,
        batch_size=len(usable),
        source_lang=normalize_language_code(source_language),
        target_lang=normalize_language_code(target_language)
    )

    return translations
591
-
592
-
593
def get_civic_phrase(
    phrase_key: str,
    language: str = "eng_Latn"
) -> str:
    """
    Get a pre-translated civic phrase for common queries.

    The language is normalized with normalize_language_code and then matched
    against the keys of CIVIC_PHRASES without regard to letter case, so the
    lookup succeeds for "eng_Latn", "eng_latn", "en" or "english" alike.
    Successful lookups are logged under the "civic_phrase_lookup" intent.

    Args:
        phrase_key: Key for the civic phrase (e.g., "voting_location")
        language: Target language code

    Returns:
        Translated phrase or empty string if not found
    """
    if not phrase_key or not isinstance(phrase_key, str):
        return ""

    requested = normalize_language_code(language)

    # CIVIC_PHRASES keys use canonical NLLB casing ("eng_Latn"); compare
    # case-insensitively so a differently cased code still finds its table.
    wanted = requested.lower()
    lang_code = next(
        (code for code in CIVIC_PHRASES if code.lower() == wanted),
        requested
    )
    phrase = CIVIC_PHRASES.get(lang_code, {}).get(phrase_key, "")

    if phrase:
        log_interaction(
            intent="civic_phrase_lookup",
            success=True,
            phrase_key=phrase_key,
            language=lang_code
        )

    return phrase