pythonprincess committed on
Commit
a9113e0
·
verified ·
1 Parent(s): d86bd2d

Upload 5 files

Browse files
Files changed (5) hide show
  1. gemma_utils.py +216 -0
  2. layoutlm_utils.py +359 -0
  3. model_config.json +47 -0
  4. sentiment_utils.py +450 -0
  5. translation_utils.py +578 -0
gemma_utils.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # models/gemma/gemma_utils.py
2
+
3
+ """
4
+ Gemma Model Utilities for PENNY Project
5
+ Handles text generation using the Gemma-based core language model via Hugging Face Inference API.
6
+ Provides async generation with structured error handling and logging.
7
+ """
8
+
9
+ import os
10
+ import asyncio
11
+ import time
12
+ import httpx
13
+ from typing import Dict, Any, Optional
14
+
15
+ # --- Logging Imports ---
16
+ from app.logging_utils import log_interaction, sanitize_for_logging
17
+
18
+ # --- Configuration ---
19
+ HF_API_URL = "https://api-inference.huggingface.co/models/google/gemma-7b-it"
20
+ DEFAULT_TIMEOUT = 30.0 # Gemma can take longer to respond
21
+ MAX_RETRIES = 2
22
+ AGENT_NAME = "penny-core-agent"
23
+
24
+
25
def is_gemma_available() -> bool:
    """
    Report whether the Gemma generation service can be used.

    Availability is determined purely by configuration: a non-empty
    HF_TOKEN environment variable must be present. No network probe
    is performed.

    Returns:
        bool: True if HF_TOKEN is configured.
    """
    token = os.getenv("HF_TOKEN")
    return token is not None and token != ""
33
+
34
+
35
async def generate_response(
    prompt: str,
    max_new_tokens: int = 256,
    temperature: float = 0.7,
    tenant_id: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Runs text generation using Gemma via Hugging Face Inference API.

    Never raises on service failure: every error path returns a structured
    fallback dictionary so callers can degrade gracefully.

    Args:
        prompt: The conversational or instruction prompt.
        max_new_tokens: The maximum number of tokens to generate (default: 256).
        temperature: Controls randomness in generation (default: 0.7).
        tenant_id: Optional tenant identifier for logging.

    Returns:
        A dictionary containing:
            - response (str): The generated text
            - available (bool): Whether the service was available
            - error (str, optional): Error message if generation failed
            - response_time_ms (int, optional): Generation time in milliseconds
    """
    start_time = time.time()

    # Read the token at call time (not import time) so a token added to the
    # environment after startup is picked up without a restart.
    HF_TOKEN = os.getenv("HF_TOKEN")
    if not HF_TOKEN:
        log_interaction(
            intent="gemma_generate",
            tenant_id=tenant_id,
            success=False,
            error="HF_TOKEN not configured",
            fallback_used=True
        )
        return {
            "response": "I'm having trouble accessing my language model right now. Please try again in a moment!",
            "available": False,
            "error": "HF_TOKEN not configured"
        }

    # Validate inputs before spending a network round trip.
    if not prompt or not isinstance(prompt, str):
        log_interaction(
            intent="gemma_generate",
            tenant_id=tenant_id,
            success=False,
            error="Invalid prompt provided"
        )
        return {
            "response": "I didn't receive a valid prompt. Could you try again?",
            "available": True,
            "error": "Invalid input"
        }

    # Configure generation parameters. Sampling is only enabled for a
    # positive temperature; at temperature 0 the call is deterministic.
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "do_sample": temperature > 0.0,
            "return_full_text": False
        }
    }

    headers = {
        "Authorization": f"Bearer {HF_TOKEN}",
        "Content-Type": "application/json"
    }

    # Retry loop: timeouts and unexpected exceptions are retried once
    # (MAX_RETRIES total attempts); HTTP status errors are not retried.
    for attempt in range(MAX_RETRIES):
        try:
            async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
                response = await client.post(HF_API_URL, json=payload, headers=headers)
                response.raise_for_status()
                result = response.json()

            response_time_ms = int((time.time() - start_time) * 1000)

            # Expected shape: [{"generated_text": "..."}]
            if isinstance(result, list) and len(result) > 0:
                generated_text = result[0].get("generated_text", "").strip()

                # Emit an extra log record for slow generations (> 5s).
                if response_time_ms > 5000:
                    log_interaction(
                        intent="gemma_generate_slow",
                        tenant_id=tenant_id,
                        success=True,
                        response_time_ms=response_time_ms,
                        details="Slow generation detected"
                    )

                log_interaction(
                    intent="gemma_generate",
                    tenant_id=tenant_id,
                    success=True,
                    response_time_ms=response_time_ms,
                    prompt_preview=sanitize_for_logging(prompt[:100])
                )

                return {
                    "response": generated_text,
                    "available": True,
                    "response_time_ms": response_time_ms
                }

            # The API answered 200 but not with the shape we expect.
            log_interaction(
                intent="gemma_generate",
                tenant_id=tenant_id,
                success=False,
                error="Unexpected API response format",
                response_time_ms=response_time_ms
            )

            return {
                "response": "I got an unexpected response from my language model. Let me try to help you another way!",
                "available": True,
                "error": "Unexpected output format"
            }

        except httpx.TimeoutException:
            if attempt < MAX_RETRIES - 1:
                await asyncio.sleep(1)  # Brief pause before retrying
                continue

            response_time_ms = int((time.time() - start_time) * 1000)
            log_interaction(
                intent="gemma_generate",
                tenant_id=tenant_id,
                success=False,
                error="API timeout after retries",
                response_time_ms=response_time_ms
            )

            return {
                "response": "I'm taking too long to respond. Please try again!",
                "available": False,
                "error": "Timeout",
                "response_time_ms": response_time_ms
            }

        except httpx.HTTPStatusError as e:
            # A definite HTTP error (4xx/5xx): report it without retrying.
            response_time_ms = int((time.time() - start_time) * 1000)
            log_interaction(
                intent="gemma_generate",
                tenant_id=tenant_id,
                success=False,
                error=f"HTTP {e.response.status_code}",
                response_time_ms=response_time_ms
            )

            return {
                "response": "I'm having trouble generating a response right now. Please try again!",
                "available": False,
                "error": f"HTTP {e.response.status_code}",
                "response_time_ms": response_time_ms
            }

        except Exception as e:
            if attempt < MAX_RETRIES - 1:
                await asyncio.sleep(1)
                continue

            response_time_ms = int((time.time() - start_time) * 1000)
            log_interaction(
                intent="gemma_generate",
                tenant_id=tenant_id,
                success=False,
                error=str(e),
                response_time_ms=response_time_ms,
                fallback_used=True
            )

            return {
                "response": "I'm having trouble generating a response right now. Please try again!",
                "available": False,
                "error": str(e),
                "response_time_ms": response_time_ms
            }

    # Defensive fallback: only reachable if MAX_RETRIES is misconfigured
    # to <= 0, in which case the loop body never runs. Previously this
    # path returned None implicitly.
    response_time_ms = int((time.time() - start_time) * 1000)
    return {
        "response": "I'm having trouble generating a response right now. Please try again!",
        "available": False,
        "error": "No generation attempts were made",
        "response_time_ms": response_time_ms
    }
layoutlm_utils.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # models/layoutlm/layoutlm_utils.py
2
+
3
+ """
4
+ LayoutLM Model Utilities for PENNY Project
5
+ Handles document structure extraction and field recognition for civic forms and documents.
6
+ Provides async document processing with structured error handling and logging.
7
+ """
8
+
9
+ import asyncio
10
+ import time
11
+ from typing import Dict, Any, Optional, List
12
+ from io import BytesIO
13
+
14
+ # --- Logging Imports ---
15
+ from app.logging_utils import log_interaction, sanitize_for_logging
16
+
17
+ # --- Model Loader Import ---
18
+ try:
19
+ from app.model_loader import load_model_pipeline
20
+ MODEL_LOADER_AVAILABLE = True
21
+ except ImportError:
22
+ MODEL_LOADER_AVAILABLE = False
23
+ import logging
24
+ logging.getLogger(__name__).warning("Could not import load_model_pipeline. LayoutLM service unavailable.")
25
+
26
+ # Global variable to store the loaded pipeline for re-use
27
+ LAYOUTLM_PIPELINE: Optional[Any] = None
28
+ AGENT_NAME = "penny-doc-agent"
29
+ INITIALIZATION_ATTEMPTED = False
30
+
31
+
32
def _initialize_layoutlm_pipeline() -> bool:
    """
    Load the LayoutLM pipeline exactly once per process.

    After the first call (successful or not), subsequent calls are cheap
    no-ops that merely report whether a usable pipeline was produced.

    Returns:
        bool: True if initialization succeeded, False otherwise.
    """
    global LAYOUTLM_PIPELINE, INITIALIZATION_ATTEMPTED

    # An earlier attempt settles the answer — never retry within a process.
    if INITIALIZATION_ATTEMPTED:
        return LAYOUTLM_PIPELINE is not None

    INITIALIZATION_ATTEMPTED = True

    if not MODEL_LOADER_AVAILABLE:
        log_interaction(
            intent="layoutlm_initialization",
            success=False,
            error="model_loader unavailable"
        )
        return False

    try:
        # success=None marks an in-progress (neither pass nor fail) event.
        log_interaction(
            intent="layoutlm_initialization",
            success=None,
            details=f"Loading {AGENT_NAME}"
        )

        pipeline = load_model_pipeline(AGENT_NAME)
        LAYOUTLM_PIPELINE = pipeline

        if pipeline is None:
            log_interaction(
                intent="layoutlm_initialization",
                success=False,
                error="Pipeline returned None"
            )
            return False

        log_interaction(
            intent="layoutlm_initialization",
            success=True,
            details=f"Model {AGENT_NAME} loaded successfully"
        )
        return True

    except Exception as exc:
        log_interaction(
            intent="layoutlm_initialization",
            success=False,
            error=str(exc)
        )
        return False


# Attempt initialization at module load
_initialize_layoutlm_pipeline()
89
+
90
+
91
def is_layoutlm_available() -> bool:
    """
    Report whether the LayoutLM pipeline is loaded and ready.

    Reflects the outcome of the one-time module-load initialization; it
    does not attempt to (re)load the model.

    Returns:
        bool: True if a pipeline instance is held, False otherwise.
    """
    return LAYOUTLM_PIPELINE is not None
99
+
100
+
101
async def extract_document_data(
    file_bytes: bytes,
    file_name: str,
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Processes a document (e.g., PDF, image) using LayoutLM to extract structured data.

    Never raises on service failure (except cancellation, which is
    re-raised): every error path returns a structured dictionary.

    Args:
        file_bytes: The raw bytes of the uploaded file.
        file_name: The original name of the file (e.g., form.pdf).
        tenant_id: Optional tenant identifier for logging.

    Returns:
        A dictionary containing:
            - status (str): "success" or "error"
            - extracted_fields (dict, optional): Extracted key-value pairs
            - available (bool): Whether the service was available
            - message (str, optional): Error message if extraction failed
            - response_time_ms (int, optional): Processing time in milliseconds
    """
    start_time = time.time()

    global LAYOUTLM_PIPELINE

    # Bail out early if the module-load initialization did not succeed.
    if not is_layoutlm_available():
        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=False,
            error="LayoutLM pipeline not available",
            fallback_used=True
        )
        return {
            "status": "error",
            "available": False,
            "message": "Document processing is temporarily unavailable. Please try uploading your document again in a moment!"
        }

    # Validate inputs
    if not file_bytes or not isinstance(file_bytes, bytes):
        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=False,
            error="Invalid file_bytes provided"
        )
        return {
            "status": "error",
            "available": True,
            "message": "I didn't receive valid document data. Could you try uploading your file again?"
        }

    if not file_name or not isinstance(file_name, str):
        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=False,
            error="Invalid file_name provided"
        )
        return {
            "status": "error",
            "available": True,
            "message": "I need a valid file name to process your document. Please try again!"
        }

    # Check file size (prevent processing extremely large files)
    file_size_mb = len(file_bytes) / (1024 * 1024)
    if file_size_mb > 50:  # 50 MB limit
        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=False,
            error=f"File too large: {file_size_mb:.2f}MB",
            file_name=sanitize_for_logging(file_name)
        )
        return {
            "status": "error",
            "available": True,
            "message": f"Your file is too large ({file_size_mb:.1f}MB). Please upload a document smaller than 50MB."
        }

    try:
        # --- Real-world step (PLACEHOLDER) ---
        # In a real implementation, you would:
        # 1. Use a library (e.g., PyMuPDF, pdf2image) to convert PDF bytes to image(s).
        # 2. Use PIL/Pillow to load the image(s) from bytes.
        # 3. Pass the PIL Image object to the LayoutLM pipeline.

        # For now, we use a simple mock placeholder for the image object:
        image_mock = {
            "file_name": file_name,
            "byte_size": len(file_bytes)
        }

        # get_running_loop() is the correct call inside a coroutine;
        # get_event_loop() here is deprecated since Python 3.10.
        loop = asyncio.get_running_loop()

        # Run the (synchronous) model inference in the default thread
        # executor so the event loop is not blocked.
        results = await loop.run_in_executor(
            None,
            lambda: LAYOUTLM_PIPELINE(image_mock)
        )

        response_time_ms = int((time.time() - start_time) * 1000)

        # Validate results: the pipeline is expected to return a list of dicts.
        if not results or not isinstance(results, list):
            log_interaction(
                intent="layoutlm_extract",
                tenant_id=tenant_id,
                success=False,
                error="Unexpected model output format",
                response_time_ms=response_time_ms,
                file_name=sanitize_for_logging(file_name)
            )
            return {
                "status": "error",
                "available": True,
                "message": "I had trouble understanding the document structure. The file might be corrupted or in an unsupported format."
            }

        # Convert model output (list of dicts) into a clean key-value format
        extracted_data = {}
        for item in results:
            if isinstance(item, dict) and 'label' in item and 'text' in item:
                label_key = item['label'].lower().strip()
                text_value = str(item['text']).strip()

                # Avoid empty values
                if text_value:
                    extracted_data[label_key] = text_value

        # Emit an extra log record for slow processing (> 10s).
        if response_time_ms > 10000:
            log_interaction(
                intent="layoutlm_extract_slow",
                tenant_id=tenant_id,
                success=True,
                response_time_ms=response_time_ms,
                details="Slow document processing detected",
                file_name=sanitize_for_logging(file_name)
            )

        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=True,
            response_time_ms=response_time_ms,
            file_name=sanitize_for_logging(file_name),
            fields_extracted=len(extracted_data)
        )

        return {
            "status": "success",
            "extracted_fields": extracted_data,
            "available": True,
            "response_time_ms": response_time_ms,
            "fields_count": len(extracted_data)
        }

    except asyncio.CancelledError:
        # Log, but never swallow cancellation.
        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=False,
            error="Processing cancelled",
            file_name=sanitize_for_logging(file_name)
        )
        raise

    except Exception as e:
        response_time_ms = int((time.time() - start_time) * 1000)

        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=False,
            error=str(e),
            response_time_ms=response_time_ms,
            file_name=sanitize_for_logging(file_name),
            fallback_used=True
        )

        return {
            "status": "error",
            "available": False,
            "message": "I encountered an issue while processing your document. Please try again, or contact support if this continues!",
            "error": str(e),
            "response_time_ms": response_time_ms
        }
292
+
293
+
294
async def validate_document_fields(
    extracted_fields: Dict[str, str],
    required_fields: List[str],
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Validates that required fields were successfully extracted from a document.

    Field-name comparison is case-insensitive and ignores surrounding
    whitespace. The returned field lists preserve the caller's original
    spelling from ``required_fields``.

    Args:
        extracted_fields: Dictionary of extracted field names and values.
        required_fields: List of field names that must be present.
        tenant_id: Optional tenant identifier for logging.

    Returns:
        A dictionary containing:
            - valid (bool): Whether all required fields are present
            - missing_fields (list): List of missing required fields
            - present_fields (list): List of found required fields
    """
    # Wrong container types are reported as invalid rather than raising.
    if not isinstance(extracted_fields, dict):
        log_interaction(
            intent="layoutlm_validate",
            tenant_id=tenant_id,
            success=False,
            error="Invalid extracted_fields type"
        )
        return {
            "valid": False,
            "missing_fields": required_fields,
            "present_fields": []
        }

    if not isinstance(required_fields, list):
        log_interaction(
            intent="layoutlm_validate",
            tenant_id=tenant_id,
            success=False,
            error="Invalid required_fields type"
        )
        return {
            "valid": False,
            "missing_fields": [],
            "present_fields": []
        }

    # Normalize extracted keys once for case-insensitive membership tests.
    # (The previous version also built a normalized set of required names
    # that was never used; it has been removed.)
    extracted_keys = {k.lower().strip() for k in extracted_fields.keys()}

    present_fields = [f for f in required_fields if f.lower().strip() in extracted_keys]
    missing_fields = [f for f in required_fields if f.lower().strip() not in extracted_keys]

    is_valid = len(missing_fields) == 0

    log_interaction(
        intent="layoutlm_validate",
        tenant_id=tenant_id,
        success=is_valid,
        details=f"Validated {len(present_fields)}/{len(required_fields)} required fields"
    )

    return {
        "valid": is_valid,
        "missing_fields": missing_fields,
        "present_fields": present_fields
    }
model_config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "penny-core-agent": {
3
+ "model_name": "google/gemma-7b-it",
4
+ "task": "text-generation",
5
+ "endpoint": "huggingface-api",
6
+ "api_url": "https://api-inference.huggingface.co/models/google/gemma-7b-it",
7
+ "timeout_seconds": 30,
8
+ "max_retries": 2,
9
+ "description": "Penny's core conversational AI for civic engagement responses"
10
+ },
11
+ "penny-doc-agent": {
12
+ "model_name": "microsoft/layoutlmv3-base",
13
+ "task": "pdf-extraction",
14
+ "endpoint": "huggingface-api",
15
+ "api_url": "https://api-inference.huggingface.co/models/microsoft/layoutlmv3-base",
16
+ "timeout_seconds": 45,
17
+ "max_retries": 2,
18
+ "description": "Document understanding and PDF extraction for civic documents"
19
+ },
20
+ "penny-translate-agent": {
21
+ "model_name": "facebook/nllb-200-distilled-600M",
22
+ "task": "translation",
23
+ "endpoint": "huggingface-api",
24
+ "api_url": "https://api-inference.huggingface.co/models/facebook/nllb-200-distilled-600M",
25
+ "timeout_seconds": 20,
26
+ "max_retries": 2,
27
+ "description": "Multilingual translation service for accessible civic information"
28
+ },
29
+ "penny-sentiment-agent": {
30
+ "model_name": "cardiffnlp/twitter-roberta-base-sentiment",
31
+ "task": "sentiment-analysis",
32
+ "endpoint": "huggingface-api",
33
+ "api_url": "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment",
34
+ "timeout_seconds": 15,
35
+ "max_retries": 2,
36
+ "description": "Sentiment analysis for community feedback and engagement monitoring"
37
+ },
38
+ "penny-bias-checker": {
39
+ "model_name": "facebook/bart-large-mnli",
40
+ "task": "bias-detection",
41
+ "endpoint": "huggingface-api",
42
+ "api_url": "https://api-inference.huggingface.co/models/facebook/bart-large-mnli",
43
+ "timeout_seconds": 20,
44
+ "max_retries": 2,
45
+ "description": "Bias detection to ensure fair and equitable civic information"
46
+ }
47
+ }
sentiment_utils.py ADDED
@@ -0,0 +1,450 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # models/sentiment/sentiment_utils.py
2
+
3
+ """
4
+ Sentiment Analysis Model Utilities for PENNY Project
5
+ Handles text sentiment classification for user input analysis and content moderation.
6
+ Provides async sentiment analysis with structured error handling and logging.
7
+ """
8
+
9
+ import asyncio
10
+ import time
11
+ import os
12
+ import httpx
13
+ from typing import Dict, Any, Optional, List
14
+
15
+ # --- Logging Imports ---
16
+ from app.logging_utils import log_interaction, sanitize_for_logging
17
+
18
+ # --- Hugging Face API Configuration ---
19
+ HF_API_URL = "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment"
20
+ HF_TOKEN = os.getenv("HF_TOKEN")
21
+
22
+ AGENT_NAME = "penny-sentiment-agent"
23
+
24
+
25
def is_sentiment_available() -> bool:
    """
    Report whether the sentiment analysis service is usable.

    The check is purely configuration-based: a non-empty HF_TOKEN must
    have been present in the environment when this module was imported
    (the token is captured at import time into the module-level HF_TOKEN).

    Returns:
        bool: True if sentiment API is configured and ready.
    """
    # bool() treats both None and "" as unavailable, matching the
    # original explicit None/length checks.
    return bool(HF_TOKEN)
33
+
34
+
35
async def get_sentiment_analysis(
    text: str,
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Runs sentiment analysis on the input text via the Hugging Face Inference API.

    Never raises on service failure (except cancellation, which is
    re-raised): every error path returns a structured dictionary with
    label "UNKNOWN" or "ERROR".

    Args:
        text: The string of text to analyze.
        tenant_id: Optional tenant identifier for logging.

    Returns:
        A dictionary containing:
        - label (str): Sentiment label (e.g., "POSITIVE", "NEGATIVE", "NEUTRAL")
        - score (float): Confidence score for the sentiment prediction
        - available (bool): Whether the service was available
        - message (str, optional): Error message if analysis failed
        - response_time_ms (int, optional): Analysis time in milliseconds
    """
    start_time = time.time()

    # Bail out early if no HF_TOKEN was configured at import time.
    if not is_sentiment_available():
        log_interaction(
            intent="sentiment_analysis",
            tenant_id=tenant_id,
            success=False,
            error="Sentiment API not configured (missing HF_TOKEN)",
            fallback_used=True
        )
        return {
            "label": "UNKNOWN",
            "score": 0.0,
            "available": False,
            "message": "Sentiment analysis is temporarily unavailable."
        }

    # Validate input before spending a network round trip.
    if not text or not isinstance(text, str):
        log_interaction(
            intent="sentiment_analysis",
            tenant_id=tenant_id,
            success=False,
            error="Invalid text input"
        )
        return {
            "label": "ERROR",
            "score": 0.0,
            "available": True,
            "message": "Invalid text input provided."
        }

    # Check text length (prevent processing extremely long texts)
    if len(text) > 10000:  # 10k character limit
        log_interaction(
            intent="sentiment_analysis",
            tenant_id=tenant_id,
            success=False,
            error=f"Text too long: {len(text)} characters",
            text_preview=sanitize_for_logging(text[:100])
        )
        return {
            "label": "ERROR",
            "score": 0.0,
            "available": True,
            "message": "Text is too long for sentiment analysis (max 10,000 characters)."
        }

    try:
        # Prepare API request
        headers = {"Authorization": f"Bearer {HF_TOKEN}"}
        payload = {"inputs": text}

        # Call Hugging Face Inference API
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(HF_API_URL, json=payload, headers=headers)

        response_time_ms = int((time.time() - start_time) * 1000)

        # Non-200 statuses are reported as unavailable rather than raised.
        if response.status_code != 200:
            log_interaction(
                intent="sentiment_analysis",
                tenant_id=tenant_id,
                success=False,
                error=f"API returned status {response.status_code}",
                response_time_ms=response_time_ms,
                text_preview=sanitize_for_logging(text[:100]),
                fallback_used=True
            )
            return {
                "label": "ERROR",
                "score": 0.0,
                "available": False,
                "message": f"Sentiment API error: {response.status_code}",
                "response_time_ms": response_time_ms
            }

        results = response.json()

        # Validate results
        # API returns: [[{"label": "LABEL_2", "score": 0.95}, ...]]
        if not results or not isinstance(results, list) or len(results) == 0:
            log_interaction(
                intent="sentiment_analysis",
                tenant_id=tenant_id,
                success=False,
                error="Empty or invalid model output",
                response_time_ms=response_time_ms,
                text_preview=sanitize_for_logging(text[:100])
            )
            return {
                "label": "ERROR",
                "score": 0.0,
                "available": True,
                "message": "Sentiment analysis returned unexpected format."
            }

        # Get the first (highest scoring) result. Tolerates both the
        # nested list-of-lists shape and a flat list of dicts.
        result_list = results[0] if isinstance(results[0], list) else results

        if not result_list or len(result_list) == 0:
            log_interaction(
                intent="sentiment_analysis",
                tenant_id=tenant_id,
                success=False,
                error="Empty result list",
                response_time_ms=response_time_ms,
                text_preview=sanitize_for_logging(text[:100])
            )
            return {
                "label": "ERROR",
                "score": 0.0,
                "available": True,
                "message": "Sentiment analysis returned unexpected format."
            }

        result = result_list[0]

        # Validate result structure before indexing into it.
        if not isinstance(result, dict) or 'label' not in result or 'score' not in result:
            log_interaction(
                intent="sentiment_analysis",
                tenant_id=tenant_id,
                success=False,
                error="Invalid result structure",
                response_time_ms=response_time_ms,
                text_preview=sanitize_for_logging(text[:100])
            )
            return {
                "label": "ERROR",
                "score": 0.0,
                "available": True,
                "message": "Sentiment analysis returned unexpected format."
            }

        # Map RoBERTa labels to readable format; unknown labels pass
        # through unchanged via dict.get.
        # LABEL_0 = NEGATIVE, LABEL_1 = NEUTRAL, LABEL_2 = POSITIVE
        label_mapping = {
            "LABEL_0": "NEGATIVE",
            "LABEL_1": "NEUTRAL",
            "LABEL_2": "POSITIVE"
        }
        label = label_mapping.get(result['label'], result['label'])

        # Emit an extra log record for slow analyses (> 3s).
        if response_time_ms > 3000:  # 3 seconds
            log_interaction(
                intent="sentiment_analysis_slow",
                tenant_id=tenant_id,
                success=True,
                response_time_ms=response_time_ms,
                details="Slow sentiment analysis detected",
                text_length=len(text)
            )

        log_interaction(
            intent="sentiment_analysis",
            tenant_id=tenant_id,
            success=True,
            response_time_ms=response_time_ms,
            sentiment_label=label,
            sentiment_score=result.get('score'),
            text_length=len(text)
        )

        return {
            "label": label,
            "score": float(result['score']),
            "available": True,
            "response_time_ms": response_time_ms
        }

    except httpx.TimeoutException:
        response_time_ms = int((time.time() - start_time) * 1000)
        log_interaction(
            intent="sentiment_analysis",
            tenant_id=tenant_id,
            success=False,
            error="Sentiment analysis request timed out",
            response_time_ms=response_time_ms,
            text_preview=sanitize_for_logging(text[:100]),
            fallback_used=True
        )
        return {
            "label": "ERROR",
            "score": 0.0,
            "available": False,
            "message": "Sentiment analysis request timed out.",
            "response_time_ms": response_time_ms
        }

    except asyncio.CancelledError:
        # Log, but never swallow cancellation.
        log_interaction(
            intent="sentiment_analysis",
            tenant_id=tenant_id,
            success=False,
            error="Analysis cancelled"
        )
        raise

    except Exception as e:
        response_time_ms = int((time.time() - start_time) * 1000)

        log_interaction(
            intent="sentiment_analysis",
            tenant_id=tenant_id,
            success=False,
            error=str(e),
            response_time_ms=response_time_ms,
            text_preview=sanitize_for_logging(text[:100]),
            fallback_used=True
        )

        return {
            "label": "ERROR",
            "score": 0.0,
            "available": False,
            "message": "An error occurred during sentiment analysis.",
            "error": str(e),
            "response_time_ms": response_time_ms
        }
276
+
277
+
278
async def analyze_sentiment_batch(
    texts: List[str],
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Runs sentiment analysis on a batch of texts for efficiency.

    Non-string and blank entries are silently dropped, and the batch is
    silently truncated to the first 100 valid texts. Never raises on
    service failure: every error path returns a structured dictionary.

    Args:
        texts: List of text strings to analyze.
        tenant_id: Optional tenant identifier for logging.

    Returns:
        A dictionary containing:
        - results (list): List of sentiment analysis results for each text
        - available (bool): Whether the service was available
        - total_analyzed (int): Number of texts successfully analyzed
        - response_time_ms (int, optional): Total batch analysis time
    """
    start_time = time.time()

    # Bail out early if no HF_TOKEN was configured at import time.
    if not is_sentiment_available():
        log_interaction(
            intent="sentiment_batch_analysis",
            tenant_id=tenant_id,
            success=False,
            error="Sentiment API not configured (missing HF_TOKEN)",
            batch_size=len(texts) if texts else 0
        )
        return {
            "results": [],
            "available": False,
            "total_analyzed": 0,
            "message": "Sentiment analysis is temporarily unavailable."
        }

    # Validate input container type before any processing.
    if not texts or not isinstance(texts, list):
        log_interaction(
            intent="sentiment_batch_analysis",
            tenant_id=tenant_id,
            success=False,
            error="Invalid texts input"
        )
        return {
            "results": [],
            "available": True,
            "total_analyzed": 0,
            "message": "Invalid batch input provided."
        }

    # Filter valid texts and limit batch size
    valid_texts = [t for t in texts if isinstance(t, str) and t.strip()]
    if len(valid_texts) > 100:  # Batch size limit
        valid_texts = valid_texts[:100]

    if not valid_texts:
        log_interaction(
            intent="sentiment_batch_analysis",
            tenant_id=tenant_id,
            success=False,
            error="No valid texts in batch"
        )
        return {
            "results": [],
            "available": True,
            "total_analyzed": 0,
            "message": "No valid texts provided for analysis."
        }

    try:
        # Prepare API request with batch input
        headers = {"Authorization": f"Bearer {HF_TOKEN}"}
        payload = {"inputs": valid_texts}

        # Call Hugging Face Inference API
        async with httpx.AsyncClient(timeout=60.0) as client:  # Longer timeout for batch
            response = await client.post(HF_API_URL, json=payload, headers=headers)

        response_time_ms = int((time.time() - start_time) * 1000)

        # Non-200 statuses are reported as unavailable rather than raised.
        if response.status_code != 200:
            log_interaction(
                intent="sentiment_batch_analysis",
                tenant_id=tenant_id,
                success=False,
                error=f"API returned status {response.status_code}",
                response_time_ms=response_time_ms,
                batch_size=len(valid_texts)
            )
            return {
                "results": [],
                "available": False,
                "total_analyzed": 0,
                "message": f"Sentiment API error: {response.status_code}",
                "response_time_ms": response_time_ms
            }

        results = response.json()

        # Process results and map labels. Items that do not match the
        # expected [{"label": ..., "score": ...}, ...] shape are skipped,
        # so processed_results may be shorter than valid_texts.
        label_mapping = {
            "LABEL_0": "NEGATIVE",
            "LABEL_1": "NEUTRAL",
            "LABEL_2": "POSITIVE"
        }

        processed_results = []
        if results and isinstance(results, list):
            for item in results:
                if isinstance(item, list) and len(item) > 0:
                    # Only the top-scoring prediction per text is kept.
                    top_result = item[0]
                    if isinstance(top_result, dict) and 'label' in top_result:
                        processed_results.append({
                            "label": label_mapping.get(top_result['label'], top_result['label']),
                            "score": float(top_result.get('score', 0.0))
                        })

        log_interaction(
            intent="sentiment_batch_analysis",
            tenant_id=tenant_id,
            success=True,
            response_time_ms=response_time_ms,
            batch_size=len(valid_texts),
            total_analyzed=len(processed_results)
        )

        return {
            "results": processed_results,
            "available": True,
            "total_analyzed": len(processed_results),
            "response_time_ms": response_time_ms
        }

    except httpx.TimeoutException:
        response_time_ms = int((time.time() - start_time) * 1000)
        log_interaction(
            intent="sentiment_batch_analysis",
            tenant_id=tenant_id,
            success=False,
            error="Batch sentiment analysis timed out",
            response_time_ms=response_time_ms,
            batch_size=len(valid_texts)
        )
        return {
            "results": [],
            "available": False,
            "total_analyzed": 0,
            "message": "Batch sentiment analysis timed out.",
            "error": "Request timeout",
            "response_time_ms": response_time_ms
        }

    except Exception as e:
        response_time_ms = int((time.time() - start_time) * 1000)

        log_interaction(
            intent="sentiment_batch_analysis",
            tenant_id=tenant_id,
            success=False,
            error=str(e),
            response_time_ms=response_time_ms,
            batch_size=len(valid_texts)
        )

        return {
            "results": [],
            "available": False,
            "total_analyzed": 0,
            "message": "An error occurred during batch sentiment analysis.",
            "error": str(e),
            "response_time_ms": response_time_ms
        }
translation_utils.py ADDED
@@ -0,0 +1,578 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # models/translation/translation_utils.py
2
+
3
+ """
4
+ Translation Model Utilities for PENNY Project
5
+ Handles multilingual translation using NLLB-200 for civic engagement accessibility.
6
+ Provides async translation with structured error handling and language code normalization.
7
+ """
8
+
9
+ import asyncio
10
+ import time
11
+ import os
12
+ import httpx
13
+ from typing import Dict, Any, Optional, List
14
+
15
+ # --- Logging Imports ---
16
+ from app.logging_utils import log_interaction, sanitize_for_logging
17
+
18
+ # --- Hugging Face API Configuration ---
19
+ HF_API_URL = "https://api-inference.huggingface.co/models/facebook/nllb-200-distilled-600M"
20
+ HF_TOKEN = os.getenv("HF_TOKEN")
21
+
22
+ AGENT_NAME = "penny-translate-agent"
23
+ SERVICE_AVAILABLE = True # Assume available since we're using API
24
+
25
# NLLB-200 Language Code Mapping (Common languages for civic engagement)
# Keys are lowercase human-readable names / short codes as users might type
# them; values are NLLB-200 "lang_Script" identifiers. NOTE: the values are
# case-sensitive ("spa_Latn", not "spa_latn") — they are passed verbatim to
# the inference API as src_lang / tgt_lang.
LANGUAGE_CODES = {
    # English variants
    "english": "eng_Latn",
    "en": "eng_Latn",

    # Spanish variants
    "spanish": "spa_Latn",
    "es": "spa_Latn",
    "español": "spa_Latn",

    # French
    "french": "fra_Latn",
    "fr": "fra_Latn",
    "français": "fra_Latn",

    # Mandarin Chinese
    "chinese": "zho_Hans",
    "mandarin": "zho_Hans",
    "zh": "zho_Hans",

    # Arabic
    "arabic": "arb_Arab",
    "ar": "arb_Arab",

    # Hindi
    "hindi": "hin_Deva",
    "hi": "hin_Deva",

    # Portuguese
    "portuguese": "por_Latn",
    "pt": "por_Latn",

    # Russian
    "russian": "rus_Cyrl",
    "ru": "rus_Cyrl",

    # German
    "german": "deu_Latn",
    "de": "deu_Latn",

    # Vietnamese
    "vietnamese": "vie_Latn",
    "vi": "vie_Latn",

    # Tagalog
    "tagalog": "tgl_Latn",
    "tl": "tgl_Latn",

    # Urdu
    "urdu": "urd_Arab",
    "ur": "urd_Arab",

    # Swahili
    "swahili": "swh_Latn",
    "sw": "swh_Latn",
}

# Pre-translated civic phrases for common queries.
# Outer key: NLLB-200 language code; inner key: phrase identifier used by
# get_civic_phrase(). Only English and Spanish are pre-translated so far;
# other languages fall through to an empty-string lookup miss.
CIVIC_PHRASES = {
    "eng_Latn": {
        "voting_location": "Where is my polling place?",
        "voter_registration": "How do I register to vote?",
        "city_services": "What city services are available?",
        "report_issue": "I want to report a problem.",
        "contact_city": "How do I contact city hall?",
    },
    "spa_Latn": {
        "voting_location": "¿Dónde está mi lugar de votación?",
        "voter_registration": "¿Cómo me registro para votar?",
        "city_services": "¿Qué servicios de la ciudad están disponibles?",
        "report_issue": "Quiero reportar un problema.",
        "contact_city": "¿Cómo contacto al ayuntamiento?",
    }
}
100
+
101
+
102
def is_translation_available() -> bool:
    """
    Check if translation service is available.

    Reads HF_TOKEN from the environment at call time rather than relying on
    the module-level value cached at import — consistent with
    is_gemma_available() in gemma_utils.py, and reflects configuration
    changes made after the module was imported.

    Returns:
        bool: True if translation API is configured and ready.
    """
    return bool(os.getenv("HF_TOKEN"))
110
+
111
+
112
def normalize_language_code(lang: str) -> str:
    """
    Converts common language names/codes to NLLB-200 format.

    Args:
        lang: Language name or code (e.g., "spanish", "es", "español")

    Returns:
        NLLB-200 language code (e.g., "spa_Latn"). Unknown inputs are
        returned lowercased/stripped; missing or non-string inputs default
        to "eng_Latn".
    """
    if not lang or not isinstance(lang, str):
        return "eng_Latn"  # Default to English

    candidate = lang.strip()

    # Already in NLLB format (contains an underscore). Preserve the
    # original casing: NLLB codes are case-sensitive ("spa_Latn"), so
    # lowercasing here — as the previous implementation did — produced
    # codes like "spa_latn" that the model does not recognize.
    if "_" in candidate:
        return candidate

    # Look up the lowercased name/code in the mapping; fall back to the
    # lowercased input unchanged (matches prior behavior for unknowns).
    return LANGUAGE_CODES.get(candidate.lower(), candidate.lower())
133
+
134
+
135
def get_supported_languages() -> List[str]:
    """
    Get list of supported language codes.

    Returns:
        Sorted list of unique NLLB-200 language codes supported by PENNY.
        Sorting makes the result deterministic across runs (set iteration
        order is not), keeping API responses and tests stable.
    """
    return sorted(set(LANGUAGE_CODES.values()))
143
+
144
+
145
async def translate_text(
    text: str,
    source_language: str = "eng_Latn",
    target_language: str = "spa_Latn",
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Translates text from source language to target language using NLLB-200.

    Failure philosophy: every error path logs the failure and returns the
    ORIGINAL text as "translated_text" (except invalid input, which returns
    ""), so callers always have something displayable. Only task
    cancellation is re-raised. "available" is False for service-level
    failures (missing token, API error, timeout, unexpected exception) and
    True for input/output problems (bad input, too long, empty result).

    Args:
        text: The text to translate.
        source_language: Source language code (e.g., "eng_Latn", "spanish", "es")
        target_language: Target language code (e.g., "spa_Latn", "french", "fr")
        tenant_id: Optional tenant identifier for logging.

    Returns:
        A dictionary containing:
        - translated_text (str): The translated text
        - source_lang (str): Normalized source language code
        - target_lang (str): Normalized target language code
        - original_text (str): The input text
        - available (bool): Whether the service was available
        - error (str, optional): Error message if translation failed
        - response_time_ms (int, optional): Translation time in milliseconds
        - skipped (bool, optional): True when source == target and no API call was made
    """
    # Wall-clock start; used to compute response_time_ms on every exit path.
    start_time = time.time()

    # Check availability
    if not is_translation_available():
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Translation API not configured (missing HF_TOKEN)",
            fallback_used=True
        )
        # NOTE: on this early exit the language codes are returned
        # un-normalized (normalization happens further down).
        return {
            "translated_text": text,  # Return original text as fallback
            "source_lang": source_language,
            "target_lang": target_language,
            "original_text": text,
            "available": False,
            "error": "Translation service is temporarily unavailable."
        }

    # Validate input
    if not text or not isinstance(text, str):
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Invalid text input"
        )
        return {
            "translated_text": "",
            "source_lang": source_language,
            "target_lang": target_language,
            "original_text": text if isinstance(text, str) else "",
            "available": True,
            "error": "Invalid text input provided."
        }

    # Check text length (prevent processing extremely long texts)
    if len(text) > 5000:  # 5k character limit for translation
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error=f"Text too long: {len(text)} characters",
            text_preview=sanitize_for_logging(text[:100])
        )
        return {
            "translated_text": text,
            "source_lang": source_language,
            "target_lang": target_language,
            "original_text": text,
            "available": True,
            "error": "Text is too long for translation (max 5,000 characters)."
        }

    # Normalize language codes
    src_lang = normalize_language_code(source_language)
    tgt_lang = normalize_language_code(target_language)

    # Skip translation if source and target are the same
    if src_lang == tgt_lang:
        log_interaction(
            intent="translation_skipped",
            tenant_id=tenant_id,
            success=True,
            details="Source and target languages are identical"
        )
        return {
            "translated_text": text,
            "source_lang": src_lang,
            "target_lang": tgt_lang,
            "original_text": text,
            "available": True,
            "skipped": True
        }

    try:
        # Prepare API request. HF_TOKEN is the module-level value read from
        # the environment at import time.
        headers = {"Authorization": f"Bearer {HF_TOKEN}"}
        payload = {
            "inputs": text,
            "parameters": {
                "src_lang": src_lang,
                "tgt_lang": tgt_lang
            }
        }

        # Call Hugging Face Inference API (single attempt, no retries;
        # 30 s total timeout enforced by the client).
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(HF_API_URL, json=payload, headers=headers)

        response_time_ms = int((time.time() - start_time) * 1000)

        if response.status_code != 200:
            log_interaction(
                intent="translation",
                tenant_id=tenant_id,
                success=False,
                error=f"API returned status {response.status_code}",
                response_time_ms=response_time_ms,
                source_lang=src_lang,
                target_lang=tgt_lang,
                fallback_used=True
            )
            return {
                "translated_text": text,  # Fallback to original
                "source_lang": src_lang,
                "target_lang": tgt_lang,
                "original_text": text,
                "available": False,
                "error": f"Translation API error: {response.status_code}",
                "response_time_ms": response_time_ms
            }

        results = response.json()

        # Validate results
        if not results or not isinstance(results, list) or len(results) == 0:
            log_interaction(
                intent="translation",
                tenant_id=tenant_id,
                success=False,
                error="Empty or invalid model output",
                response_time_ms=response_time_ms,
                source_lang=src_lang,
                target_lang=tgt_lang
            )
            return {
                "translated_text": text,  # Fallback to original
                "source_lang": src_lang,
                "target_lang": tgt_lang,
                "original_text": text,
                "available": True,
                "error": "Translation returned unexpected format."
            }

        # NLLB returns format: [{'translation_text': '...'}]
        # NOTE(review): assumes results[0] is a dict; a non-dict element
        # would raise AttributeError and be caught by the generic handler.
        translated = results[0].get('translation_text', '').strip()

        if not translated:
            log_interaction(
                intent="translation",
                tenant_id=tenant_id,
                success=False,
                error="Empty translation result",
                response_time_ms=response_time_ms,
                source_lang=src_lang,
                target_lang=tgt_lang
            )
            return {
                "translated_text": text,  # Fallback to original
                "source_lang": src_lang,
                "target_lang": tgt_lang,
                "original_text": text,
                "available": True,
                "error": "Translation produced empty result."
            }

        # Log slow translations (extra observability event; the success
        # event below is still emitted as well)
        if response_time_ms > 5000:  # 5 seconds
            log_interaction(
                intent="translation_slow",
                tenant_id=tenant_id,
                success=True,
                response_time_ms=response_time_ms,
                details="Slow translation detected",
                source_lang=src_lang,
                target_lang=tgt_lang,
                text_length=len(text)
            )

        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=True,
            response_time_ms=response_time_ms,
            source_lang=src_lang,
            target_lang=tgt_lang,
            text_length=len(text)
        )

        return {
            "translated_text": translated,
            "source_lang": src_lang,
            "target_lang": tgt_lang,
            "original_text": text,
            "available": True,
            "response_time_ms": response_time_ms
        }

    except httpx.TimeoutException:
        response_time_ms = int((time.time() - start_time) * 1000)
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Translation request timed out",
            response_time_ms=response_time_ms,
            source_lang=src_lang,
            target_lang=tgt_lang,
            fallback_used=True
        )
        return {
            "translated_text": text,  # Fallback to original
            "source_lang": src_lang,
            "target_lang": tgt_lang,
            "original_text": text,
            "available": False,
            "error": "Translation request timed out.",
            "response_time_ms": response_time_ms
        }

    except asyncio.CancelledError:
        # Cancellation must propagate so the event loop can tear the task
        # down; log the event but do not swallow it.
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Translation cancelled",
            source_lang=src_lang,
            target_lang=tgt_lang
        )
        raise

    except Exception as e:
        # Catch-all boundary: any unexpected failure (network, JSON decode,
        # malformed payload) degrades to returning the original text.
        response_time_ms = int((time.time() - start_time) * 1000)

        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error=str(e),
            response_time_ms=response_time_ms,
            source_lang=src_lang,
            target_lang=tgt_lang,
            text_preview=sanitize_for_logging(text[:100]),
            fallback_used=True
        )

        return {
            "translated_text": text,  # Fallback to original
            "source_lang": src_lang,
            "target_lang": tgt_lang,
            "original_text": text,
            "available": False,
            "error": str(e),
            "response_time_ms": response_time_ms
        }
417
+
418
+
419
async def detect_and_translate(
    text: str,
    target_language: str = "eng_Latn",
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Attempts to detect the source language and translate to target.

    Detection is purely character/script based (no ML model): Spanish
    punctuation and accents are checked first, then CJK, Arabic, Cyrillic
    and Devanagari code-point ranges, defaulting to English. For production,
    consider integrating a dedicated language detection model.

    Args:
        text: The text to translate
        target_language: Target language code
        tenant_id: Optional tenant identifier for logging

    Returns:
        Translation result dictionary (includes "detected_lang")
    """
    if not text or not isinstance(text, str):
        return {
            "translated_text": "",
            "detected_lang": "unknown",
            "target_lang": target_language,
            "original_text": text if isinstance(text, str) else "",
            "available": True,
            "error": "Invalid text input."
        }

    detected = "eng_Latn"  # default assumption

    spanish_markers = ('¿', '¡', 'ñ', 'á', 'é', 'í', 'ó', 'ú')
    # Unicode block ranges, checked in priority order after Spanish markers.
    script_ranges = (
        ("\u4e00", "\u9fff", "zho_Hans"),  # CJK Unified Ideographs
        ("\u0600", "\u06ff", "arb_Arab"),  # Arabic script
        ("\u0400", "\u04ff", "rus_Cyrl"),  # Cyrillic
        ("\u0900", "\u097f", "hin_Deva"),  # Devanagari
    )

    if any(marker in text for marker in spanish_markers):
        detected = "spa_Latn"
    else:
        for low, high, code in script_ranges:
            if any(low <= ch <= high for ch in text):
                detected = code
                break

    log_interaction(
        intent="language_detection",
        tenant_id=tenant_id,
        success=True,
        detected_lang=detected,
        text_preview=sanitize_for_logging(text[:50])
    )

    result = await translate_text(text, detected, target_language, tenant_id)
    result["detected_lang"] = detected
    return result
479
+
480
+
481
async def batch_translate(
    texts: List[str],
    source_language: str = "eng_Latn",
    target_language: str = "spa_Latn",
    tenant_id: Optional[str] = None
) -> List[Dict[str, Any]]:
    """
    Translate multiple texts at once.

    Texts are translated sequentially (one API call each) rather than
    concurrently, keeping load on the inference API predictable.

    Args:
        texts: List of strings to translate
        source_language: Source language code
        target_language: Target language code
        tenant_id: Optional tenant identifier for logging

    Returns:
        List of translation result dictionaries, one per valid input text,
        in input order. Non-string or blank entries are dropped, and at
        most 50 texts are processed per call.
    """
    if not texts or not isinstance(texts, list):
        log_interaction(
            intent="batch_translation",
            tenant_id=tenant_id,
            success=False,
            error="Invalid texts input"
        )
        return []

    # Filter valid texts and limit batch size
    valid_texts = [t for t in texts if isinstance(t, str) and t.strip()]
    batch_limit = 50
    if len(valid_texts) > batch_limit:
        valid_texts = valid_texts[:batch_limit]
        # NOTE(review): success=None marks this as an informational event —
        # confirm log_interaction accepts None for success.
        log_interaction(
            intent="batch_translation",
            tenant_id=tenant_id,
            success=None,
            details=f"Batch size limited to {batch_limit} texts"
        )

    if not valid_texts:
        log_interaction(
            intent="batch_translation",
            tenant_id=tenant_id,
            success=False,
            error="No valid texts in batch"
        )
        return []

    start_time = time.time()
    results = []

    # Sequential awaits: each item completes (or falls back) independently
    # via translate_text's internal error handling.
    for text in valid_texts:
        result = await translate_text(text, source_language, target_language, tenant_id)
        results.append(result)

    response_time_ms = int((time.time() - start_time) * 1000)

    log_interaction(
        intent="batch_translation",
        tenant_id=tenant_id,
        success=True,
        response_time_ms=response_time_ms,
        batch_size=len(valid_texts),
        source_lang=normalize_language_code(source_language),
        target_lang=normalize_language_code(target_language)
    )

    return results
548
+
549
+
550
def get_civic_phrase(
    phrase_key: str,
    language: str = "eng_Latn"
) -> str:
    """
    Get a pre-translated civic phrase for common queries.

    Args:
        phrase_key: Key for the civic phrase (e.g., "voting_location")
        language: Target language code

    Returns:
        Translated phrase or empty string if not found
    """
    # Reject missing or non-string keys up front.
    if not phrase_key or not isinstance(phrase_key, str):
        return ""

    code = normalize_language_code(language)
    phrases_for_language = CIVIC_PHRASES.get(code, {})
    phrase = phrases_for_language.get(phrase_key, "")

    # Only successful lookups are logged; misses return "" silently.
    if phrase:
        log_interaction(
            intent="civic_phrase_lookup",
            success=True,
            phrase_key=phrase_key,
            language=code
        )

    return phrase