Spaces:
Running
Running
Fix for catching Gradio FileData objects when they are passed from API calls as strings
Browse files- __pycache__/asl_gloss.cpython-311.pyc +0 -0
- __pycache__/document_parsing.cpython-311.pyc +0 -0
- app.py +17 -9
- asl_gloss.py +0 -1
- document_parsing.py +55 -1
__pycache__/asl_gloss.cpython-311.pyc
CHANGED
|
Binary files a/__pycache__/asl_gloss.cpython-311.pyc and b/__pycache__/asl_gloss.cpython-311.pyc differ
|
|
|
__pycache__/document_parsing.cpython-311.pyc
CHANGED
|
Binary files a/__pycache__/document_parsing.cpython-311.pyc and b/__pycache__/document_parsing.cpython-311.pyc differ
|
|
|
app.py
CHANGED
|
@@ -199,7 +199,8 @@ def cleanup_temp_video(file_path):
|
|
| 199 |
def determine_input_type(input_data):
|
| 200 |
"""
|
| 201 |
Determine the type of input data and return a standardized format.
|
| 202 |
-
Returns: (input_type, processed_data) where input_type is 'text',
|
|
|
|
| 203 |
"""
|
| 204 |
if isinstance(input_data, str):
|
| 205 |
# Check if it's a file path (contains file extension)
|
|
@@ -209,11 +210,20 @@ def determine_input_type(input_data):
|
|
| 209 |
elif input_data.startswith('{') and 'gradio.FileData' in input_data:
|
| 210 |
try:
|
| 211 |
import ast
|
| 212 |
-
|
| 213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
if isinstance(file_data, dict) and 'path' in file_data:
|
|
|
|
| 215 |
return 'file_path', file_data['path']
|
| 216 |
-
except (ValueError, SyntaxError):
|
|
|
|
|
|
|
| 217 |
pass
|
| 218 |
else:
|
| 219 |
return 'text', input_data.strip()
|
|
@@ -255,9 +265,7 @@ def process_input(input_data):
|
|
| 255 |
async def parse_vectorize_and_search_unified(input_data):
|
| 256 |
"""
|
| 257 |
Unified function that handles both text and file inputs
|
| 258 |
-
"""
|
| 259 |
-
print(f"Input type: {type(input_data)}")
|
| 260 |
-
|
| 261 |
# Process the input to get gloss
|
| 262 |
gloss = process_input(input_data)
|
| 263 |
if not gloss:
|
|
@@ -356,7 +364,6 @@ def predict_unified(input_data):
|
|
| 356 |
"message": "Please provide text or upload a document"
|
| 357 |
}, None
|
| 358 |
|
| 359 |
-
print("Input", input_data, type(input_data))
|
| 360 |
# Use the unified processing function
|
| 361 |
result = parse_vectorize_and_search_unified_sync(input_data)
|
| 362 |
|
|
@@ -444,7 +451,8 @@ def predict(text, file):
|
|
| 444 |
"message": "Please provide either text or upload a file"
|
| 445 |
}, None
|
| 446 |
|
| 447 |
-
print("Input", input_data)
|
|
|
|
| 448 |
# Process using the unified function
|
| 449 |
return predict_unified(input_data)
|
| 450 |
|
|
|
|
| 199 |
def determine_input_type(input_data):
|
| 200 |
"""
|
| 201 |
Determine the type of input data and return a standardized format.
|
| 202 |
+
Returns: (input_type, processed_data) where input_type is 'text',
|
| 203 |
+
'file_path', or 'file_object'
|
| 204 |
"""
|
| 205 |
if isinstance(input_data, str):
|
| 206 |
# Check if it's a file path (contains file extension)
|
|
|
|
| 210 |
elif input_data.startswith('{') and 'gradio.FileData' in input_data:
|
| 211 |
try:
|
| 212 |
import ast
|
| 213 |
+
import json
|
| 214 |
+
# Try to parse as JSON first
|
| 215 |
+
try:
|
| 216 |
+
file_data = json.loads(input_data)
|
| 217 |
+
except json.JSONDecodeError:
|
| 218 |
+
# Fall back to ast.literal_eval for safer parsing
|
| 219 |
+
file_data = ast.literal_eval(input_data)
|
| 220 |
+
|
| 221 |
if isinstance(file_data, dict) and 'path' in file_data:
|
| 222 |
+
print(f"Parsed FileData: {file_data}")
|
| 223 |
return 'file_path', file_data['path']
|
| 224 |
+
except (ValueError, SyntaxError, json.JSONDecodeError) as e:
|
| 225 |
+
print(f"Error parsing FileData string: {e}")
|
| 226 |
+
print(f"Input data: {input_data}")
|
| 227 |
pass
|
| 228 |
else:
|
| 229 |
return 'text', input_data.strip()
|
|
|
|
| 265 |
async def parse_vectorize_and_search_unified(input_data):
|
| 266 |
"""
|
| 267 |
Unified function that handles both text and file inputs
|
| 268 |
+
"""
|
|
|
|
|
|
|
| 269 |
# Process the input to get gloss
|
| 270 |
gloss = process_input(input_data)
|
| 271 |
if not gloss:
|
|
|
|
| 364 |
"message": "Please provide text or upload a document"
|
| 365 |
}, None
|
| 366 |
|
|
|
|
| 367 |
# Use the unified processing function
|
| 368 |
result = parse_vectorize_and_search_unified_sync(input_data)
|
| 369 |
|
|
|
|
| 451 |
"message": "Please provide either text or upload a file"
|
| 452 |
}, None
|
| 453 |
|
| 454 |
+
print("Input to the prediction function", input_data)
|
| 455 |
+
print("Input type:", type(input))
|
| 456 |
# Process using the unified function
|
| 457 |
return predict_unified(input_data)
|
| 458 |
|
asl_gloss.py
CHANGED
|
@@ -10,7 +10,6 @@ that preserves the spatial and grammatical structure of ASL.
|
|
| 10 |
import os
|
| 11 |
import sys
|
| 12 |
import argparse
|
| 13 |
-
import json
|
| 14 |
from typing import Optional, Dict, Any
|
| 15 |
from pathlib import Path
|
| 16 |
|
|
|
|
| 10 |
import os
|
| 11 |
import sys
|
| 12 |
import argparse
|
|
|
|
| 13 |
from typing import Optional, Dict, Any
|
| 14 |
from pathlib import Path
|
| 15 |
|
document_parsing.py
CHANGED
|
@@ -66,7 +66,61 @@ class DocumentParser:
|
|
| 66 |
'.doc': 'application/msword',
|
| 67 |
'.epub': 'application/epub+zip'
|
| 68 |
}
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
def extract_text(self, file_path: Union[str, Path]) -> Optional[str]:
|
| 72 |
"""
|
|
|
|
| 66 |
'.doc': 'application/msword',
|
| 67 |
'.epub': 'application/epub+zip'
|
| 68 |
}
|
| 69 |
+
|
| 70 |
+
mime_type = extension_map.get(extension, 'unknown')
|
| 71 |
+
|
| 72 |
+
# If no extension or unknown extension, try to detect by content
|
| 73 |
+
if mime_type == 'unknown':
|
| 74 |
+
mime_type = self._detect_mime_by_content(file_path)
|
| 75 |
+
|
| 76 |
+
return mime_type
|
| 77 |
+
|
| 78 |
+
def _detect_mime_by_content(self, file_path: Union[str, Path]) -> str:
    """
    Detect MIME type by reading file content.

    Sniffs the first 1024 bytes of the file:

    * a ``%PDF`` prefix is reported as PDF;
    * a ZIP local-file signature is reported as EPUB when the archive
      has a ``mimetype`` entry saying so, otherwise it is assumed to
      be DOCX;
    * a header that is valid UTF-8 is reported as plain text.

    Args:
        file_path: Path to the file

    Returns:
        MIME type string, or 'unknown' when nothing matches or the
        file cannot be read (the error is logged as a warning).
    """
    # Function-scope imports: only this content-sniffing path needs them.
    import codecs
    import zipfile

    sniff_size = 1024

    try:
        # Read the header and close the handle right away, so the file
        # is not held open while zipfile re-opens the same path below.
        with open(file_path, 'rb') as f:
            header = f.read(sniff_size)

        # PDF detection
        if header.startswith(b'%PDF'):
            return 'application/pdf'

        # ZIP-based formats (DOCX, EPUB)
        if header.startswith(b'PK\x03\x04'):
            try:
                with zipfile.ZipFile(file_path, 'r') as zf:
                    # Check if it's EPUB by looking for mimetype file
                    if 'mimetype' in zf.namelist():
                        with zf.open('mimetype') as mf:
                            mimetype = mf.read().decode('utf-8').strip()
                        if mimetype == 'application/epub+zip':
                            return 'application/epub+zip'
                    # If not EPUB, assume DOCX
                    return 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
            except (zipfile.BadZipFile, OSError, UnicodeDecodeError):
                # Best effort: a corrupt or unreadable archive falls
                # through to the plain-text check. Unlike a bare
                # ``except:``, this does not swallow KeyboardInterrupt
                # or SystemExit.
                pass

        # Plain text detection (try to decode as UTF-8)
        try:
            # A full-size header may have been cut in the middle of a
            # multi-byte character. Decoding incrementally with
            # final=False tolerates an incomplete trailing sequence
            # while still rejecting genuinely invalid bytes.
            truncated = len(header) == sniff_size
            decoder = codecs.getincrementaldecoder('utf-8')()
            decoder.decode(header, final=not truncated)
            return 'text/plain'
        except UnicodeDecodeError:
            pass

    except Exception as e:
        logger.warning(f"Error detecting MIME type by content: {e}")

    return 'unknown'
|
| 124 |
|
| 125 |
def extract_text(self, file_path: Union[str, Path]) -> Optional[str]:
|
| 126 |
"""
|