|
|
|
""" |
|
Document to ASL Gloss Converter |
|
|
|
This script combines document parsing and ASL glossing to convert |
|
uploaded documents (PDF, TXT, DOC, DOCX, EPUB) directly to ASL gloss format. |
|
""" |
|
|
|
import os |
|
import sys |
|
import argparse |
|
from typing import Optional, Dict, Any |
|
from pathlib import Path |
|
|
|
|
|
from document_parsing import DocumentParser |
|
from asl_gloss import ASLGlossConverter |
|
|
|
|
|
class DocumentToASLConverter:
    """
    Combines document parsing and ASL glossing functionality.

    Text is extracted with ``DocumentParser.extract_text`` and then passed to
    ``ASLGlossConverter.convert_text``. Progress is reported on stdout.
    """

    # File extensions reported by get_supported_formats().
    _SUPPORTED_FORMATS = ('.pdf', '.txt', '.docx', '.doc', '.epub')

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the document to ASL converter.

        Args:
            api_key: Anthropic API key, forwarded unchanged to
                ASLGlossConverter. When omitted, the key is expected to come
                from the ANTHROPIC_API_KEY env var (that lookup is done by
                ASLGlossConverter, not by this class).
        """
        self.document_parser = DocumentParser()
        self.asl_converter = ASLGlossConverter(api_key=api_key)

    def convert_document(self, document_path: str, output_file: Optional[str] = None) -> str:
        """
        Convert a document file to ASL gloss.

        Args:
            document_path: Path to the document file
            output_file: Path to output file (optional). When given, the
                gloss is written there as UTF-8.

        Returns:
            The ASL gloss text

        Raises:
            Exception: On any failure (no text extracted, conversion error,
                output file not writable). The message is prefixed with
                "Error processing document: " and the underlying error is
                attached as ``__cause__``.
        """
        try:
            print(f"Processing document: {document_path}")

            print("Step 1: Extracting text from document...")
            extracted_text = self.document_parser.extract_text(document_path)

            # Whitespace-only text is as useless as no text at all; fail here
            # rather than spending a conversion call on it.
            if not extracted_text or (
                isinstance(extracted_text, str) and not extracted_text.strip()
            ):
                raise Exception("Failed to extract text from document")

            print(f"✓ Extracted {len(extracted_text)} characters")

            print("Step 2: Converting to ASL gloss...")
            asl_gloss = self.asl_converter.convert_text(extracted_text)

            print("✓ ASL gloss conversion completed")

            if output_file:
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.write(asl_gloss)
                print(f"✓ ASL gloss saved to: {output_file}")

            return asl_gloss

        except Exception as e:
            # Keep the generic Exception type callers already catch, but
            # chain the original error so its traceback is not lost.
            raise Exception(f"Error processing document: {str(e)}") from e

    def batch_convert_documents(self, document_paths: list, output_dir: Optional[str] = None) -> Dict[str, str]:
        """
        Convert multiple documents to ASL gloss.

        A failure on one document does not stop the batch; it is recorded in
        the result instead.

        Args:
            document_paths: List of document file paths
            output_dir: Directory to save output files (optional). It is
                created if missing. Each gloss is saved as
                "<input stem>_asl_gloss.txt"; if several inputs share a stem,
                later ones get a numeric suffix ("<stem>_2_asl_gloss.txt")
                so they do not overwrite each other.

        Returns:
            Dictionary mapping input files to their ASL gloss, or to a string
            starting with "ERROR: " when that document failed.
        """
        results = {}
        target_dir = Path(output_dir) if output_dir else None
        used_names = set()  # output file names already assigned in this batch

        for document_path in document_paths:
            try:
                print(f"\n{'='*50}")
                print(f"Converting: {document_path}")
                print(f"{'='*50}")

                output_file = None
                if target_dir is not None:
                    # Created inside the try so a bad directory is reported
                    # per document, before any conversion work is done.
                    target_dir.mkdir(parents=True, exist_ok=True)

                    stem = Path(document_path).stem
                    output_filename = f"{stem}_asl_gloss.txt"
                    counter = 2
                    while output_filename in used_names:
                        output_filename = f"{stem}_{counter}_asl_gloss.txt"
                        counter += 1
                    used_names.add(output_filename)
                    output_file = str(target_dir / output_filename)

                asl_gloss = self.convert_document(document_path, output_file)
                results[document_path] = asl_gloss

                print(f"✓ Completed: {document_path}")

            except Exception as e:
                print(f"✗ Error processing {document_path}: {str(e)}")
                results[document_path] = f"ERROR: {str(e)}"

        return results

    def get_supported_formats(self) -> list:
        """
        Get list of supported document formats.

        Returns:
            A new list of supported file extensions (with leading dot)
        """
        return list(self._SUPPORTED_FORMATS)
|
|
|
|
|
def _build_arg_parser():
    """Build the argparse parser describing the command-line interface."""
    parser = argparse.ArgumentParser(
        description="Convert documents to ASL gloss using Claude's API",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Convert a single document
  python document_to_asl.py document.pdf

  # Convert document with output file
  python document_to_asl.py document.pdf -o output.txt

  # Batch convert multiple documents
  python document_to_asl.py -b doc1.pdf doc2.docx doc3.txt -d output_dir/

  # Interactive mode
  python document_to_asl.py -i

  # Show supported formats
  python document_to_asl.py --formats
        """
    )

    parser.add_argument(
        'document',
        nargs='?',
        help='Document file to convert to ASL gloss'
    )
    parser.add_argument(
        '-o', '--output',
        help='Output file for ASL gloss'
    )
    parser.add_argument(
        '-b', '--batch',
        nargs='+',
        help='Batch convert multiple documents'
    )
    parser.add_argument(
        '-d', '--output-dir',
        help='Output directory for batch conversion'
    )
    parser.add_argument(
        '-i', '--interactive',
        action='store_true',
        help='Run in interactive mode'
    )
    parser.add_argument(
        '--formats',
        action='store_true',
        help='Show supported document formats'
    )
    parser.add_argument(
        '--api-key',
        help='Anthropic API key (or set ANTHROPIC_API_KEY env var)'
    )
    return parser


def _run_interactive(converter):
    """Prompt for document paths on stdin and convert each until the user quits."""
    print("Document to ASL Gloss Converter - Interactive Mode")
    print("Enter document file paths to convert (or 'quit' to exit):")
    print("-" * 60)

    while True:
        # Prompting is handled separately from converting: end-of-input must
        # end the loop. If EOFError were caught by the generic handler below,
        # the loop would re-prompt an exhausted stdin forever.
        try:
            doc_path = input("\nDocument path: ").strip()
            if doc_path.lower() in ('quit', 'exit', 'q'):
                break

            if not doc_path:
                continue

            if not os.path.exists(doc_path):
                print(f"Error: File not found: {doc_path}")
                continue

            output_file = input("Output file (optional, press Enter to skip): ").strip()
            if not output_file:
                output_file = None
        except (KeyboardInterrupt, EOFError):
            print("\nExiting...")
            break

        try:
            print("Converting...")
            asl_gloss = converter.convert_document(doc_path, output_file)

            if not output_file:
                print("\nASL Gloss:")
                print("-" * 20)
                print(asl_gloss)
        except KeyboardInterrupt:
            print("\nExiting...")
            break
        except Exception as e:
            # One bad document should not end the session.
            print(f"Error: {str(e)}")


def _run_batch(converter, document_paths, output_dir):
    """Convert several documents, print a summary, and return the exit code."""
    print(f"Batch converting {len(document_paths)} documents...")
    results = converter.batch_convert_documents(document_paths, output_dir)

    print("\n" + "="*60)
    print("BATCH CONVERSION RESULTS")
    print("="*60)

    failures = 0
    for doc_path, result in results.items():
        print(f"\nDocument: {doc_path}")
        print("-" * 40)
        # batch_convert_documents marks a failed document with an "ERROR:" prefix.
        if result.startswith("ERROR:"):
            failures += 1
            print(f"❌ {result}")
        else:
            print("✅ Conversion successful")
            if not output_dir:
                # Nothing was saved to disk, so show a preview (at most 500 chars).
                print("ASL Gloss:")
                print(result[:500] + "..." if len(result) > 500 else result)

    return 1 if failures else 0


def main():
    """
    Main function for command-line usage.

    Parses ``sys.argv`` and runs one mode, checked in this order: ``--formats``,
    ``--interactive``, ``--batch``, single positional document. With no mode
    selected the help text is printed.

    Returns:
        Process exit code: 0 on success; 1 when no mode was selected, when an
        error occurred, or when at least one document of a batch failed.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()

    # Nothing to do: show usage without building a converter, so help still
    # works when the converter cannot be constructed.
    if not (args.formats or args.interactive or args.batch or args.document):
        parser.print_help()
        return 1

    try:
        converter = DocumentToASLConverter(api_key=args.api_key)

        if args.formats:
            print("Supported Document Formats:")
            print("=" * 30)
            formats = converter.get_supported_formats()
            for fmt in formats:
                print(f" • {fmt}")
            print("\nExamples: .pdf, .txt, .docx, .doc, .epub")
            return 0

        if args.interactive:
            _run_interactive(converter)
        elif args.batch:
            return _run_batch(converter, args.batch, args.output_dir)
        else:
            asl_gloss = converter.convert_document(args.document, args.output)
            if not args.output:
                print("\nASL Gloss:")
                print("-" * 20)
                print(asl_gloss)

        return 0

    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit code.
        print(f"Error: {str(e)}")
        return 1
|
|
|
|
|
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())