File size: 4,708 Bytes
5ff6b14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
from typing import List, Dict, Any
from PIL import Image
from io import BytesIO
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import tempfile
import os
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
from config import config

# OCR Space API configuration
# API_KEY is read from the project config object; it is None when the config
# has no OCR_SPACE_API_KEY attribute, which disables OCR in this module.
API_KEY = getattr(config, 'OCR_SPACE_API_KEY', None)
# Endpoint that ocr_space_file() POSTs image files to.
API_URL = "https://api.ocr.space/parse/image"

def ocr_space_file(filename, api_key=API_KEY, overlay=False, language="eng"):
    """Extract text from an image file using the OCR Space API.

    Args:
        filename: Path of the image file to upload.
        api_key: OCR Space API key; defaults to the module-level ``API_KEY``.
        overlay: Sent to the API as ``isOverlayRequired``.
        language: Sent to the API as ``language``.

    Returns:
        A ``(filename, text)`` tuple. The function reports failures in-band
        instead of raising: ``text`` is then a message starting with
        "OCR API key not configured", "API Error", "OCR Error" or "Error".
    """
    if not api_key:
        return filename, "OCR API key not configured"

    payload = {
        "isOverlayRequired": overlay,
        "apikey": api_key,
        "language": language,
        "detectOrientation": True,
        "scale": True,
        "isTable": False,
        "OCREngine": 2,
    }
    try:
        with open(filename, "rb") as f:
            response = requests.post(API_URL, files={filename: f}, data=payload, timeout=30)

        if response.status_code != 200:
            return filename, f"API Error: HTTP {response.status_code}"

        parsed = response.json()

        # The body is expected to be a JSON object; anything else (e.g. a bare
        # JSON string) is reported as an OCR error rather than crashing on .get().
        if not isinstance(parsed, dict):
            return filename, f"OCR Error: {parsed or 'Unknown OCR error'}"

        if parsed.get("OCRExitCode") == 1:
            # Only the first parsed result is used; tolerate a missing/empty list.
            results = parsed.get("ParsedResults") or [{}]
            first = results[0] if isinstance(results[0], dict) else {}
            return filename, first.get("ParsedText", "")

        # ErrorMessage may be a list of messages or a single string; indexing a
        # string with [0] would keep only its first character.
        error_msg = parsed.get("ErrorMessage")
        if isinstance(error_msg, (list, tuple)):
            error_msg = error_msg[0] if error_msg else None
        return filename, f"OCR Error: {error_msg or 'Unknown OCR error'}"

    except requests.exceptions.Timeout:
        return filename, "Error: Request timeout"
    except requests.exceptions.RequestException as e:
        return filename, f"Error: Network error - {str(e)}"
    except Exception as e:
        # Last-resort boundary (unreadable file, invalid JSON, ...): callers
        # rely on this function never raising.
        return filename, f"Error: {e}"

# Prefixes of the in-band failure messages ocr_space_file() returns in place of
# recognised text; results starting with any of these are not real OCR output.
_OCR_FAILURE_PREFIXES = ("Error", "OCR Error", "API Error", "OCR API key not configured")


def extract_pptx(pptx_path: str) -> str:
    """Extract text and OCR'd image text from a PowerPoint presentation.

    For every slide, the text of each shape that exposes a non-empty ``text``
    attribute is collected. Picture shapes are written to temporary files and,
    when ``API_KEY`` is configured, sent to ``ocr_space_file``; successful OCR
    results are appended as ``[Image Text]: ...`` lines in the order the
    pictures appear on the slide.

    Args:
        pptx_path: Path of the .pptx file to read.

    Returns:
        The extracted content, one section per slide, each introduced by a
        ``=== Slide N ===`` header and separated by blank lines. If the file
        cannot be opened, a string starting with
        "Error loading PowerPoint file:" is returned instead of raising.
    """
    try:
        prs = Presentation(pptx_path)
    except Exception as e:
        return f"Error loading PowerPoint file: {str(e)}"

    all_content = []
    temp_files = []

    try:
        for slide_number, slide in enumerate(prs.slides, start=1):
            slide_content = [f"\n=== Slide {slide_number} ===\n"]
            slide_images = []

            for shape in slide.shapes:
                # Extract text
                if hasattr(shape, "text") and shape.text.strip():
                    slide_content.append(shape.text.strip())

                # Extract images
                elif shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                    try:
                        image_bytes = shape.image.blob

                        # Save image to a temp file that outlives the handle
                        # (delete=False) so the OCR worker can reopen it by name.
                        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
                            # Track the path before writing so a failed write
                            # still gets cleaned up in the finally block.
                            temp_files.append(temp_file.name)
                            temp_file.write(image_bytes)
                        slide_images.append(temp_file.name)
                    except Exception as e:
                        slide_content.append(f"[Image extraction error: {str(e)}]")

            # Process images with OCR if API key is available
            if slide_images and API_KEY:
                try:
                    with ThreadPoolExecutor(max_workers=3) as executor:
                        # executor.map yields results in submission order, so
                        # the output is deterministic regardless of which
                        # request finishes first.
                        for _, ocr_result in executor.map(ocr_space_file, slide_images):
                            if ocr_result and not ocr_result.startswith(_OCR_FAILURE_PREFIXES):
                                slide_content.append(f"[Image Text]: {ocr_result}")
                except Exception as e:
                    slide_content.append(f"[OCR processing error: {str(e)}]")
            elif slide_images:
                slide_content.append(f"[{len(slide_images)} images found - OCR not available]")

            all_content.append("\n".join(slide_content))

    finally:
        # Clean up temp files; removal is best-effort, so a file that is
        # already gone or cannot be deleted must not mask the real result.
        for temp_path in temp_files:
            try:
                os.unlink(temp_path)
            except OSError:
                pass

    return "\n\n".join(all_content)