Update app_main.py
Browse files- app_main.py +20 -1
app_main.py
CHANGED
|
@@ -10,10 +10,22 @@ import pytesseract
|
|
| 10 |
from transformers import AutoProcessor, AutoModelForImageTextToText
|
| 11 |
from langchain_community.document_loaders.image_captions import ImageCaptionLoader
|
| 12 |
from werkzeug.utils import secure_filename
|
| 13 |
-
import tempfile
|
| 14 |
|
| 15 |
app = Flask(__name__)
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
|
| 18 |
poppler_path=r"C:\poppler-23.11.0\Library\bin"
|
| 19 |
|
|
@@ -131,7 +143,9 @@ def index():
|
|
| 131 |
@app.route('/process_pdf', methods=['POST'])
|
| 132 |
def process_pdf():
|
| 133 |
try:
|
|
|
|
| 134 |
if 'pdf_file' not in request.files:
|
|
|
|
| 135 |
return jsonify({"error": "Missing PDF file in form-data with key 'pdf_file'"}), 400
|
| 136 |
|
| 137 |
pdf_file = request.files['pdf_file']
|
|
@@ -144,16 +158,21 @@ def process_pdf():
|
|
| 144 |
saved_pdf_path = os.path.join(temp_dir, filename)
|
| 145 |
pdf_file.save(saved_pdf_path)
|
| 146 |
|
|
|
|
|
|
|
| 147 |
# Extract & process
|
| 148 |
json_path = None
|
| 149 |
output_path, result = extract_images_from_pdf(saved_pdf_path, json_path)
|
| 150 |
|
|
|
|
|
|
|
| 151 |
return jsonify({
|
| 152 |
"message": "✅ PDF processed successfully",
|
| 153 |
"output_json": output_path,
|
| 154 |
"sprites": result
|
| 155 |
})
|
| 156 |
except Exception as e:
|
|
|
|
| 157 |
return jsonify({"error": f"❌ Failed to process PDF: {str(e)}"}), 500
|
| 158 |
|
| 159 |
if __name__ == '__main__':
|
|
|
|
| 10 |
from transformers import AutoProcessor, AutoModelForImageTextToText
|
| 11 |
from langchain_community.document_loaders.image_captions import ImageCaptionLoader
|
| 12 |
from werkzeug.utils import secure_filename
|
| 13 |
+
import tempfile, logging
|
| 14 |
|
| 15 |
app = Flask(__name__)
|
| 16 |
|
| 17 |
+
# Configure logging
|
| 18 |
+
logging.basicConfig(
|
| 19 |
+
level=logging.DEBUG, # Use INFO or ERROR in production
|
| 20 |
+
format="%(asctime)s [%(levelname)s] %(message)s",
|
| 21 |
+
handlers=[
|
| 22 |
+
logging.FileHandler("app.log"),
|
| 23 |
+
logging.StreamHandler()
|
| 24 |
+
]
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
logger = logging.getLogger(__name__)
|
| 28 |
+
|
| 29 |
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
|
| 30 |
poppler_path=r"C:\poppler-23.11.0\Library\bin"
|
| 31 |
|
|
|
|
| 143 |
@app.route('/process_pdf', methods=['POST'])
|
| 144 |
def process_pdf():
|
| 145 |
try:
|
| 146 |
+
logger.info("Received request to process PDF.")
|
| 147 |
if 'pdf_file' not in request.files:
|
| 148 |
+
logger.warning("No PDF file found in request.")
|
| 149 |
return jsonify({"error": "Missing PDF file in form-data with key 'pdf_file'"}), 400
|
| 150 |
|
| 151 |
pdf_file = request.files['pdf_file']
|
|
|
|
| 158 |
saved_pdf_path = os.path.join(temp_dir, filename)
|
| 159 |
pdf_file.save(saved_pdf_path)
|
| 160 |
|
| 161 |
+
logger.info(f"Saved uploaded PDF to: {saved_pdf_path}")
|
| 162 |
+
|
| 163 |
# Extract & process
|
| 164 |
json_path = None
|
| 165 |
output_path, result = extract_images_from_pdf(saved_pdf_path, json_path)
|
| 166 |
|
| 167 |
+
# Log completion here; the "received request" message is already logged at the top of the handler.
logger.info("PDF processed successfully.")
|
| 168 |
+
|
| 169 |
return jsonify({
|
| 170 |
"message": "✅ PDF processed successfully",
|
| 171 |
"output_json": output_path,
|
| 172 |
"sprites": result
|
| 173 |
})
|
| 174 |
except Exception as e:
|
| 175 |
+
logger.exception("❌ Failed to process PDF")
|
| 176 |
return jsonify({"error": f"❌ Failed to process PDF: {str(e)}"}), 500
|
| 177 |
|
| 178 |
if __name__ == '__main__':
|