Commit
·
3dd6d75
1
Parent(s):
5ecf07b
Modified Dockerfile and entrypoint to switch user at runtime. Updated output folder file creation for custom_image_analyser_engine and find_duplicate_pages.py
Browse files
- Dockerfile +13 -10
- app.py +0 -1
- entrypoint.sh +3 -3
- tools/custom_image_analyser_engine.py +10 -14
- tools/find_duplicate_pages.py +6 -2
Dockerfile
CHANGED
|
@@ -51,8 +51,8 @@ RUN apt-get update \
|
|
| 51 |
&& rm -rf /var/lib/apt/lists/*
|
| 52 |
|
| 53 |
# Create non-root user
|
| 54 |
-
RUN useradd -m -u 1000 user
|
| 55 |
ENV APP_HOME=/home/user
|
|
|
|
| 56 |
|
| 57 |
# Set env variables for Gradio & other apps
|
| 58 |
ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
|
|
@@ -117,18 +117,21 @@ COPY --from=builder /install /usr/local/lib/python3.12/site-packages/
|
|
| 117 |
# Copy installed CLI binaries (e.g. uvicorn)
|
| 118 |
COPY --from=builder /install/bin /usr/local/bin/
|
| 119 |
|
| 120 |
-
# Copy app code
|
| 121 |
-
COPY
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
-
# Copy
|
| 124 |
-
COPY
|
| 125 |
-
RUN chmod 755 /
|
| 126 |
-
&& sed -i 's/\r$//' /
|
|
|
|
| 127 |
|
| 128 |
-
#
|
| 129 |
-
|
| 130 |
|
| 131 |
-
# Declare working directory
|
| 132 |
WORKDIR $APP_HOME/app
|
| 133 |
|
| 134 |
# Declare volumes (NOTE: runtime mounts will override permissions — handle with care)
|
|
|
|
| 51 |
&& rm -rf /var/lib/apt/lists/*
|
| 52 |
|
| 53 |
# Create non-root user
|
|
|
|
| 54 |
ENV APP_HOME=/home/user
|
| 55 |
+
RUN useradd -m -u 1000 user
|
| 56 |
|
| 57 |
# Set env variables for Gradio & other apps
|
| 58 |
ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
|
|
|
|
| 117 |
# Copy installed CLI binaries (e.g. uvicorn)
|
| 118 |
COPY --from=builder /install/bin /usr/local/bin/
|
| 119 |
|
| 120 |
+
# Copy app code
|
| 121 |
+
COPY . $APP_HOME/app
|
| 122 |
+
|
| 123 |
+
# Fix ownership if needed
|
| 124 |
+
RUN if [ "$APP_MODE" = "gradio" ]; then chown -R user:user /home/user/app; fi
|
| 125 |
|
| 126 |
+
# Copy entrypoint and fix line endings + permissions
|
| 127 |
+
COPY entrypoint.sh ${APP_HOME}/app/entrypoint.sh
|
| 128 |
+
RUN chmod 755 ${APP_HOME}/app/entrypoint.sh \
|
| 129 |
+
&& sed -i 's/\r$//' ${APP_HOME}/app/entrypoint.sh \
|
| 130 |
+
&& if [ "$APP_MODE" = "gradio" ]; then chown user:user ${APP_HOME}/app/entrypoint.sh; fi
|
| 131 |
|
| 132 |
+
# Set permissions for Python executable
|
| 133 |
+
RUN chmod 755 /usr/local/bin/python
|
| 134 |
|
|
|
|
| 135 |
WORKDIR $APP_HOME/app
|
| 136 |
|
| 137 |
# Declare volumes (NOTE: runtime mounts will override permissions — handle with care)
|
app.py
CHANGED
|
@@ -1224,7 +1224,6 @@ with blocks:
|
|
| 1224 |
else:
|
| 1225 |
handwrite_signature_checkbox.render()
|
| 1226 |
|
| 1227 |
-
print(f"SHOW_LOCAL_OCR_MODEL_OPTIONS : {SHOW_LOCAL_OCR_MODEL_OPTIONS}")
|
| 1228 |
if SHOW_LOCAL_OCR_MODEL_OPTIONS:
|
| 1229 |
with gr.Accordion(
|
| 1230 |
label="Change default local OCR model",
|
|
|
|
| 1224 |
else:
|
| 1225 |
handwrite_signature_checkbox.render()
|
| 1226 |
|
|
|
|
| 1227 |
if SHOW_LOCAL_OCR_MODEL_OPTIONS:
|
| 1228 |
with gr.Accordion(
|
| 1229 |
label="Change default local OCR model",
|
entrypoint.sh
CHANGED
|
@@ -22,12 +22,12 @@ else
|
|
| 22 |
|
| 23 |
# Start uvicorn server.
|
| 24 |
echo "Starting with Uvicorn on $GRADIO_SERVER_NAME:$GRADIO_SERVER_PORT"
|
| 25 |
-
exec uvicorn app:app \
|
| 26 |
--host $GRADIO_SERVER_NAME \
|
| 27 |
--port $GRADIO_SERVER_PORT \
|
| 28 |
-
--proxy-headers
|
| 29 |
else
|
| 30 |
echo "Starting in Gradio mode..."
|
| 31 |
-
exec python app.py
|
| 32 |
fi
|
| 33 |
fi
|
|
|
|
| 22 |
|
| 23 |
# Start uvicorn server.
|
| 24 |
echo "Starting with Uvicorn on $GRADIO_SERVER_NAME:$GRADIO_SERVER_PORT"
|
| 25 |
+
exec su -s /bin/sh user -c "uvicorn app:app \
|
| 26 |
--host $GRADIO_SERVER_NAME \
|
| 27 |
--port $GRADIO_SERVER_PORT \
|
| 28 |
+
--proxy-headers"
|
| 29 |
else
|
| 30 |
echo "Starting in Gradio mode..."
|
| 31 |
+
exec su -s /bin/sh user -c "python app.py"
|
| 32 |
fi
|
| 33 |
fi
|
tools/custom_image_analyser_engine.py
CHANGED
|
@@ -499,7 +499,14 @@ class CustomImageAnalyzerEngine:
|
|
| 499 |
self.language = language or DEFAULT_LANGUAGE or "en"
|
| 500 |
self.tesseract_lang = _tesseract_lang_code(self.language)
|
| 501 |
self.paddle_lang = _paddle_lang_code(self.language)
|
| 502 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 503 |
|
| 504 |
if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":
|
| 505 |
if PaddleOCR is None:
|
|
@@ -868,20 +875,9 @@ class CustomImageAnalyzerEngine:
|
|
| 868 |
if paddle_results and SAVE_PADDLE_VISUALISATIONS is True:
|
| 869 |
|
| 870 |
for res in paddle_results:
|
| 871 |
-
#
|
| 872 |
-
normalized_output_folder = os.path.normpath(
|
| 873 |
-
os.path.abspath(self.output_folder)
|
| 874 |
-
)
|
| 875 |
-
# Validate the output folder is safe
|
| 876 |
-
if not validate_folder_containment(
|
| 877 |
-
normalized_output_folder, OUTPUT_FOLDER
|
| 878 |
-
):
|
| 879 |
-
raise ValueError(
|
| 880 |
-
f"Unsafe output folder path: {normalized_output_folder}"
|
| 881 |
-
)
|
| 882 |
-
|
| 883 |
paddle_viz_folder = os.path.join(
|
| 884 |
-
|
| 885 |
)
|
| 886 |
# Double-check the constructed path is safe
|
| 887 |
if not validate_folder_containment(
|
|
|
|
| 499 |
self.language = language or DEFAULT_LANGUAGE or "en"
|
| 500 |
self.tesseract_lang = _tesseract_lang_code(self.language)
|
| 501 |
self.paddle_lang = _paddle_lang_code(self.language)
|
| 502 |
+
|
| 503 |
+
# Security: Validate and normalize output_folder at construction time
|
| 504 |
+
# This ensures the object is always in a secure state and prevents
|
| 505 |
+
# any future code from accidentally using an untrusted directory
|
| 506 |
+
normalized_output_folder = os.path.normpath(os.path.abspath(output_folder))
|
| 507 |
+
if not validate_folder_containment(normalized_output_folder, OUTPUT_FOLDER):
|
| 508 |
+
raise ValueError(f"Unsafe output folder path: {output_folder}. Must be contained within {OUTPUT_FOLDER}")
|
| 509 |
+
self.output_folder = normalized_output_folder
|
| 510 |
|
| 511 |
if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":
|
| 512 |
if PaddleOCR is None:
|
|
|
|
| 875 |
if paddle_results and SAVE_PADDLE_VISUALISATIONS is True:
|
| 876 |
|
| 877 |
for res in paddle_results:
|
| 878 |
+
# self.output_folder is already validated and normalized at construction time
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 879 |
paddle_viz_folder = os.path.join(
|
| 880 |
+
self.output_folder, "paddle_visualisations"
|
| 881 |
)
|
| 882 |
# Double-check the constructed path is safe
|
| 883 |
if not validate_folder_containment(
|
tools/find_duplicate_pages.py
CHANGED
|
@@ -474,9 +474,13 @@ def combine_ocr_dataframes(
|
|
| 474 |
raise ValueError(
|
| 475 |
f"Unsafe normalized output folder path: {normalized_output_folder}"
|
| 476 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 477 |
|
| 478 |
-
os.makedirs(
|
| 479 |
-
output_path = secure_path_join(
|
| 480 |
combined_df.to_csv(output_path, index=False)
|
| 481 |
output_files.append(output_path)
|
| 482 |
print(f"Successfully combined data and saved to: {output_path}")
|
|
|
|
| 474 |
raise ValueError(
|
| 475 |
f"Unsafe normalized output folder path: {normalized_output_folder}"
|
| 476 |
)
|
| 477 |
+
|
| 478 |
+
# Assign the validated path back to output_folder to ensure all subsequent
|
| 479 |
+
# operations use the secure, validated value
|
| 480 |
+
output_folder = normalized_output_folder
|
| 481 |
|
| 482 |
+
os.makedirs(output_folder, exist_ok=True)
|
| 483 |
+
output_path = secure_path_join(output_folder, output_filename)
|
| 484 |
combined_df.to_csv(output_path, index=False)
|
| 485 |
output_files.append(output_path)
|
| 486 |
print(f"Successfully combined data and saved to: {output_path}")
|