seanpedrickcase committed on
Commit
3dd6d75
·
1 Parent(s): 5ecf07b

Modified Dockerfile and entrypoint to switch user at runtime. Updated output folder file creation for custom_image_analyser_engine and find_duplicate_pages.py

Browse files
Dockerfile CHANGED
@@ -51,8 +51,8 @@ RUN apt-get update \
51
  && rm -rf /var/lib/apt/lists/*
52
 
53
  # Create non-root user
54
- RUN useradd -m -u 1000 user
55
  ENV APP_HOME=/home/user
 
56
 
57
  # Set env variables for Gradio & other apps
58
  ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
@@ -117,18 +117,21 @@ COPY --from=builder /install /usr/local/lib/python3.12/site-packages/
117
  # Copy installed CLI binaries (e.g. uvicorn)
118
  COPY --from=builder /install/bin /usr/local/bin/
119
 
120
- # Copy app code and entrypoint with correct ownership
121
- COPY --chown=user . $APP_HOME/app
 
 
 
122
 
123
- # Copy the entrypoint script separately, set permissions, set line endings correctly
124
- COPY --chown=user entrypoint.sh ${APP_HOME}/app/entrypoint.sh
125
- RUN chmod 755 /home/user/app/entrypoint.sh \
126
- && sed -i 's/\r$//' /home/user/app/entrypoint.sh
 
127
 
128
- # Switch to user
129
- USER user
130
 
131
- # Declare working directory
132
  WORKDIR $APP_HOME/app
133
 
134
  # Declare volumes (NOTE: runtime mounts will override permissions — handle with care)
 
51
  && rm -rf /var/lib/apt/lists/*
52
 
53
  # Create non-root user
 
54
  ENV APP_HOME=/home/user
55
+ RUN useradd -m -u 1000 user
56
 
57
  # Set env variables for Gradio & other apps
58
  ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
 
117
  # Copy installed CLI binaries (e.g. uvicorn)
118
  COPY --from=builder /install/bin /usr/local/bin/
119
 
120
+ # Copy app code
121
+ COPY . $APP_HOME/app
122
+
123
+ # Fix ownership if needed
124
+ RUN if [ "$APP_MODE" = "gradio" ]; then chown -R user:user /home/user/app; fi
125
 
126
+ # Copy entrypoint and fix line endings + permissions
127
+ COPY entrypoint.sh ${APP_HOME}/app/entrypoint.sh
128
+ RUN chmod 755 ${APP_HOME}/app/entrypoint.sh \
129
+ && sed -i 's/\r$//' ${APP_HOME}/app/entrypoint.sh \
130
+ && if [ "$APP_MODE" = "gradio" ]; then chown user:user ${APP_HOME}/app/entrypoint.sh; fi
131
 
132
+ # Set permissions for Python executable
133
+ RUN chmod 755 /usr/local/bin/python
134
 
 
135
  WORKDIR $APP_HOME/app
136
 
137
  # Declare volumes (NOTE: runtime mounts will override permissions — handle with care)
app.py CHANGED
@@ -1224,7 +1224,6 @@ with blocks:
1224
  else:
1225
  handwrite_signature_checkbox.render()
1226
 
1227
- print(f"SHOW_LOCAL_OCR_MODEL_OPTIONS : {SHOW_LOCAL_OCR_MODEL_OPTIONS}")
1228
  if SHOW_LOCAL_OCR_MODEL_OPTIONS:
1229
  with gr.Accordion(
1230
  label="Change default local OCR model",
 
1224
  else:
1225
  handwrite_signature_checkbox.render()
1226
 
 
1227
  if SHOW_LOCAL_OCR_MODEL_OPTIONS:
1228
  with gr.Accordion(
1229
  label="Change default local OCR model",
entrypoint.sh CHANGED
@@ -22,12 +22,12 @@ else
22
 
23
  # Start uvicorn server.
24
  echo "Starting with Uvicorn on $GRADIO_SERVER_NAME:$GRADIO_SERVER_PORT"
25
- exec uvicorn app:app \
26
  --host $GRADIO_SERVER_NAME \
27
  --port $GRADIO_SERVER_PORT \
28
- --proxy-headers
29
  else
30
  echo "Starting in Gradio mode..."
31
- exec python app.py
32
  fi
33
  fi
 
22
 
23
  # Start uvicorn server.
24
  echo "Starting with Uvicorn on $GRADIO_SERVER_NAME:$GRADIO_SERVER_PORT"
25
+ exec su -s /bin/sh user -c "uvicorn app:app \
26
  --host $GRADIO_SERVER_NAME \
27
  --port $GRADIO_SERVER_PORT \
28
+ --proxy-headers"
29
  else
30
  echo "Starting in Gradio mode..."
31
+ exec su -s /bin/sh user -c "python app.py"
32
  fi
33
  fi
tools/custom_image_analyser_engine.py CHANGED
@@ -499,7 +499,14 @@ class CustomImageAnalyzerEngine:
499
  self.language = language or DEFAULT_LANGUAGE or "en"
500
  self.tesseract_lang = _tesseract_lang_code(self.language)
501
  self.paddle_lang = _paddle_lang_code(self.language)
502
- self.output_folder = output_folder
 
 
 
 
 
 
 
503
 
504
  if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":
505
  if PaddleOCR is None:
@@ -868,20 +875,9 @@ class CustomImageAnalyzerEngine:
868
  if paddle_results and SAVE_PADDLE_VISUALISATIONS is True:
869
 
870
  for res in paddle_results:
871
- # Normalize and validate output folder path before using in path construction
872
- normalized_output_folder = os.path.normpath(
873
- os.path.abspath(self.output_folder)
874
- )
875
- # Validate the output folder is safe
876
- if not validate_folder_containment(
877
- normalized_output_folder, OUTPUT_FOLDER
878
- ):
879
- raise ValueError(
880
- f"Unsafe output folder path: {normalized_output_folder}"
881
- )
882
-
883
  paddle_viz_folder = os.path.join(
884
- normalized_output_folder, "paddle_visualisations"
885
  )
886
  # Double-check the constructed path is safe
887
  if not validate_folder_containment(
 
499
  self.language = language or DEFAULT_LANGUAGE or "en"
500
  self.tesseract_lang = _tesseract_lang_code(self.language)
501
  self.paddle_lang = _paddle_lang_code(self.language)
502
+
503
+ # Security: Validate and normalize output_folder at construction time
504
+ # This ensures the object is always in a secure state and prevents
505
+ # any future code from accidentally using an untrusted directory
506
+ normalized_output_folder = os.path.normpath(os.path.abspath(output_folder))
507
+ if not validate_folder_containment(normalized_output_folder, OUTPUT_FOLDER):
508
+ raise ValueError(f"Unsafe output folder path: {output_folder}. Must be contained within {OUTPUT_FOLDER}")
509
+ self.output_folder = normalized_output_folder
510
 
511
  if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":
512
  if PaddleOCR is None:
 
875
  if paddle_results and SAVE_PADDLE_VISUALISATIONS is True:
876
 
877
  for res in paddle_results:
878
+ # self.output_folder is already validated and normalized at construction time
 
 
 
 
 
 
 
 
 
 
 
879
  paddle_viz_folder = os.path.join(
880
+ self.output_folder, "paddle_visualisations"
881
  )
882
  # Double-check the constructed path is safe
883
  if not validate_folder_containment(
tools/find_duplicate_pages.py CHANGED
@@ -474,9 +474,13 @@ def combine_ocr_dataframes(
474
  raise ValueError(
475
  f"Unsafe normalized output folder path: {normalized_output_folder}"
476
  )
 
 
 
 
477
 
478
- os.makedirs(normalized_output_folder, exist_ok=True)
479
- output_path = secure_path_join(normalized_output_folder, output_filename)
480
  combined_df.to_csv(output_path, index=False)
481
  output_files.append(output_path)
482
  print(f"Successfully combined data and saved to: {output_path}")
 
474
  raise ValueError(
475
  f"Unsafe normalized output folder path: {normalized_output_folder}"
476
  )
477
+
478
+ # Assign the validated path back to output_folder to ensure all subsequent
479
+ # operations use the secure, validated value
480
+ output_folder = normalized_output_folder
481
 
482
+ os.makedirs(output_folder, exist_ok=True)
483
+ output_path = secure_path_join(output_folder, output_filename)
484
  combined_df.to_csv(output_path, index=False)
485
  output_files.append(output_path)
486
  print(f"Successfully combined data and saved to: {output_path}")