#!/bin/bash
#
# Build script: installs Tesseract OCR and its traineddata, builds tesserocr
# from source against the system Tesseract, installs the Google Gemini API
# client and this project's Python package, and seeds a .env file.
#
# Runs apt-get and writes to /etc/environment, so it needs sufficient
# privileges for both.
#
# Environment:
#   TESSDATA_DIR  Optional. Directory that receives the traineddata files.
#                 Defaults to /usr/share/tesseract-ocr/4.00/tessdata.

# Exit on error, on use of an unset variable, and on failure of any pipeline
# stage.
set -euo pipefail

# Overridable because the versioned directory name differs between Tesseract
# packages; the default is the path this script has always used.
readonly TESSDATA_DIR="${TESSDATA_DIR:-/usr/share/tesseract-ocr/4.00/tessdata}"
readonly TESSDATA_BASE_URL="https://github.com/tesseract-ocr/tessdata/raw/main"
readonly ENVIRONMENT_FILE="/etc/environment"

# Print a message to stderr and abort the build.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Install Tesseract, its development headers and the tools used below.
install_system_packages() {
  echo "Installing Tesseract and dependencies..."
  apt-get update
  apt-get install -y \
    tesseract-ocr \
    tesseract-ocr-eng \
    libtesseract-dev \
    libleptonica-dev \
    pkg-config \
    wget
}

# Download the eng and osd traineddata files into TESSDATA_DIR.
download_traineddata() {
  local lang

  mkdir -p "$TESSDATA_DIR"

  echo "Downloading Tesseract traineddata files..."
  for lang in eng osd; do
    wget -O "$TESSDATA_DIR/${lang}.traineddata" \
      "$TESSDATA_BASE_URL/${lang}.traineddata"
  done
}

# Export TESSDATA_PREFIX for the rest of this script and record it in
# /etc/environment.
configure_tessdata_prefix() {
  local env_line

  export TESSDATA_PREFIX="$TESSDATA_DIR"
  env_line="TESSDATA_PREFIX=${TESSDATA_PREFIX}"

  # Append only when the exact line is absent so that repeated builds do not
  # pile up duplicate entries.
  if ! grep -qxF -- "$env_line" "$ENVIRONMENT_FILE" 2>/dev/null; then
    echo "$env_line" >>"$ENVIRONMENT_FILE"
  fi
}

# Check that the tesseract binary, the traineddata files and the English
# language pack are all usable.
verify_tesseract() {
  local lang available_langs

  echo "Verifying Tesseract installation..."
  if ! command -v tesseract &>/dev/null; then
    die "Tesseract installation failed!"
  fi
  echo "Tesseract version: $(tesseract --version)"

  echo "Verifying traineddata files..."
  for lang in eng osd; do
    if [[ ! -f "$TESSDATA_DIR/${lang}.traineddata" ]]; then
      die "${lang}.traineddata is missing!"
    fi
  done

  echo "Traineddata files in $TESSDATA_DIR:"
  ls -l "$TESSDATA_DIR"

  # Smoke test. Ask tesseract which languages it can load rather than running
  # OCR on a text file renamed to .png: that file is not a valid image, so the
  # OCR run could never succeed.
  echo "Testing Tesseract functionality..."
  # Capture first instead of piping into grep -q, which could close the pipe
  # early and trip pipefail.
  if ! available_langs=$(tesseract --list-langs 2>&1); then
    die "Tesseract test failed!"
  fi
  printf '%s\n' "$available_langs"
  if ! grep -qx 'eng' <<<"$available_langs"; then
    die "Tesseract test failed!"
  fi
}

# Rebuild tesserocr from source and confirm that it imports.
install_tesserocr() {
  echo "Installing tesserocr from source..."
  # Best effort: there is nothing to uninstall on a fresh machine.
  pip uninstall -y tesserocr || true

  # Restrict --no-binary to tesserocr itself; ':all:' would also force every
  # other package involved in the install to be compiled from source.
  # NOTE(review): the library path is specific to x86_64 multiarch layouts;
  # confirm before building on another architecture.
  CPPFLAGS="-I/usr/include/tesseract" \
    LDFLAGS="-L/usr/lib/x86_64-linux-gnu/" \
    pip install --no-binary tesserocr tesserocr

  echo "Verifying tesserocr installation..."
  # get_languages() returns (tessdata_path, languages); unpack it so each
  # value is printed under the matching label.
  python3 - <<'PY'
import tesserocr

tessdata_path, languages = tesserocr.get_languages()
print(f'tesserocr version: {tesserocr.__version__}')
print(f'Available languages: {languages}')
print(f'TESSDATA_PREFIX: {tessdata_path}')
PY
}

# Install the Google Gemini API client and this project in editable mode.
install_python_packages() {
  echo "Installing Google Gemini API client..."
  pip install -q -U google-genai
  echo "Google Gemini API client installed successfully"

  echo "Installing Python dependencies..."
  pip install -e .
}

# Create .env from .env.example when .env does not exist yet. A missing
# template or a failed copy is reported as a warning and does not fail the
# build.
create_env_file() {
  if [[ -f .env ]]; then
    return 0
  fi

  echo "Creating .env file..."
  if [[ -f .env.example ]]; then
    cp .env.example .env ||
      echo "Warning: could not copy .env.example to .env" >&2
  else
    echo "Warning: .env.example not found" >&2
  fi
}

main() {
  echo "Starting build process..."

  install_system_packages
  download_traineddata
  configure_tessdata_prefix
  verify_tesseract
  install_tesserocr
  install_python_packages
  create_env_file

  echo "Build process completed successfully!"
}

main "$@"