## Docker GROBID image using deep learning models and/or CRF models

## See https://grobid.readthedocs.io/en/latest/Grobid-docker/

## usage example with version 0.8.0:
## docker build -t grobid/grobid:0.8.0 --build-arg GROBID_VERSION=0.8.0 --file Dockerfile.delft .

## no GPU:
## docker run -t --rm --init -p 8070:8070 -p 8071:8071 -v /home/lopez/grobid/grobid-home/config/grobid.properties:/opt/grobid/grobid-home/config/grobid.properties:ro grobid/grobid:0.8.0

## allocate all available GPUs (only Linux with proper nvidia driver installed on host machine):
## docker run --rm --gpus all --init -p 8070:8070 -p 8071:8071 -v /home/lopez/grobid/grobid-home/config/grobid.properties:/opt/grobid/grobid-home/config/grobid.properties:ro grobid/grobid:0.8.0

# -------------------
# build builder image
# -------------------

# This stage compiles GROBID with Gradle and unpacks the resulting
# distributions under /opt/grobid.
FROM openjdk:17-jdk-slim AS builder

USER root

# unzip is used below to unpack the distribution archives produced by Gradle;
# the package index is dropped in the same layer that fetched it
RUN apt-get update && \
    apt-get -y --no-install-recommends install unzip && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /opt/grobid-source

# gradle
COPY gradle/ ./gradle/
COPY gradlew gradle.properties build.gradle settings.gradle ./

# source
COPY grobid-home/ ./grobid-home/
COPY grobid-core/ ./grobid-core/
COPY grobid-service/ ./grobid-service/
COPY grobid-trainer/ ./grobid-trainer/

# cleaning unused native libraries before packaging
RUN rm -rf \
        grobid-home/pdf2xml \
        grobid-home/pdfalto/lin-32 \
        grobid-home/pdfalto/mac-64 \
        grobid-home/pdfalto/mac_arm-64 \
        grobid-home/pdfalto/win-* \
        grobid-home/lib/lin-32 \
        grobid-home/lib/win-* \
        grobid-home/lib/mac-64

RUN ./gradlew clean assemble --no-daemon --info --stacktrace

WORKDIR /opt/grobid

# Unpack the service and grobid-home distributions side by side.
# The service archive is renamed first so the directory name carries no version.
# NOTE: the final `rm -rf grobid-source` is relative to /opt/grobid; it does not
# remove /opt/grobid-source, which the runtime stage still copies files from.
RUN unzip -o /opt/grobid-source/grobid-service/build/distributions/grobid-service-*.zip && \
    mv grobid-service* grobid-service && \
    unzip -o /opt/grobid-source/grobid-home/build/distributions/grobid-home-*.zip && \
    chmod -R 755 /opt/grobid/grobid-home/pdfalto && \
    rm -rf grobid-source

# -------------------
# build runtime image
# -------------------

# use NVIDIA Container Toolkit to automatically recognize possible GPU drivers on the host machine
FROM tensorflow/tensorflow:2.7.0-gpu

# setting locale is likely useless but to be sure
ENV LANG=C.UTF-8

# update NVIDIA Cuda key (following a key rotation in April 2022)
# The outdated key and the CUDA/NVIDIA source lists are removed first, so the
# package index can be refreshed before wget is installed. The keyring package
# is downloaded to /tmp and deleted in the same layer so it does not remain in
# the image.
RUN apt-key del 7fa2af80 && \
    rm /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/nvidia-ml.list && \
    apt-get update && \
    apt-get -y --no-install-recommends install ca-certificates wget && \
    wget -O /tmp/cuda-keyring_1.0-1_all.deb \
        https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \
    dpkg -i /tmp/cuda-keyring_1.0-1_all.deb && \
    rm -f /tmp/cuda-keyring_1.0-1_all.deb && \
    rm -rf /var/lib/apt/lists/*

# install JRE, python and other dependencies
RUN apt-get update && \
    apt-get -y --no-install-recommends install \
        apt-utils \
        build-essential \
        ca-certificates-java \
        curl \
        gcc \
        gfortran \
        libfontconfig \
        libxml2 \
        musl \
        openjdk-17-jre-headless \
        python3 \
        python3-dev \
        python3-pip \
        python3-setuptools \
        unzip \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /opt/grobid

COPY --from=builder /opt/grobid .

RUN python3 -m pip install --no-cache-dir --upgrade pip

# install DeLFT via pypi
RUN pip3 install --no-cache-dir requests delft==0.3.3

# link the data directory to /data
# (the working directory is /opt/grobid, so a second `ln -s /data ./data` would
# follow the link just created and produce a self-referential /data/data)
RUN mkdir -p /data \
    && ln -s /data /opt/grobid/data

# disable python warnings (and fix logging)
ENV PYTHONWARNINGS="ignore"

ENV JAVA_OPTS=-Xmx4g

# install jep (and temporarily the matching JDK)
# JDK_URL is only needed at build time, hence ARG rather than ENV.
# Download, build and cleanup happen in a single layer so the JDK archive and
# its unpacked tree are not kept in the image.
ARG JDK_URL=https://download.java.net/java/GA/jdk17.0.2/dfd4a8d0985749f896bed50d7138ee7f/8/GPL/openjdk-17.0.2_linux-x64_bin.tar.gz
RUN mkdir /tmp/jdk-17 && \
    curl --fail --show-error --location -q "${JDK_URL}" -o /tmp/openjdk.tar.gz && \
    tar xzf /tmp/openjdk.tar.gz --directory /tmp/jdk-17 --strip-components 1 --no-same-owner && \
    /tmp/jdk-17/bin/javac -version && \
    JAVA_HOME=/tmp/jdk-17 pip3 install --no-cache-dir jep==4.0.2 && \
    rm -rf /tmp/openjdk.tar.gz /tmp/jdk-17

ENV LD_LIBRARY_PATH=/usr/local/lib/python3.8/dist-packages/jep:grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep:${LD_LIBRARY_PATH}

# remove libjep.so because we are providing our own version in the virtual env above
RUN rm /opt/grobid/grobid-home/lib/lin-64/jep/libjep.so

# preload embeddings, for GROBID all the RNN models use glove-840B (default for the script), ELMo is currently not loaded
# to be done: mechanism to download GROBID fine-tuned models based on SciBERT if selected (but not good enough for the moment)
COPY --from=builder /opt/grobid-source/grobid-home/scripts/preload_embeddings.py .
COPY --from=builder /opt/grobid-source/grobid-home/config/resources-registry.json .
RUN python3 preload_embeddings.py --registry ./resources-registry.json

# make the installation reachable as /opt/delft as well, and place a copy of
# the resource registry in a delft/ sub-directory
RUN ln -s /opt/grobid /opt/delft && \
    mkdir delft && \
    cp ./resources-registry.json delft/

ENV GROBID_SERVICE_OPTS="--add-opens java.base/java.lang=ALL-UNNAMED"

CMD ["./grobid-service/bin/grobid-service"]

# declared last so that changing the version does not invalidate earlier layers
ARG GROBID_VERSION

LABEL \
    authors="The contributors" \
    org.label-schema.name="GROBID" \
    org.label-schema.description="Image with GROBID service" \
    org.label-schema.url="https://github.com/kermitt2/grobid" \
    org.label-schema.version="${GROBID_VERSION}"