Commit 91bd588 (parent: b1c3d49), committed by seanpedrickcase

Updated Dockerfile and requirements files to create a smaller container

Dockerfile CHANGED
@@ -1,43 +1,48 @@
  # First stage: build dependencies
- FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
+ FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm AS builder

  # Optional - install Lambda web adapter in case you want to run with an AWS Lambda function URL
  # COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.3 /lambda-adapter /opt/extensions/lambda-adapter

- # Install wget
- RUN apt-get update && \
-     apt-get install -y wget && \
-     apt-get install -y curl && \
-     apt-get clean && rm -rf /var/lib/apt/lists/*
+ # Update apt
+ RUN apt-get update && rm -rf /var/lib/apt/lists/*

  # Create a directory for the model
- RUN mkdir /model
+ RUN mkdir -p /model /model/minilm /install

  WORKDIR /src

- COPY requirements.txt .
+ COPY requirements_aws.txt .

- RUN pip install --no-cache-dir -r requirements.txt
-
- # Gradio needs to be installed after due to conflict with spacy in requirements
- RUN pip install --no-cache-dir gradio==4.37.2
+ RUN pip install torch==2.4.0+cpu --target=/install --index-url https://download.pytorch.org/whl/cpu \
+     && pip install --no-cache-dir --target=/install sentence-transformers==3.0.1 --no-deps \
+     && pip install --no-cache-dir --target=/install -r requirements_aws.txt \
+     && pip install --no-cache-dir --target=/install gradio==4.41.0

- # Download the BGE embedding model during the build process. Create a directory for the model and download specific files using huggingface_hub
- RUN mkdir -p /model/minilm
+ # Add /install to the PYTHONPATH
+ ENV PYTHONPATH="/install:${PYTHONPATH}"
+
+ # Download the embedding model during the build process. Create a directory for the model and download specific files using huggingface_hub
  COPY download_model.py /src/download_model.py
  RUN python /src/download_model.py

+ # Stage 2: Final runtime image
+ FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
+
  # Set up a new user named "user" with user ID 1000
  RUN useradd -m -u 1000 user

+ # Copy installed packages from builder stage
+ COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
+
  # Change ownership of /home/user directory
  RUN chown -R user:user /home/user

- EXPOSE 7860
-
  # Make output folder
- RUN mkdir -p /home/user/app/output && chown -R user:user /home/user/app/output
- RUN mkdir -p /home/user/.cache/huggingface/hub && chown -R user:user /home/user/.cache/huggingface/hub
+ RUN mkdir -p /home/user/app/output && mkdir -p /home/user/.cache/huggingface/hub && chown -R user:user /home/user
+
+ # Copy models from the builder stage
+ COPY --from=builder /model/minilm /home/user/app/model/minilm

  # Switch to the "user" user
  USER user
@@ -54,7 +59,6 @@ ENV HOME=/home/user \
      GRADIO_SERVER_PORT=7860 \
      GRADIO_THEME=huggingface \
      AWS_STS_REGIONAL_ENDPOINT=regional \
-     #GRADIO_ROOT_PATH=/data-text-search \
      SYSTEM=spaces

  # Set the working directory to the user's home directory
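Note that download_model.py is referenced by both stages but is not part of this diff. A minimal sketch of what such a script might look like, assuming huggingface_hub is available in the builder stage and that the MiniLM sentence-transformer is the model being cached to /model/minilm (both inferred from the paths above, not from the actual file):

# Hypothetical sketch only -- the real download_model.py is not shown in this commit.
from huggingface_hub import snapshot_download

# Assumed model ID, inferred from the /model/minilm target directory.
snapshot_download(
    repo_id="sentence-transformers/all-MiniLM-L6-v2",
    local_dir="/model/minilm",
)

Whatever the real script downloads, baking the model into the builder stage and copying only /model/minilm into the runtime image keeps the Hugging Face cache out of the final layer, which is part of how this commit shrinks the container.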
requirements_aws.txt ADDED
@@ -0,0 +1,8 @@
+ pandas==2.2.2
+ polars==0.20.3
+ pyarrow==14.0.2
+ openpyxl==3.1.3
+ spacy==3.7.5
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
+ lxml==5.2.2
+ boto3==1.34.158
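These pins are the slimmer AWS runtime set; torch, sentence-transformers and gradio are installed separately in the Dockerfile's builder stage. A hypothetical smoke test for an image built from these pins (not part of the repo) could be:

# Hypothetical smoke test -- not part of this commit.
import pandas, polars, pyarrow, openpyxl, lxml, boto3
import spacy

nlp = spacy.load("en_core_web_sm")  # installed from the direct release URL above
print(pandas.__version__, spacy.__version__, boto3.__version__)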
search_funcs/convert_files_to_parquet.py DELETED
@@ -1,33 +0,0 @@
- # %%
- import pandas as pd
- import csv
-
- # %%
- # Define your file paths
- file_dir = "../"
- extracted_file_path = file_dir + "2022_08_case_notes.txt"
- parquet_file_path = file_dir + "2022_08_case_notes.parquet"
-
- # %%
- # Read the TXT file using the csv module and convert to DataFrame
- csv.field_size_limit(1000000)  # set to a higher value
-
- data_list = []
- with open(extracted_file_path, mode='r', encoding='iso-8859-1') as file:
-     csv_reader = csv.reader(file, delimiter=',')  # Change the delimiter if needed
-     for row in csv_reader:
-         data_list.append(row)
-
- # Filter rows that have the same number of columns as the header
- header = data_list[0]
- filtered_data = [row for row in data_list if len(row) == len(header)]
-
- # Convert list of rows to DataFrame
- casenotes = pd.DataFrame(filtered_data[1:], columns=header)  # Assuming first row is header
-
- print(casenotes.head())  # Display the first few rows of the DataFrame
-
- # %%
- casenotes.to_parquet(parquet_file_path)
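The deleted helper filtered malformed rows by hand before writing Parquet. For reference, a roughly equivalent and much shorter version (hypothetical, not added by this commit) using pandas directly, with the same file names as the deleted script, would be:

# Hypothetical equivalent of the deleted script -- not part of this commit.
import pandas as pd

file_dir = "../"
casenotes = pd.read_csv(
    file_dir + "2022_08_case_notes.txt",
    encoding="iso-8859-1",
    engine="python",
    on_bad_lines="skip",  # drop rows whose column count does not match the header
)
casenotes.to_parquet(file_dir + "2022_08_case_notes.parquet")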