luisrodriguesphd committed on
Commit
341fcdf
1 Parent(s): 2e113bf

Improve deployment issues - reqs, dockerfile, gitignore,...

Browse files
.gitignore CHANGED
@@ -10,18 +10,12 @@ conf/local/**
10
  conf/**/*credentials*
11
 
12
  # ignore everything in the following folders
13
- data/**
14
  logs/**
 
15
 
16
  # except their sub-folders
17
- !data/**/
18
  !logs/**/
19
 
20
- # also keep the example dataset
21
- !data/01_raw/*.csv
22
- !data/02_processed/**
23
- !data/02_processed/**/**
24
-
25
  # also keep all .gitkeep files
26
  !.gitkeep
27
 
 
10
  conf/**/*credentials*
11
 
12
  # ignore everything in the following folders
 
13
  logs/**
14
+ data/03_indexed/**
15
 
16
  # except their sub-folders
 
17
  !logs/**/
18
 
 
 
 
 
 
19
  # also keep all .gitkeep files
20
  !.gitkeep
21
 
Dockerfile CHANGED
@@ -15,6 +15,9 @@ ENV REQUIREMENTS_PATH=$REQUIREMENTS_PATH
15
  ARG HF_HOME=".cache/huggingface/hub"
16
  ENV HF_HOME=$HF_HOME
17
 
 
 
 
18
  ARG ENTRYPOINT_PATH="./entrypoint.sh"
19
  ENV ENTRYPOINT_PATH=$ENTRYPOINT_PATH
20
 
@@ -26,10 +29,10 @@ RUN mkdir -p /code/&& \
26
  WORKDIR /code
27
 
28
  # Create a virtual environment in the directory /venv
29
- RUN python -m venv venv
30
 
31
  # Activate the virtual environment by adding it to the PATH environment variable
32
- ENV PATH="/venv/bin:$PATH"
33
 
34
  RUN apt update && \
35
  python -m ensurepip --upgrade && \
@@ -41,13 +44,18 @@ RUN pip install --no-cache-dir -r ./requirements.txt
41
 
42
  RUN mkdir -p $HF_HOME && \
43
  chmod -R 777 $HF_HOME && \
 
44
  export TRANSFORMERS_CACHE=$HF_HOME && \
45
- export HF_HOME=$HF_HOME
 
 
46
 
47
  COPY . .
48
 
49
  RUN pip install -e . && \
50
- python src/resume_worth/pipelines/data_indexing/pipeline.py && \
 
 
51
  chmod +x $ENTRYPOINT_PATH
52
 
53
  ENTRYPOINT $ENTRYPOINT_PATH
 
15
  ARG HF_HOME=".cache/huggingface/hub"
16
  ENV HF_HOME=$HF_HOME
17
 
18
+ ARG MPLCONFIGDIR=".config/matplotlib"
19
+ ENV MPLCONFIGDIR=$MPLCONFIGDIR
20
+
21
  ARG ENTRYPOINT_PATH="./entrypoint.sh"
22
  ENV ENTRYPOINT_PATH=$ENTRYPOINT_PATH
23
 
 
29
  WORKDIR /code
30
 
31
  # Create a virtual environment in the directory /code/.venv
32
+ RUN python -m venv .venv
33
 
34
  # Activate the virtual environment by adding it to the PATH environment variable
35
+ ENV PATH="/code/.venv/bin:$PATH"
36
 
37
  RUN apt update && \
38
  python -m ensurepip --upgrade && \
 
44
 
45
  RUN mkdir -p $HF_HOME && \
46
  chmod -R 777 $HF_HOME && \
47
+ export HF_HOME=$HF_HOME && \
48
  export TRANSFORMERS_CACHE=$HF_HOME && \
49
+ mkdir -p $MPLCONFIGDIR && \
50
+ chmod -R 777 $MPLCONFIGDIR && \
51
+ export MPLCONFIGDIR=$MPLCONFIGDIR
52
 
53
  COPY . .
54
 
55
  RUN pip install -e . && \
56
+ python src/resume_worth/pipelines/data_indexing/pipeline.py
57
+
58
+ RUN python src/resume_worth/pipelines/text_generation/pipeline.py && \
59
  chmod +x $ENTRYPOINT_PATH
60
 
61
  ENTRYPOINT $ENTRYPOINT_PATH
data/04_prompts/prompt_template_for_explaning_why_is_a_good_fit.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": null,
3
+ "input_variables": [
4
+ "job",
5
+ "resume"
6
+ ],
7
+ "input_types": {},
8
+ "output_parser": null,
9
+ "partial_variables": {},
10
+ "metadata": null,
11
+ "tags": null,
12
+ "template": "<|im_start|>user\nExplain why the following RESUME is a good match for the presented JOB VACANCY.\nKeep your answer grounded in the facts of the RESUME and JOB VACANCY.\nWrite a maximum of three points in clear and concise language.\n\nRESUME: \n{resume}\n\nJOB VACANCY: \n{job}<|im_end|>\n<|im_start|>assistant\n",
13
+ "template_format": "f-string",
14
+ "validate_template": false,
15
+ "_type": "prompt"
16
+ }
requirements.in CHANGED
@@ -10,6 +10,10 @@ langchain-community
10
  sentence-transformers
11
  chromadb
12
 
 
 
 
 
13
  # to build the user interface
14
  gradio
15
 
 
10
  sentence-transformers
11
  chromadb
12
 
13
+ # to generate text
14
+ transformers
15
+ torch
16
+
17
  # to build the user interface
18
  gradio
19
 
requirements.txt CHANGED
@@ -6,7 +6,7 @@
6
  #
7
  aiofiles==23.2.1
8
  # via gradio
9
- aiohttp==3.9.3
10
  # via
11
  # langchain
12
  # langchain-community
@@ -73,7 +73,7 @@ fastapi==0.110.1
73
  # gradio
74
  ffmpy==0.3.2
75
  # via gradio
76
- filelock==3.13.3
77
  # via
78
  # huggingface-hub
79
  # torch
@@ -95,9 +95,9 @@ google-auth==2.29.0
95
  # via kubernetes
96
  googleapis-common-protos==1.63.0
97
  # via opentelemetry-exporter-otlp-proto-grpc
98
- gradio==4.25.0
99
  # via -r requirements.in
100
- gradio-client==0.15.0
101
  # via gradio
102
  grpcio==1.62.1
103
  # via
@@ -124,7 +124,7 @@ huggingface-hub==0.22.2
124
  # transformers
125
  humanfriendly==10.0
126
  # via coloredlogs
127
- idna==3.6
128
  # via
129
  # anyio
130
  # httpx
@@ -141,7 +141,7 @@ jinja2==3.1.3
141
  # altair
142
  # gradio
143
  # torch
144
- joblib==1.3.2
145
  # via scikit-learn
146
  jsonpatch==1.33
147
  # via
@@ -157,20 +157,20 @@ kiwisolver==1.4.5
157
  # via matplotlib
158
  kubernetes==29.0.0
159
  # via chromadb
160
- langchain==0.1.14
161
  # via -r requirements.in
162
- langchain-community==0.0.31
163
  # via
164
  # -r requirements.in
165
  # langchain
166
- langchain-core==0.1.40
167
  # via
168
  # langchain
169
  # langchain-community
170
  # langchain-text-splitters
171
  langchain-text-splitters==0.0.1
172
  # via langchain
173
- langsmith==0.1.40
174
  # via
175
  # langchain
176
  # langchain-community
@@ -221,7 +221,7 @@ oauthlib==3.2.2
221
  # via
222
  # kubernetes
223
  # requests-oauthlib
224
- onnxruntime==1.17.1
225
  # via chromadb
226
  opentelemetry-api==1.24.0
227
  # via
@@ -279,7 +279,7 @@ packaging==23.2
279
  # matplotlib
280
  # onnxruntime
281
  # transformers
282
- pandas==2.2.1
283
  # via
284
  # -r requirements.in
285
  # altair
@@ -296,7 +296,7 @@ protobuf==4.25.3
296
  # googleapis-common-protos
297
  # onnxruntime
298
  # opentelemetry-proto
299
- pulsar-client==3.4.0
300
  # via chromadb
301
  pyasn1==0.6.0
302
  # via
@@ -304,7 +304,7 @@ pyasn1==0.6.0
304
  # rsa
305
  pyasn1-modules==0.4.0
306
  # via google-auth
307
- pydantic==2.6.4
308
  # via
309
  # chromadb
310
  # fastapi
@@ -312,7 +312,7 @@ pydantic==2.6.4
312
  # langchain
313
  # langchain-core
314
  # langsmith
315
- pydantic-core==2.16.3
316
  # via pydantic
317
  pydub==0.25.1
318
  # via gradio
@@ -376,11 +376,11 @@ rpds-py==0.18.0
376
  # referencing
377
  rsa==4.9
378
  # via google-auth
379
- ruff==0.3.5
380
  # via gradio
381
  safetensors==0.4.2
382
  # via transformers
383
- scikit-learn==1.4.1.post1
384
  # via sentence-transformers
385
  scipy==1.13.0
386
  # via
@@ -428,7 +428,9 @@ tomlkit==0.12.0
428
  toolz==0.12.1
429
  # via altair
430
  torch==2.2.2
431
- # via sentence-transformers
 
 
432
  tqdm==4.66.2
433
  # via
434
  # chromadb
@@ -436,8 +438,10 @@ tqdm==4.66.2
436
  # sentence-transformers
437
  # transformers
438
  transformers==4.39.3
439
- # via sentence-transformers
440
- typer[all]==0.12.1
 
 
441
  # via
442
  # chromadb
443
  # gradio
 
6
  #
7
  aiofiles==23.2.1
8
  # via gradio
9
+ aiohttp==3.9.4
10
  # via
11
  # langchain
12
  # langchain-community
 
73
  # gradio
74
  ffmpy==0.3.2
75
  # via gradio
76
+ filelock==3.13.4
77
  # via
78
  # huggingface-hub
79
  # torch
 
95
  # via kubernetes
96
  googleapis-common-protos==1.63.0
97
  # via opentelemetry-exporter-otlp-proto-grpc
98
+ gradio==4.26.0
99
  # via -r requirements.in
100
+ gradio-client==0.15.1
101
  # via gradio
102
  grpcio==1.62.1
103
  # via
 
124
  # transformers
125
  humanfriendly==10.0
126
  # via coloredlogs
127
+ idna==3.7
128
  # via
129
  # anyio
130
  # httpx
 
141
  # altair
142
  # gradio
143
  # torch
144
+ joblib==1.4.0
145
  # via scikit-learn
146
  jsonpatch==1.33
147
  # via
 
157
  # via matplotlib
158
  kubernetes==29.0.0
159
  # via chromadb
160
+ langchain==0.1.16
161
  # via -r requirements.in
162
+ langchain-community==0.0.32
163
  # via
164
  # -r requirements.in
165
  # langchain
166
+ langchain-core==0.1.42
167
  # via
168
  # langchain
169
  # langchain-community
170
  # langchain-text-splitters
171
  langchain-text-splitters==0.0.1
172
  # via langchain
173
+ langsmith==0.1.47
174
  # via
175
  # langchain
176
  # langchain-community
 
221
  # via
222
  # kubernetes
223
  # requests-oauthlib
224
+ onnxruntime==1.17.3
225
  # via chromadb
226
  opentelemetry-api==1.24.0
227
  # via
 
279
  # matplotlib
280
  # onnxruntime
281
  # transformers
282
+ pandas==2.2.2
283
  # via
284
  # -r requirements.in
285
  # altair
 
296
  # googleapis-common-protos
297
  # onnxruntime
298
  # opentelemetry-proto
299
+ pulsar-client==3.5.0
300
  # via chromadb
301
  pyasn1==0.6.0
302
  # via
 
304
  # rsa
305
  pyasn1-modules==0.4.0
306
  # via google-auth
307
+ pydantic==2.7.0
308
  # via
309
  # chromadb
310
  # fastapi
 
312
  # langchain
313
  # langchain-core
314
  # langsmith
315
+ pydantic-core==2.18.1
316
  # via pydantic
317
  pydub==0.25.1
318
  # via gradio
 
376
  # referencing
377
  rsa==4.9
378
  # via google-auth
379
+ ruff==0.3.7
380
  # via gradio
381
  safetensors==0.4.2
382
  # via transformers
383
+ scikit-learn==1.4.2
384
  # via sentence-transformers
385
  scipy==1.13.0
386
  # via
 
428
  toolz==0.12.1
429
  # via altair
430
  torch==2.2.2
431
+ # via
432
+ # -r requirements.in
433
+ # sentence-transformers
434
  tqdm==4.66.2
435
  # via
436
  # chromadb
 
438
  # sentence-transformers
439
  # transformers
440
  transformers==4.39.3
441
+ # via
442
+ # -r requirements.in
443
+ # sentence-transformers
444
+ typer[all]==0.12.3
445
  # via
446
  # chromadb
447
  # gradio
src/app/app.py CHANGED
@@ -54,7 +54,7 @@ def run():
54
  )
55
 
56
  # Use share=True to create a public link to share. This share link expires in 72 hours.
57
- app.launch(server_name=app_config['host'], server_port=app_config['port'], max_threads=8)
58
 
59
 
60
  if __name__ == "__main__":
 
54
  )
55
 
56
  # Use share=True to create a public link to share. This share link expires in 72 hours.
57
+ app.launch(server_name=app_config['host'], server_port=app_config['port'])
58
 
59
 
60
  if __name__ == "__main__":