Merge pull request #11 from gperdrizet/dev
Files changed:

- .github/workflows/python_ci.yml  +1 -1
- .gitignore  +2 -1
- README.md  +3 -5
- configuration.py  +11 -11
- functions/job_call.py  +7 -8
- functions/writer_agent.py  +1 -1
- inference_endpoints/deepseekR1-qwen-32B.py  +0 -74
- inference_endpoints/llama3-1-8B-instruct.py  +0 -76
- inference_endpoints/qwen2-5-coder-14B-instruct.py  +0 -74
- requirements.txt  +0 -1
.github/workflows/python_ci.yml
CHANGED

@@ -25,7 +25,7 @@ jobs:
       - name: Test with unittest
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-
+         ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          python -m unittest tests/test_gradio.py
          python -m unittest tests/test_linkedin_resume.py
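Note: this change only exposes the new secret to the unittest step as an ordinary environment variable. A minimal, hypothetical sketch of how a test could confirm the key is available (this test class is illustrative, not part of the repo):

```python
import os
import unittest


class TestAnthropicKeyAvailable(unittest.TestCase):
    """Illustrative check that the CI step injected the secret."""

    def test_key_is_present(self):
        # The workflow maps secrets.ANTHROPIC_API_KEY into the environment,
        # so any test that calls the Anthropic API can read it from os.environ.
        self.assertTrue(os.environ.get("ANTHROPIC_API_KEY"))


if __name__ == "__main__":
    unittest.main()
```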
.gitignore
CHANGED

@@ -2,4 +2,5 @@ __pycache__
 .vscode
 .venv
 .env
-data
+data
+inference_endopints
README.md
CHANGED

@@ -2,7 +2,7 @@
 
 [](https://github.com/gperdrizet/resumate/actions/workflows/python_ci.yml)[](https://github.com/gperdrizet/resumate/actions/workflows/publish_hf_space.yml)[](https://github.com/gperdrizet/resumate/actions/workflows/codespaces/create_codespaces_prebuilds)
 
-Resumate is a simple web app that helps you generate a tailored resume for a specific job post. It collects your LinkedIn profile (PDF export), GitHub profile URL, and the job post text, then processes this information to help you create a resume that matches the job requirements.
+Resumate is a simple web app that helps you generate a tailored resume for a specific job post using the Antropic API. It collects your LinkedIn profile (PDF export), GitHub profile URL, and the job post text, then processes this information to help you create a resume that matches the job requirements.
 
 
 ## Features

@@ -20,8 +20,7 @@
 
 2. **Add API keys as secrets**
    - In your fork, go to **Settings > Secrets and variables > Codespaces**.
-   - Add
-   - These will be available as environment variables in your Codespace.
+   - Add `ANTHROPIC_API_KEY` with your API key as value.
 
 3. **Start a Codespace**
    - Click the "Code" button on your fork and select "Open with Codespaces".

@@ -48,8 +47,7 @@
 4. **Set your API keys as environment variables:**
    Add your API keys to `.venv/bin/activate`:
    ```bash
-   export
-   export MODAL_API_KEY=your_modal_api_key
+   export ANTHROPIC_API_KEY=your_anthropic_api_key
    ```
 5. **Activate the virtual environment:**
    ```bash
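As a quick sanity check after editing `.venv/bin/activate`, something like the following (hypothetical, not part of the repo) confirms the key is visible to Python:

```python
import os

# Prints a masked confirmation rather than the key itself.
key = os.environ.get("ANTHROPIC_API_KEY")
print("ANTHROPIC_API_KEY set" if key else "ANTHROPIC_API_KEY is missing")
```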
configuration.py
CHANGED

@@ -6,30 +6,30 @@ from smolagents import OpenAIServerModel
 
 DEFAULT_GITHUB_PROFILE = "https://github.com/gperdrizet"
 
-# AGENT_MODEL = OpenAIServerModel(
-#     model_id="gpt-4.1",
-#     max_tokens=8000
-# )
-
 # Will be used for single shot summarization with no-frills prompting
 # (e.g. job call extraction). It needs to output JSON formatted text,
 # but this task does not require any complex reasoning or planning.
-
-    base_url="https://
-    api_key=os.environ[
+SUMMARIZER_CLIENT = OpenAI(
+    base_url="https://api.anthropic.com/v1/",
+    api_key=os.environ["ANTHROPIC_API_KEY"]
 )
 
+SUMMARIZER_MODEL = "claude-3-5-haiku-20241022"
+
 # Will be used for resume resume writing agent via HuggingFace smolagents
 # Including selection of relevant projects from GitHub profile
 #
 # Notes:
 # - DeepSeek-R1-Distill-Qwen-32B does not seem to work well with smolagents,
 #   has trouble correctly formatting responses as code.
+# - Qwen2.5-Coder-14B-Instruct works OK, but is not great at markdown formatting
+#   and tends to get some details wrong.
+# - Claude-3-5-Haiku is the best model for this task so far.
 
 AGENT_MODEL = OpenAIServerModel(
-    model_id="
-    api_base="https://
-    api_key=os.environ["
+    model_id="claude-3-5-haiku-20241022", # Same as HF model string
+    api_base="https://api.anthropic.com/v1/",
+    api_key=os.environ["ANTHROPIC_API_KEY"],
 )
 
 INSTRUCTIONS = """
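For context, the new summarizer client points the standard `openai` SDK at Anthropic's OpenAI-compatible endpoint, so `SUMMARIZER_CLIENT` is used like any other chat-completions client. A minimal sketch under that assumption (the prompt text is illustrative, not from the repo):

```python
import os
from openai import OpenAI

# Same wiring as the new configuration.py: the Anthropic OpenAI-compatible
# endpoint is addressed with the regular OpenAI client.
client = OpenAI(
    base_url="https://api.anthropic.com/v1/",
    api_key=os.environ["ANTHROPIC_API_KEY"],
)

# Illustrative one-shot call; the real prompts live in configuration.py.
response = client.chat.completions.create(
    model="claude-3-5-haiku-20241022",
    messages=[{"role": "user", "content": "Summarize this job post: ..."}],
)
print(response.choices[0].message.content)
```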
functions/job_call.py
CHANGED

@@ -4,7 +4,11 @@ import json
 import logging
 from pathlib import Path
 from datetime import datetime
-from configuration import
+from configuration import (
+    JOB_CALL_EXTRACTION_PROMPT,
+    SUMMARIZER_MODEL,
+    SUMMARIZER_CLIENT
+)
 
 # pylint: disable=broad-exception-caught
 

@@ -57,11 +61,6 @@ def summarize_job_call(job_call: str) -> str:
 
     logger.info("Summarizing job call (%d characters)", len(job_call))
 
-    # Default to first available model
-    model = SUMMARIZER_MODEL.models.list().data[0]
-    model_id = model.id
-    print(f"Using model: {model_id}")
-
     messages = [
         {
             'role': 'system',

@@ -70,12 +69,12 @@ def summarize_job_call(job_call: str) -> str:
     ]
 
     completion_args = {
-        'model':
+        'model': SUMMARIZER_MODEL,
         'messages': messages,
     }
 
     try:
-        response =
+        response = SUMMARIZER_CLIENT.chat.completions.create(**completion_args)
 
     except Exception as e:
         response = None
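A hedged usage sketch of the updated summarizer path; the sample job text is made up, and the behavior on failure is an assumption based on the `response = None` fallback shown above:

```python
from functions.job_call import summarize_job_call

# Illustrative input only; real job posts are pasted into the Gradio UI.
job_post = "Machine learning engineer, remote, Python and LLM experience required."

summary = summarize_job_call(job_post)

# If the Anthropic call raises, the function sets response to None, so the
# caller should be prepared for an empty result (assumption, see diff above).
if summary:
    print(summary)
else:
    print("Summarization failed; check ANTHROPIC_API_KEY.")
```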
functions/writer_agent.py
CHANGED

@@ -30,7 +30,7 @@ def write_resume(content: str, user_instructions: str = None, job_summary: str =
     agent = CodeAgent(
         model=AGENT_MODEL,
         tools=[],
-        additional_authorized_imports=['json'],
+        additional_authorized_imports=['json', 'pandas'],
         name="writer_agent",
         verbosity_level=5,
         max_steps=20,
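`additional_authorized_imports` whitelists modules that the smolagents `CodeAgent` may import in the Python it generates, so adding `'pandas'` lets the writer agent work with tabular data. A minimal sketch of the idea (the model wiring and task string are placeholders, not the repo's values):

```python
from smolagents import CodeAgent, OpenAIServerModel

# Placeholder model wiring; the real AGENT_MODEL comes from configuration.py.
model = OpenAIServerModel(
    model_id="claude-3-5-haiku-20241022",
    api_base="https://api.anthropic.com/v1/",
    api_key="sk-...",  # placeholder, use ANTHROPIC_API_KEY in practice
)

# With 'pandas' authorized, code the agent writes can import pandas without
# being rejected by the sandboxed Python executor.
agent = CodeAgent(
    model=model,
    tools=[],
    additional_authorized_imports=['json', 'pandas'],
)

# agent.run("Organize these projects into a table and pick the three most relevant.")
```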
inference_endpoints/deepseekR1-qwen-32B.py
DELETED

@@ -1,74 +0,0 @@
-"""Run OpenAI-compatible LLM inference with DeepSeek-V3 and vLLM
-Usage: modal deploy deepseek-v3.py"""
-
-## Set up the container image
-import os
-import subprocess
-import modal
-
-vllm_image = (
-    modal.Image.debian_slim(python_version="3.12")
-    .pip_install(
-        "vllm==0.7.2",
-        "huggingface_hub[hf_transfer]==0.26.2",
-        "flashinfer-python==0.2.0.post2", # pinning, very unstable
-        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
-    )
-    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) # faster model transfers
-)
-
-# Turn on V1 backend engine. Needs CUDA >=8, excluding 8.6 and 8.9.
-vllm_image = vllm_image.env({"VLLM_USE_V1": "1"})
-
-# Download the model weights
-MODELS_DIR = "/models"
-MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
-
-# Cache model weights
-hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
-vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
-
-
-## Build a vLLM engine and serve it
-app = modal.App("deepseek-R1-qwen-32B")
-
-N_GPU = 2
-MINUTES = 60 # seconds
-VLLM_PORT = 8000
-
-
-@app.function(
-    image=vllm_image,
-    gpu=f"H100:{N_GPU}",
-    scaledown_window=15 * MINUTES, # how long should we stay up with no requests?
-    timeout=10 * MINUTES, # how long should we wait for container start?
-    volumes={
-        "/root/.cache/huggingface": hf_cache_vol,
-        "/root/.cache/vllm": vllm_cache_vol,
-    },
-    secrets=[modal.Secret.from_name("resumate_key")]
-)
-
-@modal.concurrent(
-    max_inputs=100
-) # how many requests can one replica handle? tune carefully!
-
-@modal.web_server(port=VLLM_PORT, startup_timeout=15 * MINUTES)
-def serve():
-    """Run vLLM inference server with DeepSeek model."""
-
-    cmd = [
-        "vllm",
-        "serve",
-        "--uvicorn-log-level=info",
-        MODEL_NAME,
-        "--served-model-name", MODEL_NAME,
-        "--tensor-parallel-size", "2",
-        "--max-model-len", "32768",
-        "--host", "0.0.0.0",
-        "--port", str(VLLM_PORT),
-        "--api-key", os.environ["MODAL_TOKEN_SECRET"],
-        "--enforce-eager"
-    ]
-
-    subprocess.Popen(" ".join(cmd), shell=True)
inference_endpoints/llama3-1-8B-instruct.py
DELETED

@@ -1,76 +0,0 @@
-"""Run OpenAI-compatible LLM text summarization with LLaMA 3.1-8B and vLLM
-Usage: modal deploy vllm_summarization_server.py"""
-
-import os
-import subprocess
-import modal
-
-vllm_image = (
-    modal.Image.debian_slim(python_version="3.12")
-    .pip_install(
-        "vllm==0.7.2",
-        "huggingface_hub[hf_transfer]==0.26.2",
-        "flashinfer-python==0.2.0.post2", # pinning, very unstable
-        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
-    )
-    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) # faster model transfers
-)
-
-
-# Turn on V1 backend engine. Note: NVIDIA T4 does not seem to support
-# this due to CUDA incompatibility. Needs CUDA >=8, excluding 8.6 and 8.9.
-# For V1 backend use L40S
-vllm_image = vllm_image.env({"VLLM_USE_V1": "1"})
-
-# Download the model weights
-MODELS_DIR = "/llamas"
-MODEL_NAME = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
-MODEL_REVISION = "a7c09948d9a632c2c840722f519672cd94af885d"
-
-# Cache model weights
-hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
-vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
-
-
-## Build a vLLM engine and serve it
-app = modal.App("llama-3-1-8B-instruct")
-
-N_GPU = 1
-MINUTES = 60 # seconds
-VLLM_PORT = 8000
-
-
-@app.function(
-    image=vllm_image,
-    gpu=f"L40S:{N_GPU}",
-    scaledown_window=15 * MINUTES, # how long should we stay up with no requests?
-    timeout=10 * MINUTES, # how long should we wait for container start?
-    volumes={
-        "/root/.cache/huggingface": hf_cache_vol,
-        "/root/.cache/vllm": vllm_cache_vol,
-    },
-    secrets=[modal.Secret.from_name("resumate_key")]
-)
-
-@modal.concurrent(
-    max_inputs=100
-) # how many requests can one replica handle? tune carefully!
-
-@modal.web_server(port=VLLM_PORT, startup_timeout=5 * MINUTES)
-def serve():
-    """Serve the LLaMA 3.1-8B Instruct model with vLLM."""
-
-    cmd = [
-        "vllm",
-        "serve",
-        "--uvicorn-log-level=info",
-        MODEL_NAME,
-        "--served-model-name", MODEL_NAME,
-        "--revision", MODEL_REVISION,
-        "--host", "0.0.0.0",
-        "--port", str(VLLM_PORT),
-        "--api-key", os.environ["MODAL_TOKEN_SECRET"],
-        "--enforce-eager"
-    ]
-
-    subprocess.Popen(" ".join(cmd), shell=True)
inference_endpoints/qwen2-5-coder-14B-instruct.py
DELETED

@@ -1,74 +0,0 @@
-"""Run OpenAI-compatible LLM inference with DeepSeek-V3 and vLLM
-Usage: modal deploy deepseek-v3.py"""
-
-## Set up the container image
-import os
-import subprocess
-import modal
-
-vllm_image = (
-    modal.Image.debian_slim(python_version="3.12")
-    .pip_install(
-        "vllm==0.7.2",
-        "huggingface_hub[hf_transfer]==0.26.2",
-        "flashinfer-python==0.2.0.post2", # pinning, very unstable
-        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
-    )
-    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) # faster model transfers
-)
-
-# Turn on V1 backend engine. Needs CUDA >=8, excluding 8.6 and 8.9.
-vllm_image = vllm_image.env({"VLLM_USE_V1": "1"})
-
-# Download the model weights
-MODELS_DIR = "/models"
-MODEL_NAME = "Qwen/Qwen2.5-Coder-14B-Instruct"
-
-# Cache model weights
-hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
-vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
-
-
-## Build a vLLM engine and serve it
-app = modal.App("qwen2-5-coder-14B-instruct")
-
-N_GPU = 1
-MINUTES = 60 # seconds
-VLLM_PORT = 8000
-
-
-@app.function(
-    image=vllm_image,
-    gpu=f"L40S:{N_GPU}",
-    scaledown_window=15 * MINUTES, # how long should we stay up with no requests?
-    timeout=10 * MINUTES, # how long should we wait for container start?
-    volumes={
-        "/root/.cache/huggingface": hf_cache_vol,
-        "/root/.cache/vllm": vllm_cache_vol,
-    },
-    secrets=[modal.Secret.from_name("resumate_key")]
-)
-
-@modal.concurrent(
-    max_inputs=100
-) # how many requests can one replica handle? tune carefully!
-
-@modal.web_server(port=VLLM_PORT, startup_timeout=15 * MINUTES)
-def serve():
-    """Run vLLM inference server with DeepSeek model."""
-
-    cmd = [
-        "vllm",
-        "serve",
-        "--uvicorn-log-level=info",
-        MODEL_NAME,
-        "--served-model-name", MODEL_NAME,
-        "--tensor-parallel-size", "2",
-        "--max-model-len", "16000",
-        "--host", "0.0.0.0",
-        "--port", str(VLLM_PORT),
-        "--api-key", os.environ["MODAL_TOKEN_SECRET"],
-        "--enforce-eager"
-    ]
-
-    subprocess.Popen(" ".join(cmd), shell=True)
requirements.txt
CHANGED

@@ -1,5 +1,4 @@
 gradio==5.35.0
-modal
 openai
 PyPDF2
 requests