Commit b5c921b · 1 parent 83da7d5
Add project files for HF Space
Files changed:
- Dockerfile +9 -11
- docker-compose.yml +0 -40
- requirements.txt +1 -3
- src/config.py +1 -3
- src/services/analysis_service.py +71 -46
Dockerfile
CHANGED

@@ -7,9 +7,8 @@ WORKDIR /app
 # Install essential system-level dependencies
 # - ffmpeg: Required by the pydub library for audio processing
 # - git: Required by some pip packages for installation from version control
-RUN apt-get update && apt-get install -y \
+RUN apt-get update && apt-get install -y --no-install-recommends \
     ffmpeg \
-    git \
     && rm -rf /var/lib/apt/lists/*
 
 # Copy the dependency file first to leverage Docker's layer caching
@@ -21,16 +20,15 @@ RUN pip install --no-cache-dir -r requirements.txt
 # Copy the application source code into the container
 # force a re-installation of all packages
 COPY src/ src/
-
+
+# Set environment variables to control where Hugging Face downloads models
+# inside the container, which can be useful on some platforms.
+ENV HF_HOME=/app/huggingface_cache
+ENV TRANSFORMERS_CACHE=/app/huggingface_cache
 
 # Expose the port Gradio will run on, making it accessible to the host
 EXPOSE 7860
 
-#
-
-
-ENV XDG_CACHE_HOME=/tmp
-
-# Define the default command to run when the container starts
-# Uses Python's module flag '-m' for correct package path resolution
-CMD ollama serve & sleep 5 && python -m src.app
+# Define the default command to run the application.
+# The server_name="0.0.0.0" is crucial for it to be accessible inside Docker.
+CMD ["python", "-m", "src.app"]
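The new CMD drops the Ollama sidecar and simply runs the Gradio app as a module. The comment about server_name="0.0.0.0" refers to how src/app.py launches Gradio; that file is not part of this commit, so the snippet below is only a sketch under that assumption (the handler and interface names are placeholders):

import gradio as gr

def analyze(audio_path: str) -> str:
    # Placeholder handler; the real app wires this to the transcription/analysis services.
    return f"Received: {audio_path}"

demo = gr.Interface(fn=analyze, inputs=gr.Audio(type="filepath"), outputs="text")

# Binding to 0.0.0.0 instead of the default 127.0.0.1 is what makes the app
# reachable from outside the container on the EXPOSEd port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)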
docker-compose.yml
DELETED

@@ -1,40 +0,0 @@
-version: '3.8'
-
-services:
-  # Service 1: Our Voice Analysis Application
-  app:
-    # Build the image using the Dockerfile in the current directory
-    build: .
-    # Expose the Gradio port
-    ports:
-      - "7860:7860"
-    # Set the environment variable for the Ollama host
-    environment:
-      - OLLAMA_HOST=ollama
-    # Make this service depend on the 'ollama' service
-    # This ensures that Ollama starts up before our application tries to connect to it
-    depends_on:
-      - ollama
-
-  # Service 2: The Ollama Server
-  ollama:
-    # Use the official Ollama Docker image
-    image: ollama/ollama
-    # Expose the Ollama API port so our 'app' service can reach it.
-    ports:
-      - "11434:11434"
-    # Set the environment variable to enable GPU support
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              count: all
-              capabilities: [gpu]
-    # Mount a volume to persist Ollama data
-    # This allows Ollama to retain its state and models across container restarts
-    volumes:
-      - ollama_data:/root/.ollama
-
-volumes:
-  ollama_data:
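For context on what is being removed: on this compose network the app container reached the Ollama API by service name, which is what OLLAMA_HOST=ollama and the published 11434 port were for, and it matches the Ollama client code deleted from the analysis service further down. A minimal sketch of that old wiring, assuming the ollama Python package this commit also drops from requirements.txt:

import os
import ollama

# Inside the compose network, "ollama" resolves to the Ollama service;
# outside of it the host falls back to localhost.
host = os.getenv("OLLAMA_HOST", "localhost")
client = ollama.Client(host=f"http://{host}:11434")
response = client.generate(model="llama3", prompt="Say hello.")
print(response["response"])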
requirements.txt
CHANGED

@@ -6,7 +6,5 @@ openai-whisper==20231117
 openai
 pydub==0.25.1
 python-dotenv==1.0.1
-requests==2.32.3
 pytest==8.2.2
-pytest-mock==3.14.0
-ollama==0.2.1
+pytest-mock==3.14.0
src/config.py
CHANGED

@@ -14,12 +14,10 @@ APP_DESCRIPTION = (
 
 # Model Configuration
 MODEL_PROVIDER = "local"  # Set to 'local' or 'openai'
-OLLAMA_HOST = "localhost"  # Default to localhost if not set
-OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3")  # Default model for Ollama
 
 # Local model settings (if MODEL_PROVIDER is 'local')
 LOCAL_TRANSCRIPTION_MODEL = "openai/whisper-base.en"
-LOCAL_ANALYSIS_MODEL = "
+LOCAL_ANALYSIS_MODEL = "HuggingFaceH4/zephyr-7b-beta"
 
 # OpenAI API settings (if MODEL_PROVIDER is 'openai')
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
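MODEL_PROVIDER stays as the switch between the local pipeline and the OpenAI path, while the Ollama settings disappear. The dispatch itself is not visible in this diff, so the snippet below is only a hypothetical illustration of how the two code paths that do exist in AnalysisService (_analyze for local, _analyze_openai for OpenAI) might be selected:

from src import config

def analyze(service, prompt: str) -> str:
    # Hypothetical dispatcher: both branch targets exist in AnalysisService,
    # but the selection logic is not part of this commit.
    if config.MODEL_PROVIDER == "openai":
        return service._analyze_openai(prompt)
    return service._analyze(prompt)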
src/services/analysis_service.py
CHANGED

@@ -1,7 +1,8 @@
-import ollama
+import torch
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, Pipeline
 from openai import OpenAI, OpenAIError
 from src import config
-from src.utils.exceptions import AnalysisError
+from src.utils.exceptions import AnalysisError
 from src.logging_config import logger
 
 
@@ -10,36 +11,43 @@ class AnalysisService:
     A service class for performing text analysis tasks.
     It uses Ollama for local analysis and the OpenAI API for remote analysis.
     """
+    _local_pipeline: Pipeline = None
 
-
+    @classmethod
+    def _get_local_pipeline(cls) -> Pipeline:
         """
-
-
+        Initializes and returns the local text-generation pipeline.
+        This method caches the pipeline to ensure the model is loaded only once per session.
         """
+        if cls._local_pipeline is None:
+            try:
+                model_name = config.LOCAL_ANALYSIS_MODEL
+                logger.info(f"Initializing local analysis model: {model_name}")
+
+                device = "cuda" if torch.cuda.is_available() else "cpu"
+                logger.info(f"Using device: {device} for analysis.")
 
-
-
-
-        try:
-            # Initialize the Ollama client with the determined host
-            client = ollama.Client(host=f"http://{ollama_host}:11434")
-            logger.info(f"Sending analysis request to Ollama server at {ollama_host}.")
-
-            response = client.generate(
-                model=config.OLLAMA_MODEL,
-                prompt=prompt
-            )
-
-            logger.info("Ollama analysis successful.")
-            return response['response'].strip()
+                # For 7B models on CPU, loading in a lower precision can save RAM
+                torch_dtype = torch.bfloat16 if device == "cuda" else torch.float32
 
-
-
-
-
-
-
-
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    torch_dtype=torch_dtype,
+                    trust_remote_code=True
+                )
+                tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+                cls._local_pipeline = pipeline(
+                    "text-generation",
+                    model=model,
+                    tokenizer=tokenizer,
+                    device=device
+                )
+                logger.info("Local analysis model initialized successfully.")
+            except Exception as e:
+                logger.critical(f"Failed to load local analysis model: {e}", exc_info=True)
+                raise AnalysisError("Could not initialize the local analysis model.")
+        return cls._local_pipeline
 
     def _analyze_openai(self, prompt: str) -> str:
         """
@@ -70,16 +78,35 @@
 
     def _analyze(self, prompt: str) -> str:
         """
-
+        Generates a response using the local transformers pipeline.
         """
-
-
-
-
-
-
-
-
+        try:
+            logger.info("Starting local analysis with transformers pipeline.")
+            pipe = self._get_local_pipeline()
+
+            # Zephyr and other modern chat models use a specific chat template format.
+            # The pipeline tokenizer can apply this for us.
+            messages = [{"role": "user", "content": prompt}]
+
+            # The pipeline handles the conversation templating internally.
+            outputs = pipe(
+                messages,
+                max_new_tokens=512,
+                do_sample=True,
+                temperature=0.7,
+                top_p=0.95,
+            )
+            # The response format from a text-generation pipeline is slightly different
+            response = outputs[0]["generated_text"]
+            if isinstance(response, list):
+                # For some models, the last message in the list is the response
+                response = response[-1]['content']
+
+            logger.info("Local analysis successful.")
+            return response.strip()
+        except Exception as e:
+            logger.error(f"Error during local analysis: {e}", exc_info=True)
+            raise AnalysisError("An unexpected error occurred during local analysis.")
 
     def summarize(self, text: str) -> str:
         """
@@ -146,13 +173,11 @@
         {question}
         """
 
-
-
-        #
-
-
-
-
-
-
-        return response
+        raw_response = self._analyze(prompt)
+        # The model might include the whole prompt in its response, so we clean it.
+        # We find the last instance of the user's question and take the text after it.
+        # This is a common post-processing step for text-generation pipelines.
+        split_token = f"**NEW USER QUESTION:**\n{question}"
+        if split_token in raw_response:
+            return raw_response.split(split_token)[-1].strip()
+        return raw_response
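The isinstance check in the new _analyze() exists because a text-generation pipeline returns different shapes for plain-string prompts and for chat-style message lists; in recent transformers releases the chat form returns the whole conversation with the assistant reply last. A minimal sketch of the two shapes and the extraction logic, with invented output text and no model loaded:

# Two plausible outputs[0]["generated_text"] shapes for the same request;
# the literal text here is made up for illustration.
chat_style = [{"generated_text": [
    {"role": "user", "content": "Summarize the transcript."},
    {"role": "assistant", "content": "The speakers agree on a launch date."},
]}]
string_style = [{"generated_text": "The speakers agree on a launch date."}]

def extract(outputs) -> str:
    response = outputs[0]["generated_text"]
    if isinstance(response, list):
        # Chat-style output: the assistant's reply is the last message.
        response = response[-1]["content"]
    return response.strip()

assert extract(chat_style) == extract(string_style)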
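The new tail of the question-answering method strips any echoed prompt from the model output by splitting on the question marker. A small self-contained demonstration of that cleanup, with an invented question and response:

question = "What is the speaker's main concern?"
raw_response = (
    "**NEW USER QUESTION:**\n"
    f"{question}\n"
    "The speaker is mainly concerned about response latency."
)

split_token = f"**NEW USER QUESTION:**\n{question}"
if split_token in raw_response:
    cleaned = raw_response.split(split_token)[-1].strip()
else:
    cleaned = raw_response

print(cleaned)  # -> The speaker is mainly concerned about response latency.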