Myoussef11 committed
Commit b5c921b · 1 Parent(s): 83da7d5

Add project files for HF Space

Dockerfile CHANGED
@@ -7,9 +7,8 @@ WORKDIR /app
 # Install essential system-level dependencies
 # - ffmpeg: Required by the pydub library for audio processing
 # - git: Required by some pip packages for installation from version control
-RUN apt-get update && apt-get install -y \
+RUN apt-get update && apt-get install -y --no-install-recommends \
     ffmpeg \
-    git \
     && rm -rf /var/lib/apt/lists/*
 
 # Copy the dependency file first to leverage Docker's layer caching
@@ -21,16 +20,15 @@ RUN pip install --no-cache-dir -r requirements.txt
 # Copy the application source code into the container
 # force a re-installation of all packages
 COPY src/ src/
-RUN curl -fsSL https://ollama.com/install.sh | sh
+
+# Set environment variables to control where Hugging Face downloads models
+# inside the container, which can be useful on some platforms.
+ENV HF_HOME=/app/huggingface_cache
+ENV TRANSFORMERS_CACHE=/app/huggingface_cache
 
 # Expose the port Gradio will run on, making it accessible to the host
 EXPOSE 7860
 
-# Set the environment variable for the Ollama host when running inside Docker.
-ENV TRANSFORMERS_CACHE=/tmp/hf_cache
-ENV HF_HOME=/tmp/hf_home
-ENV XDG_CACHE_HOME=/tmp
-
-# Define the default command to run when the container starts
-# Uses Python's module flag '-m' for correct package path resolution
-CMD ollama serve & sleep 5 && python -m src.app
+# Define the default command to run the application.
+# The server_name="0.0.0.0" is crucial for it to be accessible inside Docker.
+CMD ["python", "-m", "src.app"]
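
For reference, EXPOSE 7860 and the new CMD assume that src/app.py binds Gradio to all interfaces on that port. src/app.py is not part of this commit, so the following is only a minimal sketch of the launch call the comment above refers to; the handler and interface are placeholders.

# Hypothetical sketch of the src/app.py entry point assumed by EXPOSE 7860 and the CMD above.
# The handler and interface below are illustrative, not code from this repository.
import gradio as gr

def analyze(audio_path: str) -> str:
    # Placeholder: the real app would transcribe and analyze the uploaded audio.
    return "analysis result"

demo = gr.Interface(fn=analyze, inputs=gr.Audio(type="filepath"), outputs="text")

if __name__ == "__main__":
    # server_name="0.0.0.0" exposes the server outside the container;
    # server_port must match the port published via EXPOSE.
    demo.launch(server_name="0.0.0.0", server_port=7860)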
 
 
 
 
 
docker-compose.yml DELETED
@@ -1,40 +0,0 @@
-version: '3.8'
-
-services:
-  # Service 1: Our Voice Analysis Application
-  app:
-    # Build the image using the Dockerfile in the current directory
-    build: .
-    # Expose the Gradio port
-    ports:
-      - "7860:7860"
-    # Set the environment variable for the Ollama host
-    environment:
-      - OLLAMA_HOST=ollama
-    # Make this service depend on the 'ollama' service
-    # This ensures that Ollama starts up before our application tries to connect to it
-    depends_on:
-      - ollama
-
-  # Service 2: The Ollama Server
-  ollama:
-    # Use the official Ollama Docker image
-    image: ollama/ollama
-    # Expose the Ollama API port so our 'app' service can reach it.
-    ports:
-      - "11434:11434"
-    # Set the environment variable to enable GPU support
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              count: all
-              capabilities: [gpu]
-    # Mount a volume to persist Ollama data
-    # This allows Ollama to retain its state and models across container restarts
-    volumes:
-      - ollama_data:/root/.ollama
-
-volumes:
-  ollama_data:
requirements.txt CHANGED
@@ -6,7 +6,5 @@ openai-whisper==20231117
 openai
 pydub==0.25.1
 python-dotenv==1.0.1
-requests==2.32.3
 pytest==8.2.2
-pytest-mock==3.14.0
-ollama==0.2.1
+pytest-mock==3.14.0
 
src/config.py CHANGED
@@ -14,12 +14,10 @@ APP_DESCRIPTION = (
 
 # Model Configuration
 MODEL_PROVIDER = "local"  # Set to 'local' or 'openai'
-OLLAMA_HOST = "localhost"  # Default to localhost if not set
-OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3")  # Default model for Ollama
 
 # Local model settings (if MODEL_PROVIDER is 'local')
 LOCAL_TRANSCRIPTION_MODEL = "openai/whisper-base.en"
-LOCAL_ANALYSIS_MODEL = "meta-llama/Llama-3-8B-Instruct"
+LOCAL_ANALYSIS_MODEL = "HuggingFaceH4/zephyr-7b-beta"
 
 # OpenAI API settings (if MODEL_PROVIDER is 'openai')
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
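
Since LOCAL_ANALYSIS_MODEL now points at a chat-tuned model, the analysis service below relies on the tokenizer's built-in chat template. A minimal sketch of what that template does to a user prompt, assuming the transformers library already imported by src/services/analysis_service.py; this is illustration only, not code from the commit.

# Sketch only: how a chat-tuned model's tokenizer formats a prompt.
# The model id comes from the config above; everything else is illustrative.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
messages = [{"role": "user", "content": "Summarize the transcript in two sentences."}]

# apply_chat_template wraps the message in the special role tokens the model was
# trained with; tokenize=False returns the formatted string rather than token ids.
formatted = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(formatted)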
src/services/analysis_service.py CHANGED
@@ -1,7 +1,8 @@
-import ollama
+import torch
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, Pipeline
 from openai import OpenAI, OpenAIError
 from src import config
-from src.utils.exceptions import AnalysisError, IrrelevantQuestionError
+from src.utils.exceptions import AnalysisError
 from src.logging_config import logger
 
 
@@ -10,36 +11,43 @@ class AnalysisService:
     A service class for performing text analysis tasks.
     It uses Ollama for local analysis and the OpenAI API for remote analysis.
     """
+    _local_pipeline: Pipeline = None
 
-    def _analyze_local(self, prompt: str) -> str:
+    @classmethod
+    def _get_local_pipeline(cls) -> Pipeline:
         """
-        Generates a response by calling the local Ollama server using the host
-        address defined in the application's configuration.
+        Initializes and returns the local text-generation pipeline.
+        This method caches the pipeline to ensure the model is loaded only once per session.
         """
+        if cls._local_pipeline is None:
+            try:
+                model_name = config.LOCAL_ANALYSIS_MODEL
+                logger.info(f"Initializing local analysis model: {model_name}")
+
+                device = "cuda" if torch.cuda.is_available() else "cpu"
+                logger.info(f"Using device: {device} for analysis.")
 
-        # Read the configured host from config.py
-        ollama_host = "localhost"
-
-        try:
-            # Initialize the Ollama client with the determined host
-            client = ollama.Client(host=f"http://{ollama_host}:11434")
-            logger.info(f"Sending analysis request to Ollama server at {ollama_host}.")
-
-            response = client.generate(
-                model=config.OLLAMA_MODEL,
-                prompt=prompt
-            )
-
-            logger.info("Ollama analysis successful.")
-            return response['response'].strip()
+                # For 7B models on CPU, loading in a lower precision can save RAM
+                torch_dtype = torch.bfloat16 if device == "cuda" else torch.float32
 
-        except ollama.ResponseError as e:
-            logger.error(f"Ollama API error: {e.error}", exc_info=True)
-            raise AnalysisError(f"An error occurred with the Ollama API: {e.error}")
-        except Exception as e:
-            # Catch other potential issues like connection problems
-            logger.error(f"Error during Ollama request to {ollama_host}: {e}", exc_info=True)
-            raise AnalysisError("An unexpected error occurred while communicating with the Ollama server.")
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    torch_dtype=torch_dtype,
+                    trust_remote_code=True
+                )
+                tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+                cls._local_pipeline = pipeline(
+                    "text-generation",
+                    model=model,
+                    tokenizer=tokenizer,
+                    device=device
+                )
+                logger.info("Local analysis model initialized successfully.")
+            except Exception as e:
+                logger.critical(f"Failed to load local analysis model: {e}", exc_info=True)
+                raise AnalysisError("Could not initialize the local analysis model.")
+        return cls._local_pipeline
 
     def _analyze_openai(self, prompt: str) -> str:
         """
@@ -70,16 +78,35 @@ class AnalysisService:
 
     def _analyze(self, prompt: str) -> str:
         """
-        Private dispatcher method to route analysis to the correct provider.
+        Generates a response using the local transformers pipeline.
         """
-        provider = config.MODEL_PROVIDER.lower()
-        if provider == 'local':
-            return self._analyze_local(prompt)
-        elif provider == 'openai':
-            return self._analyze_openai(prompt)
-        else:
-            logger.error(f"Invalid MODEL_PROVIDER configured: {config.MODEL_PROVIDER}")
-            raise ValueError(f"Invalid model provider '{config.MODEL_PROVIDER}' specified in config.")
+        try:
+            logger.info("Starting local analysis with transformers pipeline.")
+            pipe = self._get_local_pipeline()
+
+            # Zephyr and other modern chat models use a specific chat template format.
+            # The pipeline tokenizer can apply this for us.
+            messages = [{"role": "user", "content": prompt}]
+
+            # The pipeline handles the conversation templating internally.
+            outputs = pipe(
+                messages,
+                max_new_tokens=512,
+                do_sample=True,
+                temperature=0.7,
+                top_p=0.95,
+            )
+            # The response format from a text-generation pipeline is slightly different
+            response = outputs[0]["generated_text"]
+            if isinstance(response, list):
+                # For some models, the last message in the list is the response
+                response = response[-1]['content']
+
+            logger.info("Local analysis successful.")
+            return response.strip()
+        except Exception as e:
+            logger.error(f"Error during local analysis: {e}", exc_info=True)
+            raise AnalysisError("An unexpected error occurred during local analysis.")
 
     def summarize(self, text: str) -> str:
         """
@@ -146,13 +173,11 @@ class AnalysisService:
         {question}
         """
 
-        response = self._analyze(prompt)
-
-        # Check for our custom error signal from the LLM
-        if "ERROR: The answer to this question cannot be found" in response:
-            logger.warning(f"Model indicated question '{question}' is unanswerable from text.")
-            raise IrrelevantQuestionError(
-                "The question could not be answered based on the provided audio content."
-            )
-
-        return response
+        raw_response = self._analyze(prompt)
+        # The model might include the whole prompt in its response, so we clean it.
+        # We find the last instance of the user's question and take the text after it.
+        # This is a common post-processing step for text-generation pipelines.
+        split_token = f"**NEW USER QUESTION:**\n{question}"
+        if split_token in raw_response:
+            return raw_response.split(split_token)[-1].strip()
+        return raw_response
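
As a usage note on the new _analyze: when a transformers text-generation pipeline is called with a list of chat messages, recent versions return the whole conversation under outputs[0]["generated_text"], so the assistant reply is the last message; that is what the isinstance(response, list) branch handles, and the class-level _local_pipeline cache means the 7B model is loaded only once per process. A small self-contained sketch of that call pattern follows; the tiny instruct model here is only an assumed, cheap stand-in for config.LOCAL_ANALYSIS_MODEL and is not part of the commit.

# Standalone sketch of the chat-style pipeline call used in _analyze above.
# "HuggingFaceTB/SmolLM2-135M-Instruct" is a lightweight stand-in for
# config.LOCAL_ANALYSIS_MODEL; swap in the configured model for real use.
from transformers import pipeline

pipe = pipeline("text-generation", model="HuggingFaceTB/SmolLM2-135M-Instruct")

messages = [{"role": "user", "content": "Summarize: the meeting covered Q3 budget planning."}]
outputs = pipe(messages, max_new_tokens=64, do_sample=False)

generated = outputs[0]["generated_text"]
# With chat-style input the conversation is echoed back, so the final entry
# holds the assistant's reply; older versions may return a plain string instead.
reply = generated[-1]["content"] if isinstance(generated, list) else generated
print(reply.strip())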