yves.zango@orange.com committed
Commit b773910 · 1 Parent(s): c7df6b9

usage of pickle instead of joblib
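
At its core the commit swaps one deserialization call for another. A minimal before/after sketch of the model load, using only the paths that appear in the diff below (illustrative, not the full module):

    # Before: joblib-based load (removed in this commit)
    # import joblib
    # model_data = joblib.load("models/audio_model.joblib")

    # After: stdlib pickle, as in the new tasks/audio.py
    import pickle

    with open("models/audio_model.pkl", "rb") as f:
        model_data = pickle.load(f)
    model, scaler = model_data["model"], model_data["scaler"]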

Files changed (3):
  1. .DS_Store +0 -0
  2. models/audio_model.pkl +3 -0
  3. tasks/audio.py +85 -96
.DS_Store ADDED
Binary file (6.15 kB)
 
models/audio_model.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc95e0a3e06625d1a666ead9869dc4b9307fb0e3cef4316264ec476b26b7de38
+size 925490
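
The three lines above are a Git LFS pointer; the roughly 925 kB of model bytes live in LFS storage, not in the repository. For context, here is a hypothetical training-side sketch that would produce a bundle the new tasks/audio.py can consume. Only the {"model": ..., "scaler": ...} dict layout and the file path come from this commit; the estimator choice (a random forest, per the new DESCRIPTION), the feature width, and the fitting data are placeholder assumptions:

    # Hypothetical training-side counterpart (not part of this commit).
    # tasks/audio.py only requires a pickled dict with "model" and "scaler"
    # keys; everything else here is illustrative.
    import pickle
    from pathlib import Path

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.preprocessing import StandardScaler

    X_train = np.random.randn(100, 29)            # placeholder feature matrix
    y_train = np.random.randint(0, 2, size=100)   # placeholder binary labels

    scaler = StandardScaler().fit(X_train)
    model = RandomForestClassifier().fit(scaler.transform(X_train), y_train)

    out_path = Path("models") / "audio_model.pkl"
    out_path.parent.mkdir(exist_ok=True)
    with open(out_path, "wb") as f:
        pickle.dump({"model": model, "scaler": scaler}, f)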
tasks/audio.py CHANGED
@@ -1,52 +1,52 @@
 from fastapi import APIRouter, HTTPException
 from datetime import datetime
-from datasets import load_dataset, get_dataset_config_names
+from datasets import load_dataset
 from sklearn.metrics import accuracy_score
 import os
-import joblib
+import pickle
+from pathlib import Path
 import numpy as np
 import librosa
-from pathlib import Path
-
+from sklearn.preprocessing import StandardScaler
+from dotenv import load_dotenv
 from .utils.evaluation import AudioEvaluationRequest
 from .utils.emissions import tracker, clean_emissions_data, get_space_info
 
-# Router setup
+# Load environment variables
+load_dotenv()
+
+# Router configuration
 router = APIRouter()
 
-DESCRIPTION = "Chainsaw Detection Model"
+DESCRIPTION = "Random Forest with Feature Engineering"
 ROUTE = "/audio"
+MODEL_PATH = Path(__file__).parent.parent / "models" / "audio_model.pkl"
+
+SAMPLING_RATE = 12000
+N_MFCC = 13
 
-# Model loading
-MODEL_PATH = Path(__file__).parent.parent / "models" / "audio_model.joblib"
-try:
-    model_data = joblib.load(MODEL_PATH)
-    model = model_data["model"]
-    scaler = model_data["scaler"]
-except Exception as e:
-    raise RuntimeError(f"Failed to load model: {e}")
-
-def extract_features(audio_array, sr=12000):
-    """Extract audio features using Librosa"""
+def extract_features(audio_array):
+    """Feature engineering identical to the training phase."""
     try:
+        if not isinstance(audio_array, np.ndarray) or len(audio_array) == 0:
+            return None
+
         # Convert to mono if stereo
-        y = np.mean(audio_array, axis=1) if len(audio_array.shape) > 1 else audio_array
-
-        # Extract MFCCs
+        y = np.mean(audio_array, axis=1) if audio_array.ndim > 1 else audio_array
+
+        # Extract MFCCs and additional features
         mfccs = librosa.feature.mfcc(
             y=y,
-            sr=sr,
-            n_mfcc=13,
+            sr=SAMPLING_RATE,
+            n_mfcc=N_MFCC,
             n_fft=2048,
             hop_length=512
         )
-
-        # Extract additional features
         zcr = librosa.feature.zero_crossing_rate(y)
         rms = librosa.feature.rms(y=y)
-        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
-
-        # Calculate statistics
+        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=SAMPLING_RATE)
+
+        # Combine features into a single vector
         feature_vector = np.concatenate([
             np.mean(mfccs, axis=1),
             np.std(mfccs, axis=1),
@@ -54,93 +54,82 @@ def extract_features(audio_array, sr=12000):
             [np.mean(rms)],
             [np.mean(spectral_centroid)]
         ])
-
+
         return feature_vector
-
+
     except Exception as e:
-        raise HTTPException(status_code=400, detail=f"Feature extraction failed: {str(e)}")
+        raise ValueError(f"Feature extraction error: {str(e)}")
 
 @router.post(ROUTE, tags=["Audio Task"], description=DESCRIPTION)
 async def evaluate_audio(request: AudioEvaluationRequest):
+    """
+    Evaluate audio classification for rainforest sound detection using Random Forest.
+    """
     try:
-        # Get Space info
+        # Get space information (username and URL)
         username, space_url = get_space_info()
 
-        # Load dataset with proper error handling
-        try:
-            # Get available configs
-            configs = get_dataset_config_names(request.dataset_name)
-
-            # Set up dataset loading arguments
-            dataset_args = {
-                "path": request.dataset_name,
-                "token": os.getenv("HF_TOKEN"),
-                "trust_remote_code": True
-            }
-
-            # If configs exist, automatically use 'default' if it's the only one
-            if configs:
-                if len(configs) == 1 and configs[0] == 'default':
-                    dataset_args["name"] = "default"
-                else:
-                    raise HTTPException(
-                        status_code=400,
-                        detail=f"Config name is required for this dataset. Available configs: {configs}"
-                    )
-
-            dataset = load_dataset(**dataset_args)
-
-        except Exception as e:
-            raise HTTPException(
-                status_code=400,
-                detail=f"Failed to load dataset: {str(e)}"
-            )
-
-        # Split dataset
-        split = dataset["train"].train_test_split(
+        # Load dataset from Hugging Face
+        dataset = load_dataset(
+            request.dataset_name,
+            token=os.getenv("HF_TOKEN")
+        )
+
+        # Split dataset into train and test sets
+        train_test = dataset["train"].train_test_split(
             test_size=request.test_size,
             seed=request.test_seed
         )
-        test_data = split["test"]
+        test_dataset = train_test["test"]
 
-        # Track emissions
+        # Start emissions tracking for the inference phase
         tracker.start()
         tracker.start_task("inference")
+
+        # Prepare test data using the same feature engineering as in training
+        x_test = []
+        true_labels = []
 
-        # Process features
-        features = []
-        valid_samples = []
-        for sample in test_data:
-            try:
-                if 'audio' in sample and isinstance(sample['audio'], dict) and 'array' in sample['audio']:
-                    feature = extract_features(sample['audio']['array'])
-                    if feature is not None:
-                        features.append(feature)
-                        valid_samples.append(sample)
-            except Exception as e:
-                print(f"Skipping sample due to error: {e}")
-                continue
-
-        if not features:
-            raise HTTPException(
-                status_code=400,
-                detail="No valid features could be extracted from the audio samples"
-            )
-
-        # Scale features and make predictions
-        scaled_features = scaler.transform(features)
-        predictions = model.predict(scaled_features)
-        true_labels = [sample["label"] for sample in valid_samples]
-
-        # Calculate results
+        for sample in test_dataset:
+            features = extract_features(sample["audio"]["array"])
+            if features is not None:
+                x_test.append(features)
+                true_labels.append(sample["label"])
+
+        if len(x_test) == 0:
+            raise ValueError("No valid features could be extracted from the test dataset.")
+
+        x_test = np.array(x_test)
+
+        # Load the trained model and scaler from the pickle file
+        with open(MODEL_PATH, 'rb') as f:
+            model_data = pickle.load(f)
+
+        model = model_data['model']
+        scaler = model_data['scaler']
+
+        # Scale the test data using the scaler from the training phase
+        if scaler is not None:
+            x_test_scaled = scaler.transform(x_test)
+        else:
+            x_test_scaled = x_test
+
+        # Make predictions on the test set
+        predictions = model.predict(x_test_scaled)
+
+        # Stop emissions tracking and get data
         emissions_data = tracker.stop_task()
-
+
+        # Calculate accuracy score for evaluation
+        accuracy = accuracy_score(true_labels, predictions)
+
+        # Prepare and return results as JSON response
         return {
             "username": username,
             "space_url": space_url,
             "submission_timestamp": datetime.now().isoformat(),
             "model_description": DESCRIPTION,
-            "accuracy": float(accuracy_score(true_labels, predictions)),
+            "accuracy": float(accuracy),
            "energy_consumed_wh": emissions_data.energy_consumed * 1000,
            "emissions_gco2eq": emissions_data.emissions * 1000,
            "emissions_data": clean_emissions_data(emissions_data),
@@ -148,12 +137,12 @@ async def evaluate_audio(request: AudioEvaluationRequest):
             "dataset_config": {
                 "dataset_name": request.dataset_name,
                 "test_size": request.test_size,
-                "test_seed": request.test_seed
-            }
+                "test_seed": request.test_seed,
+            },
         }
 
     except Exception as e:
         raise HTTPException(
             status_code=500,
-            detail=f"An error occurred during audio evaluation: {str(e)}"
+            detail=f"An error occurred during evaluation: {str(e)}"
         )
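
Once the Space is running, the updated endpoint can be exercised as below. This is a hedged sketch: the base URL and dataset id are placeholders, and the payload carries the three AudioEvaluationRequest fields the handler reads (the request model may define others); the response keys come from the return dict above:

    # Hypothetical client call (not part of this commit).
    import requests

    payload = {
        "dataset_name": "your-org/your-audio-dataset",  # placeholder HF dataset id
        "test_size": 0.2,
        "test_seed": 42,
    }
    resp = requests.post("http://localhost:8000/audio", json=payload)
    resp.raise_for_status()
    result = resp.json()
    print(result["accuracy"], result["energy_consumed_wh"], result["emissions_gco2eq"])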