[email protected] committed on
Commit 4619a60 · 1 parent: b773910

corrected version with parallel processing

Files changed (4)
  1. Dockerfile +0 -1
  2. app.py +16 -3
  3. models/audio_model.pkl +2 -2
  4. tasks/audio.py +121 -56
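
The bulk of the change is in tasks/audio.py: feature extraction is now fanned out over a thread pool and the test set is processed in batches. Before reading the diff, here is a minimal, standalone sketch of that pattern; it is only an orientation aid, with `extract_one` as a toy stand-in for the real librosa-based feature extraction.

# Minimal sketch of the batch + thread-pool pattern introduced in tasks/audio.py.
# `extract_one` is a placeholder; the real code computes MFCC/ZCR/RMS/spectral stats.
from concurrent.futures import ThreadPoolExecutor
import multiprocessing

NUM_WORKERS = multiprocessing.cpu_count()
BATCH_SIZE = 32

def extract_one(sample):
    # Placeholder feature: mean of the raw values.
    return sum(sample) / len(sample)

def process_batch(batch):
    # Submit one extraction job per sample, then collect the results in order.
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        futures = [executor.submit(extract_one, sample) for sample, _ in batch]
        features = [f.result() for f in futures]
    labels = [label for _, label in batch]
    return features, labels

if __name__ == "__main__":
    data = [([0.1, 0.2, 0.3], 0), ([0.4, 0.5, 0.6], 1)]
    for start in range(0, len(data), BATCH_SIZE):
        feats, labels = process_batch(data[start:start + BATCH_SIZE])
        print(feats, labels)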
Dockerfile CHANGED
@@ -1,6 +1,5 @@
 # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
 # you will also find guides on how best to write your Dockerfile
-
 FROM python:3.9
 
 RUN useradd -m -u 1000 user
app.py CHANGED
@@ -1,6 +1,7 @@
 from fastapi import FastAPI
 from dotenv import load_dotenv
 from tasks import text, image, audio
+from pathlib import Path
 
 # Load environment variables
 load_dotenv()
@@ -11,10 +12,22 @@ app = FastAPI(
 )
 
 # Include all routers
-app.include_router(text.router)
-app.include_router(image.router)
 app.include_router(audio.router)
 
+@app.get("/health")
+async def health_check():
+    try:
+        model_exists = Path("/app/models/audio_model.pkl").exists()
+        return {
+            "status": "healthy",
+            "model_loaded": model_exists
+        }
+    except Exception as e:
+        return {
+            "status": "unhealthy",
+            "error": str(e)
+        }
+
 @app.get("/")
 async def root():
     return {
@@ -22,4 +35,4 @@ async def root():
         "endpoints": {
            "audio": "/audio - Audio classification task"
         }
-    }
+    }
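
For reference, the new /health endpoint can be probed once the container is running. A quick check with the standard library, assuming the server listens on http://localhost:7860 (the usual local port for Docker Spaces; adjust if yours differs):

# Probe the new /health endpoint; the base URL/port is an assumption.
import json
from urllib.request import urlopen

with urlopen("http://localhost:7860/health") as resp:
    payload = json.load(resp)

print(payload)  # e.g. {"status": "healthy", "model_loaded": true}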
models/audio_model.pkl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fc95e0a3e06625d1a666ead9869dc4b9307fb0e3cef4316264ec476b26b7de38
-size 925490
+oid sha256:39615d655f9a19dee949084eae2b12f4ba071ca867e19997b2794524a35f2947
+size 938130
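
This LFS pointer change means the pickled model was retrained. tasks/audio.py loads it as a dict with 'model' and 'scaler' keys; the training script is not part of this commit, so the following is only a sketch of how such a file could be produced, assuming a scikit-learn RandomForestClassifier and StandardScaler (inferred from the route description and the old sklearn import, not confirmed here).

# Hypothetical training-side snippet: writes a pickle with the 'model' and
# 'scaler' keys that tasks/audio.py expects. Classifier/scaler choices and the
# random stand-in data are assumptions, not taken from this commit.
import pickle
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

X = np.random.rand(100, 30)           # stand-in feature matrix
y = np.random.randint(0, 2, 100)      # stand-in labels

scaler = StandardScaler().fit(X)
model = RandomForestClassifier(n_estimators=100).fit(scaler.transform(X), y)

# The Space stores this file under models/audio_model.pkl.
with open("audio_model.pkl", "wb") as f:
    pickle.dump({"model": model, "scaler": scaler}, f)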
tasks/audio.py CHANGED
@@ -7,46 +7,110 @@ import pickle
 from pathlib import Path
 import numpy as np
 import librosa
-from sklearn.preprocessing import StandardScaler
-from dotenv import load_dotenv
+from concurrent.futures import ThreadPoolExecutor
+import multiprocessing
 from .utils.evaluation import AudioEvaluationRequest
 from .utils.emissions import tracker, clean_emissions_data, get_space_info
+from dotenv import load_dotenv
+import logging
 
-# Load environment variables
+# Configuration
 load_dotenv()
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
-# Router configuration
 router = APIRouter()
-
-DESCRIPTION = "Random Forest with Feature Engineering"
+DESCRIPTION = "Parallel Random Forest with Feature Engineering"
 ROUTE = "/audio"
-MODEL_PATH = Path(__file__).parent.parent / "models" / "audio_model.pkl"
-
+MODEL_PATH = Path("/app/models/audio_model.pkl")
 SAMPLING_RATE = 12000
 N_MFCC = 13
+NUM_WORKERS = multiprocessing.cpu_count()
+BATCH_SIZE = 32
 
-def extract_features(audio_array):
-    """Feature engineering identical to the training phase."""
+def process_batch_parallel(batch):
+    """Process a batch of test samples in parallel"""
+    features = []
+    labels = []
+
     try:
-        if not isinstance(audio_array, np.ndarray) or len(audio_array) == 0:
+        logger.info(f"Batch type: {type(batch)}")
+
+        with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
+            futures = []
+
+            # Iterate over the dataset
+            for i in range(len(batch)):
+                audio = batch[i]['audio']
+                label = batch[i]['label']
+
+                logger.info(f"Processing audio sample {i}")
+                logger.info(f"Audio type: {type(audio)}")
+                if hasattr(audio, 'array'):
+                    logger.info(f"Audio shape: {audio['array'].shape}")
+
+                future = executor.submit(extract_features_parallel, audio)
+                futures.append((future, label))
+
+            for idx, (future, label) in enumerate(futures):
+                try:
+                    feature = future.result()
+                    if feature is not None:
+                        logger.info(f"Successfully extracted features for sample {idx}")
+                        features.append(feature)
+                        labels.append(label)
+                    else:
+                        logger.warning(f"No features extracted for sample {idx}")
+                except Exception as e:
+                    logger.error(f"Feature extraction error for sample {idx}: {str(e)}")
+                    continue
+
+        logger.info(f"Successfully processed {len(features)} samples out of {len(batch)}")
+        return features, labels
+
+    except Exception as e:
+        logger.error(f"Batch processing error: {str(e)}")
+        return [], []
+
+
+def extract_features_parallel(audio_data):
+    """Optimized parallel feature extraction"""
+    try:
+        if isinstance(audio_data, dict):
+            if 'array' in audio_data:
+                audio_array = audio_data['array']
+            elif 'path' in audio_data:
+                # We were given a file path rather than an array
+                y, sr = librosa.load(audio_data['path'], sr=SAMPLING_RATE)
+                audio_array = y
+            else:
+                logger.error("No array or path in audio data")
+                return None
+        else:
+            audio_array = audio_data
+
+        if len(audio_array) == 0:
+            logger.error("Empty audio array")
             return None
 
-        # Convert to mono if stereo
+        # Convert to mono if stereo
         y = np.mean(audio_array, axis=1) if audio_array.ndim > 1 else audio_array
+
+        # Check minimum length, pad if the clip is too short
+        if len(y) < SAMPLING_RATE:
+            logger.warning("Audio too short, padding")
+            y = np.pad(y, (0, SAMPLING_RATE - len(y)))
+
+        # Extract the features in parallel
+        with ThreadPoolExecutor(max_workers=4) as executor:
+            futures = [
+                executor.submit(librosa.feature.mfcc, y=y, sr=SAMPLING_RATE, n_mfcc=N_MFCC),
+                executor.submit(librosa.feature.zero_crossing_rate, y),
+                executor.submit(librosa.feature.rms, y=y),
+                executor.submit(librosa.feature.spectral_centroid, y=y, sr=SAMPLING_RATE)
+            ]
+            mfccs, zcr, rms, spectral_centroid = [f.result() for f in futures]
 
-        # Extract MFCCs and additional features
-        mfccs = librosa.feature.mfcc(
-            y=y,
-            sr=SAMPLING_RATE,
-            n_mfcc=N_MFCC,
-            n_fft=2048,
-            hop_length=512
-        )
-        zcr = librosa.feature.zero_crossing_rate(y)
-        rms = librosa.feature.rms(y=y)
-        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=SAMPLING_RATE)
-
-        # Combine features into a single vector
         feature_vector = np.concatenate([
             np.mean(mfccs, axis=1),
            np.std(mfccs, axis=1),
@@ -58,72 +122,72 @@ def extract_features(audio_array):
         return feature_vector
 
     except Exception as e:
-        raise ValueError(f"Feature extraction error: {str(e)}")
+        logger.error(f"Feature extraction error: {str(e)}")
+        return None
+
 
 @router.post(ROUTE, tags=["Audio Task"], description=DESCRIPTION)
 async def evaluate_audio(request: AudioEvaluationRequest):
-    """
-    Evaluate audio classification for rainforest sound detection using Random Forest.
-    """
+    """Evaluate audio classification with parallel processing"""
     try:
-        # Get space information (username and URL)
+        logger.info("Starting audio evaluation...")
         username, space_url = get_space_info()
-
-        # Load dataset from Hugging Face
+
+        logger.info(f"Loading dataset: {request.dataset_name}")
         dataset = load_dataset(
             request.dataset_name,
             token=os.getenv("HF_TOKEN")
         )
 
-        # Split dataset into train and test sets
+        logger.info("Splitting dataset...")
         train_test = dataset["train"].train_test_split(
             test_size=request.test_size,
             seed=request.test_seed
         )
         test_dataset = train_test["test"]
+        logger.info(f"Test dataset size: {len(test_dataset)}")
 
-        # Start emissions tracking for inference phase
         tracker.start()
         tracker.start_task("inference")
-
-        # Prepare test data using the same feature engineering as in training
+
+        logger.info("Processing test data...")
         x_test = []
         true_labels = []
 
-        for sample in test_dataset:
-            features = extract_features(sample["audio"]["array"])
-            if features is not None:
-                x_test.append(features)
-                true_labels.append(sample["label"])
+        for i in range(0, len(test_dataset), BATCH_SIZE):
+            logger.info(f"Processing batch {i}/{len(test_dataset)}")
+            batch = test_dataset.select(range(i, min(i + BATCH_SIZE, len(test_dataset))))
+            try:
+                features, labels = process_batch_parallel(batch)
+                x_test.extend(features)
+                true_labels.extend(labels)
+            except Exception as e:
+                logger.error(f"Error processing batch: {str(e)}")
+                continue
 
         if len(x_test) == 0:
-            raise ValueError("No valid features could be extracted from the test dataset.")
+            raise ValueError("No valid features could be extracted")
 
-        x_test = np.array(x_test)
-
-        # Load the trained model and scaler from pickle file
+        logger.info("Loading model...")
+        if not MODEL_PATH.exists():
+            raise FileNotFoundError(f"Model not found at {MODEL_PATH}")
+
         with open(MODEL_PATH, 'rb') as f:
             model_data = pickle.load(f)
 
         model = model_data['model']
         scaler = model_data['scaler']
 
-        # Scale the test data using the scaler from training phase
-        if scaler is not None:
-            x_test_scaled = scaler.transform(x_test)
-        else:
-            x_test_scaled = x_test
-
-        # Make predictions on the test set
+        logger.info("Making predictions...")
+        x_test = np.array(x_test)
+        x_test_scaled = scaler.transform(x_test) if scaler is not None else x_test
         predictions = model.predict(x_test_scaled)
 
-        # Stop emissions tracking and get data
         emissions_data = tracker.stop_task()
-
-        # Calculate accuracy score for evaluation
         accuracy = accuracy_score(true_labels, predictions)
+
+        logger.info(f"Evaluation complete. Accuracy: {accuracy}")
 
-        # Prepare and return results as JSON response
         return {
             "username": username,
             "space_url": space_url,
@@ -142,6 +206,7 @@ async def evaluate_audio(request: AudioEvaluationRequest):
         }
 
     except Exception as e:
+        logger.error(f"Error in evaluate_audio: {str(e)}")
        raise HTTPException(
             status_code=500,
             detail=f"An error occurred during evaluation: {str(e)}"
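
Finally, a hedged example of exercising the updated /audio route. The request fields used below (dataset_name, test_size, test_seed) are the ones the handler reads; any other AudioEvaluationRequest fields, their defaults, the dataset name, and the base URL are assumptions.

# Hypothetical client call to the /audio endpoint. Field names come from the
# handler above; the dataset name and base URL are placeholders.
import json
from urllib.request import Request, urlopen

payload = {
    "dataset_name": "some-user/rainforest-audio",  # placeholder dataset
    "test_size": 0.2,
    "test_seed": 42,
}

req = Request(
    "http://localhost:7860/audio",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)

with urlopen(req) as resp:
    print(json.load(resp))  # username, accuracy, emissions data, etc.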