"""PitchDetector class for real-time pitch detection using FCPE.""" | |
import numpy as np | |
import torch | |
from torchfcpe import spawn_bundled_infer_model | |
from improvisation_lab.config import PitchDetectorConfig | |
class PitchDetector:
    """Class for real-time pitch detection using FCPE.

    Wraps the bundled FCPE inference model and exposes a single
    frame-oriented entry point, :meth:`detect_pitch`.
    """

    def __init__(self, config: PitchDetectorConfig):
        """Initialize pitch detector.

        Args:
            config: Configuration settings for pitch detection
                (sample rate, hop length, decoder mode, detection
                threshold, f0 search range, unvoiced interpolation,
                and inference device).
        """
        self.sample_rate = config.sample_rate
        self.hop_length = config.hop_length
        self.decoder_mode = config.decoder_mode
        self.threshold = config.threshold
        self.f0_min = config.f0_min
        self.f0_max = config.f0_max
        self.interp_uv = config.interp_uv
        # Load the bundled FCPE model once; it is reused for every frame.
        self.model = spawn_bundled_infer_model(device=config.device)

    def detect_pitch(self, audio_frame: np.ndarray) -> float:
        """Detect pitch from an audio frame.

        Args:
            audio_frame: 1-D numpy array of audio samples.

        Returns:
            Estimated fundamental frequency in Hz.
        """
        audio_length = len(audio_frame)
        # Number of f0 output frames requested for this many samples.
        f0_target_length = (audio_length // self.hop_length) + 1

        # Convert to torch tensor and reshape to match expected dimensions:
        # add batch and channel dimensions, (samples,) -> (1, samples, 1).
        audio_tensor = torch.from_numpy(audio_frame).float()
        audio_tensor = audio_tensor.unsqueeze(0).unsqueeze(-1)

        # Inference only: disable autograd so no computation graph is
        # built per frame (saves memory and time in the real-time loop;
        # the returned values are unchanged).
        with torch.no_grad():
            pitch = self.model.infer(
                audio_tensor,
                sr=self.sample_rate,
                decoder_mode=self.decoder_mode,
                threshold=self.threshold,
                f0_min=self.f0_min,
                f0_max=self.f0_max,
                interp_uv=self.interp_uv,
                output_interp_target_length=f0_target_length,
            )

        # Extract the middle frequency value from the pitch tensor.
        # Taking the middle value helps avoid potential inaccuracies at
        # the edges of the audio frame, providing a more stable estimate.
        middle_index = pitch.size(1) // 2
        frequency = pitch[0, middle_index, 0].item()
        return frequency