atsushieee's picture
Upload folder using huggingface_hub
c1e08a0 verified
raw
history blame
2.1 kB
"""PitchDetector class for real-time pitch detection using FCPE."""
import numpy as np
import torch
from torchfcpe import spawn_bundled_infer_model
from improvisation_lab.config import PitchDetectorConfig
class PitchDetector:
"""Class for real-time pitch detection using FCPE."""
def __init__(self, config: PitchDetectorConfig):
"""Initialize pitch detector.
Args:
config: Configuration settings for pitch detection.
"""
self.sample_rate = config.sample_rate
self.hop_length = config.hop_length
self.decoder_mode = config.decoder_mode
self.threshold = config.threshold
self.f0_min = config.f0_min
self.f0_max = config.f0_max
self.interp_uv = config.interp_uv
self.model = spawn_bundled_infer_model(device=config.device)
def detect_pitch(self, audio_frame: np.ndarray) -> float:
"""Detect pitch from audio frame.
Args:
audio_frame: Numpy array of audio samples
Returns:
Frequency in Hz
"""
audio_length = len(audio_frame)
f0_target_length = (audio_length // self.hop_length) + 1
# Convert to torch tensor and reshape to match expected dimensions
# Add batch and channel dimensions
audio_tensor = torch.from_numpy(audio_frame).float()
audio_tensor = audio_tensor.unsqueeze(0).unsqueeze(-1)
pitch = self.model.infer(
audio_tensor,
sr=self.sample_rate,
decoder_mode=self.decoder_mode,
threshold=self.threshold,
f0_min=self.f0_min,
f0_max=self.f0_max,
interp_uv=self.interp_uv,
output_interp_target_length=f0_target_length,
)
# Extract the middle frequency value from the pitch tensor
# Taking the middle value helps avoid potential inaccuracies at the edges
# of the audio frame, providing a more stable frequency estimate.
middle_index = pitch.size(1) // 2
frequency = pitch[0, middle_index, 0].item()
return frequency