File size: 2,104 Bytes
c1e08a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""PitchDetector class for real-time pitch detection using FCPE."""

import numpy as np
import torch
from torchfcpe import spawn_bundled_infer_model

from improvisation_lab.config import PitchDetectorConfig


class PitchDetector:
    """Class for real-time pitch detection using FCPE."""

    def __init__(self, config: PitchDetectorConfig):
        """Initialize pitch detector.

        Args:
            config: Configuration settings for pitch detection.
        """
        self.sample_rate = config.sample_rate
        self.hop_length = config.hop_length
        self.decoder_mode = config.decoder_mode
        self.threshold = config.threshold
        self.f0_min = config.f0_min
        self.f0_max = config.f0_max
        self.interp_uv = config.interp_uv
        self.model = spawn_bundled_infer_model(device=config.device)

    def detect_pitch(self, audio_frame: np.ndarray) -> float:
        """Detect pitch from audio frame.

        Args:
            audio_frame: Numpy array of audio samples

        Returns:
            Frequency in Hz
        """
        audio_length = len(audio_frame)
        f0_target_length = (audio_length // self.hop_length) + 1

        # Convert to torch tensor and reshape to match expected dimensions
        # Add batch and channel dimensions
        audio_tensor = torch.from_numpy(audio_frame).float()
        audio_tensor = audio_tensor.unsqueeze(0).unsqueeze(-1)

        pitch = self.model.infer(
            audio_tensor,
            sr=self.sample_rate,
            decoder_mode=self.decoder_mode,
            threshold=self.threshold,
            f0_min=self.f0_min,
            f0_max=self.f0_max,
            interp_uv=self.interp_uv,
            output_interp_target_length=f0_target_length,
        )

        # Extract the middle frequency value from the pitch tensor
        # Taking the middle value helps avoid potential inaccuracies at the edges
        # of the audio frame, providing a more stable frequency estimate.
        middle_index = pitch.size(1) // 2
        frequency = pitch[0, middle_index, 0].item()
        return frequency