cleanup2

- audiocraft/builders.py   +1 -32
- audiocraft/conditioners.py   +12 -226
- demo.py   +15 -0
audiocraft/builders.py
CHANGED
@@ -137,25 +137,9 @@ def get_conditioner_provider(output_dim: int, cfg: omegaconf.DictConfig) -> Cond
         model_args = cond_cfg[model_type]
         if model_type == 't5':
             conditioners[str(cond)] = T5Conditioner(output_dim=output_dim, device=device, **model_args)
-        elif model_type == 'lut':
-            conditioners[str(cond)] = LUTConditioner(output_dim=output_dim, **model_args)
-        # elif model_type == 'chroma_stem':
-        #     conditioners[str(cond)] = ChromaStemConditioner(
-        #         output_dim=output_dim,
-        #         duration=duration,
-        #         device=device,
-        #         **model_args
-        #     )
-        elif model_type == 'clap':
-            conditioners[str(cond)] = CLAPEmbeddingConditioner(
-                output_dim=output_dim,
-                device=device,
-                **model_args
-            )
         else:
             raise ValueError(f"Unrecognized conditioning model: {model_type}")
     conditioner = ConditioningProvider(conditioners, device=device, **condition_provider_args)
-    print(' COND\n',conditioner)
     return conditioner
 
 
@@ -229,22 +213,7 @@ def get_processor(cfg, sample_rate: int = 24000):
     return sample_processor
 
 
-def get_debug_lm_model(device='cpu'):
-    """Instantiate a debug LM to be used for unit tests."""
-    pattern = DelayedPatternProvider(n_q=4)
-    dim = 16
-    providers = {
-        'description': LUTConditioner(n_bins=128, dim=dim, output_dim=dim, tokenizer="whitespace"),
-    }
-    condition_provider = ConditioningProvider(providers)
-    fuser = ConditionFuser(
-        {'cross': ['description'], 'prepend': [],
-         'sum': [], 'input_interpolate': []})
-    lm = LMModel(
-        pattern, condition_provider, fuser,
-        n_q=4, card=400, dim=dim, num_heads=4, custom=True, num_layers=2,
-        cross_attention=True, causal=True)
-    return lm.to(device).eval()
+
 
 
 def get_wrapped_compression_model(
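Note: after this hunk, get_conditioner_provider only wires up T5 text conditioning; the 'lut' and 'clap' branches (and the commented-out chroma_stem block) are gone, along with the debug print of the provider. A minimal sketch of the surviving path, assuming the upstream audiocraft signatures for T5Conditioner and ConditioningProvider (the argument values below are illustrative, not taken from this commit):

import torch
from audiocraft.conditioners import T5Conditioner, ConditioningProvider

# Build the only kind of conditioner this branch still supports: a T5 text
# encoder keyed by condition name, wrapped in a ConditioningProvider.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
conditioners = {
    'description': T5Conditioner(name='t5-base', output_dim=1536,
                                 finetune=False, device=device),
}
provider = ConditioningProvider(conditioners, device=device)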
audiocraft/conditioners.py
CHANGED
@@ -1,9 +1,3 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
 from collections import defaultdict
 from copy import deepcopy
 from dataclasses import dataclass, field
@@ -16,7 +10,6 @@ import re
 import typing as tp
 import warnings
 import soundfile
-import einops
 from num2words import num2words
 import spacy
 from transformers import T5EncoderModel, T5Tokenizer  # type: ignore
@@ -42,12 +35,7 @@ TextCondition = tp.Optional[str]  # a text condition can be a string or None (if
 ConditionType = tp.Tuple[torch.Tensor, torch.Tensor]  # condition, mask
 
 
-class WavCondition(tp.NamedTuple):
-    wav: torch.Tensor
-    length: torch.Tensor
-    sample_rate: tp.List[int]
-    path: tp.List[tp.Optional[str]] = []
-    seek_time: tp.List[tp.Optional[float]] = []
+
 
 
 class JointEmbedCondition(tp.NamedTuple):
@@ -62,7 +50,7 @@ class JointEmbedCondition(tp.NamedTuple):
 @dataclass
 class ConditioningAttributes:
     text: tp.Dict[str, tp.Optional[str]] = field(default_factory=dict)
-    wav: tp.Dict[str, WavCondition] = field(default_factory=dict)
+    wav: tp.Dict[str, tp.Optional[str]] = field(default_factory=dict)
     joint_embed: tp.Dict[str, JointEmbedCondition] = field(default_factory=dict)
 
     def __getitem__(self, item):
@@ -107,67 +95,13 @@ class ConditioningAttributes:
 
 
 
-def nullify_condition(condition: ConditionType, dim: int = 1):
-    """Transform an input condition to a null condition.
-    The way it is done by converting it to a single zero vector similarly
-    to how it is done inside WhiteSpaceTokenizer and NoopTokenizer.
 
-    Args:
-        condition (ConditionType): A tuple of condition and mask (tuple[torch.Tensor, torch.Tensor])
-        dim (int): The dimension that will be truncated (should be the time dimension)
-            WARNING!: dim should not be the batch dimension!
-    Returns:
-        ConditionType: A tuple of null condition and mask
-    """
-    assert dim != 0, "dim cannot be the batch dimension!"
-    assert isinstance(condition, tuple) and \
-        isinstance(condition[0], torch.Tensor) and \
-        isinstance(condition[1], torch.Tensor), "'nullify_condition' got an unexpected input type!"
-    cond, mask = condition
-    B = cond.shape[0]
-    last_dim = cond.dim() - 1
-    out = cond.transpose(dim, last_dim)
-    out = 0. * out[..., :1]
-    out = out.transpose(dim, last_dim)
-    mask = torch.zeros((B, 1), device=out.device).int()
-    assert cond.dim() == out.dim()
-    return out, mask
-
-
-def nullify_wav(cond: WavCondition) -> WavCondition:
-    """Transform a WavCondition to a nullified WavCondition.
-    It replaces the wav by a null tensor, forces its length to 0, and replaces metadata by dummy attributes.
 
-    Args:
-        cond (WavCondition): Wav condition with wav, tensor of shape [B, T].
-    Returns:
-        WavCondition: Nullified wav condition.
-    """
-    null_wav, _ = nullify_condition((cond.wav, torch.zeros_like(cond.wav)), dim=cond.wav.dim() - 1)
-    return WavCondition(
-        wav=null_wav,
-        length=torch.tensor([0] * cond.wav.shape[0], device=cond.wav.device),
-        sample_rate=cond.sample_rate,
-        path=[None] * cond.wav.shape[0],
-        seek_time=[None] * cond.wav.shape[0],
-    )
 
 
-def nullify_joint_embed(embed: JointEmbedCondition) -> JointEmbedCondition:
-    """Nullify the joint embedding condition by replacing it by a null tensor, forcing its length to 0,
-    and replacing metadata by dummy attributes.
 
-
-
-    """
-    null_wav, _ = nullify_condition((embed.wav, torch.zeros_like(embed.wav)), dim=embed.wav.dim() - 1)
-    return JointEmbedCondition(
-        wav=null_wav, text=[None] * len(embed.text),
-        length=torch.LongTensor([0]).to(embed.wav.device),
-        sample_rate=embed.sample_rate,
-        path=[None] * embed.wav.shape[0],
-        seek_time=[0] * embed.wav.shape[0],
-    )
+
+
 
 
 class Tokenizer:
@@ -419,129 +353,7 @@ class T5Conditioner(TextConditioner):
 
 
 
-def dropout_condition(sample: ConditioningAttributes, condition_type: str, condition: str) -> ConditioningAttributes:
-    """Utility function for nullifying an attribute inside an ConditioningAttributes object.
-    If the condition is of type "wav", then nullify it using `nullify_condition` function.
-    If the condition is of any other type, set its value to None.
-    Works in-place.
-    """
-    if condition_type not in ['text', 'wav', 'joint_embed']:
-        raise ValueError(
-            "dropout_condition got an unexpected condition type!"
-            f" expected 'text', 'wav' or 'joint_embed' but got '{condition_type}'"
-        )
-
-    if condition not in getattr(sample, condition_type):
-        raise ValueError(
-            "dropout_condition received an unexpected condition!"
-            f" expected wav={sample.wav.keys()} and text={sample.text.keys()}"
-            f" but got '{condition}' of type '{condition_type}'!"
-        )
-
-    if condition_type == 'wav':
-        wav_cond = sample.wav[condition]
-        sample.wav[condition] = nullify_wav(wav_cond)
-    elif condition_type == 'joint_embed':
-        embed = sample.joint_embed[condition]
-        sample.joint_embed[condition] = nullify_joint_embed(embed)
-    else:
-        sample.text[condition] = None
-
-    return sample
-
-
-class DropoutModule(nn.Module):
-    """Base module for all dropout modules."""
-    def __init__(self, seed: int = 1234):
-        super().__init__()
-        self.rng = torch.Generator()
-        self.rng.manual_seed(seed)
-
-
-class AttributeDropout(DropoutModule):
-    """Dropout with a given probability per attribute.
-    This is different from the behavior of ClassifierFreeGuidanceDropout as this allows for attributes
-    to be dropped out separately. For example, "artist" can be dropped while "genre" remains.
-    This is in contrast to ClassifierFreeGuidanceDropout where if "artist" is dropped "genre"
-    must also be dropped.
-
-    Args:
-        p (tp.Dict[str, float]): A dict mapping between attributes and dropout probability. For example:
-            ...
-            "genre": 0.1,
-            "artist": 0.5,
-            "wav": 0.25,
-            ...
-        active_on_eval (bool, optional): Whether the dropout is active at eval. Default to False.
-        seed (int, optional): Random seed.
-    """
-    def __init__(self, p: tp.Dict[str, tp.Dict[str, float]], active_on_eval: bool = False, seed: int = 1234):
-        super().__init__(seed=seed)
-        self.active_on_eval = active_on_eval
-        # construct dict that return the values from p otherwise 0
-        self.p = {}
-        for condition_type, probs in p.items():
-            self.p[condition_type] = defaultdict(lambda: 0, probs)
-
-    def forward(self, samples: tp.List[ConditioningAttributes]) -> tp.List[ConditioningAttributes]:
-        """
-        Args:
-            samples (list[ConditioningAttributes]): List of conditions.
-        Returns:
-            list[ConditioningAttributes]: List of conditions after certain attributes were set to None.
-        """
-        if not self.training and not self.active_on_eval:
-            return samples
-
-        samples = deepcopy(samples)
-        for condition_type, ps in self.p.items():  # for condition types [text, wav]
-            for condition, p in ps.items():  # for attributes of each type (e.g., [artist, genre])
-                if torch.rand(1, generator=self.rng).item() < p:
-                    for sample in samples:
-                        dropout_condition(sample, condition_type, condition)
-        return samples
-
-    def __repr__(self):
-        return f"AttributeDropout({dict(self.p)})"
-
-
-class ClassifierFreeGuidanceDropout(DropoutModule):
-    """Classifier Free Guidance dropout.
-    All attributes are dropped with the same probability.
 
-    Args:
-        p (float): Probability to apply condition dropout during training.
-        seed (int): Random seed.
-    """
-    def __init__(self, p: float, seed: int = 1234):
-        super().__init__(seed=seed)
-        self.p = p
-
-    def forward(self, samples: tp.List[ConditioningAttributes]) -> tp.List[ConditioningAttributes]:
-        """
-        Args:
-            samples (list[ConditioningAttributes]): List of conditions.
-        Returns:
-            list[ConditioningAttributes]: List of conditions after all attributes were set to None.
-        """
-        if not self.training:
-            return samples
-
-        # decide on which attributes to drop in a batched fashion
-        drop = torch.rand(1, generator=self.rng).item() < self.p
-        if not drop:
-            return samples
-
-        # nullify conditions of all attributes
-        samples = deepcopy(samples)
-        for condition_type in ["wav", "text"]:
-            for sample in samples:
-                for condition in sample.attributes[condition_type]:
-                    dropout_condition(sample, condition_type, condition)
-        return samples
-
-    def __repr__(self):
-        return f"ClassifierFreeGuidanceDropout(p={self.p})"
 
 
 class ConditioningProvider(nn.Module):
@@ -696,43 +508,17 @@ class ConditionFuser(StreamingModule):
         """
         B, T, _ = input.shape
 
-        if 'offsets' in self._streaming_state:
-            first_step = False
-            offsets = self._streaming_state['offsets']
-        else:
-            first_step = True
-            offsets = torch.zeros(input.shape[0], dtype=torch.long, device=input.device)
 
-
-
-
+        first_step = True
+        offsets = torch.zeros(input.shape[0], dtype=torch.long, device=input.device)
+
+
         cross_attention_output = None
         for cond_type, (cond, cond_mask) in conditions.items():
-
-
-
-
-            cond = einops.rearrange(cond, "b t d -> b d t")
-            cond = F.interpolate(cond, size=input.shape[1])
-            input += einops.rearrange(cond, "b d t -> b t d")
-        elif op == 'prepend':
-            if first_step:
-                input = torch.cat([cond, input], dim=1)
-        elif op == 'cross':
-            if cross_attention_output is not None:
-                cross_attention_output = torch.cat([cross_attention_output, cond], dim=1)
-            else:
-                cross_attention_output = cond
-        else:
-            raise ValueError(f"unknown op ({op})")
-
-        if self.cross_attention_pos_emb and cross_attention_output is not None:
-            positions = torch.arange(
-                cross_attention_output.shape[1],
-                device=cross_attention_output.device
-            ).view(1, -1, 1)
-            pos_emb = create_sin_embedding(positions, cross_attention_output.shape[-1])
-            cross_attention_output = cross_attention_output + self.cross_attention_pos_emb_scale * pos_emb
+            # print(f'{self.cond2fuse=}') - self.cond2fuse={'description': 'cross'}
+
+            cross_attention_output = cond
+
 
         if self._is_streaming:
             self._streaming_state['offsets'] = offsets + T
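Note: with this hunk the ConditionFuser no longer distinguishes the 'sum', 'input_interpolate', 'prepend' and 'cross' ops (which also explains dropping the einops import), the streaming-offset bookkeeping is reduced to a fixed first step, and the nullify_*/dropout utilities above are deleted outright. Every remaining condition is routed straight to cross-attention, as the new cond2fuse debug comment suggests. A rough sketch of the fusion logic that is left, under the assumption that the surrounding method signature is otherwise unchanged:

import typing as tp
import torch

def fuse(input: torch.Tensor,
         conditions: tp.Dict[str, tp.Tuple[torch.Tensor, torch.Tensor]]):
    # conditions maps a name (e.g. 'description') to (embedding [B, T_c, D], mask).
    # Only the cross-attention path survives: the last condition tensor wins.
    cross_attention_output = None
    for _cond_type, (cond, _cond_mask) in conditions.items():
        cross_attention_output = cond
    return input, cross_attention_output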
demo.py
ADDED
@@ -0,0 +1,15 @@
+from audiocraft.audiogen import AudioGen #, audio_write
+import audiofile
+import numpy as np
+
+print('\n\n\n\n___________________')
+
+txt = 'australian music'
+
+sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
+sound_generator.set_generation_params(duration=1)  # why is generating so long at 14 seconds
+
+x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
+x /= np.abs(x).max() + 1e-7
+
+audiofile.write('_audio_.wav', x, 16000)
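Note: demo.py loads facebook/audiogen-medium, generates one second of audio for the prompt, peak-normalizes it (x /= np.abs(x).max() + 1e-7, so the waveform fits in [-1, 1] before writing), and saves a 16 kHz WAV. If the wrapper still exposes the model's sample rate as upstream audiocraft's AudioGen does (an assumption, not shown in this commit), the hard-coded 16000 on the last line could be replaced:

# assumes sound_generator.sample_rate exists, as on upstream audiocraft's AudioGen
audiofile.write('_audio_.wav', x, sound_generator.sample_rate)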