"""Signal layers.
This module includes Kapre layers that deal with audio signals (waveforms).
"""
import tensorflow as tf
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K
from . import backend
from .backend import _CH_FIRST_STR, _CH_LAST_STR, _CH_DEFAULT_STR
__all__ = ['Frame', 'Energy', 'MuLawEncoding', 'MuLawDecoding', 'LogmelToMFCC']
class Frame(Layer):
"""
Frame input audio signal. It is a wrapper of `tf.signal.frame`.
Args:
frame_length (int): length of a frame
hop_length (int): hop length, i.e., the number of samples between the starts of consecutive frames
pad_end (bool): whether to pad the end of the signal if there would be an otherwise-discarded partial frame
pad_value (int or float): value to use in the padding
data_format (str): `channels_first`, `channels_last`, or `default`
**kwargs: optional keyword args for `tf.keras.layers.Layer()`
Example:
::
input_shape = (2048, 1) # mono signal
model = Sequential()
model.add(kapre.Frame(frame_length=1024, hop_length=512, input_shape=input_shape))
# now the shape is (batch, n_frame=3, frame_length=1024, ch=1)
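A minimal eager-mode sketch (assuming `import numpy as np` and the default
`channels_last` data format; with `pad_end=False`, the trailing partial frame is discarded)::
x = np.random.uniform(-1, 1, (1, 2048, 1)).astype('float32')
frames = kapre.Frame(frame_length=1024, hop_length=512)(x)
print(frames.shape)  # (1, 3, 1024, 1): 1 + (2048 - 1024) // 512 == 3 frames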
"""
def __init__(
self, frame_length, hop_length, pad_end=False, pad_value=0, data_format='default', **kwargs
):
super(Frame, self).__init__(**kwargs)
backend.validate_data_format_str(data_format)
self.frame_length = frame_length
self.hop_length = hop_length
self.pad_end = pad_end
self.pad_value = pad_value
if data_format == _CH_DEFAULT_STR:
self.data_format = K.image_data_format()
else:
self.data_format = data_format
if data_format == _CH_FIRST_STR:
self.time_axis = 2 # batch, ch, time
else:
self.time_axis = 1 # batch, time, ch
def call(self, x):
"""
Args:
x (`Tensor`): batch audio signal in the channel format specified at initialization.
Returns:
(`Tensor`): A framed tensor. The shape is (batch, time (frames), frame_length, channel) if `channels_last`,
or (batch, channel, time (frames), frame_length) if `channels_first`.
"""
return tf.signal.frame(
x,
frame_length=self.frame_length,
frame_step=self.hop_length,
pad_end=self.pad_end,
pad_value=self.pad_value,
axis=self.time_axis,
)
def get_config(self):
config = super(Frame, self).get_config()
config.update(
{
'frame_length': self.frame_length,
'hop_length': self.hop_length,
'pad_end': self.pad_end,
'pad_value': self.pad_value,
'data_format': self.data_format,
}
)
return config
class Energy(Layer):
"""
Compute energy of each frame. The energy computed for each frame is then normalized so that the values
represent energy per `ref_duration`. I.e., if `frame_length` > `sample_rate * ref_duration`, the raw frame
energy is scaled down accordingly (the normalization coefficient is `ref_duration / (frame_length / sample_rate)`).
Args:
sample_rate (int): sample rate of the audio
ref_duration (float): reference duration for normalization
frame_length (int): length of a frame that is used in computing energy
hop_length (int): hop length (the number of samples between frame starts); this sets the time resolution of the energy computation
pad_end (bool): whether to pad the end of the signal if there would be an otherwise-discarded partial frame
pad_value (int or float): value to use in the padding
data_format (str): `channels_first`, `channels_last`, or `default`
**kwargs: optional keyword args for `tf.keras.layers.Layer()`
Example:
::
input_shape = (2048, 1) # mono signal
model = Sequential()
model.add(kapre.Energy(frame_length=1024, hop_length=512, input_shape=input_shape))
# now the shape is (batch, n_frame=3, ch=1)
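With the default arguments, each frame spans exactly `ref_duration` (2205 samples at 22050 Hz is 0.1 s),
so the normalization coefficient is 1.0. A minimal eager-mode sketch (assuming `import numpy as np`)::
x = np.random.uniform(-1, 1, (1, 22050, 1)).astype('float32')
energy = kapre.Energy()(x)  # all defaults
print(energy.shape)  # (1, 19, 1): 1 + (22050 - 2205) // 1102 == 19 frames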
"""
def __init__(
self,
sample_rate=22050,
ref_duration=0.1,
frame_length=2205,
hop_length=1102,
pad_end=False,
pad_value=0,
data_format='default',
**kwargs,
):
super(Energy, self).__init__(**kwargs)
backend.validate_data_format_str(data_format)
self.sample_rate = sample_rate
self.ref_duration = ref_duration
self.frame_length = frame_length
self.hop_length = hop_length
self.pad_end = pad_end
self.pad_value = pad_value
if data_format == _CH_DEFAULT_STR:
self.data_format = K.image_data_format()
else:
self.data_format = data_format
if data_format == _CH_FIRST_STR:
self.time_axis = 2 # batch, ch, time
else:
self.time_axis = 1 # batch, time, ch
def call(self, x):
"""
Args:
x (`Tensor`): batch audio signal in the channel format specified at initialization.
Returns:
(`Tensor`): The energy tensor. The shape is (batch, time (frames), channel) if `channels_last`, or
(batch, channel, time (frames)) if `channels_first`.
"""
frames = tf.signal.frame(
x,
frame_length=self.frame_length,
frame_step=self.hop_length,
pad_end=self.pad_end,
pad_value=self.pad_value,
axis=self.time_axis,
)
frames = tf.math.square(frames) # batch, ndim=4
frame_axis = 2 if self.data_format == _CH_LAST_STR else 3
energies = tf.math.reduce_sum(
frames, axis=frame_axis
) # batch, ndim=3. (b, t, ch) or (b, ch, t)
# normalize it to self.ref_duration
nor_coeff = self.ref_duration / (self.frame_length / self.sample_rate)
return nor_coeff * energies
def get_config(self):
config = super(Energy, self).get_config()
config.update(
{
'frame_length': self.frame_length,
'hop_length': self.hop_length,
'pad_end': self.pad_end,
'pad_value': self.pad_value,
'data_format': self.data_format,
}
)
return config
class MuLawEncoding(Layer):
"""
Mu-law encoding (compression) of audio signal, in [-1, 1], to [0, quantization_channels - 1].
See `Wikipedia <https://en.wikipedia.org/wiki/Μ-law_algorithm>`_ for more details.
Args:
quantization_channels (positive int): Number of channels. For 8-bit encoding, use 256.
**kwargs: optional keyword args for `tf.keras.layers.Layer()`
Note:
Mu-law encoding was originally developed to increase the signal-to-noise ratio of signals during transmission.
In deep learning, mu-law became popular by `WaveNet <https://arxiv.org/abs/1609.03499>`_ where
8-bit (256 channels) mu-law quantization was applied to the signal so that the generation of waveform amplitudes
became a single-label 256-class classification problem.
Example:
::
input_shape = (2048, 1) # mono signal (float in [-1, 1])
model = Sequential()
model.add(kapre.MuLawEncoding(quantization_channels=256, input_shape=input_shape))
# now the shape is (batch, time=2048, ch=1) with int in [0, quantization_channels - 1]
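A numpy sketch of the standard mu-law companding formula (the exact rounding in
`backend.mu_law_encoding` may differ slightly)::
x = np.linspace(-1, 1, 5, dtype='float32')
mu = 255.0  # quantization_channels - 1
y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)  # compand to [-1, 1]
q = np.round((y + 1) / 2 * mu).astype('int32')            # quantize to [0, 255]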
"""
def __init__(
self,
quantization_channels,
**kwargs,
):
super(MuLawEncoding, self).__init__(**kwargs)
self.quantization_channels = quantization_channels
def call(self, x):
"""
Args:
x (float `Tensor`): audio signal to encode. Shape doesn't matter.
Returns:
(int `Tensor`): mu-law encoded x. Shape doesn't change.
"""
return backend.mu_law_encoding(x, self.quantization_channels)
def get_config(self):
config = super(MuLawEncoding, self).get_config()
config.update(
{
'quantization_channels': self.quantization_channels,
}
)
return config
class MuLawDecoding(Layer):
"""
Mu-law decoding (expansion) of mu-law encoded audio signal to [-1, 1].
See `Wikipedia <https://en.wikipedia.org/wiki/Μ-law_algorithm>`_ for more details.
Args:
quantization_channels (positive int): Number of channels. For 8-bit encoding, use 256.
**kwargs: optional keyword args for `tf.keras.layers.Layer()`
Example:
::
input_shape = (2048, 1) # mono signal (int in [0, quantization_channels - 1])
model = Sequential()
model.add(kapre.MuLawDecoding(quantization_channels=256, input_shape=input_shape))
# now the shape is (batch, time=2048, ch=1) with float dtype in [-1, 1]
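A round-trip sanity-check sketch (assuming `import numpy as np`; the reconstruction
matches the input only up to quantization error)::
x = np.sin(np.linspace(0, 8 * np.pi, 2048)).reshape(1, 2048, 1).astype('float32')
enc = kapre.MuLawEncoding(quantization_channels=256)(x)
dec = kapre.MuLawDecoding(quantization_channels=256)(enc)
# np.max(np.abs(dec.numpy() - x)) is small but nonzero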
"""
def __init__(
self,
quantization_channels,
**kwargs,
):
super(MuLawDecoding, self).__init__(**kwargs)
self.quantization_channels = quantization_channels
def call(self, x):
"""
Args:
x (int `Tensor`): audio signal to decode. Shape doesn't matter.
Returns:
(float `Tensor`): mu-law decoded x. Shape doesn't change.
"""
return backend.mu_law_decoding(x, self.quantization_channels)
def get_config(self):
config = super(MuLawDecoding, self).get_config()
config.update(
{
'quantization_channels': self.quantization_channels,
}
)
return config
class LogmelToMFCC(Layer):
"""
Compute MFCC from log-melspectrogram.
It wraps `tf.signal.mfccs_from_log_mel_spectrograms()`, which performs DCT-II.
Note:
In librosa, which uses scipy, the DCT-II is orthonormal: the 0th coefficient is scaled by `sqrt(1/n)`
and the others by `sqrt(2/n)`, where `n` is the number of mel bins.
In Tensorflow though, because it follows HTK, every coefficient is scaled by `sqrt(2/n)`
(i.e., `0.5 * sqrt(2/n)` applied to the factor-2 DCT-II sum). This results in a `sqrt(2)` scale
difference in the 0th MFCC coefficient; e.g., with `n=128` mel bins, librosa scales it by
`sqrt(1/128)` while Tensorflow uses `sqrt(2/128)`.
As long as all of your data in training / inference / deployment is consistent (i.e., do not
mix librosa and kapre MFCCs), it'll be fine!
Args:
n_mfccs (int): Number of MFCC
data_format (str): `channels_first`, `channels_last`, or `default`
**kwargs: optional keyword args for `tf.keras.layers.Layer()`
Example:
::
input_shape = (40, 128, 1) # mono melspectrogram with 40 frames and n_mels=128
model = Sequential()
model.add(kapre.LogmelToMFCC(n_mfccs=20, input_shape=input_shape))
# now the shape is (batch, time=40, n_mfccs=20, ch=1)
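A minimal eager-mode sketch with a fake decibel-scale input (assuming `import numpy as np`)::
logmel = np.random.uniform(-80, 0, (1, 40, 128, 1)).astype('float32')
mfccs = kapre.LogmelToMFCC(n_mfccs=20)(logmel)
print(mfccs.shape)  # (1, 40, 20, 1)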
"""
def __init__(self, n_mfccs=20, data_format='default', **kwargs):
super(LogmelToMFCC, self).__init__(**kwargs)
backend.validate_data_format_str(data_format)
self.n_mfccs = n_mfccs
if data_format == _CH_DEFAULT_STR:
self.data_format = K.image_data_format()
else:
self.data_format = data_format
if self.data_format == _CH_LAST_STR:
self.permutation = (0, 1, 3, 2)
else:
self.permutation = None
[docs] def call(self, log_melgrams):
"""
Args:
log_melgrams (float `Tensor`): a batch of log-melspectrograms. `(b, time, mel, ch)` if `channels_last`
and `(b, ch, time, mel)` if `channels_first`.
Returns:
(float `Tensor`):
MFCCs. `(batch, time, n_mfccs, ch)` if `channels_last`, `(batch, ch, time, n_mfccs)` if `channels_first`.
"""
if self.permutation is not None:  # permute so that the last axis is mel
log_melgrams = K.permute_dimensions(log_melgrams, pattern=self.permutation)
mfccs = tf.signal.mfccs_from_log_mel_spectrograms(log_melgrams)
mfccs = mfccs[..., : self.n_mfccs]
if self.permutation is not None:
mfccs = K.permute_dimensions(mfccs, pattern=self.permutation)
return mfccs
def get_config(self):
config = super(LogmelToMFCC, self).get_config()
config.update({'n_mfccs': self.n_mfccs, 'data_format': self.data_format})
return config