Source code for kapre.backend

"""Backend operations of Kapre.

This module summarizes operations and functions that are used in Kapre layers.

Attributes:
    _CH_FIRST_STR (str): 'channels_first', a pre-defined string.
    _CH_LAST_STR (str): 'channels_last', a pre-defined string.
    _CH_DEFAULT_STR (str): 'default', a pre-defined string.

"""
from tensorflow.keras import backend as K
import tensorflow as tf
import numpy as np
import librosa

# Data-format strings used across this module; these three values are exactly the ones
# accepted by `validate_data_format_str`.
# NOTE(review): 'default' presumably defers to the Keras image data format -- confirm in the layers.
_CH_FIRST_STR = 'channels_first'
_CH_LAST_STR = 'channels_last'
_CH_DEFAULT_STR = 'default'


def get_window_fn(window_name=None):
    """Return a window function given its name.
    This function is used inside layers such as `STFT` to get a window function.

    Args:
        window_name (None or str): name of window function. On Tensorflow 2.3, there are five windows available in
            `tf.signal` (`hamming_window`, `hann_window`, `kaiser_bessel_derived_window`, `kaiser_window`,
            `vorbis_window`). If `None`, `tf.signal.hann_window` is returned.

    Returns:
        (callable): the function in `tf.signal` that has the requested name.

    Raises:
        NotImplementedError: if `window_name` is not one of the windows found in the installed `tf.signal`.

    """

    if window_name is None:
        return tf.signal.hann_window

    available_windows = {
        'hamming_window': tf.signal.hamming_window,
        'hann_window': tf.signal.hann_window,
    }
    # These windows are only registered when the installed TensorFlow provides them.
    for optional_name in ('kaiser_bessel_derived_window', 'kaiser_window', 'vorbis_window'):
        if hasattr(tf.signal, optional_name):
            available_windows[optional_name] = getattr(tf.signal, optional_name)

    if window_name not in available_windows:
        raise NotImplementedError(
            'Window name %s is not supported now. Currently, %d windows are '
            'supported - %s'
            % (
                window_name,
                len(available_windows),
                ', '.join(available_windows),
            )
        )

    return available_windows[window_name]


def validate_data_format_str(data_format):
    """Check that `data_format` is one of the data format strings defined in this module.

    Args:
        data_format (str): the value to check against `_CH_DEFAULT_STR`, `_CH_FIRST_STR` and `_CH_LAST_STR`.

    Raises:
        ValueError: if `data_format` is none of those three strings.
    """
    is_known_format = data_format in (_CH_DEFAULT_STR, _CH_FIRST_STR, _CH_LAST_STR)
    if is_known_format:
        return

    listed_formats = str([_CH_FIRST_STR, _CH_LAST_STR, _CH_DEFAULT_STR])
    expectation = 'data_format should be one of {}'.format(listed_formats)
    observation = ' but we received {}'.format(data_format)
    raise ValueError(expectation + observation)


def magnitude_to_decibel(x, ref_value=1.0, amin=1e-5, dynamic_range=80.0):
    """Convert a magnitude tensor to decibel scale.
    The core of the computation is `10 * log10(x)`, surrounded by clipping and referencing steps.

    Similar to `librosa.power_to_db` with `ref=1.0` and `top_db=dynamic_range`

    Args:
        x (`Tensor`): float tensor, e.g., the magnitude of an STFT. If it has more than one axis,
            the first axis is assumed to be the batch axis.
        ref_value (`float`): the input value that is mapped to 0 dB in the result.
            It is clipped from below by `amin` before being used.
        amin (`float`): the noise floor of the input. Inputs smaller than `amin` are replaced with `amin`.
            `None` is treated as `1e-5`.
        dynamic_range (`float`): range of the resulting values. E.g., if the maximum of the output is 30 dB,
            everything below (30 - dynamic_range) dB is clipped to (30 - dynamic_range) dB.

    Returns:
        log_spec (`Tensor`): a decibel-scaled version of `x`, with the same shape as `x`.

    Note:
        The maximum used for the dynamic range clipping is computed over every axis except the first one
        when `x` has more than one axis (i.e., per batch item), and over the whole tensor otherwise.

    Example:
        ::

            x = tf.constant([[1.0, 10.0, 100.0]])  # (batch=1, 3)
            magnitude_to_decibel(x)  # approximately [[0., 10., 20.]]

    """

    def _decibel(values):
        # log10 via natural log; the constant is created in the dtype of `values`
        return 10.0 * (tf.math.log(values) / tf.math.log(tf.constant(10, dtype=values.dtype)))

    n_dims = K.ndim(x)
    # axis 0 is assumed to be the batch axis, so it is excluded from the max-reduction
    max_axis = tuple(range(1, n_dims)) if n_dims > 1 else None

    noise_floor = tf.cast(1e-5 if amin is None else amin, dtype=x.dtype)

    # decibel value of the (clipped) input, relative to the (clipped) reference value
    spec_db = _decibel(tf.math.maximum(x, noise_floor)) - _decibel(
        tf.math.maximum(noise_floor, ref_value)
    )

    # limit the output to `dynamic_range` dB below its maximum
    peak_db = tf.math.reduce_max(spec_db, axis=max_axis, keepdims=True)
    return tf.math.maximum(spec_db, peak_db - dynamic_range)


def filterbank_mel(
    sample_rate, n_freq, n_mels=128, f_min=0.0, f_max=None, htk=False, norm='slaney'
):
    """Build a mel filterbank with `librosa.filters.mel` and return it, transposed, as a tensor.

    Args:
        sample_rate (`int`): sample rate of the input audio
        n_freq (`int`): number of frequency bins in the input STFT magnitude.
        n_mels (`int`): the number of mel bands
        f_min (`float`): lowest frequency that is going to be included in the mel filterbank (Hertz)
        f_max (`float`): highest frequency that is going to be included in the mel filterbank (Hertz)
        htk (bool): whether to use `htk` formula or not
        norm: passed to `librosa.filters.mel`. The default, 'slaney', would normalize the mel weights
            by the width of the mel band.

    Returns:
        (`Tensor`): mel filterbanks. Shape=`(n_freq, n_mels)`
    """
    # librosa is parameterized by the FFT size, which is derived here from the number of bins
    n_fft = 2 * (n_freq - 1)

    mel_weights = librosa.filters.mel(
        sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=f_min, fmax=f_max, htk=htk, norm=norm
    )
    mel_weights = mel_weights.astype(K.floatx())

    # transpose so that the frequency axis comes first
    return tf.convert_to_tensor(mel_weights.T)


def filterbank_log(sample_rate, n_freq, n_bins=84, bins_per_octave=12, f_min=None, spread=0.125):
    """A function that returns an approximation of constant-Q filter banks for a fixed-window STFT.
    Each filter is a log-normal window centered at the corresponding frequency.

    Args:
        sample_rate (`int`): audio sampling rate
        n_freq (`int`): number of the input frequency bins. E.g., `n_fft // 2 + 1`
        n_bins (`int`): number of the resulting log-frequency bins.  Defaults to 84 (7 octaves).
        bins_per_octave (`int`): number of bins per octave. Defaults to 12 (semitones).
        f_min (`float`): lowest frequency that is going to be included in the log filterbank. Defaults to `C1 ~= 32.70`
        spread (`float`): spread of each filter, as a fraction of a bin.

    Returns:
        (`Tensor`): log-frequency filterbanks. Shape=`(n_freq, n_bins)`

    Raises:
        RuntimeError: if the upper edge of the filterbank, `f_min * 2 ** (n_bins / bins_per_octave)`,
            is higher than the Nyquist frequency `sample_rate / 2`.

    Note:
        The code is originally from `logfrequency` in librosa 0.4 (deprecated) and copy-and-pasted.
        `tuning` parameter was removed and we use `n_freq` instead of `n_fft`.
    """

    if f_min is None:
        f_min = 32.70319566

    f_max = f_min * 2 ** (n_bins / bins_per_octave)
    # True division: the Nyquist frequency of an odd sample rate is not an integer.
    nyquist = sample_rate / 2
    if f_max > nyquist:
        raise RuntimeError(
            'Maximum frequency of log filterbank should be lower or equal to the maximum '
            'frequency of the input (defined by its sample rate), '
            'but f_max=%f and maximum frequency is %f. \n'
            'Fix it by reducing n_bins, increasing bins_per_octave and/or reducing f_min.\n'
            'You can also do it by increasing sample_rate but it means you need to upsample '
            'the input audio data, too.' % (f_max, nyquist)
        )

    # What's the shape parameter for our log-normal filters?
    sigma = float(spread) / bins_per_octave
    log_sigma = np.log2(sigma)

    # Construct the output matrix
    basis = np.zeros((n_bins, n_freq))

    # Get log frequencies of bins. The DC bin is dropped because log2(0) is undefined;
    # keyword arguments are used because newer librosa releases do not accept positional ones here.
    fft_freqs = librosa.fft_frequencies(sr=sample_rate, n_fft=(n_freq - 1) * 2)
    log_freqs = np.log2(fft_freqs[1:])

    for i in range(n_bins):
        # What's the center (median) frequency of this filter?
        c_freq = f_min * (2.0 ** (float(i) / bins_per_octave))

        # Place a log-normal window around c_freq (column 0, the DC bin, stays zero)
        basis[i, 1:] = np.exp(
            -0.5 * ((log_freqs - np.log2(c_freq)) / sigma) ** 2 - log_sigma - log_freqs
        )

    # Normalize the filters
    basis = librosa.util.normalize(basis, norm=1, axis=1)
    basis = basis.astype(K.floatx())

    return tf.convert_to_tensor(basis.T)


def mu_law_encoding(signal, quantization_channels):
    """Encode signal based on mu-law companding. Also called mu-law compressing.

    This algorithm assumes the signal has been scaled to between -1 and 1 and returns a signal encoded
    with values from 0 to quantization_channels - 1.
    See `Wikipedia <https://en.wikipedia.org/wiki/Μ-law_algorithm>`_ for more details.

    Args:
        signal (float `Tensor`): audio signal to encode
        quantization_channels (positive int): Number of channels. For 8-bit encoding, use 256.

    Returns:
        signal_mu (int `Tensor`): mu-encoded signal, with dtype `tf.int32`
    """
    mu = quantization_channels - 1.0
    magnitude = tf.math.abs(signal)

    # `tf.math.log1p` on a Python float always produces a float32 tensor, which cannot be
    # combined with a float64/float16 signal. Compute the normalizer in the signal's dtype instead.
    log1p_mu = tf.math.log1p(tf.cast(mu, magnitude.dtype))

    # compress to [-1, 1]
    signal_mu = tf.math.sign(signal) * tf.math.log1p(mu * magnitude) / log1p_mu

    # map [-1, 1] to [0, mu]; adding 0.5 before the integer cast rounds to the nearest level
    signal_mu = tf.cast(((signal_mu + 1) / 2.0 * mu + 0.5), tf.int32)
    return signal_mu


def mu_law_decoding(signal_mu, quantization_channels):
    """Decode mu-law encoded signals based on mu-law companding. Also called mu-law expanding.

    See `Wikipedia <https://en.wikipedia.org/wiki/Μ-law_algorithm>`_ for more details.

    Args:
        signal_mu (int `Tensor`): mu-encoded signal to decode
        quantization_channels (positive int): Number of channels. For 8-bit encoding, use 256.

    Returns:
        signal (float `Tensor`): decoded audio signal, in the Keras float type (`K.floatx()`)
    """
    mu = quantization_channels - 1.0
    signal_mu = K.cast_to_floatx(signal_mu)

    # `tf.math.log1p` on a Python float always produces a float32 tensor; compute it in the
    # Keras float type so that decoding also works when floatx is not float32.
    log1p_mu = tf.math.log1p(tf.cast(mu, K.floatx()))

    # map [0, mu] back to [-1, 1]
    signal = (signal_mu / mu) * 2 - 1.0

    # expand: inverse of the logarithmic compression
    signal = tf.math.sign(signal) * (tf.math.exp(tf.math.abs(signal) * log1p_mu) - 1.0) / mu
    return signal