Source code for kapre.signal

"""Signal layers.

This module includes Kapre layers that deal with audio signals (waveforms).
"""

import tensorflow as tf
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K

from . import backend
from .backend import _CH_FIRST_STR, _CH_LAST_STR, _CH_DEFAULT_STR

# Names of the layer classes exported by `from kapre.signal import *`.
__all__ = ['Frame', 'Energy', 'MuLawEncoding', 'MuLawDecoding', 'LogmelToMFCC']

class Frame(Layer):
    """
    Frame input audio signal. It is a wrapper of `tf.signal.frame`.

    Args:
        frame_length (int): length of a frame
        hop_length (int): hop length aka frame rate
        pad_end (bool): whether to pad at the end of the signal if there would be an
            otherwise-discarded partial frame
        pad_value (int or float): value to use in the padding
        data_format (str): `channels_first`, `channels_last`, or `default`
        **kwargs: optional keyword args for `tf.keras.layers.Layer()`

    Example:
        ::

            input_shape = (2048, 1)  # mono signal
            model = Sequential()
            model.add(kapre.Frame(frame_length=1024, hop_length=512, input_shape=input_shape))
            # now the shape is (batch, n_frame=3, frame_length=1024, ch=1)

    """

    def __init__(
        self, frame_length, hop_length, pad_end=False, pad_value=0, data_format='default', **kwargs
    ):
        super(Frame, self).__init__(**kwargs)

        backend.validate_data_format_str(data_format)

        self.frame_length = frame_length
        self.hop_length = hop_length
        self.pad_end = pad_end
        self.pad_value = pad_value

        if data_format == _CH_DEFAULT_STR:
            self.data_format = K.image_data_format()
        else:
            self.data_format = data_format

        # Decide the time axis from the *resolved* format. Checking the raw
        # argument would treat `default` as channels_last even when the Keras
        # backend default is channels_first, and would make a layer rebuilt
        # from `get_config()` (which stores the resolved format) behave
        # differently from the original one.
        if self.data_format == _CH_FIRST_STR:
            self.time_axis = 2  # batch, ch, time
        else:
            self.time_axis = 1  # batch, time, ch

    def call(self, x):
        """
        Args:
            x (`Tensor`): batch audio signal in the specified 1D format in initiation.

        Returns:
            (`Tensor`): A framed tensor. The shape is (batch, time (frames), frame_length, channel)
                if `channels_last`, or (batch, channel, time (frames), frame_length)
                if `channels_first`.
        """
        return tf.signal.frame(
            x,
            frame_length=self.frame_length,
            frame_step=self.hop_length,
            pad_end=self.pad_end,
            pad_value=self.pad_value,
            axis=self.time_axis,
        )

    def get_config(self):
        """Return the layer configuration, including every constructor argument."""
        config = super(Frame, self).get_config()
        config.update(
            {
                'frame_length': self.frame_length,
                'hop_length': self.hop_length,
                'pad_end': self.pad_end,
                'pad_value': self.pad_value,
                'data_format': self.data_format,
            }
        )

        return config
class Energy(Layer):
    """
    Compute energy of each frame. The energy computed for each frame then is normalized so that
    the values would represent energy per `ref_duration`.
    I.e., the summed energy of a frame is multiplied by
    `ref_duration / (frame_length / sample_rate)`; if `frame_length` > `sample_rate * ref_duration`,
    this coefficient is smaller than 1 and the frame energy is scaled down accordingly.

    Args:
        sample_rate (int): sample rate of the audio
        ref_duration (float): reference duration for normalization
        frame_length (int): length of a frame that is used in computing energy
        hop_length (int): hop length aka frame rate. time resolution of the energy computation.
        pad_end (bool): whether to pad at the end of the signal if there would be an
            otherwise-discarded partial frame
        pad_value (int or float): value to use in the padding
        data_format (str): `channels_first`, `channels_last`, or `default`
        **kwargs: optional keyword args for `tf.keras.layers.Layer()`

    Example:
        ::

            input_shape = (2048, 1)  # mono signal
            model = Sequential()
            model.add(kapre.Energy(frame_length=1024, hop_length=512, input_shape=input_shape))
            # now the shape is (batch, n_frame=3, ch=1)

    """

    def __init__(
        self,
        sample_rate=22050,
        ref_duration=0.1,
        frame_length=2205,
        hop_length=1102,
        pad_end=False,
        pad_value=0,
        data_format='default',
        **kwargs,
    ):
        super(Energy, self).__init__(**kwargs)

        backend.validate_data_format_str(data_format)

        self.sample_rate = sample_rate
        self.ref_duration = ref_duration
        self.frame_length = frame_length
        self.hop_length = hop_length
        self.pad_end = pad_end
        self.pad_value = pad_value

        if data_format == _CH_DEFAULT_STR:
            self.data_format = K.image_data_format()
        else:
            self.data_format = data_format

        # Use the *resolved* format: `default` may resolve to channels_first,
        # in which case the time axis is 2, not 1.
        if self.data_format == _CH_FIRST_STR:
            self.time_axis = 2  # batch, ch, time
        else:
            self.time_axis = 1  # batch, time, ch

    def call(self, x):
        """
        Args:
            x (`Tensor`): batch audio signal in the specified 1D format in initiation.

        Returns:
            (`Tensor`): Frame-wise energy. The shape is (batch, time (frames), channel)
                if `channels_last`, or (batch, channel, time (frames)) if `channels_first`.
        """
        frames = tf.signal.frame(
            x,
            frame_length=self.frame_length,
            frame_step=self.hop_length,
            pad_end=self.pad_end,
            pad_value=self.pad_value,
            axis=self.time_axis,
        )
        frames = tf.math.square(frames)  # batch, ndim=4

        # `tf.signal.frame` puts the samples of each frame on the axis right
        # after the framed (time) axis, so that is the axis to sum over.
        frame_axis = self.time_axis + 1
        energies = tf.math.reduce_sum(
            frames, axis=frame_axis
        )  # batch, ndim=3. (b, t, ch) or (b, ch, t)

        # normalize it to self.ref_duration
        nor_coeff = self.ref_duration / (self.frame_length / self.sample_rate)

        return nor_coeff * energies

    def get_config(self):
        """Return the layer configuration, including every constructor argument.

        `sample_rate` and `ref_duration` must be stored as well; otherwise a
        layer rebuilt from this config would fall back to their defaults and
        apply a different normalization coefficient.
        """
        config = super(Energy, self).get_config()
        config.update(
            {
                'sample_rate': self.sample_rate,
                'ref_duration': self.ref_duration,
                'frame_length': self.frame_length,
                'hop_length': self.hop_length,
                'pad_end': self.pad_end,
                'pad_value': self.pad_value,
                'data_format': self.data_format,
            }
        )

        return config
class MuLawEncoding(Layer):
    """
    Mu-law encoding (compression) of audio signal, in [-1, 1], to [0, quantization_channels - 1].
    See `Wikipedia <https://en.wikipedia.org/wiki/Μ-law_algorithm>`_ for more details.

    Args:
        quantization_channels (positive int): Number of channels. For 8-bit encoding, use 256.
        **kwargs: optional keyword args for `tf.keras.layers.Layer()`

    Note:
        Mu-law encoding was originally developed to increase signal-to-noise ratio of signal
        during transmission. In deep learning, mu-law became popular by
        `WaveNet <https://arxiv.org/abs/1609.03499>`_ where 8-bit (256 channels) mu-law
        quantization was applied to the signal so that the generation of waveform amplitudes
        became a single-label 256-class classification problem.

    Example:
        ::

            input_shape = (2048, 1)  # mono signal (float in [-1, 1])
            model = Sequential()
            model.add(kapre.MuLawEncoding(quantization_channels=256, input_shape=input_shape))
            # now the shape is (batch, time=2048, ch=1) with int in [0, quantization_channels - 1]

    """

    def __init__(self, quantization_channels, **kwargs):
        super().__init__(**kwargs)
        self.quantization_channels = quantization_channels

    def call(self, x):
        """
        Args:
            x (float `Tensor`): audio signal to encode. Shape doesn't matter.

        Returns:
            (int `Tensor`): mu-law encoded x. Shape doesn't change.
        """
        n_channels = self.quantization_channels
        return backend.mu_law_encoding(x, n_channels)

    def get_config(self):
        """Return the base layer config extended with `quantization_channels`."""
        base_config = super().get_config()
        base_config['quantization_channels'] = self.quantization_channels
        return base_config
class MuLawDecoding(Layer):
    """
    Mu-law decoding (expansion) of mu-law encoded audio signal to [-1, 1].
    See `Wikipedia <https://en.wikipedia.org/wiki/Μ-law_algorithm>`_ for more details.

    Args:
        quantization_channels (positive int): Number of channels. For 8-bit encoding, use 256.
        **kwargs: optional keyword args for `tf.keras.layers.Layer()`

    Example:
        ::

            input_shape = (2048, 1)  # mono signal (int in [0, quantization_channels - 1])
            model = Sequential()
            model.add(kapre.MuLawDecoding(quantization_channels=256, input_shape=input_shape))
            # now the shape is (batch, time=2048, ch=1) with float dtype in [-1, 1]

    """

    def __init__(self, quantization_channels, **kwargs):
        super().__init__(**kwargs)
        self.quantization_channels = quantization_channels

    def call(self, x):
        """
        Args:
            x (int `Tensor`): audio signal to decode. Shape doesn't matter.

        Returns:
            (float `Tensor`): mu-law decoded x. Shape doesn't change.
        """
        n_channels = self.quantization_channels
        return backend.mu_law_decoding(x, n_channels)

    def get_config(self):
        """Return the base layer config extended with `quantization_channels`."""
        base_config = super().get_config()
        base_config['quantization_channels'] = self.quantization_channels
        return base_config
class LogmelToMFCC(Layer):
    """
    Compute MFCC from log-melspectrogram.

    It wraps `tf.signal.mfccs_from_log_mel_spectrograms()`, which performs DCT-II.

    Note:
        In librosa, the DCT-II scales by `sqrt(1/n)` where `n` is the bin index of MFCC as it uses
        scipy. This is the correct orthogonal DCT.
        In Tensorflow though, because it follows HTK, it scales by `(0.5 * sqrt(2/n))`. This
        results in `sqrt(2)` scale difference in the first MFCC bins (`n=1`).

        As long as all of your data in training / inference / deployment is consistent
        (i.e., do not mix librosa and kapre MFCC), it'll be fine!

    Args:
        n_mfccs (int): Number of MFCC
        data_format (str): `channels_first`, `channels_last`, or `default`
        **kwargs: optional keyword args for `tf.keras.layers.Layer()`

    Example:
        ::

            input_shape = (40, 128, 1)  # mono melspectrogram with 40 frames and n_mels=128
            model = Sequential()
            model.add(kapre.LogmelToMFCC(n_mfccs=20, input_shape=input_shape))
            # now the shape is (batch, time=40, n_mfccs=20, ch=1)

    """

    def __init__(self, n_mfccs=20, data_format='default', **kwargs):
        super().__init__(**kwargs)

        backend.validate_data_format_str(data_format)

        self.n_mfccs = n_mfccs
        self.data_format = (
            K.image_data_format() if data_format == _CH_DEFAULT_STR else data_format
        )
        # For channels_last the mel axis must be swapped with the channel axis
        # so that mel becomes the innermost axis; channels_first needs no swap.
        self.permutation = (0, 1, 3, 2) if self.data_format == _CH_LAST_STR else None

    def call(self, log_melgrams):
        """
        Args:
            log_melgrams (float `Tensor`): a batch of log_melgrams. `(b, time, mel, ch)` if
                `channels_last` and `(b, ch, time, mel)` if `channels_first`.

        Returns:
            (float `Tensor`):
                MFCCs. `(batch, time, n_mfccs, ch)` if `channels_last`,
                `(batch, ch, time, n_mfccs)` if `channels_first`.
        """
        swap = self.permutation

        if swap is None:
            # mel is already the last axis; keep only the first n_mfccs coefficients
            return tf.signal.mfccs_from_log_mel_spectrograms(log_melgrams)[..., : self.n_mfccs]

        # move mel to the last axis, compute, truncate, then restore the layout
        mel_last = K.permute_dimensions(log_melgrams, pattern=swap)
        coeffs = tf.signal.mfccs_from_log_mel_spectrograms(mel_last)
        coeffs = coeffs[..., : self.n_mfccs]
        return K.permute_dimensions(coeffs, pattern=swap)

    def get_config(self):
        """Return the base layer config extended with `n_mfccs` and `data_format`."""
        base_config = super().get_config()
        base_config['n_mfccs'] = self.n_mfccs
        base_config['data_format'] = self.data_format
        return base_config