Examples¶
We provide fully functioning code snippets here. More detailed examples are under documentations of all the layers and functions.
How To Import¶
import kapre # to import the whole library
from kapre import ( # `time_frequency` layers can be directly imported from `kapre`
STFT,
InverseSTFT,
Magnitude,
Phase,
MagnitudeToDecibel,
ApplyFilterbank,
Delta,
ConcatenateFrequencyMap,
)
from kapre import ( # `signal` layers can be also directly imported from kapre
Frame,
Energy,
MuLawEncoding,
MuLawDecoding,
LogmelToMFCC,
)
# from kapre import backend # we can do this, but `backend` might be a too general name
import kapre.backend # for namespace sanity, you might prefer this
from kapre import backend as kapre_backend # or maybe this
from kapre.composed import ( # function names in `composed` are purposefully verbose.
get_stft_magnitude_layer,
get_melspectrogram_layer,
get_log_frequency_spectrogram_layer,
get_perfectly_reconstructing_stft_istft,
get_stft_mag_phase,
get_frequency_aware_conv2d,
)
Use STFT Magnitude¶
import tensorflow as tf
from tensorflow.keras.models import Sequential
from kapre import STFT, Magnitude, MagnitudeToDecibel
sampling_rate = 16000 # sampling rate of your input audio
duration = 20.0 # duration of the audio
num_channel = 2 # number of channels of the audio
input_shape = (int(sampling_rate * duration), num_channel) # let's follow `channels_last` convention
model = Sequential()
model.add(STFT(n_fft=2048, win_length=2018, hop_length=1024,
window_name='hann_window', pad_end=False,
input_data_format='channels_last', output_data_format='channels_last',
input_shape=input_shape)) # complex64
model.add(Magnitude()) # float32
model.add(MagnitudeToDecibel()) # float32 but in decibel scale
model.summary() # this would be an "audio frontend" of your model
"""
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
stft (STFT) (None, 311, 1025, 2) 0
_________________________________________________________________
magnitude (Magnitude) (None, 311, 1025, 2) 0
_________________________________________________________________
magnitude_to_decibel (Magnit (None, 311, 1025, 2) 0
=================================================================
Total params: 0
Trainable params: 0
Non-trainable params: 0
_________________________________________________________________
"""
# A 20-second stereo audio signal is converted to a (311, 1025, 2) tensor.
# Now, you can add your own model. For example, let's add ResNet50
# with global average pooling, no pre-trained weights,
# and for a 10-class classification.
model.add(
tf.keras.applications.ResNet50(
include_top=True, weights=None, input_tensor=None,
input_shape=(311, 1025, 2), pooling='avg', classes=10
)
)
model.summary()
"""
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
stft (STFT) (None, 311, 1025, 2) 0
_________________________________________________________________
magnitude (Magnitude) (None, 311, 1025, 2) 0
_________________________________________________________________
magnitude_to_decibel (Magnit (None, 311, 1025, 2) 0
_________________________________________________________________
resnet50 (Functional) (None, 10) 23605066
=================================================================
Total params: 23,605,066
Trainable params: 23,551,946
Non-trainable params: 53,120
_________________________________________________________________
"""
Use STFT Magnitude – a lazy version¶
from tensorflow.keras.models import Sequential
from kapre.composed import get_stft_magnitude_layer
sampling_rate = 16000 # sampling rate of your input audio
duration = 20.0 # duration of the audio
num_channel = 2 # number of channels of the audio
input_shape = (int(sampling_rate * duration), num_channel) # let's follow `channels_last` convention
model = Sequential(get_stft_magnitude_layer(input_shape=input_shape, return_decibel=True))
model.summary() # this lazy version provides an abstraction view of stft_magnitude
"""
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
stft_magnitude (Sequential) (None, 622, 1025, 2) 0
=================================================================
Total params: 0
Trainable params: 0
Non-trainable params: 0
_________________________________________________________________
"""
# Here, a 20-second stereo audio signal is converted to a (622, 1025, 2) tensor.
# x2 more temporal frames compared to the example above because we didn't set hop_length here,
# and that means it's set to a 25% hop length, not 50% as above.
model.layers[0].summary() # let's deep dive one level
"""
Model: "stft_magnitude"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
stft (STFT) (None, 622, 1025, 2) 0
_________________________________________________________________
magnitude (Magnitude) (None, 622, 1025, 2) 0
_________________________________________________________________
magnitude_to_decibel (Magnit (None, 622, 1025, 2) 0
=================================================================
Total params: 0
Trainable params: 0
Non-trainable params: 0
_________________________________________________________________
"""