In [0]:
import numpy as np

**WaveGlow**

In [0]:
# MAC (multiply-accumulate) count of WaveGlow.
# Each component is first computed as a raw MAC count; every printed figure is
# that count divided by `duration` and by 1e9.
L = 2048  # audio length
n_audio_channel_init = 8  # initial audio channel
C_mel = 80 * 8  # After upsampling and unfolding
kernal_size = 3  # kernel width entering the in_layer MAC count
C_wn = 256  # input channel size of in_layer
C_wn_middle = C_wn * 2  # output channel size of in_layer and cond_layer
n_flows = 12
n_layers = 8
n_early_output = 2  # channels dropped at each early output
n_early_output_interval = 4  # an early output is taken every this many flows
duration = 0.725  # presumably seconds of audio for this input -- TODO confirm

# Audio channel count seen by each flow: it starts at n_audio_channel_init and
# shrinks by n_early_output at every n_early_output_interval-th flow (never at
# flow 0).
n_audio_channels = []
n_audio = n_audio_channel_init
for i in range(n_flows):
  if i % n_early_output_interval == 0 and i > 0:
    n_audio -= n_early_output
  n_audio_channels.append(n_audio)  # audio channel after early output

# in_layers
WN_in_layers = L * kernal_size * C_wn * C_wn_middle * n_layers * n_flows
print('MACs of in_layers', WN_in_layers / duration / 1e9)
# cond layers
WN_cond_layers = L * C_mel * C_wn_middle * n_layers * n_flows
print('MACs of cond_layers', WN_cond_layers / duration / 1e9)
# res skip layers: (n_layers - 1) layers map C_wn -> C_wn_middle, the remaining
# one maps C_wn -> C_wn
WN_res_layers = (L * C_wn * C_wn_middle * (n_layers - 1) + L * C_wn * C_wn) * n_flows
print('MACs of res_skip_layers', WN_res_layers / duration / 1e9)
# invertible convs: one n x n channel mixing per flow
inv1x1 = np.sum([n**2 * L for n in n_audio_channels])
print('MACs of invertible conv layers', inv1x1 / duration / 1e9)
# start: half of the audio channels -> C_wn.
# Floor division keeps the count an integer on Python 3 as well; a plain `/`
# yields a float there and would turn `starts` and `WG_total` into floats.
starts = np.sum([n // 2 * C_wn * L for n in n_audio_channels])
print('MACs of start conv layers', starts / duration / 1e9)
# end: C_wn -> n audio channels
ends = np.sum([C_wn * n * L for n in n_audio_channels])
print('MACs of end conv layers', ends / duration / 1e9)
# total; WG_total is reused by the SqueezeWave cells below as the baseline
WG_total = WN_in_layers + WN_cond_layers + WN_res_layers + inv1x1 + starts + ends
print('Total number of MACs is', WG_total / duration / 1e9)

('MACs of in_layers', 106.63367079724138)
('MACs of cond_layers', 88.86139233103448)
('MACs of res_skip_layers', 33.32302212413793)
('MACs of invertible conv layers', 0.00131072)
('MACs of start conv layers', 0.02603361103448276)
('MACs of end conv layers', 0.05206722206896552)
('Total number of MACs is', 228.89749680551725)


**SqueezeWave L=64, C=128**

In [0]:
# MAC (multiply-accumulate) count of SqueezeWave with L=64, C_wn=128.
# Each component is first computed as a raw MAC count; every printed figure
# except the final ratio is that count divided by `duration` and by 1e9.
L = 64  # audio length
n_audio_channel_init = 256  # initial audio channel
L_mel = 64  # mel-spectrogram length
C_mel = 80  # mel-spectrogram channel
kernal_size = 3  # kernel width entering the depthwise in_layer MAC count
C_wn = 128  # input channel size of in_layer
C_wn_middle = C_wn * 2  # output channel size of in_layer and cond_layer
n_flows = 12
n_layers = 8
n_early_output = 16  # channels dropped at each early output
n_early_output_interval = 2  # an early output is taken every this many flows
duration = 0.725  # presumably seconds of audio for this input -- TODO confirm

# Audio channel count seen by each flow: it starts at n_audio_channel_init and
# shrinks by n_early_output at every n_early_output_interval-th flow (never at
# flow 0).
n_audio_channels = []
n_audio = n_audio_channel_init
for i in range(n_flows):
  if i % n_early_output_interval == 0 and i > 0:
    n_audio -= n_early_output
  n_audio_channels.append(n_audio)  # audio channel after early output

# in_layers
WN_in_layers = L * kernal_size * C_wn * n_layers * n_flows  # depthwise
WN_in_layers += L * C_wn * C_wn_middle * n_layers * n_flows  # pointwise
print('MACs of in_layers', WN_in_layers / duration / 1e9)
# cond_layers: counted over the mel length L_mel, not the audio length L
WN_cond_layers = L_mel * C_mel * C_wn_middle * n_layers * n_flows
print('MACs of cond_layers', WN_cond_layers / duration / 1e9)
# res_skip_layers: every layer maps C_wn -> C_wn
WN_res_layers = L * C_wn * C_wn * n_layers * n_flows
print('MACs of res_skip_layers', WN_res_layers / duration / 1e9)
# invertible convs: one n x n channel mixing per flow
inv1x1 = np.sum([n**2 * L for n in n_audio_channels])
print('MACs of invertible conv layers', inv1x1 / duration / 1e9)
# start: half of the audio channels -> C_wn.
# Floor division keeps the count an integer on Python 3 as well; a plain `/`
# yields a float there and would turn `starts` and `total` into floats.
starts = np.sum([n // 2 * C_wn * L for n in n_audio_channels])
print('MACs of start conv layers', starts / duration / 1e9)
# end: C_wn -> n audio channels
ends = np.sum([C_wn * n * L for n in n_audio_channels])
print('MACs of end conv layers', ends / duration / 1e9)
# total
total = WN_in_layers + WN_cond_layers + WN_res_layers + inv1x1 + starts + ends
print('Total number of MACs is', total / duration / 1e9)
# WG_total is the baseline produced by the WaveGlow cell; run that cell first.
print('Reduction compared with WaveGlow', WG_total / total)

('MACs of in_layers', 0.2809460524137931)
('MACs of cond_layers', 0.17355740689655172)
('MACs of res_skip_layers', 0.1388459255172414)
('MACs of invertible conv layers', 0.0502141351724138)
('MACs of start conv layers', 0.014643906206896554)
('MACs of end conv layers', 0.029287812413793107)
('Total number of MACs is', 0.6874952386206896)
('Reduction compared with WaveGlow', 332)


**SqueezeWave L=64, C=256**

In [0]:
# MAC (multiply-accumulate) count of SqueezeWave with L=64, C_wn=256.
# Each component is first computed as a raw MAC count; every printed figure
# except the final ratio is that count divided by `duration` and by 1e9.
L = 64  # audio length
n_audio_channel_init = 256  # initial audio channel
L_mel = 64  # mel-spectrogram length
C_mel = 80  # mel-spectrogram channel
kernal_size = 3  # kernel width entering the depthwise in_layer MAC count
C_wn = 256  # input channel size of in_layer
C_wn_middle = C_wn * 2  # output channel size of in_layer and cond_layer
n_flows = 12
n_layers = 8
n_early_output = 16  # channels dropped at each early output
n_early_output_interval = 2  # an early output is taken every this many flows
duration = 0.725  # presumably seconds of audio for this input -- TODO confirm

# Audio channel count seen by each flow: it starts at n_audio_channel_init and
# shrinks by n_early_output at every n_early_output_interval-th flow (never at
# flow 0).
n_audio_channels = []
n_audio = n_audio_channel_init
for i in range(n_flows):
  if i % n_early_output_interval == 0 and i > 0:
    n_audio -= n_early_output
  n_audio_channels.append(n_audio)  # audio channel after early output

# in_layers
WN_in_layers = L * kernal_size * C_wn * n_layers * n_flows  # depthwise
WN_in_layers += L * C_wn * C_wn_middle * n_layers * n_flows  # pointwise
print('MACs of in_layers', WN_in_layers / duration / 1e9)
# cond_layers: counted over the mel length L_mel, not the audio length L
WN_cond_layers = L_mel * C_mel * C_wn_middle * n_layers * n_flows
print('MACs of cond_layers', WN_cond_layers / duration / 1e9)
# res_skip_layers: every layer maps C_wn -> C_wn
WN_res_layers = L * C_wn * C_wn * n_layers * n_flows
print('MACs of res_skip_layers', WN_res_layers / duration / 1e9)
# invertible convs: one n x n channel mixing per flow
inv1x1 = np.sum([n**2 * L for n in n_audio_channels])
print('MACs of invertible conv layers', inv1x1 / duration / 1e9)
# start: half of the audio channels -> C_wn.
# Floor division keeps the count an integer on Python 3 as well; a plain `/`
# yields a float there and would turn `starts` and `total` into floats.
starts = np.sum([n // 2 * C_wn * L for n in n_audio_channels])
print('MACs of start conv layers', starts / duration / 1e9)
# end: C_wn -> n audio channels
ends = np.sum([C_wn * n * L for n in n_audio_channels])
print('MACs of end conv layers', ends / duration / 1e9)
# total
total = WN_in_layers + WN_cond_layers + WN_res_layers + inv1x1 + starts + ends
print('Total number of MACs is', total / duration / 1e9)
# WG_total is the baseline produced by the WaveGlow cell; run that cell first.
print('Reduction compared with WaveGlow', WG_total / total)

('MACs of in_layers', 1.1172758068965518)
('MACs of cond_layers', 0.34711481379310344)
('MACs of res_skip_layers', 0.5553837020689656)
('MACs of invertible conv layers', 0.0502141351724138)
('MACs of start conv layers', 0.029287812413793107)
('MACs of end conv layers', 0.058575624827586215)
('Total number of MACs is', 2.157851895172414)
('Reduction compared with WaveGlow', 106)


**SqueezeWave L=128, C=128**

In [0]:
# MAC (multiply-accumulate) count of SqueezeWave with L=128, C_wn=128.
# Each component is first computed as a raw MAC count; every printed figure
# except the final ratio is that count divided by `duration` and by 1e9.
L = 128  # audio length
n_audio_channel_init = 128  # initial audio channel
L_mel = 64  # mel-spectrogram length
C_mel = 80  # mel-spectrogram channel
kernal_size = 3  # kernel width entering the depthwise in_layer MAC count
C_wn = 128  # input channel size of in_layer
C_wn_middle = C_wn * 2  # output channel size of in_layer and cond_layer
n_flows = 12
n_layers = 8
n_early_output = 16  # channels dropped at each early output
n_early_output_interval = 2  # an early output is taken every this many flows
duration = 0.725  # presumably seconds of audio for this input -- TODO confirm

# Audio channel count seen by each flow: it starts at n_audio_channel_init and
# shrinks by n_early_output at every n_early_output_interval-th flow (never at
# flow 0).
n_audio_channels = []
n_audio = n_audio_channel_init
for i in range(n_flows):
  if i % n_early_output_interval == 0 and i > 0:
    n_audio -= n_early_output
  n_audio_channels.append(n_audio)  # audio channel after early output

# in_layers
WN_in_layers = L * kernal_size * C_wn * n_layers * n_flows  # depthwise
WN_in_layers += L * C_wn * C_wn_middle * n_layers * n_flows  # pointwise
print('MACs of in_layers', WN_in_layers / duration / 1e9)
# cond_layers: counted over the mel length L_mel, not the audio length L
WN_cond_layers = L_mel * C_mel * C_wn_middle * n_layers * n_flows
print('MACs of cond_layers', WN_cond_layers / duration / 1e9)
# res_skip_layers: every layer maps C_wn -> C_wn
WN_res_layers = L * C_wn * C_wn * n_layers * n_flows
print('MACs of res_skip_layers', WN_res_layers / duration / 1e9)
# invertible convs: one n x n channel mixing per flow
inv1x1 = np.sum([n**2 * L for n in n_audio_channels])
print('MACs of invertible conv layers', inv1x1 / duration / 1e9)
# start: half of the audio channels -> C_wn.
# Floor division keeps the count an integer on Python 3 as well; a plain `/`
# yields a float there and would turn `starts` and `total` into floats.
starts = np.sum([n // 2 * C_wn * L for n in n_audio_channels])
print('MACs of start conv layers', starts / duration / 1e9)
# end: C_wn -> n audio channels
ends = np.sum([C_wn * n * L for n in n_audio_channels])
print('MACs of end conv layers', ends / duration / 1e9)
# total
total = WN_in_layers + WN_cond_layers + WN_res_layers + inv1x1 + starts + ends
print('Total number of MACs is', total / duration / 1e9)
# WG_total is the baseline produced by the WaveGlow cell; run that cell first.
print('Reduction compared with WaveGlow', WG_total / total)

('MACs of in_layers', 0.5618921048275862)
('MACs of cond_layers', 0.17355740689655172)
('MACs of res_skip_layers', 0.2776918510344828)
('MACs of invertible conv layers', 0.017988502068965517)
('MACs of start conv layers', 0.011932071724137933)
('MACs of end conv layers', 0.023864143448275865)
('Total number of MACs is', 1.06692608)
('Reduction compared with WaveGlow', 214)


**SqueezeWave L=128, C=256**

In [0]:
# MAC (multiply-accumulate) count of SqueezeWave with L=128, C_wn=256.
# Each component is first computed as a raw MAC count; every printed figure
# except the final ratio is that count divided by `duration` and by 1e9.
L = 128  # audio length
n_audio_channel_init = 128  # initial audio channel
L_mel = 64  # mel-spectrogram length
C_mel = 80  # mel-spectrogram channel
kernal_size = 3  # kernel width entering the depthwise in_layer MAC count
C_wn = 256  # input channel size of in_layer
C_wn_middle = C_wn * 2  # output channel size of in_layer and cond_layer
n_flows = 12
n_layers = 8
n_early_output = 16  # channels dropped at each early output
n_early_output_interval = 2  # an early output is taken every this many flows
duration = 0.725  # presumably seconds of audio for this input -- TODO confirm

# Audio channel count seen by each flow: it starts at n_audio_channel_init and
# shrinks by n_early_output at every n_early_output_interval-th flow (never at
# flow 0).
n_audio_channels = []
n_audio = n_audio_channel_init
for i in range(n_flows):
  if i % n_early_output_interval == 0 and i > 0:
    n_audio -= n_early_output
  n_audio_channels.append(n_audio)  # audio channel after early output

# in_layers
WN_in_layers = L * kernal_size * C_wn * n_layers * n_flows  # depthwise
WN_in_layers += L * C_wn * C_wn_middle * n_layers * n_flows  # pointwise
print('MACs of in_layers', WN_in_layers / duration / 1e9)
# cond_layers: counted over the mel length L_mel, not the audio length L
WN_cond_layers = L_mel * C_mel * C_wn_middle * n_layers * n_flows
print('MACs of cond_layers', WN_cond_layers / duration / 1e9)
# res_skip_layers: every layer maps C_wn -> C_wn
WN_res_layers = L * C_wn * C_wn * n_layers * n_flows
print('MACs of res_skip_layers', WN_res_layers / duration / 1e9)
# invertible convs: one n x n channel mixing per flow
inv1x1 = np.sum([n**2 * L for n in n_audio_channels])
print('MACs of invertible conv layers', inv1x1 / duration / 1e9)
# start: half of the audio channels -> C_wn.
# Floor division keeps the count an integer on Python 3 as well; a plain `/`
# yields a float there and would turn `starts` and `total` into floats.
starts = np.sum([n // 2 * C_wn * L for n in n_audio_channels])
print('MACs of start conv layers', starts / duration / 1e9)
# end: C_wn -> n audio channels
ends = np.sum([C_wn * n * L for n in n_audio_channels])
print('MACs of end conv layers', ends / duration / 1e9)
# total
total = WN_in_layers + WN_cond_layers + WN_res_layers + inv1x1 + starts + ends
print('Total number of MACs is', total / duration / 1e9)
# WG_total is the baseline produced by the WaveGlow cell; run that cell first.
print('Reduction compared with WaveGlow', WG_total / total)

('MACs of in_layers', 2.2345516137931036)
('MACs of cond_layers', 0.34711481379310344)
('MACs of res_skip_layers', 1.1107674041379312)
('MACs of invertible conv layers', 0.017988502068965517)
('MACs of start conv layers', 0.023864143448275865)
('MACs of end conv layers', 0.04772828689655173)
('Total number of MACs is', 3.7820147641379314)
('Reduction compared with WaveGlow', 60)
