# denoiser_test.py -- run the trained CNN denoiser on a clean-speech/noise file pair.

# Imports
import librosa
import pandas as pd
import os
import datetime
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import IPython.display as ipd
import librosa.display
import scipy.signal
import soundfile as sf   # librosa.output.write_wav was removed in librosa 0.8; soundfile handles WAV output
import glob
import math
import warnings
import pickle
from sklearn.utils import shuffle
import sys
from tensorflow import keras

# Load the trained denoiser model (expects denoiser_CNN.h5 in the working directory).
model = keras.models.load_model('denoiser_CNN.h5')
# STFT parameters
windowLength = 256
overlap = round(0.25 * windowLength)   # hop length = 25% of the window, i.e. 75% overlap between adjacent frames
ffTLength = windowLength
inputFs = 48e3                         # not used in this script
fs = 16e3                              # sampling rate used for processing
numFeatures = ffTLength // 2 + 1       # frequency bins per STFT frame (129)
numSegments = 8                        # consecutive frames fed to the model per prediction
class FeatureExtractor:
    """Thin wrapper around librosa STFT / mel-spectrogram conversions for one audio signal."""

    def __init__(self, audio, *, windowLength, overlap, sample_rate):
        self.audio = audio
        self.ffT_length = windowLength
        self.window_length = windowLength
        self.overlap = overlap           # used as the STFT hop length
        self.sample_rate = sample_rate
        # Periodic Hamming window (scipy.signal.windows.hamming; the old scipy.signal.hamming alias was removed).
        self.window = scipy.signal.windows.hamming(self.window_length, sym=False)

    def get_stft_spectrogram(self):
        return librosa.stft(self.audio, n_fft=self.ffT_length, win_length=self.window_length,
                            hop_length=self.overlap, window=self.window, center=True)

    def get_audio_from_stft_spectrogram(self, stft_features):
        return librosa.istft(stft_features, win_length=self.window_length, hop_length=self.overlap,
                             window=self.window, center=True)

    def get_mel_spectrogram(self):
        return librosa.feature.melspectrogram(y=self.audio, sr=self.sample_rate, power=2.0, pad_mode='reflect',
                                              n_fft=self.ffT_length, hop_length=self.overlap, center=True)

    def get_audio_from_mel_spectrogram(self, M):
        return librosa.feature.inverse.mel_to_audio(M, sr=self.sample_rate, n_fft=self.ffT_length,
                                                    hop_length=self.overlap, win_length=self.window_length,
                                                    window=self.window, center=True, pad_mode='reflect',
                                                    power=2.0, n_iter=32, length=None)
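# Shape sanity check (illustrative only, not executed): with windowLength = 256 and hop = overlap = 64,
# the STFT of a 1-second, 16 kHz clip has numFeatures = 256 // 2 + 1 = 129 frequency bins and roughly
# 16000 // 64 + 1 = 251 frames, so get_stft_spectrogram() returns a complex array of shape (129, ~251).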
###################### utility functions #############################
def read_audio(filepath, sample_rate, normalize=True):
    """Read an audio file and return it as a numpy array."""
    audio, sr = librosa.load(filepath, sr=sample_rate)
    if normalize:
        # Scale the waveform so its peak amplitude is 1/3 of full scale.
        div_fac = 1 / np.max(np.abs(audio)) / 3.0
        audio = audio * div_fac
    return audio, sr
def add_noise_to_clean_audio(clean_audio, noise_signal):
    """Mix a randomly chosen segment of noise_signal into clean_audio."""
    if len(clean_audio) >= len(noise_signal):
        # The noise recording is shorter than the clean audio: tile it until it is long enough.
        while len(clean_audio) >= len(noise_signal):
            noise_signal = np.append(noise_signal, noise_signal)
    # Extract a noise segment from a random location in the noise file.
    ind = np.random.randint(0, noise_signal.size - clean_audio.size)
    noiseSegment = noise_signal[ind: ind + clean_audio.size]
    speech_power = np.sum(clean_audio ** 2)
    noise_power = np.sum(noiseSegment ** 2)
    noisyAudio = clean_audio + np.sqrt(speech_power / noise_power) * noiseSegment
    return noisyAudio
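# Note: scaling the noise segment by sqrt(speech_power / noise_power) equalises the energy of speech
# and noise in the mix, i.e. the synthetic noisy signal has roughly 0 dB SNR.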
def play(audio, sample_rate):
    """Render an inline audio player (only useful inside a Jupyter/IPython session)."""
    ipd.display(ipd.Audio(data=audio, rate=sample_rate))
def prepare_input_features(stft_features):
    """Slice the magnitude spectrogram into overlapping blocks of numSegments consecutive frames."""
    # Pad on the left by repeating the first (numSegments - 1) frames so every original frame gets a full block.
    noisySTFT = np.concatenate([stft_features[:, 0:numSegments - 1], stft_features], axis=1)
    stftSegments = np.zeros((numFeatures, numSegments, noisySTFT.shape[1] - numSegments + 1))
    for index in range(noisySTFT.shape[1] - numSegments + 1):
        stftSegments[:, :, index] = noisySTFT[:, index:index + numSegments]
    return stftSegments
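# Illustrative shape walkthrough (for a spectrogram with T frames): the padded spectrogram has
# T + numSegments - 1 columns, so the loop produces exactly T blocks and the returned array has shape
# (numFeatures, numSegments, T) = (129, 8, T) -- one 8-frame context block per original frame.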
def revert_features_to_audio(noiseAudioFeatureExtractor, features, phase, cleanMean=None, cleanStd=None):
    """Undo normalisation, re-attach the noisy phase, and invert the STFT back to a waveform."""
    if cleanMean is not None and cleanStd is not None:
        # Undo the (x - mean) / std standardisation applied before prediction.
        features = cleanStd * features + cleanMean
    phase = np.transpose(phase, (1, 0))
    features = np.squeeze(features)
    # Combine the predicted magnitudes with the phase of the noisy input.
    features = features * np.exp(1j * phase)
    features = np.transpose(features, (1, 0))
    return noiseAudioFeatureExtractor.get_audio_from_stft_spectrogram(features)
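# Note: only the magnitude is predicted by the network; the phase of the noisy STFT is reused for
# reconstruction, the usual shortcut in magnitude-domain speech enhancement.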
def denoiseAudio(clean_audio, noise_audio):
    """Mix noise into a clean recording, run the CNN denoiser on the mix, and write both files to disk."""
    cleanAudio, sr = read_audio(clean_audio, sample_rate=fs)
    noiseAudio, sr = read_audio(noise_audio, sample_rate=fs)
    cleanAudioFeatureExtractor = FeatureExtractor(cleanAudio, windowLength=windowLength, overlap=overlap, sample_rate=sr)
    stft_features = cleanAudioFeatureExtractor.get_stft_spectrogram()
    stft_features = np.abs(stft_features)   # clean-speech magnitudes (not used further below)

    # Build the synthetic noisy input and save it next to the clean file.
    noisyAudio = add_noise_to_clean_audio(cleanAudio, noiseAudio)
    # librosa.output.write_wav was removed in librosa 0.8; soundfile.write is the drop-in replacement.
    sf.write(clean_audio + "_noisy.wav", noisyAudio, 16000)
    # ipd.Audio(data=noisyAudio, rate=fs)  # listen to the noisy mix inside a notebook

    # Magnitude/phase features of the noisy signal.
    noiseAudioFeatureExtractor = FeatureExtractor(noisyAudio, windowLength=windowLength, overlap=overlap, sample_rate=sr)
    noise_stft_features = noiseAudioFeatureExtractor.get_stft_spectrogram()
    noisyPhase = np.angle(noise_stft_features)
    noise_stft_features = np.abs(noise_stft_features)

    # Standardise the magnitudes before feeding them to the network.
    mean = np.mean(noise_stft_features)
    std = np.std(noise_stft_features)
    noise_stft_features = (noise_stft_features - mean) / std

    # (numFeatures, numSegments, T) -> (T, numFeatures, numSegments, 1): one sample per STFT frame.
    predictors = prepare_input_features(noise_stft_features)
    predictors = np.reshape(predictors, (predictors.shape[0], predictors.shape[1], 1, predictors.shape[2]))
    predictors = np.transpose(predictors, (3, 0, 1, 2)).astype(np.float32)

    STFTFullyConvolutional = model.predict(predictors)
    denoisedAudioFullyConvolutional = revert_features_to_audio(noiseAudioFeatureExtractor, STFTFullyConvolutional,
                                                               noisyPhase, mean, std)
    # ipd.Audio(data=denoisedAudioFullyConvolutional, rate=fs)  # listen to the result inside a notebook
    sf.write(clean_audio + "_output.wav", denoisedAudioFullyConvolutional, 16000)
    print("write successful")
    print("Audio saved: " + clean_audio + "_output.wav")
if __name__ == "__main__":
    clean_audio = sys.argv[1]
    noise_audio = sys.argv[2]
    denoiseAudio(clean_audio, noise_audio)
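# Example invocation (hypothetical file names; any WAV inputs resampled to 16 kHz will do):
#   python denoiser_test.py speech.wav babble.wav
# writes speech.wav_noisy.wav (the synthetic noisy mix) and speech.wav_output.wav (the denoised result).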