MFCC
1. Mel Frequency Cepstrum Coefficient
$$Mel(f) = 2595 \cdot \log_{10}\left(1 + \frac{f}{700}\right)$$
import numpy as np
import matplotlib.pyplot as plt
# Evaluate the Mel mapping Mel(f) = 2595 * log10(1 + f / 700) at the integer
# frequency values f = 0, 1, ..., 8000.
x = np.arange(8001)
y = np.log10(x / 700 + 1) * 2595

# Draw on the current axes (created on demand if none exists yet).
ax = plt.gca()
ax.plot(x, y, color='blue', linewidth=3)
ax.set_xlabel("f", fontsize=17)
ax.set_ylabel("Mel(f)", fontsize=17)

# Pin both axes to the evaluated range so the curve starts at the origin and
# ends exactly at the last computed point.
ax.set_xlim(0, x[-1])
ax.set_ylim(0, y[-1])

plt.savefig('mel_f.png', dpi=500)
2. Workflow
import numpy as np
import scipy.io.wavfile
from matplotlib import pyplot as plt
from scipy.fftpack import dct
# Raw data: load the recording and keep only the first 3.5 s of samples.
sample_rate, signal = scipy.io.wavfile.read('OSR_us_000_0010_8k.wav')
clip_length = int(3.5 * sample_rate)
original_signal = signal[:clip_length]

# Sample indices for the full recording and for the 3.5 s excerpt.
signal_num = np.arange(len(signal))
sample_num = np.arange(len(original_signal))

# Convert sample indices to seconds once; both panels reuse these time axes.
signal_time = signal_num / sample_rate
sample_time = sample_num / sample_rate

# Plot 01: full recording with the excerpt overlaid (top), excerpt alone (bottom).
plt.figure(figsize=(11, 7), dpi=500)

# Top panel: whole signal in black, with the 3.5 s excerpt drawn over it in blue.
full_ax = plt.subplot(2, 1, 1)
full_ax.plot(signal_time, signal, color='black')
full_ax.plot(sample_time, original_signal, color='blue')
full_ax.set_ylabel("Amplitude")
full_ax.set_title("signal of Voice")

# Bottom panel: the 3.5 s excerpt on its own time scale.
clip_ax = plt.subplot(2, 1, 2)
clip_ax.plot(sample_time, original_signal, color='blue')
clip_ax.set_xlabel("Time (sec)")
clip_ax.set_ylabel("Amplitude")
clip_ax.set_title("3.5s signal of Voice ")

plt.savefig('mfcc_01.png')
References
- https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
- https://zhuanlan.zhihu.com/p/88625876