- Notifications
You must be signed in to change notification settings - Fork 14
A differentiable version of SPTK
License
Notifications: You must be signed in to change notification settings
sp-nitech/diffsptk
Folders and files
Name | Name | Last commit message | Last commit date | |
---|---|---|---|---|
Repository files navigation
diffsptk is a differentiable version of SPTK based on the PyTorch framework.
- Python 3.10+
- PyTorch 2.3.1+
The latest stable release can be installed through PyPI by running
pip install diffsptk
The development release can be installed from the master branch:
pip install git+https://github.com/sp-nitech/diffsptk.git@master
# Example: compute the same spectrogram through the nn.Module interface and
# through the functional interface, then check that the two results agree.
import diffsptk

# Shared STFT settings so both interfaces are configured identically.
stft_params = {"frame_length": 400, "frame_period": 80, "fft_length": 512}

# Read waveform.
x, sr = diffsptk.read("assets/data.wav", device="cuda")

# Compute spectrogram using a nn.Module class.
X1 = diffsptk.STFT(**stft_params).to("cuda")(x)

# Compute spectrogram using a functional method.
X2 = diffsptk.functional.stft(x, **stft_params)

# Both results must be on the same device and numerically close.
assert X1.device == X2.device
assert X1.allclose(X2)
# Example: estimate the mel-cepstrum of a waveform, reconstruct the waveform
# with an MLSA filter, and synthesize voiced/unvoiced versions from generated
# excitation signals.
import diffsptk

fl = 400  # Frame length.
fp = 80  # Frame period.
n_fft = 512  # FFT length.
M = 24  # Mel-cepstrum dimensions.

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Compute STFT amplitude of x.
stft = diffsptk.STFT(frame_length=fl, frame_period=fp, fft_length=n_fft)
X = stft(x)

# Estimate mel-cepstrum of x.
alpha = diffsptk.get_alpha(sr)
mcep = diffsptk.MelCepstralAnalysis(
    fft_length=n_fft,
    cep_order=M,
    alpha=alpha,
    n_iter=10,
)
mc = mcep(X)

# Reconstruct x.
# The inner call filters x with the negated coefficients; the outer call
# filters that result with mc again.
mlsa = diffsptk.MLSA(filter_order=M, frame_period=fp, alpha=alpha, taylor_order=20)
x_hat = mlsa(mlsa(x, -mc), mc)

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)

# Extract pitch of x.
pitch = diffsptk.Pitch(
    frame_period=fp,
    sample_rate=sr,
    f_min=80,
    f_max=180,
    voicing_threshold=0.4,
    out_format="pitch",
)
p = pitch(x)

# Generate excitation signal.
excite = diffsptk.ExcitationGeneration(frame_period=fp)
e = excite(p)
n = diffsptk.nrand(x.size(0) - 1)

# Synthesize waveform.
x_voiced = mlsa(e, mc)
x_unvoiced = mlsa(n, mc)

# Output analysis-synthesis result.
diffsptk.write("voiced.wav", x_voiced, sr)
diffsptk.write("unvoiced.wav", x_unvoiced, sr)
# Example: extract F0, aperiodicity, and spectral envelope from a waveform,
# convert both to mel-cepstra, and reconstruct the waveform from a mixed
# (pulse + noise) excitation signal.
import diffsptk

fp = 80  # Frame period.
n_fft = 1024  # FFT length.
M = 24  # Mel-cepstrum dimensions.

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Extract F0 of x, or prepare well-estimated F0.
pitch = diffsptk.Pitch(
    frame_period=fp,
    sample_rate=sr,
    f_min=80,
    f_max=180,
    voicing_threshold=0.4,
    out_format="f0",
)
f0 = pitch(x)

# Extract aperiodicity of x by D4C.
ap = diffsptk.Aperiodicity(
    frame_period=fp,
    sample_rate=sr,
    fft_length=n_fft,
    algorithm="d4c",
    out_format="a",
)
A = ap(x, f0)

# Extract spectral envelope of x by CheapTrick.
pitch_spec = diffsptk.PitchAdaptiveSpectralAnalysis(
    frame_period=fp,
    sample_rate=sr,
    fft_length=n_fft,
    algorithm="cheap-trick",
    out_format="power",
)
H = pitch_spec(x, f0)

# Estimate mel-cepstrum of x.
alpha = diffsptk.get_alpha(sr)
mcep = diffsptk.MelCepstralAnalysis(fft_length=n_fft, cep_order=M, alpha=alpha)
mc_a = mcep(A)
mc_h = mcep(H)

# Generate excitation signals.
excite = diffsptk.ExcitationGeneration(frame_period=fp, unvoiced_region="zeros")
# Convert F0 to a period in samples; an infinite quotient (f0 == 0) becomes 0.
p = (sr / f0).nan_to_num(posinf=0)
pulse = excite(p)
noise = diffsptk.nrand(len(pulse) - 1)

# Make mixed excitation signal and reconstruct x.
mlsa = diffsptk.MLSA(filter_order=M, frame_period=fp, alpha=alpha, taylor_order=20)
e_p = pulse - mlsa(pulse, mc_a)
e_a = mlsa(noise, mc_a)
e = e_p + e_a
x_hat = mlsa(e, mc_h)

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)
# Example: estimate LPC coefficients of a waveform, derive the corresponding
# all-zero (inverse) filter, and reconstruct the waveform by filtering with
# the all-zero filter followed by the all-pole filter.
import diffsptk

fl = 400  # Frame length.
fp = 80  # Frame period.
M = 24  # LPC dimensions.

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Estimate LPC of x.
frame = diffsptk.Frame(frame_length=fl, frame_period=fp)
window = diffsptk.Window(in_length=fl)
lpc = diffsptk.LPC(frame_length=fl, lpc_order=M, eps=1e-6)
a = lpc(window(frame(x)))

# Convert to inverse filter coefficients.
norm0 = diffsptk.AllPoleToAllZeroDigitalFilterCoefficients(filter_order=M)
b = norm0(a)

# Reconstruct x.
zerodf = diffsptk.AllZeroDigitalFilter(filter_order=M, frame_period=fp)
poledf = diffsptk.AllPoleDigitalFilter(filter_order=M, frame_period=fp)
x_hat = poledf(zerodf(x, b), a)

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)
# Example: extract a log mel-spectrogram, MFCC, and PLP features from the
# STFT amplitude of a waveform and print the shape of each result.
import diffsptk

fl = 400  # Frame length
fp = 80  # Frame period
n_fft = 512  # FFT length
n_channel = 80  # Number of channels
M = 12  # MFCC/PLP dimensions

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Compute STFT amplitude of x.
stft = diffsptk.STFT(frame_length=fl, frame_period=fp, fft_length=n_fft)
X = stft(x)

# Extract log mel-spectrogram.
fbank = diffsptk.MelFilterBankAnalysis(
    fft_length=n_fft,
    n_channel=n_channel,
    sample_rate=sr,
)
Y = fbank(X)
print(Y.shape)

# Extract MFCC.
mfcc = diffsptk.MFCC(
    fft_length=n_fft,
    mfcc_order=M,
    n_channel=n_channel,
    sample_rate=sr,
)
Y = mfcc(X)
print(Y.shape)

# Extract PLP.
plp = diffsptk.PLP(
    fft_length=n_fft,
    plp_order=M,
    n_channel=n_channel,
    sample_rate=sr,
)
Y = plp(X)
print(Y.shape)
# Example: decompose a waveform into K subbands with PQMF and decimation,
# then reconstruct it with interpolation and the inverse PQMF.
import diffsptk

K = 4  # Number of subbands.
M = 40  # Order of filter.

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Decompose x.
pqmf = diffsptk.PQMF(K, M)
decimate = diffsptk.Decimation(K)
y = decimate(pqmf(x))

# Reconstruct x.
# The subband signals are scaled by K before interpolation.
interpolate = diffsptk.Interpolation(K)
ipqmf = diffsptk.IPQMF(K, M)
x_hat = ipqmf(interpolate(K * y)).reshape(-1)

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)
# Example: decompose a waveform with a gammatone filter bank and reconstruct
# it with the matching synthesis filter bank.
import diffsptk

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Decompose x.
gammatone = diffsptk.GammatoneFilterBankAnalysis(sr)
y = gammatone(x)

# Reconstruct x.
igammatone = diffsptk.GammatoneFilterBankSynthesis(sr)
x_hat = igammatone(y).reshape(-1)

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)
# Example: apply the constant-Q transform to a sample recording and
# reconstruct the waveform with the inverse transform.
import diffsptk
import librosa  # This is to get sample audio.

fp = 128  # Frame period.
K = 252  # Number of CQ-bins.
B = 36  # Number of bins per octave.

# Read waveform.
x, sr = diffsptk.read(librosa.ex("trumpet"))

# Transform x.
cqt = diffsptk.CQT(fp, sr, n_bin=K, n_bin_per_octave=B)
c = cqt(x)

# Reconstruct x.
# out_length is set to the input length so x_hat can be compared with x.
icqt = diffsptk.ICQT(fp, sr, n_bin=K, n_bin_per_octave=B)
x_hat = icqt(c, out_length=x.size(0))

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)
# Example: apply the MDCT to a waveform and reconstruct the waveform with
# the inverse MDCT.
import diffsptk

fl = 512  # Frame length.

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Transform x.
mdct = diffsptk.MDCT(fl)
c = mdct(x)

# Reconstruct x.
# out_length is set to the input length so x_hat can be compared with x.
imdct = diffsptk.IMDCT(fl)
x_hat = imdct(c, out_length=x.size(0))

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)
# Example: quantize a random vector with a vector-quantization module and
# print the absolute quantization error.
import diffsptk

K = 2  # Codebook size.
M = 4  # Order of vector.

# Prepare input.
x = diffsptk.nrand(M)

# Quantize x.
# The module returns the quantized vector, the selected indices, and a
# commitment loss; only the quantized vector is used below.
vq = diffsptk.VectorQuantization(M, K)
x_hat, indices, commitment_loss = vq(x)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)
This software is released under the Apache License 2.0.
@InProceedings{sp-nitech2023sptk,
  author = {Takenori Yoshimura and Takato Fujimoto and Keiichiro Oura and Keiichi Tokuda},
  title = {{SPTK4}: An open-source software toolkit for speech signal processing},
  booktitle = {12th ISCA Speech Synthesis Workshop (SSW 2023)},
  pages = {211--217},
  year = {2023},
}
About
A differentiable version of SPTK