- Notifications
You must be signed in to change notification settings - Fork 18
A differentiable version of SPTK
License
Notifications: You must be signed in to change notification settings
sp-nitech/diffsptk
Folders and files
Name | Name | Last commit message | Last commit date | |
---|---|---|---|---|
Repository files navigation
diffsptk is a differentiable version of SPTK based on the PyTorch framework.
- Python 3.10+
- PyTorch 2.3.1+
The latest stable release can be installed through PyPI by running
pip install diffsptk
The development release can be installed from the master branch:
pip install git+https://github.com/sp-nitech/diffsptk.git@master
# Example: compute a spectrogram on a CUDA device with both the nn.Module
# class and the functional interface, then check that the results agree.
import diffsptk

stft_params = {"frame_length": 400, "frame_period": 80, "fft_length": 512}

# Read waveform.
x, sr = diffsptk.read("assets/data.wav", device="cuda")

# Compute spectrogram using a nn.Module class.
X1 = diffsptk.STFT(**stft_params, device="cuda")(x)

# Compute spectrogram using a functional method.
X2 = diffsptk.functional.stft(x, **stft_params)

print(X1.allclose(X2))
# Example: estimate the mel-cepstrum of a waveform, reconstruct the waveform
# with an MLSA filter, and synthesize voiced/unvoiced signals from pitch.
import diffsptk

fl = 400  # Frame length.
fp = 80  # Frame period.
n_fft = 512  # FFT length.
M = 24  # Mel-cepstrum dimensions.

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Compute STFT amplitude of x.
stft = diffsptk.STFT(frame_length=fl, frame_period=fp, fft_length=n_fft)
X = stft(x)

# Estimate mel-cepstrum of x.
alpha = diffsptk.get_alpha(sr)
mcep = diffsptk.MelCepstralAnalysis(
    fft_length=n_fft,
    cep_order=M,
    alpha=alpha,
    n_iter=10,
)
mc = mcep(X)

# Reconstruct x.
mlsa = diffsptk.MLSA(filter_order=M, frame_period=fp, alpha=alpha, taylor_order=20)
x_hat = mlsa(mlsa(x, -mc), mc)

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)

# Extract pitch of x.
pitch = diffsptk.Pitch(
    frame_period=fp,
    sample_rate=sr,
    f_min=80,
    f_max=180,
    voicing_threshold=0.4,
    out_format="pitch",
)
p = pitch(x)

# Generate excitation signal.
excite = diffsptk.ExcitationGeneration(frame_period=fp)
e = excite(p)
n = diffsptk.nrand(x.size(0) - 1)

# Synthesize waveform.
x_voiced = mlsa(e, mc)
x_unvoiced = mlsa(n, mc)

# Output analysis-synthesis result.
diffsptk.write("voiced.wav", x_voiced, sr)
diffsptk.write("unvoiced.wav", x_unvoiced, sr)
# Example: extract F0, aperiodicity (D4C) and spectral envelope (CheapTrick)
# from a waveform, then reconstruct the waveform with WorldSynthesis.
import diffsptk

fp = 80  # Frame period.
n_fft = 1024  # FFT length.

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Extract F0 of x, or prepare well-estimated F0.
pitch = diffsptk.Pitch(
    frame_period=fp,
    sample_rate=sr,
    f_min=80,
    f_max=180,
    voicing_threshold=0.4,
    out_format="f0",
)
f0 = pitch(x)

# Extract aperiodicity of x by D4C.
ap = diffsptk.Aperiodicity(
    frame_period=fp,
    sample_rate=sr,
    fft_length=n_fft,
    algorithm="d4c",
    out_format="a",
)
A = ap(x, f0)

# Extract spectral envelope of x by CheapTrick.
pitch_spec = diffsptk.PitchAdaptiveSpectralAnalysis(
    frame_period=fp,
    sample_rate=sr,
    fft_length=n_fft,
    algorithm="cheap-trick",
    out_format="power",
)
S = pitch_spec(x, f0)

# Reconstruct x.
world_synth = diffsptk.WorldSynthesis(
    frame_period=fp,
    sample_rate=sr,
    fft_length=n_fft,
)
x_hat = world_synth(f0, A, S)

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)
# Example: estimate LPC coefficients of a waveform, convert them to inverse
# filter coefficients, and reconstruct the waveform with all-zero and
# all-pole digital filters.
import diffsptk

fl = 400  # Frame length.
fp = 80  # Frame period.
M = 24  # LPC dimensions.

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Estimate LPC of x.
frame = diffsptk.Frame(frame_length=fl, frame_period=fp)
window = diffsptk.Window(in_length=fl)
lpc = diffsptk.LPC(frame_length=fl, lpc_order=M, eps=1e-5)
a = lpc(window(frame(x)))

# Convert to inverse filter coefficients.
norm0 = diffsptk.AllPoleToAllZeroDigitalFilterCoefficients(filter_order=M)
b = norm0(a)

# Reconstruct x.
zerodf = diffsptk.AllZeroDigitalFilter(filter_order=M, frame_period=fp)
poledf = diffsptk.AllPoleDigitalFilter(filter_order=M, frame_period=fp)
x_hat = poledf(zerodf(x, b), a)

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)
# Example: extract a log mel-spectrogram, map it back to a linear
# spectrogram, and reconstruct the waveform with Griffin-Lim.
import diffsptk

fl = 400  # Frame length.
fp = 80  # Frame period.
n_fft = 512  # FFT length.
n_channel = 128  # Number of channels.

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Compute STFT amplitude of x.
stft = diffsptk.STFT(frame_length=fl, frame_period=fp, fft_length=n_fft)
X = stft(x)

# Extract log mel-spectrogram.
fbank = diffsptk.FBANK(
    fft_length=n_fft,
    n_channel=n_channel,
    sample_rate=sr,
)
Y = fbank(X)

# Reconstruct linear spectrogram.
ifbank = diffsptk.IFBANK(
    n_channel=n_channel,
    fft_length=n_fft,
    sample_rate=sr,
)
X_hat = ifbank(Y)

# Reconstruct x.
griffin = diffsptk.GriffinLim(
    frame_length=fl,
    frame_period=fp,
    fft_length=n_fft,
)
x_hat = griffin(X_hat, out_length=x.size(0))

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)
# Example: decompose a waveform into subbands with PQMF and decimation, then
# reconstruct it with interpolation and IPQMF.
import diffsptk

K = 4  # Number of subbands.
M = 40  # Order of filter.

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Decompose x.
pqmf = diffsptk.PQMF(K, M)
decimate = diffsptk.Decimation(K)
y = decimate(pqmf(x))

# Reconstruct x.
interpolate = diffsptk.Interpolation(K)
ipqmf = diffsptk.IPQMF(K, M)
x_hat = ipqmf(interpolate(K * y)).reshape(-1)

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)
# Example: decompose a waveform with a gammatone filter bank and reconstruct
# it with the matching synthesis module.
import diffsptk

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Decompose x.
gammatone = diffsptk.GammatoneFilterBankAnalysis(sr)
y = gammatone(x)

# Reconstruct x.
igammatone = diffsptk.GammatoneFilterBankSynthesis(sr)
x_hat = igammatone(y).reshape(-1)

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)
# Example: decompose a waveform with fractional octave band analysis and
# reconstruct it by summing the analysis output along dimension 1.
import diffsptk

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Decompose x.
oband = diffsptk.FractionalOctaveBandAnalysis(sr)
y = oband(x)

# Reconstruct x.
x_hat = y.sum(1).reshape(-1)

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)
# Example: apply the constant-Q transform to a sample recording and
# reconstruct the waveform with the inverse transform.
import diffsptk
import librosa  # This is to get sample audio.

fp = 128  # Frame period.
K = 252  # Number of CQ-bins.
B = 36  # Number of bins per octave.

# Read waveform.
x, sr = diffsptk.read(librosa.ex("trumpet"))

# Transform x.
cqt = diffsptk.CQT(fp, sr, n_bin=K, n_bin_per_octave=B)
c = cqt(x)

# Reconstruct x.
icqt = diffsptk.ICQT(fp, sr, n_bin=K, n_bin_per_octave=B)
x_hat = icqt(c, out_length=x.size(0))

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)
# Example: transform a waveform with MDCT and reconstruct it with IMDCT.
import diffsptk

fl = 512  # Frame length.

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Transform x.
mdct = diffsptk.MDCT(fl)
c = mdct(x)

# Reconstruct x.
imdct = diffsptk.IMDCT(fl)
x_hat = imdct(c, out_length=x.size(0))

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)
# Example: quantize a random vector with VectorQuantization and measure the
# absolute error between the input and its quantized version.
import diffsptk

K = 2  # Codebook size.
M = 4  # Order of vector.

# Prepare input.
x = diffsptk.nrand(M)

# Quantize x.
vq = diffsptk.VectorQuantization(M, K)
x_hat, indices, commitment_loss = vq(x)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)
This software is released under the Apache License 2.0.
@InProceedings{sp-nitech2023sptk,
  author = {Takenori Yoshimura and Takato Fujimoto and Keiichiro Oura and Keiichi Tokuda},
  title = {{SPTK4}: An open-source software toolkit for speech signal processing},
  booktitle = {12th ISCA Speech Synthesis Workshop (SSW 2023)},
  pages = {211--217},
  year = {2023},
}
About
A differentiable version of SPTK
Topics
Resources
License
Code of conduct
Uh oh!
There was an error while loading. Please reload this page.
Stars
Watchers
Forks
Uh oh!
There was an error while loading. Please reload this page.
Contributors: 3
Uh oh!
There was an error while loading. Please reload this page.