Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up

A differentiable version of SPTK

License

Notifications: You must be signed in to change notification settings

sp-nitech/diffsptk

Repository files navigation

diffsptk is a differentiable version of SPTK based on the PyTorch framework.

Latest Manual | Stable Manual | Downloads | Python Version | PyTorch Version | PyPI Version | Codecov | License | GitHub Actions | Ruff

Requirements

  • Python 3.10+
  • PyTorch 2.3.1+

Documentation

  • See this page for the reference manual.
  • Our paper is available on the ISCA Archive.

Installation

The latest stable release can be installed through PyPI by running

pip install diffsptk

The development release can be installed from the master branch:

pip install git+https://github.com/sp-nitech/diffsptk.git@master

Examples

Running on a GPU

import diffsptk

stft_params = {"frame_length": 400, "frame_period": 80, "fft_length": 512}

# Read waveform.
x, sr = diffsptk.read("assets/data.wav", device="cuda")

# Compute spectrogram using a nn.Module class.
X1 = diffsptk.STFT(**stft_params).to("cuda")(x)

# Compute spectrogram using a functional method.
X2 = diffsptk.functional.stft(x, **stft_params)

assert X1.device == X2.device
assert X1.allclose(X2)

Mel-cepstral analysis and synthesis

import diffsptk

fl = 400  # Frame length.
fp = 80  # Frame period.
n_fft = 512  # FFT length.
M = 24  # Mel-cepstrum dimensions.

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Compute STFT amplitude of x.
stft = diffsptk.STFT(frame_length=fl, frame_period=fp, fft_length=n_fft)
X = stft(x)

# Estimate mel-cepstrum of x.
alpha = diffsptk.get_alpha(sr)
mcep = diffsptk.MelCepstralAnalysis(
    fft_length=n_fft,
    cep_order=M,
    alpha=alpha,
    n_iter=10,
)
mc = mcep(X)

# Reconstruct x.
mlsa = diffsptk.MLSA(filter_order=M, frame_period=fp, alpha=alpha, taylor_order=20)
x_hat = mlsa(mlsa(x, -mc), mc)

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)

# Extract pitch of x.
pitch = diffsptk.Pitch(
    frame_period=fp,
    sample_rate=sr,
    f_min=80,
    f_max=180,
    voicing_threshold=0.4,
    out_format="pitch",
)
p = pitch(x)

# Generate excitation signal.
excite = diffsptk.ExcitationGeneration(frame_period=fp)
e = excite(p)
n = diffsptk.nrand(x.size(0) - 1)

# Synthesize waveform.
x_voiced = mlsa(e, mc)
x_unvoiced = mlsa(n, mc)

# Output analysis-synthesis result.
diffsptk.write("voiced.wav", x_voiced, sr)
diffsptk.write("unvoiced.wav", x_unvoiced, sr)

WORLD analysis and mel-cepstral synthesis

import diffsptk

fp = 80  # Frame period.
n_fft = 1024  # FFT length.
M = 24  # Mel-cepstrum dimensions.

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Extract F0 of x, or prepare well-estimated F0.
pitch = diffsptk.Pitch(
    frame_period=fp,
    sample_rate=sr,
    f_min=80,
    f_max=180,
    voicing_threshold=0.4,
    out_format="f0",
)
f0 = pitch(x)

# Extract aperiodicity of x by D4C.
ap = diffsptk.Aperiodicity(
    frame_period=fp,
    sample_rate=sr,
    fft_length=n_fft,
    algorithm="d4c",
    out_format="a",
)
A = ap(x, f0)

# Extract spectral envelope of x by CheapTrick.
pitch_spec = diffsptk.PitchAdaptiveSpectralAnalysis(
    frame_period=fp,
    sample_rate=sr,
    fft_length=n_fft,
    algorithm="cheap-trick",
    out_format="power",
)
H = pitch_spec(x, f0)

# Estimate mel-cepstrum of x.
alpha = diffsptk.get_alpha(sr)
mcep = diffsptk.MelCepstralAnalysis(fft_length=n_fft, cep_order=M, alpha=alpha)
mc_a = mcep(A)
mc_h = mcep(H)

# Generate excitation signals.
excite = diffsptk.ExcitationGeneration(frame_period=fp, unvoiced_region="zeros")
p = (sr / f0).nan_to_num(posinf=0)
pulse = excite(p)
noise = diffsptk.nrand(len(pulse) - 1)

# Make mixed excitation signal and reconstruct x.
mlsa = diffsptk.MLSA(filter_order=M, frame_period=fp, alpha=alpha, taylor_order=20)
e_p = pulse - mlsa(pulse, mc_a)
e_a = mlsa(noise, mc_a)
e = e_p + e_a
x_hat = mlsa(e, mc_h)

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

LPC analysis and synthesis

import diffsptk

fl = 400  # Frame length.
fp = 80  # Frame period.
M = 24  # LPC dimensions.

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Estimate LPC of x.
frame = diffsptk.Frame(frame_length=fl, frame_period=fp)
window = diffsptk.Window(in_length=fl)
lpc = diffsptk.LPC(frame_length=fl, lpc_order=M, eps=1e-6)
a = lpc(window(frame(x)))

# Convert to inverse filter coefficients.
norm0 = diffsptk.AllPoleToAllZeroDigitalFilterCoefficients(filter_order=M)
b = norm0(a)

# Reconstruct x.
zerodf = diffsptk.AllZeroDigitalFilter(filter_order=M, frame_period=fp)
poledf = diffsptk.AllPoleDigitalFilter(filter_order=M, frame_period=fp)
x_hat = poledf(zerodf(x, b), a)

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)

Mel-spectrogram, MFCC, and PLP extraction

import diffsptk

fl = 400  # Frame length
fp = 80  # Frame period
n_fft = 512  # FFT length
n_channel = 80  # Number of channels
M = 12  # MFCC/PLP dimensions

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Compute STFT amplitude of x.
stft = diffsptk.STFT(frame_length=fl, frame_period=fp, fft_length=n_fft)
X = stft(x)

# Extract log mel-spectrogram.
fbank = diffsptk.MelFilterBankAnalysis(
    fft_length=n_fft,
    n_channel=n_channel,
    sample_rate=sr,
)
Y = fbank(X)
print(Y.shape)

# Extract MFCC.
mfcc = diffsptk.MFCC(
    fft_length=n_fft,
    mfcc_order=M,
    n_channel=n_channel,
    sample_rate=sr,
)
Y = mfcc(X)
print(Y.shape)

# Extract PLP.
plp = diffsptk.PLP(
    fft_length=n_fft,
    plp_order=M,
    n_channel=n_channel,
    sample_rate=sr,
)
Y = plp(X)
print(Y.shape)

Subband decomposition

import diffsptk

K = 4  # Number of subbands.
M = 40  # Order of filter.

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Decompose x.
pqmf = diffsptk.PQMF(K, M)
decimate = diffsptk.Decimation(K)
y = decimate(pqmf(x))

# Reconstruct x.
interpolate = diffsptk.Interpolation(K)
ipqmf = diffsptk.IPQMF(K, M)
x_hat = ipqmf(interpolate(K * y)).reshape(-1)

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)

Gammatone filter bank analysis and synthesis

import diffsptk

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Decompose x.
gammatone = diffsptk.GammatoneFilterBankAnalysis(sr)
y = gammatone(x)

# Reconstruct x.
igammatone = diffsptk.GammatoneFilterBankSynthesis(sr)
x_hat = igammatone(y).reshape(-1)

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)

Constant-Q transform

import diffsptk
import librosa  # This is to get sample audio.

fp = 128  # Frame period.
K = 252  # Number of CQ-bins.
B = 36  # Number of bins per octave.

# Read waveform.
x, sr = diffsptk.read(librosa.ex("trumpet"))

# Transform x.
cqt = diffsptk.CQT(fp, sr, n_bin=K, n_bin_per_octave=B)
c = cqt(x)

# Reconstruct x.
icqt = diffsptk.ICQT(fp, sr, n_bin=K, n_bin_per_octave=B)
x_hat = icqt(c, out_length=x.size(0))

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)

Modified discrete cosine transform

import diffsptk

fl = 512  # Frame length.

# Read waveform.
x, sr = diffsptk.read("assets/data.wav")

# Transform x.
mdct = diffsptk.MDCT(fl)
c = mdct(x)

# Reconstruct x.
imdct = diffsptk.IMDCT(fl)
x_hat = imdct(c, out_length=x.size(0))

# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)

Vector quantization

import diffsptk

K = 2  # Codebook size.
M = 4  # Order of vector.

# Prepare input.
x = diffsptk.nrand(M)

# Quantize x.
vq = diffsptk.VectorQuantization(M, K)
x_hat, indices, commitment_loss = vq(x)

# Compute error.
error = (x_hat - x).abs().sum()
print(error)

License

This software is released under the Apache License 2.0.

Citation

@InProceedings{sp-nitech2023sptk,
  author = {Takenori Yoshimura and Takato Fujimoto and Keiichiro Oura and Keiichi Tokuda},
  title = {{SPTK4}: An open-source software toolkit for speech signal processing},
  booktitle = {12th ISCA Speech Synthesis Workshop (SSW 2023)},
  pages = {211--217},
  year = {2023},
}

[8]ページ先頭

©2009-2025 Movatter.jp