Last active
June 13, 2023 16:21
-
-
Save r9y9/88bda659c97f46f42525 to your computer and use it in GitHub Desktop.
GMM-based statistical voice conversion module (http://r9y9.github.io/blog/2014/07/13/statistical-voice-conversion-wakaran/)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# coding: utf-8 | |
import numpy as np | |
from numpy import linalg | |
from sklearn.mixture import GMM | |
import scipy.linalg | |
import scipy.sparse | |
import scipy.sparse.linalg | |
class GMMMap:
    """GMM-based frame-by-frame speech parameter mapping.

    GMMMap represents a class to transform spectral features of a source
    speaker to that of a target speaker based on Gaussian Mixture Models
    of source and target joint spectral features.

    Notation
    --------
    Source speaker's feature: X = {x_t}, 0 <= t < T
    Target speaker's feature: Y = {y_t}, 0 <= t < T
    where T is the number of time frames.

    Parameters
    ----------
    gmm : sklearn.mixture.GMM
        Gaussian Mixture Models of source and target joint features.
        Only ``means_``, ``covars_`` (full covariance) and ``weights_``
        are read.
    swap : bool (default=False)
        False: the first half of each joint vector is treated as the
        source (X) and the second half as the target (Y).
        True: the two halves exchange roles, i.e. the mapping goes from
        the second half to the first half.

    Attributes
    ----------
    num_mixtures : int
        the number of Gaussian mixtures
    weights : array, shape (`num_mixtures`)
        weights for each gaussian
    src_means : array, shape (`num_mixtures`, `order of spectral feature`)
        means of GMM for a source speaker
    tgt_means : array, shape (`num_mixtures`, `order of spectral feature`)
        means of GMM for a target speaker
    covarXX : array, shape (`num_mixtures`, `order of spectral feature`,
        `order of spectral feature`)
        variance matrix of source speaker's spectral feature
    covarXY : array, shape (`num_mixtures`, `order of spectral feature`,
        `order of spectral feature`)
        covariance matrix of source and target speaker's spectral feature
    covarYX : array, shape (`num_mixtures`, `order of spectral feature`,
        `order of spectral feature`)
        covariance matrix of target and source speaker's spectral feature
    covarYY : array, shape (`num_mixtures`, `order of spectral feature`,
        `order of spectral feature`)
        variance matrix of target speaker's spectral feature
    D : array, shape (`num_mixtures`, `order of spectral feature`,
        `order of spectral feature`)
        conditional covariance of the target given the source for each
        mixture, covarYY - covarYX * covarXX^-1 * covarXY
        (eq.(12) in [Toda 2007])
    px : sklearn.mixture.GMM
        Gaussian Mixture Models of source speaker's features

    Reference
    ---------
      - [Toda 2007] Voice Conversion Based on Maximum Likelihood Estimation
        of Spectral Parameter Trajectory.
        http://isw3.naist.jp/~tomoki/Tomoki/Journals/IEEE-Nov-2007_MLVC.pdf
    """

    def __init__(self, gmm, swap=False):
        self.num_mixtures = gmm.means_.shape[0]
        # D is the order of spectral feature for a speaker.  Floor division
        # keeps it an int on Python 3 as well, so it stays usable as a
        # slice bound and as an array dimension.
        D = gmm.means_.shape[1] // 2
        self.weights = gmm.weights_

        # Split source and target parameters from joint GMM
        self.src_means = gmm.means_[:, :D]
        self.tgt_means = gmm.means_[:, D:]
        self.covarXX = gmm.covars_[:, :D, :D]
        self.covarXY = gmm.covars_[:, :D, D:]
        self.covarYX = gmm.covars_[:, D:, :D]
        self.covarYY = gmm.covars_[:, D:, D:]

        # swap src and target parameters
        if swap:
            self.src_means, self.tgt_means = self.tgt_means, self.src_means
            self.covarXX, self.covarYY = self.covarYY, self.covarXX
            self.covarXY, self.covarYX = self.covarYX, self.covarXY

        # Compute D eq.(12) in [Toda 2007]
        self.D = np.zeros((self.num_mixtures, D, D))
        for m in range(self.num_mixtures):
            # solve() instead of an explicit inverse of covarXX
            xx_inv_xy = np.linalg.solve(self.covarXX[m], self.covarXY[m])
            self.D[m] = self.covarYY[m] - np.dot(self.covarYX[m], xx_inv_xy)

        # p(x), which is used to compute posterior prob. for a given source
        # spectral feature in mapping stage.  The marginal model is not
        # fitted; its parameters are assigned directly from the joint GMM.
        self.px = GMM(n_components=self.num_mixtures, covariance_type="full")
        self.px.means_ = self.src_means
        self.px.covars_ = self.covarXX
        self.px.weights_ = self.weights

    def convert(self, src):
        """
        Mapping source spectral feature x to target spectral feature y
        so that minimize the mean least squared error.
        More specifically, it returns the value E[p(y|x)].

        Parameters
        ----------
        src : array, shape (`order of spectral feature`)
            source speaker's spectral feature that will be transformed

        Return
        ------
        converted spectral feature, computed as the posterior-weighted sum
        of the per-mixture conditional means.  Because the posterior is
        evaluated on a single-row 2-D view of `src`, the result is
        presumably of shape (1, `order of spectral feature`).
        """
        D = len(src)

        # Eq.(11): conditional mean of y given x for every mixture
        E = np.zeros((self.num_mixtures, D))
        for m in range(self.num_mixtures):
            xx = np.linalg.solve(self.covarXX[m], src - self.src_means[m])
            E[m] = self.tgt_means[m] + self.covarYX[m].dot(xx)

        # Eq.(9) p(m|x)
        posterior = self.px.predict_proba(np.atleast_2d(src))

        # Eq.(13) conditional mean E[p(y|x)]
        return posterior.dot(E)
class TrajectoryGMMMap(GMMMap):
    """
    Trajectory-based speech parameter mapping for voice conversion
    based on the maximum likelihood criterion.

    Parameters
    ----------
    gmm : sklearn.mixture.GMM
        Gaussian Mixture Models of source and target speaker joint features.
        Each joint vector is laid out as
        [src, src_delta, tgt, tgt_delta].
    T : int
        the number of frames the weight matrix W is initially built for.
        W is rebuilt automatically when `convert` receives a sequence of
        a different length.
    gv : sklearn.mixture.GMM (default=None)
        Gaussian Mixture Models of target speaker's global variance of
        spectral feature.  Only the first mixture is read.
    swap : bool (default=False)
        Passed through to GMMMap: False maps the first half of the joint
        vector to the second half, True maps the second half to the first.

    Attributes
    ----------
    T : int
        the number of frames the current W was built for
    W : scipy.sparse.csr_matrix, shape (2*D*T, D*T)
        weight matrix that maps a static feature sequence to the
        interleaved static and delta feature sequence (eq.(25) ~ (28)),
        where D is the order of the static feature
    gv_mean, gv_covar, Pv : arrays
        mean, covariance and inverse covariance of the first GV mixture.
        Only set when `gv` is given; they are not used by `convert`.
    E : array, shape (2*D*T)
        set by `convert`: flattened conditional means (eq.(40))
    D : scipy.sparse.csr_matrix, shape (2*D*T, 2*D*T)
        set by `convert`: block-diagonal matrix of inverse conditional
        covariances (eq.(41)).  This overwrites the per-mixture array of
        the same name set by GMMMap.

    Reference
    ---------
      - [Toda 2007] Voice Conversion Based on Maximum Likelihood Estimation
        of Spectral Parameter Trajectory.
        http://isw3.naist.jp/~tomoki/Tomoki/Journals/IEEE-Nov-2007_MLVC.pdf
    """

    def __init__(self, gmm, T, gv=None, swap=False):
        GMMMap.__init__(self, gmm, swap)

        self.T = T
        # shape[1] = d(src) + d(src_delta) + d(tgt) + d(tgt_delta)
        # Floor division keeps D an int on Python 3 as well.
        D = gmm.means_.shape[1] // 4

        ## Setup for Trajectory-based mapping
        self.__construct_weight_matrix(T, D)

        ## Setup for GV post-filtering
        # It is assumed that GV is modeled as a single mixture GMM
        if gv is not None:
            self.gv_mean = gv.means_[0]
            self.gv_covar = gv.covars_[0]
            self.Pv = np.linalg.inv(self.gv_covar)

    def __construct_weight_matrix(self, T, D):
        """Build the sparse weight matrix W of eq.(25) ~ (28) into self.W.

        For every frame t, W holds D rows that select the static feature
        y_t followed by D rows that compute the delta feature
        0.5 * y_{t+1} - 0.5 * y_{t-1} (terms outside the sequence are
        dropped).

        Parameters
        ----------
        T : int
            the number of frames
        D : int
            the order of the static feature
        """
        frames = np.arange(T)
        with_prev = frames[1:]   # frames that have a predecessor t-1
        with_next = frames[:-1]  # frames that have a successor t+1

        # Per-frame scalar coefficients, shape (2*T, T): row 2t is the static
        # window of frame t and row 2t+1 is its delta window.
        rows = np.concatenate([2 * frames,
                               2 * with_prev + 1,
                               2 * with_next + 1])
        cols = np.concatenate([frames, with_prev - 1, with_next + 1])
        vals = np.concatenate([np.ones(T),
                               np.full(with_prev.shape, -0.5),
                               np.full(with_next.shape, 0.5)])
        coeffs = scipy.sparse.coo_matrix((vals, (rows, cols)),
                                         shape=(2 * T, T))

        # Every scalar coefficient acts on a whole D-dimensional feature
        # vector, i.e. it is multiplied by a D x D identity block.  A single
        # Kronecker product builds W in one go instead of stacking one
        # frame at a time.
        self.W = scipy.sparse.kron(coeffs, scipy.sparse.identity(D),
                                   format="csr")

        assert self.W.shape == (2 * D * T, D * T)

    def convert(self, src):
        """
        Mapping source spectral feature x to target spectral feature y
        so that maximize the likelihood of y given x.

        Parameters
        ----------
        src : array, shape (`the number of frames`, `the order of spectral feature`)
            a sequence of source speaker's spectral feature that will be
            transformed.  The second axis is split in half to obtain the
            order of the static feature, so it is expected to hold the
            static and delta features.

        Return
        ------
        a sequence of transformed spectral features (static part only),
        shape (`the number of frames`, `the order of static feature`)
        """
        T, D = src.shape[0], src.shape[1] // 2

        if T != self.T:
            self.__construct_weight_matrix(T, D)
            # Remember which length W now corresponds to; otherwise a later
            # call with the previous length would reuse a mis-sized W.
            self.T = T

        # A suboptimum mixture sequence  (eq.37)
        optimum_mix = self.px.predict(src)

        # Compute E eq.(40)
        self.E = np.zeros((T, 2 * D))
        for t in range(T):
            m = optimum_mix[t]  # estimated mixture index at time t
            xx = np.linalg.solve(self.covarXX[m], src[t] - self.src_means[m])
            # Eq. (22)
            self.E[t] = self.tgt_means[m] + np.dot(self.covarYX[m], xx)
        self.E = self.E.flatten()

        # Compute D eq.(41). Note that self.D represents D^-1.
        # The inverse conditional covariance depends only on the mixture,
        # so it is computed once per mixture that actually occurs.
        precisions = {}
        for m in np.unique(optimum_mix):
            xx_inv_xy = np.linalg.solve(self.covarXX[m], self.covarXY[m])
            # Eq. (23)
            cond_covar = self.covarYY[m] - np.dot(self.covarYX[m], xx_inv_xy)
            precisions[m] = np.linalg.inv(cond_covar)

        # Assemble the block-diagonal matrix directly in sparse form; a dense
        # (2*D*T, 2*D*T) intermediate grows quadratically with T.
        self.D = scipy.sparse.block_diag([precisions[m] for m in optimum_mix],
                                         format="csr")

        # Compute target static features
        # eq.(39)
        covar = self.W.T.dot(self.D.dot(self.W))
        y = scipy.sparse.linalg.spsolve(covar,
                                        self.W.T.dot(self.D.dot(self.E)),
                                        use_umfpack=False)
        return y.reshape((T, D))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Does anyone have the above code in MATLAB?