from sklearn.cluster import KMeans
import numpy as np
from scipy.optimize import minimize
class alg:
    """Cluster nodes from several metapath similarity matrices.

    The metapath matrices are row-normalised and combined as
    ``W = sum_p lambdas[p] * A_p``.  ``optimize`` alternates between updating
    an embedding ``F``, a similarity matrix ``S`` and the metapath weights
    ``lambdas``; ``run`` then clusters the result with k-means.

    The loss reported by ``optimize(verbose=True)`` is::

        ||S - W||_F^2 + alpha * ||S||_F^2 + beta * ||lambdas||^2
            + gamma * sum(eigenvalues(normalized_laplacian(S), num_clusters))

    NOTE: ``normalized_laplacian``, ``eigenvectors``, ``eigenvalues``,
    ``distance_matrix`` and ``best_simplex_projection`` are not defined in
    this class; they are expected to exist at module level.
    """

    def __init__(self, similarity_matrices, num_clusters, random_seed=0):
        """Validate and row-normalise the metapath similarity matrices.

        Args:
            similarity_matrices: Mapping ``metapath -> square 2-D array-like``.
                All matrices must share the same ``(n, n)`` shape.
            num_clusters: Number of clusters handed to k-means and to the
                eigen-decomposition helpers.
            random_seed: Seed forwarded to ``KMeans`` as ``random_state``.

        Raises:
            ValueError: If the mapping is empty, or if any matrix is not 2-D
                or does not match the shape of the first matrix.
        """
        self.num_clusters = num_clusters
        self.random_seed = random_seed
        self.num_nodes = None
        self.metapath_index = {}
        # Objective hyperparameters; they are assigned by optimize().
        self.alpha = None
        self.beta = None
        self.gamma = None

        normalized = []
        for index, (metapath, matrix) in enumerate(similarity_matrices.items()):
            matrix = np.asarray(matrix, dtype=float)
            if matrix.ndim != 2:
                raise ValueError('Invalid shape of similarity matrix.')
            if self.num_nodes is None:
                self.num_nodes = matrix.shape[0]
            if matrix.shape != (self.num_nodes, self.num_nodes):
                raise ValueError('Invalid shape of similarity matrix.')
            row_sums = matrix.sum(axis=1, keepdims=True)
            # A row that sums to zero (node unreachable along this metapath)
            # is kept as all zeros instead of becoming NaN through 0/0.
            normalized.append(np.divide(
                matrix, row_sums,
                out=np.zeros_like(matrix),
                where=row_sums != 0,
            ))
            self.metapath_index[metapath] = index

        if not normalized:
            raise ValueError('At least one similarity matrix is required.')

        # Shape: (num_metapaths, num_nodes, num_nodes).
        self.similarity_matrices = np.array(normalized)
        self.num_metapaths = len(normalized)

    def run(self, verbose=False, cluster_using='similarity'):
        """Optimise the similarity matrix and cluster the nodes.

        Args:
            verbose: If true, ``optimize`` prints the loss at every iteration.
            cluster_using: ``'similarity'`` runs k-means on the rows of the
                learned similarity matrix; ``'laplacian'`` runs k-means on the
                output of ``eigenvectors(normalized_laplacian(S))``.

        Returns:
            Tuple ``(labels, similarity_matrix, metapath_weights)`` where
            ``metapath_weights`` maps each metapath key to its learned weight.

        Raises:
            ValueError: If ``cluster_using`` is not one of the two options.
        """
        # Validate before the (expensive) optimisation starts.
        if cluster_using not in ('similarity', 'laplacian'):
            raise ValueError("Invalid option for parameter 'cluster_using'.")

        similarity_matrix, metapath_weights = self.optimize(verbose=verbose)

        if cluster_using == 'similarity':
            features = similarity_matrix
        else:
            laplacian = normalized_laplacian(similarity_matrix)
            features = eigenvectors(laplacian, num=self.num_clusters)
        labels = self.cluster(features)

        metapath_weights_dict = {
            metapath: metapath_weights[index]
            for metapath, index in self.metapath_index.items()
        }
        return labels, similarity_matrix, metapath_weights_dict

    def cluster(self, feature_matrix):
        """Return k-means labels for the rows of ``feature_matrix``."""
        kmeans = KMeans(self.num_clusters, n_init=10, random_state=self.random_seed)
        return kmeans.fit_predict(feature_matrix)

    def optimize(self, num_iterations=20, alpha=0.5, beta=10, gamma=0.01, verbose=False):
        """Alternately update ``F``, ``S`` and the metapath weights.

        Args:
            num_iterations: Number of alternating rounds (no early stopping).
            alpha: Weight of the ``||S||_F^2`` term.
            beta: Weight of the ``||lambdas||^2`` term.
            gamma: Weight of the Laplacian eigenvalue term.
            verbose: If true, print the loss before each round.

        Returns:
            Tuple ``(S, lambdas)``: the learned similarity matrix and the
            metapath weights (indexed as in ``self.metapath_index``).
        """
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma

        # Start from uniform metapath weights and S = W.
        lambdas = np.ones(self.num_metapaths) / self.num_metapaths
        W = self._combine(lambdas)
        S = W
        for iteration in range(num_iterations):
            if verbose:
                loss = self._loss(S, W, lambdas)
                print('Iteration %d: Loss = %0.3f' % (iteration, loss))
            F = self.optimize_F(S)
            S = self.optimize_S(W, F)
            lambdas = self.optimize_lambdas(S, lambdas)
            W = self._combine(lambdas)
        return S, lambdas

    def _combine(self, lambdas):
        """Return the weighted sum ``sum_p lambdas[p] * similarity_matrices[p]``."""
        return np.tensordot(lambdas, self.similarity_matrices, axes=[[0], [0]])

    def _loss(self, S, W, lambdas):
        """Evaluate the objective that ``optimize`` reports when verbose."""
        residual = S - W
        # Elementwise sums equal trace(X.T @ X) without the O(n^3) product.
        loss = np.sum(residual * residual)
        loss += self.alpha * np.sum(S * S)
        loss += self.beta * np.dot(lambdas, lambdas)
        loss += self.gamma * np.sum(
            eigenvalues(normalized_laplacian(S), num=self.num_clusters))
        return loss

    def optimize_F(self, S):
        """Return ``eigenvectors`` of the normalised Laplacian of ``S``."""
        LS = normalized_laplacian(S)
        return eigenvectors(LS, num=self.num_clusters)

    def optimize_S(self, W, F):
        """Update ``S`` row by row for fixed ``W`` and ``F``.

        Each row of ``(2*W - gamma*Q) / (2 + 2*alpha)``, with
        ``Q = distance_matrix(F, metric='euclidean')``, is passed through
        ``best_simplex_projection`` (defined elsewhere; presumably a
        projection onto the probability simplex -- confirm).
        """
        Q = distance_matrix(F, metric='euclidean')
        P = (2 * W - self.gamma * Q) / (2 + 2 * self.alpha)
        S = np.zeros((self.num_nodes, self.num_nodes))
        for index in range(self.num_nodes):
            S[index] = best_simplex_projection(P[index])
        return S

    def optimize_lambdas(self, S, init_lambdas):
        """Update the metapath weights for a fixed ``S``.

        Minimises ``||W||_F^2 - 2*<S, W> + beta*||lambdas||^2`` with
        ``W = sum_p lambdas[p] * A_p``, subject to ``sum(lambdas) == 1`` and
        ``0 <= lambdas[p] <= 1``, using SLSQP started at ``init_lambdas``.

        Because ``W`` is linear in ``lambdas`` the objective is the quadratic
        ``l^T G l - 2 b^T l + beta * l^T l`` with ``G[p, q] = <A_p, A_q>`` and
        ``b[p] = <S, A_p>``.  ``G`` and ``b`` are computed once per call, so
        each objective/gradient evaluation no longer rebuilds ``W``.

        Returns:
            1-D array with the optimised weights.
        """
        init_lambdas = np.asarray(init_lambdas, dtype=float)
        flat = self.similarity_matrices.reshape(self.similarity_matrices.shape[0], -1)
        gram = np.dot(flat, flat.T)
        cross = np.dot(flat, np.asarray(S, dtype=float).ravel())
        beta = self.beta

        def objective(lambdas):
            value = np.dot(lambdas, np.dot(gram, lambdas))
            value -= 2 * np.dot(cross, lambdas)
            value += beta * np.dot(lambdas, lambdas)
            return value

        def gradient(lambdas):
            # gram is symmetric, so d(l^T G l)/dl = 2 G l.
            return 2 * np.dot(gram, lambdas) - 2 * cross + 2 * beta * lambdas

        def sum_one(lambdas):
            return np.sum(lambdas) - 1

        result = minimize(
            objective,
            init_lambdas,
            jac=gradient,
            method='SLSQP',
            constraints={'type': 'eq', 'fun': sum_one},
            bounds=[(0, 1)] * len(init_lambdas),
        )
        return result.x
算法的 MATLAB 伪代码实现:
% ALG  Learn a similarity matrix from weighted metapath matrices and label
% the nodes by the connected components of the resulting graph.
%
% Inputs:
%   mp_matrix    - array indexed as mp_matrix(p,:,:); size(.,1) is the number
%                  of metapaths P and size(.,2) the number of nodes n.
%   c            - target number of clusters.
%   true_cluster - reference labels, passed to postprocess and to the
%                  NMI / purity / Rand evaluation helpers.
% Outputs:
%   y   - component labels returned by graphconncomp (transposed).
%   S   - similarity matrix from the last executed iteration.
%   evs - eigenvalues returned by eig1: first for the initial Laplacian,
%         then one column appended per iteration.
%   A   - symmetrised S, (S+S')/2.
%
% NOTE(review): eig1, L2_distance_1, EProjSimplex_new, optimizeLambda,
% postprocess, calculateNMI, eval_acc_purity and eval_rand are defined
% elsewhere; their exact behaviour is assumed below - confirm.
function [y, S, evs, A] = alg(mp_matrix, c, true_cluster)
NITER = 20;
% Threshold under which an eigenvalue sum is treated as zero (10e-11 == 1e-10).
zr = 10e-11;
alpha = 0.5;
beta = 10;
gamma = 0.01;
P = size(mp_matrix,1);
n = size(mp_matrix,2);
% Start from uniform metapath weights.
lambda = ones(P,1)./P;
% NOTE(review): eps is not used in the visible code and shadows MATLAB's built-in eps.
eps = 1e-10;
% A0 = weighted sum of the metapath matrices.
A0 = zeros(n,n);
for p = 1:P
A0 = A0 + lambda(p) * squeeze(mp_matrix(p,:,:));
end;
% Zero the diagonal, symmetrise, and build the unnormalised Laplacian D - A.
A0 = A0-diag(diag(A0));
A10 = (A0+A0')/2;
D10 = diag(sum(A10));
L0 = D10 - A10;
% Initial embedding: first c columns of what eig1 returns
% (presumably eigenvectors ordered by ascending eigenvalue - confirm).
[F0, ~, evs]=eig1(L0, n, 0);
F = F0(:,1:c);
% NOTE(review): pred is never read in the visible code.
[pred] = postprocess(F,c,true_cluster);
for iter = 1:NITER
% Pairwise distances between the rows of F (presumably squared Euclidean - confirm).
dist = L2_distance_1(F',F');
S = zeros(n);
% Row-wise update of S: each row is EProjSimplex_new applied to
% (a_i - 0.5*gamma*d_i)/(1+alpha); presumably a simplex projection - confirm.
for i=1:n
a0 = A0(i,:);
idxa0 = 1:n;
ai = a0(idxa0);
di = dist(i,idxa0);
ad = (ai-0.5*gamma*di)/(1+alpha); S(i,idxa0) = EProjSimplex_new(ad);
end;
% Symmetrise S and recompute the Laplacian embedding.
A = S;
A = (A+A')/2;
D = diag(sum(A));
L = D-A;
% Keep the previous embedding so it can be restored below.
F_old = F;
[F, ~, ev]=eig1(L, c, 0);
[pred] = postprocess(F,c,true_cluster);
% NOTE(review): assumes ev has as many entries as evs has rows, and at
% least c+1 entries (ev(1:c+1) is read below) - confirm against eig1.
evs(:,iter+1) = ev;
fn1 = sum(ev(1:c));
fn2 = sum(ev(1:c+1));
lambda_old = lambda;
% Adapt gamma from the leading eigenvalue sums:
%   sum of first c above zr          -> double gamma and re-optimise lambda;
%   sum of first c+1 below zr        -> halve gamma and restore previous F;
%   otherwise                        -> stop iterating.
% Presumably these correspond to fewer than / more than / exactly c
% connected components in A - confirm.
if fn1 > zr
gamma = 2*gamma;
lambda = optimizeLambda(mp_matrix, S, beta); % optimize lambda
elseif fn2 < zr
gamma = gamma/2; F = F_old; lambda = lambda_old;
else
break;
end;
% Rebuild A0 with the current weights.
% NOTE(review): unlike the initial A0, the diagonal is not zeroed here.
A0 = zeros(n,n);
for p = 1:P
A0 = A0 + lambda(p) * squeeze(mp_matrix(p,:,:));
end;
end;
% Final labels are the connected components of the symmetrised S.
[clusternum, y]=graphconncomp(sparse(A)); y = y';
% Evaluate against the reference labels and report.
nmi = calculateNMI(y,true_cluster);
purity = eval_acc_purity(true_cluster,y);
ri = eval_rand(true_cluster,y);
fprintf('Final NMI is %f\n',nmi);
fprintf('Final purity is %f\n',purity);
fprintf('Final rand is %f\n',ri);
% NOTE(review): sprintf only builds the string (shown as `ans` because there
% is no semicolon); the other messages here use fprintf.
if clusternum ~= c
sprintf('Can not find the correct cluster number: %d', c)
end;
谱聚类将聚类问题转化为图划分问题,并优化某个衡量划分质量的准则,例如归一化割(normalized cut)。通常,给定一组对象 $X=\{x_1,x_2,\dots,x_n\}$,标准谱聚类方法首先构造一个无向图 $G=(X,S)$,其中 $X$ 表示顶点集,$S$ 是相似度矩阵,$S_{ij}$ 度量对象 $x_i$ 与 $x_j$ 之间的相似性。然后计算拉普拉斯矩阵 $L_S$,并在此基础上进行特征分解,得到与 $k$ 个最小特征值相对应的 $k$ 个特征向量,其中 $k$ 是所需的聚类数量。这些特征向量被用作对象的新特征空间。最后,应用后处理步骤(例如 $k$ 均值和谱旋转)将对象划分为 $k$ 个簇。