I am trying to plot results of multiple clustering algorithms on multiple data sets
The code is the following
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from time import time
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
np.random.seed(0)
cs_arry=[[0,1,0,1],[1,0,0,1],[0,0,1,1],[1,1,0,1],[1,0,1,1],[0,1,1,1]]
cs_arry=np.array(cs_arry * 20)
colors = np.array([x for x in cs_arry])
#################
n_pts = 1500#####
#################
noisy_circles = datasets.make_circles(n_samples=n_pts, factor=.5, noise=.05)
no_structure = np.random.rand(n_pts, 2), None
my_circles=make_circ(n_samples=n_pts, factor=.35, noise=.025, in_fact=2)
##############################################
blob_centers = [(-10, -10), (-3, 10), (10, 10)] # adjust blob centers for blob data making
##############################################
blobs=datasets.make_blobs(n_samples=n_pts, n_features=3, centers=blob_centers, cluster_std=3,
center_box=(-20,20), shuffle=True, random_state=0)
clustering_names=['Input','K-means','K-medoids','AP','Spectral','Gaussian','Hierarchical']
nofclusters=3
plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=0,
hspace=0)
data_sets = [noisy_circles, my_circles, blobs, no_structure]
plot_num = 1
for i_dataset, dataset in enumerate(data_sets):
x, y = dataset
x = StandardScaler().fit_transform(x)
bandwidth = cluster.estimate_bandwidth(x, quantile=0.3)
connectivity = kneighbors_graph(x, n_neighbors=10, include_self=False)
connectivity = 0.5 * (connectivity + connectivity.T)
#clustering algorithms
orig_data=cluster.KMeans(n_clusters=1)
kmeans=cluster.KMeans(n_clusters=nofclusters,init='k-means++')
Kmedoids=cluster.KMeans(n_clusters=nofclusters,init='random')
heir = cluster.AgglomerativeClustering(n_clusters=nofclusters, linkage='ward',
connectivity=connectivity)
spectral = cluster.SpectralClustering(n_clusters=2,
eigen_solver='arpack',
affinity="nearest_neighbors")
affinity_propagation = cluster.AffinityPropagation(damping=.9,
preference=-200)
gmm=mixture.gmm(n_components=nofclusters, covariance_type='diag', random_state=None,
thresh=None, tol=0.001, min_covar=0.001, n_iter=100, n_init=1, params='wmc',
init_params='wmc', verbose=0)
#####################################
#####################################
clustering_algo=[orig_data, kmeans, kmedoids, ap, spectral, gmm, heir]
#####################################
#####################################
for name, algorithm in zip(clustering_names, clustering_algo):
t0 = time.time()
algorithm.fit(x)
t1 = time.time()
if hasattr(algorithm, 'labels_'):
y_pred = algorithm.labels_.astype(np.int)
else:
y_pred = algorithm.predict(x)
# plot
plt.subplot(len(datasets), len(clustering_algo), plot_num)
if i_dataset == 0:
plt.title(name, size=18)
if name=='Input':
plt.scatter(x[:, 0], x[:, 1], color='k', s=10, marker='o', facecolors='none')
else: plt.scatter(x[:, 0], x[:, 1], color=colors[y_pred], s=10, marker='o', facecolors='none')
plt.xlim(-2, 2)
plt.ylim(-2, 2)
plt.xticks(())
plt.yticks(())
if name!='Input':
plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
transform=plt.gca().transAxes, size=15,
horizontalalignment='right')
plot_num += 1
plt.show()
I was wondering if there is a way to set different numbers of clusters for different datasets. Specifically, I want the circle datas to have 2 clusters while the remaining sets have 3.
I tried inserting
if data_sets==noisy_circles or my_circles:
nofclusters=2
or
while data_sets==noisy_circles or my_circles:
nofclusters=2
before and after the for loop statement with no results.
Any help is appreciated.
Thank you.
Aucun commentaire:
Enregistrer un commentaire