lundi 28 mars 2016

Different no. of clusters for different data sets

I am trying to plot the results of multiple clustering algorithms on multiple data sets.

The code is as follows:

import numpy as np
import matplotlib.pyplot as plt
import matplotlib

from time import time

from sklearn.datasets import load_digits
from sklearn.decomposition import PCA 
from sklearn.preprocessing import scale


from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global RNG so the randomly generated data sets are repeatable.
np.random.seed(0)

# Colour palette: six RGBA base colours (green, red, blue, yellow, magenta,
# cyan) stacked 20 times, giving 120 rows that cluster labels can index into.
cs_arry = np.tile(
    np.array([[0, 1, 0, 1], [1, 0, 0, 1], [0, 0, 1, 1],
              [1, 1, 0, 1], [1, 0, 1, 1], [0, 1, 1, 1]]),
    (20, 1),
)
colors = cs_arry.copy()

# Number of points drawn for every data set.
n_pts = 1500

# --- data sets -------------------------------------------------------------
# The three calls below that draw from the global RNG are kept in their
# original order so the generated samples do not change.
noisy_circles = datasets.make_circles(n_samples=n_pts, factor=.5, noise=.05)
no_structure = np.random.rand(n_pts, 2), None
# NOTE(review): make_circ is neither defined nor imported in the code shown
# here; presumably a user-written variant of make_circles -- confirm it is
# available before this line runs.
my_circles = make_circ(n_samples=n_pts, factor=.35, noise=.025, in_fact=2)

# Blob centres; adjust these to move the generated blobs.
blob_centers = [(-10, -10), (-3, 10), (10, 10)]
blobs = datasets.make_blobs(
    n_samples=n_pts,
    n_features=3,
    centers=blob_centers,
    cluster_std=3,
    center_box=(-20, 20),
    shuffle=True,
    random_state=0,
)

# Column titles of the plot grid, one per entry of the algorithm list.
clustering_names = ['Input', 'K-means', 'K-medoids', 'AP', 'Spectral',
                    'Gaussian', 'Hierarchical']

# Number of clusters requested from the algorithms that take one.
nofclusters = 3

# --- figure layout ---------------------------------------------------------
# Figure width grows with the number of columns (algorithms).
plt.figure(figsize=(2 * len(clustering_names) + 3, 9.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96,
                    wspace=0, hspace=0)

# One row of subplots per data set; plot_num is the running subplot index.
data_sets = [noisy_circles, my_circles, blobs, no_structure]
plot_num = 1

# For every data set (one subplot row), fit each clustering algorithm (one
# subplot column), time the fit, and scatter-plot the points coloured by the
# predicted cluster label.
for i_dataset, dataset in enumerate(data_sets):
    # Each data set is a (samples, labels) pair; the labels are not used.
    x, _ = dataset
    # Standardise the features so the fixed axis limits below suit every set.
    x = StandardScaler().fit_transform(x)

    # Symmetrised 10-nearest-neighbour graph, used as the connectivity
    # constraint for the hierarchical clustering.
    connectivity = kneighbors_graph(x, n_neighbors=10, include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)

    # --- clustering algorithms ---
    # Placeholder estimator for the 'Input' column; its labels are never
    # plotted because that column is drawn in plain black.
    orig_data = cluster.KMeans(n_clusters=1)
    kmeans = cluster.KMeans(n_clusters=nofclusters, init='k-means++')
    # NOTE: shown under the 'K-medoids' title, but this is KMeans with random
    # initialisation, not a k-medoids implementation.
    kmedoids = cluster.KMeans(n_clusters=nofclusters, init='random')
    heir = cluster.AgglomerativeClustering(n_clusters=nofclusters,
                                           linkage='ward',
                                           connectivity=connectivity)
    # NOTE(review): n_clusters is hard-coded to 2 here instead of using
    # nofclusters -- confirm this is intended.
    spectral = cluster.SpectralClustering(n_clusters=2,
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    affinity_propagation = cluster.AffinityPropagation(damping=.9,
                                                       preference=-200)
    # `mixture.gmm` is not a callable estimator class; GaussianMixture is the
    # estimator, with reg_covar / max_iter taking the roles of the former
    # min_covar / n_iter arguments.
    gmm = mixture.GaussianMixture(n_components=nofclusters,
                                  covariance_type='diag',
                                  tol=0.001, reg_covar=0.001, max_iter=100,
                                  n_init=1, init_params='kmeans',
                                  random_state=None, verbose=0)

    # Order must match clustering_names, since the two lists are zipped below.
    clustering_algo = [orig_data, kmeans, kmedoids, affinity_propagation,
                       spectral, gmm, heir]

    for name, algorithm in zip(clustering_names, clustering_algo):
        # `time` is the function itself (from time import time).
        t0 = time()
        algorithm.fit(x)
        t1 = time()

        # Estimators without a labels_ attribute are asked to predict instead.
        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(x)

        # Grid of len(data_sets) rows by len(clustering_algo) columns.
        plt.subplot(len(data_sets), len(clustering_algo), plot_num)

        # Column titles only on the first row.
        if i_dataset == 0:
            plt.title(name, size=18)

        # The raw input is drawn in black; every other column is coloured by
        # the predicted cluster label.
        point_color = 'k' if name == 'Input' else colors[y_pred]
        plt.scatter(x[:, 0], x[:, 1], color=point_color, s=10, marker='o',
                    facecolors='none')

        plt.xlim(-2, 2)
        plt.ylim(-2, 2)
        plt.xticks(())
        plt.yticks(())

        # Show the fit time in the lower-right corner of each algorithm plot.
        if name != 'Input':
            plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                     transform=plt.gca().transAxes, size=15,
                     horizontalalignment='right')
        plot_num += 1

plt.show()

I was wondering if there is a way to set a different number of clusters for different data sets. Specifically, I want the circle data sets to have 2 clusters while the remaining sets have 3.

I tried inserting

if data_sets==noisy_circles or my_circles: 
     nofclusters=2

or

while data_sets==noisy_circles or my_circles: 
     nofclusters=2

before and after the for loop statement with no results.

Any help is appreciated.

Thank you.

Aucun commentaire:

Enregistrer un commentaire