  • Gap统计量(Gap Statistics)(!pip install --upgrade gap-stat[rust])

  • Calinski-Harabasz指数(Calinski-Harabasz Index)(!pip install yellowbrick)

  • Davies-Bouldin评分(Davies-Bouldin Score)(作为Scikit-Learn的一部分提供)

  • 轮廓评分(Silhouette Score)(!pip install yellowbrick)


# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)

# To scale the data using z-score
from sklearn.preprocessing import StandardScaler

# To compute distances
from scipy.spatial.distance import cdist, pdist

# To perform k-means clustering and compute silhouette scores
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# To visualize the elbow curve and silhouette scores
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

# To perform hierarchical clustering, compute cophenetic correlation,
# and create dendrograms
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet

sns.set(color_codes=True)
from sklearn.datasets import load_iris, load_wine, load_digits, make_blobs
from sklearn.preprocessing import StandardScaler

# Load the wine dataset and keep its feature matrix.
wine = load_wine()
X_wine = wine.data

# Standardize the features (z-score) so that the clustering step below,
# which reads X_wine_scaled, has its input defined.
X_wine_scaled = StandardScaler().fit_transform(X_wine)

# Notebook-style trailing expression: displays the raw feature matrix.
X_wine



Gap统计量(Gap Statistics)

import numpy as np
from gap_statistic import OptimalK
from sklearn.cluster import KMeans


def KMeans_clustering_func(X, k):
    """Cluster ``X`` into ``k`` groups with scikit-learn's KMeans.

    This is the user-defined clusterer handed to OptimalK; such functions
    must take the input features ``X`` and a cluster count ``k``.

    Args:
        X: Input features to cluster.
        k: Number of clusters to fit.

    Returns:
        A tuple ``(cluster_centers, labels)``: the fitted model's
        ``cluster_centers_`` and its predictions for ``X``.
    """
    # Any clustering algorithm that can return cluster centers would work here.
    m = KMeans(random_state=11, n_clusters=k)
    m.fit(X)
    return m.cluster_centers_, m.predict(X)


# Create a wrapper around OptimalK to extract cluster centers and cluster labels.
optimalK = OptimalK(clusterer=KMeans_clustering_func)

# Run OptimalK on the input data (X_wine_scaled) for candidate cluster
# counts 1 through 14.
n_clusters = optimalK(X_wine_scaled, cluster_array=np.arange(1, 15))
print('Optimal clusters: ', n_clusters)

# Gap statistics data frame (notebook-style trailing expression for display).
optimalK.gap_df[['n_clusters', 'gap_value']]

plt.figure(figsize=(10,6))n_clusters=3plt.plot(optimalK.gap_df.n_clusters.values, optimalK.gap_df.gap_value.values, linewidth=2)plt.scatter(optimalK.gap_df[optimalK.gap_df.n_clusters == n_clusters].n_clusters,            optimalK.gap_df[optimalK.gap_df.n_clusters == n_clusters].gap_value, s=250
