import pickle
import os

os.environ["OMP_NUM_THREADS"] = '1'

import shutil
import math

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Per-criterion cosine-similarity values collected across all clustered groups.
distance_lists = []
all_distance_lists = []


def showImg(newx):
    # Scatter plot of the first two dimensions of the data.
    plt.scatter(newx[:, 0], newx[:, 1])
    plt.show()


def showHistogram(data):
    # One histogram per clustering criterion: silhouette, calinski, davies, inertia.
    fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 8))
    bins = np.arange(0, 1.5, 0.1)
    titles = ['silhouette', 'calinski', 'davies', 'inertia']
    for ax, scores, title in zip(axs.flat, data, titles):
        ax.hist(scores, bins=bins, edgecolor='black')
        ax.set_title(title)
        ax.set_xlabel('Similarity')
        ax.set_ylabel('Frequency')
        ax.legend(labels=['Frequency'])
    # Display the figure
    plt.tight_layout()
    plt.savefig('multiple_histograms_in_ranges_non_pca.png')
    plt.show()


def get_pca_data(data):
    # Reduce the feature matrix to 16 principal components.
    pca = PCA(n_components=16)
    newx = pca.fit_transform(data)
    # print(pca.explained_variance_ratio_)
    return newx


def cosine_similarity(vec1, vec2):
    # dot_product = sum(a * b for a, b in zip(vec1, vec2))
    # norm_vec1 = math.sqrt(sum(x ** 2 for x in vec1))
    # norm_vec2 = math.sqrt(sum(x ** 2 for x in vec2))
    # return dot_product / (norm_vec1 * norm_vec2)
    vec1 = np.array(vec1)
    vec2 = np.array(vec2)
    cos_sim = vec1.dot(vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
    return cos_sim


def get_kclusters(data, k, dicts, standard='silhouette'):
    # Run KMeans with k clusters and record {score: [labels, cluster_centers]} for the chosen criterion.
    Kmeans = KMeans(n_clusters=k, n_init='auto')
    y_pred = Kmeans.fit_predict(data)
    if standard == 'silhouette':
        s_score = silhouette_score(data, y_pred, metric='euclidean')
        dicts[s_score] = [y_pred, Kmeans.cluster_centers_]
    elif standard == 'calinski':
        c_score = calinski_harabasz_score(data, y_pred)
        dicts[c_score] = [y_pred, Kmeans.cluster_centers_]
    elif standard == 'davies':
        d_score = davies_bouldin_score(data, y_pred)
        dicts[d_score] = [y_pred, Kmeans.cluster_centers_]
    elif standard == 'inertia':
        i_score = Kmeans.inertia_
        dicts[i_score] = [y_pred, Kmeans.cluster_centers_]
    return dicts


def get_keams(data, standard='silhouette'):
    # Try k = 2..9, keep the clustering with the best score under the chosen criterion,
    # then record each sample's cosine similarity to its assigned cluster centre.
    global distance_lists
    distance_list = []
    dicts = {}
    for k in range(2, 10):
        dicts = get_kclusters(data, k, dicts, standard)
    if standard == 'silhouette' or standard == 'calinski':
        # Higher is better for silhouette and Calinski-Harabasz.
        max_key = max(dicts.keys())
        value = dicts[max_key]
    elif standard == 'davies' or standard == 'inertia':
        # Lower is better for Davies-Bouldin and inertia.
        max_key = min(dicts.keys())
        value = dicts[max_key]
    for num, i in enumerate(value[0]):
        distance_list.append(abs(cosine_similarity(value[1][i], data[num])))
    distance_lists += distance_list
    return distance_lists


def move_file(labels, y_pred):
    # Move each labelled image into a subdirectory named after its cluster label.
    for label, y_label in zip(labels, y_pred):
        print('label >>> {}'.format(label))
        if not os.path.isdir(os.sep.join(['data', label.split('_')[0], str(y_label)])):
            os.mkdir(os.sep.join(['data', label.split('_')[0], str(y_label)]))
        try:
            shutil.move(os.sep.join(['data', label.split('_')[0], label]),
                        os.sep.join(['data', label.split('_')[0], str(y_label), label]))
        except Exception as e:
            print(e)


def move_video(root_pth='data_test'):
    # Data preparation: move .jpg files from cluster subdirectories back up into their parent directory.
    for Dirs in os.listdir(root_pth):
        for root, dirs, files in os.walk(os.sep.join([root_pth, Dirs])):
            if len(dirs) > 0:
                for dir in dirs:
                    filespth = os.sep.join([root_pth, Dirs, dir])
                    for file in os.listdir(filespth):
                        if file.endswith('.jpg'):
                            oldpth = os.sep.join([filespth, file])
                            newpth = os.sep.join([root_pth, Dirs, file])
                            shutil.move(oldpth, newpth)


def main():
    global distance_lists
    for standard in ['silhouette', 'calinski', 'davies', 'inertia']:
        with open('data_test/data.pkl', 'rb') as f:
            data = pickle.load(f)
        for keys, values in data.items():
            # Stack every feature vector in this group into one matrix and keep the labels.
            all_data, labels = [], []
            temp = None
            for value in values:
                for key1, value1 in value.items():
                    # print('key1 >>>> {}'.format(key1))
                    if temp is None:
                        labels.append(key1)
                        temp = value1
                    else:
                        labels.append(key1)
                        temp = np.vstack((temp, value1))
            try:
                # temp = get_pca_data(temp)
                get_keams(temp, standard)
                # move_file(labels, y_pred)
            except Exception as e:
                print(e)
        all_distance_lists.append(distance_lists)
        distance_lists = []
    showHistogram(all_distance_lists)


if __name__ == '__main__':
    main()
    # move_video()
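
# Note: the layout of data_test/data.pkl is not documented in this script. From the way
# main() iterates it, it is assumed (hypothetical example, names and dimensions invented)
# to be a dict of groups, each group a list of {image_label: feature_vector} dicts whose
# vectors can be stacked with np.vstack, e.g.:
#
#   data = {
#       'group_a': [
#           {'videoA_frame001.jpg': np.random.rand(128)},
#           {'videoA_frame002.jpg': np.random.rand(128)},
#       ],
#   }
#   with open('data_test/data.pkl', 'wb') as f:
#       pickle.dump(data, f)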