import pickle
import os

# Must be set before sklearn is imported: silences the KMeans memory-leak
# warning on Windows with MKL and avoids OpenMP thread oversubscription.
os.environ["OMP_NUM_THREADS"] = '1'

from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import shutil
import math

# Cosine similarities of samples to their assigned cluster centers,
# accumulated per scoring standard.
distance_lists = []
all_distance_lists = []
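
# A minimal sketch of the pickle layout main() appears to expect: a dict
# mapping each class key to a list of {image_label: feature_row} dicts.
# Key names and feature width below are illustrative assumptions only.
#
#     example = {'classA': [{'classA_0.jpg': np.random.rand(32)},
#                           {'classA_1.jpg': np.random.rand(32)}]}
#     with open('data_test/data.pkl', 'wb') as f:
#         pickle.dump(example, f)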


def showImg(newx):
    """Scatter-plot the first two feature dimensions of a sample matrix."""
    plt.scatter(newx[:, 0], newx[:, 1])
    plt.show()


def showHistogram(data):
    """Draw one similarity histogram per scoring standard in a 2x2 grid."""
    fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 8))
    bins = np.arange(0, 1.5, 0.1)
    titles = ['silhouette', 'calinski', 'davies', 'inertia']
    for ax, values, title in zip(axs.flat, data, titles):
        ax.hist(values, bins=bins, edgecolor='black')
        ax.set_title(title)
        ax.set_xlabel('Similarity')
        ax.set_ylabel('Frequency')
        ax.legend(labels=['Frequency'])
    # Display the figure
    plt.tight_layout()
    plt.savefig('multiple_histograms_in_ranges_non_pca.png')
    plt.show()


def get_pca_data(data):
    """Project the feature matrix onto its first 16 principal components."""
    pca = PCA(n_components=16)
    newx = pca.fit_transform(data)
    # print(pca.explained_variance_ratio_)
    return newx
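

# Illustrative only: how much variance the 16 components retain can be
# checked before re-enabling the (currently commented-out) PCA step:
#
#     pca = PCA(n_components=16)
#     pca.fit(temp)                               # temp as built in main()
#     print(pca.explained_variance_ratio_.sum())  # e.g. keep PCA if > 0.9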


def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two 1-D vectors."""
    # Equivalent pure-Python version:
    # dot_product = sum(a * b for a, b in zip(vec1, vec2))
    # norm_vec1 = math.sqrt(sum(x ** 2 for x in vec1))
    # norm_vec2 = math.sqrt(sum(x ** 2 for x in vec2))
    # return dot_product / (norm_vec1 * norm_vec2)
    vec1 = np.array(vec1)
    vec2 = np.array(vec2)
    cos_sim = vec1.dot(vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
    return cos_sim
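

# Quick sanity checks (illustrative, not part of the pipeline):
#
#     cosine_similarity([1, 0], [0, 1])   # -> 0.0 (orthogonal)
#     cosine_similarity([1, 2], [2, 4])   # -> 1.0 (same direction)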


def get_kclusters(data, k, dicts, standard='silhouette'):
    """Run KMeans with k clusters and record score -> [labels, centers]."""
    kmeans = KMeans(n_clusters=k, n_init='auto')
    y_pred = kmeans.fit_predict(data)
    if standard == 'silhouette':
        score = silhouette_score(data, y_pred, metric='euclidean')
    elif standard == 'calinski':
        score = calinski_harabasz_score(data, y_pred)
    elif standard == 'davies':
        score = davies_bouldin_score(data, y_pred)
    elif standard == 'inertia':
        score = kmeans.inertia_
    else:
        raise ValueError('unknown standard: {}'.format(standard))
    dicts[score] = [y_pred, kmeans.cluster_centers_]
    return dicts
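

# Illustrative usage on synthetic data (shapes and values are assumptions):
#
#     X = np.random.rand(200, 16)
#     scores = {}
#     for k in range(2, 10):
#         scores = get_kclusters(X, k, scores, standard='silhouette')
#     # scores now maps each silhouette value to its [labels, centers] pair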


def get_kmeans(data, standard='silhouette'):
    """Pick the best k in [2, 10) under `standard`, then record each sample's
    cosine similarity to its assigned cluster center."""
    global distance_lists
    distance_list = []
    dicts = {}
    for k in range(2, 10):
        dicts = get_kclusters(data, k, dicts, standard)
    # silhouette/calinski are better when larger; davies/inertia when smaller.
    if standard in ('silhouette', 'calinski'):
        best_key = max(dicts.keys())
    else:  # 'davies' or 'inertia'
        best_key = min(dicts.keys())
    value = dicts[best_key]

    for num, i in enumerate(value[0]):
        distance_list.append(abs(cosine_similarity(value[1][i], data[num])))
    distance_lists += distance_list
    return distance_lists
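

# Illustrative call (synthetic features; extends the global distance_lists):
#
#     sims = get_kmeans(np.random.rand(100, 16), standard='davies')
#     # sims holds |cosine| similarities, so each value lies in [0, 1]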


def move_file(labels, y_pred):
    """Move each labeled image into a subdirectory named after its cluster."""
    for label, y_label in zip(labels, y_pred):
        print('label >>> {}'.format(label))
        target_dir = os.sep.join(['data', label.split('_')[0], str(y_label)])
        if not os.path.isdir(target_dir):
            os.mkdir(target_dir)
        try:
            shutil.move(os.sep.join(['data', label.split('_')[0], label]),
                        os.sep.join([target_dir, label]))
        except Exception as e:
            print(e)


def move_video(root_pth='data_test'):  # data preparation
    """Flatten cluster subdirectories: lift .jpg files back up to their class directory."""
    for Dirs in os.listdir(root_pth):
        for root, dirs, files in os.walk(os.sep.join([root_pth, Dirs])):
            if len(dirs) > 0:
                for dir in dirs:
                    filespth = os.sep.join([root_pth, Dirs, dir])
                    for file in os.listdir(filespth):
                        if file.endswith('.jpg'):
                            oldpth = os.sep.join([filespth, file])
                            newpth = os.sep.join([root_pth, Dirs, file])
                            shutil.move(oldpth, newpth)


def main():
    global distance_lists
    for standard in ['silhouette', 'calinski', 'davies', 'inertia']:
        with open('data_test/data.pkl', 'rb') as f:
            data = pickle.load(f)
        for keys, values in data.items():
            labels = []
            temp = None
            # Stack every feature row for this class into one matrix.
            for value in values:
                for key1, value1 in value.items():
                    # print('key1 >>>> {}'.format(key1))
                    labels.append(key1)
                    if temp is None:
                        temp = value1
                    else:
                        temp = np.vstack((temp, value1))
            try:
                # temp = get_pca_data(temp)
                get_kmeans(temp, standard)
                # move_file(labels, y_pred)
            except Exception as e:
                print(e)
        all_distance_lists.append(distance_lists)
        distance_lists = []
    showHistogram(all_distance_lists)


if __name__ == '__main__':
    main()
    # move_video()