update

dealdata.py (new file, 160 lines)
@@ -0,0 +1,160 @@
import pickle
import os

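# Likely set to avoid scikit-learn's KMeans memory-leak warning on Windows/MKL;
# it must be set before the scientific stack spins up its thread pools.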
os.environ["OMP_NUM_THREADS"] = '1'
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import shutil

distance_lists = []
all_distance_lists = []
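
# distance_lists accumulates, for the criterion currently being evaluated, the
# |cosine| similarity of every sample to its assigned cluster centre;
# all_distance_lists then collects one such list per criterion for the 2x2 histogram.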


def showImg(newx):
    plt.scatter(newx[:, 0], newx[:, 1])
    plt.show()


def showHistogram(data):
    # 2x2 panels: similarity distribution under each model-selection criterion
    fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 8))
    bins = np.arange(0, 1.5, 0.1)
    titles = ['silhouette', 'calinski', 'davies', 'inertia']
    for ax, dist, title in zip(axs.flat, data, titles):
        ax.hist(dist, bins=bins, edgecolor='black')
        ax.set_title(title)
        ax.set_xlabel('Similarity')
        ax.set_ylabel('Frequency')
    # Render the figure
    plt.tight_layout()
    plt.savefig('multiple_histograms_in_ranges_non_pca.png')
    plt.show()
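
# Minimal usage sketch (hypothetical data; four lists in the order above):
#     showHistogram([np.random.rand(100) for _ in range(4)])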


def get_pca_data(data):
    pca = PCA(n_components=16)
    newx = pca.fit_transform(data)
    # print(pca.explained_variance_ratio_)
    return newx
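
# Sketch (hypothetical shapes): keeps the 16 leading components, so the input
# needs at least 16 samples and 16 features.
#     reduced = get_pca_data(np.random.rand(200, 512))   # -> shape (200, 16)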


def cosine_similarity(vec1, vec2):
    vec1 = np.array(vec1)
    vec2 = np.array(vec2)
    cos_sim = vec1.dot(vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
    return cos_sim
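
# Sanity checks (illustrative values only):
#     cosine_similarity([1, 0], [1, 0])  -> 1.0
#     cosine_similarity([1, 0], [0, 1])  -> 0.0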


def get_kclusters(data, k, dicts, standard='silhouette'):
    kmeans = KMeans(n_clusters=k, n_init='auto')
    y_pred = kmeans.fit_predict(data)
    if standard == 'silhouette':
        score = silhouette_score(data, y_pred, metric='euclidean')
    elif standard == 'calinski':
        score = calinski_harabasz_score(data, y_pred)
    elif standard == 'davies':
        score = davies_bouldin_score(data, y_pred)
    elif standard == 'inertia':
        score = kmeans.inertia_
    dicts[score] = [y_pred, kmeans.cluster_centers_]
    return dicts
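
# dicts maps score -> [cluster labels, cluster centres]; note that two values of
# k producing the exact same score would silently overwrite one another.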


def get_kmeans(data, standard='silhouette'):
    global distance_lists
    distance_list = []
    dicts = {}
    for k in range(2, 10):
        dicts = get_kclusters(data, k, dicts, standard)
    if standard in ('silhouette', 'calinski'):
        best_key = max(dicts.keys())  # higher is better
    else:  # 'davies' and 'inertia': lower is better
        best_key = min(dicts.keys())
    value = dicts[best_key]

    for num, cluster_id in enumerate(value[0]):
        distance_list.append(abs(cosine_similarity(value[1][cluster_id], data[num])))
    distance_lists += distance_list
    return distance_lists
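
# Each returned entry is |cos| between a sample and its assigned centre under the
# best k in 2..9; e.g. with hypothetical 2-D points:
#     sims = get_kmeans(np.random.rand(50, 2), standard='silhouette')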


def move_file(labels, y_pred):
    for label, y_label in zip(labels, y_pred):
        print('label >>> {}'.format(label))
        if not os.path.isdir(os.sep.join(['data', label.split('_')[0], str(y_label)])):
            os.mkdir(os.sep.join(['data', label.split('_')[0], str(y_label)]))
        try:
            shutil.move(os.sep.join(['data', label.split('_')[0], label]),
                        os.sep.join(['data', label.split('_')[0], str(y_label), label]))
        except Exception as e:
            print(e)
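
# Assumed layout: files named like '<class>_xxx' live in data/<class>/ and are
# sorted into data/<class>/<cluster id>/ according to their k-means label.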


def move_video(root_pth='data_test'):  # data preparation
    for Dirs in os.listdir(root_pth):
        for root, dirs, files in os.walk(os.sep.join([root_pth, Dirs])):
            if len(dirs) > 0:
                for dir in dirs:
                    filespth = os.sep.join([root_pth, Dirs, dir])
                    for file in os.listdir(filespth):
                        if file.endswith('.jpg'):
                            oldpth = os.sep.join([filespth, file])
                            newpth = os.sep.join([root_pth, Dirs, file])
                            shutil.move(oldpth, newpth)
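
# Effectively the inverse of move_file: lifts every .jpg out of the cluster
# subfolders back into its class directory under root_pth.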


def main():
    global distance_lists
    for standard in ['silhouette', 'calinski', 'davies', 'inertia']:
        with open('data_test/data.pkl', 'rb') as f:
            data = pickle.load(f)
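        # Inferred from the loops below: data maps each class key to a list of
        # dicts, each of which maps a file label to its feature vector(s).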
        for keys, values in data.items():
            labels = []
            temp = None
            for value in values:
                for key1, value1 in value.items():
                    # print('key1 >>>> {}'.format(key1))
                    labels.append(key1)
                    if temp is None:
                        temp = value1
                    else:
                        temp = np.vstack((temp, value1))
            try:
                # temp = get_pca_data(temp)
                get_kmeans(temp, standard)
                # move_file(labels, y_pred)
            except Exception as e:
                print(e)
        all_distance_lists.append(distance_lists)
        distance_lists = []
    showHistogram(all_distance_lists)


if __name__ == '__main__':
    main()
    # move_video()