update

dealdata.py (new file, 160 lines)
@@ -0,0 +1,160 @@
import pickle
import os

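# Likely set to avoid scikit-learn's KMeans memory-leak warning on Windows/MKL;
# it must be set before the scientific stack spins up its thread pools.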
os.environ["OMP_NUM_THREADS"] = '1'
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import shutil

distance_lists = []
all_distance_lists = []
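
# distance_lists accumulates, for the criterion currently being evaluated, the
# |cosine| similarity of every sample to its assigned cluster centre;
# all_distance_lists then collects one such list per criterion for the 2x2 histogram.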


def showImg(newx):
    plt.scatter(newx[:, 0], newx[:, 1])
    plt.show()


def showHistogram(data):
    # 2x2 panels: similarity distribution under each model-selection criterion
    fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 8))
    bins = np.arange(0, 1.5, 0.1)
    titles = ['silhouette', 'calinski', 'davies', 'inertia']
    for ax, dist, title in zip(axs.flat, data, titles):
        ax.hist(dist, bins=bins, edgecolor='black')
        ax.set_title(title)
        ax.set_xlabel('Similarity')
        ax.set_ylabel('Frequency')
    # Render the figure
    plt.tight_layout()
    plt.savefig('multiple_histograms_in_ranges_non_pca.png')
    plt.show()
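
# Minimal usage sketch (hypothetical data; four lists in the order above):
#     showHistogram([np.random.rand(100) for _ in range(4)])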


def get_pca_data(data):
    pca = PCA(n_components=16)
    newx = pca.fit_transform(data)
    # print(pca.explained_variance_ratio_)
    return newx
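
# Sketch (hypothetical shapes): keeps the 16 leading components, so the input
# needs at least 16 samples and 16 features.
#     reduced = get_pca_data(np.random.rand(200, 512))   # -> shape (200, 16)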


def cosine_similarity(vec1, vec2):
    vec1 = np.array(vec1)
    vec2 = np.array(vec2)
    cos_sim = vec1.dot(vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
    return cos_sim
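
# Sanity checks (illustrative values only):
#     cosine_similarity([1, 0], [1, 0])  -> 1.0
#     cosine_similarity([1, 0], [0, 1])  -> 0.0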


def get_kclusters(data, k, dicts, standard='silhouette'):
    kmeans = KMeans(n_clusters=k, n_init='auto')
    y_pred = kmeans.fit_predict(data)
    if standard == 'silhouette':
        score = silhouette_score(data, y_pred, metric='euclidean')
    elif standard == 'calinski':
        score = calinski_harabasz_score(data, y_pred)
    elif standard == 'davies':
        score = davies_bouldin_score(data, y_pred)
    elif standard == 'inertia':
        score = kmeans.inertia_
    dicts[score] = [y_pred, kmeans.cluster_centers_]
    return dicts
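
# dicts maps score -> [cluster labels, cluster centres]; note that two values of
# k producing the exact same score would silently overwrite one another.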


def get_kmeans(data, standard='silhouette'):
    global distance_lists
    distance_list = []
    dicts = {}
    for k in range(2, 10):
        dicts = get_kclusters(data, k, dicts, standard)
    if standard in ('silhouette', 'calinski'):
        best_key = max(dicts.keys())  # higher is better
    else:  # 'davies' and 'inertia': lower is better
        best_key = min(dicts.keys())
    value = dicts[best_key]

    for num, cluster_id in enumerate(value[0]):
        distance_list.append(abs(cosine_similarity(value[1][cluster_id], data[num])))
    distance_lists += distance_list
    return distance_lists
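
# Each returned entry is |cos| between a sample and its assigned centre under the
# best k in 2..9; e.g. with hypothetical 2-D points:
#     sims = get_kmeans(np.random.rand(50, 2), standard='silhouette')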


def move_file(labels, y_pred):
    for label, y_label in zip(labels, y_pred):
        print('label >>> {}'.format(label))
        if not os.path.isdir(os.sep.join(['data', label.split('_')[0], str(y_label)])):
            os.mkdir(os.sep.join(['data', label.split('_')[0], str(y_label)]))
        try:
            shutil.move(os.sep.join(['data', label.split('_')[0], label]),
                        os.sep.join(['data', label.split('_')[0], str(y_label), label]))
        except Exception as e:
            print(e)
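
# Assumed layout: files named like '<class>_xxx' live in data/<class>/ and are
# sorted into data/<class>/<cluster id>/ according to their k-means label.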


def move_video(root_pth='data_test'):  # data preparation
    for Dirs in os.listdir(root_pth):
        for root, dirs, files in os.walk(os.sep.join([root_pth, Dirs])):
            if len(dirs) > 0:
                for dir in dirs:
                    filespth = os.sep.join([root_pth, Dirs, dir])
                    for file in os.listdir(filespth):
                        if file.endswith('.jpg'):
                            oldpth = os.sep.join([filespth, file])
                            newpth = os.sep.join([root_pth, Dirs, file])
                            shutil.move(oldpth, newpth)
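
# Effectively the inverse of move_file: lifts every .jpg out of the cluster
# subfolders back into its class directory under root_pth.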


def main():
    global distance_lists
    for standard in ['silhouette', 'calinski', 'davies', 'inertia']:
        with open('data_test/data.pkl', 'rb') as f:
            data = pickle.load(f)
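        # Inferred from the loops below: data maps each class key to a list of
        # dicts, each of which maps a file label to its feature vector(s).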
        for keys, values in data.items():
            labels = []
            temp = None
            for value in values:
                for key1, value1 in value.items():
                    # print('key1 >>>> {}'.format(key1))
                    labels.append(key1)
                    if temp is None:
                        temp = value1
                    else:
                        temp = np.vstack((temp, value1))
            try:
                # temp = get_pca_data(temp)
                get_kmeans(temp, standard)
                # move_file(labels, y_pred)
            except Exception as e:
                print(e)
        all_distance_lists.append(distance_lists)
        distance_lists = []
    showHistogram(all_distance_lists)


if __name__ == '__main__':
    main()
    # move_video()