import os
import numpy as np
from usearch.index import Index
import json
import struct
from scipy.spatial.distance import cdist


def create_index():
    """Create an empty USearch index for 256-d features (cosine metric, f16 storage)."""
    index = Index(
        ndim=256,
        metric='cos',
        # dtype='f32',
        dtype='f16',
        connectivity=32,
        expansion_add=40,  # 128,
        expansion_search=10,  # 64,
        multi=True  # allow multiple vectors per key (several images per barcode)
    )
    return index


def compare_feature(features1, features2, model='1'):
    """Compare a set of query features against one gallery entry.

    :param features1: list of query feature vectors (e.g. the images of one track)
    :param features2: gallery features; features2[0] is the list of registered
        vectors for a single barcode
    :param model: comparison strategy
        '0': simulate matching all (or selected) images of one track against the
             base gallery: take each image's maximum similarity against the
             gallery, then average those per-image maxima
        '1': mean of all pairwise similarities between query and gallery
        '2': maximum over all 1:1 similarities
    :return: similarity score (-1 in model '1' when nothing could be compared)
    """
    similarity_group, similarity_groups = [], []
    if model == '0':
        for feature1 in features1:
            for feature2 in features2[0]:
                similarity = np.dot(feature1, feature2) / (np.linalg.norm(feature1) * np.linalg.norm(feature2))
                similarity_group.append(similarity)
            similarity_groups.append(max(similarity_group))
            similarity_group = []
        return sum(similarity_groups) / len(similarity_groups)
    elif model == '1':
        feature2 = features2[0]
        for feature1 in features1:
            for num in range(len(feature2)):
                similarity = np.dot(feature1, feature2[num]) / (
                        np.linalg.norm(feature1) * np.linalg.norm(feature2[num]))
                similarity_group.append(similarity)
            similarity_groups.append(sum(similarity_group) / len(similarity_group))
            similarity_group = []
        # return sum(similarity_groups)/len(similarity_groups), max(similarity_groups)
        if len(similarity_groups) == 0:
            return -1
        return sum(similarity_groups) / len(similarity_groups)
    elif model == '2':
        feature2 = features2[0]
        for feature1 in features1:
            for num in range(len(feature2)):
                similarity = np.dot(feature1, feature2[num]) / (
                        np.linalg.norm(feature1) * np.linalg.norm(feature2[num]))
                similarity_group.append(similarity)
        return max(similarity_group)


def get_barcode_feature(data):
    """Expand one JSON record {'key': barcode, 'value': [features]} into parallel lists."""
    barcode = data['key']
    features = data['value']
    return [barcode] * len(features), features


def analysis_file(file_path):
    """Parse a JSON feature file and collect its keys and feature lists.

    :param file_path: path to a JSON file with a top-level 'total' list of records
    :return: (barcodes, features), one entry per record
    """
    barcodes, features = [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
        for dic in data['total']:
            barcode, feature = get_barcode_feature(dic)
            barcodes.append(barcode)
            features.append(feature)
    return barcodes, features


def create_base_index(index_file_pth=None, barcodes=None, features=None, save_index_name=None):
    """Build a base index either from a JSON feature file or from in-memory lists, then save it."""
    index = create_index()
    if index_file_pth is not None:
        # save_index_name = index_file_pth.split('json')[0] + 'usearch'
        save_index_name = index_file_pth.split('json')[0] + 'data'
        barcodes, features = analysis_file(index_file_pth)
    else:
        assert barcodes is not None and features is not None, 'barcodes and features must not be None'
    for barcode, feature in zip(barcodes, features):
        try:
            index.add(np.array(barcode), np.array(feature))
        except Exception as e:
            print(e)
            continue
    index.save(save_index_name)


def get_feature_index(index_file_pth=None, barcodes=None):
    """Restore an index as a memory-mapped view and fetch the vectors stored for the given keys."""
    assert index_file_pth is not None, 'index_file_pth must not be None'
    index = Index.restore(index_file_pth, view=True)
    feature_lists = index.get(np.array(barcodes))
    print("memory {} size {}".format(index.memory_usage, index.size))
    print("feature_lists {}".format(feature_lists))
    return feature_lists
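
# Minimal usage sketch (not part of the original pipeline): scores a set of
# random query vectors against a gallery with compare_feature, then builds a
# small on-disk index with create_base_index and reads the stored vectors back
# with get_feature_index. The key 1001, the random vectors and the file name
# 'demo.usearch' are hypothetical placeholders chosen for illustration only.
def _demo_compare_and_index():
    rng = np.random.default_rng(0)
    probe = rng.random((3, 256), dtype=np.float32)    # e.g. three crops of one track
    gallery = rng.random((5, 256), dtype=np.float32)  # registered vectors of one barcode
    score = compare_feature(list(probe), [list(gallery)], model='1')
    print("mean similarity {}".format(score))

    # Register two vectors under the same (hypothetical) integer key 1001.
    create_base_index(barcodes=[[1001, 1001]],
                      features=[gallery[:2]],
                      save_index_name='demo.usearch')
    return get_feature_index(index_file_pth='demo.usearch', barcodes=[1001])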

def search_in_index(query=None,
                    barcode=None,  # barcode -> int or np.ndarray
                    index_name=None,
                    temp_index=False,  # whether this is a temporary gallery
                    model='0',
                    ):
    """Run a 1:N search for `query`, or a 1:1 comparison when `barcode` is given."""
    if temp_index:
        assert index_name is not None, 'index_name must not be None'
        index = Index.restore(index_name, view=True)
        if barcode is not None:
            # 1:1 comparison test against the vectors stored for this barcode
            feature_lists = index.get(np.array(barcode))
            results = compare_feature(query, feature_lists)
        else:
            results = index.search(query, count=5)
        return results
    else:
        # standard (base) gallery
        assert index_name is not None, 'index_name must not be None'
        index = Index.restore(index_name, view=True)
        if barcode is not None:
            # 1:1 comparison test
            feature_lists = index.get(np.array(barcode))
            results = compare_feature(query, feature_lists, model)
        else:
            results = index.search(query, count=10)
        return results


def delete_index(index_name=None, key=None, index=None):
    """Remove `key` from an already-open index, or from an index restored from disk."""
    assert key is not None, 'key must not be None'
    if index is None:
        assert index_name is not None, 'index_name must not be None'
        # Restore a mutable copy: a memory-mapped view is read-only and cannot be modified.
        index = Index.restore(index_name)
        index.remove(key)
        # Note: the removal only affects this in-memory copy unless the index is saved again.
    else:
        index.remove(key)


def compute_similarity_matrix(featurelists1, featurelists2):
    """Compute the cosine-similarity matrix between two sets of image features."""
    # cosine similarity between every pair of vectors
    cosine_similarities = 1 - cdist(featurelists1, featurelists2, metric='cosine')
    cosine_similarities = np.around(cosine_similarities, decimals=3)
    return cosine_similarities


def check_usearch_json_diff(index_file_pth, json_file_pth):
    """Compare the vectors stored in the index against the raw vectors in the source JSON for one barcode."""
    json_features = None
    feature_lists = get_feature_index(index_file_pth, ['6923644272159'])
    with open(json_file_pth, 'r') as json_file:
        json_data = json.load(json_file)
        for data in json_data['total']:
            if data['key'] == '6923644272159':
                json_features = data['value']
    json_features = np.array(json_features)
    feature_lists = np.array(feature_lists[0])
    return compute_similarity_matrix(json_features, feature_lists)


def write_binary_file(filename, datas):
    """Serialize the key/feature records to a binary file for the C++ reader."""
    with open(filename, 'wb') as f:
        # First write the number of keys (makes reading on the C++ side easier).
        key_count = len(datas)
        f.write(struct.pack('I', key_count))  # 'I' is a 4-byte unsigned int
        for data in datas:
            key = data['key']
            feats = data['value']
            key_bytes = key.encode('utf-8')
            key_len = len(key)
            length_byte = struct.pack('