Compare commits

...

2 Commits

Author  SHA1        Message                          Date
lee     c978787ff8  Multi-machine parallel computing  2025-08-18 10:14:05 +08:00
lee     99a204ee22  Multi-machine parallel computing  2025-08-14 10:09:54 +08:00
19 changed files with 134 additions and 66 deletions

.gitignore
View File

@@ -8,4 +8,5 @@ loss/
 checkpoints/
 search_library/
 quant_imgs/
+electronic_imgs/
 README.md

View File

@@ -3,6 +3,24 @@
 <component name="CopilotChatHistory">
   <option name="conversations">
     <list>
+      <Conversation>
+        <option name="createTime" value="1755228773977" />
+        <option name="id" value="0198abc99e597020bf8aa3ef78bc8bd3" />
+        <option name="title" value="New conversation 2025-08-15 11:32:53" />
+        <option name="updateTime" value="1755228773977" />
+      </Conversation>
+      <Conversation>
+        <option name="createTime" value="1755227620606" />
+        <option name="id" value="0198abb804fe7bf8ab3ac9ecfeae6d3f" />
+        <option name="title" value="New conversation 2025-08-15 11:13:40" />
+        <option name="updateTime" value="1755227620606" />
+      </Conversation>
+      <Conversation>
+        <option name="createTime" value="1755219481041" />
+        <option name="id" value="0198ab3bd1d17216b0dab33158ff294e" />
+        <option name="title" value="New conversation 2025-08-15 08:58:01" />
+        <option name="updateTime" value="1755219481041" />
+      </Conversation>
       <Conversation>
         <option name="createTime" value="1754286137102" />
         <option name="id" value="0198739a1f0e75c38b0579ade7b34050" />

View File

@@ -15,8 +15,8 @@ base:
 # Model configuration
 models:
-  backbone: 'resnet50'
-  channel_ratio: 1.0
+  backbone: 'resnet18'
+  channel_ratio: 0.75

 # Training parameters
 training:
@@ -31,9 +31,9 @@ training:
   weight_decay: 0.0005   # Weight decay
   scheduler: "step"      # LR scheduler; options: cosine/cosine_warm/step/None
   num_workers: 32        # Data-loading worker threads
-  checkpoints: "./checkpoints/resnet50_electornic_20250807/"  # Checkpoint save directory
+  checkpoints: "./checkpoints/resnet18_pdd_test/"             # Checkpoint save directory
   restore: false
-  restore_model: "./checkpoints/resnet18_20250717_scale=0.75_nosub/best.pth"  # Model restore path
+  restore_model: "./checkpoints/resnet50_electornic_20250807/best.pth"        # Model restore path
   cosine_t_0: 10         # Initial cycle length
   cosine_t_mult: 1       # Cycle length multiplier
   cosine_eta_min: 0.00001  # Minimum learning rate
@@ -68,5 +68,7 @@ logging:
 # Distributed training (optional)
 distributed:
-  enabled: false         # Enable distributed training
+  enabled: true          # Enable distributed training
   backend: "nccl"        # Distributed backend: nccl/gloo
+  node_rank: 0           # Index of this node
+  node_num: 2            # Total number of nodes; typically one node per machine
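With these two new keys, every machine runs the same entry point and only node_rank differs per machine; the effective world size becomes node_num times the local GPU count (see the train.py hunk at the end of this diff). A minimal sketch of deriving it, assuming the YAML layout above (the config filename here is hypothetical, since the diff does not show it):

import yaml
import torch

# Hypothetical filename; this diff does not name the file holding these keys.
with open('configs/train.yml') as f:
    conf = yaml.safe_load(f)

local_size = torch.cuda.device_count()  # GPUs on this machine
world_size = int(conf['distributed']['node_num']) * local_size
print(f"node {conf['distributed']['node_rank']}: {local_size} of {world_size} processes")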

View File

@@ -15,7 +15,8 @@ base:
 models:
   backbone: 'resnet18'
   channel_ratio: 0.75
-  model_path: "../checkpoints/resnet18_1009/best.pth"
+  model_path: "../checkpoints/resnet18_20250715_scale=0.75_sub/best.pth"
+  # model_path: "../checkpoints/resnet18_1009/best.pth"

 heatmap:
   feature_layer: "layer4"
@@ -26,10 +27,10 @@ data:
   train_batch_size: 128  # Training batch size
   val_batch_size: 8      # Validation batch size
   num_workers: 32        # Data-loading worker threads
-  data_dir: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix"
+  data_dir: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix_new"
   image_joint_pth: "/home/lc/data_center/image_analysis/error_compare_result"
-  total_pkl: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix/total.pkl"
-  result_txt: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix/result.txt"
+  total_pkl: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix_new/total.pkl"
+  result_txt: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix_new/result.txt"

 transform:
   img_size: 224          # Image size
@@ -45,9 +46,9 @@ logging:
   tensorboard: true        # Enable TensorBoard
   checkpoint_interval: 30  # Checkpoint save interval (epochs)
-event:
-  oneToOne_max_th: 0.9
-  oneToSn_min_th: 0.6
-  event_save_dir: "/home/lc/works/realtime_yolov10s/online_yolov10s_resnetv11_20250702/yolos_tracking"
-  stdlib_image_path: "/testDataAndLogs/module_test_record/comparison/标准图测试数据/pic/stlib_base"
-  pickle_path: "event.pickle"
+#event:
+#  oneToOne_max_th: 0.9
+#  oneToSn_min_th: 0.6
+#  event_save_dir: "/home/lc/works/realtime_yolov10s/online_yolov10s_resnetv11_20250702/yolos_tracking"
+#  stdlib_image_path: "/testDataAndLogs/module_test_record/comparison/标准图测试数据/pic/stlib_base"
+#  pickle_path: "event.pickle"

View File

@@ -15,7 +15,8 @@ base:
 models:
   backbone: 'resnet18'
   channel_ratio: 0.75
-  model_path: "../checkpoints/resnet18_1009/best.pth"
+  # model_path: "../checkpoints/resnet18_1009/best.pth"
+  model_path: "../checkpoints/resnet18_20250715_scale=0.75_sub/best.pth"

 heatmap:
   feature_layer: "layer4"

View File

@@ -13,8 +13,8 @@ base:
 # Model configuration
 models:
   backbone: 'resnet18'
-  channel_ratio: 0.75
-  model_path: "checkpoints/resnet18_1009/best.pth"
+  channel_ratio: 1.0
+  model_path: "checkpoints/resnet18_electornic_20250806/best.pth"
   #resnet18_20250715_scale=0.75_sub
   #resnet18_20250718_scale=0.75_nosub
   half: false            # Enable half-precision (fp16) testing
@@ -24,11 +24,11 @@ models:
 data:
   test_batch_size: 128   # Test batch size
   num_workers: 32        # Data-loading worker threads
-  test_dir: "../data_center/contrast_data/v1/extra"  # Validation dataset root directory
+  test_dir: "../data_center/electornic/v1/val"       # Validation dataset root directory
   test_group_json: "../data_center/contrast_learning/model_test_data/test/inner_group_pairs.json"
-  test_list: "../data_center/contrast_data/v1/extra_cross_same.txt"
+  test_list: "../data_center/electornic/v1/cross_same.txt"
   group_test: false
-  save_image_joint: true
+  save_image_joint: false
   image_joint_pth: "./joint_images"

 transform:

View File

@@ -10,15 +10,16 @@ base:
   embedding_size: 256    # Feature dimension
   pin_memory: true       # Enable pin_memory
   distributed: true      # Enable distributed training
+  dataset: "./dataset_electornic.txt"  # Dataset name

 # Model configuration
 models:
-  backbone: 'resnet18'
+  backbone: 'resnet101'
   channel_ratio: 1.0
-  model_path: "../checkpoints/resnet18_1009/best.pth"
-  onnx_model: "../checkpoints/resnet18_3399_sancheng/best.onnx"
-  rknn_model: "../checkpoints/resnet18_3399_sancheng/best_rknn2.3.2_RK3566.rknn"
+  model_path: "../checkpoints/resnet101_electornic_20250807/best.pth"
+  onnx_model: "../checkpoints/resnet101_electornic_20250807/best.onnx"
+  rknn_model: "../checkpoints/resnet101_electornic_20250807/resnet101_electornic_3588.rknn"
   rknn_batch_size: 1

 # Logging & monitoring
View File

@ -1,4 +1,4 @@
from model import (resnet18, resnet34, resnet50, mobilevit_s, MobileNetV3_Small, MobileNetV3_Large, mobilenet_v1,
from model import (resnet18, resnet34, resnet50, resnet101, mobilevit_s, MobileNetV3_Small, MobileNetV3_Large, mobilenet_v1,
PPLCNET_x1_0, PPLCNET_x0_5, PPLCNET_x2_5)
from timm.models import vit_base_patch16_224 as vit_base_16
from model.metric import ArcFace, CosFace
@ -13,9 +13,10 @@ class trainer_tools:
def get_backbone(self):
backbone_mapping = {
'resnet18': lambda: resnet18(scale=self.conf['models']['channel_ratio']),
'resnet34': lambda: resnet34(scale=self.conf['models']['channel_ratio']),
'resnet50': lambda: resnet50(scale=self.conf['models']['channel_ratio']),
'resnet18': lambda: resnet18(scale=self.conf['models']['channel_ratio'], pretrained=True),
'resnet34': lambda: resnet34(scale=self.conf['models']['channel_ratio'], pretrained=True),
'resnet50': lambda: resnet50(scale=self.conf['models']['channel_ratio'], pretrained=True),
'resnet101': lambda: resnet101(scale=self.conf['models']['channel_ratio'], pretrained=True),
'mobilevit_s': lambda: mobilevit_s(),
'mobilenetv3_small': lambda: MobileNetV3_Small(),
'PPLCNET_x1_0': lambda: PPLCNET_x1_0(),
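For context, get_backbone returns this name-to-constructor mapping, and callers index it by conf['models']['backbone'] (as SimilarAnalysis does later in this diff). A minimal usage sketch, with an example config trimmed to the keys the registry reads:

from configs import trainer_tools

# Example config values; real configs carry many more keys.
conf = {'models': {'backbone': 'resnet101', 'channel_ratio': 1.0}}

backbone_mapping = trainer_tools(conf).get_backbone()
if conf['models']['backbone'] in backbone_mapping:
    model = backbone_mapping[conf['models']['backbone']]()  # resnet101 at full width, pretrained=True
else:
    raise ValueError(f"Unsupported backbone: {conf['models']['backbone']}")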

View File

@@ -14,7 +14,7 @@ base:
 models:
   backbone: 'resnet18'
   channel_ratio: 0.75
-  checkpoints: "../checkpoints/resnet18_20250715_scale=0.75_sub/best.pth"
+  checkpoints: "../checkpoints/resnet18_20250718_scale=0.75_nosub/best.pth"

 # Data configuration
 data:
@@ -41,8 +41,8 @@ logging:
   checkpoint_interval: 30  # Checkpoint save interval (epochs)

 save:
-  json_bin: "../search_library/yunhedian_05-09.json"  # Save the whole json file
-  json_path: "/home/lc/data_center/baseStlib/feature_json/stlib_base_resnet18_sub"  # Path for individual json files
+  json_bin: "../search_library/resnet101_electronic.json"  # Save the whole json file
+  json_path: "/home/lc/data_center/baseStlib/feature_json/stlib_base_resnet18_sub_1k_合并"  # Path for individual json files
   error_barcodes: "error_barcodes.txt"
   barcodes_statistics: "../search_library/barcodes_statistics.txt"
   create_single_json: true  # Whether to save individual json files

View File

@@ -4,7 +4,7 @@ from .mobilevit import mobilevit_s
 from .metric import ArcFace, CosFace
 from .loss import FocalLoss
 from .resbam import resnet
-from .resnet_pre import resnet18, resnet34, resnet50, resnet14, CustomResNet18
+from .resnet_pre import resnet18, resnet34, resnet50, resnet101, resnet152, resnet14, CustomResNet18
 from .mobilenet_v2 import mobilenet_v2
 from .mobilenet_v3 import MobileNetV3_Small, MobileNetV3_Large
 # from .mobilenet_v1 import mobilenet_v1

View File

@@ -368,7 +368,7 @@ def resnet18(pretrained=True, progress=True, **kwargs):
                    **kwargs)

-def resnet34(pretrained=False, progress=True, **kwargs):
+def resnet34(pretrained=True, progress=True, **kwargs):
     r"""ResNet-34 model from
     `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
@@ -380,7 +380,7 @@ def resnet34(pretrained=False, progress=True, **kwargs):
                    **kwargs)

-def resnet50(pretrained=False, progress=True, **kwargs):
+def resnet50(pretrained=True, progress=True, **kwargs):
     r"""ResNet-50 model from
     `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
@@ -392,7 +392,7 @@ def resnet50(pretrained=False, progress=True, **kwargs):
                    **kwargs)

-def resnet101(pretrained=False, progress=True, **kwargs):
+def resnet101(pretrained=True, progress=True, **kwargs):
     r"""ResNet-101 model from
     `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_

View File

@@ -17,7 +17,7 @@ from configs import trainer_tools
 import yaml
 from datetime import datetime

-with open('./configs/test.yml', 'r') as f:
+with open('../configs/test.yml', 'r') as f:
     conf = yaml.load(f, Loader=yaml.FullLoader)

 # Constants from config

View File

@@ -0,0 +1,23 @@
+../electronic_imgs/0.jpg
+../electronic_imgs/1.jpg
+../electronic_imgs/2.jpg
+../electronic_imgs/3.jpg
+../electronic_imgs/4.jpg
+../electronic_imgs/5.jpg
+../electronic_imgs/6.jpg
+../electronic_imgs/7.jpg
+../electronic_imgs/8.jpg
+../electronic_imgs/9.jpg
+../electronic_imgs/10.jpg
+../electronic_imgs/11.jpg
+../electronic_imgs/12.jpg
+../electronic_imgs/13.jpg
+../electronic_imgs/14.jpg
+../electronic_imgs/15.jpg
+../electronic_imgs/16.jpg
+../electronic_imgs/17.jpg
+../electronic_imgs/18.jpg
+../electronic_imgs/19.jpg
+../electronic_imgs/20.jpg
+../electronic_imgs/21.jpg
+../electronic_imgs/22.jpg
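This new file is the list that the export config's base.dataset key points at: RKNN-Toolkit quantization consumes a plain text file with one calibration image path per line, and rknn.build in the conversion script below reads it when do_quantization is enabled.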

View File

@@ -203,11 +203,11 @@ class PairGenerator:

 if __name__ == "__main__":
-    original_path = '/home/lc/data_center/contrast_data/v1/extra'
+    original_path = '/home/lc/data_center/electornic/v1/val'
     parent_dir = str(Path(original_path).parent)

     generator = PairGenerator(original_path)
     # Example usage:
     pairs = generator.get_pairs(original_path,
-                                output_txt=os.sep.join([parent_dir, 'extra_cross_same.txt']))  # Individual pairs
+                                output_txt=os.sep.join([parent_dir, 'cross_same.txt']))  # Individual pairs
     # groups = generator.get_group_pairs('val')  # Group pairs

View File

@@ -96,7 +96,7 @@ if __name__ == '__main__':
     rknn.config(
         mean_values=[[127.5, 127.5, 127.5]],
         std_values=[[127.5, 127.5, 127.5]],
-        target_platform='rk3566',
+        target_platform='rk3588',
         model_pruning=False,
         compress_weight=False,
         single_core_mode=True,
@@ -122,8 +122,9 @@ if __name__ == '__main__':
     # Build model
     print('--> Building model')
-    ret = rknn.build(do_quantization=False,  # True
-                     dataset='./dataset.txt',
+    ret = rknn.build(do_quantization=True,
+                     # dataset='./dataset.txt',
+                     dataset=conf['base']['dataset'],
                      rknn_batch_size=conf['models']['rknn_batch_size'])
     # ret = rknn.build(do_quantization=False, dataset='./dataset.txt')
     if ret != 0:
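For reference, a minimal end-to-end sketch of the quantized conversion this hunk enables (assumes RKNN-Toolkit2's documented API; config keys follow the export YAML above, and the config filename here is hypothetical):

import yaml
from rknn.api import RKNN

with open('configs/convert.yml') as f:  # hypothetical filename
    conf = yaml.safe_load(f)

rknn = RKNN()
rknn.config(mean_values=[[127.5, 127.5, 127.5]],
            std_values=[[127.5, 127.5, 127.5]],
            target_platform='rk3588')
rknn.load_onnx(model=conf['models']['onnx_model'])
# int8 calibration iterates over the image paths listed in dataset_electornic.txt
ret = rknn.build(do_quantization=True,
                 dataset=conf['base']['dataset'],
                 rknn_batch_size=conf['models']['rknn_batch_size'])
if ret != 0:
    raise RuntimeError('RKNN build failed')
rknn.export_rknn(conf['models']['rknn_model'])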

View File

@@ -237,6 +237,6 @@ def get_histogram(data, label=None):

 if __name__ == '__main__':
-    # picTopic_matrix = picDirSimilarAnalysis()
-    # picTopic_matrix.get_group_similarity_matrix('/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix')
-    read_result_txt()
+    picTopic_matrix = picDirSimilarAnalysis()
+    picTopic_matrix.get_group_similarity_matrix('/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix_new')
+    # read_result_txt()

View File

@@ -22,7 +22,7 @@ class SimilarAnalysis:
         """Initialize the model and metric method"""
         tr_tools = trainer_tools(conf)
         backbone_mapping = tr_tools.get_backbone()
-
+        print('model_path {}'.format(conf['models']['model_path']))
         if conf['models']['backbone'] in backbone_mapping:
             model = backbone_mapping[conf['models']['backbone']]()
         else:

View File

@@ -4,7 +4,7 @@ import logging
 import numpy as np
 from typing import Dict, List, Optional, Tuple
 from tools.dataset import get_transform
-from model import resnet18
+from model import resnet18, resnet34, resnet50, resnet101
 import torch
 from PIL import Image
 import pandas as pd
@@ -50,7 +50,16 @@ class FeatureExtractor:
             raise FileNotFoundError(f"Model weights file not found: {model_path}")

         # Initialize model
         if conf['models']['backbone'] == 'resnet18':
             model = resnet18(scale=self.conf['models']['channel_ratio']).to(self.conf['base']['device'])
+        elif conf['models']['backbone'] == 'resnet34':
+            model = resnet34(scale=self.conf['models']['channel_ratio']).to(self.conf['base']['device'])
+        elif conf['models']['backbone'] == 'resnet50':
+            model = resnet50(scale=self.conf['models']['channel_ratio']).to(self.conf['base']['device'])
+        elif conf['models']['backbone'] == 'resnet101':
+            model = resnet101(scale=self.conf['models']['channel_ratio']).to(self.conf['base']['device'])
+        else:
+            print("不支持的模型: {}".format(conf['models']['backbone']))

         # Handle multi-GPU case
         if conf['base']['distributed']:
@@ -168,7 +177,7 @@ class FeatureExtractor:
         # Validate input directory
         if not os.path.isdir(folder):
             raise ValueError(f"Invalid directory: {folder}")
-
+        i = 0
         # Process each barcode directory
         for root, dirs, files in tqdm(os.walk(folder), desc="Scanning directories"):
             if not dirs:  # Leaf directory (contains images)
@@ -180,14 +189,16 @@ class FeatureExtractor:
                 ori_barcode = basename
                 barcode = basename
                 # Apply filter if provided
+                i += 1
+                print(ori_barcode, i)
                 if filter and ori_barcode not in filter:
                     continue
-                elif len(ori_barcode) > 13 or len(ori_barcode) < 8:
-                    logger.warning(f"Skipping invalid barcode {ori_barcode}")
-                    with open(conf['save']['error_barcodes'], 'a') as f:
-                        f.write(ori_barcode + '\n')
-                    f.close()
-                    continue
+                # elif len(ori_barcode) > 13 or len(ori_barcode) < 8:  # barcode length filter
+                #     logger.warning(f"Skipping invalid barcode {ori_barcode}")
+                #     with open(conf['save']['error_barcodes'], 'a') as f:
+                #         f.write(ori_barcode + '\n')
+                #     f.close()
+                #     continue

                 # Process image files
                 if files:
@@ -299,7 +310,8 @@ class FeatureExtractor:
             dicts['value'] = truncated_imgs_list
             if create_single_json:
                 # json_path = os.path.join("./search_library/v8021_overseas/", str(barcode_list[i]) + '.json')
-                json_path = os.path.join(self.conf['save']['json_path'], str(barcode_list[i]) + '.json')
+                json_path = os.path.join(self.conf['save']['json_path'],
+                                         str(barcode_list[i]) + '.json')
                 with open(json_path, 'w') as json_file:
                     json.dump(dicts, json_file)
             else:
@@ -317,8 +329,10 @@ class FeatureExtractor:
         with open(conf['save']['barcodes_statistics'], 'w', encoding='utf-8') as f:
             for barcode in os.listdir(pth):
-                if len(barcode) > 13 or len(barcode) < 8:
-                    continue
+                print("barcode length >> {}".format(len(barcode)))
+                # if len(barcode) > 13 or len(barcode) < 8:  # barcode length filter
+                #     continue
                 if filter is not None:
                     f.writelines(barcode + '\n')
                     if barcode in filter:
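The new if/elif chain duplicates the registry that trainer_tools.get_backbone already maintains. A minimal sketch of a table-driven alternative (a hypothetical refactor, not part of this commit):

from model import resnet18, resnet34, resnet50, resnet101

BACKBONES = {
    'resnet18': resnet18,
    'resnet34': resnet34,
    'resnet50': resnet50,
    'resnet101': resnet101,
}

def build_backbone(conf):
    """Resolve the configured backbone, failing loudly instead of printing."""
    name = conf['models']['backbone']
    if name not in BACKBONES:
        raise ValueError(f"Unsupported backbone: {name}")
    return BACKBONES[name](scale=conf['models']['channel_ratio']).to(conf['base']['device'])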

View File

@@ -266,11 +266,15 @@ def main():
     if distributed:
         # Distributed training: launch multiple processes with mp.spawn
-        world_size = torch.cuda.device_count()
+        local_size = torch.cuda.device_count()
+        world_size = int(conf['distributed']['node_num']) * local_size
         mp.spawn(
             run_training,
-            args=(world_size, conf),
-            nprocs=world_size,
+            args=(conf['distributed']['node_rank'],
+                  local_size,
+                  world_size,
+                  conf),
+            nprocs=local_size,
             join=True
         )
     else:
@@ -279,17 +283,18 @@ def main():
         run_training_loop(components)

-def run_training(rank, world_size, conf):
+def run_training(local_rank, node_rank, local_size, world_size, conf):
     """The function that actually runs training; invoked via mp.spawn"""
     # Initialize the distributed environment
+    rank = local_rank + node_rank * local_size
     os.environ['RANK'] = str(rank)
     os.environ['WORLD_SIZE'] = str(world_size)
     os.environ['MASTER_ADDR'] = 'localhost'
     os.environ['MASTER_PORT'] = '12355'
     dist.init_process_group(backend='nccl', rank=rank, world_size=world_size)
-    torch.cuda.set_device(rank)
-    device = torch.device('cuda', rank)
+    torch.cuda.set_device(local_rank)
+    device = torch.device('cuda', local_rank)

     # Get the dataset rather than a DataLoader
     train_dataset, class_num = load_data(training=True, cfg=conf, return_dataset=True)
@@ -301,8 +306,8 @@ def run_training(rank, world_size, conf):
     metric = metric.to(device)

     # Wrap as DistributedDataParallel models
-    model = DDP(model, device_ids=[rank], output_device=rank)
-    metric = DDP(metric, device_ids=[rank], output_device=rank)
+    model = DDP(model, device_ids=[local_rank], output_device=local_rank)
+    metric = DDP(metric, device_ids=[local_rank], output_device=local_rank)

     # Set up the loss function, optimizer, and scheduler
     criterion = setup_loss_function(conf)
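A standalone sketch of the rank arithmetic this commit introduces (example values, not from the repo). One caveat grounded in PyTorch DDP semantics: for a true multi-node rendezvous, every process must point MASTER_ADDR at the rank-0 node's address; the hard-coded 'localhost' in this hunk only rendezvouses processes on a single machine.

def global_ranks(node_num, gpus_per_node):
    """Yield (node_rank, local_rank, global_rank, world_size) per process."""
    world_size = node_num * gpus_per_node
    for node_rank in range(node_num):
        for local_rank in range(gpus_per_node):
            yield node_rank, local_rank, local_rank + node_rank * gpus_per_node, world_size

# node_num=2 with 4 GPUs per node: node 0 owns global ranks 0-3, node 1 owns
# ranks 4-7; each process still binds cuda:local_rank on its own machine.
for node_rank, local_rank, rank, world_size in global_ranks(2, 4):
    print(f"node {node_rank} gpu {local_rank} -> global rank {rank}/{world_size}")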