多机并行计算

This commit is contained in:
lee
2025-08-18 10:14:05 +08:00
parent 99a204ee22
commit c978787ff8
8 changed files with 43 additions and 25 deletions

View File

@ -3,6 +3,24 @@
<component name="CopilotChatHistory">
<option name="conversations">
<list>
<Conversation>
<option name="createTime" value="1755228773977" />
<option name="id" value="0198abc99e597020bf8aa3ef78bc8bd3" />
<option name="title" value="新对话 2025年8月15日 11:32:53" />
<option name="updateTime" value="1755228773977" />
</Conversation>
<Conversation>
<option name="createTime" value="1755227620606" />
<option name="id" value="0198abb804fe7bf8ab3ac9ecfeae6d3f" />
<option name="title" value="新对话 2025年8月15日 11:13:40" />
<option name="updateTime" value="1755227620606" />
</Conversation>
<Conversation>
<option name="createTime" value="1755219481041" />
<option name="id" value="0198ab3bd1d17216b0dab33158ff294e" />
<option name="title" value="新对话 2025年8月15日 08:58:01" />
<option name="updateTime" value="1755219481041" />
</Conversation>
<Conversation>
<option name="createTime" value="1754286137102" />
<option name="id" value="0198739a1f0e75c38b0579ade7b34050" />

View File

@ -68,7 +68,7 @@ logging:
# 分布式训练(可选)
distributed:
enabled: false # 是否启用分布式训练
enabled: true # 是否启用分布式训练
backend: "nccl" # 分布式后端(nccl/gloo)
node_rank: 0 # 节点编号
node_num: 1 # 节点总数,一般几台机器就有几个节点
node_num: 2 # 节点总数,一般几台机器就有几个节点

View File

@ -27,10 +27,10 @@ data:
train_batch_size: 128 # 训练批次大小
val_batch_size: 8 # 验证批次大小
num_workers: 32 # 数据加载线程数
data_dir: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix"
data_dir: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix_new"
image_joint_pth: "/home/lc/data_center/image_analysis/error_compare_result"
total_pkl: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix/total.pkl"
result_txt: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix/result.txt"
total_pkl: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix_new/total.pkl"
result_txt: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix_new/result.txt"
transform:
img_size: 224 # 图像尺寸
@ -46,9 +46,9 @@ logging:
tensorboard: true # 是否启用TensorBoard
checkpoint_interval: 30 # 检查点保存间隔(单位:epoch)
event:
oneToOne_max_th: 0.9
oneToSn_min_th: 0.6
event_save_dir: "/home/lc/works/realtime_yolov10s/online_yolov10s_resnetv11_20250702/yolos_tracking"
stdlib_image_path: "/testDataAndLogs/module_test_record/comparison/标准图测试数据/pic/stlib_base"
pickle_path: "event.pickle"
#event:
# oneToOne_max_th: 0.9
# oneToSn_min_th: 0.6
# event_save_dir: "/home/lc/works/realtime_yolov10s/online_yolov10s_resnetv11_20250702/yolos_tracking"
# stdlib_image_path: "/testDataAndLogs/module_test_record/comparison/标准图测试数据/pic/stlib_base"
# pickle_path: "event.pickle"

View File

@ -19,7 +19,7 @@ models:
channel_ratio: 1.0
model_path: "../checkpoints/resnet101_electornic_20250807/best.pth"
onnx_model: "../checkpoints/resnet101_electornic_20250807/best.onnx"
rknn_model: "../checkpoints/resnet101_electornic_20250807/resnet101_electornic.rknn"
rknn_model: "../checkpoints/resnet101_electornic_20250807/resnet101_electornic_3588.rknn"
rknn_batch_size: 1
# 日志与监控

View File

@ -12,9 +12,9 @@ base:
# 模型配置
models:
backbone: 'resnet101'
channel_ratio: 1.0
checkpoints: "../checkpoints/resnet101_electornic_20250807/best.pth"
backbone: 'resnet18'
channel_ratio: 0.75
checkpoints: "../checkpoints/resnet18_20250718_scale=0.75_nosub/best.pth"
# 数据配置
data:
@ -22,7 +22,7 @@ data:
test_batch_size: 128 # 测试批次大小
num_workers: 32 # 数据加载线程数
half: true # 是否启用半精度数据
img_dirs_path: "/shareData/completed_data/scatter_data/electronic_scale/base/total" # base标准库图片存储路径
img_dirs_path: "/home/lc/data_center/baseStlib/pic/stlib_base" # base标准库图片存储路径
# img_dirs_path: "/home/lc/contrast_nettest/data/feature_json"
xlsx_pth: false # 过滤商品, 默认None不进行过滤
@ -42,7 +42,7 @@ logging:
save:
json_bin: "../search_library/resnet101_electronic.json" # 保存整个json文件
json_path: "/home/lc/data_center/baseStlib/feature_json/stlib_base_resnet18_sub" # 保存单个json文件路径
json_path: "/home/lc/data_center/baseStlib/feature_json/stlib_base_resnet18_sub_1k_合并" # 保存单个json文件路径
error_barcodes: "error_barcodes.txt"
barcodes_statistics: "../search_library/barcodes_statistics.txt"
create_single_json: false # 是否保存单个json文件
create_single_json: true # 是否保存单个json文件

View File

@ -96,7 +96,7 @@ if __name__ == '__main__':
rknn.config(
mean_values=[[127.5, 127.5, 127.5]],
std_values=[[127.5, 127.5, 127.5]],
target_platform='rk3566',
target_platform='rk3588',
model_pruning=False,
compress_weight=False,
single_core_mode=True,

View File

@ -238,5 +238,5 @@ def get_histogram(data, label=None):
if __name__ == '__main__':
picTopic_matrix = picDirSimilarAnalysis()
picTopic_matrix.get_group_similarity_matrix('/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix')
picTopic_matrix.get_group_similarity_matrix('/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix_new')
# read_result_txt()

View File

@ -288,13 +288,13 @@ def run_training(local_rank, node_rank, local_size, world_size, conf):
# 初始化分布式环境
rank = local_rank + node_rank * local_size
os.environ['RANK'] = str(rank)
os.environ['WORLD_SIZE'] = str(local_size)
os.environ['WORLD_SIZE'] = str(world_size)
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '12355'
dist.init_process_group(backend='nccl', rank=rank, world_size=world_size)
torch.cuda.set_device(rank)
device = torch.device('cuda', rank)
torch.cuda.set_device(local_rank)
device = torch.device('cuda', local_rank)
# 获取数据集而不是DataLoader
train_dataset, class_num = load_data(training=True, cfg=conf, return_dataset=True)
@ -306,8 +306,8 @@ def run_training(local_rank, node_rank, local_size, world_size, conf):
metric = metric.to(device)
# 包装为DistributedDataParallel模型
model = DDP(model, device_ids=[rank], output_device=rank)
metric = DDP(metric, device_ids=[rank], output_device=rank)
model = DDP(model, device_ids=[local_rank], output_device=local_rank)
metric = DDP(metric, device_ids=[local_rank], output_device=local_rank)
# 设置损失函数、优化器和调度器
criterion = setup_loss_function(conf)