多机并行计算
This commit is contained in:
18
.idea/CopilotChatHistory.xml
generated
18
.idea/CopilotChatHistory.xml
generated
@ -3,6 +3,24 @@
|
||||
<component name="CopilotChatHistory">
|
||||
<option name="conversations">
|
||||
<list>
|
||||
<Conversation>
|
||||
<option name="createTime" value="1755228773977" />
|
||||
<option name="id" value="0198abc99e597020bf8aa3ef78bc8bd3" />
|
||||
<option name="title" value="新对话 2025年8月15日 11:32:53" />
|
||||
<option name="updateTime" value="1755228773977" />
|
||||
</Conversation>
|
||||
<Conversation>
|
||||
<option name="createTime" value="1755227620606" />
|
||||
<option name="id" value="0198abb804fe7bf8ab3ac9ecfeae6d3f" />
|
||||
<option name="title" value="新对话 2025年8月15日 11:13:40" />
|
||||
<option name="updateTime" value="1755227620606" />
|
||||
</Conversation>
|
||||
<Conversation>
|
||||
<option name="createTime" value="1755219481041" />
|
||||
<option name="id" value="0198ab3bd1d17216b0dab33158ff294e" />
|
||||
<option name="title" value="新对话 2025年8月15日 08:58:01" />
|
||||
<option name="updateTime" value="1755219481041" />
|
||||
</Conversation>
|
||||
<Conversation>
|
||||
<option name="createTime" value="1754286137102" />
|
||||
<option name="id" value="0198739a1f0e75c38b0579ade7b34050" />
|
||||
|
@ -68,7 +68,7 @@ logging:
|
||||
|
||||
# 分布式训练(可选)
|
||||
distributed:
|
||||
enabled: false # 是否启用分布式训练
|
||||
enabled: true # 是否启用分布式训练
|
||||
backend: "nccl" # 分布式后端(nccl/gloo)
|
||||
node_rank: 0 # 节点编号
|
||||
node_num: 1 # 共计几个节点 一般几台机器就有几个节点
|
||||
node_num: 2 # 共计几个节点 一般几台机器就有几个节点
|
||||
|
@ -27,10 +27,10 @@ data:
|
||||
train_batch_size: 128 # 训练批次大小
|
||||
val_batch_size: 8 # 验证批次大小
|
||||
num_workers: 32 # 数据加载线程数
|
||||
data_dir: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix"
|
||||
data_dir: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix_new"
|
||||
image_joint_pth: "/home/lc/data_center/image_analysis/error_compare_result"
|
||||
total_pkl: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix/total.pkl"
|
||||
result_txt: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix/result.txt"
|
||||
total_pkl: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix_new/total.pkl"
|
||||
result_txt: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix_new/result.txt"
|
||||
|
||||
transform:
|
||||
img_size: 224 # 图像尺寸
|
||||
@ -46,9 +46,9 @@ logging:
|
||||
tensorboard: true # 是否启用TensorBoard
|
||||
checkpoint_interval: 30 # 检查点保存间隔(epoch)
|
||||
|
||||
event:
|
||||
oneToOne_max_th: 0.9
|
||||
oneToSn_min_th: 0.6
|
||||
event_save_dir: "/home/lc/works/realtime_yolov10s/online_yolov10s_resnetv11_20250702/yolos_tracking"
|
||||
stdlib_image_path: "/testDataAndLogs/module_test_record/comparison/标准图测试数据/pic/stlib_base"
|
||||
pickle_path: "event.pickle"
|
||||
#event:
|
||||
# oneToOne_max_th: 0.9
|
||||
# oneToSn_min_th: 0.6
|
||||
# event_save_dir: "/home/lc/works/realtime_yolov10s/online_yolov10s_resnetv11_20250702/yolos_tracking"
|
||||
# stdlib_image_path: "/testDataAndLogs/module_test_record/comparison/标准图测试数据/pic/stlib_base"
|
||||
# pickle_path: "event.pickle"
|
||||
|
@ -19,7 +19,7 @@ models:
|
||||
channel_ratio: 1.0
|
||||
model_path: "../checkpoints/resnet101_electornic_20250807/best.pth"
|
||||
onnx_model: "../checkpoints/resnet101_electornic_20250807/best.onnx"
|
||||
rknn_model: "../checkpoints/resnet101_electornic_20250807/resnet101_electornic.rknn"
|
||||
rknn_model: "../checkpoints/resnet101_electornic_20250807/resnet101_electornic_3588.rknn"
|
||||
rknn_batch_size: 1
|
||||
|
||||
# 日志与监控
|
||||
|
@ -12,9 +12,9 @@ base:
|
||||
|
||||
# 模型配置
|
||||
models:
|
||||
backbone: 'resnet101'
|
||||
channel_ratio: 1.0
|
||||
checkpoints: "../checkpoints/resnet101_electornic_20250807/best.pth"
|
||||
backbone: 'resnet18'
|
||||
channel_ratio: 0.75
|
||||
checkpoints: "../checkpoints/resnet18_20250718_scale=0.75_nosub/best.pth"
|
||||
|
||||
# 数据配置
|
||||
data:
|
||||
@ -22,7 +22,7 @@ data:
|
||||
test_batch_size: 128 # 验证批次大小
|
||||
num_workers: 32 # 数据加载线程数
|
||||
half: true # 是否启用半精度数据
|
||||
img_dirs_path: "/shareData/completed_data/scatter_data/electronic_scale/base/total" # base标准库图片存储路径
|
||||
img_dirs_path: "/home/lc/data_center/baseStlib/pic/stlib_base" # base标准库图片存储路径
|
||||
# img_dirs_path: "/home/lc/contrast_nettest/data/feature_json"
|
||||
xlsx_pth: false # 过滤商品, 默认None不进行过滤
|
||||
|
||||
@ -42,7 +42,7 @@ logging:
|
||||
|
||||
save:
|
||||
json_bin: "../search_library/resnet101_electronic.json" # 保存整个json文件
|
||||
json_path: "/home/lc/data_center/baseStlib/feature_json/stlib_base_resnet18_sub" # 保存单个json文件路径
|
||||
json_path: "/home/lc/data_center/baseStlib/feature_json/stlib_base_resnet18_sub_1k_合并" # 保存单个json文件路径
|
||||
error_barcodes: "error_barcodes.txt"
|
||||
barcodes_statistics: "../search_library/barcodes_statistics.txt"
|
||||
create_single_json: false # 是否保存单个json文件
|
||||
create_single_json: true # 是否保存单个json文件
|
@ -96,7 +96,7 @@ if __name__ == '__main__':
|
||||
rknn.config(
|
||||
mean_values=[[127.5, 127.5, 127.5]],
|
||||
std_values=[[127.5, 127.5, 127.5]],
|
||||
target_platform='rk3566',
|
||||
target_platform='rk3588',
|
||||
model_pruning=False,
|
||||
compress_weight=False,
|
||||
single_core_mode=True,
|
||||
|
@ -238,5 +238,5 @@ def get_histogram(data, label=None):
|
||||
|
||||
if __name__ == '__main__':
|
||||
picTopic_matrix = picDirSimilarAnalysis()
|
||||
picTopic_matrix.get_group_similarity_matrix('/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix')
|
||||
picTopic_matrix.get_group_similarity_matrix('/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix_new')
|
||||
# read_result_txt()
|
||||
|
@ -288,13 +288,13 @@ def run_training(local_rank, node_rank, local_size, world_size, conf):
|
||||
# 初始化分布式环境
|
||||
rank = local_rank + node_rank * local_size
|
||||
os.environ['RANK'] = str(rank)
|
||||
os.environ['WORLD_SIZE'] = str(local_size)
|
||||
os.environ['WORLD_SIZE'] = str(world_size)
|
||||
os.environ['MASTER_ADDR'] = 'localhost'
|
||||
os.environ['MASTER_PORT'] = '12355'
|
||||
|
||||
dist.init_process_group(backend='nccl', rank=rank, world_size=world_size)
|
||||
torch.cuda.set_device(rank)
|
||||
device = torch.device('cuda', rank)
|
||||
torch.cuda.set_device(local_rank)
|
||||
device = torch.device('cuda', local_rank)
|
||||
|
||||
# 获取数据集而不是DataLoader
|
||||
train_dataset, class_num = load_data(training=True, cfg=conf, return_dataset=True)
|
||||
@ -306,8 +306,8 @@ def run_training(local_rank, node_rank, local_size, world_size, conf):
|
||||
metric = metric.to(device)
|
||||
|
||||
# 包装为DistributedDataParallel模型
|
||||
model = DDP(model, device_ids=[rank], output_device=rank)
|
||||
metric = DDP(metric, device_ids=[rank], output_device=rank)
|
||||
model = DDP(model, device_ids=[local_rank], output_device=local_rank)
|
||||
metric = DDP(metric, device_ids=[local_rank], output_device=local_rank)
|
||||
|
||||
# 设置损失函数、优化器和调度器
|
||||
criterion = setup_loss_function(conf)
|
||||
|
Reference in New Issue
Block a user