Multi-machine parallel computing (多机并行计算)
.idea/CopilotChatHistory.xml (generated, 18 lines added)

@@ -3,6 +3,24 @@
   <component name="CopilotChatHistory">
     <option name="conversations">
       <list>
+        <Conversation>
+          <option name="createTime" value="1755228773977" />
+          <option name="id" value="0198abc99e597020bf8aa3ef78bc8bd3" />
+          <option name="title" value="新对话 2025年8月15日 11:32:53" />
+          <option name="updateTime" value="1755228773977" />
+        </Conversation>
+        <Conversation>
+          <option name="createTime" value="1755227620606" />
+          <option name="id" value="0198abb804fe7bf8ab3ac9ecfeae6d3f" />
+          <option name="title" value="新对话 2025年8月15日 11:13:40" />
+          <option name="updateTime" value="1755227620606" />
+        </Conversation>
+        <Conversation>
+          <option name="createTime" value="1755219481041" />
+          <option name="id" value="0198ab3bd1d17216b0dab33158ff294e" />
+          <option name="title" value="新对话 2025年8月15日 08:58:01" />
+          <option name="updateTime" value="1755219481041" />
+        </Conversation>
         <Conversation>
           <option name="createTime" value="1754286137102" />
           <option name="id" value="0198739a1f0e75c38b0579ade7b34050" />
@@ -68,7 +68,7 @@ logging:

 # 分布式训练(可选)
 distributed:
-  enabled: false # 是否启用分布式训练
+  enabled: true # 是否启用分布式训练
   backend: "nccl" # 分布式后端(nccl/gloo)
   node_rank: 0 # 节点编号
-  node_num: 1 # 共计几个节点 一般几台机器就有几个节点
+  node_num: 2 # 共计几个节点 一般几台机器就有几个节点
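For context, a minimal per-node launch sketch showing how these settings are typically consumed. This is illustrative only: it assumes the run_training(local_rank, node_rank, local_size, world_size, conf) signature visible in the training-script hunk further down, and that conf is the parsed YAML above; the repository's real entry point may differ. Each machine runs the same launcher with its own node_rank (0, 1, ...), so node_num: 2 makes the global world size twice the per-node GPU count.

# Hypothetical launcher sketch, not the repository's actual entry point.
import torch
import torch.multiprocessing as mp

def launch(conf, run_training):
    dist_cfg = conf['distributed']           # enabled / backend / node_rank / node_num from the YAML above
    node_rank = dist_cfg['node_rank']        # 0 on the first machine, 1 on the second, ...
    node_num = dist_cfg['node_num']          # total number of machines
    local_size = torch.cuda.device_count()   # one worker process per local GPU
    world_size = node_num * local_size       # global number of processes across all machines

    # mp.spawn passes the process index (0..local_size-1) as the first argument,
    # which becomes local_rank inside run_training.
    mp.spawn(run_training,
             args=(node_rank, local_size, world_size, conf),
             nprocs=local_size,
             join=True)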
@@ -27,10 +27,10 @@ data:
   train_batch_size: 128 # 训练批次大小
   val_batch_size: 8 # 验证批次大小
   num_workers: 32 # 数据加载线程数
-  data_dir: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix"
+  data_dir: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix_new"
   image_joint_pth: "/home/lc/data_center/image_analysis/error_compare_result"
-  total_pkl: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix/total.pkl"
-  result_txt: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix/result.txt"
+  total_pkl: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix_new/total.pkl"
+  result_txt: "/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix_new/result.txt"

 transform:
   img_size: 224 # 图像尺寸
@@ -46,9 +46,9 @@ logging:
   tensorboard: true # 是否启用TensorBoard
   checkpoint_interval: 30 # 检查点保存间隔(epoch)

-event:
-  oneToOne_max_th: 0.9
-  oneToSn_min_th: 0.6
-  event_save_dir: "/home/lc/works/realtime_yolov10s/online_yolov10s_resnetv11_20250702/yolos_tracking"
-  stdlib_image_path: "/testDataAndLogs/module_test_record/comparison/标准图测试数据/pic/stlib_base"
-  pickle_path: "event.pickle"
+#event:
+# oneToOne_max_th: 0.9
+# oneToSn_min_th: 0.6
+# event_save_dir: "/home/lc/works/realtime_yolov10s/online_yolov10s_resnetv11_20250702/yolos_tracking"
+# stdlib_image_path: "/testDataAndLogs/module_test_record/comparison/标准图测试数据/pic/stlib_base"
+# pickle_path: "event.pickle"
@@ -19,7 +19,7 @@ models:
   channel_ratio: 1.0
   model_path: "../checkpoints/resnet101_electornic_20250807/best.pth"
   onnx_model: "../checkpoints/resnet101_electornic_20250807/best.onnx"
-  rknn_model: "../checkpoints/resnet101_electornic_20250807/resnet101_electornic.rknn"
+  rknn_model: "../checkpoints/resnet101_electornic_20250807/resnet101_electornic_3588.rknn"
   rknn_batch_size: 1

 # 日志与监控
@@ -12,9 +12,9 @@ base:

 # 模型配置
 models:
-  backbone: 'resnet101'
-  channel_ratio: 1.0
-  checkpoints: "../checkpoints/resnet101_electornic_20250807/best.pth"
+  backbone: 'resnet18'
+  channel_ratio: 0.75
+  checkpoints: "../checkpoints/resnet18_20250718_scale=0.75_nosub/best.pth"

 # 数据配置
 data:
@@ -22,7 +22,7 @@ data:
   test_batch_size: 128 # 验证批次大小
   num_workers: 32 # 数据加载线程数
   half: true # 是否启用半精度数据
-  img_dirs_path: "/shareData/completed_data/scatter_data/electronic_scale/base/total" # base标准库图片存储路径
+  img_dirs_path: "/home/lc/data_center/baseStlib/pic/stlib_base" # base标准库图片存储路径
   # img_dirs_path: "/home/lc/contrast_nettest/data/feature_json"
   xlsx_pth: false # 过滤商品, 默认None不进行过滤

@@ -42,7 +42,7 @@ logging:

 save:
   json_bin: "../search_library/resnet101_electronic.json" # 保存整个json文件
-  json_path: "/home/lc/data_center/baseStlib/feature_json/stlib_base_resnet18_sub" # 保存单个json文件路径
+  json_path: "/home/lc/data_center/baseStlib/feature_json/stlib_base_resnet18_sub_1k_合并" # 保存单个json文件路径
   error_barcodes: "error_barcodes.txt"
   barcodes_statistics: "../search_library/barcodes_statistics.txt"
-  create_single_json: false # 是否保存单个json文件
+  create_single_json: true # 是否保存单个json文件
@@ -96,7 +96,7 @@ if __name__ == '__main__':
     rknn.config(
         mean_values=[[127.5, 127.5, 127.5]],
         std_values=[[127.5, 127.5, 127.5]],
-        target_platform='rk3566',
+        target_platform='rk3588',
         model_pruning=False,
         compress_weight=False,
         single_core_mode=True,
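The platform switch has to be paired with an .rknn artifact built for the same SoC, which is why the inference config above renames rknn_model to ..._3588.rknn: an .rknn built for rk3566 is not expected to load on an RK3588 NPU. For reference, a minimal rknn-toolkit2 export flow around this rknn.config(...) call looks roughly like the sketch below; it is illustrative only, the ONNX input and .rknn output paths are taken from the config hunks above, and the quantization setting is an assumption.

# Minimal rknn-toolkit2 export sketch for RK3588 (illustrative, not the repository's exact script).
from rknn.api import RKNN

rknn = RKNN()
rknn.config(mean_values=[[127.5, 127.5, 127.5]],
            std_values=[[127.5, 127.5, 127.5]],
            target_platform='rk3588')        # must match the board the .rknn will run on
rknn.load_onnx(model='../checkpoints/resnet101_electornic_20250807/best.onnx')
rknn.build(do_quantization=False)            # pass dataset=... here if quantization is wanted
rknn.export_rknn('../checkpoints/resnet101_electornic_20250807/resnet101_electornic_3588.rknn')
rknn.release()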
@@ -238,5 +238,5 @@ def get_histogram(data, label=None):

 if __name__ == '__main__':
     picTopic_matrix = picDirSimilarAnalysis()
-    picTopic_matrix.get_group_similarity_matrix('/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix')
+    picTopic_matrix.get_group_similarity_matrix('/home/lc/data_center/image_analysis/pic_pic_similar_maxtrix_new')
     # read_result_txt()
@@ -288,13 +288,13 @@ def run_training(local_rank, node_rank, local_size, world_size, conf):
     # 初始化分布式环境
     rank = local_rank + node_rank * local_size
     os.environ['RANK'] = str(rank)
-    os.environ['WORLD_SIZE'] = str(local_size)
+    os.environ['WORLD_SIZE'] = str(world_size)
     os.environ['MASTER_ADDR'] = 'localhost'
     os.environ['MASTER_PORT'] = '12355'

     dist.init_process_group(backend='nccl', rank=rank, world_size=world_size)
-    torch.cuda.set_device(rank)
-    device = torch.device('cuda', rank)
+    torch.cuda.set_device(local_rank)
+    device = torch.device('cuda', local_rank)

     # 获取数据集而不是DataLoader
     train_dataset, class_num = load_data(training=True, cfg=conf, return_dataset=True)
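Both fixes above only matter once node_num > 1. WORLD_SIZE should hold the global process count (node_num × local_size) so the environment agrees with the world_size passed to init_process_group, while the CUDA device index must stay local to the node: global ranks on the second machine (for example 8-15 with 8 GPUs per node) do not correspond to any local cuda device. Note that MASTER_ADDR = 'localhost' only works while every process runs on one machine; a genuine two-node run would point it at the rank-0 node's reachable address. Returning the dataset rather than a DataLoader (the "获取数据集而不是DataLoader" step) lets each rank build its own sharded loader; a minimal sketch of that follow-up step, assuming the train_batch_size / num_workers values from the data config and a conf['data'][...] key layout:

# Sketch: shard the dataset across ranks with a DistributedSampler (illustrative;
# the conf['data'][...] key layout is an assumption matching the YAML shown earlier).
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

train_sampler = DistributedSampler(train_dataset, num_replicas=world_size,
                                   rank=rank, shuffle=True)
train_loader = DataLoader(train_dataset,
                          batch_size=conf['data']['train_batch_size'],  # 128 in the config above
                          sampler=train_sampler,                        # replaces shuffle=True
                          num_workers=conf['data']['num_workers'],
                          pin_memory=True)
# Call train_sampler.set_epoch(epoch) at the start of every epoch so shuffling differs per epoch.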
@@ -306,8 +306,8 @@ def run_training(local_rank, node_rank, local_size, world_size, conf):
     metric = metric.to(device)

     # 包装为DistributedDataParallel模型
-    model = DDP(model, device_ids=[rank], output_device=rank)
-    metric = DDP(metric, device_ids=[rank], output_device=rank)
+    model = DDP(model, device_ids=[local_rank], output_device=local_rank)
+    metric = DDP(metric, device_ids=[local_rank], output_device=local_rank)

     # 设置损失函数、优化器和调度器
     criterion = setup_loss_function(conf)
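The DDP change follows the same logic: device_ids and output_device expect a local GPU index. With node_num: 2 and, say, 8 GPUs per machine, global ranks run 0-15 while each machine only exposes cuda:0 through cuda:7, so device_ids=[rank] would fail on every process of the second node, whereas device_ids=[local_rank] is correct on both. Gradient synchronization still happens over the global process group created by init_process_group, so no other change is needed.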