112 lines
3.9 KiB
Python
112 lines
3.9 KiB
Python
import os
|
|
import shutil
|
|
from pathlib import Path
|
|
import yaml
|
|
def count_files(directory):
    """Count the regular files located directly inside ``directory``.

    Subdirectories are neither descended into nor counted. The function is
    best-effort: if anything goes wrong while listing or inspecting the
    directory, a message is printed and 0 is returned instead of raising.

    :param directory: path of the directory to inspect
    :return: number of entries for which ``os.path.isfile`` is true, or 0
        on failure
    """
    try:
        entries = os.listdir(directory)
        total = sum(
            1
            for entry in entries
            if os.path.isfile(os.path.join(directory, entry))
        )
    except Exception as e:
        print(f"无法统计目录 {directory}: {e}")
        return 0
    return total
|
|
|
|
def clear_empty_dirs(path):
    """Delete every empty directory found below ``path``.

    The tree is walked bottom-up, so a directory that only contained empty
    subdirectories is itself removed once those are gone. ``path`` itself
    is never removed, even if it ends up empty. A failure on an individual
    directory is reported on stdout and does not stop the sweep.

    :param path: root directory to clean up
    """
    # topdown=False yields children before their parents, which is what
    # lets a chain of nested empty directories collapse in a single pass.
    for root, dirs, _files in os.walk(path, topdown=False):
        for dir_name in dirs:
            dir_path = os.path.join(root, dir_name)
            try:
                if not os.listdir(dir_path):
                    os.rmdir(dir_path)
                    print(f"Deleted empty directory: {dir_path}")
            except OSError as e:
                # Only OSError carries ``strerror``; catching anything
                # broader would raise AttributeError inside this handler
                # and hide the real problem.
                print(f"Error: {e.strerror}")
|
|
|
|
def get_max_files(conf):
    """Derive the file-count threshold from the source directory tree.

    The number of files in every leaf directory (a directory without
    subdirectories) below ``conf['data']['source_dir']`` is collected and
    sorted ascending. The threshold is the count found at position
    ``int(max_files_ratio * number_of_leaf_directories)``, with the
    position clamped to the valid index range so that a ratio of 1.0 (or
    above) selects the largest count and a negative ratio the smallest.
    The result is never lower than ``conf['data']['min_files']``.

    :param conf: configuration mapping; reads ``max_files_ratio``,
        ``source_dir`` and ``min_files`` from its ``data`` section
    :return: the threshold; ``min_files`` when no leaf directory exists
        (for example when the source directory is missing)
    """
    data_conf = conf['data']
    max_files_ratio = data_conf['max_files_ratio']
    min_files = data_conf['min_files']

    files_number = []
    for root, dirs, files in os.walk(data_conf['source_dir']):
        if dirs:
            # Only leaf directories contribute a count.
            continue
        if not files:
            # Report empty leaf directories; they still count as 0.
            print(root, dirs, files)
        files_number.append(len(files))

    if not files_number:
        # Nothing to rank: fall back to the configured floor instead of
        # failing with IndexError on the empty list.
        return min_files

    files_number.sort()
    # Clamp so that ratio == 1.0 does not index one past the end.
    index = int(max_files_ratio * len(files_number))
    index = max(0, min(index, len(files_number) - 1))
    max_files = files_number[index]
    print(f"max_files: {max_files}")

    return max(max_files, min_files)
|
|
def megre_subdirs(pth):
    """Hoist every second-level directory of ``pth`` up by one level.

    Each directory found at ``pth/<group>/<name>`` is moved to
    ``pth/<name>``. Afterwards ``clear_empty_dirs`` removes the directories
    left empty below ``pth`` (normally the ``<group>`` directories).
    Files are never moved on their own, so a ``<group>`` directory that
    directly contains files is kept.

    Both directory levels are listed once, up front, rather than while
    walking the whole tree: the moves change the tree under ``pth``, and
    only the first two levels are relevant.

    :param pth: root directory whose second-level directories are hoisted;
        a missing directory is silently treated as empty
    :raises shutil.Error: propagated from ``shutil.move``, e.g. when the
        move target already exists inside an existing destination
    """
    # next(..., default) takes just the top level of the walk and yields
    # empty lists when the directory cannot be listed.
    _, group_names, _ = next(os.walk(pth), (pth, [], []))
    print(f"image {group_names}")

    for image in group_names:
        inner_dir_path = os.path.join(pth, image)
        _, inner_dirs, _ = next(
            os.walk(inner_dir_path), (inner_dir_path, [], [])
        )
        for inner_dir in inner_dirs:
            src_dir = os.path.join(inner_dir_path, inner_dir)
            dest_dir = os.path.join(pth, inner_dir)
            # NOTE(review): when dest_dir already exists, shutil.move puts
            # src_dir *inside* it (dest_dir/inner_dir) instead of merging
            # the contents -- confirm whether a real merge is wanted.
            shutil.move(src_dir, dest_dir)
            print(f"Moved {inner_dir} to {pth}")

    clear_empty_dirs(pth)
|
|
|
|
|
|
# def split_subdirs(source_dir, target_dir, max_files=10):
|
|
def split_subdirs(conf):
    """Copy each subdirectory of the source directory into one of two targets.

    The threshold is obtained from ``get_max_files`` and the source tree is
    then flattened with ``megre_subdirs``. Every directory directly below
    the source directory whose file count (as reported by ``count_files``)
    is at most the threshold is copied below
    ``conf['data']['data_extra_dir']``; all others are copied below
    ``conf['data']['train_dir']``. A destination that already exists is
    skipped, and a failure on one subdirectory is printed without aborting
    the remaining ones.

    :param conf: configuration mapping; its ``data`` section must provide
        ``source_dir``, ``data_extra_dir`` and ``train_dir`` in addition to
        the keys read by ``get_max_files``
    """
    source_dir = conf['data']['source_dir']
    target_extra_dir = conf['data']['data_extra_dir']
    train_dir = conf['data']['train_dir']

    # The threshold is computed on the tree as it is before flattening.
    max_files = get_max_files(conf)
    # Hoist nested directories one level up and drop the emptied parents.
    megre_subdirs(source_dir)

    # Make sure the target for the small directories exists.
    Path(target_extra_dir).mkdir(parents=True, exist_ok=True)

    print(f"开始处理目录: {source_dir}")
    print(f"目标目录: {target_extra_dir}")
    print(f"筛选条件: 文件数 ≤ {max_files}\n")

    for subdir in os.listdir(source_dir):
        subdir_path = os.path.join(source_dir, subdir)
        if not os.path.isdir(subdir_path):
            # Plain files at the top level are ignored.
            continue

        try:
            file_count = count_files(subdir_path)
            print(f"复制 {subdir} (包含 {file_count} 个文件)")

            # Small directories go to the extra set, the rest to training.
            dest_root = target_extra_dir if file_count <= max_files else train_dir
            dest_path = os.path.join(dest_root, subdir)

            if os.path.exists(dest_path):
                # Never overwrite a destination from an earlier run.
                print(f"目录已存在,跳过: {dest_path}")
            else:
                print(f"复制 {subdir} (包含 {file_count} 个文件) 至 {dest_path}")
                shutil.copytree(subdir_path, dest_path)
        except Exception as e:
            print(f"处理目录 {subdir} 时出错: {e}")

    print("\n处理完成")
|
|
|
|
if __name__ == "__main__":
    # Location of the YAML configuration, relative to the working directory.
    config_path = '../configs/sub_data.yml'
    with open(config_path) as config_file:
        conf = yaml.load(config_file, Loader=yaml.FullLoader)

    # Split the source directories according to the loaded configuration.
    split_subdirs(conf)
|