更新 detacttracking

This commit is contained in:
lee
2025-01-22 13:16:44 +08:00
parent 2320468c40
commit c9d79f8059
355 changed files with 61097 additions and 1 deletions

View File

@ -0,0 +1,12 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
__version__ = '8.0.173'
from ultralytics.models import RTDETR, SAM, YOLO
from ultralytics.models.fastsam import FastSAM
from ultralytics.models.nas import NAS
from ultralytics.utils import SETTINGS as settings
from ultralytics.utils.checks import check_yolo as checks
from ultralytics.utils.downloads import download
__all__ = '__version__', 'YOLO', 'NAS', 'SAM', 'FastSAM', 'RTDETR', 'checks', 'download', 'settings'

View File

@ -0,0 +1,460 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import contextlib
import re
import shutil
import sys
from pathlib import Path
from types import SimpleNamespace
from typing import Dict, List, Union
from ultralytics.utils import (ASSETS, DEFAULT_CFG, DEFAULT_CFG_DICT, DEFAULT_CFG_PATH, LOGGER, RANK, SETTINGS,
SETTINGS_YAML, IterableSimpleNamespace, __version__, checks, colorstr, deprecation_warn,
yaml_load, yaml_print)
# Define valid tasks and modes
MODES = 'train', 'val', 'predict', 'export', 'track', 'benchmark'
TASKS = 'detect', 'segment', 'classify', 'pose'
TASK2DATA = {'detect': 'coco8.yaml', 'segment': 'coco8-seg.yaml', 'classify': 'imagenet10', 'pose': 'coco8-pose.yaml'}
TASK2MODEL = {
'detect': 'yolov8n.pt',
'segment': 'yolov8n-seg.pt',
'classify': 'yolov8n-cls.pt',
'pose': 'yolov8n-pose.pt'}
TASK2METRIC = {
'detect': 'metrics/mAP50-95(B)',
'segment': 'metrics/mAP50-95(M)',
'classify': 'metrics/accuracy_top1',
'pose': 'metrics/mAP50-95(P)'}
CLI_HELP_MSG = \
f"""
Arguments received: {str(['yolo'] + sys.argv[1:])}. Ultralytics 'yolo' commands use the following syntax:
yolo TASK MODE ARGS
Where TASK (optional) is one of {TASKS}
MODE (required) is one of {MODES}
ARGS (optional) are any number of custom 'arg=value' pairs like 'imgsz=320' that override defaults.
See all ARGS at https://docs.ultralytics.com/usage/cfg or with 'yolo cfg'
1. Train a detection model for 10 epochs with an initial learning_rate of 0.01
yolo train data=coco128.yaml model=yolov8n.pt epochs=10 lr0=0.01
2. Predict a YouTube video using a pretrained segmentation model at image size 320:
yolo predict model=yolov8n-seg.pt source='https://youtu.be/Zgi9g1ksQHc' imgsz=320
3. Val a pretrained detection model at batch-size 1 and image size 640:
yolo val model=yolov8n.pt data=coco128.yaml batch=1 imgsz=640
4. Export a YOLOv8n classification model to ONNX format at image size 224 by 128 (no TASK required)
yolo export model=yolov8n-cls.pt format=onnx imgsz=224,128
5. Run special commands:
yolo help
yolo checks
yolo version
yolo settings
yolo copy-cfg
yolo cfg
Docs: https://docs.ultralytics.com
Community: https://community.ultralytics.com
GitHub: https://github.com/ultralytics/ultralytics
"""
# Define keys for arg type checks
CFG_FLOAT_KEYS = 'warmup_epochs', 'box', 'cls', 'dfl', 'degrees', 'shear'
CFG_FRACTION_KEYS = ('dropout', 'iou', 'lr0', 'lrf', 'momentum', 'weight_decay', 'warmup_momentum', 'warmup_bias_lr',
'label_smoothing', 'hsv_h', 'hsv_s', 'hsv_v', 'translate', 'scale', 'perspective', 'flipud',
'fliplr', 'mosaic', 'mixup', 'copy_paste', 'conf', 'iou', 'fraction') # fraction floats 0.0 - 1.0
CFG_INT_KEYS = ('epochs', 'patience', 'batch', 'workers', 'seed', 'close_mosaic', 'mask_ratio', 'max_det', 'vid_stride',
'line_width', 'workspace', 'nbs', 'save_period')
CFG_BOOL_KEYS = ('save', 'exist_ok', 'verbose', 'deterministic', 'single_cls', 'rect', 'cos_lr', 'overlap_mask', 'val',
'save_json', 'save_hybrid', 'half', 'dnn', 'plots', 'show', 'save_txt', 'save_conf', 'save_crop',
'show_labels', 'show_conf', 'visualize', 'augment', 'agnostic_nms', 'retina_masks', 'boxes', 'keras',
'optimize', 'int8', 'dynamic', 'simplify', 'nms', 'profile')
def cfg2dict(cfg):
"""
Convert a configuration object to a dictionary, whether it is a file path, a string, or a SimpleNamespace object.
Args:
cfg (str | Path | dict | SimpleNamespace): Configuration object to be converted to a dictionary.
Returns:
cfg (dict): Configuration object in dictionary format.
"""
if isinstance(cfg, (str, Path)):
cfg = yaml_load(cfg) # load dict
elif isinstance(cfg, SimpleNamespace):
cfg = vars(cfg) # convert to dict
return cfg
def get_cfg(cfg: Union[str, Path, Dict, SimpleNamespace] = DEFAULT_CFG_DICT, overrides: Dict = None):
"""
Load and merge configuration data from a file or dictionary.
Args:
cfg (str | Path | Dict | SimpleNamespace): Configuration data.
overrides (str | Dict | optional): Overrides in the form of a file name or a dictionary. Default is None.
Returns:
(SimpleNamespace): Training arguments namespace.
"""
cfg = cfg2dict(cfg)
# Merge overrides
if overrides:
overrides = cfg2dict(overrides)
if 'save_dir' not in cfg:
overrides.pop('save_dir', None) # special override keys to ignore
check_dict_alignment(cfg, overrides)
cfg = {**cfg, **overrides} # merge cfg and overrides dicts (prefer overrides)
# Special handling for numeric project/name
for k in 'project', 'name':
if k in cfg and isinstance(cfg[k], (int, float)):
cfg[k] = str(cfg[k])
if cfg.get('name') == 'model': # assign model to 'name' arg
cfg['name'] = cfg.get('model', '').split('.')[0]
LOGGER.warning(f"WARNING ⚠️ 'name=model' automatically updated to 'name={cfg['name']}'.")
# Type and Value checks
for k, v in cfg.items():
if v is not None: # None values may be from optional args
if k in CFG_FLOAT_KEYS and not isinstance(v, (int, float)):
raise TypeError(f"'{k}={v}' is of invalid type {type(v).__name__}. "
f"Valid '{k}' types are int (i.e. '{k}=0') or float (i.e. '{k}=0.5')")
elif k in CFG_FRACTION_KEYS:
if not isinstance(v, (int, float)):
raise TypeError(f"'{k}={v}' is of invalid type {type(v).__name__}. "
f"Valid '{k}' types are int (i.e. '{k}=0') or float (i.e. '{k}=0.5')")
if not (0.0 <= v <= 1.0):
raise ValueError(f"'{k}={v}' is an invalid value. "
f"Valid '{k}' values are between 0.0 and 1.0.")
elif k in CFG_INT_KEYS and not isinstance(v, int):
raise TypeError(f"'{k}={v}' is of invalid type {type(v).__name__}. "
f"'{k}' must be an int (i.e. '{k}=8')")
elif k in CFG_BOOL_KEYS and not isinstance(v, bool):
raise TypeError(f"'{k}={v}' is of invalid type {type(v).__name__}. "
f"'{k}' must be a bool (i.e. '{k}=True' or '{k}=False')")
# Return instance
return IterableSimpleNamespace(**cfg)
def get_save_dir(args, name=None):
"""Return save_dir as created from train/val/predict arguments."""
if getattr(args, 'save_dir', None):
save_dir = args.save_dir
else:
from ultralytics.utils.files import increment_path
project = args.project or Path(SETTINGS['runs_dir']) / args.task
name = name or args.name or f'{args.mode}'
save_dir = increment_path(Path(project) / name, exist_ok=args.exist_ok if RANK in (-1, 0) else True)
return Path(save_dir)
def _handle_deprecation(custom):
"""Hardcoded function to handle deprecated config keys."""
for key in custom.copy().keys():
if key == 'hide_labels':
deprecation_warn(key, 'show_labels')
custom['show_labels'] = custom.pop('hide_labels') == 'False'
if key == 'hide_conf':
deprecation_warn(key, 'show_conf')
custom['show_conf'] = custom.pop('hide_conf') == 'False'
if key == 'line_thickness':
deprecation_warn(key, 'line_width')
custom['line_width'] = custom.pop('line_thickness')
return custom
def check_dict_alignment(base: Dict, custom: Dict, e=None):
"""
This function checks for any mismatched keys between a custom configuration list and a base configuration list.
If any mismatched keys are found, the function prints out similar keys from the base list and exits the program.
Args:
custom (dict): a dictionary of custom configuration options
base (dict): a dictionary of base configuration options
e (Error, optional): An optional error that is passed by the calling function.
"""
custom = _handle_deprecation(custom)
base_keys, custom_keys = (set(x.keys()) for x in (base, custom))
mismatched = [k for k in custom_keys if k not in base_keys]
if mismatched:
from difflib import get_close_matches
string = ''
for x in mismatched:
matches = get_close_matches(x, base_keys) # key list
matches = [f'{k}={base[k]}' if base.get(k) is not None else k for k in matches]
match_str = f'Similar arguments are i.e. {matches}.' if matches else ''
string += f"'{colorstr('red', 'bold', x)}' is not a valid YOLO argument. {match_str}\n"
raise SyntaxError(string + CLI_HELP_MSG) from e
def merge_equals_args(args: List[str]) -> List[str]:
"""
Merges arguments around isolated '=' args in a list of strings.
The function considers cases where the first argument ends with '=' or the second starts with '=',
as well as when the middle one is an equals sign.
Args:
args (List[str]): A list of strings where each element is an argument.
Returns:
List[str]: A list of strings where the arguments around isolated '=' are merged.
"""
new_args = []
for i, arg in enumerate(args):
if arg == '=' and 0 < i < len(args) - 1: # merge ['arg', '=', 'val']
new_args[-1] += f'={args[i + 1]}'
del args[i + 1]
elif arg.endswith('=') and i < len(args) - 1 and '=' not in args[i + 1]: # merge ['arg=', 'val']
new_args.append(f'{arg}{args[i + 1]}')
del args[i + 1]
elif arg.startswith('=') and i > 0: # merge ['arg', '=val']
new_args[-1] += arg
else:
new_args.append(arg)
return new_args
def handle_yolo_hub(args: List[str]) -> None:
"""
Handle Ultralytics HUB command-line interface (CLI) commands.
This function processes Ultralytics HUB CLI commands such as login and logout.
It should be called when executing a script with arguments related to HUB authentication.
Args:
args (List[str]): A list of command line arguments
Example:
```bash
python my_script.py hub login your_api_key
```
"""
from ultralytics import hub
if args[0] == 'login':
key = args[1] if len(args) > 1 else ''
# Log in to Ultralytics HUB using the provided API key
hub.login(key)
elif args[0] == 'logout':
# Log out from Ultralytics HUB
hub.logout()
def handle_yolo_settings(args: List[str]) -> None:
"""
Handle YOLO settings command-line interface (CLI) commands.
This function processes YOLO settings CLI commands such as reset.
It should be called when executing a script with arguments related to YOLO settings management.
Args:
args (List[str]): A list of command line arguments for YOLO settings management.
Example:
```bash
python my_script.py yolo settings reset
```
"""
url = 'https://docs.ultralytics.com/quickstart/#ultralytics-settings' # help URL
try:
if any(args):
if args[0] == 'reset':
SETTINGS_YAML.unlink() # delete the settings file
SETTINGS.reset() # create new settings
LOGGER.info('Settings reset successfully') # inform the user that settings have been reset
else: # save a new setting
new = dict(parse_key_value_pair(a) for a in args)
check_dict_alignment(SETTINGS, new)
SETTINGS.update(new)
LOGGER.info(f'💡 Learn about settings at {url}')
yaml_print(SETTINGS_YAML) # print the current settings
except Exception as e:
LOGGER.warning(f"WARNING ⚠️ settings error: '{e}'. Please see {url} for help.")
def parse_key_value_pair(pair):
"""Parse one 'key=value' pair and return key and value."""
re.sub(r' *= *', '=', pair) # remove spaces around equals sign
k, v = pair.split('=', 1) # split on first '=' sign
assert v, f"missing '{k}' value"
return k, smart_value(v)
def smart_value(v):
"""Convert a string to an underlying type such as int, float, bool, etc."""
if v.lower() == 'none':
return None
elif v.lower() == 'true':
return True
elif v.lower() == 'false':
return False
else:
with contextlib.suppress(Exception):
return eval(v)
return v
def entrypoint(debug=''):
"""
This function is the ultralytics package entrypoint, it's responsible for parsing the command line arguments passed
to the package.
This function allows for:
- passing mandatory YOLO args as a list of strings
- specifying the task to be performed, either 'detect', 'segment' or 'classify'
- specifying the mode, either 'train', 'val', 'test', or 'predict'
- running special modes like 'checks'
- passing overrides to the package's configuration
It uses the package's default cfg and initializes it using the passed overrides.
Then it calls the CLI function with the composed cfg
"""
args = (debug.split(' ') if debug else sys.argv)[1:]
if not args: # no arguments passed
LOGGER.info(CLI_HELP_MSG)
return
special = {
'help': lambda: LOGGER.info(CLI_HELP_MSG),
'checks': checks.check_yolo,
'version': lambda: LOGGER.info(__version__),
'settings': lambda: handle_yolo_settings(args[1:]),
'cfg': lambda: yaml_print(DEFAULT_CFG_PATH),
'hub': lambda: handle_yolo_hub(args[1:]),
'login': lambda: handle_yolo_hub(args),
'copy-cfg': copy_default_cfg}
full_args_dict = {**DEFAULT_CFG_DICT, **{k: None for k in TASKS}, **{k: None for k in MODES}, **special}
# Define common mis-uses of special commands, i.e. -h, -help, --help
special.update({k[0]: v for k, v in special.items()}) # singular
special.update({k[:-1]: v for k, v in special.items() if len(k) > 1 and k.endswith('s')}) # singular
special = {**special, **{f'-{k}': v for k, v in special.items()}, **{f'--{k}': v for k, v in special.items()}}
overrides = {} # basic overrides, i.e. imgsz=320
for a in merge_equals_args(args): # merge spaces around '=' sign
if a.startswith('--'):
LOGGER.warning(f"WARNING ⚠️ '{a}' does not require leading dashes '--', updating to '{a[2:]}'.")
a = a[2:]
if a.endswith(','):
LOGGER.warning(f"WARNING ⚠️ '{a}' does not require trailing comma ',', updating to '{a[:-1]}'.")
a = a[:-1]
if '=' in a:
try:
k, v = parse_key_value_pair(a)
if k == 'cfg': # custom.yaml passed
LOGGER.info(f'Overriding {DEFAULT_CFG_PATH} with {v}')
overrides = {k: val for k, val in yaml_load(checks.check_yaml(v)).items() if k != 'cfg'}
else:
overrides[k] = v
except (NameError, SyntaxError, ValueError, AssertionError) as e:
check_dict_alignment(full_args_dict, {a: ''}, e)
elif a in TASKS:
overrides['task'] = a
elif a in MODES:
overrides['mode'] = a
elif a.lower() in special:
special[a.lower()]()
return
elif a in DEFAULT_CFG_DICT and isinstance(DEFAULT_CFG_DICT[a], bool):
overrides[a] = True # auto-True for default bool args, i.e. 'yolo show' sets show=True
elif a in DEFAULT_CFG_DICT:
raise SyntaxError(f"'{colorstr('red', 'bold', a)}' is a valid YOLO argument but is missing an '=' sign "
f"to set its value, i.e. try '{a}={DEFAULT_CFG_DICT[a]}'\n{CLI_HELP_MSG}")
else:
check_dict_alignment(full_args_dict, {a: ''})
# Check keys
check_dict_alignment(full_args_dict, overrides)
# Mode
mode = overrides.get('mode')
if mode is None:
mode = DEFAULT_CFG.mode or 'predict'
LOGGER.warning(f"WARNING ⚠️ 'mode' is missing. Valid modes are {MODES}. Using default 'mode={mode}'.")
elif mode not in MODES:
raise ValueError(f"Invalid 'mode={mode}'. Valid modes are {MODES}.\n{CLI_HELP_MSG}")
# Task
task = overrides.pop('task', None)
if task:
if task not in TASKS:
raise ValueError(f"Invalid 'task={task}'. Valid tasks are {TASKS}.\n{CLI_HELP_MSG}")
if 'model' not in overrides:
overrides['model'] = TASK2MODEL[task]
# Model
model = overrides.pop('model', DEFAULT_CFG.model)
if model is None:
model = 'yolov8n.pt'
LOGGER.warning(f"WARNING ⚠️ 'model' is missing. Using default 'model={model}'.")
overrides['model'] = model
if 'rtdetr' in model.lower(): # guess architecture
from ultralytics import RTDETR
model = RTDETR(model) # no task argument
elif 'fastsam' in model.lower():
from ultralytics import FastSAM
model = FastSAM(model)
elif 'sam' in model.lower():
from ultralytics import SAM
model = SAM(model)
else:
from ultralytics import YOLO
model = YOLO(model, task=task)
if isinstance(overrides.get('pretrained'), str):
model.load(overrides['pretrained'])
# Task Update
if task != model.task:
if task:
LOGGER.warning(f"WARNING ⚠️ conflicting 'task={task}' passed with 'task={model.task}' model. "
f"Ignoring 'task={task}' and updating to 'task={model.task}' to match model.")
task = model.task
# Mode
if mode in ('predict', 'track') and 'source' not in overrides:
overrides['source'] = DEFAULT_CFG.source or ASSETS
LOGGER.warning(f"WARNING ⚠️ 'source' is missing. Using default 'source={overrides['source']}'.")
elif mode in ('train', 'val'):
if 'data' not in overrides and 'resume' not in overrides:
overrides['data'] = TASK2DATA.get(task or DEFAULT_CFG.task, DEFAULT_CFG.data)
LOGGER.warning(f"WARNING ⚠️ 'data' is missing. Using default 'data={overrides['data']}'.")
elif mode == 'export':
if 'format' not in overrides:
overrides['format'] = DEFAULT_CFG.format or 'torchscript'
LOGGER.warning(f"WARNING ⚠️ 'format' is missing. Using default 'format={overrides['format']}'.")
# Run command in python
# getattr(model, mode)(**vars(get_cfg(overrides=overrides))) # default args using default.yaml
getattr(model, mode)(**overrides) # default args from model
# Special modes --------------------------------------------------------------------------------------------------------
def copy_default_cfg():
"""Copy and create a new default configuration file with '_copy' appended to its name."""
new_file = Path.cwd() / DEFAULT_CFG_PATH.name.replace('.yaml', '_copy.yaml')
shutil.copy2(DEFAULT_CFG_PATH, new_file)
LOGGER.info(f'{DEFAULT_CFG_PATH} copied to {new_file}\n'
f"Example YOLO command with this new custom cfg:\n yolo cfg='{new_file}' imgsz=320 batch=8")
if __name__ == '__main__':
# Example: entrypoint(debug='yolo predict model=yolov8n.pt')
entrypoint(debug='')

View File

@ -0,0 +1,73 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Argoverse-HD dataset (ring-front-center camera) http://www.cs.cmu.edu/~mengtial/proj/streaming/ by Argo AI
# Example usage: yolo train data=Argoverse.yaml
# parent
# ├── ultralytics
# └── datasets
# └── Argoverse ← downloads here (31.5 GB)
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
path: ../datasets/Argoverse # dataset root dir
train: Argoverse-1.1/images/train/ # train images (relative to 'path') 39384 images
val: Argoverse-1.1/images/val/ # val images (relative to 'path') 15062 images
test: Argoverse-1.1/images/test/ # test images (optional) https://eval.ai/web/challenges/challenge-page/800/overview
# Classes
names:
0: person
1: bicycle
2: car
3: motorcycle
4: bus
5: truck
6: traffic_light
7: stop_sign
# Download script/URL (optional) ---------------------------------------------------------------------------------------
download: |
import json
from tqdm import tqdm
from ultralytics.utils.downloads import download
from pathlib import Path
def argoverse2yolo(set):
labels = {}
a = json.load(open(set, "rb"))
for annot in tqdm(a['annotations'], desc=f"Converting {set} to YOLOv5 format..."):
img_id = annot['image_id']
img_name = a['images'][img_id]['name']
img_label_name = f'{img_name[:-3]}txt'
cls = annot['category_id'] # instance class id
x_center, y_center, width, height = annot['bbox']
x_center = (x_center + width / 2) / 1920.0 # offset and scale
y_center = (y_center + height / 2) / 1200.0 # offset and scale
width /= 1920.0 # scale
height /= 1200.0 # scale
img_dir = set.parents[2] / 'Argoverse-1.1' / 'labels' / a['seq_dirs'][a['images'][annot['image_id']]['sid']]
if not img_dir.exists():
img_dir.mkdir(parents=True, exist_ok=True)
k = str(img_dir / img_label_name)
if k not in labels:
labels[k] = []
labels[k].append(f"{cls} {x_center} {y_center} {width} {height}\n")
for k in labels:
with open(k, "w") as f:
f.writelines(labels[k])
# Download 'https://argoverse-hd.s3.us-east-2.amazonaws.com/Argoverse-HD-Full.zip' (deprecated S3 link)
dir = Path(yaml['path']) # dataset root dir
urls = ['https://drive.google.com/file/d/1st9qW3BeIwQsnR0t8mRpvbsSWIo16ACi/view?usp=drive_link']
download(urls, dir=dir)
# Convert
annotations_dir = 'Argoverse-HD/annotations/'
(dir / 'Argoverse-1.1' / 'tracking').rename(dir / 'Argoverse-1.1' / 'images') # rename 'tracking' to 'images'
for d in "train.json", "val.json":
argoverse2yolo(dir / annotations_dir / d) # convert Argoverse annotations to YOLO labels

View File

@ -0,0 +1,37 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# DOTA 2.0 dataset https://captain-whu.github.io/DOTA/index.html for object detection in aerial images by Wuhan University
# Example usage: yolo train model=yolov8n-obb.pt data=DOTAv2.yaml
# parent
# ├── ultralytics
# └── datasets
# └── dota2 ← downloads here (2GB)
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
path: ../datasets/DOTAv2 # dataset root dir
train: images/train # train images (relative to 'path') 1411 images
val: images/val # val images (relative to 'path') 458 images
test: images/test # test images (optional) 937 images
# Classes for DOTA 2.0
names:
0: plane
1: ship
2: storage tank
3: baseball diamond
4: tennis court
5: basketball court
6: ground track field
7: harbor
8: bridge
9: large vehicle
10: small vehicle
11: helicopter
12: roundabout
13: soccer ball field
14: swimming pool
15: container crane
16: airport
17: helipad
# Download script/URL (optional)
download: https://github.com/ultralytics/yolov5/releases/download/v1.0/DOTAv2.zip

View File

@ -0,0 +1,54 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Global Wheat 2020 dataset http://www.global-wheat.com/ by University of Saskatchewan
# Example usage: yolo train data=GlobalWheat2020.yaml
# parent
# ├── ultralytics
# └── datasets
# └── GlobalWheat2020 ← downloads here (7.0 GB)
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
path: ../datasets/GlobalWheat2020 # dataset root dir
train: # train images (relative to 'path') 3422 images
- images/arvalis_1
- images/arvalis_2
- images/arvalis_3
- images/ethz_1
- images/rres_1
- images/inrae_1
- images/usask_1
val: # val images (relative to 'path') 748 images (WARNING: train set contains ethz_1)
- images/ethz_1
test: # test images (optional) 1276 images
- images/utokyo_1
- images/utokyo_2
- images/nau_1
- images/uq_1
# Classes
names:
0: wheat_head
# Download script/URL (optional) ---------------------------------------------------------------------------------------
download: |
from ultralytics.utils.downloads import download
from pathlib import Path
# Download
dir = Path(yaml['path']) # dataset root dir
urls = ['https://zenodo.org/record/4298502/files/global-wheat-codalab-official.zip',
'https://github.com/ultralytics/yolov5/releases/download/v1.0/GlobalWheat2020_labels.zip']
download(urls, dir=dir)
# Make Directories
for p in 'annotations', 'images', 'labels':
(dir / p).mkdir(parents=True, exist_ok=True)
# Move
for p in 'arvalis_1', 'arvalis_2', 'arvalis_3', 'ethz_1', 'rres_1', 'inrae_1', 'usask_1', \
'utokyo_1', 'utokyo_2', 'nau_1', 'uq_1':
(dir / 'global-wheat-codalab-official' / p).rename(dir / 'images' / p) # move to /images
f = (dir / 'global-wheat-codalab-official' / p).with_suffix('.json') # json file
if f.exists():
f.rename((dir / 'annotations' / p).with_suffix('.json')) # move to /annotations

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,443 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Objects365 dataset https://www.objects365.org/ by Megvii
# Example usage: yolo train data=Objects365.yaml
# parent
# ├── ultralytics
# └── datasets
# └── Objects365 ← downloads here (712 GB = 367G data + 345G zips)
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
path: ../datasets/Objects365 # dataset root dir
train: images/train # train images (relative to 'path') 1742289 images
val: images/val # val images (relative to 'path') 80000 images
test: # test images (optional)
# Classes
names:
0: Person
1: Sneakers
2: Chair
3: Other Shoes
4: Hat
5: Car
6: Lamp
7: Glasses
8: Bottle
9: Desk
10: Cup
11: Street Lights
12: Cabinet/shelf
13: Handbag/Satchel
14: Bracelet
15: Plate
16: Picture/Frame
17: Helmet
18: Book
19: Gloves
20: Storage box
21: Boat
22: Leather Shoes
23: Flower
24: Bench
25: Potted Plant
26: Bowl/Basin
27: Flag
28: Pillow
29: Boots
30: Vase
31: Microphone
32: Necklace
33: Ring
34: SUV
35: Wine Glass
36: Belt
37: Monitor/TV
38: Backpack
39: Umbrella
40: Traffic Light
41: Speaker
42: Watch
43: Tie
44: Trash bin Can
45: Slippers
46: Bicycle
47: Stool
48: Barrel/bucket
49: Van
50: Couch
51: Sandals
52: Basket
53: Drum
54: Pen/Pencil
55: Bus
56: Wild Bird
57: High Heels
58: Motorcycle
59: Guitar
60: Carpet
61: Cell Phone
62: Bread
63: Camera
64: Canned
65: Truck
66: Traffic cone
67: Cymbal
68: Lifesaver
69: Towel
70: Stuffed Toy
71: Candle
72: Sailboat
73: Laptop
74: Awning
75: Bed
76: Faucet
77: Tent
78: Horse
79: Mirror
80: Power outlet
81: Sink
82: Apple
83: Air Conditioner
84: Knife
85: Hockey Stick
86: Paddle
87: Pickup Truck
88: Fork
89: Traffic Sign
90: Balloon
91: Tripod
92: Dog
93: Spoon
94: Clock
95: Pot
96: Cow
97: Cake
98: Dinning Table
99: Sheep
100: Hanger
101: Blackboard/Whiteboard
102: Napkin
103: Other Fish
104: Orange/Tangerine
105: Toiletry
106: Keyboard
107: Tomato
108: Lantern
109: Machinery Vehicle
110: Fan
111: Green Vegetables
112: Banana
113: Baseball Glove
114: Airplane
115: Mouse
116: Train
117: Pumpkin
118: Soccer
119: Skiboard
120: Luggage
121: Nightstand
122: Tea pot
123: Telephone
124: Trolley
125: Head Phone
126: Sports Car
127: Stop Sign
128: Dessert
129: Scooter
130: Stroller
131: Crane
132: Remote
133: Refrigerator
134: Oven
135: Lemon
136: Duck
137: Baseball Bat
138: Surveillance Camera
139: Cat
140: Jug
141: Broccoli
142: Piano
143: Pizza
144: Elephant
145: Skateboard
146: Surfboard
147: Gun
148: Skating and Skiing shoes
149: Gas stove
150: Donut
151: Bow Tie
152: Carrot
153: Toilet
154: Kite
155: Strawberry
156: Other Balls
157: Shovel
158: Pepper
159: Computer Box
160: Toilet Paper
161: Cleaning Products
162: Chopsticks
163: Microwave
164: Pigeon
165: Baseball
166: Cutting/chopping Board
167: Coffee Table
168: Side Table
169: Scissors
170: Marker
171: Pie
172: Ladder
173: Snowboard
174: Cookies
175: Radiator
176: Fire Hydrant
177: Basketball
178: Zebra
179: Grape
180: Giraffe
181: Potato
182: Sausage
183: Tricycle
184: Violin
185: Egg
186: Fire Extinguisher
187: Candy
188: Fire Truck
189: Billiards
190: Converter
191: Bathtub
192: Wheelchair
193: Golf Club
194: Briefcase
195: Cucumber
196: Cigar/Cigarette
197: Paint Brush
198: Pear
199: Heavy Truck
200: Hamburger
201: Extractor
202: Extension Cord
203: Tong
204: Tennis Racket
205: Folder
206: American Football
207: earphone
208: Mask
209: Kettle
210: Tennis
211: Ship
212: Swing
213: Coffee Machine
214: Slide
215: Carriage
216: Onion
217: Green beans
218: Projector
219: Frisbee
220: Washing Machine/Drying Machine
221: Chicken
222: Printer
223: Watermelon
224: Saxophone
225: Tissue
226: Toothbrush
227: Ice cream
228: Hot-air balloon
229: Cello
230: French Fries
231: Scale
232: Trophy
233: Cabbage
234: Hot dog
235: Blender
236: Peach
237: Rice
238: Wallet/Purse
239: Volleyball
240: Deer
241: Goose
242: Tape
243: Tablet
244: Cosmetics
245: Trumpet
246: Pineapple
247: Golf Ball
248: Ambulance
249: Parking meter
250: Mango
251: Key
252: Hurdle
253: Fishing Rod
254: Medal
255: Flute
256: Brush
257: Penguin
258: Megaphone
259: Corn
260: Lettuce
261: Garlic
262: Swan
263: Helicopter
264: Green Onion
265: Sandwich
266: Nuts
267: Speed Limit Sign
268: Induction Cooker
269: Broom
270: Trombone
271: Plum
272: Rickshaw
273: Goldfish
274: Kiwi fruit
275: Router/modem
276: Poker Card
277: Toaster
278: Shrimp
279: Sushi
280: Cheese
281: Notepaper
282: Cherry
283: Pliers
284: CD
285: Pasta
286: Hammer
287: Cue
288: Avocado
289: Hamimelon
290: Flask
291: Mushroom
292: Screwdriver
293: Soap
294: Recorder
295: Bear
296: Eggplant
297: Board Eraser
298: Coconut
299: Tape Measure/Ruler
300: Pig
301: Showerhead
302: Globe
303: Chips
304: Steak
305: Crosswalk Sign
306: Stapler
307: Camel
308: Formula 1
309: Pomegranate
310: Dishwasher
311: Crab
312: Hoverboard
313: Meat ball
314: Rice Cooker
315: Tuba
316: Calculator
317: Papaya
318: Antelope
319: Parrot
320: Seal
321: Butterfly
322: Dumbbell
323: Donkey
324: Lion
325: Urinal
326: Dolphin
327: Electric Drill
328: Hair Dryer
329: Egg tart
330: Jellyfish
331: Treadmill
332: Lighter
333: Grapefruit
334: Game board
335: Mop
336: Radish
337: Baozi
338: Target
339: French
340: Spring Rolls
341: Monkey
342: Rabbit
343: Pencil Case
344: Yak
345: Red Cabbage
346: Binoculars
347: Asparagus
348: Barbell
349: Scallop
350: Noddles
351: Comb
352: Dumpling
353: Oyster
354: Table Tennis paddle
355: Cosmetics Brush/Eyeliner Pencil
356: Chainsaw
357: Eraser
358: Lobster
359: Durian
360: Okra
361: Lipstick
362: Cosmetics Mirror
363: Curling
364: Table Tennis
# Download script/URL (optional) ---------------------------------------------------------------------------------------
download: |
from tqdm import tqdm
from ultralytics.utils.checks import check_requirements
from ultralytics.utils.downloads import download
from ultralytics.utils.ops import xyxy2xywhn
import numpy as np
from pathlib import Path
check_requirements(('pycocotools>=2.0',))
from pycocotools.coco import COCO
# Make Directories
dir = Path(yaml['path']) # dataset root dir
for p in 'images', 'labels':
(dir / p).mkdir(parents=True, exist_ok=True)
for q in 'train', 'val':
(dir / p / q).mkdir(parents=True, exist_ok=True)
# Train, Val Splits
for split, patches in [('train', 50 + 1), ('val', 43 + 1)]:
print(f"Processing {split} in {patches} patches ...")
images, labels = dir / 'images' / split, dir / 'labels' / split
# Download
url = f"https://dorc.ks3-cn-beijing.ksyun.com/data-set/2020Objects365%E6%95%B0%E6%8D%AE%E9%9B%86/{split}/"
if split == 'train':
download([f'{url}zhiyuan_objv2_{split}.tar.gz'], dir=dir) # annotations json
download([f'{url}patch{i}.tar.gz' for i in range(patches)], dir=images, curl=True, threads=8)
elif split == 'val':
download([f'{url}zhiyuan_objv2_{split}.json'], dir=dir) # annotations json
download([f'{url}images/v1/patch{i}.tar.gz' for i in range(15 + 1)], dir=images, curl=True, threads=8)
download([f'{url}images/v2/patch{i}.tar.gz' for i in range(16, patches)], dir=images, curl=True, threads=8)
# Move
for f in tqdm(images.rglob('*.jpg'), desc=f'Moving {split} images'):
f.rename(images / f.name) # move to /images/{split}
# Labels
coco = COCO(dir / f'zhiyuan_objv2_{split}.json')
names = [x["name"] for x in coco.loadCats(coco.getCatIds())]
for cid, cat in enumerate(names):
catIds = coco.getCatIds(catNms=[cat])
imgIds = coco.getImgIds(catIds=catIds)
for im in tqdm(coco.loadImgs(imgIds), desc=f'Class {cid + 1}/{len(names)} {cat}'):
width, height = im["width"], im["height"]
path = Path(im["file_name"]) # image filename
try:
with open(labels / path.with_suffix('.txt').name, 'a') as file:
annIds = coco.getAnnIds(imgIds=im["id"], catIds=catIds, iscrowd=None)
for a in coco.loadAnns(annIds):
x, y, w, h = a['bbox'] # bounding box in xywh (xy top-left corner)
xyxy = np.array([x, y, x + w, y + h])[None] # pixels(1,4)
x, y, w, h = xyxy2xywhn(xyxy, w=width, h=height, clip=True)[0] # normalized and clipped
file.write(f"{cid} {x:.5f} {y:.5f} {w:.5f} {h:.5f}\n")
except Exception as e:
print(e)

View File

@ -0,0 +1,58 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# SKU-110K retail items dataset https://github.com/eg4000/SKU110K_CVPR19 by Trax Retail
# Example usage: yolo train data=SKU-110K.yaml
# parent
# ├── ultralytics
# └── datasets
# └── SKU-110K ← downloads here (13.6 GB)
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
path: ../datasets/SKU-110K # dataset root dir
train: train.txt # train images (relative to 'path') 8219 images
val: val.txt # val images (relative to 'path') 588 images
test: test.txt # test images (optional) 2936 images
# Classes
names:
0: object
# Download script/URL (optional) ---------------------------------------------------------------------------------------
download: |
import shutil
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from ultralytics.utils.downloads import download
from ultralytics.utils.ops import xyxy2xywh
# Download
dir = Path(yaml['path']) # dataset root dir
parent = Path(dir.parent) # download dir
urls = ['http://trax-geometry.s3.amazonaws.com/cvpr_challenge/SKU110K_fixed.tar.gz']
download(urls, dir=parent)
# Rename directories
if dir.exists():
shutil.rmtree(dir)
(parent / 'SKU110K_fixed').rename(dir) # rename dir
(dir / 'labels').mkdir(parents=True, exist_ok=True) # create labels dir
# Convert labels
names = 'image', 'x1', 'y1', 'x2', 'y2', 'class', 'image_width', 'image_height' # column names
for d in 'annotations_train.csv', 'annotations_val.csv', 'annotations_test.csv':
x = pd.read_csv(dir / 'annotations' / d, names=names).values # annotations
images, unique_images = x[:, 0], np.unique(x[:, 0])
with open((dir / d).with_suffix('.txt').__str__().replace('annotations_', ''), 'w') as f:
f.writelines(f'./images/{s}\n' for s in unique_images)
for im in tqdm(unique_images, desc=f'Converting {dir / d}'):
cls = 0 # single-class dataset
with open((dir / 'labels' / im).with_suffix('.txt'), 'a') as f:
for r in x[images == im]:
w, h = r[6], r[7] # image width, height
xywh = xyxy2xywh(np.array([[r[1] / w, r[2] / h, r[3] / w, r[4] / h]]))[0] # instance
f.write(f"{cls} {xywh[0]:.5f} {xywh[1]:.5f} {xywh[2]:.5f} {xywh[3]:.5f}\n") # write label

View File

@ -0,0 +1,100 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# PASCAL VOC dataset http://host.robots.ox.ac.uk/pascal/VOC by University of Oxford
# Example usage: yolo train data=VOC.yaml
# parent
# ├── ultralytics
# └── datasets
# └── VOC ← downloads here (2.8 GB)
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
path: ../datasets/VOC
train: # train images (relative to 'path') 16551 images
- images/train2012
- images/train2007
- images/val2012
- images/val2007
val: # val images (relative to 'path') 4952 images
- images/test2007
test: # test images (optional)
- images/test2007
# Classes
names:
0: aeroplane
1: bicycle
2: bird
3: boat
4: bottle
5: bus
6: car
7: cat
8: chair
9: cow
10: diningtable
11: dog
12: horse
13: motorbike
14: person
15: pottedplant
16: sheep
17: sofa
18: train
19: tvmonitor
# Download script/URL (optional) ---------------------------------------------------------------------------------------
download: |
import xml.etree.ElementTree as ET
from tqdm import tqdm
from ultralytics.utils.downloads import download
from pathlib import Path
def convert_label(path, lb_path, year, image_id):
def convert_box(size, box):
dw, dh = 1. / size[0], 1. / size[1]
x, y, w, h = (box[0] + box[1]) / 2.0 - 1, (box[2] + box[3]) / 2.0 - 1, box[1] - box[0], box[3] - box[2]
return x * dw, y * dh, w * dw, h * dh
in_file = open(path / f'VOC{year}/Annotations/{image_id}.xml')
out_file = open(lb_path, 'w')
tree = ET.parse(in_file)
root = tree.getroot()
size = root.find('size')
w = int(size.find('width').text)
h = int(size.find('height').text)
names = list(yaml['names'].values()) # names list
for obj in root.iter('object'):
cls = obj.find('name').text
if cls in names and int(obj.find('difficult').text) != 1:
xmlbox = obj.find('bndbox')
bb = convert_box((w, h), [float(xmlbox.find(x).text) for x in ('xmin', 'xmax', 'ymin', 'ymax')])
cls_id = names.index(cls) # class id
out_file.write(" ".join(str(a) for a in (cls_id, *bb)) + '\n')
# Download
dir = Path(yaml['path']) # dataset root dir
url = 'https://github.com/ultralytics/yolov5/releases/download/v1.0/'
urls = [f'{url}VOCtrainval_06-Nov-2007.zip', # 446MB, 5012 images
f'{url}VOCtest_06-Nov-2007.zip', # 438MB, 4953 images
f'{url}VOCtrainval_11-May-2012.zip'] # 1.95GB, 17126 images
download(urls, dir=dir / 'images', curl=True, threads=3)
# Convert
path = dir / 'images/VOCdevkit'
for year, image_set in ('2012', 'train'), ('2012', 'val'), ('2007', 'train'), ('2007', 'val'), ('2007', 'test'):
imgs_path = dir / 'images' / f'{image_set}{year}'
lbs_path = dir / 'labels' / f'{image_set}{year}'
imgs_path.mkdir(exist_ok=True, parents=True)
lbs_path.mkdir(exist_ok=True, parents=True)
with open(path / f'VOC{year}/ImageSets/Main/{image_set}.txt') as f:
image_ids = f.read().strip().split()
for id in tqdm(image_ids, desc=f'{image_set}{year}'):
f = path / f'VOC{year}/JPEGImages/{id}.jpg' # old img path
lb_path = (lbs_path / f.name).with_suffix('.txt') # new label path
f.rename(imgs_path / f.name) # move image
convert_label(path, lb_path, year, id) # convert labels to YOLO format

View File

@ -0,0 +1,73 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# VisDrone2019-DET dataset https://github.com/VisDrone/VisDrone-Dataset by Tianjin University
# Example usage: yolo train data=VisDrone.yaml
# parent
# ├── ultralytics
# └── datasets
# └── VisDrone ← downloads here (2.3 GB)
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
path: ../datasets/VisDrone # dataset root dir
train: VisDrone2019-DET-train/images # train images (relative to 'path') 6471 images
val: VisDrone2019-DET-val/images # val images (relative to 'path') 548 images
test: VisDrone2019-DET-test-dev/images # test images (optional) 1610 images
# Classes
names:
0: pedestrian
1: people
2: bicycle
3: car
4: van
5: truck
6: tricycle
7: awning-tricycle
8: bus
9: motor
# Download script/URL (optional) ---------------------------------------------------------------------------------------
download: |
import os
from pathlib import Path
from ultralytics.utils.downloads import download
def visdrone2yolo(dir):
from PIL import Image
from tqdm import tqdm
def convert_box(size, box):
# Convert VisDrone box to YOLO xywh box
dw = 1. / size[0]
dh = 1. / size[1]
return (box[0] + box[2] / 2) * dw, (box[1] + box[3] / 2) * dh, box[2] * dw, box[3] * dh
(dir / 'labels').mkdir(parents=True, exist_ok=True) # make labels directory
pbar = tqdm((dir / 'annotations').glob('*.txt'), desc=f'Converting {dir}')
for f in pbar:
img_size = Image.open((dir / 'images' / f.name).with_suffix('.jpg')).size
lines = []
with open(f, 'r') as file: # read annotation.txt
for row in [x.split(',') for x in file.read().strip().splitlines()]:
if row[4] == '0': # VisDrone 'ignored regions' class 0
continue
cls = int(row[5]) - 1
box = convert_box(img_size, tuple(map(int, row[:4])))
lines.append(f"{cls} {' '.join(f'{x:.6f}' for x in box)}\n")
with open(str(f).replace(f'{os.sep}annotations{os.sep}', f'{os.sep}labels{os.sep}'), 'w') as fl:
fl.writelines(lines) # write label.txt
# Download
dir = Path(yaml['path']) # dataset root dir
urls = ['https://github.com/ultralytics/yolov5/releases/download/v1.0/VisDrone2019-DET-train.zip',
'https://github.com/ultralytics/yolov5/releases/download/v1.0/VisDrone2019-DET-val.zip',
'https://github.com/ultralytics/yolov5/releases/download/v1.0/VisDrone2019-DET-test-dev.zip',
'https://github.com/ultralytics/yolov5/releases/download/v1.0/VisDrone2019-DET-test-challenge.zip']
download(urls, dir=dir, curl=True, threads=4)
# Convert
for d in 'VisDrone2019-DET-train', 'VisDrone2019-DET-val', 'VisDrone2019-DET-test-dev':
visdrone2yolo(dir / d) # convert VisDrone annotations to YOLO labels

View File

@ -0,0 +1,38 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# COCO 2017 dataset http://cocodataset.org by Microsoft
# Example usage: yolo train data=coco-pose.yaml
# parent
# ├── ultralytics
# └── datasets
# └── coco-pose ← downloads here (20.1 GB)
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
path: ../datasets/coco-pose # dataset root dir
train: train2017.txt # train images (relative to 'path') 118287 images
val: val2017.txt # val images (relative to 'path') 5000 images
test: test-dev2017.txt # 20288 of 40670 images, submit to https://competitions.codalab.org/competitions/20794
# Keypoints
kpt_shape: [17, 3] # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible)
flip_idx: [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
# Classes
names:
0: person
# Download script/URL (optional)
download: |
from ultralytics.utils.downloads import download
from pathlib import Path
# Download labels
dir = Path(yaml['path']) # dataset root dir
url = 'https://github.com/ultralytics/yolov5/releases/download/v1.0/'
urls = [url + 'coco2017labels-pose.zip'] # labels
download(urls, dir=dir.parent)
# Download data
urls = ['http://images.cocodataset.org/zips/train2017.zip', # 19G, 118k images
'http://images.cocodataset.org/zips/val2017.zip', # 1G, 5k images
'http://images.cocodataset.org/zips/test2017.zip'] # 7G, 41k images (optional)
download(urls, dir=dir / 'images', threads=3)

View File

@ -0,0 +1,115 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# COCO 2017 dataset http://cocodataset.org by Microsoft
# Example usage: yolo train data=coco.yaml
# parent
# ├── ultralytics
# └── datasets
# └── coco ← downloads here (20.1 GB)
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
path: ../datasets/coco # dataset root dir
train: train2017.txt # train images (relative to 'path') 118287 images
val: val2017.txt # val images (relative to 'path') 5000 images
test: test-dev2017.txt # 20288 of 40670 images, submit to https://competitions.codalab.org/competitions/20794
# Classes
names:
0: person
1: bicycle
2: car
3: motorcycle
4: airplane
5: bus
6: train
7: truck
8: boat
9: traffic light
10: fire hydrant
11: stop sign
12: parking meter
13: bench
14: bird
15: cat
16: dog
17: horse
18: sheep
19: cow
20: elephant
21: bear
22: zebra
23: giraffe
24: backpack
25: umbrella
26: handbag
27: tie
28: suitcase
29: frisbee
30: skis
31: snowboard
32: sports ball
33: kite
34: baseball bat
35: baseball glove
36: skateboard
37: surfboard
38: tennis racket
39: bottle
40: wine glass
41: cup
42: fork
43: knife
44: spoon
45: bowl
46: banana
47: apple
48: sandwich
49: orange
50: broccoli
51: carrot
52: hot dog
53: pizza
54: donut
55: cake
56: chair
57: couch
58: potted plant
59: bed
60: dining table
61: toilet
62: tv
63: laptop
64: mouse
65: remote
66: keyboard
67: cell phone
68: microwave
69: oven
70: toaster
71: sink
72: refrigerator
73: book
74: clock
75: vase
76: scissors
77: teddy bear
78: hair drier
79: toothbrush
# Download script/URL (optional)
download: |
from ultralytics.utils.downloads import download
from pathlib import Path
# Download labels
segments = True # segment or box labels
dir = Path(yaml['path']) # dataset root dir
url = 'https://github.com/ultralytics/yolov5/releases/download/v1.0/'
urls = [url + ('coco2017labels-segments.zip' if segments else 'coco2017labels.zip')] # labels
download(urls, dir=dir.parent)
# Download data
urls = ['http://images.cocodataset.org/zips/train2017.zip', # 19G, 118k images
'http://images.cocodataset.org/zips/val2017.zip', # 1G, 5k images
'http://images.cocodataset.org/zips/test2017.zip'] # 7G, 41k images (optional)
download(urls, dir=dir / 'images', threads=3)

View File

@ -0,0 +1,101 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# COCO128-seg dataset https://www.kaggle.com/ultralytics/coco128 (first 128 images from COCO train2017) by Ultralytics
# Example usage: yolo train data=coco128.yaml
# parent
# ├── ultralytics
# └── datasets
# └── coco128-seg ← downloads here (7 MB)
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
path: ../datasets/coco128-seg # dataset root dir
train: images/train2017 # train images (relative to 'path') 128 images
val: images/train2017 # val images (relative to 'path') 128 images
test: # test images (optional)
# Classes
names:
0: person
1: bicycle
2: car
3: motorcycle
4: airplane
5: bus
6: train
7: truck
8: boat
9: traffic light
10: fire hydrant
11: stop sign
12: parking meter
13: bench
14: bird
15: cat
16: dog
17: horse
18: sheep
19: cow
20: elephant
21: bear
22: zebra
23: giraffe
24: backpack
25: umbrella
26: handbag
27: tie
28: suitcase
29: frisbee
30: skis
31: snowboard
32: sports ball
33: kite
34: baseball bat
35: baseball glove
36: skateboard
37: surfboard
38: tennis racket
39: bottle
40: wine glass
41: cup
42: fork
43: knife
44: spoon
45: bowl
46: banana
47: apple
48: sandwich
49: orange
50: broccoli
51: carrot
52: hot dog
53: pizza
54: donut
55: cake
56: chair
57: couch
58: potted plant
59: bed
60: dining table
61: toilet
62: tv
63: laptop
64: mouse
65: remote
66: keyboard
67: cell phone
68: microwave
69: oven
70: toaster
71: sink
72: refrigerator
73: book
74: clock
75: vase
76: scissors
77: teddy bear
78: hair drier
79: toothbrush
# Download script/URL (optional)
download: https://ultralytics.com/assets/coco128-seg.zip

View File

@ -0,0 +1,101 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# COCO128 dataset https://www.kaggle.com/ultralytics/coco128 (first 128 images from COCO train2017) by Ultralytics
# Example usage: yolo train data=coco128.yaml
# parent
# ├── ultralytics
# └── datasets
# └── coco128 ← downloads here (7 MB)
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
path: ../datasets/coco128 # dataset root dir
train: images/train2017 # train images (relative to 'path') 128 images
val: images/train2017 # val images (relative to 'path') 128 images
test: # test images (optional)
# Classes
names:
0: person
1: bicycle
2: car
3: motorcycle
4: airplane
5: bus
6: train
7: truck
8: boat
9: traffic light
10: fire hydrant
11: stop sign
12: parking meter
13: bench
14: bird
15: cat
16: dog
17: horse
18: sheep
19: cow
20: elephant
21: bear
22: zebra
23: giraffe
24: backpack
25: umbrella
26: handbag
27: tie
28: suitcase
29: frisbee
30: skis
31: snowboard
32: sports ball
33: kite
34: baseball bat
35: baseball glove
36: skateboard
37: surfboard
38: tennis racket
39: bottle
40: wine glass
41: cup
42: fork
43: knife
44: spoon
45: bowl
46: banana
47: apple
48: sandwich
49: orange
50: broccoli
51: carrot
52: hot dog
53: pizza
54: donut
55: cake
56: chair
57: couch
58: potted plant
59: bed
60: dining table
61: toilet
62: tv
63: laptop
64: mouse
65: remote
66: keyboard
67: cell phone
68: microwave
69: oven
70: toaster
71: sink
72: refrigerator
73: book
74: clock
75: vase
76: scissors
77: teddy bear
78: hair drier
79: toothbrush
# Download script/URL (optional)
download: https://ultralytics.com/assets/coco128.zip

View File

@ -0,0 +1,25 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# COCO8-pose dataset (first 8 images from COCO train2017) by Ultralytics
# Example usage: yolo train data=coco8-pose.yaml
# parent
# ├── ultralytics
# └── datasets
# └── coco8-pose ← downloads here (1 MB)
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
path: ../datasets/coco8-pose # dataset root dir
train: images/train # train images (relative to 'path') 4 images
val: images/val # val images (relative to 'path') 4 images
test: # test images (optional)
# Keypoints
kpt_shape: [17, 3] # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible)
flip_idx: [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
# Classes
names:
0: person
# Download script/URL (optional)
download: https://ultralytics.com/assets/coco8-pose.zip

View File

@ -0,0 +1,101 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# COCO8-seg dataset (first 8 images from COCO train2017) by Ultralytics
# Example usage: yolo train data=coco8-seg.yaml
# parent
# ├── ultralytics
# └── datasets
# └── coco8-seg ← downloads here (1 MB)
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
path: ../datasets/coco8-seg # dataset root dir
train: images/train # train images (relative to 'path') 4 images
val: images/val # val images (relative to 'path') 4 images
test: # test images (optional)
# Classes
names:
0: person
1: bicycle
2: car
3: motorcycle
4: airplane
5: bus
6: train
7: truck
8: boat
9: traffic light
10: fire hydrant
11: stop sign
12: parking meter
13: bench
14: bird
15: cat
16: dog
17: horse
18: sheep
19: cow
20: elephant
21: bear
22: zebra
23: giraffe
24: backpack
25: umbrella
26: handbag
27: tie
28: suitcase
29: frisbee
30: skis
31: snowboard
32: sports ball
33: kite
34: baseball bat
35: baseball glove
36: skateboard
37: surfboard
38: tennis racket
39: bottle
40: wine glass
41: cup
42: fork
43: knife
44: spoon
45: bowl
46: banana
47: apple
48: sandwich
49: orange
50: broccoli
51: carrot
52: hot dog
53: pizza
54: donut
55: cake
56: chair
57: couch
58: potted plant
59: bed
60: dining table
61: toilet
62: tv
63: laptop
64: mouse
65: remote
66: keyboard
67: cell phone
68: microwave
69: oven
70: toaster
71: sink
72: refrigerator
73: book
74: clock
75: vase
76: scissors
77: teddy bear
78: hair drier
79: toothbrush
# Download script/URL (optional)
download: https://ultralytics.com/assets/coco8-seg.zip

View File

@ -0,0 +1,101 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# COCO8 dataset (first 8 images from COCO train2017) by Ultralytics
# Example usage: yolo train data=coco8.yaml
# parent
# ├── ultralytics
# └── datasets
# └── coco8 ← downloads here (1 MB)
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
path: ../datasets/coco8 # dataset root dir
train: images/train # train images (relative to 'path') 4 images
val: images/val # val images (relative to 'path') 4 images
test: # test images (optional)
# Classes
names:
0: person
1: bicycle
2: car
3: motorcycle
4: airplane
5: bus
6: train
7: truck
8: boat
9: traffic light
10: fire hydrant
11: stop sign
12: parking meter
13: bench
14: bird
15: cat
16: dog
17: horse
18: sheep
19: cow
20: elephant
21: bear
22: zebra
23: giraffe
24: backpack
25: umbrella
26: handbag
27: tie
28: suitcase
29: frisbee
30: skis
31: snowboard
32: sports ball
33: kite
34: baseball bat
35: baseball glove
36: skateboard
37: surfboard
38: tennis racket
39: bottle
40: wine glass
41: cup
42: fork
43: knife
44: spoon
45: bowl
46: banana
47: apple
48: sandwich
49: orange
50: broccoli
51: carrot
52: hot dog
53: pizza
54: donut
55: cake
56: chair
57: couch
58: potted plant
59: bed
60: dining table
61: toilet
62: tv
63: laptop
64: mouse
65: remote
66: keyboard
67: cell phone
68: microwave
69: oven
70: toaster
71: sink
72: refrigerator
73: book
74: clock
75: vase
76: scissors
77: teddy bear
78: hair drier
79: toothbrush
# Download script/URL (optional)
download: https://ultralytics.com/assets/coco8.zip

View File

@ -0,0 +1,661 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Open Images v7 dataset https://storage.googleapis.com/openimages/web/index.html by Google
# Example usage: yolo train data=open-images-v7.yaml
# parent
# ├── ultralytics
# └── datasets
# └── open-images-v7 ← downloads here (561 GB)
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
path: ../datasets/open-images-v7 # dataset root dir
train: images/train # train images (relative to 'path') 1743042 images
val: images/val # val images (relative to 'path') 41620 images
test: # test images (optional)
# Classes
names:
0: Accordion
1: Adhesive tape
2: Aircraft
3: Airplane
4: Alarm clock
5: Alpaca
6: Ambulance
7: Animal
8: Ant
9: Antelope
10: Apple
11: Armadillo
12: Artichoke
13: Auto part
14: Axe
15: Backpack
16: Bagel
17: Baked goods
18: Balance beam
19: Ball
20: Balloon
21: Banana
22: Band-aid
23: Banjo
24: Barge
25: Barrel
26: Baseball bat
27: Baseball glove
28: Bat (Animal)
29: Bathroom accessory
30: Bathroom cabinet
31: Bathtub
32: Beaker
33: Bear
34: Bed
35: Bee
36: Beehive
37: Beer
38: Beetle
39: Bell pepper
40: Belt
41: Bench
42: Bicycle
43: Bicycle helmet
44: Bicycle wheel
45: Bidet
46: Billboard
47: Billiard table
48: Binoculars
49: Bird
50: Blender
51: Blue jay
52: Boat
53: Bomb
54: Book
55: Bookcase
56: Boot
57: Bottle
58: Bottle opener
59: Bow and arrow
60: Bowl
61: Bowling equipment
62: Box
63: Boy
64: Brassiere
65: Bread
66: Briefcase
67: Broccoli
68: Bronze sculpture
69: Brown bear
70: Building
71: Bull
72: Burrito
73: Bus
74: Bust
75: Butterfly
76: Cabbage
77: Cabinetry
78: Cake
79: Cake stand
80: Calculator
81: Camel
82: Camera
83: Can opener
84: Canary
85: Candle
86: Candy
87: Cannon
88: Canoe
89: Cantaloupe
90: Car
91: Carnivore
92: Carrot
93: Cart
94: Cassette deck
95: Castle
96: Cat
97: Cat furniture
98: Caterpillar
99: Cattle
100: Ceiling fan
101: Cello
102: Centipede
103: Chainsaw
104: Chair
105: Cheese
106: Cheetah
107: Chest of drawers
108: Chicken
109: Chime
110: Chisel
111: Chopsticks
112: Christmas tree
113: Clock
114: Closet
115: Clothing
116: Coat
117: Cocktail
118: Cocktail shaker
119: Coconut
120: Coffee
121: Coffee cup
122: Coffee table
123: Coffeemaker
124: Coin
125: Common fig
126: Common sunflower
127: Computer keyboard
128: Computer monitor
129: Computer mouse
130: Container
131: Convenience store
132: Cookie
133: Cooking spray
134: Corded phone
135: Cosmetics
136: Couch
137: Countertop
138: Cowboy hat
139: Crab
140: Cream
141: Cricket ball
142: Crocodile
143: Croissant
144: Crown
145: Crutch
146: Cucumber
147: Cupboard
148: Curtain
149: Cutting board
150: Dagger
151: Dairy Product
152: Deer
153: Desk
154: Dessert
155: Diaper
156: Dice
157: Digital clock
158: Dinosaur
159: Dishwasher
160: Dog
161: Dog bed
162: Doll
163: Dolphin
164: Door
165: Door handle
166: Doughnut
167: Dragonfly
168: Drawer
169: Dress
170: Drill (Tool)
171: Drink
172: Drinking straw
173: Drum
174: Duck
175: Dumbbell
176: Eagle
177: Earrings
178: Egg (Food)
179: Elephant
180: Envelope
181: Eraser
182: Face powder
183: Facial tissue holder
184: Falcon
185: Fashion accessory
186: Fast food
187: Fax
188: Fedora
189: Filing cabinet
190: Fire hydrant
191: Fireplace
192: Fish
193: Flag
194: Flashlight
195: Flower
196: Flowerpot
197: Flute
198: Flying disc
199: Food
200: Food processor
201: Football
202: Football helmet
203: Footwear
204: Fork
205: Fountain
206: Fox
207: French fries
208: French horn
209: Frog
210: Fruit
211: Frying pan
212: Furniture
213: Garden Asparagus
214: Gas stove
215: Giraffe
216: Girl
217: Glasses
218: Glove
219: Goat
220: Goggles
221: Goldfish
222: Golf ball
223: Golf cart
224: Gondola
225: Goose
226: Grape
227: Grapefruit
228: Grinder
229: Guacamole
230: Guitar
231: Hair dryer
232: Hair spray
233: Hamburger
234: Hammer
235: Hamster
236: Hand dryer
237: Handbag
238: Handgun
239: Harbor seal
240: Harmonica
241: Harp
242: Harpsichord
243: Hat
244: Headphones
245: Heater
246: Hedgehog
247: Helicopter
248: Helmet
249: High heels
250: Hiking equipment
251: Hippopotamus
252: Home appliance
253: Honeycomb
254: Horizontal bar
255: Horse
256: Hot dog
257: House
258: Houseplant
259: Human arm
260: Human beard
261: Human body
262: Human ear
263: Human eye
264: Human face
265: Human foot
266: Human hair
267: Human hand
268: Human head
269: Human leg
270: Human mouth
271: Human nose
272: Humidifier
273: Ice cream
274: Indoor rower
275: Infant bed
276: Insect
277: Invertebrate
278: Ipod
279: Isopod
280: Jacket
281: Jacuzzi
282: Jaguar (Animal)
283: Jeans
284: Jellyfish
285: Jet ski
286: Jug
287: Juice
288: Kangaroo
289: Kettle
290: Kitchen & dining room table
291: Kitchen appliance
292: Kitchen knife
293: Kitchen utensil
294: Kitchenware
295: Kite
296: Knife
297: Koala
298: Ladder
299: Ladle
300: Ladybug
301: Lamp
302: Land vehicle
303: Lantern
304: Laptop
305: Lavender (Plant)
306: Lemon
307: Leopard
308: Light bulb
309: Light switch
310: Lighthouse
311: Lily
312: Limousine
313: Lion
314: Lipstick
315: Lizard
316: Lobster
317: Loveseat
318: Luggage and bags
319: Lynx
320: Magpie
321: Mammal
322: Man
323: Mango
324: Maple
325: Maracas
326: Marine invertebrates
327: Marine mammal
328: Measuring cup
329: Mechanical fan
330: Medical equipment
331: Microphone
332: Microwave oven
333: Milk
334: Miniskirt
335: Mirror
336: Missile
337: Mixer
338: Mixing bowl
339: Mobile phone
340: Monkey
341: Moths and butterflies
342: Motorcycle
343: Mouse
344: Muffin
345: Mug
346: Mule
347: Mushroom
348: Musical instrument
349: Musical keyboard
350: Nail (Construction)
351: Necklace
352: Nightstand
353: Oboe
354: Office building
355: Office supplies
356: Orange
357: Organ (Musical Instrument)
358: Ostrich
359: Otter
360: Oven
361: Owl
362: Oyster
363: Paddle
364: Palm tree
365: Pancake
366: Panda
367: Paper cutter
368: Paper towel
369: Parachute
370: Parking meter
371: Parrot
372: Pasta
373: Pastry
374: Peach
375: Pear
376: Pen
377: Pencil case
378: Pencil sharpener
379: Penguin
380: Perfume
381: Person
382: Personal care
383: Personal flotation device
384: Piano
385: Picnic basket
386: Picture frame
387: Pig
388: Pillow
389: Pineapple
390: Pitcher (Container)
391: Pizza
392: Pizza cutter
393: Plant
394: Plastic bag
395: Plate
396: Platter
397: Plumbing fixture
398: Polar bear
399: Pomegranate
400: Popcorn
401: Porch
402: Porcupine
403: Poster
404: Potato
405: Power plugs and sockets
406: Pressure cooker
407: Pretzel
408: Printer
409: Pumpkin
410: Punching bag
411: Rabbit
412: Raccoon
413: Racket
414: Radish
415: Ratchet (Device)
416: Raven
417: Rays and skates
418: Red panda
419: Refrigerator
420: Remote control
421: Reptile
422: Rhinoceros
423: Rifle
424: Ring binder
425: Rocket
426: Roller skates
427: Rose
428: Rugby ball
429: Ruler
430: Salad
431: Salt and pepper shakers
432: Sandal
433: Sandwich
434: Saucer
435: Saxophone
436: Scale
437: Scarf
438: Scissors
439: Scoreboard
440: Scorpion
441: Screwdriver
442: Sculpture
443: Sea lion
444: Sea turtle
445: Seafood
446: Seahorse
447: Seat belt
448: Segway
449: Serving tray
450: Sewing machine
451: Shark
452: Sheep
453: Shelf
454: Shellfish
455: Shirt
456: Shorts
457: Shotgun
458: Shower
459: Shrimp
460: Sink
461: Skateboard
462: Ski
463: Skirt
464: Skull
465: Skunk
466: Skyscraper
467: Slow cooker
468: Snack
469: Snail
470: Snake
471: Snowboard
472: Snowman
473: Snowmobile
474: Snowplow
475: Soap dispenser
476: Sock
477: Sofa bed
478: Sombrero
479: Sparrow
480: Spatula
481: Spice rack
482: Spider
483: Spoon
484: Sports equipment
485: Sports uniform
486: Squash (Plant)
487: Squid
488: Squirrel
489: Stairs
490: Stapler
491: Starfish
492: Stationary bicycle
493: Stethoscope
494: Stool
495: Stop sign
496: Strawberry
497: Street light
498: Stretcher
499: Studio couch
500: Submarine
501: Submarine sandwich
502: Suit
503: Suitcase
504: Sun hat
505: Sunglasses
506: Surfboard
507: Sushi
508: Swan
509: Swim cap
510: Swimming pool
511: Swimwear
512: Sword
513: Syringe
514: Table
515: Table tennis racket
516: Tablet computer
517: Tableware
518: Taco
519: Tank
520: Tap
521: Tart
522: Taxi
523: Tea
524: Teapot
525: Teddy bear
526: Telephone
527: Television
528: Tennis ball
529: Tennis racket
530: Tent
531: Tiara
532: Tick
533: Tie
534: Tiger
535: Tin can
536: Tire
537: Toaster
538: Toilet
539: Toilet paper
540: Tomato
541: Tool
542: Toothbrush
543: Torch
544: Tortoise
545: Towel
546: Tower
547: Toy
548: Traffic light
549: Traffic sign
550: Train
551: Training bench
552: Treadmill
553: Tree
554: Tree house
555: Tripod
556: Trombone
557: Trousers
558: Truck
559: Trumpet
560: Turkey
561: Turtle
562: Umbrella
563: Unicycle
564: Van
565: Vase
566: Vegetable
567: Vehicle
568: Vehicle registration plate
569: Violin
570: Volleyball (Ball)
571: Waffle
572: Waffle iron
573: Wall clock
574: Wardrobe
575: Washing machine
576: Waste container
577: Watch
578: Watercraft
579: Watermelon
580: Weapon
581: Whale
582: Wheel
583: Wheelchair
584: Whisk
585: Whiteboard
586: Willow
587: Window
588: Window blind
589: Wine
590: Wine glass
591: Wine rack
592: Winter melon
593: Wok
594: Woman
595: Wood-burning stove
596: Woodpecker
597: Worm
598: Wrench
599: Zebra
600: Zucchini
# Download script/URL (optional) ---------------------------------------------------------------------------------------
download: |
from ultralytics.utils import LOGGER, SETTINGS, Path, is_ubuntu, get_ubuntu_version
from ultralytics.utils.checks import check_requirements, check_version
check_requirements('fiftyone')
if is_ubuntu() and check_version(get_ubuntu_version(), '>=22.04'):
# Ubuntu>=22.04 patch https://github.com/voxel51/fiftyone/issues/2961#issuecomment-1666519347
check_requirements('fiftyone-db-ubuntu2204')
import fiftyone as fo
import fiftyone.zoo as foz
import warnings
name = 'open-images-v7'
fraction = 1.0 # fraction of full dataset to use
LOGGER.warning('WARNING ⚠️ Open Images V7 dataset requires at least **561 GB of free space. Starting download...')
for split in 'train', 'validation': # 1743042 train, 41620 val images
train = split == 'train'
# Load Open Images dataset
dataset = foz.load_zoo_dataset(name,
split=split,
label_types=['detections'],
dataset_dir=Path(SETTINGS['datasets_dir']) / 'fiftyone' / name,
max_samples=round((1743042 if train else 41620) * fraction))
# Define classes
if train:
classes = dataset.default_classes # all classes
# classes = dataset.distinct('ground_truth.detections.label') # only observed classes
# Export to YOLO format
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=UserWarning, module="fiftyone.utils.yolo")
dataset.export(export_dir=str(Path(SETTINGS['datasets_dir']) / name),
dataset_type=fo.types.YOLOv5Dataset,
label_field='ground_truth',
split='val' if split == 'validation' else split,
classes=classes,
overwrite=train)

View File

@ -0,0 +1,153 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# DIUx xView 2018 Challenge https://challenge.xviewdataset.org by U.S. National Geospatial-Intelligence Agency (NGA)
# -------- DOWNLOAD DATA MANUALLY and jar xf val_images.zip to 'datasets/xView' before running train command! --------
# Example usage: yolo train data=xView.yaml
# parent
# ├── ultralytics
# └── datasets
# └── xView ← downloads here (20.7 GB)
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
path: ../datasets/xView # dataset root dir
train: images/autosplit_train.txt # train images (relative to 'path') 90% of 847 train images
val: images/autosplit_val.txt # train images (relative to 'path') 10% of 847 train images
# Classes
names:
0: Fixed-wing Aircraft
1: Small Aircraft
2: Cargo Plane
3: Helicopter
4: Passenger Vehicle
5: Small Car
6: Bus
7: Pickup Truck
8: Utility Truck
9: Truck
10: Cargo Truck
11: Truck w/Box
12: Truck Tractor
13: Trailer
14: Truck w/Flatbed
15: Truck w/Liquid
16: Crane Truck
17: Railway Vehicle
18: Passenger Car
19: Cargo Car
20: Flat Car
21: Tank car
22: Locomotive
23: Maritime Vessel
24: Motorboat
25: Sailboat
26: Tugboat
27: Barge
28: Fishing Vessel
29: Ferry
30: Yacht
31: Container Ship
32: Oil Tanker
33: Engineering Vehicle
34: Tower crane
35: Container Crane
36: Reach Stacker
37: Straddle Carrier
38: Mobile Crane
39: Dump Truck
40: Haul Truck
41: Scraper/Tractor
42: Front loader/Bulldozer
43: Excavator
44: Cement Mixer
45: Ground Grader
46: Hut/Tent
47: Shed
48: Building
49: Aircraft Hangar
50: Damaged Building
51: Facility
52: Construction Site
53: Vehicle Lot
54: Helipad
55: Storage Tank
56: Shipping container lot
57: Shipping Container
58: Pylon
59: Tower
# Download script/URL (optional) ---------------------------------------------------------------------------------------
download: |
import json
import os
from pathlib import Path
import numpy as np
from PIL import Image
from tqdm import tqdm
from ultralytics.data.utils import autosplit
from ultralytics.utils.ops import xyxy2xywhn
def convert_labels(fname=Path('xView/xView_train.geojson')):
# Convert xView geoJSON labels to YOLO format
path = fname.parent
with open(fname) as f:
print(f'Loading {fname}...')
data = json.load(f)
# Make dirs
labels = Path(path / 'labels' / 'train')
os.system(f'rm -rf {labels}')
labels.mkdir(parents=True, exist_ok=True)
# xView classes 11-94 to 0-59
xview_class2index = [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, -1, 9, 10, 11,
12, 13, 14, 15, -1, -1, 16, 17, 18, 19, 20, 21, 22, -1, 23, 24, 25, -1, 26, 27, -1, 28, -1,
29, 30, 31, 32, 33, 34, 35, 36, 37, -1, 38, 39, 40, 41, 42, 43, 44, 45, -1, -1, -1, -1, 46,
47, 48, 49, -1, 50, 51, -1, 52, -1, -1, -1, 53, 54, -1, 55, -1, -1, 56, -1, 57, -1, 58, 59]
shapes = {}
for feature in tqdm(data['features'], desc=f'Converting {fname}'):
p = feature['properties']
if p['bounds_imcoords']:
id = p['image_id']
file = path / 'train_images' / id
if file.exists(): # 1395.tif missing
try:
box = np.array([int(num) for num in p['bounds_imcoords'].split(",")])
assert box.shape[0] == 4, f'incorrect box shape {box.shape[0]}'
cls = p['type_id']
cls = xview_class2index[int(cls)] # xView class to 0-60
assert 59 >= cls >= 0, f'incorrect class index {cls}'
# Write YOLO label
if id not in shapes:
shapes[id] = Image.open(file).size
box = xyxy2xywhn(box[None].astype(np.float), w=shapes[id][0], h=shapes[id][1], clip=True)
with open((labels / id).with_suffix('.txt'), 'a') as f:
f.write(f"{cls} {' '.join(f'{x:.6f}' for x in box[0])}\n") # write label.txt
except Exception as e:
print(f'WARNING: skipping one label for {file}: {e}')
# Download manually from https://challenge.xviewdataset.org
dir = Path(yaml['path']) # dataset root dir
# urls = ['https://d307kc0mrhucc3.cloudfront.net/train_labels.zip', # train labels
# 'https://d307kc0mrhucc3.cloudfront.net/train_images.zip', # 15G, 847 train images
# 'https://d307kc0mrhucc3.cloudfront.net/val_images.zip'] # 5G, 282 val images (no labels)
# download(urls, dir=dir)
# Convert labels
convert_labels(dir / 'xView_train.geojson')
# Move images
images = Path(dir / 'images')
images.mkdir(parents=True, exist_ok=True)
Path(dir / 'train_images').rename(dir / 'images' / 'train')
Path(dir / 'val_images').rename(dir / 'images' / 'val')
# Split
autosplit(dir / 'images' / 'train')

View File

@ -0,0 +1,116 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Default training settings and hyperparameters for medium-augmentation COCO training
task: detect # (str) YOLO task, i.e. detect, segment, classify, pose
mode: train # (str) YOLO mode, i.e. train, val, predict, export, track, benchmark
# Train settings -------------------------------------------------------------------------------------------------------
model: # (str, optional) path to model file, i.e. yolov8n.pt, yolov8n.yaml
data: # (str, optional) path to data file, i.e. coco128.yaml
epochs: 100 # (int) number of epochs to train for
patience: 50 # (int) epochs to wait for no observable improvement for early stopping of training
batch: 16 # (int) number of images per batch (-1 for AutoBatch)
imgsz: 640 # (int | list) input images size as int for train and val modes, or list[w,h] for predict and export modes
save: True # (bool) save train checkpoints and predict results
save_period: -1 # (int) Save checkpoint every x epochs (disabled if < 1)
cache: False # (bool) True/ram, disk or False. Use cache for data loading
device: # (int | str | list, optional) device to run on, i.e. cuda device=0 or device=0,1,2,3 or device=cpu
workers: 8 # (int) number of worker threads for data loading (per RANK if DDP)
project: # (str, optional) project name
name: # (str, optional) experiment name, results saved to 'project/name' directory
exist_ok: False # (bool) whether to overwrite existing experiment
pretrained: True # (bool | str) whether to use a pretrained model (bool) or a model to load weights from (str)
optimizer: auto # (str) optimizer to use, choices=[SGD, Adam, Adamax, AdamW, NAdam, RAdam, RMSProp, auto]
verbose: True # (bool) whether to print verbose output
seed: 0 # (int) random seed for reproducibility
deterministic: True # (bool) whether to enable deterministic mode
single_cls: False # (bool) train multi-class data as single-class
rect: False # (bool) rectangular training if mode='train' or rectangular validation if mode='val'
cos_lr: False # (bool) use cosine learning rate scheduler
close_mosaic: 10 # (int) disable mosaic augmentation for final epochs (0 to disable)
resume: False # (bool) resume training from last checkpoint
amp: True # (bool) Automatic Mixed Precision (AMP) training, choices=[True, False], True runs AMP check
fraction: 1.0 # (float) dataset fraction to train on (default is 1.0, all images in train set)
profile: False # (bool) profile ONNX and TensorRT speeds during training for loggers
freeze: None # (int | list, optional) freeze first n layers, or freeze list of layer indices during training
# Segmentation
overlap_mask: True # (bool) masks should overlap during training (segment train only)
mask_ratio: 4 # (int) mask downsample ratio (segment train only)
# Classification
dropout: 0.0 # (float) use dropout regularization (classify train only)
# Val/Test settings ----------------------------------------------------------------------------------------------------
val: True # (bool) validate/test during training
split: val # (str) dataset split to use for validation, i.e. 'val', 'test' or 'train'
save_json: False # (bool) save results to JSON file
save_hybrid: False # (bool) save hybrid version of labels (labels + additional predictions)
conf: # (float, optional) object confidence threshold for detection (default 0.25 predict, 0.001 val)
iou: 0.7 # (float) intersection over union (IoU) threshold for NMS
max_det: 300 # (int) maximum number of detections per image
half: False # (bool) use half precision (FP16)
dnn: False # (bool) use OpenCV DNN for ONNX inference
plots: True # (bool) save plots during train/val
# Prediction settings --------------------------------------------------------------------------------------------------
source: # (str, optional) source directory for images or videos
show: False # (bool) show results if possible
save_txt: False # (bool) save results as .txt file
save_conf: False # (bool) save results with confidence scores
save_crop: False # (bool) save cropped images with results
show_labels: True # (bool) show object labels in plots
show_conf: True # (bool) show object confidence scores in plots
vid_stride: 1 # (int) video frame-rate stride
stream_buffer: False # (bool) buffer all streaming frames (True) or return the most recent frame (False)
line_width: # (int, optional) line width of the bounding boxes, auto if missing
visualize: False # (bool) visualize model features
augment: False # (bool) apply image augmentation to prediction sources
agnostic_nms: False # (bool) class-agnostic NMS
classes: # (int | list[int], optional) filter results by class, i.e. classes=0, or classes=[0,2,3]
retina_masks: False # (bool) use high-resolution segmentation masks
boxes: True # (bool) Show boxes in segmentation predictions
# Export settings ------------------------------------------------------------------------------------------------------
format: torchscript # (str) format to export to, choices at https://docs.ultralytics.com/modes/export/#export-formats
keras: False # (bool) use Kera=s
optimize: False # (bool) TorchScript: optimize for mobile
int8: False # (bool) CoreML/TF INT8 quantization
dynamic: False # (bool) ONNX/TF/TensorRT: dynamic axes
simplify: False # (bool) ONNX: simplify model
opset: # (int, optional) ONNX: opset version
workspace: 4 # (int) TensorRT: workspace size (GB)
nms: False # (bool) CoreML: add NMS
# Hyperparameters ------------------------------------------------------------------------------------------------------
lr0: 0.01 # (float) initial learning rate (i.e. SGD=1E-2, Adam=1E-3)
lrf: 0.01 # (float) final learning rate (lr0 * lrf)
momentum: 0.937 # (float) SGD momentum/Adam beta1
weight_decay: 0.0005 # (float) optimizer weight decay 5e-4
warmup_epochs: 3.0 # (float) warmup epochs (fractions ok)
warmup_momentum: 0.8 # (float) warmup initial momentum
warmup_bias_lr: 0.1 # (float) warmup initial bias lr
box: 7.5 # (float) box loss gain
cls: 0.5 # (float) cls loss gain (scale with pixels)
dfl: 1.5 # (float) dfl loss gain
pose: 12.0 # (float) pose loss gain
kobj: 1.0 # (float) keypoint obj loss gain
label_smoothing: 0.0 # (float) label smoothing (fraction)
nbs: 64 # (int) nominal batch size
hsv_h: 0.015 # (float) image HSV-Hue augmentation (fraction)
hsv_s: 0.7 # (float) image HSV-Saturation augmentation (fraction)
hsv_v: 0.4 # (float) image HSV-Value augmentation (fraction)
degrees: 0.0 # (float) image rotation (+/- deg)
translate: 0.1 # (float) image translation (+/- fraction)
scale: 0.5 # (float) image scale (+/- gain)
shear: 0.0 # (float) image shear (+/- deg)
perspective: 0.0 # (float) image perspective (+/- fraction), range 0-0.001
flipud: 0.0 # (float) image flip up-down (probability)
fliplr: 0.5 # (float) image flip left-right (probability)
mosaic: 1.0 # (float) image mosaic (probability)
mixup: 0.0 # (float) image mixup (probability)
copy_paste: 0.0 # (float) segment copy-paste (probability)
# Custom config.yaml ---------------------------------------------------------------------------------------------------
cfg: # (str, optional) for overriding defaults.yaml
# Tracker settings ------------------------------------------------------------------------------------------------------
tracker: botsort.yaml # (str) tracker type, choices=[botsort.yaml, bytetrack.yaml]

View File

@ -0,0 +1,41 @@
## Models
Welcome to the Ultralytics Models directory! Here you will find a wide variety of pre-configured model configuration files (`*.yaml`s) that can be used to create custom YOLO models. The models in this directory have been expertly crafted and fine-tuned by the Ultralytics team to provide the best performance for a wide range of object detection and image segmentation tasks.
These model configurations cover a wide range of scenarios, from simple object detection to more complex tasks like instance segmentation and object tracking. They are also designed to run efficiently on a variety of hardware platforms, from CPUs to GPUs. Whether you are a seasoned machine learning practitioner or just getting started with YOLO, this directory provides a great starting point for your custom model development needs.
To get started, simply browse through the models in this directory and find one that best suits your needs. Once you've selected a model, you can use the provided `*.yaml` file to train and deploy your custom YOLO model with ease. See full details at the Ultralytics [Docs](https://docs.ultralytics.com/models), and if you need help or have any questions, feel free to reach out to the Ultralytics team for support. So, don't wait, start creating your custom YOLO model now!
### Usage
Model `*.yaml` files may be used directly in the Command Line Interface (CLI) with a `yolo` command:
```bash
yolo task=detect mode=train model=yolov8n.yaml data=coco128.yaml epochs=100
```
They may also be used directly in a Python environment, and accepts the same
[arguments](https://docs.ultralytics.com/usage/cfg/) as in the CLI example above:
```python
from ultralytics import YOLO
model = YOLO("model.yaml") # build a YOLOv8n model from scratch
# YOLO("model.pt") use pre-trained model if available
model.info() # display model information
model.train(data="coco128.yaml", epochs=100) # train the model
```
## Pre-trained Model Architectures
Ultralytics supports many model architectures. Visit https://docs.ultralytics.com/models to view detailed information and usage. Any of these models can be used by loading their configs or pretrained checkpoints if available.
## Contribute New Models
Have you trained a new YOLO variant or achieved state-of-the-art performance with specific tuning? We'd love to showcase your work in our Models section! Contributions from the community in the form of new models, architectures, or optimizations are highly valued and can significantly enrich our repository.
By contributing to this section, you're helping us offer a wider array of model choices and configurations to the community. It's a fantastic way to share your knowledge and expertise while making the Ultralytics YOLO ecosystem even more versatile.
To get started, please consult our [Contributing Guide](https://docs.ultralytics.com/help/contributing) for step-by-step instructions on how to submit a Pull Request (PR) 🛠️. Your contributions are eagerly awaited!
Let's join hands to extend the range and capabilities of the Ultralytics YOLO models 🙏!

View File

@ -0,0 +1,50 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# RT-DETR-l object detection model with P3-P5 outputs. For details see https://docs.ultralytics.com/models/rtdetr
# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolov8n-cls.yaml' will call yolov8-cls.yaml with scale 'n'
# [depth, width, max_channels]
l: [1.00, 1.00, 1024]
backbone:
# [from, repeats, module, args]
- [-1, 1, HGStem, [32, 48]] # 0-P2/4
- [-1, 6, HGBlock, [48, 128, 3]] # stage 1
- [-1, 1, DWConv, [128, 3, 2, 1, False]] # 2-P3/8
- [-1, 6, HGBlock, [96, 512, 3]] # stage 2
- [-1, 1, DWConv, [512, 3, 2, 1, False]] # 4-P3/16
- [-1, 6, HGBlock, [192, 1024, 5, True, False]] # cm, c2, k, light, shortcut
- [-1, 6, HGBlock, [192, 1024, 5, True, True]]
- [-1, 6, HGBlock, [192, 1024, 5, True, True]] # stage 3
- [-1, 1, DWConv, [1024, 3, 2, 1, False]] # 8-P4/32
- [-1, 6, HGBlock, [384, 2048, 5, True, False]] # stage 4
head:
- [-1, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 10 input_proj.2
- [-1, 1, AIFI, [1024, 8]]
- [-1, 1, Conv, [256, 1, 1]] # 12, Y5, lateral_convs.0
- [-1, 1, nn.Upsample, [None, 2, 'nearest']]
- [7, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 14 input_proj.1
- [[-2, -1], 1, Concat, [1]]
- [-1, 3, RepC3, [256]] # 16, fpn_blocks.0
- [-1, 1, Conv, [256, 1, 1]] # 17, Y4, lateral_convs.1
- [-1, 1, nn.Upsample, [None, 2, 'nearest']]
- [3, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 19 input_proj.0
- [[-2, -1], 1, Concat, [1]] # cat backbone P4
- [-1, 3, RepC3, [256]] # X3 (21), fpn_blocks.1
- [-1, 1, Conv, [256, 3, 2]] # 22, downsample_convs.0
- [[-1, 17], 1, Concat, [1]] # cat Y4
- [-1, 3, RepC3, [256]] # F4 (24), pan_blocks.0
- [-1, 1, Conv, [256, 3, 2]] # 25, downsample_convs.1
- [[-1, 12], 1, Concat, [1]] # cat Y5
- [-1, 3, RepC3, [256]] # F5 (27), pan_blocks.1
- [[21, 24, 27], 1, RTDETRDecoder, [nc]] # Detect(P3, P4, P5)

View File

@ -0,0 +1,54 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# RT-DETR-x object detection model with P3-P5 outputs. For details see https://docs.ultralytics.com/models/rtdetr
# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolov8n-cls.yaml' will call yolov8-cls.yaml with scale 'n'
# [depth, width, max_channels]
x: [1.00, 1.00, 2048]
backbone:
# [from, repeats, module, args]
- [-1, 1, HGStem, [32, 64]] # 0-P2/4
- [-1, 6, HGBlock, [64, 128, 3]] # stage 1
- [-1, 1, DWConv, [128, 3, 2, 1, False]] # 2-P3/8
- [-1, 6, HGBlock, [128, 512, 3]]
- [-1, 6, HGBlock, [128, 512, 3, False, True]] # 4-stage 2
- [-1, 1, DWConv, [512, 3, 2, 1, False]] # 5-P3/16
- [-1, 6, HGBlock, [256, 1024, 5, True, False]] # cm, c2, k, light, shortcut
- [-1, 6, HGBlock, [256, 1024, 5, True, True]]
- [-1, 6, HGBlock, [256, 1024, 5, True, True]]
- [-1, 6, HGBlock, [256, 1024, 5, True, True]]
- [-1, 6, HGBlock, [256, 1024, 5, True, True]] # 10-stage 3
- [-1, 1, DWConv, [1024, 3, 2, 1, False]] # 11-P4/32
- [-1, 6, HGBlock, [512, 2048, 5, True, False]]
- [-1, 6, HGBlock, [512, 2048, 5, True, True]] # 13-stage 4
head:
- [-1, 1, Conv, [384, 1, 1, None, 1, 1, False]] # 14 input_proj.2
- [-1, 1, AIFI, [2048, 8]]
- [-1, 1, Conv, [384, 1, 1]] # 16, Y5, lateral_convs.0
- [-1, 1, nn.Upsample, [None, 2, 'nearest']]
- [10, 1, Conv, [384, 1, 1, None, 1, 1, False]] # 18 input_proj.1
- [[-2, -1], 1, Concat, [1]]
- [-1, 3, RepC3, [384]] # 20, fpn_blocks.0
- [-1, 1, Conv, [384, 1, 1]] # 21, Y4, lateral_convs.1
- [-1, 1, nn.Upsample, [None, 2, 'nearest']]
- [4, 1, Conv, [384, 1, 1, None, 1, 1, False]] # 23 input_proj.0
- [[-2, -1], 1, Concat, [1]] # cat backbone P4
- [-1, 3, RepC3, [384]] # X3 (25), fpn_blocks.1
- [-1, 1, Conv, [384, 3, 2]] # 26, downsample_convs.0
- [[-1, 21], 1, Concat, [1]] # cat Y4
- [-1, 3, RepC3, [384]] # F4 (28), pan_blocks.0
- [-1, 1, Conv, [384, 3, 2]] # 29, downsample_convs.1
- [[-1, 16], 1, Concat, [1]] # cat Y5
- [-1, 3, RepC3, [384]] # F5 (31), pan_blocks.1
- [[25, 28, 31], 1, RTDETRDecoder, [nc]] # Detect(P3, P4, P5)

View File

@ -0,0 +1,61 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLOv5 object detection model with P3-P6 outputs. For details see https://docs.ultralytics.com/models/yolov5
# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolov5n-p6.yaml' will call yolov5-p6.yaml with scale 'n'
# [depth, width, max_channels]
n: [0.33, 0.25, 1024]
s: [0.33, 0.50, 1024]
m: [0.67, 0.75, 1024]
l: [1.00, 1.00, 1024]
x: [1.33, 1.25, 1024]
# YOLOv5 v6.0 backbone
backbone:
# [from, number, module, args]
[[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2
[-1, 1, Conv, [128, 3, 2]], # 1-P2/4
[-1, 3, C3, [128]],
[-1, 1, Conv, [256, 3, 2]], # 3-P3/8
[-1, 6, C3, [256]],
[-1, 1, Conv, [512, 3, 2]], # 5-P4/16
[-1, 9, C3, [512]],
[-1, 1, Conv, [768, 3, 2]], # 7-P5/32
[-1, 3, C3, [768]],
[-1, 1, Conv, [1024, 3, 2]], # 9-P6/64
[-1, 3, C3, [1024]],
[-1, 1, SPPF, [1024, 5]], # 11
]
# YOLOv5 v6.0 head
head:
[[-1, 1, Conv, [768, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 8], 1, Concat, [1]], # cat backbone P5
[-1, 3, C3, [768, False]], # 15
[-1, 1, Conv, [512, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 6], 1, Concat, [1]], # cat backbone P4
[-1, 3, C3, [512, False]], # 19
[-1, 1, Conv, [256, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 4], 1, Concat, [1]], # cat backbone P3
[-1, 3, C3, [256, False]], # 23 (P3/8-small)
[-1, 1, Conv, [256, 3, 2]],
[[-1, 20], 1, Concat, [1]], # cat head P4
[-1, 3, C3, [512, False]], # 26 (P4/16-medium)
[-1, 1, Conv, [512, 3, 2]],
[[-1, 16], 1, Concat, [1]], # cat head P5
[-1, 3, C3, [768, False]], # 29 (P5/32-large)
[-1, 1, Conv, [768, 3, 2]],
[[-1, 12], 1, Concat, [1]], # cat head P6
[-1, 3, C3, [1024, False]], # 32 (P6/64-xlarge)
[[23, 26, 29, 32], 1, Detect, [nc]], # Detect(P3, P4, P5, P6)
]

View File

@ -0,0 +1,50 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLOv5 object detection model with P3-P5 outputs. For details see https://docs.ultralytics.com/models/yolov5
# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolov5n.yaml' will call yolov5.yaml with scale 'n'
# [depth, width, max_channels]
n: [0.33, 0.25, 1024]
s: [0.33, 0.50, 1024]
m: [0.67, 0.75, 1024]
l: [1.00, 1.00, 1024]
x: [1.33, 1.25, 1024]
# YOLOv5 v6.0 backbone
backbone:
# [from, number, module, args]
[[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2
[-1, 1, Conv, [128, 3, 2]], # 1-P2/4
[-1, 3, C3, [128]],
[-1, 1, Conv, [256, 3, 2]], # 3-P3/8
[-1, 6, C3, [256]],
[-1, 1, Conv, [512, 3, 2]], # 5-P4/16
[-1, 9, C3, [512]],
[-1, 1, Conv, [1024, 3, 2]], # 7-P5/32
[-1, 3, C3, [1024]],
[-1, 1, SPPF, [1024, 5]], # 9
]
# YOLOv5 v6.0 head
head:
[[-1, 1, Conv, [512, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 6], 1, Concat, [1]], # cat backbone P4
[-1, 3, C3, [512, False]], # 13
[-1, 1, Conv, [256, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 4], 1, Concat, [1]], # cat backbone P3
[-1, 3, C3, [256, False]], # 17 (P3/8-small)
[-1, 1, Conv, [256, 3, 2]],
[[-1, 14], 1, Concat, [1]], # cat head P4
[-1, 3, C3, [512, False]], # 20 (P4/16-medium)
[-1, 1, Conv, [512, 3, 2]],
[[-1, 10], 1, Concat, [1]], # cat head P5
[-1, 3, C3, [1024, False]], # 23 (P5/32-large)
[[17, 20, 23], 1, Detect, [nc]], # Detect(P3, P4, P5)
]

View File

@ -0,0 +1,18 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Default YOLO tracker settings for BoT-SORT tracker https://github.com/NirAharon/BoT-SORT
tracker_type: botsort # tracker type, ['botsort', 'bytetrack']
track_high_thresh: 0.5 # threshold for the first association
track_low_thresh: 0.1 # threshold for the second association
new_track_thresh: 0.6 # threshold for init new track if the detection does not match any tracks
track_buffer: 30 # buffer to calculate the time when to remove tracks
match_thresh: 0.8 # threshold for matching tracks
# min_box_area: 10 # threshold for min box areas(for tracker evaluation, not used for now)
# mot20: False # for tracker evaluation(not used for now)
# BoT-SORT settings
gmc_method: sparseOptFlow # method of global motion compensation
# ReID model related thresh (not supported yet)
proximity_thresh: 0.5
appearance_thresh: 0.25
with_reid: False

View File

@ -0,0 +1,11 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Default YOLO tracker settings for ByteTrack tracker https://github.com/ifzhang/ByteTrack
tracker_type: bytetrack # tracker type, ['botsort', 'bytetrack']
track_high_thresh: 0.5 # threshold for the first association
track_low_thresh: 0.1 # threshold for the second association
new_track_thresh: 0.6 # threshold for init new track if the detection does not match any tracks
track_buffer: 30 # buffer to calculate the time when to remove tracks
match_thresh: 0.8 # threshold for matching tracks
# min_box_area: 10 # threshold for min box areas(for tracker evaluation, not used for now)
# mot20: False # for tracker evaluation(not used for now)

View File

@ -0,0 +1,8 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from .base import BaseDataset
from .build import build_dataloader, build_yolo_dataset, load_inference_source
from .dataset import ClassificationDataset, SemanticDataset, YOLODataset
__all__ = ('BaseDataset', 'ClassificationDataset', 'SemanticDataset', 'YOLODataset', 'build_yolo_dataset',
'build_dataloader', 'load_inference_source')

View File

@ -0,0 +1,50 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from pathlib import Path
from ultralytics import SAM, YOLO
def auto_annotate(data, det_model='yolov8x.pt', sam_model='sam_b.pt', device='', output_dir=None):
"""
Automatically annotates images using a YOLO object detection model and a SAM segmentation model.
Args:
data (str): Path to a folder containing images to be annotated.
det_model (str, optional): Pre-trained YOLO detection model. Defaults to 'yolov8x.pt'.
sam_model (str, optional): Pre-trained SAM segmentation model. Defaults to 'sam_b.pt'.
device (str, optional): Device to run the models on. Defaults to an empty string (CPU or GPU, if available).
output_dir (str | None | optional): Directory to save the annotated results.
Defaults to a 'labels' folder in the same directory as 'data'.
Example:
```python
from ultralytics.data.annotator import auto_annotate
auto_annotate(data='ultralytics/assets', det_model='yolov8n.pt', sam_model='mobile_sam.pt')
```
"""
det_model = YOLO(det_model)
sam_model = SAM(sam_model)
data = Path(data)
if not output_dir:
output_dir = data.parent / f'{data.stem}_auto_annotate_labels'
Path(output_dir).mkdir(exist_ok=True, parents=True)
det_results = det_model(data, stream=True, device=device)
for result in det_results:
class_ids = result.boxes.cls.int().tolist() # noqa
if len(class_ids):
boxes = result.boxes.xyxy # Boxes object for bbox outputs
sam_results = sam_model(result.orig_img, bboxes=boxes, verbose=False, save=False, device=device)
segments = sam_results[0].masks.xyn # noqa
with open(f'{str(Path(output_dir) / Path(result.path).stem)}.txt', 'w') as f:
for i in range(len(segments)):
s = segments[i]
if len(s) == 0:
continue
segment = map(str, segments[i].reshape(-1).tolist())
f.write(f'{class_ids[i]} ' + ' '.join(segment) + '\n')

View File

@ -0,0 +1,902 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import math
import random
from copy import deepcopy
import cv2
import numpy as np
import torch
import torchvision.transforms as T
from ultralytics.utils import LOGGER, colorstr
from ultralytics.utils.checks import check_version
from ultralytics.utils.instance import Instances
from ultralytics.utils.metrics import bbox_ioa
from ultralytics.utils.ops import segment2box
from .utils import polygons2masks, polygons2masks_overlap
# TODO: we might need a BaseTransform to make all these augments be compatible with both classification and semantic
class BaseTransform:
def __init__(self) -> None:
pass
def apply_image(self, labels):
"""Applies image transformation to labels."""
pass
def apply_instances(self, labels):
"""Applies transformations to input 'labels' and returns object instances."""
pass
def apply_semantic(self, labels):
"""Applies semantic segmentation to an image."""
pass
def __call__(self, labels):
"""Applies label transformations to an image, instances and semantic masks."""
self.apply_image(labels)
self.apply_instances(labels)
self.apply_semantic(labels)
class Compose:
def __init__(self, transforms):
"""Initializes the Compose object with a list of transforms."""
self.transforms = transforms
def __call__(self, data):
"""Applies a series of transformations to input data."""
for t in self.transforms:
data = t(data)
return data
def append(self, transform):
"""Appends a new transform to the existing list of transforms."""
self.transforms.append(transform)
def tolist(self):
"""Converts list of transforms to a standard Python list."""
return self.transforms
def __repr__(self):
"""Return string representation of object."""
return f"{self.__class__.__name__}({', '.join([f'{t}' for t in self.transforms])})"
class BaseMixTransform:
"""This implementation is from mmyolo."""
def __init__(self, dataset, pre_transform=None, p=0.0) -> None:
self.dataset = dataset
self.pre_transform = pre_transform
self.p = p
def __call__(self, labels):
"""Applies pre-processing transforms and mixup/mosaic transforms to labels data."""
if random.uniform(0, 1) > self.p:
return labels
# Get index of one or three other images
indexes = self.get_indexes()
if isinstance(indexes, int):
indexes = [indexes]
# Get images information will be used for Mosaic or MixUp
mix_labels = [self.dataset.get_image_and_label(i) for i in indexes]
if self.pre_transform is not None:
for i, data in enumerate(mix_labels):
mix_labels[i] = self.pre_transform(data)
labels['mix_labels'] = mix_labels
# Mosaic or MixUp
labels = self._mix_transform(labels)
labels.pop('mix_labels', None)
return labels
def _mix_transform(self, labels):
"""Applies MixUp or Mosaic augmentation to the label dictionary."""
raise NotImplementedError
def get_indexes(self):
"""Gets a list of shuffled indexes for mosaic augmentation."""
raise NotImplementedError
class Mosaic(BaseMixTransform):
"""
Mosaic augmentation.
This class performs mosaic augmentation by combining multiple (4 or 9) images into a single mosaic image.
The augmentation is applied to a dataset with a given probability.
Attributes:
dataset: The dataset on which the mosaic augmentation is applied.
imgsz (int, optional): Image size (height and width) after mosaic pipeline of a single image. Default to 640.
p (float, optional): Probability of applying the mosaic augmentation. Must be in the range 0-1. Default to 1.0.
n (int, optional): The grid size, either 4 (for 2x2) or 9 (for 3x3).
"""
def __init__(self, dataset, imgsz=640, p=1.0, n=4):
"""Initializes the object with a dataset, image size, probability, and border."""
assert 0 <= p <= 1.0, f'The probability should be in range [0, 1], but got {p}.'
assert n in (4, 9), 'grid must be equal to 4 or 9.'
super().__init__(dataset=dataset, p=p)
self.dataset = dataset
self.imgsz = imgsz
self.border = (-imgsz // 2, -imgsz // 2) # width, height
self.n = n
def get_indexes(self, buffer=True):
"""Return a list of random indexes from the dataset."""
if buffer: # select images from buffer
return random.choices(list(self.dataset.buffer), k=self.n - 1)
else: # select any images
return [random.randint(0, len(self.dataset) - 1) for _ in range(self.n - 1)]
def _mix_transform(self, labels):
"""Apply mixup transformation to the input image and labels."""
assert labels.get('rect_shape', None) is None, 'rect and mosaic are mutually exclusive.'
assert len(labels.get('mix_labels', [])), 'There are no other images for mosaic augment.'
return self._mosaic4(labels) if self.n == 4 else self._mosaic9(labels)
def _mosaic4(self, labels):
"""Create a 2x2 image mosaic."""
mosaic_labels = []
s = self.imgsz
yc, xc = (int(random.uniform(-x, 2 * s + x)) for x in self.border) # mosaic center x, y
for i in range(4):
labels_patch = labels if i == 0 else labels['mix_labels'][i - 1]
# Load image
img = labels_patch['img']
h, w = labels_patch.pop('resized_shape')
# Place img in img4
if i == 0: # top left
img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8) # base image with 4 tiles
x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc # xmin, ymin, xmax, ymax (large image)
x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h # xmin, ymin, xmax, ymax (small image)
elif i == 1: # top right
x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc
x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h
elif i == 2: # bottom left
x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h)
x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h)
elif i == 3: # bottom right
x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h)
x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)
img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b] # img4[ymin:ymax, xmin:xmax]
padw = x1a - x1b
padh = y1a - y1b
labels_patch = self._update_labels(labels_patch, padw, padh)
mosaic_labels.append(labels_patch)
final_labels = self._cat_labels(mosaic_labels)
final_labels['img'] = img4
return final_labels
def _mosaic9(self, labels):
"""Create a 3x3 image mosaic."""
mosaic_labels = []
s = self.imgsz
hp, wp = -1, -1 # height, width previous
for i in range(9):
labels_patch = labels if i == 0 else labels['mix_labels'][i - 1]
# Load image
img = labels_patch['img']
h, w = labels_patch.pop('resized_shape')
# Place img in img9
if i == 0: # center
img9 = np.full((s * 3, s * 3, img.shape[2]), 114, dtype=np.uint8) # base image with 4 tiles
h0, w0 = h, w
c = s, s, s + w, s + h # xmin, ymin, xmax, ymax (base) coordinates
elif i == 1: # top
c = s, s - h, s + w, s
elif i == 2: # top right
c = s + wp, s - h, s + wp + w, s
elif i == 3: # right
c = s + w0, s, s + w0 + w, s + h
elif i == 4: # bottom right
c = s + w0, s + hp, s + w0 + w, s + hp + h
elif i == 5: # bottom
c = s + w0 - w, s + h0, s + w0, s + h0 + h
elif i == 6: # bottom left
c = s + w0 - wp - w, s + h0, s + w0 - wp, s + h0 + h
elif i == 7: # left
c = s - w, s + h0 - h, s, s + h0
elif i == 8: # top left
c = s - w, s + h0 - hp - h, s, s + h0 - hp
padw, padh = c[:2]
x1, y1, x2, y2 = (max(x, 0) for x in c) # allocate coords
# Image
img9[y1:y2, x1:x2] = img[y1 - padh:, x1 - padw:] # img9[ymin:ymax, xmin:xmax]
hp, wp = h, w # height, width previous for next iteration
# Labels assuming imgsz*2 mosaic size
labels_patch = self._update_labels(labels_patch, padw + self.border[0], padh + self.border[1])
mosaic_labels.append(labels_patch)
final_labels = self._cat_labels(mosaic_labels)
final_labels['img'] = img9[-self.border[0]:self.border[0], -self.border[1]:self.border[1]]
return final_labels
@staticmethod
def _update_labels(labels, padw, padh):
"""Update labels."""
nh, nw = labels['img'].shape[:2]
labels['instances'].convert_bbox(format='xyxy')
labels['instances'].denormalize(nw, nh)
labels['instances'].add_padding(padw, padh)
return labels
def _cat_labels(self, mosaic_labels):
"""Return labels with mosaic border instances clipped."""
if len(mosaic_labels) == 0:
return {}
cls = []
instances = []
imgsz = self.imgsz * 2 # mosaic imgsz
for labels in mosaic_labels:
cls.append(labels['cls'])
instances.append(labels['instances'])
final_labels = {
'im_file': mosaic_labels[0]['im_file'],
'ori_shape': mosaic_labels[0]['ori_shape'],
'resized_shape': (imgsz, imgsz),
'cls': np.concatenate(cls, 0),
'instances': Instances.concatenate(instances, axis=0),
'mosaic_border': self.border} # final_labels
final_labels['instances'].clip(imgsz, imgsz)
good = final_labels['instances'].remove_zero_area_boxes()
final_labels['cls'] = final_labels['cls'][good]
return final_labels
class MixUp(BaseMixTransform):
def __init__(self, dataset, pre_transform=None, p=0.0) -> None:
super().__init__(dataset=dataset, pre_transform=pre_transform, p=p)
def get_indexes(self):
"""Get a random index from the dataset."""
return random.randint(0, len(self.dataset) - 1)
def _mix_transform(self, labels):
"""Applies MixUp augmentation https://arxiv.org/pdf/1710.09412.pdf."""
r = np.random.beta(32.0, 32.0) # mixup ratio, alpha=beta=32.0
labels2 = labels['mix_labels'][0]
labels['img'] = (labels['img'] * r + labels2['img'] * (1 - r)).astype(np.uint8)
labels['instances'] = Instances.concatenate([labels['instances'], labels2['instances']], axis=0)
labels['cls'] = np.concatenate([labels['cls'], labels2['cls']], 0)
return labels
class RandomPerspective:
def __init__(self,
degrees=0.0,
translate=0.1,
scale=0.5,
shear=0.0,
perspective=0.0,
border=(0, 0),
pre_transform=None):
self.degrees = degrees
self.translate = translate
self.scale = scale
self.shear = shear
self.perspective = perspective
# Mosaic border
self.border = border
self.pre_transform = pre_transform
def affine_transform(self, img, border):
"""Center."""
C = np.eye(3, dtype=np.float32)
C[0, 2] = -img.shape[1] / 2 # x translation (pixels)
C[1, 2] = -img.shape[0] / 2 # y translation (pixels)
# Perspective
P = np.eye(3, dtype=np.float32)
P[2, 0] = random.uniform(-self.perspective, self.perspective) # x perspective (about y)
P[2, 1] = random.uniform(-self.perspective, self.perspective) # y perspective (about x)
# Rotation and Scale
R = np.eye(3, dtype=np.float32)
a = random.uniform(-self.degrees, self.degrees)
# a += random.choice([-180, -90, 0, 90]) # add 90deg rotations to small rotations
s = random.uniform(1 - self.scale, 1 + self.scale)
# s = 2 ** random.uniform(-scale, scale)
R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s)
# Shear
S = np.eye(3, dtype=np.float32)
S[0, 1] = math.tan(random.uniform(-self.shear, self.shear) * math.pi / 180) # x shear (deg)
S[1, 0] = math.tan(random.uniform(-self.shear, self.shear) * math.pi / 180) # y shear (deg)
# Translation
T = np.eye(3, dtype=np.float32)
T[0, 2] = random.uniform(0.5 - self.translate, 0.5 + self.translate) * self.size[0] # x translation (pixels)
T[1, 2] = random.uniform(0.5 - self.translate, 0.5 + self.translate) * self.size[1] # y translation (pixels)
# Combined rotation matrix
M = T @ S @ R @ P @ C # order of operations (right to left) is IMPORTANT
# Affine image
if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any(): # image changed
if self.perspective:
img = cv2.warpPerspective(img, M, dsize=self.size, borderValue=(114, 114, 114))
else: # affine
img = cv2.warpAffine(img, M[:2], dsize=self.size, borderValue=(114, 114, 114))
return img, M, s
def apply_bboxes(self, bboxes, M):
"""
Apply affine to bboxes only.
Args:
bboxes (ndarray): list of bboxes, xyxy format, with shape (num_bboxes, 4).
M (ndarray): affine matrix.
Returns:
new_bboxes (ndarray): bboxes after affine, [num_bboxes, 4].
"""
n = len(bboxes)
if n == 0:
return bboxes
xy = np.ones((n * 4, 3), dtype=bboxes.dtype)
xy[:, :2] = bboxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1
xy = xy @ M.T # transform
xy = (xy[:, :2] / xy[:, 2:3] if self.perspective else xy[:, :2]).reshape(n, 8) # perspective rescale or affine
# Create new boxes
x = xy[:, [0, 2, 4, 6]]
y = xy[:, [1, 3, 5, 7]]
return np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1)), dtype=bboxes.dtype).reshape(4, n).T
def apply_segments(self, segments, M):
"""
Apply affine to segments and generate new bboxes from segments.
Args:
segments (ndarray): list of segments, [num_samples, 500, 2].
M (ndarray): affine matrix.
Returns:
new_segments (ndarray): list of segments after affine, [num_samples, 500, 2].
new_bboxes (ndarray): bboxes after affine, [N, 4].
"""
n, num = segments.shape[:2]
if n == 0:
return [], segments
xy = np.ones((n * num, 3), dtype=segments.dtype)
segments = segments.reshape(-1, 2)
xy[:, :2] = segments
xy = xy @ M.T # transform
xy = xy[:, :2] / xy[:, 2:3]
segments = xy.reshape(n, -1, 2)
bboxes = np.stack([segment2box(xy, self.size[0], self.size[1]) for xy in segments], 0)
return bboxes, segments
def apply_keypoints(self, keypoints, M):
"""
Apply affine to keypoints.
Args:
keypoints (ndarray): keypoints, [N, 17, 3].
M (ndarray): affine matrix.
Returns:
new_keypoints (ndarray): keypoints after affine, [N, 17, 3].
"""
n, nkpt = keypoints.shape[:2]
if n == 0:
return keypoints
xy = np.ones((n * nkpt, 3), dtype=keypoints.dtype)
visible = keypoints[..., 2].reshape(n * nkpt, 1)
xy[:, :2] = keypoints[..., :2].reshape(n * nkpt, 2)
xy = xy @ M.T # transform
xy = xy[:, :2] / xy[:, 2:3] # perspective rescale or affine
out_mask = (xy[:, 0] < 0) | (xy[:, 1] < 0) | (xy[:, 0] > self.size[0]) | (xy[:, 1] > self.size[1])
visible[out_mask] = 0
return np.concatenate([xy, visible], axis=-1).reshape(n, nkpt, 3)
def __call__(self, labels):
"""
Affine images and targets.
Args:
labels (dict): a dict of `bboxes`, `segments`, `keypoints`.
"""
if self.pre_transform and 'mosaic_border' not in labels:
labels = self.pre_transform(labels)
labels.pop('ratio_pad', None) # do not need ratio pad
img = labels['img']
cls = labels['cls']
instances = labels.pop('instances')
# Make sure the coord formats are right
instances.convert_bbox(format='xyxy')
instances.denormalize(*img.shape[:2][::-1])
border = labels.pop('mosaic_border', self.border)
self.size = img.shape[1] + border[1] * 2, img.shape[0] + border[0] * 2 # w, h
# M is affine matrix
# scale for func:`box_candidates`
img, M, scale = self.affine_transform(img, border)
bboxes = self.apply_bboxes(instances.bboxes, M)
segments = instances.segments
keypoints = instances.keypoints
# Update bboxes if there are segments.
if len(segments):
bboxes, segments = self.apply_segments(segments, M)
if keypoints is not None:
keypoints = self.apply_keypoints(keypoints, M)
new_instances = Instances(bboxes, segments, keypoints, bbox_format='xyxy', normalized=False)
# Clip
new_instances.clip(*self.size)
# Filter instances
instances.scale(scale_w=scale, scale_h=scale, bbox_only=True)
# Make the bboxes have the same scale with new_bboxes
i = self.box_candidates(box1=instances.bboxes.T,
box2=new_instances.bboxes.T,
area_thr=0.01 if len(segments) else 0.10)
labels['instances'] = new_instances[i]
labels['cls'] = cls[i]
labels['img'] = img
labels['resized_shape'] = img.shape[:2]
return labels
def box_candidates(self, box1, box2, wh_thr=2, ar_thr=100, area_thr=0.1, eps=1e-16): # box1(4,n), box2(4,n)
# Compute box candidates: box1 before augment, box2 after augment, wh_thr (pixels), aspect_ratio_thr, area_ratio
w1, h1 = box1[2] - box1[0], box1[3] - box1[1]
w2, h2 = box2[2] - box2[0], box2[3] - box2[1]
ar = np.maximum(w2 / (h2 + eps), h2 / (w2 + eps)) # aspect ratio
return (w2 > wh_thr) & (h2 > wh_thr) & (w2 * h2 / (w1 * h1 + eps) > area_thr) & (ar < ar_thr) # candidates
class RandomHSV:
def __init__(self, hgain=0.5, sgain=0.5, vgain=0.5) -> None:
self.hgain = hgain
self.sgain = sgain
self.vgain = vgain
def __call__(self, labels):
"""Applies image HSV augmentation"""
img = labels['img']
if self.hgain or self.sgain or self.vgain:
r = np.random.uniform(-1, 1, 3) * [self.hgain, self.sgain, self.vgain] + 1 # random gains
hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))
dtype = img.dtype # uint8
x = np.arange(0, 256, dtype=r.dtype)
lut_hue = ((x * r[0]) % 180).astype(dtype)
lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
lut_val = np.clip(x * r[2], 0, 255).astype(dtype)
im_hsv = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
cv2.cvtColor(im_hsv, cv2.COLOR_HSV2BGR, dst=img) # no return needed
return labels
class RandomFlip:
"""Applies random horizontal or vertical flip to an image with a given probability."""
def __init__(self, p=0.5, direction='horizontal', flip_idx=None) -> None:
assert direction in ['horizontal', 'vertical'], f'Support direction `horizontal` or `vertical`, got {direction}'
assert 0 <= p <= 1.0
self.p = p
self.direction = direction
self.flip_idx = flip_idx
def __call__(self, labels):
"""Resize image and padding for detection, instance segmentation, pose."""
img = labels['img']
instances = labels.pop('instances')
instances.convert_bbox(format='xywh')
h, w = img.shape[:2]
h = 1 if instances.normalized else h
w = 1 if instances.normalized else w
# Flip up-down
if self.direction == 'vertical' and random.random() < self.p:
img = np.flipud(img)
instances.flipud(h)
if self.direction == 'horizontal' and random.random() < self.p:
img = np.fliplr(img)
instances.fliplr(w)
# For keypoints
if self.flip_idx is not None and instances.keypoints is not None:
instances.keypoints = np.ascontiguousarray(instances.keypoints[:, self.flip_idx, :])
labels['img'] = np.ascontiguousarray(img)
labels['instances'] = instances
return labels
class LetterBox:
"""Resize image and padding for detection, instance segmentation, pose."""
def __init__(self, new_shape=(640, 640), auto=False, scaleFill=False, scaleup=True, center=True, stride=32):
"""Initialize LetterBox object with specific parameters."""
self.new_shape = new_shape
self.auto = auto
self.scaleFill = scaleFill
self.scaleup = scaleup
self.stride = stride
self.center = center # Put the image in the middle or top-left
def __call__(self, labels=None, image=None):
"""Return updated labels and image with added border."""
if labels is None:
labels = {}
img = labels.get('img') if image is None else image
shape = img.shape[:2] # current shape [height, width]
new_shape = labels.pop('rect_shape', self.new_shape)
if isinstance(new_shape, int):
new_shape = (new_shape, new_shape)
# Scale ratio (new / old)
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
if not self.scaleup: # only scale down, do not scale up (for better val mAP)
r = min(r, 1.0)
# Compute padding
ratio = r, r # width, height ratios
new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
if self.auto: # minimum rectangle
dw, dh = np.mod(dw, self.stride), np.mod(dh, self.stride) # wh padding
elif self.scaleFill: # stretch
dw, dh = 0.0, 0.0
new_unpad = (new_shape[1], new_shape[0])
ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios
if self.center:
dw /= 2 # divide padding into 2 sides
dh /= 2
if labels.get('ratio_pad'):
labels['ratio_pad'] = (labels['ratio_pad'], (dw, dh)) # for evaluation
if shape[::-1] != new_unpad: # resize
img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
top, bottom = int(round(dh - 0.1)) if self.center else 0, int(round(dh + 0.1))
left, right = int(round(dw - 0.1)) if self.center else 0, int(round(dw + 0.1))
img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT,
value=(114, 114, 114)) # add border
if len(labels):
labels = self._update_labels(labels, ratio, dw, dh)
labels['img'] = img
labels['resized_shape'] = new_shape
return labels
else:
return img
def _update_labels(self, labels, ratio, padw, padh):
"""Update labels."""
labels['instances'].convert_bbox(format='xyxy')
labels['instances'].denormalize(*labels['img'].shape[:2][::-1])
labels['instances'].scale(*ratio)
labels['instances'].add_padding(padw, padh)
return labels
class CopyPaste:
def __init__(self, p=0.5) -> None:
self.p = p
def __call__(self, labels):
"""Implement Copy-Paste augmentation https://arxiv.org/abs/2012.07177, labels as nx5 np.array(cls, xyxy)."""
im = labels['img']
cls = labels['cls']
h, w = im.shape[:2]
instances = labels.pop('instances')
instances.convert_bbox(format='xyxy')
instances.denormalize(w, h)
if self.p and len(instances.segments):
n = len(instances)
_, w, _ = im.shape # height, width, channels
im_new = np.zeros(im.shape, np.uint8)
# Calculate ioa first then select indexes randomly
ins_flip = deepcopy(instances)
ins_flip.fliplr(w)
ioa = bbox_ioa(ins_flip.bboxes, instances.bboxes) # intersection over area, (N, M)
indexes = np.nonzero((ioa < 0.30).all(1))[0] # (N, )
n = len(indexes)
for j in random.sample(list(indexes), k=round(self.p * n)):
cls = np.concatenate((cls, cls[[j]]), axis=0)
instances = Instances.concatenate((instances, ins_flip[[j]]), axis=0)
cv2.drawContours(im_new, instances.segments[[j]].astype(np.int32), -1, (1, 1, 1), cv2.FILLED)
result = cv2.flip(im, 1) # augment segments (flip left-right)
i = cv2.flip(im_new, 1).astype(bool)
im[i] = result[i]
labels['img'] = im
labels['cls'] = cls
labels['instances'] = instances
return labels
class Albumentations:
"""Albumentations transformations. Optional, uninstall package to disable.
Applies Blur, Median Blur, convert to grayscale, Contrast Limited Adaptive Histogram Equalization,
random change of brightness and contrast, RandomGamma and lowering of image quality by compression."""
def __init__(self, p=1.0):
"""Initialize the transform object for YOLO bbox formatted params."""
self.p = p
self.transform = None
prefix = colorstr('albumentations: ')
try:
import albumentations as A
check_version(A.__version__, '1.0.3', hard=True) # version requirement
T = [
A.Blur(p=0.01),
A.MedianBlur(p=0.01),
A.ToGray(p=0.01),
A.CLAHE(p=0.01),
A.RandomBrightnessContrast(p=0.0),
A.RandomGamma(p=0.0),
A.ImageCompression(quality_lower=75, p=0.0)] # transforms
self.transform = A.Compose(T, bbox_params=A.BboxParams(format='yolo', label_fields=['class_labels']))
LOGGER.info(prefix + ', '.join(f'{x}'.replace('always_apply=False, ', '') for x in T if x.p))
except ImportError: # package not installed, skip
pass
except Exception as e:
LOGGER.info(f'{prefix}{e}')
def __call__(self, labels):
"""Generates object detections and returns a dictionary with detection results."""
im = labels['img']
cls = labels['cls']
if len(cls):
labels['instances'].convert_bbox('xywh')
labels['instances'].normalize(*im.shape[:2][::-1])
bboxes = labels['instances'].bboxes
# TODO: add supports of segments and keypoints
if self.transform and random.random() < self.p:
new = self.transform(image=im, bboxes=bboxes, class_labels=cls) # transformed
if len(new['class_labels']) > 0: # skip update if no bbox in new im
labels['img'] = new['image']
labels['cls'] = np.array(new['class_labels'])
bboxes = np.array(new['bboxes'], dtype=np.float32)
labels['instances'].update(bboxes=bboxes)
return labels
# TODO: technically this is not an augmentation, maybe we should put this to another files
class Format:
def __init__(self,
bbox_format='xywh',
normalize=True,
return_mask=False,
return_keypoint=False,
mask_ratio=4,
mask_overlap=True,
batch_idx=True):
self.bbox_format = bbox_format
self.normalize = normalize
self.return_mask = return_mask # set False when training detection only
self.return_keypoint = return_keypoint
self.mask_ratio = mask_ratio
self.mask_overlap = mask_overlap
self.batch_idx = batch_idx # keep the batch indexes
def __call__(self, labels):
"""Return formatted image, classes, bounding boxes & keypoints to be used by 'collate_fn'."""
img = labels.pop('img')
h, w = img.shape[:2]
cls = labels.pop('cls')
instances = labels.pop('instances')
instances.convert_bbox(format=self.bbox_format)
instances.denormalize(w, h)
nl = len(instances)
if self.return_mask:
if nl:
masks, instances, cls = self._format_segments(instances, cls, w, h)
masks = torch.from_numpy(masks)
else:
masks = torch.zeros(1 if self.mask_overlap else nl, img.shape[0] // self.mask_ratio,
img.shape[1] // self.mask_ratio)
labels['masks'] = masks
if self.normalize:
instances.normalize(w, h)
labels['img'] = self._format_img(img)
labels['cls'] = torch.from_numpy(cls) if nl else torch.zeros(nl)
labels['bboxes'] = torch.from_numpy(instances.bboxes) if nl else torch.zeros((nl, 4))
if self.return_keypoint:
labels['keypoints'] = torch.from_numpy(instances.keypoints)
# Then we can use collate_fn
if self.batch_idx:
labels['batch_idx'] = torch.zeros(nl)
return labels
def _format_img(self, img):
"""Format the image for YOLOv5 from Numpy array to PyTorch tensor."""
if len(img.shape) < 3:
img = np.expand_dims(img, -1)
img = np.ascontiguousarray(img.transpose(2, 0, 1)[::-1])
img = torch.from_numpy(img)
return img
def _format_segments(self, instances, cls, w, h):
"""convert polygon points to bitmap."""
segments = instances.segments
if self.mask_overlap:
masks, sorted_idx = polygons2masks_overlap((h, w), segments, downsample_ratio=self.mask_ratio)
masks = masks[None] # (640, 640) -> (1, 640, 640)
instances = instances[sorted_idx]
cls = cls[sorted_idx]
else:
masks = polygons2masks((h, w), segments, color=1, downsample_ratio=self.mask_ratio)
return masks, instances, cls
def v8_transforms(dataset, imgsz, hyp, stretch=False):
"""Convert images to a size suitable for YOLOv8 training."""
pre_transform = Compose([
Mosaic(dataset, imgsz=imgsz, p=hyp.mosaic),
CopyPaste(p=hyp.copy_paste),
RandomPerspective(
degrees=hyp.degrees,
translate=hyp.translate,
scale=hyp.scale,
shear=hyp.shear,
perspective=hyp.perspective,
pre_transform=None if stretch else LetterBox(new_shape=(imgsz, imgsz)),
)])
flip_idx = dataset.data.get('flip_idx', []) # for keypoints augmentation
if dataset.use_keypoints:
kpt_shape = dataset.data.get('kpt_shape', None)
if len(flip_idx) == 0 and hyp.fliplr > 0.0:
hyp.fliplr = 0.0
LOGGER.warning("WARNING ⚠️ No 'flip_idx' array defined in data.yaml, setting augmentation 'fliplr=0.0'")
elif flip_idx and (len(flip_idx) != kpt_shape[0]):
raise ValueError(f'data.yaml flip_idx={flip_idx} length must be equal to kpt_shape[0]={kpt_shape[0]}')
return Compose([
pre_transform,
MixUp(dataset, pre_transform=pre_transform, p=hyp.mixup),
Albumentations(p=1.0),
RandomHSV(hgain=hyp.hsv_h, sgain=hyp.hsv_s, vgain=hyp.hsv_v),
RandomFlip(direction='vertical', p=hyp.flipud),
RandomFlip(direction='horizontal', p=hyp.fliplr, flip_idx=flip_idx)]) # transforms
# Classification augmentations -----------------------------------------------------------------------------------------
def classify_transforms(size=224, mean=(0.0, 0.0, 0.0), std=(1.0, 1.0, 1.0)): # IMAGENET_MEAN, IMAGENET_STD
# Transforms to apply if albumentations not installed
if not isinstance(size, int):
raise TypeError(f'classify_transforms() size {size} must be integer, not (list, tuple)')
if any(mean) or any(std):
return T.Compose([CenterCrop(size), ToTensor(), T.Normalize(mean, std, inplace=True)])
else:
return T.Compose([CenterCrop(size), ToTensor()])
def hsv2colorjitter(h, s, v):
"""Map HSV (hue, saturation, value) jitter into ColorJitter values (brightness, contrast, saturation, hue)"""
return v, v, s, h
def classify_albumentations(
augment=True,
size=224,
scale=(0.08, 1.0),
hflip=0.5,
vflip=0.0,
hsv_h=0.015, # image HSV-Hue augmentation (fraction)
hsv_s=0.7, # image HSV-Saturation augmentation (fraction)
hsv_v=0.4, # image HSV-Value augmentation (fraction)
mean=(0.0, 0.0, 0.0), # IMAGENET_MEAN
std=(1.0, 1.0, 1.0), # IMAGENET_STD
auto_aug=False,
):
"""YOLOv8 classification Albumentations (optional, only used if package is installed)."""
prefix = colorstr('albumentations: ')
try:
import albumentations as A
from albumentations.pytorch import ToTensorV2
check_version(A.__version__, '1.0.3', hard=True) # version requirement
if augment: # Resize and crop
T = [A.RandomResizedCrop(height=size, width=size, scale=scale)]
if auto_aug:
# TODO: implement AugMix, AutoAug & RandAug in albumentations
LOGGER.info(f'{prefix}auto augmentations are currently not supported')
else:
if hflip > 0:
T += [A.HorizontalFlip(p=hflip)]
if vflip > 0:
T += [A.VerticalFlip(p=vflip)]
if any((hsv_h, hsv_s, hsv_v)):
T += [A.ColorJitter(*hsv2colorjitter(hsv_h, hsv_s, hsv_v))] # brightness, contrast, saturation, hue
else: # Use fixed crop for eval set (reproducibility)
T = [A.SmallestMaxSize(max_size=size), A.CenterCrop(height=size, width=size)]
T += [A.Normalize(mean=mean, std=std), ToTensorV2()] # Normalize and convert to Tensor
LOGGER.info(prefix + ', '.join(f'{x}'.replace('always_apply=False, ', '') for x in T if x.p))
return A.Compose(T)
except ImportError: # package not installed, skip
pass
except Exception as e:
LOGGER.info(f'{prefix}{e}')
class ClassifyLetterBox:
"""YOLOv8 LetterBox class for image preprocessing, i.e. T.Compose([LetterBox(size), ToTensor()])"""
def __init__(self, size=(640, 640), auto=False, stride=32):
"""Resizes image and crops it to center with max dimensions 'h' and 'w'."""
super().__init__()
self.h, self.w = (size, size) if isinstance(size, int) else size
self.auto = auto # pass max size integer, automatically solve for short side using stride
self.stride = stride # used with auto
def __call__(self, im): # im = np.array HWC
imh, imw = im.shape[:2]
r = min(self.h / imh, self.w / imw) # ratio of new/old
h, w = round(imh * r), round(imw * r) # resized image
hs, ws = (math.ceil(x / self.stride) * self.stride for x in (h, w)) if self.auto else self.h, self.w
top, left = round((hs - h) / 2 - 0.1), round((ws - w) / 2 - 0.1)
im_out = np.full((self.h, self.w, 3), 114, dtype=im.dtype)
im_out[top:top + h, left:left + w] = cv2.resize(im, (w, h), interpolation=cv2.INTER_LINEAR)
return im_out
class CenterCrop:
"""YOLOv8 CenterCrop class for image preprocessing, i.e. T.Compose([CenterCrop(size), ToTensor()])"""
def __init__(self, size=640):
"""Converts an image from numpy array to PyTorch tensor."""
super().__init__()
self.h, self.w = (size, size) if isinstance(size, int) else size
def __call__(self, im): # im = np.array HWC
imh, imw = im.shape[:2]
m = min(imh, imw) # min dimension
top, left = (imh - m) // 2, (imw - m) // 2
return cv2.resize(im[top:top + m, left:left + m], (self.w, self.h), interpolation=cv2.INTER_LINEAR)
class ToTensor:
"""YOLOv8 ToTensor class for image preprocessing, i.e. T.Compose([LetterBox(size), ToTensor()])."""
def __init__(self, half=False):
"""Initialize YOLOv8 ToTensor object with optional half-precision support."""
super().__init__()
self.half = half
def __call__(self, im): # im = np.array HWC in BGR order
im = np.ascontiguousarray(im.transpose((2, 0, 1))[::-1]) # HWC to CHW -> BGR to RGB -> contiguous
im = torch.from_numpy(im) # to torch
im = im.half() if self.half else im.float() # uint8 to fp16/32
im /= 255.0 # 0-255 to 0.0-1.0
return im

View File

@ -0,0 +1,288 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import glob
import math
import os
import random
from copy import deepcopy
from multiprocessing.pool import ThreadPool
from pathlib import Path
from typing import Optional
import cv2
import numpy as np
import psutil
from torch.utils.data import Dataset
from ultralytics.utils import DEFAULT_CFG, LOCAL_RANK, LOGGER, NUM_THREADS, TQDM
from .utils import HELP_URL, IMG_FORMATS
class BaseDataset(Dataset):
"""
Base dataset class for loading and processing image data.
Args:
img_path (str): Path to the folder containing images.
imgsz (int, optional): Image size. Defaults to 640.
cache (bool, optional): Cache images to RAM or disk during training. Defaults to False.
augment (bool, optional): If True, data augmentation is applied. Defaults to True.
hyp (dict, optional): Hyperparameters to apply data augmentation. Defaults to None.
prefix (str, optional): Prefix to print in log messages. Defaults to ''.
rect (bool, optional): If True, rectangular training is used. Defaults to False.
batch_size (int, optional): Size of batches. Defaults to None.
stride (int, optional): Stride. Defaults to 32.
pad (float, optional): Padding. Defaults to 0.0.
single_cls (bool, optional): If True, single class training is used. Defaults to False.
classes (list): List of included classes. Default is None.
fraction (float): Fraction of dataset to utilize. Default is 1.0 (use all data).
Attributes:
im_files (list): List of image file paths.
labels (list): List of label data dictionaries.
ni (int): Number of images in the dataset.
ims (list): List of loaded images.
npy_files (list): List of numpy file paths.
transforms (callable): Image transformation function.
"""
def __init__(self,
img_path,
imgsz=640,
cache=False,
augment=True,
hyp=DEFAULT_CFG,
prefix='',
rect=False,
batch_size=16,
stride=32,
pad=0.5,
single_cls=False,
classes=None,
fraction=1.0):
super().__init__()
self.img_path = img_path
self.imgsz = imgsz
self.augment = augment
self.single_cls = single_cls
self.prefix = prefix
self.fraction = fraction
self.im_files = self.get_img_files(self.img_path)
self.labels = self.get_labels()
self.update_labels(include_class=classes) # single_cls and include_class
self.ni = len(self.labels) # number of images
self.rect = rect
self.batch_size = batch_size
self.stride = stride
self.pad = pad
if self.rect:
assert self.batch_size is not None
self.set_rectangle()
# Buffer thread for mosaic images
self.buffer = [] # buffer size = batch size
self.max_buffer_length = min((self.ni, self.batch_size * 8, 1000)) if self.augment else 0
# Cache stuff
if cache == 'ram' and not self.check_cache_ram():
cache = False
self.ims, self.im_hw0, self.im_hw = [None] * self.ni, [None] * self.ni, [None] * self.ni
self.npy_files = [Path(f).with_suffix('.npy') for f in self.im_files]
if cache:
self.cache_images(cache)
# Transforms
self.transforms = self.build_transforms(hyp=hyp)
def get_img_files(self, img_path):
"""Read image files."""
try:
f = [] # image files
for p in img_path if isinstance(img_path, list) else [img_path]:
p = Path(p) # os-agnostic
if p.is_dir(): # dir
f += glob.glob(str(p / '**' / '*.*'), recursive=True)
# F = list(p.rglob('*.*')) # pathlib
elif p.is_file(): # file
with open(p) as t:
t = t.read().strip().splitlines()
parent = str(p.parent) + os.sep
f += [x.replace('./', parent) if x.startswith('./') else x for x in t] # local to global path
# F += [p.parent / x.lstrip(os.sep) for x in t] # local to global path (pathlib)
else:
raise FileNotFoundError(f'{self.prefix}{p} does not exist')
im_files = sorted(x.replace('/', os.sep) for x in f if x.split('.')[-1].lower() in IMG_FORMATS)
# self.img_files = sorted([x for x in f if x.suffix[1:].lower() in IMG_FORMATS]) # pathlib
assert im_files, f'{self.prefix}No images found in {img_path}'
except Exception as e:
raise FileNotFoundError(f'{self.prefix}Error loading data from {img_path}\n{HELP_URL}') from e
if self.fraction < 1:
im_files = im_files[:round(len(im_files) * self.fraction)]
return im_files
def update_labels(self, include_class: Optional[list]):
"""include_class, filter labels to include only these classes (optional)."""
include_class_array = np.array(include_class).reshape(1, -1)
for i in range(len(self.labels)):
if include_class is not None:
cls = self.labels[i]['cls']
bboxes = self.labels[i]['bboxes']
segments = self.labels[i]['segments']
keypoints = self.labels[i]['keypoints']
j = (cls == include_class_array).any(1)
self.labels[i]['cls'] = cls[j]
self.labels[i]['bboxes'] = bboxes[j]
if segments:
self.labels[i]['segments'] = [segments[si] for si, idx in enumerate(j) if idx]
if keypoints is not None:
self.labels[i]['keypoints'] = keypoints[j]
if self.single_cls:
self.labels[i]['cls'][:, 0] = 0
def load_image(self, i, rect_mode=True):
"""Loads 1 image from dataset index 'i', returns (im, resized hw)."""
im, f, fn = self.ims[i], self.im_files[i], self.npy_files[i]
if im is None: # not cached in RAM
if fn.exists(): # load npy
im = np.load(fn)
else: # read image
im = cv2.imread(f) # BGR
if im is None:
raise FileNotFoundError(f'Image Not Found {f}')
h0, w0 = im.shape[:2] # orig hw
if rect_mode: # resize long side to imgsz while maintaining aspect ratio
r = self.imgsz / max(h0, w0) # ratio
if r != 1: # if sizes are not equal
w, h = (min(math.ceil(w0 * r), self.imgsz), min(math.ceil(h0 * r), self.imgsz))
im = cv2.resize(im, (w, h), interpolation=cv2.INTER_LINEAR)
elif not (h0 == w0 == self.imgsz): # resize by stretching image to square imgsz
im = cv2.resize(im, (self.imgsz, self.imgsz), interpolation=cv2.INTER_LINEAR)
# Add to buffer if training with augmentations
if self.augment:
self.ims[i], self.im_hw0[i], self.im_hw[i] = im, (h0, w0), im.shape[:2] # im, hw_original, hw_resized
self.buffer.append(i)
if len(self.buffer) >= self.max_buffer_length:
j = self.buffer.pop(0)
self.ims[j], self.im_hw0[j], self.im_hw[j] = None, None, None
return im, (h0, w0), im.shape[:2]
return self.ims[i], self.im_hw0[i], self.im_hw[i]
def cache_images(self, cache):
"""Cache images to memory or disk."""
b, gb = 0, 1 << 30 # bytes of cached images, bytes per gigabytes
fcn = self.cache_images_to_disk if cache == 'disk' else self.load_image
with ThreadPool(NUM_THREADS) as pool:
results = pool.imap(fcn, range(self.ni))
pbar = TQDM(enumerate(results), total=self.ni, disable=LOCAL_RANK > 0)
for i, x in pbar:
if cache == 'disk':
b += self.npy_files[i].stat().st_size
else: # 'ram'
self.ims[i], self.im_hw0[i], self.im_hw[i] = x # im, hw_orig, hw_resized = load_image(self, i)
b += self.ims[i].nbytes
pbar.desc = f'{self.prefix}Caching images ({b / gb:.1f}GB {cache})'
pbar.close()
def cache_images_to_disk(self, i):
"""Saves an image as an *.npy file for faster loading."""
f = self.npy_files[i]
if not f.exists():
np.save(f.as_posix(), cv2.imread(self.im_files[i]), allow_pickle=False)
def check_cache_ram(self, safety_margin=0.5):
"""Check image caching requirements vs available memory."""
b, gb = 0, 1 << 30 # bytes of cached images, bytes per gigabytes
n = min(self.ni, 30) # extrapolate from 30 random images
for _ in range(n):
im = cv2.imread(random.choice(self.im_files)) # sample image
ratio = self.imgsz / max(im.shape[0], im.shape[1]) # max(h, w) # ratio
b += im.nbytes * ratio ** 2
mem_required = b * self.ni / n * (1 + safety_margin) # GB required to cache dataset into RAM
mem = psutil.virtual_memory()
cache = mem_required < mem.available # to cache or not to cache, that is the question
if not cache:
LOGGER.info(f'{self.prefix}{mem_required / gb:.1f}GB RAM required to cache images '
f'with {int(safety_margin * 100)}% safety margin but only '
f'{mem.available / gb:.1f}/{mem.total / gb:.1f}GB available, '
f"{'caching images ✅' if cache else 'not caching images ⚠️'}")
return cache
def set_rectangle(self):
"""Sets the shape of bounding boxes for YOLO detections as rectangles."""
bi = np.floor(np.arange(self.ni) / self.batch_size).astype(int) # batch index
nb = bi[-1] + 1 # number of batches
s = np.array([x.pop('shape') for x in self.labels]) # hw
ar = s[:, 0] / s[:, 1] # aspect ratio
irect = ar.argsort()
self.im_files = [self.im_files[i] for i in irect]
self.labels = [self.labels[i] for i in irect]
ar = ar[irect]
# Set training image shapes
shapes = [[1, 1]] * nb
for i in range(nb):
ari = ar[bi == i]
mini, maxi = ari.min(), ari.max()
if maxi < 1:
shapes[i] = [maxi, 1]
elif mini > 1:
shapes[i] = [1, 1 / mini]
self.batch_shapes = np.ceil(np.array(shapes) * self.imgsz / self.stride + self.pad).astype(int) * self.stride
self.batch = bi # batch index of image
def __getitem__(self, index):
"""Returns transformed label information for given index."""
return self.transforms(self.get_image_and_label(index))
def get_image_and_label(self, index):
"""Get and return label information from the dataset."""
label = deepcopy(self.labels[index]) # requires deepcopy() https://github.com/ultralytics/ultralytics/pull/1948
label.pop('shape', None) # shape is for rect, remove it
label['img'], label['ori_shape'], label['resized_shape'] = self.load_image(index)
label['ratio_pad'] = (label['resized_shape'][0] / label['ori_shape'][0],
label['resized_shape'][1] / label['ori_shape'][1]) # for evaluation
if self.rect:
label['rect_shape'] = self.batch_shapes[self.batch[index]]
return self.update_labels_info(label)
def __len__(self):
"""Returns the length of the labels list for the dataset."""
return len(self.labels)
def update_labels_info(self, label):
"""custom your label format here."""
return label
def build_transforms(self, hyp=None):
"""Users can custom augmentations here
like:
if self.augment:
# Training transforms
return Compose([])
else:
# Val transforms
return Compose([])
"""
raise NotImplementedError
def get_labels(self):
"""Users can custom their own format here.
Make sure your output is a list with each element like below:
dict(
im_file=im_file,
shape=shape, # format: (height, width)
cls=cls,
bboxes=bboxes, # xywh
segments=segments, # xy
keypoints=keypoints, # xy
normalized=True, # or False
bbox_format="xyxy", # or xywh, ltwh
)
"""
raise NotImplementedError

View File

@ -0,0 +1,171 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import os
import random
from pathlib import Path
import numpy as np
import torch
from PIL import Image
from torch.utils.data import dataloader, distributed
from ultralytics.data.loaders import (LOADERS, LoadImages, LoadPilAndNumpy, LoadScreenshots, LoadStreams, LoadTensor,
SourceTypes, autocast_list)
from ultralytics.data.utils import IMG_FORMATS, VID_FORMATS
from ultralytics.utils import RANK, colorstr
from ultralytics.utils.checks import check_file
from .dataset import YOLODataset
from .utils import PIN_MEMORY
class InfiniteDataLoader(dataloader.DataLoader):
"""Dataloader that reuses workers. Uses same syntax as vanilla DataLoader."""
def __init__(self, *args, **kwargs):
"""Dataloader that infinitely recycles workers, inherits from DataLoader."""
super().__init__(*args, **kwargs)
object.__setattr__(self, 'batch_sampler', _RepeatSampler(self.batch_sampler))
self.iterator = super().__iter__()
def __len__(self):
"""Returns the length of the batch sampler's sampler."""
return len(self.batch_sampler.sampler)
def __iter__(self):
"""Creates a sampler that repeats indefinitely."""
for _ in range(len(self)):
yield next(self.iterator)
def reset(self):
"""Reset iterator.
This is useful when we want to modify settings of dataset while training.
"""
self.iterator = self._get_iterator()
class _RepeatSampler:
"""
Sampler that repeats forever.
Args:
sampler (Dataset.sampler): The sampler to repeat.
"""
def __init__(self, sampler):
"""Initializes an object that repeats a given sampler indefinitely."""
self.sampler = sampler
def __iter__(self):
"""Iterates over the 'sampler' and yields its contents."""
while True:
yield from iter(self.sampler)
def seed_worker(worker_id): # noqa
"""Set dataloader worker seed https://pytorch.org/docs/stable/notes/randomness.html#dataloader."""
worker_seed = torch.initial_seed() % 2 ** 32
np.random.seed(worker_seed)
random.seed(worker_seed)
def build_yolo_dataset(cfg, img_path, batch, data, mode='train', rect=False, stride=32):
"""Build YOLO Dataset"""
return YOLODataset(
img_path=img_path,
imgsz=cfg.imgsz,
batch_size=batch,
augment=mode == 'train', # augmentation
hyp=cfg, # TODO: probably add a get_hyps_from_cfg function
rect=cfg.rect or rect, # rectangular batches
cache=cfg.cache or None,
single_cls=cfg.single_cls or False,
stride=int(stride),
pad=0.0 if mode == 'train' else 0.5,
prefix=colorstr(f'{mode}: '),
use_segments=cfg.task == 'segment',
use_keypoints=cfg.task == 'pose',
classes=cfg.classes,
data=data,
fraction=cfg.fraction if mode == 'train' else 1.0)
def build_dataloader(dataset, batch, workers, shuffle=True, rank=-1):
"""Return an InfiniteDataLoader or DataLoader for training or validation set."""
batch = min(batch, len(dataset))
nd = torch.cuda.device_count() # number of CUDA devices
nw = min([os.cpu_count() // max(nd, 1), batch if batch > 1 else 0, workers]) # number of workers
sampler = None if rank == -1 else distributed.DistributedSampler(dataset, shuffle=shuffle)
generator = torch.Generator()
generator.manual_seed(6148914691236517205 + RANK)
return InfiniteDataLoader(dataset=dataset,
batch_size=batch,
shuffle=shuffle and sampler is None,
num_workers=nw,
sampler=sampler,
pin_memory=PIN_MEMORY,
collate_fn=getattr(dataset, 'collate_fn', None),
worker_init_fn=seed_worker,
generator=generator)
def check_source(source):
"""Check source type and return corresponding flag values."""
webcam, screenshot, from_img, in_memory, tensor = False, False, False, False, False
if isinstance(source, (str, int, Path)): # int for local usb camera
source = str(source)
is_file = Path(source).suffix[1:] in (IMG_FORMATS + VID_FORMATS)
is_url = source.lower().startswith(('https://', 'http://', 'rtsp://', 'rtmp://'))
webcam = source.isnumeric() or source.endswith('.streams') or (is_url and not is_file)
screenshot = source.lower() == 'screen'
if is_url and is_file:
source = check_file(source) # download
elif isinstance(source, LOADERS):
in_memory = True
elif isinstance(source, (list, tuple)):
source = autocast_list(source) # convert all list elements to PIL or np arrays
from_img = True
elif isinstance(source, (Image.Image, np.ndarray)):
from_img = True
elif isinstance(source, torch.Tensor):
tensor = True
else:
raise TypeError('Unsupported image type. For supported types see https://docs.ultralytics.com/modes/predict')
return source, webcam, screenshot, from_img, in_memory, tensor
def load_inference_source(source=None, imgsz=640, vid_stride=1, stream_buffer=False):
"""
Loads an inference source for object detection and applies necessary transformations.
Args:
source (str, Path, Tensor, PIL.Image, np.ndarray): The input source for inference.
imgsz (int, optional): The size of the image for inference. Default is 640.
vid_stride (int, optional): The frame interval for video sources. Default is 1.
stream_buffer (bool, optional): Determined whether stream frames will be buffered. Default is False.
Returns:
dataset (Dataset): A dataset object for the specified input source.
"""
source, webcam, screenshot, from_img, in_memory, tensor = check_source(source)
source_type = source.source_type if in_memory else SourceTypes(webcam, screenshot, from_img, tensor)
# Dataloader
if tensor:
dataset = LoadTensor(source)
elif in_memory:
dataset = source
elif webcam:
dataset = LoadStreams(source, imgsz=imgsz, vid_stride=vid_stride, stream_buffer=stream_buffer)
elif screenshot:
dataset = LoadScreenshots(source, imgsz=imgsz)
elif from_img:
dataset = LoadPilAndNumpy(source, imgsz=imgsz)
else:
dataset = LoadImages(source, imgsz=imgsz, vid_stride=vid_stride)
# Attach source types to the dataset
setattr(dataset, 'source_type', source_type)
return dataset

View File

@ -0,0 +1,298 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import json
import shutil
from collections import defaultdict
from pathlib import Path
import cv2
import numpy as np
from ultralytics.utils import TQDM
def coco91_to_coco80_class():
"""Converts 91-index COCO class IDs to 80-index COCO class IDs.
Returns:
(list): A list of 91 class IDs where the index represents the 80-index class ID and the value is the
corresponding 91-index class ID.
"""
return [
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, None, 24, 25, None,
None, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, None, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
51, 52, 53, 54, 55, 56, 57, 58, 59, None, 60, None, None, 61, None, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
None, 73, 74, 75, 76, 77, 78, 79, None]
def coco80_to_coco91_class(): #
"""
Converts 80-index (val2014) to 91-index (paper).
For details see https://tech.amikelive.com/node-718/what-object-categories-labels-are-in-coco-dataset/.
Example:
```python
import numpy as np
a = np.loadtxt('data/coco.names', dtype='str', delimiter='\n')
b = np.loadtxt('data/coco_paper.names', dtype='str', delimiter='\n')
x1 = [list(a[i] == b).index(True) + 1 for i in range(80)] # darknet to coco
x2 = [list(b[i] == a).index(True) if any(b[i] == a) else None for i in range(91)] # coco to darknet
```
"""
return [
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34,
35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]
def convert_coco(labels_dir='../coco/annotations/', use_segments=False, use_keypoints=False, cls91to80=True):
"""Converts COCO dataset annotations to a format suitable for training YOLOv5 models.
Args:
labels_dir (str, optional): Path to directory containing COCO dataset annotation files.
use_segments (bool, optional): Whether to include segmentation masks in the output.
use_keypoints (bool, optional): Whether to include keypoint annotations in the output.
cls91to80 (bool, optional): Whether to map 91 COCO class IDs to the corresponding 80 COCO class IDs.
Example:
```python
from ultralytics.data.converter import convert_coco
convert_coco('../datasets/coco/annotations/', use_segments=True, use_keypoints=False, cls91to80=True)
```
Output:
Generates output files in the specified output directory.
"""
# Create dataset directory
save_dir = Path('yolo_labels')
if save_dir.exists():
shutil.rmtree(save_dir) # delete dir
for p in save_dir / 'labels', save_dir / 'images':
p.mkdir(parents=True, exist_ok=True) # make dir
# Convert classes
coco80 = coco91_to_coco80_class()
# Import json
for json_file in sorted(Path(labels_dir).resolve().glob('*.json')):
fn = Path(save_dir) / 'labels' / json_file.stem.replace('instances_', '') # folder name
fn.mkdir(parents=True, exist_ok=True)
with open(json_file) as f:
data = json.load(f)
# Create image dict
images = {f'{x["id"]:d}': x for x in data['images']}
# Create image-annotations dict
imgToAnns = defaultdict(list)
for ann in data['annotations']:
imgToAnns[ann['image_id']].append(ann)
# Write labels file
for img_id, anns in TQDM(imgToAnns.items(), desc=f'Annotations {json_file}'):
img = images[f'{img_id:d}']
h, w, f = img['height'], img['width'], img['file_name']
bboxes = []
segments = []
keypoints = []
for ann in anns:
if ann['iscrowd']:
continue
# The COCO box format is [top left x, top left y, width, height]
box = np.array(ann['bbox'], dtype=np.float64)
box[:2] += box[2:] / 2 # xy top-left corner to center
box[[0, 2]] /= w # normalize x
box[[1, 3]] /= h # normalize y
if box[2] <= 0 or box[3] <= 0: # if w <= 0 and h <= 0
continue
cls = coco80[ann['category_id'] - 1] if cls91to80 else ann['category_id'] - 1 # class
box = [cls] + box.tolist()
if box not in bboxes:
bboxes.append(box)
if use_segments and ann.get('segmentation') is not None:
if len(ann['segmentation']) == 0:
segments.append([])
continue
elif len(ann['segmentation']) > 1:
s = merge_multi_segment(ann['segmentation'])
s = (np.concatenate(s, axis=0) / np.array([w, h])).reshape(-1).tolist()
else:
s = [j for i in ann['segmentation'] for j in i] # all segments concatenated
s = (np.array(s).reshape(-1, 2) / np.array([w, h])).reshape(-1).tolist()
s = [cls] + s
if s not in segments:
segments.append(s)
if use_keypoints and ann.get('keypoints') is not None:
keypoints.append(box + (np.array(ann['keypoints']).reshape(-1, 3) /
np.array([w, h, 1])).reshape(-1).tolist())
# Write
with open((fn / f).with_suffix('.txt'), 'a') as file:
for i in range(len(bboxes)):
if use_keypoints:
line = *(keypoints[i]), # cls, box, keypoints
else:
line = *(segments[i]
if use_segments and len(segments[i]) > 0 else bboxes[i]), # cls, box or segments
file.write(('%g ' * len(line)).rstrip() % line + '\n')
def convert_dota_to_yolo_obb(dota_root_path: str):
"""
Converts DOTA dataset annotations to YOLO OBB (Oriented Bounding Box) format.
The function processes images in the 'train' and 'val' folders of the DOTA dataset. For each image, it reads the
associated label from the original labels directory and writes new labels in YOLO OBB format to a new directory.
Args:
dota_root_path (str): The root directory path of the DOTA dataset.
Example:
```python
from ultralytics.data.converter import convert_dota_to_yolo_obb
convert_dota_to_yolo_obb('path/to/DOTA')
```
Notes:
The directory structure assumed for the DOTA dataset:
- DOTA
- images
- train
- val
- labels
- train_original
- val_original
After the function execution, the new labels will be saved in:
- DOTA
- labels
- train
- val
"""
dota_root_path = Path(dota_root_path)
# Class names to indices mapping
class_mapping = {
'plane': 0,
'ship': 1,
'storage-tank': 2,
'baseball-diamond': 3,
'tennis-court': 4,
'basketball-court': 5,
'ground-track-field': 6,
'harbor': 7,
'bridge': 8,
'large-vehicle': 9,
'small-vehicle': 10,
'helicopter': 11,
'roundabout': 12,
'soccer ball-field': 13,
'swimming-pool': 14,
'container-crane': 15,
'airport': 16,
'helipad': 17}
def convert_label(image_name, image_width, image_height, orig_label_dir, save_dir):
orig_label_path = orig_label_dir / f'{image_name}.txt'
save_path = save_dir / f'{image_name}.txt'
with orig_label_path.open('r') as f, save_path.open('w') as g:
lines = f.readlines()
for line in lines:
parts = line.strip().split()
if len(parts) < 9:
continue
class_name = parts[8]
class_idx = class_mapping[class_name]
coords = [float(p) for p in parts[:8]]
normalized_coords = [
coords[i] / image_width if i % 2 == 0 else coords[i] / image_height for i in range(8)]
formatted_coords = ['{:.6g}'.format(coord) for coord in normalized_coords]
g.write(f"{class_idx} {' '.join(formatted_coords)}\n")
for phase in ['train', 'val']:
image_dir = dota_root_path / 'images' / phase
orig_label_dir = dota_root_path / 'labels' / f'{phase}_original'
save_dir = dota_root_path / 'labels' / phase
save_dir.mkdir(parents=True, exist_ok=True)
image_paths = list(image_dir.iterdir())
for image_path in TQDM(image_paths, desc=f'Processing {phase} images'):
if image_path.suffix != '.png':
continue
image_name_without_ext = image_path.stem
img = cv2.imread(str(image_path))
h, w = img.shape[:2]
convert_label(image_name_without_ext, w, h, orig_label_dir, save_dir)
def min_index(arr1, arr2):
"""
Find a pair of indexes with the shortest distance between two arrays of 2D points.
Args:
arr1 (np.array): A NumPy array of shape (N, 2) representing N 2D points.
arr2 (np.array): A NumPy array of shape (M, 2) representing M 2D points.
Returns:
(tuple): A tuple containing the indexes of the points with the shortest distance in arr1 and arr2 respectively.
"""
dis = ((arr1[:, None, :] - arr2[None, :, :]) ** 2).sum(-1)
return np.unravel_index(np.argmin(dis, axis=None), dis.shape)
def merge_multi_segment(segments):
"""
Merge multiple segments into one list by connecting the coordinates with the minimum distance between each segment.
This function connects these coordinates with a thin line to merge all segments into one.
Args:
segments (List[List]): Original segmentations in COCO's JSON file.
Each element is a list of coordinates, like [segmentation1, segmentation2,...].
Returns:
s (List[np.ndarray]): A list of connected segments represented as NumPy arrays.
"""
s = []
segments = [np.array(i).reshape(-1, 2) for i in segments]
idx_list = [[] for _ in range(len(segments))]
# record the indexes with min distance between each segment
for i in range(1, len(segments)):
idx1, idx2 = min_index(segments[i - 1], segments[i])
idx_list[i - 1].append(idx1)
idx_list[i].append(idx2)
# use two round to connect all the segments
for k in range(2):
# forward connection
if k == 0:
for i, idx in enumerate(idx_list):
# middle segments have two indexes
# reverse the index of middle segments
if len(idx) == 2 and idx[0] > idx[1]:
idx = idx[::-1]
segments[i] = segments[i][::-1, :]
segments[i] = np.roll(segments[i], -idx[0], axis=0)
segments[i] = np.concatenate([segments[i], segments[i][:1]])
# deal with the first segment and the last one
if i in [0, len(idx_list) - 1]:
s.append(segments[i])
else:
idx = [0, idx[1] - idx[0]]
s.append(segments[i][idx[0]:idx[1] + 1])
else:
for i in range(len(idx_list) - 1, -1, -1):
if i not in [0, len(idx_list) - 1]:
idx = idx_list[i]
nidx = abs(idx[1] - idx[0])
s.append(segments[i][nidx:])
return s

View File

@ -0,0 +1,326 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import contextlib
from itertools import repeat
from multiprocessing.pool import ThreadPool
from pathlib import Path
import cv2
import numpy as np
import torch
import torchvision
from ultralytics.utils import LOCAL_RANK, NUM_THREADS, TQDM, colorstr, is_dir_writeable
from .augment import Compose, Format, Instances, LetterBox, classify_albumentations, classify_transforms, v8_transforms
from .base import BaseDataset
from .utils import HELP_URL, LOGGER, get_hash, img2label_paths, verify_image, verify_image_label
# Ultralytics dataset *.cache version, >= 1.0.0 for YOLOv8
DATASET_CACHE_VERSION = '1.0.3'
class YOLODataset(BaseDataset):
"""
Dataset class for loading object detection and/or segmentation labels in YOLO format.
Args:
data (dict, optional): A dataset YAML dictionary. Defaults to None.
use_segments (bool, optional): If True, segmentation masks are used as labels. Defaults to False.
use_keypoints (bool, optional): If True, keypoints are used as labels. Defaults to False.
Returns:
(torch.utils.data.Dataset): A PyTorch dataset object that can be used for training an object detection model.
"""
def __init__(self, *args, data=None, use_segments=False, use_keypoints=False, **kwargs):
self.use_segments = use_segments
self.use_keypoints = use_keypoints
self.data = data
assert not (self.use_segments and self.use_keypoints), 'Can not use both segments and keypoints.'
super().__init__(*args, **kwargs)
def cache_labels(self, path=Path('./labels.cache')):
"""Cache dataset labels, check images and read shapes.
Args:
path (Path): path where to save the cache file (default: Path('./labels.cache')).
Returns:
(dict): labels.
"""
x = {'labels': []}
nm, nf, ne, nc, msgs = 0, 0, 0, 0, [] # number missing, found, empty, corrupt, messages
desc = f'{self.prefix}Scanning {path.parent / path.stem}...'
total = len(self.im_files)
nkpt, ndim = self.data.get('kpt_shape', (0, 0))
if self.use_keypoints and (nkpt <= 0 or ndim not in (2, 3)):
raise ValueError("'kpt_shape' in data.yaml missing or incorrect. Should be a list with [number of "
"keypoints, number of dims (2 for x,y or 3 for x,y,visible)], i.e. 'kpt_shape: [17, 3]'")
with ThreadPool(NUM_THREADS) as pool:
results = pool.imap(func=verify_image_label,
iterable=zip(self.im_files, self.label_files, repeat(self.prefix),
repeat(self.use_keypoints), repeat(len(self.data['names'])), repeat(nkpt),
repeat(ndim)))
pbar = TQDM(results, desc=desc, total=total)
for im_file, lb, shape, segments, keypoint, nm_f, nf_f, ne_f, nc_f, msg in pbar:
nm += nm_f
nf += nf_f
ne += ne_f
nc += nc_f
if im_file:
x['labels'].append(
dict(
im_file=im_file,
shape=shape,
cls=lb[:, 0:1], # n, 1
bboxes=lb[:, 1:], # n, 4
segments=segments,
keypoints=keypoint,
normalized=True,
bbox_format='xywh'))
if msg:
msgs.append(msg)
pbar.desc = f'{desc} {nf} images, {nm + ne} backgrounds, {nc} corrupt'
pbar.close()
if msgs:
LOGGER.info('\n'.join(msgs))
if nf == 0:
LOGGER.warning(f'{self.prefix}WARNING ⚠️ No labels found in {path}. {HELP_URL}')
x['hash'] = get_hash(self.label_files + self.im_files)
x['results'] = nf, nm, ne, nc, len(self.im_files)
x['msgs'] = msgs # warnings
save_dataset_cache_file(self.prefix, path, x)
return x
def get_labels(self):
"""Returns dictionary of labels for YOLO training."""
self.label_files = img2label_paths(self.im_files)
cache_path = Path(self.label_files[0]).parent.with_suffix('.cache')
try:
cache, exists = load_dataset_cache_file(cache_path), True # attempt to load a *.cache file
assert cache['version'] == DATASET_CACHE_VERSION # matches current version
assert cache['hash'] == get_hash(self.label_files + self.im_files) # identical hash
except (FileNotFoundError, AssertionError, AttributeError):
cache, exists = self.cache_labels(cache_path), False # run cache ops
# Display cache
nf, nm, ne, nc, n = cache.pop('results') # found, missing, empty, corrupt, total
if exists and LOCAL_RANK in (-1, 0):
d = f'Scanning {cache_path}... {nf} images, {nm + ne} backgrounds, {nc} corrupt'
TQDM(None, desc=self.prefix + d, total=n, initial=n) # display results
if cache['msgs']:
LOGGER.info('\n'.join(cache['msgs'])) # display warnings
# Read cache
[cache.pop(k) for k in ('hash', 'version', 'msgs')] # remove items
labels = cache['labels']
if not labels:
LOGGER.warning(f'WARNING ⚠️ No images found in {cache_path}, training may not work correctly. {HELP_URL}')
self.im_files = [lb['im_file'] for lb in labels] # update im_files
# Check if the dataset is all boxes or all segments
lengths = ((len(lb['cls']), len(lb['bboxes']), len(lb['segments'])) for lb in labels)
len_cls, len_boxes, len_segments = (sum(x) for x in zip(*lengths))
if len_segments and len_boxes != len_segments:
LOGGER.warning(
f'WARNING ⚠️ Box and segment counts should be equal, but got len(segments) = {len_segments}, '
f'len(boxes) = {len_boxes}. To resolve this only boxes will be used and all segments will be removed. '
'To avoid this please supply either a detect or segment dataset, not a detect-segment mixed dataset.')
for lb in labels:
lb['segments'] = []
if len_cls == 0:
LOGGER.warning(f'WARNING ⚠️ No labels found in {cache_path}, training may not work correctly. {HELP_URL}')
return labels
def build_transforms(self, hyp=None):
"""Builds and appends transforms to the list."""
if self.augment:
hyp.mosaic = hyp.mosaic if self.augment and not self.rect else 0.0
hyp.mixup = hyp.mixup if self.augment and not self.rect else 0.0
transforms = v8_transforms(self, self.imgsz, hyp)
else:
transforms = Compose([LetterBox(new_shape=(self.imgsz, self.imgsz), scaleup=False)])
transforms.append(
Format(bbox_format='xywh',
normalize=True,
return_mask=self.use_segments,
return_keypoint=self.use_keypoints,
batch_idx=True,
mask_ratio=hyp.mask_ratio,
mask_overlap=hyp.overlap_mask))
return transforms
def close_mosaic(self, hyp):
"""Sets mosaic, copy_paste and mixup options to 0.0 and builds transformations."""
hyp.mosaic = 0.0 # set mosaic ratio=0.0
hyp.copy_paste = 0.0 # keep the same behavior as previous v8 close-mosaic
hyp.mixup = 0.0 # keep the same behavior as previous v8 close-mosaic
self.transforms = self.build_transforms(hyp)
def update_labels_info(self, label):
"""custom your label format here."""
# NOTE: cls is not with bboxes now, classification and semantic segmentation need an independent cls label
# we can make it also support classification and semantic segmentation by add or remove some dict keys there.
bboxes = label.pop('bboxes')
segments = label.pop('segments')
keypoints = label.pop('keypoints', None)
bbox_format = label.pop('bbox_format')
normalized = label.pop('normalized')
label['instances'] = Instances(bboxes, segments, keypoints, bbox_format=bbox_format, normalized=normalized)
return label
@staticmethod
def collate_fn(batch):
"""Collates data samples into batches."""
new_batch = {}
keys = batch[0].keys()
values = list(zip(*[list(b.values()) for b in batch]))
for i, k in enumerate(keys):
value = values[i]
if k == 'img':
value = torch.stack(value, 0)
if k in ['masks', 'keypoints', 'bboxes', 'cls']:
value = torch.cat(value, 0)
new_batch[k] = value
new_batch['batch_idx'] = list(new_batch['batch_idx'])
for i in range(len(new_batch['batch_idx'])):
new_batch['batch_idx'][i] += i # add target image index for build_targets()
new_batch['batch_idx'] = torch.cat(new_batch['batch_idx'], 0)
return new_batch
# Classification dataloaders -------------------------------------------------------------------------------------------
class ClassificationDataset(torchvision.datasets.ImageFolder):
"""
YOLO Classification Dataset.
Args:
root (str): Dataset path.
Attributes:
cache_ram (bool): True if images should be cached in RAM, False otherwise.
cache_disk (bool): True if images should be cached on disk, False otherwise.
samples (list): List of samples containing file, index, npy, and im.
torch_transforms (callable): torchvision transforms applied to the dataset.
album_transforms (callable, optional): Albumentations transforms applied to the dataset if augment is True.
"""
def __init__(self, root, args, augment=False, cache=False, prefix=''):
"""
Initialize YOLO object with root, image size, augmentations, and cache settings.
Args:
root (str): Dataset path.
args (Namespace): Argument parser containing dataset related settings.
augment (bool, optional): True if dataset should be augmented, False otherwise. Defaults to False.
cache (bool | str | optional): Cache setting, can be True, False, 'ram' or 'disk'. Defaults to False.
"""
super().__init__(root=root)
if augment and args.fraction < 1.0: # reduce training fraction
self.samples = self.samples[:round(len(self.samples) * args.fraction)]
self.prefix = colorstr(f'{prefix}: ') if prefix else ''
self.cache_ram = cache is True or cache == 'ram'
self.cache_disk = cache == 'disk'
self.samples = self.verify_images() # filter out bad images
self.samples = [list(x) + [Path(x[0]).with_suffix('.npy'), None] for x in self.samples] # file, index, npy, im
self.torch_transforms = classify_transforms(args.imgsz)
self.album_transforms = classify_albumentations(
augment=augment,
size=args.imgsz,
scale=(1.0 - args.scale, 1.0), # (0.08, 1.0)
hflip=args.fliplr,
vflip=args.flipud,
hsv_h=args.hsv_h, # HSV-Hue augmentation (fraction)
hsv_s=args.hsv_s, # HSV-Saturation augmentation (fraction)
hsv_v=args.hsv_v, # HSV-Value augmentation (fraction)
mean=(0.0, 0.0, 0.0), # IMAGENET_MEAN
std=(1.0, 1.0, 1.0), # IMAGENET_STD
auto_aug=False) if augment else None
def __getitem__(self, i):
"""Returns subset of data and targets corresponding to given indices."""
f, j, fn, im = self.samples[i] # filename, index, filename.with_suffix('.npy'), image
if self.cache_ram and im is None:
im = self.samples[i][3] = cv2.imread(f)
elif self.cache_disk:
if not fn.exists(): # load npy
np.save(fn.as_posix(), cv2.imread(f), allow_pickle=False)
im = np.load(fn)
else: # read image
im = cv2.imread(f) # BGR
if self.album_transforms:
sample = self.album_transforms(image=cv2.cvtColor(im, cv2.COLOR_BGR2RGB))['image']
else:
sample = self.torch_transforms(im)
return {'img': sample, 'cls': j}
def __len__(self) -> int:
return len(self.samples)
def verify_images(self):
"""Verify all images in dataset."""
desc = f'{self.prefix}Scanning {self.root}...'
path = Path(self.root).with_suffix('.cache') # *.cache file path
with contextlib.suppress(FileNotFoundError, AssertionError, AttributeError):
cache = load_dataset_cache_file(path) # attempt to load a *.cache file
assert cache['version'] == DATASET_CACHE_VERSION # matches current version
assert cache['hash'] == get_hash([x[0] for x in self.samples]) # identical hash
nf, nc, n, samples = cache.pop('results') # found, missing, empty, corrupt, total
if LOCAL_RANK in (-1, 0):
d = f'{desc} {nf} images, {nc} corrupt'
TQDM(None, desc=d, total=n, initial=n)
if cache['msgs']:
LOGGER.info('\n'.join(cache['msgs'])) # display warnings
return samples
# Run scan if *.cache retrieval failed
nf, nc, msgs, samples, x = 0, 0, [], [], {}
with ThreadPool(NUM_THREADS) as pool:
results = pool.imap(func=verify_image, iterable=zip(self.samples, repeat(self.prefix)))
pbar = TQDM(results, desc=desc, total=len(self.samples))
for sample, nf_f, nc_f, msg in pbar:
if nf_f:
samples.append(sample)
if msg:
msgs.append(msg)
nf += nf_f
nc += nc_f
pbar.desc = f'{desc} {nf} images, {nc} corrupt'
pbar.close()
if msgs:
LOGGER.info('\n'.join(msgs))
x['hash'] = get_hash([x[0] for x in self.samples])
x['results'] = nf, nc, len(samples), samples
x['msgs'] = msgs # warnings
save_dataset_cache_file(self.prefix, path, x)
return samples
def load_dataset_cache_file(path):
"""Load an Ultralytics *.cache dictionary from path."""
import gc
gc.disable() # reduce pickle load time https://github.com/ultralytics/ultralytics/pull/1585
cache = np.load(str(path), allow_pickle=True).item() # load dict
gc.enable()
return cache
def save_dataset_cache_file(prefix, path, x):
"""Save an Ultralytics dataset *.cache dictionary x to path."""
x['version'] = DATASET_CACHE_VERSION # add cache version
if is_dir_writeable(path.parent):
if path.exists():
path.unlink() # remove *.cache file if exists
np.save(str(path), x) # save cache for next time
path.with_suffix('.cache.npy').rename(path) # remove .npy suffix
LOGGER.info(f'{prefix}New cache created: {path}')
else:
LOGGER.warning(f'{prefix}WARNING ⚠️ Cache directory {path.parent} is not writeable, cache not saved.')
# TODO: support semantic segmentation
class SemanticDataset(BaseDataset):
def __init__(self):
"""Initialize a SemanticDataset object."""
super().__init__()

View File

@ -0,0 +1,412 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import glob
import math
import os
import time
from dataclasses import dataclass
from pathlib import Path
from threading import Thread
from urllib.parse import urlparse
import cv2
import numpy as np
import requests
import torch
from PIL import Image
from ultralytics.data.utils import IMG_FORMATS, VID_FORMATS
from ultralytics.utils import LOGGER, is_colab, is_kaggle, ops
from ultralytics.utils.checks import check_requirements
@dataclass
class SourceTypes:
webcam: bool = False
screenshot: bool = False
from_img: bool = False
tensor: bool = False
class LoadStreams:
"""YOLOv8 streamloader, i.e. `yolo predict source='rtsp://example.com/media.mp4' # RTSP, RTMP, HTTP streams`."""
def __init__(self, sources='file.streams', imgsz=640, vid_stride=1, stream_buffer=False):
"""Initialize instance variables and check for consistent input stream shapes."""
torch.backends.cudnn.benchmark = True # faster for fixed-size inference
self.stream_buffer = stream_buffer # buffer input streams
self.running = True # running flag for Thread
self.mode = 'stream'
self.imgsz = imgsz
self.vid_stride = vid_stride # video frame-rate stride
sources = Path(sources).read_text().rsplit() if os.path.isfile(sources) else [sources]
n = len(sources)
self.sources = [ops.clean_str(x) for x in sources] # clean source names for later
self.imgs, self.fps, self.frames, self.threads, self.shape = [[]] * n, [0] * n, [0] * n, [None] * n, [None] * n
self.caps = [None] * n # video capture objects
for i, s in enumerate(sources): # index, source
# Start thread to read frames from video stream
st = f'{i + 1}/{n}: {s}... '
if urlparse(s).hostname in ('www.youtube.com', 'youtube.com', 'youtu.be'): # if source is YouTube video
# YouTube format i.e. 'https://www.youtube.com/watch?v=Zgi9g1ksQHc' or 'https://youtu.be/Zgi9g1ksQHc'
s = get_best_youtube_url(s)
s = eval(s) if s.isnumeric() else s # i.e. s = '0' local webcam
if s == 0 and (is_colab() or is_kaggle()):
raise NotImplementedError("'source=0' webcam not supported in Colab and Kaggle notebooks. "
"Try running 'source=0' in a local environment.")
self.caps[i] = cv2.VideoCapture(s) # store video capture object
if not self.caps[i].isOpened():
raise ConnectionError(f'{st}Failed to open {s}')
w = int(self.caps[i].get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(self.caps[i].get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = self.caps[i].get(cv2.CAP_PROP_FPS) # warning: may return 0 or nan
self.frames[i] = max(int(self.caps[i].get(cv2.CAP_PROP_FRAME_COUNT)), 0) or float(
'inf') # infinite stream fallback
self.fps[i] = max((fps if math.isfinite(fps) else 0) % 100, 0) or 30 # 30 FPS fallback
success, im = self.caps[i].read() # guarantee first frame
if not success or im is None:
raise ConnectionError(f'{st}Failed to read images from {s}')
self.imgs[i].append(im)
self.shape[i] = im.shape
self.threads[i] = Thread(target=self.update, args=([i, self.caps[i], s]), daemon=True)
LOGGER.info(f'{st}Success ✅ ({self.frames[i]} frames of shape {w}x{h} at {self.fps[i]:.2f} FPS)')
self.threads[i].start()
LOGGER.info('') # newline
# Check for common shapes
self.bs = self.__len__()
def update(self, i, cap, stream):
"""Read stream `i` frames in daemon thread."""
n, f = 0, self.frames[i] # frame number, frame array
while self.running and cap.isOpened() and n < (f - 1):
# Only read a new frame if the buffer is empty
if not self.imgs[i] or not self.stream_buffer:
n += 1
cap.grab() # .read() = .grab() followed by .retrieve()
if n % self.vid_stride == 0:
success, im = cap.retrieve()
if not success:
im = np.zeros(self.shape[i], dtype=np.uint8)
LOGGER.warning('WARNING ⚠️ Video stream unresponsive, please check your IP camera connection.')
cap.open(stream) # re-open stream if signal was lost
self.imgs[i].append(im) # add image to buffer
else:
time.sleep(0.01) # wait until the buffer is empty
def close(self):
"""Close stream loader and release resources."""
self.running = False # stop flag for Thread
for thread in self.threads:
if thread.is_alive():
thread.join(timeout=5) # Add timeout
for cap in self.caps: # Iterate through the stored VideoCapture objects
try:
cap.release() # release video capture
except Exception as e:
LOGGER.warning(f'WARNING ⚠️ Could not release VideoCapture object: {e}')
cv2.destroyAllWindows()
def __iter__(self):
"""Iterates through YOLO image feed and re-opens unresponsive streams."""
self.count = -1
return self
def __next__(self):
"""Returns source paths, transformed and original images for processing."""
self.count += 1
# Wait until a frame is available in each buffer
while not all(self.imgs):
if not all(x.is_alive() for x in self.threads) or cv2.waitKey(1) == ord('q'): # q to quit
self.close()
raise StopIteration
time.sleep(1 / min(self.fps))
# Get and remove the next frame from imgs buffer
if self.stream_buffer:
images = [x.pop(0) for x in self.imgs]
else:
# Get the latest frame, and clear the rest from the imgs buffer
images = []
for x in self.imgs:
images.append(x.pop(-1) if x else None)
x.clear()
return self.sources, images, None, ''
def __len__(self):
"""Return the length of the sources object."""
return len(self.sources) # 1E12 frames = 32 streams at 30 FPS for 30 years
class LoadScreenshots:
"""YOLOv8 screenshot dataloader, i.e. `yolo predict source=screen`."""
def __init__(self, source, imgsz=640):
"""source = [screen_number left top width height] (pixels)."""
check_requirements('mss')
import mss # noqa
source, *params = source.split()
self.screen, left, top, width, height = 0, None, None, None, None # default to full screen 0
if len(params) == 1:
self.screen = int(params[0])
elif len(params) == 4:
left, top, width, height = (int(x) for x in params)
elif len(params) == 5:
self.screen, left, top, width, height = (int(x) for x in params)
self.imgsz = imgsz
self.mode = 'stream'
self.frame = 0
self.sct = mss.mss()
self.bs = 1
# Parse monitor shape
monitor = self.sct.monitors[self.screen]
self.top = monitor['top'] if top is None else (monitor['top'] + top)
self.left = monitor['left'] if left is None else (monitor['left'] + left)
self.width = width or monitor['width']
self.height = height or monitor['height']
self.monitor = {'left': self.left, 'top': self.top, 'width': self.width, 'height': self.height}
def __iter__(self):
"""Returns an iterator of the object."""
return self
def __next__(self):
"""mss screen capture: get raw pixels from the screen as np array."""
im0 = np.asarray(self.sct.grab(self.monitor))[:, :, :3] # BGRA to BGR
s = f'screen {self.screen} (LTWH): {self.left},{self.top},{self.width},{self.height}: '
self.frame += 1
return [str(self.screen)], [im0], None, s # screen, img, vid_cap, string
class LoadImages:
"""YOLOv8 image/video dataloader, i.e. `yolo predict source=image.jpg/vid.mp4`."""
def __init__(self, path, imgsz=640, vid_stride=1):
"""Initialize the Dataloader and raise FileNotFoundError if file not found."""
parent = None
if isinstance(path, str) and Path(path).suffix == '.txt': # *.txt file with img/vid/dir on each line
parent = Path(path).parent
path = Path(path).read_text().splitlines() # list of sources
files = []
for p in sorted(path) if isinstance(path, (list, tuple)) else [path]:
a = str(Path(p).absolute()) # do not use .resolve() https://github.com/ultralytics/ultralytics/issues/2912
if '*' in a:
files.extend(sorted(glob.glob(a, recursive=True))) # glob
elif os.path.isdir(a):
files.extend(sorted(glob.glob(os.path.join(a, '*.*')))) # dir
elif os.path.isfile(a):
files.append(a) # files (absolute or relative to CWD)
elif parent and (parent / p).is_file():
files.append(str((parent / p).absolute())) # files (relative to *.txt file parent)
else:
raise FileNotFoundError(f'{p} does not exist')
images = [x for x in files if x.split('.')[-1].lower() in IMG_FORMATS]
videos = [x for x in files if x.split('.')[-1].lower() in VID_FORMATS]
ni, nv = len(images), len(videos)
self.imgsz = imgsz
self.files = images + videos
self.nf = ni + nv # number of files
self.video_flag = [False] * ni + [True] * nv
self.mode = 'image'
self.vid_stride = vid_stride # video frame-rate stride
self.bs = 1
if any(videos):
self._new_video(videos[0]) # new video
else:
self.cap = None
if self.nf == 0:
raise FileNotFoundError(f'No images or videos found in {p}. '
f'Supported formats are:\nimages: {IMG_FORMATS}\nvideos: {VID_FORMATS}')
def __iter__(self):
"""Returns an iterator object for VideoStream or ImageFolder."""
self.count = 0
return self
def __next__(self):
"""Return next image, path and metadata from dataset."""
if self.count == self.nf:
raise StopIteration
path = self.files[self.count]
if self.video_flag[self.count]:
# Read video
self.mode = 'video'
for _ in range(self.vid_stride):
self.cap.grab()
success, im0 = self.cap.retrieve()
while not success:
self.count += 1
self.cap.release()
if self.count == self.nf: # last video
raise StopIteration
path = self.files[self.count]
self._new_video(path)
success, im0 = self.cap.read()
self.frame += 1
# im0 = self._cv2_rotate(im0) # for use if cv2 autorotation is False
s = f'video {self.count + 1}/{self.nf} ({self.frame}/{self.frames}) {path}: '
else:
# Read image
self.count += 1
im0 = cv2.imread(path) # BGR
if im0 is None:
raise FileNotFoundError(f'Image Not Found {path}')
s = f'image {self.count}/{self.nf} {path}: '
return [path], [im0], self.cap, s
def _new_video(self, path):
"""Create a new video capture object."""
self.frame = 0
self.cap = cv2.VideoCapture(path)
self.frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT) / self.vid_stride)
def __len__(self):
"""Returns the number of files in the object."""
return self.nf # number of files
class LoadPilAndNumpy:
def __init__(self, im0, imgsz=640):
"""Initialize PIL and Numpy Dataloader."""
if not isinstance(im0, list):
im0 = [im0]
self.paths = [getattr(im, 'filename', f'image{i}.jpg') for i, im in enumerate(im0)]
self.im0 = [self._single_check(im) for im in im0]
self.imgsz = imgsz
self.mode = 'image'
# Generate fake paths
self.bs = len(self.im0)
@staticmethod
def _single_check(im):
"""Validate and format an image to numpy array."""
assert isinstance(im, (Image.Image, np.ndarray)), f'Expected PIL/np.ndarray image type, but got {type(im)}'
if isinstance(im, Image.Image):
if im.mode != 'RGB':
im = im.convert('RGB')
im = np.asarray(im)[:, :, ::-1]
im = np.ascontiguousarray(im) # contiguous
return im
def __len__(self):
"""Returns the length of the 'im0' attribute."""
return len(self.im0)
def __next__(self):
"""Returns batch paths, images, processed images, None, ''."""
if self.count == 1: # loop only once as it's batch inference
raise StopIteration
self.count += 1
return self.paths, self.im0, None, ''
def __iter__(self):
"""Enables iteration for class LoadPilAndNumpy."""
self.count = 0
return self
class LoadTensor:
def __init__(self, im0) -> None:
self.im0 = self._single_check(im0)
self.bs = self.im0.shape[0]
self.mode = 'image'
self.paths = [getattr(im, 'filename', f'image{i}.jpg') for i, im in enumerate(im0)]
@staticmethod
def _single_check(im, stride=32):
"""Validate and format an image to torch.Tensor."""
s = f'WARNING ⚠️ torch.Tensor inputs should be BCHW i.e. shape(1, 3, 640, 640) ' \
f'divisible by stride {stride}. Input shape{tuple(im.shape)} is incompatible.'
if len(im.shape) != 4:
if len(im.shape) != 3:
raise ValueError(s)
LOGGER.warning(s)
im = im.unsqueeze(0)
if im.shape[2] % stride or im.shape[3] % stride:
raise ValueError(s)
if im.max() > 1.0:
LOGGER.warning(f'WARNING ⚠️ torch.Tensor inputs should be normalized 0.0-1.0 but max value is {im.max()}. '
f'Dividing input by 255.')
im = im.float() / 255.0
return im
def __iter__(self):
"""Returns an iterator object."""
self.count = 0
return self
def __next__(self):
"""Return next item in the iterator."""
if self.count == 1:
raise StopIteration
self.count += 1
return self.paths, self.im0, None, ''
def __len__(self):
"""Returns the batch size."""
return self.bs
def autocast_list(source):
"""
Merges a list of source of different types into a list of numpy arrays or PIL images
"""
files = []
for im in source:
if isinstance(im, (str, Path)): # filename or uri
files.append(Image.open(requests.get(im, stream=True).raw if str(im).startswith('http') else im))
elif isinstance(im, (Image.Image, np.ndarray)): # PIL or np Image
files.append(im)
else:
raise TypeError(f'type {type(im).__name__} is not a supported Ultralytics prediction source type. \n'
f'See https://docs.ultralytics.com/modes/predict for supported source types.')
return files
LOADERS = LoadStreams, LoadPilAndNumpy, LoadImages, LoadScreenshots # tuple
def get_best_youtube_url(url, use_pafy=False):
"""
Retrieves the URL of the best quality MP4 video stream from a given YouTube video.
This function uses the pafy or yt_dlp library to extract the video info from YouTube. It then finds the highest
quality MP4 format that has video codec but no audio codec, and returns the URL of this video stream.
Args:
url (str): The URL of the YouTube video.
use_pafy (bool): Use the pafy package, default=True, otherwise use yt_dlp package.
Returns:
(str): The URL of the best quality MP4 video stream, or None if no suitable stream is found.
"""
if use_pafy:
check_requirements(('pafy', 'youtube_dl==2020.12.2'))
import pafy # noqa
return pafy.new(url).getbestvideo(preftype='mp4').url
else:
check_requirements('yt-dlp')
import yt_dlp
with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
info_dict = ydl.extract_info(url, download=False) # extract info
for f in reversed(info_dict.get('formats', [])): # reversed because best is usually last
# Find a format with video codec, no audio, *.mp4 extension at least 1920x1080 size
good_size = (f.get('width') or 0) >= 1920 or (f.get('height') or 0) >= 1080
if good_size and f['vcodec'] != 'none' and f['acodec'] == 'none' and f['ext'] == 'mp4':
return f.get('url')

View File

@ -0,0 +1,18 @@
#!/bin/bash
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Download latest models from https://github.com/ultralytics/assets/releases
# Example usage: bash ultralytics/data/scripts/download_weights.sh
# parent
# └── weights
# ├── yolov8n.pt ← downloads here
# ├── yolov8s.pt
# └── ...
python - <<EOF
from ultralytics.utils.downloads import attempt_download_asset
assets = [f'yolov8{size}{suffix}.pt' for size in 'nsmlx' for suffix in ('', '-cls', '-seg', '-pose')]
for x in assets:
attempt_download_asset(f'weights/{x}')
EOF

View File

@ -0,0 +1,60 @@
#!/bin/bash
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Download COCO 2017 dataset http://cocodataset.org
# Example usage: bash data/scripts/get_coco.sh
# parent
# ├── ultralytics
# └── datasets
# └── coco ← downloads here
# Arguments (optional) Usage: bash data/scripts/get_coco.sh --train --val --test --segments
if [ "$#" -gt 0 ]; then
for opt in "$@"; do
case "${opt}" in
--train) train=true ;;
--val) val=true ;;
--test) test=true ;;
--segments) segments=true ;;
--sama) sama=true ;;
esac
done
else
train=true
val=true
test=false
segments=false
sama=false
fi
# Download/unzip labels
d='../datasets' # unzip directory
url=https://github.com/ultralytics/yolov5/releases/download/v1.0/
if [ "$segments" == "true" ]; then
f='coco2017labels-segments.zip' # 169 MB
elif [ "$sama" == "true" ]; then
f='coco2017labels-segments-sama.zip' # 199 MB https://www.sama.com/sama-coco-dataset/
else
f='coco2017labels.zip' # 46 MB
fi
echo 'Downloading' $url$f ' ...'
curl -L $url$f -o $f -# && unzip -q $f -d $d && rm $f &
# Download/unzip images
d='../datasets/coco/images' # unzip directory
url=http://images.cocodataset.org/zips/
if [ "$train" == "true" ]; then
f='train2017.zip' # 19G, 118k images
echo 'Downloading' $url$f '...'
curl -L $url$f -o $f -# && unzip -q $f -d $d && rm $f &
fi
if [ "$val" == "true" ]; then
f='val2017.zip' # 1G, 5k images
echo 'Downloading' $url$f '...'
curl -L $url$f -o $f -# && unzip -q $f -d $d && rm $f &
fi
if [ "$test" == "true" ]; then
f='test2017.zip' # 7G, 41k images (optional)
echo 'Downloading' $url$f '...'
curl -L $url$f -o $f -# && unzip -q $f -d $d && rm $f &
fi
wait # finish background tasks

View File

@ -0,0 +1,17 @@
#!/bin/bash
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Download COCO128 dataset https://www.kaggle.com/ultralytics/coco128 (first 128 images from COCO train2017)
# Example usage: bash data/scripts/get_coco128.sh
# parent
# ├── ultralytics
# └── datasets
# └── coco128 ← downloads here
# Download/unzip images and labels
d='../datasets' # unzip directory
url=https://github.com/ultralytics/yolov5/releases/download/v1.0/
f='coco128.zip' # or 'coco128-segments.zip', 68 MB
echo 'Downloading' $url$f ' ...'
curl -L $url$f -o $f -# && unzip -q $f -d $d && rm $f &
wait # finish background tasks

View File

@ -0,0 +1,51 @@
#!/bin/bash
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Download ILSVRC2012 ImageNet dataset https://image-net.org
# Example usage: bash data/scripts/get_imagenet.sh
# parent
# ├── ultralytics
# └── datasets
# └── imagenet ← downloads here
# Arguments (optional) Usage: bash data/scripts/get_imagenet.sh --train --val
if [ "$#" -gt 0 ]; then
for opt in "$@"; do
case "${opt}" in
--train) train=true ;;
--val) val=true ;;
esac
done
else
train=true
val=true
fi
# Make dir
d='../datasets/imagenet' # unzip directory
mkdir -p $d && cd $d
# Download/unzip train
if [ "$train" == "true" ]; then
wget https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_train.tar # download 138G, 1281167 images
mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
tar -xf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
find . -name "*.tar" | while read NAME; do
mkdir -p "${NAME%.tar}"
tar -xf "${NAME}" -C "${NAME%.tar}"
rm -f "${NAME}"
done
cd ..
fi
# Download/unzip val
if [ "$val" == "true" ]; then
wget https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar # download 6.3G, 50000 images
mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xf ILSVRC2012_img_val.tar
wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash # move into subdirs
fi
# Delete corrupted image (optional: PNG under JPEG name that may cause dataloaders to fail)
# rm train/n04266014/n04266014_10835.JPEG
# TFRecords (optional)
# wget https://raw.githubusercontent.com/tensorflow/models/master/research/slim/datasets/imagenet_lsvrc_2015_synsets.txt

View File

@ -0,0 +1,619 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import contextlib
import hashlib
import json
import os
import random
import subprocess
import time
import zipfile
from multiprocessing.pool import ThreadPool
from pathlib import Path
from tarfile import is_tarfile
import cv2
import numpy as np
from PIL import Image, ImageOps
from ultralytics.nn.autobackend import check_class_names
from ultralytics.utils import (DATASETS_DIR, LOGGER, NUM_THREADS, ROOT, SETTINGS_YAML, TQDM, clean_url, colorstr,
emojis, yaml_load)
from ultralytics.utils.checks import check_file, check_font, is_ascii
from ultralytics.utils.downloads import download, safe_download, unzip_file
from ultralytics.utils.ops import segments2boxes
HELP_URL = 'See https://docs.ultralytics.com/datasets/detect for dataset formatting guidance.'
IMG_FORMATS = 'bmp', 'dng', 'jpeg', 'jpg', 'mpo', 'png', 'tif', 'tiff', 'webp', 'pfm' # image suffixes
VID_FORMATS = 'asf', 'avi', 'gif', 'm4v', 'mkv', 'mov', 'mp4', 'mpeg', 'mpg', 'ts', 'wmv', 'webm' # video suffixes
PIN_MEMORY = str(os.getenv('PIN_MEMORY', True)).lower() == 'true' # global pin_memory for dataloaders
def img2label_paths(img_paths):
"""Define label paths as a function of image paths."""
sa, sb = f'{os.sep}images{os.sep}', f'{os.sep}labels{os.sep}' # /images/, /labels/ substrings
return [sb.join(x.rsplit(sa, 1)).rsplit('.', 1)[0] + '.txt' for x in img_paths]
def get_hash(paths):
"""Returns a single hash value of a list of paths (files or dirs)."""
size = sum(os.path.getsize(p) for p in paths if os.path.exists(p)) # sizes
h = hashlib.sha256(str(size).encode()) # hash sizes
h.update(''.join(paths).encode()) # hash paths
return h.hexdigest() # return hash
def exif_size(img: Image.Image):
"""Returns exif-corrected PIL size."""
s = img.size # (width, height)
if img.format == 'JPEG': # only support JPEG images
with contextlib.suppress(Exception):
exif = img.getexif()
if exif:
rotation = exif.get(274, None) # the EXIF key for the orientation tag is 274
if rotation in [6, 8]: # rotation 270 or 90
s = s[1], s[0]
return s
def verify_image(args):
"""Verify one image."""
(im_file, cls), prefix = args
# Number (found, corrupt), message
nf, nc, msg = 0, 0, ''
try:
im = Image.open(im_file)
im.verify() # PIL verify
shape = exif_size(im) # image size
shape = (shape[1], shape[0]) # hw
assert (shape[0] > 9) & (shape[1] > 9), f'image size {shape} <10 pixels'
assert im.format.lower() in IMG_FORMATS, f'invalid image format {im.format}'
if im.format.lower() in ('jpg', 'jpeg'):
with open(im_file, 'rb') as f:
f.seek(-2, 2)
if f.read() != b'\xff\xd9': # corrupt JPEG
ImageOps.exif_transpose(Image.open(im_file)).save(im_file, 'JPEG', subsampling=0, quality=100)
msg = f'{prefix}WARNING ⚠️ {im_file}: corrupt JPEG restored and saved'
nf = 1
except Exception as e:
nc = 1
msg = f'{prefix}WARNING ⚠️ {im_file}: ignoring corrupt image/label: {e}'
return (im_file, cls), nf, nc, msg
def verify_image_label(args):
"""Verify one image-label pair."""
im_file, lb_file, prefix, keypoint, num_cls, nkpt, ndim = args
# Number (missing, found, empty, corrupt), message, segments, keypoints
nm, nf, ne, nc, msg, segments, keypoints = 0, 0, 0, 0, '', [], None
try:
# Verify images
im = Image.open(im_file)
im.verify() # PIL verify
shape = exif_size(im) # image size
shape = (shape[1], shape[0]) # hw
assert (shape[0] > 9) & (shape[1] > 9), f'image size {shape} <10 pixels'
assert im.format.lower() in IMG_FORMATS, f'invalid image format {im.format}'
if im.format.lower() in ('jpg', 'jpeg'):
with open(im_file, 'rb') as f:
f.seek(-2, 2)
if f.read() != b'\xff\xd9': # corrupt JPEG
ImageOps.exif_transpose(Image.open(im_file)).save(im_file, 'JPEG', subsampling=0, quality=100)
msg = f'{prefix}WARNING ⚠️ {im_file}: corrupt JPEG restored and saved'
# Verify labels
if os.path.isfile(lb_file):
nf = 1 # label found
with open(lb_file) as f:
lb = [x.split() for x in f.read().strip().splitlines() if len(x)]
if any(len(x) > 6 for x in lb) and (not keypoint): # is segment
classes = np.array([x[0] for x in lb], dtype=np.float32)
segments = [np.array(x[1:], dtype=np.float32).reshape(-1, 2) for x in lb] # (cls, xy1...)
lb = np.concatenate((classes.reshape(-1, 1), segments2boxes(segments)), 1) # (cls, xywh)
lb = np.array(lb, dtype=np.float32)
nl = len(lb)
if nl:
if keypoint:
assert lb.shape[1] == (5 + nkpt * ndim), f'labels require {(5 + nkpt * ndim)} columns each'
assert (lb[:, 5::ndim] <= 1).all(), 'non-normalized or out of bounds coordinate labels'
assert (lb[:, 6::ndim] <= 1).all(), 'non-normalized or out of bounds coordinate labels'
else:
assert lb.shape[1] == 5, f'labels require 5 columns, {lb.shape[1]} columns detected'
assert (lb[:, 1:] <= 1).all(), \
f'non-normalized or out of bounds coordinates {lb[:, 1:][lb[:, 1:] > 1]}'
assert (lb >= 0).all(), f'negative label values {lb[lb < 0]}'
# All labels
max_cls = int(lb[:, 0].max()) # max label count
assert max_cls <= num_cls, \
f'Label class {max_cls} exceeds dataset class count {num_cls}. ' \
f'Possible class labels are 0-{num_cls - 1}'
_, i = np.unique(lb, axis=0, return_index=True)
if len(i) < nl: # duplicate row check
lb = lb[i] # remove duplicates
if segments:
segments = [segments[x] for x in i]
msg = f'{prefix}WARNING ⚠️ {im_file}: {nl - len(i)} duplicate labels removed'
else:
ne = 1 # label empty
lb = np.zeros((0, (5 + nkpt * ndim)), dtype=np.float32) if keypoint else np.zeros(
(0, 5), dtype=np.float32)
else:
nm = 1 # label missing
lb = np.zeros((0, (5 + nkpt * ndim)), dtype=np.float32) if keypoint else np.zeros((0, 5), dtype=np.float32)
if keypoint:
keypoints = lb[:, 5:].reshape(-1, nkpt, ndim)
if ndim == 2:
kpt_mask = np.where((keypoints[..., 0] < 0) | (keypoints[..., 1] < 0), 0.0, 1.0).astype(np.float32)
keypoints = np.concatenate([keypoints, kpt_mask[..., None]], axis=-1) # (nl, nkpt, 3)
lb = lb[:, :5]
return im_file, lb, shape, segments, keypoints, nm, nf, ne, nc, msg
except Exception as e:
nc = 1
msg = f'{prefix}WARNING ⚠️ {im_file}: ignoring corrupt image/label: {e}'
return [None, None, None, None, None, nm, nf, ne, nc, msg]
def polygon2mask(imgsz, polygons, color=1, downsample_ratio=1):
"""
Args:
imgsz (tuple): The image size.
polygons (list[np.ndarray]): [N, M], N is the number of polygons, M is the number of points(Be divided by 2).
color (int): color
downsample_ratio (int): downsample ratio
"""
mask = np.zeros(imgsz, dtype=np.uint8)
polygons = np.asarray(polygons, dtype=np.int32)
polygons = polygons.reshape((polygons.shape[0], -1, 2))
cv2.fillPoly(mask, polygons, color=color)
nh, nw = (imgsz[0] // downsample_ratio, imgsz[1] // downsample_ratio)
# NOTE: fillPoly first then resize is trying to keep the same way of loss calculation when mask-ratio=1.
return cv2.resize(mask, (nw, nh))
def polygons2masks(imgsz, polygons, color, downsample_ratio=1):
"""
Args:
imgsz (tuple): The image size.
polygons (list[np.ndarray]): each polygon is [N, M], N is number of polygons, M is number of points (M % 2 = 0)
color (int): color
downsample_ratio (int): downsample ratio
"""
return np.array([polygon2mask(imgsz, [x.reshape(-1)], color, downsample_ratio) for x in polygons])
def polygons2masks_overlap(imgsz, segments, downsample_ratio=1):
"""Return a (640, 640) overlap mask."""
masks = np.zeros((imgsz[0] // downsample_ratio, imgsz[1] // downsample_ratio),
dtype=np.int32 if len(segments) > 255 else np.uint8)
areas = []
ms = []
for si in range(len(segments)):
mask = polygon2mask(imgsz, [segments[si].reshape(-1)], downsample_ratio=downsample_ratio, color=1)
ms.append(mask)
areas.append(mask.sum())
areas = np.asarray(areas)
index = np.argsort(-areas)
ms = np.array(ms)[index]
for i in range(len(segments)):
mask = ms[i] * (i + 1)
masks = masks + mask
masks = np.clip(masks, a_min=0, a_max=i + 1)
return masks, index
def find_dataset_yaml(path: Path) -> Path:
"""
Find and return the YAML file associated with a Detect, Segment or Pose dataset.
This function searches for a YAML file at the root level of the provided directory first, and if not found, it
performs a recursive search. It prefers YAML files that have the samestem as the provided path. An AssertionError
is raised if no YAML file is found or if multiple YAML files are found.
Args:
path (Path): The directory path to search for the YAML file.
Returns:
(Path): The path of the found YAML file.
"""
files = list(path.glob('*.yaml')) or list(path.rglob('*.yaml')) # try root level first and then recursive
assert files, f"No YAML file found in '{path.resolve()}'"
if len(files) > 1:
files = [f for f in files if f.stem == path.stem] # prefer *.yaml files that match
assert len(files) == 1, f"Expected 1 YAML file in '{path.resolve()}', but found {len(files)}.\n{files}"
return files[0]
def check_det_dataset(dataset, autodownload=True):
"""
Download, verify, and/or unzip a dataset if not found locally.
This function checks the availability of a specified dataset, and if not found, it has the option to download and
unzip the dataset. It then reads and parses the accompanying YAML data, ensuring key requirements are met and also
resolves paths related to the dataset.
Args:
dataset (str): Path to the dataset or dataset descriptor (like a YAML file).
autodownload (bool, optional): Whether to automatically download the dataset if not found. Defaults to True.
Returns:
(dict): Parsed dataset information and paths.
"""
data = check_file(dataset)
# Download (optional)
extract_dir = ''
if isinstance(data, (str, Path)) and (zipfile.is_zipfile(data) or is_tarfile(data)):
new_dir = safe_download(data, dir=DATASETS_DIR, unzip=True, delete=False)
data = find_dataset_yaml(DATASETS_DIR / new_dir)
extract_dir, autodownload = data.parent, False
# Read YAML (optional)
if isinstance(data, (str, Path)):
data = yaml_load(data, append_filename=True) # dictionary
# Checks
for k in 'train', 'val':
if k not in data:
if k == 'val' and 'validation' in data:
LOGGER.info("WARNING ⚠️ renaming data YAML 'validation' key to 'val' to match YOLO format.")
data['val'] = data.pop('validation') # replace 'validation' key with 'val' key
else:
raise SyntaxError(
emojis(f"{dataset} '{k}:' key missing ❌.\n'train' and 'val' are required in all data YAMLs."))
if 'names' not in data and 'nc' not in data:
raise SyntaxError(emojis(f"{dataset} key missing ❌.\n either 'names' or 'nc' are required in all data YAMLs."))
if 'names' in data and 'nc' in data and len(data['names']) != data['nc']:
raise SyntaxError(emojis(f"{dataset} 'names' length {len(data['names'])} and 'nc: {data['nc']}' must match."))
if 'names' not in data:
data['names'] = [f'class_{i}' for i in range(data['nc'])]
else:
data['nc'] = len(data['names'])
data['names'] = check_class_names(data['names'])
# Resolve paths
path = Path(extract_dir or data.get('path') or Path(data.get('yaml_file', '')).parent) # dataset root
if not path.is_absolute():
path = (DATASETS_DIR / path).resolve()
data['path'] = path # download scripts
for k in 'train', 'val', 'test':
if data.get(k): # prepend path
if isinstance(data[k], str):
x = (path / data[k]).resolve()
if not x.exists() and data[k].startswith('../'):
x = (path / data[k][3:]).resolve()
data[k] = str(x)
else:
data[k] = [str((path / x).resolve()) for x in data[k]]
# Parse YAML
train, val, test, s = (data.get(x) for x in ('train', 'val', 'test', 'download'))
if val:
val = [Path(x).resolve() for x in (val if isinstance(val, list) else [val])] # val path
if not all(x.exists() for x in val):
name = clean_url(dataset) # dataset name with URL auth stripped
m = f"\nDataset '{name}' images not found ⚠️, missing path '{[x for x in val if not x.exists()][0]}'"
if s and autodownload:
LOGGER.warning(m)
else:
m += f"\nNote dataset download directory is '{DATASETS_DIR}'. You can update this in '{SETTINGS_YAML}'"
raise FileNotFoundError(m)
t = time.time()
r = None # success
if s.startswith('http') and s.endswith('.zip'): # URL
safe_download(url=s, dir=DATASETS_DIR, delete=True)
elif s.startswith('bash '): # bash script
LOGGER.info(f'Running {s} ...')
r = os.system(s)
else: # python script
exec(s, {'yaml': data})
dt = f'({round(time.time() - t, 1)}s)'
s = f"success ✅ {dt}, saved to {colorstr('bold', DATASETS_DIR)}" if r in (0, None) else f'failure {dt}'
LOGGER.info(f'Dataset download {s}\n')
check_font('Arial.ttf' if is_ascii(data['names']) else 'Arial.Unicode.ttf') # download fonts
return data # dictionary
def check_cls_dataset(dataset, split=''):
"""
Checks a classification dataset such as Imagenet.
This function accepts a `dataset` name and attempts to retrieve the corresponding dataset information.
If the dataset is not found locally, it attempts to download the dataset from the internet and save it locally.
Args:
dataset (str | Path): The name of the dataset.
split (str, optional): The split of the dataset. Either 'val', 'test', or ''. Defaults to ''.
Returns:
(dict): A dictionary containing the following keys:
- 'train' (Path): The directory path containing the training set of the dataset.
- 'val' (Path): The directory path containing the validation set of the dataset.
- 'test' (Path): The directory path containing the test set of the dataset.
- 'nc' (int): The number of classes in the dataset.
- 'names' (dict): A dictionary of class names in the dataset.
"""
# Download (optional if dataset=https://file.zip is passed directly)
if str(dataset).startswith(('http:/', 'https:/')):
dataset = safe_download(dataset, dir=DATASETS_DIR, unzip=True, delete=False)
dataset = Path(dataset)
data_dir = (dataset if dataset.is_dir() else (DATASETS_DIR / dataset)).resolve()
if not data_dir.is_dir():
LOGGER.warning(f'\nDataset not found ⚠️, missing path {data_dir}, attempting download...')
t = time.time()
if str(dataset) == 'imagenet':
subprocess.run(f"bash {ROOT / 'data/scripts/get_imagenet.sh'}", shell=True, check=True)
else:
url = f'https://github.com/ultralytics/yolov5/releases/download/v1.0/{dataset}.zip'
download(url, dir=data_dir.parent)
s = f"Dataset download success ✅ ({time.time() - t:.1f}s), saved to {colorstr('bold', data_dir)}\n"
LOGGER.info(s)
train_set = data_dir / 'train'
val_set = data_dir / 'val' if (data_dir / 'val').exists() else data_dir / 'validation' if \
(data_dir / 'validation').exists() else None # data/test or data/val
test_set = data_dir / 'test' if (data_dir / 'test').exists() else None # data/val or data/test
if split == 'val' and not val_set:
LOGGER.warning("WARNING ⚠️ Dataset 'split=val' not found, using 'split=test' instead.")
elif split == 'test' and not test_set:
LOGGER.warning("WARNING ⚠️ Dataset 'split=test' not found, using 'split=val' instead.")
nc = len([x for x in (data_dir / 'train').glob('*') if x.is_dir()]) # number of classes
names = [x.name for x in (data_dir / 'train').iterdir() if x.is_dir()] # class names list
names = dict(enumerate(sorted(names)))
# Print to console
for k, v in {'train': train_set, 'val': val_set, 'test': test_set}.items():
prefix = f'{colorstr(f"{k}:")} {v}...'
if v is None:
LOGGER.info(prefix)
else:
files = [path for path in v.rglob('*.*') if path.suffix[1:].lower() in IMG_FORMATS]
nf = len(files) # number of files
nd = len({file.parent for file in files}) # number of directories
if nf == 0:
if k == 'train':
raise FileNotFoundError(emojis(f"{dataset} '{k}:' no training images found ❌ "))
else:
LOGGER.warning(f'{prefix} found {nf} images in {nd} classes: WARNING ⚠️ no images found')
elif nd != nc:
LOGGER.warning(f'{prefix} found {nf} images in {nd} classes: ERROR ❌️ requires {nc} classes, not {nd}')
else:
LOGGER.info(f'{prefix} found {nf} images in {nd} classes ✅ ')
return {'train': train_set, 'val': val_set, 'test': test_set, 'nc': nc, 'names': names}
class HUBDatasetStats:
"""
A class for generating HUB dataset JSON and `-hub` dataset directory.
Args:
path (str): Path to data.yaml or data.zip (with data.yaml inside data.zip). Default is 'coco128.yaml'.
task (str): Dataset task. Options are 'detect', 'segment', 'pose', 'classify'. Default is 'detect'.
autodownload (bool): Attempt to download dataset if not found locally. Default is False.
Example:
Download *.zip files from https://github.com/ultralytics/hub/tree/main/example_datasets
i.e. https://github.com/ultralytics/hub/raw/main/example_datasets/coco8.zip for coco8.zip.
```python
from ultralytics.data.utils import HUBDatasetStats
stats = HUBDatasetStats('path/to/coco8.zip', task='detect') # detect dataset
stats = HUBDatasetStats('path/to/coco8-seg.zip', task='segment') # segment dataset
stats = HUBDatasetStats('path/to/coco8-pose.zip', task='pose') # pose dataset
stats = HUBDatasetStats('path/to/imagenet10.zip', task='classify') # classification dataset
stats.get_json(save=True)
stats.process_images()
```
"""
def __init__(self, path='coco128.yaml', task='detect', autodownload=False):
"""Initialize class."""
path = Path(path).resolve()
LOGGER.info(f'Starting HUB dataset checks for {path}....')
self.task = task # detect, segment, pose, classify
if self.task == 'classify':
unzip_dir = unzip_file(path)
data = check_cls_dataset(unzip_dir)
data['path'] = unzip_dir
else: # detect, segment, pose
zipped, data_dir, yaml_path = self._unzip(Path(path))
try:
# data = yaml_load(check_yaml(yaml_path)) # data dict
data = check_det_dataset(yaml_path, autodownload) # data dict
if zipped:
data['path'] = data_dir
except Exception as e:
raise Exception('error/HUB/dataset_stats/init') from e
self.hub_dir = Path(f'{data["path"]}-hub')
self.im_dir = self.hub_dir / 'images'
self.im_dir.mkdir(parents=True, exist_ok=True) # makes /images
self.stats = {'nc': len(data['names']), 'names': list(data['names'].values())} # statistics dictionary
self.data = data
def _unzip(self, path):
"""Unzip data.zip."""
if not str(path).endswith('.zip'): # path is data.yaml
return False, None, path
unzip_dir = unzip_file(path, path=path.parent)
assert unzip_dir.is_dir(), f'Error unzipping {path}, {unzip_dir} not found. ' \
f'path/to/abc.zip MUST unzip to path/to/abc/'
return True, str(unzip_dir), find_dataset_yaml(unzip_dir) # zipped, data_dir, yaml_path
def _hub_ops(self, f):
"""Saves a compressed image for HUB previews."""
compress_one_image(f, self.im_dir / Path(f).name) # save to dataset-hub
def get_json(self, save=False, verbose=False):
"""Return dataset JSON for Ultralytics HUB."""
def _round(labels):
"""Update labels to integer class and 4 decimal place floats."""
if self.task == 'detect':
coordinates = labels['bboxes']
elif self.task == 'segment':
coordinates = [x.flatten() for x in labels['segments']]
elif self.task == 'pose':
n = labels['keypoints'].shape[0]
coordinates = np.concatenate((labels['bboxes'], labels['keypoints'].reshape(n, -1)), 1)
else:
raise ValueError('Undefined dataset task.')
zipped = zip(labels['cls'], coordinates)
return [[int(c[0]), *(round(float(x), 4) for x in points)] for c, points in zipped]
for split in 'train', 'val', 'test':
self.stats[split] = None # predefine
path = self.data.get(split)
# Check split
if path is None: # no split
continue
files = [f for f in Path(path).rglob('*.*') if f.suffix[1:].lower() in IMG_FORMATS] # image files in split
if not files: # no images
continue
# Get dataset statistics
if self.task == 'classify':
from torchvision.datasets import ImageFolder
dataset = ImageFolder(self.data[split])
x = np.zeros(len(dataset.classes)).astype(int)
for im in dataset.imgs:
x[im[1]] += 1
self.stats[split] = {
'instance_stats': {
'total': len(dataset),
'per_class': x.tolist()},
'image_stats': {
'total': len(dataset),
'unlabelled': 0,
'per_class': x.tolist()},
'labels': [{
Path(k).name: v} for k, v in dataset.imgs]}
else:
from ultralytics.data import YOLODataset
dataset = YOLODataset(img_path=self.data[split],
data=self.data,
use_segments=self.task == 'segment',
use_keypoints=self.task == 'pose')
x = np.array([
np.bincount(label['cls'].astype(int).flatten(), minlength=self.data['nc'])
for label in TQDM(dataset.labels, total=len(dataset), desc='Statistics')]) # shape(128x80)
self.stats[split] = {
'instance_stats': {
'total': int(x.sum()),
'per_class': x.sum(0).tolist()},
'image_stats': {
'total': len(dataset),
'unlabelled': int(np.all(x == 0, 1).sum()),
'per_class': (x > 0).sum(0).tolist()},
'labels': [{
Path(k).name: _round(v)} for k, v in zip(dataset.im_files, dataset.labels)]}
# Save, print and return
if save:
stats_path = self.hub_dir / 'stats.json'
LOGGER.info(f'Saving {stats_path.resolve()}...')
with open(stats_path, 'w') as f:
json.dump(self.stats, f) # save stats.json
if verbose:
LOGGER.info(json.dumps(self.stats, indent=2, sort_keys=False))
return self.stats
def process_images(self):
"""Compress images for Ultralytics HUB."""
from ultralytics.data import YOLODataset # ClassificationDataset
for split in 'train', 'val', 'test':
if self.data.get(split) is None:
continue
dataset = YOLODataset(img_path=self.data[split], data=self.data)
with ThreadPool(NUM_THREADS) as pool:
for _ in TQDM(pool.imap(self._hub_ops, dataset.im_files), total=len(dataset), desc=f'{split} images'):
pass
LOGGER.info(f'Done. All images saved to {self.im_dir}')
return self.im_dir
def compress_one_image(f, f_new=None, max_dim=1920, quality=50):
"""
Compresses a single image file to reduced size while preserving its aspect ratio and quality using either the
Python Imaging Library (PIL) or OpenCV library. If the input image is smaller than the maximum dimension, it will
not be resized.
Args:
f (str): The path to the input image file.
f_new (str, optional): The path to the output image file. If not specified, the input file will be overwritten.
max_dim (int, optional): The maximum dimension (width or height) of the output image. Default is 1920 pixels.
quality (int, optional): The image compression quality as a percentage. Default is 50%.
Example:
```python
from pathlib import Path
from ultralytics.data.utils import compress_one_image
for f in Path('path/to/dataset').rglob('*.jpg'):
compress_one_image(f)
```
"""
try: # use PIL
im = Image.open(f)
r = max_dim / max(im.height, im.width) # ratio
if r < 1.0: # image too large
im = im.resize((int(im.width * r), int(im.height * r)))
im.save(f_new or f, 'JPEG', quality=quality, optimize=True) # save
except Exception as e: # use OpenCV
LOGGER.info(f'WARNING ⚠️ HUB ops PIL failure {f}: {e}')
im = cv2.imread(f)
im_height, im_width = im.shape[:2]
r = max_dim / max(im_height, im_width) # ratio
if r < 1.0: # image too large
im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
cv2.imwrite(str(f_new or f), im)
def autosplit(path=DATASETS_DIR / 'coco8/images', weights=(0.9, 0.1, 0.0), annotated_only=False):
"""
Automatically split a dataset into train/val/test splits and save the resulting splits into autosplit_*.txt files.
Args:
path (Path, optional): Path to images directory. Defaults to DATASETS_DIR / 'coco8/images'.
weights (list | tuple, optional): Train, validation, and test split fractions. Defaults to (0.9, 0.1, 0.0).
annotated_only (bool, optional): If True, only images with an associated txt file are used. Defaults to False.
Example:
```python
from ultralytics.data.utils import autosplit
autosplit()
```
"""
path = Path(path) # images dir
files = sorted(x for x in path.rglob('*.*') if x.suffix[1:].lower() in IMG_FORMATS) # image files only
n = len(files) # number of files
random.seed(0) # for reproducibility
indices = random.choices([0, 1, 2], weights=weights, k=n) # assign each image to a split
txt = ['autosplit_train.txt', 'autosplit_val.txt', 'autosplit_test.txt'] # 3 txt files
for x in txt:
if (path.parent / x).exists():
(path.parent / x).unlink() # remove existing
LOGGER.info(f'Autosplitting images from {path}' + ', using *.txt labeled images only' * annotated_only)
for i, img in TQDM(zip(indices, files), total=n):
if not annotated_only or Path(img2label_paths([str(img)])[0]).exists(): # check label
with open(path.parent / txt[i], 'a') as f:
f.write(f'./{img.relative_to(path.parent).as_posix()}' + '\n') # add image to txt file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,436 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import inspect
import sys
from pathlib import Path
from typing import Union
from ultralytics.cfg import TASK2DATA, get_cfg, get_save_dir
from ultralytics.hub.utils import HUB_WEB_ROOT
from ultralytics.nn.tasks import attempt_load_one_weight, guess_model_task, nn, yaml_model_load
from ultralytics.utils import ASSETS, DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, RANK, callbacks, emojis, yaml_load
from ultralytics.utils.checks import check_file, check_imgsz, check_pip_update_available, check_yaml
from ultralytics.utils.downloads import GITHUB_ASSETS_STEMS
from ultralytics.utils.torch_utils import smart_inference_mode
class Model:
"""
A base model class to unify apis for all the models.
Args:
model (str, Path): Path to the model file to load or create.
task (Any, optional): Task type for the YOLO model. Defaults to None.
Attributes:
predictor (Any): The predictor object.
model (Any): The model object.
trainer (Any): The trainer object.
task (str): The type of model task.
ckpt (Any): The checkpoint object if the model loaded from *.pt file.
cfg (str): The model configuration if loaded from *.yaml file.
ckpt_path (str): The checkpoint file path.
overrides (dict): Overrides for the trainer object.
metrics (Any): The data for metrics.
Methods:
__call__(source=None, stream=False, **kwargs):
Alias for the predict method.
_new(cfg:str, verbose:bool=True) -> None:
Initializes a new model and infers the task type from the model definitions.
_load(weights:str, task:str='') -> None:
Initializes a new model and infers the task type from the model head.
_check_is_pytorch_model() -> None:
Raises TypeError if the model is not a PyTorch model.
reset() -> None:
Resets the model modules.
info(verbose:bool=False) -> None:
Logs the model info.
fuse() -> None:
Fuses the model for faster inference.
predict(source=None, stream=False, **kwargs) -> List[ultralytics.engine.results.Results]:
Performs prediction using the YOLO model.
Returns:
list(ultralytics.engine.results.Results): The prediction results.
"""
def __init__(self, model: Union[str, Path] = 'yolov8n.pt', task=None) -> None:
"""
Initializes the YOLO model.
Args:
model (Union[str, Path], optional): Path or name of the model to load or create. Defaults to 'yolov8n.pt'.
task (Any, optional): Task type for the YOLO model. Defaults to None.
"""
self.callbacks = callbacks.get_default_callbacks()
self.predictor = None # reuse predictor
self.model = None # model object
self.trainer = None # trainer object
self.ckpt = None # if loaded from *.pt
self.cfg = None # if loaded from *.yaml
self.ckpt_path = None
self.overrides = {} # overrides for trainer object
self.metrics = None # validation/training metrics
self.session = None # HUB session
self.task = task # task type
model = str(model).strip() # strip spaces
# Check if Ultralytics HUB model from https://hub.ultralytics.com
if self.is_hub_model(model):
from ultralytics.hub.session import HUBTrainingSession
self.session = HUBTrainingSession(model)
model = self.session.model_file
# Load or create new YOLO model
suffix = Path(model).suffix
if not suffix and Path(model).stem in GITHUB_ASSETS_STEMS:
model, suffix = Path(model).with_suffix('.pt'), '.pt' # add suffix, i.e. yolov8n -> yolov8n.pt
if suffix in ('.yaml', '.yml'):
self._new(model, task)
else:
self._load(model, task)
def __call__(self, source=None, stream=False, **kwargs):
"""Calls the 'predict' function with given arguments to perform object detection."""
return self.predict(source, stream, **kwargs)
@staticmethod
def is_hub_model(model):
"""Check if the provided model is a HUB model."""
return any((
model.startswith(f'{HUB_WEB_ROOT}/models/'), # i.e. https://hub.ultralytics.com/models/MODEL_ID
[len(x) for x in model.split('_')] == [42, 20], # APIKEY_MODELID
len(model) == 20 and not Path(model).exists() and all(x not in model for x in './\\'))) # MODELID
def _new(self, cfg: str, task=None, model=None, verbose=True):
"""
Initializes a new model and infers the task type from the model definitions.
Args:
cfg (str): model configuration file
task (str | None): model task
model (BaseModel): Customized model.
verbose (bool): display model info on load
"""
cfg_dict = yaml_model_load(cfg)
self.cfg = cfg
self.task = task or guess_model_task(cfg_dict)
self.model = (model or self.smart_load('model'))(cfg_dict, verbose=verbose and RANK == -1) # build model
self.overrides['model'] = self.cfg
self.overrides['task'] = self.task
# Below added to allow export from YAMLs
args = {**DEFAULT_CFG_DICT, **self.overrides} # combine model and default args, preferring model args
self.model.args = {k: v for k, v in args.items() if k in DEFAULT_CFG_KEYS} # attach args to model
self.model.task = self.task
def _load(self, weights: str, task=None):
"""
Initializes a new model and infers the task type from the model head.
Args:
weights (str): model checkpoint to be loaded
task (str | None): model task
"""
suffix = Path(weights).suffix
if suffix == '.pt':
self.model, self.ckpt = attempt_load_one_weight(weights)
self.task = self.model.args['task']
self.overrides = self.model.args = self._reset_ckpt_args(self.model.args)
self.ckpt_path = self.model.pt_path
else:
weights = check_file(weights)
self.model, self.ckpt = weights, None
self.task = task or guess_model_task(weights)
self.ckpt_path = weights
self.overrides['model'] = weights
self.overrides['task'] = self.task
def _check_is_pytorch_model(self):
"""
Raises TypeError is model is not a PyTorch model
"""
pt_str = isinstance(self.model, (str, Path)) and Path(self.model).suffix == '.pt'
pt_module = isinstance(self.model, nn.Module)
if not (pt_module or pt_str):
raise TypeError(f"model='{self.model}' must be a *.pt PyTorch model, but is a different type. "
f'PyTorch models can be used to train, val, predict and export, i.e. '
f"'yolo export model=yolov8n.pt', but exported formats like ONNX, TensorRT etc. only "
f"support 'predict' and 'val' modes, i.e. 'yolo predict model=yolov8n.onnx'.")
@smart_inference_mode()
def reset_weights(self):
"""
Resets the model modules parameters to randomly initialized values, losing all training information.
"""
self._check_is_pytorch_model()
for m in self.model.modules():
if hasattr(m, 'reset_parameters'):
m.reset_parameters()
for p in self.model.parameters():
p.requires_grad = True
return self
@smart_inference_mode()
def load(self, weights='yolov8n.pt'):
"""
Transfers parameters with matching names and shapes from 'weights' to model.
"""
self._check_is_pytorch_model()
if isinstance(weights, (str, Path)):
weights, self.ckpt = attempt_load_one_weight(weights)
self.model.load(weights)
return self
def info(self, detailed=False, verbose=True):
"""
Logs model info.
Args:
detailed (bool): Show detailed information about model.
verbose (bool): Controls verbosity.
"""
self._check_is_pytorch_model()
return self.model.info(detailed=detailed, verbose=verbose)
def fuse(self):
"""Fuse PyTorch Conv2d and BatchNorm2d layers."""
self._check_is_pytorch_model()
self.model.fuse()
@smart_inference_mode()
def predict(self, source=None, stream=False, predictor=None, **kwargs):
"""
Perform prediction using the YOLO model.
Args:
source (str | int | PIL | np.ndarray): The source of the image to make predictions on.
Accepts all source types accepted by the YOLO model.
stream (bool): Whether to stream the predictions or not. Defaults to False.
predictor (BasePredictor): Customized predictor.
**kwargs : Additional keyword arguments passed to the predictor.
Check the 'configuration' section in the documentation for all available options.
Returns:
(List[ultralytics.engine.results.Results]): The prediction results.
"""
if source is None:
source = ASSETS
LOGGER.warning(f"WARNING ⚠️ 'source' is missing. Using 'source={source}'.")
is_cli = (sys.argv[0].endswith('yolo') or sys.argv[0].endswith('ultralytics')) and any(
x in sys.argv for x in ('predict', 'track', 'mode=predict', 'mode=track'))
custom = {'conf': 0.25, 'save': is_cli} # method defaults
args = {**self.overrides, **custom, **kwargs, 'mode': 'predict'} # highest priority args on the right
prompts = args.pop('prompts', None) # for SAM-type models
if not self.predictor:
self.predictor = (predictor or self.smart_load('predictor'))(overrides=args, _callbacks=self.callbacks)
self.predictor.setup_model(model=self.model, verbose=is_cli)
else: # only update args if predictor is already setup
self.predictor.args = get_cfg(self.predictor.args, args)
if 'project' in args or 'name' in args:
self.predictor.save_dir = get_save_dir(self.predictor.args)
if prompts and hasattr(self.predictor, 'set_prompts'): # for SAM-type models
self.predictor.set_prompts(prompts)
return self.predictor.predict_cli(source=source) if is_cli else self.predictor(source=source, stream=stream)
def track(self, source=None, stream=False, persist=False, **kwargs):
"""
Perform object tracking on the input source using the registered trackers.
Args:
source (str, optional): The input source for object tracking. Can be a file path or a video stream.
stream (bool, optional): Whether the input source is a video stream. Defaults to False.
persist (bool, optional): Whether to persist the trackers if they already exist. Defaults to False.
**kwargs (optional): Additional keyword arguments for the tracking process.
Returns:
(List[ultralytics.engine.results.Results]): The tracking results.
"""
if not hasattr(self.predictor, 'trackers'):
from ultralytics.trackers import register_tracker
register_tracker(self, persist)
# ByteTrack-based method needs low confidence predictions as input
kwargs['conf'] = kwargs.get('conf') or 0.1
kwargs['mode'] = 'track'
return self.predict(source=source, stream=stream, **kwargs)
@smart_inference_mode()
def val(self, validator=None, **kwargs):
"""
Validate a model on a given dataset.
Args:
validator (BaseValidator): Customized validator.
**kwargs : Any other args accepted by the validators. To see all args check 'configuration' section in docs
"""
custom = {'rect': True} # method defaults
args = {**self.overrides, **custom, **kwargs, 'mode': 'val'} # highest priority args on the right
args['imgsz'] = check_imgsz(args['imgsz'], max_dim=1)
validator = (validator or self.smart_load('validator'))(args=args, _callbacks=self.callbacks)
validator(model=self.model)
self.metrics = validator.metrics
return validator.metrics
@smart_inference_mode()
def benchmark(self, **kwargs):
"""
Benchmark a model on all export formats.
Args:
**kwargs : Any other args accepted by the validators. To see all args check 'configuration' section in docs
"""
self._check_is_pytorch_model()
from ultralytics.utils.benchmarks import benchmark
custom = {'verbose': False} # method defaults
args = {**DEFAULT_CFG_DICT, **self.model.args, **custom, **kwargs, 'mode': 'benchmark'}
return benchmark(
model=self,
data=kwargs.get('data'), # if no 'data' argument passed set data=None for default datasets
imgsz=args['imgsz'],
half=args['half'],
int8=args['int8'],
device=args['device'],
verbose=kwargs.get('verbose'))
def export(self, **kwargs):
"""
Export model.
Args:
**kwargs : Any other args accepted by the Exporter. To see all args check 'configuration' section in docs.
"""
self._check_is_pytorch_model()
from .exporter import Exporter
custom = {'imgsz': self.model.args['imgsz'], 'batch': 1, 'data': None, 'verbose': False} # method defaults
args = {**self.overrides, **custom, **kwargs, 'mode': 'export'} # highest priority args on the right
return Exporter(overrides=args, _callbacks=self.callbacks)(model=self.model)
def train(self, trainer=None, **kwargs):
"""
Trains the model on a given dataset.
Args:
trainer (BaseTrainer, optional): Customized trainer.
**kwargs (Any): Any number of arguments representing the training configuration.
"""
self._check_is_pytorch_model()
if self.session: # Ultralytics HUB session
if any(kwargs):
LOGGER.warning('WARNING ⚠️ using HUB training arguments, ignoring local training arguments.')
kwargs = self.session.train_args
check_pip_update_available()
overrides = yaml_load(check_yaml(kwargs['cfg'])) if kwargs.get('cfg') else self.overrides
custom = {'data': TASK2DATA[self.task]} # method defaults
args = {**overrides, **custom, **kwargs, 'mode': 'train'} # highest priority args on the right
if args.get('resume'):
args['resume'] = self.ckpt_path
self.trainer = (trainer or self.smart_load('trainer'))(overrides=args, _callbacks=self.callbacks)
if not args.get('resume'): # manually set model only if not resuming
self.trainer.model = self.trainer.get_model(weights=self.model if self.ckpt else None, cfg=self.model.yaml)
self.model = self.trainer.model
self.trainer.hub_session = self.session # attach optional HUB session
self.trainer.train()
# Update model and cfg after training
if RANK in (-1, 0):
ckpt = self.trainer.best if self.trainer.best.exists() else self.trainer.last
self.model, _ = attempt_load_one_weight(ckpt)
self.overrides = self.model.args
self.metrics = getattr(self.trainer.validator, 'metrics', None) # TODO: no metrics returned by DDP
return self.metrics
def tune(self, use_ray=False, iterations=10, *args, **kwargs):
"""
Runs hyperparameter tuning, optionally using Ray Tune. See ultralytics.utils.tuner.run_ray_tune for Args.
Returns:
(dict): A dictionary containing the results of the hyperparameter search.
"""
self._check_is_pytorch_model()
if use_ray:
from ultralytics.utils.tuner import run_ray_tune
return run_ray_tune(self, max_samples=iterations, *args, **kwargs)
else:
from .tuner import Tuner
custom = {'plots': False, 'save': False} # method defaults
args = {**self.overrides, **custom, **kwargs, 'mode': 'train'} # highest priority args on the right
return Tuner(args=args, _callbacks=self.callbacks)(model=self, iterations=iterations)
def to(self, device):
"""
Sends the model to the given device.
Args:
device (str): device
"""
self._check_is_pytorch_model()
self.model.to(device)
return self
@property
def names(self):
"""Returns class names of the loaded model."""
return self.model.names if hasattr(self.model, 'names') else None
@property
def device(self):
"""Returns device if PyTorch model."""
return next(self.model.parameters()).device if isinstance(self.model, nn.Module) else None
@property
def transforms(self):
"""Returns transform of the loaded model."""
return self.model.transforms if hasattr(self.model, 'transforms') else None
def add_callback(self, event: str, func):
"""Add a callback."""
self.callbacks[event].append(func)
def clear_callback(self, event: str):
"""Clear all event callbacks."""
self.callbacks[event] = []
@staticmethod
def _reset_ckpt_args(args):
"""Reset arguments when loading a PyTorch model."""
include = {'imgsz', 'data', 'task', 'single_cls'} # only remember these arguments when loading a PyTorch model
return {k: v for k, v in args.items() if k in include}
def _reset_callbacks(self):
"""Reset all registered callbacks."""
for event in callbacks.default_callbacks.keys():
self.callbacks[event] = [callbacks.default_callbacks[event][0]]
def __getattr__(self, attr):
"""Raises error if object has no requested attribute."""
name = self.__class__.__name__
raise AttributeError(f"'{name}' object has no attribute '{attr}'. See valid attributes below.\n{self.__doc__}")
def smart_load(self, key):
"""Load model/trainer/validator/predictor."""
try:
return self.task_map[self.task][key]
except Exception as e:
name = self.__class__.__name__
mode = inspect.stack()[1][3] # get the function name.
raise NotImplementedError(
emojis(f"WARNING ⚠️ '{name}' model does not support '{mode}' mode for '{self.task}' task yet.")) from e
@property
def task_map(self):
"""
Map head to model, trainer, validator, and predictor classes.
Returns:
task_map (dict): The map of model task to mode classes.
"""
raise NotImplementedError('Please provide task map for your model!')

View File

@ -0,0 +1,358 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
"""
Run prediction on images, videos, directories, globs, YouTube, webcam, streams, etc.
Usage - sources:
$ yolo mode=predict model=yolov8n.pt source=0 # webcam
img.jpg # image
vid.mp4 # video
screen # screenshot
path/ # directory
list.txt # list of images
list.streams # list of streams
'path/*.jpg' # glob
'https://youtu.be/Zgi9g1ksQHc' # YouTube
'rtsp://example.com/media.mp4' # RTSP, RTMP, HTTP stream
Usage - formats:
$ yolo mode=predict model=yolov8n.pt # PyTorch
yolov8n.torchscript # TorchScript
yolov8n.onnx # ONNX Runtime or OpenCV DNN with dnn=True
yolov8n_openvino_model # OpenVINO
yolov8n.engine # TensorRT
yolov8n.mlpackage # CoreML (macOS-only)
yolov8n_saved_model # TensorFlow SavedModel
yolov8n.pb # TensorFlow GraphDef
yolov8n.tflite # TensorFlow Lite
yolov8n_edgetpu.tflite # TensorFlow Edge TPU
yolov8n_paddle_model # PaddlePaddle
"""
import platform
from pathlib import Path
import cv2
import numpy as np
import torch
from ultralytics.cfg import get_cfg, get_save_dir
from ultralytics.data import load_inference_source
from ultralytics.data.augment import LetterBox, classify_transforms
from ultralytics.nn.autobackend import AutoBackend
from ultralytics.utils import DEFAULT_CFG, LOGGER, MACOS, WINDOWS, callbacks, colorstr, ops
from ultralytics.utils.checks import check_imgsz, check_imshow
from ultralytics.utils.files import increment_path
from ultralytics.utils.torch_utils import select_device, smart_inference_mode
STREAM_WARNING = """
WARNING ⚠️ inference results will accumulate in RAM unless `stream=True` is passed, causing potential out-of-memory
errors for large sources or long-running streams and videos. See https://docs.ultralytics.com/modes/predict/ for help.
Example:
results = model(source=..., stream=True) # generator of Results objects
for r in results:
boxes = r.boxes # Boxes object for bbox outputs
masks = r.masks # Masks object for segment masks outputs
probs = r.probs # Class probabilities for classification outputs
"""
class BasePredictor:
"""
BasePredictor
A base class for creating predictors.
Attributes:
args (SimpleNamespace): Configuration for the predictor.
save_dir (Path): Directory to save results.
done_warmup (bool): Whether the predictor has finished setup.
model (nn.Module): Model used for prediction.
data (dict): Data configuration.
device (torch.device): Device used for prediction.
dataset (Dataset): Dataset used for prediction.
vid_path (str): Path to video file.
vid_writer (cv2.VideoWriter): Video writer for saving video output.
data_path (str): Path to data.
"""
def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
"""
Initializes the BasePredictor class.
Args:
cfg (str, optional): Path to a configuration file. Defaults to DEFAULT_CFG.
overrides (dict, optional): Configuration overrides. Defaults to None.
"""
self.args = get_cfg(cfg, overrides)
self.save_dir = get_save_dir(self.args)
if self.args.conf is None:
self.args.conf = 0.25 # default conf=0.25
self.done_warmup = False
if self.args.show:
self.args.show = check_imshow(warn=True)
# Usable if setup is done
self.model = None
self.data = self.args.data # data_dict
self.imgsz = None
self.device = None
self.dataset = None
self.vid_path, self.vid_writer = None, None
self.plotted_img = None
self.data_path = None
self.source_type = None
self.batch = None
self.results = None
self.transforms = None
self.callbacks = _callbacks or callbacks.get_default_callbacks()
self.txt_path = None
callbacks.add_integration_callbacks(self)
def preprocess(self, im):
"""Prepares input image before inference.
Args:
im (torch.Tensor | List(np.ndarray)): BCHW for tensor, [(HWC) x B] for list.
"""
not_tensor = not isinstance(im, torch.Tensor)
if not_tensor:
im = np.stack(self.pre_transform(im))
im = im[..., ::-1].transpose((0, 3, 1, 2)) # BGR to RGB, BHWC to BCHW, (n, 3, h, w)
im = np.ascontiguousarray(im) # contiguous
im = torch.from_numpy(im)
im = im.to(self.device)
im = im.half() if self.model.fp16 else im.float() # uint8 to fp16/32
if not_tensor:
im /= 255 # 0 - 255 to 0.0 - 1.0
return im
def inference(self, im, *args, **kwargs):
visualize = increment_path(self.save_dir / Path(self.batch[0][0]).stem,
mkdir=True) if self.args.visualize and (not self.source_type.tensor) else False
return self.model(im, augment=self.args.augment, visualize=visualize)
def pre_transform(self, im):
"""
Pre-transform input image before inference.
Args:
im (List(np.ndarray)): (N, 3, h, w) for tensor, [(h, w, 3) x N] for list.
Returns:
(list): A list of transformed images.
"""
same_shapes = all(x.shape == im[0].shape for x in im)
letterbox = LetterBox(self.imgsz, auto=same_shapes and self.model.pt, stride=self.model.stride)
return [letterbox(image=x) for x in im]
def write_results(self, idx, results, batch):
"""Write inference results to a file or directory."""
p, im, _ = batch
log_string = ''
if len(im.shape) == 3:
im = im[None] # expand for batch dim
if self.source_type.webcam or self.source_type.from_img or self.source_type.tensor: # batch_size >= 1
log_string += f'{idx}: '
frame = self.dataset.count
else:
frame = getattr(self.dataset, 'frame', 0)
self.data_path = p
self.txt_path = str(self.save_dir / 'labels' / p.stem) + ('' if self.dataset.mode == 'image' else f'_{frame}')
log_string += '%gx%g ' % im.shape[2:] # print string
result = results[idx]
log_string += result.verbose()
if self.args.save or self.args.show: # Add bbox to image
plot_args = {
'line_width': self.args.line_width,
'boxes': self.args.boxes,
'conf': self.args.show_conf,
'labels': self.args.show_labels}
if not self.args.retina_masks:
plot_args['im_gpu'] = im[idx]
self.plotted_img = result.plot(**plot_args)
# Write
if self.args.save_txt:
result.save_txt(f'{self.txt_path}.txt', save_conf=self.args.save_conf)
if self.args.save_crop:
result.save_crop(save_dir=self.save_dir / 'crops',
file_name=self.data_path.stem + ('' if self.dataset.mode == 'image' else f'_{frame}'))
return log_string
def postprocess(self, preds, img, orig_imgs):
"""Post-processes predictions for an image and returns them."""
return preds
def __call__(self, source=None, model=None, stream=False, *args, **kwargs):
"""Performs inference on an image or stream."""
self.stream = stream
if stream:
return self.stream_inference(source, model, *args, **kwargs)
else:
return list(self.stream_inference(source, model, *args, **kwargs)) # merge list of Result into one
def predict_cli(self, source=None, model=None):
"""Method used for CLI prediction. It uses always generator as outputs as not required by CLI mode."""
gen = self.stream_inference(source, model)
for _ in gen: # running CLI inference without accumulating any outputs (do not modify)
pass
def setup_source(self, source):
"""Sets up source and inference mode."""
self.imgsz = check_imgsz(self.args.imgsz, stride=self.model.stride, min_dim=2) # check image size
self.transforms = getattr(self.model.model, 'transforms', classify_transforms(
self.imgsz[0])) if self.args.task == 'classify' else None
self.dataset = load_inference_source(source=source,
imgsz=self.imgsz,
vid_stride=self.args.vid_stride,
stream_buffer=self.args.stream_buffer)
self.source_type = self.dataset.source_type
if not getattr(self, 'stream', True) and (self.dataset.mode == 'stream' or # streams
len(self.dataset) > 1000 or # images
any(getattr(self.dataset, 'video_flag', [False]))): # videos
LOGGER.warning(STREAM_WARNING)
self.vid_path, self.vid_writer = [None] * self.dataset.bs, [None] * self.dataset.bs
@smart_inference_mode()
def stream_inference(self, source=None, model=None, *args, **kwargs):
"""Streams real-time inference on camera feed and saves results to file."""
if self.args.verbose:
LOGGER.info('')
# Setup model
if not self.model:
self.setup_model(model)
# Setup source every time predict is called
self.setup_source(source if source is not None else self.args.source)
# Check if save_dir/ label file exists
if self.args.save or self.args.save_txt:
(self.save_dir / 'labels' if self.args.save_txt else self.save_dir).mkdir(parents=True, exist_ok=True)
# Warmup model
if not self.done_warmup:
self.model.warmup(imgsz=(1 if self.model.pt or self.model.triton else self.dataset.bs, 3, *self.imgsz))
self.done_warmup = True
self.seen, self.windows, self.batch, profilers = 0, [], None, (ops.Profile(), ops.Profile(), ops.Profile())
self.run_callbacks('on_predict_start')
for batch in self.dataset:
self.run_callbacks('on_predict_batch_start')
self.batch = batch
path, im0s, vid_cap, s = batch
# Preprocess
with profilers[0]:
im = self.preprocess(im0s)
# Inference
with profilers[1]:
preds = self.inference(im, *args, **kwargs)
# Postprocess
with profilers[2]:
self.results = self.postprocess(preds, im, im0s)
self.run_callbacks('on_predict_postprocess_end')
# Visualize, save, write results
n = len(im0s)
for i in range(n):
self.seen += 1
self.results[i].speed = {
'preprocess': profilers[0].dt * 1E3 / n,
'inference': profilers[1].dt * 1E3 / n,
'postprocess': profilers[2].dt * 1E3 / n}
p, im0 = path[i], None if self.source_type.tensor else im0s[i].copy()
p = Path(p)
if self.args.verbose or self.args.save or self.args.save_txt or self.args.show:
s += self.write_results(i, self.results, (p, im, im0))
if self.args.save or self.args.save_txt:
self.results[i].save_dir = self.save_dir.__str__()
if self.args.show and self.plotted_img is not None:
self.show(p)
if self.args.save and self.plotted_img is not None:
self.save_preds(vid_cap, i, str(self.save_dir / p.name))
self.run_callbacks('on_predict_batch_end')
yield from self.results
# Print time (inference-only)
if self.args.verbose:
LOGGER.info(f'{s}{profilers[1].dt * 1E3:.1f}ms')
# Release assets
if isinstance(self.vid_writer[-1], cv2.VideoWriter):
self.vid_writer[-1].release() # release final video writer
# Print results
if self.args.verbose and self.seen:
t = tuple(x.t / self.seen * 1E3 for x in profilers) # speeds per image
LOGGER.info(f'Speed: %.1fms preprocess, %.1fms inference, %.1fms postprocess per image at shape '
f'{(1, 3, *im.shape[2:])}' % t)
if self.args.save or self.args.save_txt or self.args.save_crop:
nl = len(list(self.save_dir.glob('labels/*.txt'))) # number of labels
s = f"\n{nl} label{'s' * (nl > 1)} saved to {self.save_dir / 'labels'}" if self.args.save_txt else ''
LOGGER.info(f"Results saved to {colorstr('bold', self.save_dir)}{s}")
self.run_callbacks('on_predict_end')
def setup_model(self, model, verbose=True):
"""Initialize YOLO model with given parameters and set it to evaluation mode."""
self.model = AutoBackend(model or self.args.model,
device=select_device(self.args.device, verbose=verbose),
dnn=self.args.dnn,
data=self.args.data,
fp16=self.args.half,
fuse=True,
verbose=verbose)
self.device = self.model.device # update device
self.args.half = self.model.fp16 # update half
self.model.eval()
def show(self, p):
"""Display an image in a window using OpenCV imshow()."""
im0 = self.plotted_img
if platform.system() == 'Linux' and p not in self.windows:
self.windows.append(p)
cv2.namedWindow(str(p), cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO) # allow window resize (Linux)
cv2.resizeWindow(str(p), im0.shape[1], im0.shape[0])
cv2.imshow(str(p), im0)
cv2.waitKey(500 if self.batch[3].startswith('image') else 1) # 1 millisecond
def save_preds(self, vid_cap, idx, save_path):
"""Save video predictions as mp4 at specified path."""
im0 = self.plotted_img
# Save imgs
if self.dataset.mode == 'image':
cv2.imwrite(save_path, im0)
else: # 'video' or 'stream'
if self.vid_path[idx] != save_path: # new video
self.vid_path[idx] = save_path
if isinstance(self.vid_writer[idx], cv2.VideoWriter):
self.vid_writer[idx].release() # release previous video writer
if vid_cap: # video
fps = int(vid_cap.get(cv2.CAP_PROP_FPS)) # integer required, floats produce error in MP4 codec
w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
else: # stream
fps, w, h = 30, im0.shape[1], im0.shape[0]
suffix, fourcc = ('.mp4', 'avc1') if MACOS else ('.avi', 'WMV2') if WINDOWS else ('.avi', 'MJPG')
save_path = str(Path(save_path).with_suffix(suffix))
self.vid_writer[idx] = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*fourcc), fps, (w, h))
self.vid_writer[idx].write(im0)
def run_callbacks(self, event: str):
"""Runs all registered callbacks for a specific event."""
for callback in self.callbacks.get(event, []):
callback(self)
def add_callback(self, event: str, func):
"""
Add callback
"""
self.callbacks[event].append(func)

View File

@ -0,0 +1,593 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
"""
Ultralytics Results, Boxes and Masks classes for handling inference results
Usage: See https://docs.ultralytics.com/modes/predict/
"""
from copy import deepcopy
from functools import lru_cache
from pathlib import Path
import numpy as np
import torch
from ultralytics.data.augment import LetterBox
from ultralytics.utils import LOGGER, SimpleClass, deprecation_warn, ops
from ultralytics.utils.plotting import Annotator, colors, save_one_box
class BaseTensor(SimpleClass):
"""
Base tensor class with additional methods for easy manipulation and device handling.
"""
def __init__(self, data, orig_shape) -> None:
"""Initialize BaseTensor with data and original shape.
Args:
data (torch.Tensor | np.ndarray): Predictions, such as bboxes, masks and keypoints.
orig_shape (tuple): Original shape of image.
"""
assert isinstance(data, (torch.Tensor, np.ndarray))
self.data = data
self.orig_shape = orig_shape
@property
def shape(self):
"""Return the shape of the data tensor."""
return self.data.shape
def cpu(self):
"""Return a copy of the tensor on CPU memory."""
return self if isinstance(self.data, np.ndarray) else self.__class__(self.data.cpu(), self.orig_shape)
def numpy(self):
"""Return a copy of the tensor as a numpy array."""
return self if isinstance(self.data, np.ndarray) else self.__class__(self.data.numpy(), self.orig_shape)
def cuda(self):
"""Return a copy of the tensor on GPU memory."""
return self.__class__(torch.as_tensor(self.data).cuda(), self.orig_shape)
def to(self, *args, **kwargs):
"""Return a copy of the tensor with the specified device and dtype."""
return self.__class__(torch.as_tensor(self.data).to(*args, **kwargs), self.orig_shape)
def __len__(self): # override len(results)
"""Return the length of the data tensor."""
return len(self.data)
def __getitem__(self, idx):
"""Return a BaseTensor with the specified index of the data tensor."""
return self.__class__(self.data[idx], self.orig_shape)
class Results(SimpleClass):
"""
A class for storing and manipulating inference results.
Args:
orig_img (numpy.ndarray): The original image as a numpy array.
path (str): The path to the image file.
names (dict): A dictionary of class names.
boxes (torch.tensor, optional): A 2D tensor of bounding box coordinates for each detection.
masks (torch.tensor, optional): A 3D tensor of detection masks, where each mask is a binary image.
probs (torch.tensor, optional): A 1D tensor of probabilities of each class for classification task.
keypoints (List[List[float]], optional): A list of detected keypoints for each object.
Attributes:
orig_img (numpy.ndarray): The original image as a numpy array.
orig_shape (tuple): The original image shape in (height, width) format.
boxes (Boxes, optional): A Boxes object containing the detection bounding boxes.
masks (Masks, optional): A Masks object containing the detection masks.
probs (Probs, optional): A Probs object containing probabilities of each class for classification task.
keypoints (Keypoints, optional): A Keypoints object containing detected keypoints for each object.
speed (dict): A dictionary of preprocess, inference, and postprocess speeds in milliseconds per image.
names (dict): A dictionary of class names.
path (str): The path to the image file.
_keys (tuple): A tuple of attribute names for non-empty attributes.
"""
def __init__(self, orig_img, path, names, boxes=None, masks=None, probs=None, keypoints=None) -> None:
"""Initialize the Results class."""
self.orig_img = orig_img
self.orig_shape = orig_img.shape[:2]
self.boxes = Boxes(boxes, self.orig_shape) if boxes is not None else None # native size boxes
self.masks = Masks(masks, self.orig_shape) if masks is not None else None # native size or imgsz masks
self.probs = Probs(probs) if probs is not None else None
self.keypoints = Keypoints(keypoints, self.orig_shape) if keypoints is not None else None
self.speed = {'preprocess': None, 'inference': None, 'postprocess': None} # milliseconds per image
self.names = names
self.path = path
self.save_dir = None
self._keys = 'boxes', 'masks', 'probs', 'keypoints'
def __getitem__(self, idx):
"""Return a Results object for the specified index."""
return self._apply('__getitem__', idx)
def __len__(self):
"""Return the number of detections in the Results object."""
for k in self._keys:
v = getattr(self, k)
if v is not None:
return len(v)
def update(self, boxes=None, masks=None, probs=None):
"""Update the boxes, masks, and probs attributes of the Results object."""
if boxes is not None:
ops.clip_boxes(boxes, self.orig_shape) # clip boxes
self.boxes = Boxes(boxes, self.orig_shape)
if masks is not None:
self.masks = Masks(masks, self.orig_shape)
if probs is not None:
self.probs = probs
def _apply(self, fn, *args, **kwargs):
r = self.new()
for k in self._keys:
v = getattr(self, k)
if v is not None:
setattr(r, k, getattr(v, fn)(*args, **kwargs))
return r
def cpu(self):
"""Return a copy of the Results object with all tensors on CPU memory."""
return self._apply('cpu')
def numpy(self):
"""Return a copy of the Results object with all tensors as numpy arrays."""
return self._apply('numpy')
def cuda(self):
"""Return a copy of the Results object with all tensors on GPU memory."""
return self._apply('cuda')
def to(self, *args, **kwargs):
"""Return a copy of the Results object with tensors on the specified device and dtype."""
return self._apply('to', *args, **kwargs)
def new(self):
"""Return a new Results object with the same image, path, and names."""
return Results(orig_img=self.orig_img, path=self.path, names=self.names)
def plot(
self,
conf=True,
line_width=None,
font_size=None,
font='Arial.ttf',
pil=False,
img=None,
im_gpu=None,
kpt_radius=5,
kpt_line=True,
labels=True,
boxes=True,
masks=True,
probs=True,
**kwargs # deprecated args TODO: remove support in 8.2
):
"""
Plots the detection results on an input RGB image. Accepts a numpy array (cv2) or a PIL Image.
Args:
conf (bool): Whether to plot the detection confidence score.
line_width (float, optional): The line width of the bounding boxes. If None, it is scaled to the image size.
font_size (float, optional): The font size of the text. If None, it is scaled to the image size.
font (str): The font to use for the text.
pil (bool): Whether to return the image as a PIL Image.
img (numpy.ndarray): Plot to another image. if not, plot to original image.
im_gpu (torch.Tensor): Normalized image in gpu with shape (1, 3, 640, 640), for faster mask plotting.
kpt_radius (int, optional): Radius of the drawn keypoints. Default is 5.
kpt_line (bool): Whether to draw lines connecting keypoints.
labels (bool): Whether to plot the label of bounding boxes.
boxes (bool): Whether to plot the bounding boxes.
masks (bool): Whether to plot the masks.
probs (bool): Whether to plot classification probability
Returns:
(numpy.ndarray): A numpy array of the annotated image.
Example:
```python
from PIL import Image
from ultralytics import YOLO
model = YOLO('yolov8n.pt')
results = model('bus.jpg') # results list
for r in results:
im_array = r.plot() # plot a BGR numpy array of predictions
im = Image.fromarray(im_array[..., ::-1]) # RGB PIL image
im.show() # show image
im.save('results.jpg') # save image
```
"""
if img is None and isinstance(self.orig_img, torch.Tensor):
img = (self.orig_img[0].detach().permute(1, 2, 0).contiguous() * 255).to(torch.uint8).cpu().numpy()
# Deprecation warn TODO: remove in 8.2
if 'show_conf' in kwargs:
deprecation_warn('show_conf', 'conf')
conf = kwargs['show_conf']
assert isinstance(conf, bool), '`show_conf` should be of boolean type, i.e, show_conf=True/False'
if 'line_thickness' in kwargs:
deprecation_warn('line_thickness', 'line_width')
line_width = kwargs['line_thickness']
assert isinstance(line_width, int), '`line_width` should be of int type, i.e, line_width=3'
names = self.names
pred_boxes, show_boxes = self.boxes, boxes
pred_masks, show_masks = self.masks, masks
pred_probs, show_probs = self.probs, probs
annotator = Annotator(
deepcopy(self.orig_img if img is None else img),
line_width,
font_size,
font,
pil or (pred_probs is not None and show_probs), # Classify tasks default to pil=True
example=names)
# Plot Segment results
if pred_masks and show_masks:
if im_gpu is None:
img = LetterBox(pred_masks.shape[1:])(image=annotator.result())
im_gpu = torch.as_tensor(img, dtype=torch.float16, device=pred_masks.data.device).permute(
2, 0, 1).flip(0).contiguous() / 255
idx = pred_boxes.cls if pred_boxes else range(len(pred_masks))
annotator.masks(pred_masks.data, colors=[colors(x, True) for x in idx], im_gpu=im_gpu)
# Plot Detect results
if pred_boxes and show_boxes:
for d in reversed(pred_boxes):
c, conf, id = int(d.cls), float(d.conf) if conf else None, None if d.id is None else int(d.id.item())
name = ('' if id is None else f'id:{id} ') + names[c]
label = (f'{name} {conf:.2f}' if conf else name) if labels else None
annotator.box_label(d.xyxy.squeeze(), label, color=colors(c, True))
# Plot Classify results
if pred_probs is not None and show_probs:
text = ',\n'.join(f'{names[j] if names else j} {pred_probs.data[j]:.2f}' for j in pred_probs.top5)
x = round(self.orig_shape[0] * 0.03)
annotator.text([x, x], text, txt_color=(255, 255, 255)) # TODO: allow setting colors
# Plot Pose results
if self.keypoints is not None:
for k in reversed(self.keypoints.data):
annotator.kpts(k, self.orig_shape, radius=kpt_radius, kpt_line=kpt_line)
return annotator.result()
def verbose(self):
"""
Return log string for each task.
"""
log_string = ''
probs = self.probs
boxes = self.boxes
if len(self) == 0:
return log_string if probs is not None else f'{log_string}(no detections), '
if probs is not None:
log_string += f"{', '.join(f'{self.names[j]} {probs.data[j]:.2f}' for j in probs.top5)}, "
if boxes:
for c in boxes.cls.unique():
n = (boxes.cls == c).sum() # detections per class
log_string += f"{n} {self.names[int(c)]}{'s' * (n > 1)}, "
return log_string
def save_txt(self, txt_file, save_conf=False):
"""
Save predictions into txt file.
Args:
txt_file (str): txt file path.
save_conf (bool): save confidence score or not.
"""
boxes = self.boxes
masks = self.masks
probs = self.probs
kpts = self.keypoints
texts = []
if probs is not None:
# Classify
[texts.append(f'{probs.data[j]:.2f} {self.names[j]}') for j in probs.top5]
elif boxes:
# Detect/segment/pose
for j, d in enumerate(boxes):
c, conf, id = int(d.cls), float(d.conf), None if d.id is None else int(d.id.item())
line = (c, *d.xywhn.view(-1))
if masks:
seg = masks[j].xyn[0].copy().reshape(-1) # reversed mask.xyn, (n,2) to (n*2)
line = (c, *seg)
if kpts is not None:
kpt = torch.cat((kpts[j].xyn, kpts[j].conf[..., None]), 2) if kpts[j].has_visible else kpts[j].xyn
line += (*kpt.reshape(-1).tolist(), )
line += (conf, ) * save_conf + (() if id is None else (id, ))
texts.append(('%g ' * len(line)).rstrip() % line)
if texts:
Path(txt_file).parent.mkdir(parents=True, exist_ok=True) # make directory
with open(txt_file, 'a') as f:
f.writelines(text + '\n' for text in texts)
def save_crop(self, save_dir, file_name=Path('im.jpg')):
"""
Save cropped predictions to `save_dir/cls/file_name.jpg`.
Args:
save_dir (str | pathlib.Path): Save path.
file_name (str | pathlib.Path): File name.
"""
if self.probs is not None:
LOGGER.warning('WARNING ⚠️ Classify task do not support `save_crop`.')
return
for d in self.boxes:
save_one_box(d.xyxy,
self.orig_img.copy(),
file=Path(save_dir) / self.names[int(d.cls)] / f'{Path(file_name).stem}.jpg',
BGR=True)
def tojson(self, normalize=False):
"""Convert the object to JSON format."""
if self.probs is not None:
LOGGER.warning('Warning: Classify task do not support `tojson` yet.')
return
import json
# Create list of detection dictionaries
results = []
data = self.boxes.data.cpu().tolist()
h, w = self.orig_shape if normalize else (1, 1)
for i, row in enumerate(data): # xyxy, track_id if tracking, conf, class_id
box = {'x1': row[0] / w, 'y1': row[1] / h, 'x2': row[2] / w, 'y2': row[3] / h}
conf = row[-2]
class_id = int(row[-1])
name = self.names[class_id]
result = {'name': name, 'class': class_id, 'confidence': conf, 'box': box}
if self.boxes.is_track:
result['track_id'] = int(row[-3]) # track ID
if self.masks:
x, y = self.masks.xy[i][:, 0], self.masks.xy[i][:, 1] # numpy array
result['segments'] = {'x': (x / w).tolist(), 'y': (y / h).tolist()}
if self.keypoints is not None:
x, y, visible = self.keypoints[i].data[0].cpu().unbind(dim=1) # torch Tensor
result['keypoints'] = {'x': (x / w).tolist(), 'y': (y / h).tolist(), 'visible': visible.tolist()}
results.append(result)
# Convert detections to JSON
return json.dumps(results, indent=2)
class Boxes(BaseTensor):
"""
A class for storing and manipulating detection boxes.
Args:
boxes (torch.Tensor | numpy.ndarray): A tensor or numpy array containing the detection boxes,
with shape (num_boxes, 6) or (num_boxes, 7). The last two columns contain confidence and class values.
If present, the third last column contains track IDs.
orig_shape (tuple): Original image size, in the format (height, width).
Attributes:
xyxy (torch.Tensor | numpy.ndarray): The boxes in xyxy format.
conf (torch.Tensor | numpy.ndarray): The confidence values of the boxes.
cls (torch.Tensor | numpy.ndarray): The class values of the boxes.
id (torch.Tensor | numpy.ndarray): The track IDs of the boxes (if available).
xywh (torch.Tensor | numpy.ndarray): The boxes in xywh format.
xyxyn (torch.Tensor | numpy.ndarray): The boxes in xyxy format normalized by original image size.
xywhn (torch.Tensor | numpy.ndarray): The boxes in xywh format normalized by original image size.
data (torch.Tensor): The raw bboxes tensor (alias for `boxes`).
Methods:
cpu(): Move the object to CPU memory.
numpy(): Convert the object to a numpy array.
cuda(): Move the object to CUDA memory.
to(*args, **kwargs): Move the object to the specified device.
"""
def __init__(self, boxes, orig_shape) -> None:
"""Initialize the Boxes class."""
if boxes.ndim == 1:
boxes = boxes[None, :]
n = boxes.shape[-1]
assert n in (6, 7), f'expected `n` in [6, 7], but got {n}' # xyxy, track_id, conf, cls
super().__init__(boxes, orig_shape)
self.is_track = n == 7
self.orig_shape = orig_shape
@property
def xyxy(self):
"""Return the boxes in xyxy format."""
return self.data[:, :4]
@property
def conf(self):
"""Return the confidence values of the boxes."""
return self.data[:, -2]
@property
def cls(self):
"""Return the class values of the boxes."""
return self.data[:, -1]
@property
def id(self):
"""Return the track IDs of the boxes (if available)."""
return self.data[:, -3] if self.is_track else None
@property
@lru_cache(maxsize=2) # maxsize 1 should suffice
def xywh(self):
"""Return the boxes in xywh format."""
return ops.xyxy2xywh(self.xyxy)
@property
@lru_cache(maxsize=2)
def xyxyn(self):
"""Return the boxes in xyxy format normalized by original image size."""
xyxy = self.xyxy.clone() if isinstance(self.xyxy, torch.Tensor) else np.copy(self.xyxy)
xyxy[..., [0, 2]] /= self.orig_shape[1]
xyxy[..., [1, 3]] /= self.orig_shape[0]
return xyxy
@property
@lru_cache(maxsize=2)
def xywhn(self):
"""Return the boxes in xywh format normalized by original image size."""
xywh = ops.xyxy2xywh(self.xyxy)
xywh[..., [0, 2]] /= self.orig_shape[1]
xywh[..., [1, 3]] /= self.orig_shape[0]
return xywh
@property
def boxes(self):
"""Return the raw bboxes tensor (deprecated)."""
LOGGER.warning("WARNING ⚠️ 'Boxes.boxes' is deprecated. Use 'Boxes.data' instead.")
return self.data
class Masks(BaseTensor):
"""
A class for storing and manipulating detection masks.
Attributes:
segments (list): Deprecated property for segments (normalized).
xy (list): A list of segments in pixel coordinates.
xyn (list): A list of normalized segments.
Methods:
cpu(): Returns the masks tensor on CPU memory.
numpy(): Returns the masks tensor as a numpy array.
cuda(): Returns the masks tensor on GPU memory.
to(device, dtype): Returns the masks tensor with the specified device and dtype.
"""
def __init__(self, masks, orig_shape) -> None:
"""Initialize the Masks class with the given masks tensor and original image shape."""
if masks.ndim == 2:
masks = masks[None, :]
super().__init__(masks, orig_shape)
@property
@lru_cache(maxsize=1)
def segments(self):
"""Return segments (normalized). Deprecated; use xyn property instead."""
LOGGER.warning(
"WARNING ⚠️ 'Masks.segments' is deprecated. Use 'Masks.xyn' for segments (normalized) and 'Masks.xy' for segments (pixels) instead."
)
return self.xyn
@property
@lru_cache(maxsize=1)
def xyn(self):
"""Return normalized segments."""
return [
ops.scale_coords(self.data.shape[1:], x, self.orig_shape, normalize=True)
for x in ops.masks2segments(self.data)]
@property
@lru_cache(maxsize=1)
def xy(self):
"""Return segments in pixel coordinates."""
return [
ops.scale_coords(self.data.shape[1:], x, self.orig_shape, normalize=False)
for x in ops.masks2segments(self.data)]
@property
def masks(self):
"""Return the raw masks tensor. Deprecated; use data attribute instead."""
LOGGER.warning("WARNING ⚠️ 'Masks.masks' is deprecated. Use 'Masks.data' instead.")
return self.data
class Keypoints(BaseTensor):
"""
A class for storing and manipulating detection keypoints.
Attributes:
xy (torch.Tensor): A collection of keypoints containing x, y coordinates for each detection.
xyn (torch.Tensor): A normalized version of xy with coordinates in the range [0, 1].
conf (torch.Tensor): Confidence values associated with keypoints if available, otherwise None.
Methods:
cpu(): Returns a copy of the keypoints tensor on CPU memory.
numpy(): Returns a copy of the keypoints tensor as a numpy array.
cuda(): Returns a copy of the keypoints tensor on GPU memory.
to(device, dtype): Returns a copy of the keypoints tensor with the specified device and dtype.
"""
def __init__(self, keypoints, orig_shape) -> None:
"""Initializes the Keypoints object with detection keypoints and original image size."""
if keypoints.ndim == 2:
keypoints = keypoints[None, :]
super().__init__(keypoints, orig_shape)
self.has_visible = self.data.shape[-1] == 3
@property
@lru_cache(maxsize=1)
def xy(self):
"""Returns x, y coordinates of keypoints."""
return self.data[..., :2]
@property
@lru_cache(maxsize=1)
def xyn(self):
"""Returns normalized x, y coordinates of keypoints."""
xy = self.xy.clone() if isinstance(self.xy, torch.Tensor) else np.copy(self.xy)
xy[..., 0] /= self.orig_shape[1]
xy[..., 1] /= self.orig_shape[0]
return xy
@property
@lru_cache(maxsize=1)
def conf(self):
"""Returns confidence values of keypoints if available, else None."""
return self.data[..., 2] if self.has_visible else None
class Probs(BaseTensor):
"""
A class for storing and manipulating classification predictions.
Attributes:
top1 (int): Index of the top 1 class.
top5 (list[int]): Indices of the top 5 classes.
top1conf (torch.Tensor): Confidence of the top 1 class.
top5conf (torch.Tensor): Confidences of the top 5 classes.
Methods:
cpu(): Returns a copy of the probs tensor on CPU memory.
numpy(): Returns a copy of the probs tensor as a numpy array.
cuda(): Returns a copy of the probs tensor on GPU memory.
to(): Returns a copy of the probs tensor with the specified device and dtype.
"""
def __init__(self, probs, orig_shape=None) -> None:
super().__init__(probs, orig_shape)
@property
@lru_cache(maxsize=1)
def top1(self):
"""Return the index of top 1."""
return int(self.data.argmax())
@property
@lru_cache(maxsize=1)
def top5(self):
"""Return the indices of top 5."""
return (-self.data).argsort(0)[:5].tolist() # this way works with both torch and numpy.
@property
@lru_cache(maxsize=1)
def top1conf(self):
"""Return the confidence of top 1."""
return self.data[self.top1]
@property
@lru_cache(maxsize=1)
def top5conf(self):
"""Return the confidences of top 5."""
return self.data[self.top5]

View File

@ -0,0 +1,689 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
"""
Train a model on a dataset
Usage:
$ yolo mode=train model=yolov8n.pt data=coco128.yaml imgsz=640 epochs=100 batch=16
"""
import math
import os
import subprocess
import time
import warnings
from copy import deepcopy
from datetime import datetime, timedelta
from pathlib import Path
import numpy as np
import torch
from torch import distributed as dist
from torch import nn, optim
from torch.cuda import amp
from torch.nn.parallel import DistributedDataParallel as DDP
from ultralytics.cfg import get_cfg, get_save_dir
from ultralytics.data.utils import check_cls_dataset, check_det_dataset
from ultralytics.nn.tasks import attempt_load_one_weight, attempt_load_weights
from ultralytics.utils import (DEFAULT_CFG, LOGGER, RANK, TQDM, __version__, callbacks, clean_url, colorstr, emojis,
yaml_save)
from ultralytics.utils.autobatch import check_train_batch_size
from ultralytics.utils.checks import check_amp, check_file, check_imgsz, print_args
from ultralytics.utils.dist import ddp_cleanup, generate_ddp_command
from ultralytics.utils.files import get_latest_run
from ultralytics.utils.torch_utils import (EarlyStopping, ModelEMA, de_parallel, init_seeds, one_cycle, select_device,
strip_optimizer)
class BaseTrainer:
"""
BaseTrainer
A base class for creating trainers.
Attributes:
args (SimpleNamespace): Configuration for the trainer.
check_resume (method): Method to check if training should be resumed from a saved checkpoint.
validator (BaseValidator): Validator instance.
model (nn.Module): Model instance.
callbacks (defaultdict): Dictionary of callbacks.
save_dir (Path): Directory to save results.
wdir (Path): Directory to save weights.
last (Path): Path to the last checkpoint.
best (Path): Path to the best checkpoint.
save_period (int): Save checkpoint every x epochs (disabled if < 1).
batch_size (int): Batch size for training.
epochs (int): Number of epochs to train for.
start_epoch (int): Starting epoch for training.
device (torch.device): Device to use for training.
amp (bool): Flag to enable AMP (Automatic Mixed Precision).
scaler (amp.GradScaler): Gradient scaler for AMP.
data (str): Path to data.
trainset (torch.utils.data.Dataset): Training dataset.
testset (torch.utils.data.Dataset): Testing dataset.
ema (nn.Module): EMA (Exponential Moving Average) of the model.
lf (nn.Module): Loss function.
scheduler (torch.optim.lr_scheduler._LRScheduler): Learning rate scheduler.
best_fitness (float): The best fitness value achieved.
fitness (float): Current fitness value.
loss (float): Current loss value.
tloss (float): Total loss value.
loss_names (list): List of loss names.
csv (Path): Path to results CSV file.
"""
def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
"""
Initializes the BaseTrainer class.
Args:
cfg (str, optional): Path to a configuration file. Defaults to DEFAULT_CFG.
overrides (dict, optional): Configuration overrides. Defaults to None.
"""
self.args = get_cfg(cfg, overrides)
self.check_resume(overrides)
self.device = select_device(self.args.device, self.args.batch)
self.validator = None
self.model = None
self.metrics = None
self.plots = {}
init_seeds(self.args.seed + 1 + RANK, deterministic=self.args.deterministic)
# Dirs
self.save_dir = get_save_dir(self.args)
self.wdir = self.save_dir / 'weights' # weights dir
if RANK in (-1, 0):
self.wdir.mkdir(parents=True, exist_ok=True) # make dir
self.args.save_dir = str(self.save_dir)
yaml_save(self.save_dir / 'args.yaml', vars(self.args)) # save run args
self.last, self.best = self.wdir / 'last.pt', self.wdir / 'best.pt' # checkpoint paths
self.save_period = self.args.save_period
self.batch_size = self.args.batch
self.epochs = self.args.epochs
self.start_epoch = 0
if RANK == -1:
print_args(vars(self.args))
# Device
if self.device.type in ('cpu', 'mps'):
self.args.workers = 0 # faster CPU training as time dominated by inference, not dataloading
# Model and Dataset
self.model = self.args.model
try:
if self.args.task == 'classify':
self.data = check_cls_dataset(self.args.data)
elif self.args.data.split('.')[-1] in ('yaml', 'yml') or self.args.task in ('detect', 'segment', 'pose'):
self.data = check_det_dataset(self.args.data)
if 'yaml_file' in self.data:
self.args.data = self.data['yaml_file'] # for validating 'yolo train data=url.zip' usage
except Exception as e:
raise RuntimeError(emojis(f"Dataset '{clean_url(self.args.data)}' error ❌ {e}")) from e
self.trainset, self.testset = self.get_dataset(self.data)
self.ema = None
# Optimization utils init
self.lf = None
self.scheduler = None
# Epoch level metrics
self.best_fitness = None
self.fitness = None
self.loss = None
self.tloss = None
self.loss_names = ['Loss']
self.csv = self.save_dir / 'results.csv'
self.plot_idx = [0, 1, 2]
# Callbacks
self.callbacks = _callbacks or callbacks.get_default_callbacks()
if RANK in (-1, 0):
callbacks.add_integration_callbacks(self)
def add_callback(self, event: str, callback):
"""
Appends the given callback.
"""
self.callbacks[event].append(callback)
def set_callback(self, event: str, callback):
"""
Overrides the existing callbacks with the given callback.
"""
self.callbacks[event] = [callback]
def run_callbacks(self, event: str):
"""Run all existing callbacks associated with a particular event."""
for callback in self.callbacks.get(event, []):
callback(self)
def train(self):
"""Allow device='', device=None on Multi-GPU systems to default to device=0."""
if isinstance(self.args.device, str) and len(self.args.device): # i.e. device='0' or device='0,1,2,3'
world_size = len(self.args.device.split(','))
elif isinstance(self.args.device, (tuple, list)): # i.e. device=[0, 1, 2, 3] (multi-GPU from CLI is list)
world_size = len(self.args.device)
elif torch.cuda.is_available(): # i.e. device=None or device='' or device=number
world_size = 1 # default to device 0
else: # i.e. device='cpu' or 'mps'
world_size = 0
# Run subprocess if DDP training, else train normally
if world_size > 1 and 'LOCAL_RANK' not in os.environ:
# Argument checks
if self.args.rect:
LOGGER.warning("WARNING ⚠️ 'rect=True' is incompatible with Multi-GPU training, setting 'rect=False'")
self.args.rect = False
if self.args.batch == -1:
LOGGER.warning("WARNING ⚠️ 'batch=-1' for AutoBatch is incompatible with Multi-GPU training, setting "
"default 'batch=16'")
self.args.batch = 16
# Command
cmd, file = generate_ddp_command(world_size, self)
try:
LOGGER.info(f'{colorstr("DDP:")} debug command {" ".join(cmd)}')
subprocess.run(cmd, check=True)
except Exception as e:
raise e
finally:
ddp_cleanup(self, str(file))
else:
self._do_train(world_size)
def _setup_ddp(self, world_size):
"""Initializes and sets the DistributedDataParallel parameters for training."""
torch.cuda.set_device(RANK)
self.device = torch.device('cuda', RANK)
# LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
os.environ['NCCL_BLOCKING_WAIT'] = '1' # set to enforce timeout
dist.init_process_group(
'nccl' if dist.is_nccl_available() else 'gloo',
timeout=timedelta(seconds=10800), # 3 hours
rank=RANK,
world_size=world_size)
def _setup_train(self, world_size):
"""
Builds dataloaders and optimizer on correct rank process.
"""
# Model
self.run_callbacks('on_pretrain_routine_start')
ckpt = self.setup_model()
self.model = self.model.to(self.device)
self.set_model_attributes()
# Freeze layers
freeze_list = self.args.freeze if isinstance(
self.args.freeze, list) else range(self.args.freeze) if isinstance(self.args.freeze, int) else []
always_freeze_names = ['.dfl'] # always freeze these layers
freeze_layer_names = [f'model.{x}.' for x in freeze_list] + always_freeze_names
for k, v in self.model.named_parameters():
# v.register_hook(lambda x: torch.nan_to_num(x)) # NaN to 0 (commented for erratic training results)
if any(x in k for x in freeze_layer_names):
LOGGER.info(f"Freezing layer '{k}'")
v.requires_grad = False
elif not v.requires_grad:
LOGGER.info(f"WARNING ⚠️ setting 'requires_grad=True' for frozen layer '{k}'. "
'See ultralytics.engine.trainer for customization of frozen layers.')
v.requires_grad = True
# Check AMP
self.amp = torch.tensor(self.args.amp).to(self.device) # True or False
if self.amp and RANK in (-1, 0): # Single-GPU and DDP
callbacks_backup = callbacks.default_callbacks.copy() # backup callbacks as check_amp() resets them
self.amp = torch.tensor(check_amp(self.model), device=self.device)
callbacks.default_callbacks = callbacks_backup # restore callbacks
if RANK > -1 and world_size > 1: # DDP
dist.broadcast(self.amp, src=0) # broadcast the tensor from rank 0 to all other ranks (returns None)
self.amp = bool(self.amp) # as boolean
self.scaler = amp.GradScaler(enabled=self.amp)
if world_size > 1:
self.model = DDP(self.model, device_ids=[RANK])
# Check imgsz
gs = max(int(self.model.stride.max() if hasattr(self.model, 'stride') else 32), 32) # grid size (max stride)
self.args.imgsz = check_imgsz(self.args.imgsz, stride=gs, floor=gs, max_dim=1)
# Batch size
if self.batch_size == -1 and RANK == -1: # single-GPU only, estimate best batch size
self.args.batch = self.batch_size = check_train_batch_size(self.model, self.args.imgsz, self.amp)
# Dataloaders
batch_size = self.batch_size // max(world_size, 1)
self.train_loader = self.get_dataloader(self.trainset, batch_size=batch_size, rank=RANK, mode='train')
if RANK in (-1, 0):
self.test_loader = self.get_dataloader(self.testset, batch_size=batch_size * 2, rank=-1, mode='val')
self.validator = self.get_validator()
metric_keys = self.validator.metrics.keys + self.label_loss_items(prefix='val')
self.metrics = dict(zip(metric_keys, [0] * len(metric_keys)))
self.ema = ModelEMA(self.model)
if self.args.plots:
self.plot_training_labels()
# Optimizer
self.accumulate = max(round(self.args.nbs / self.batch_size), 1) # accumulate loss before optimizing
weight_decay = self.args.weight_decay * self.batch_size * self.accumulate / self.args.nbs # scale weight_decay
iterations = math.ceil(len(self.train_loader.dataset) / max(self.batch_size, self.args.nbs)) * self.epochs
self.optimizer = self.build_optimizer(model=self.model,
name=self.args.optimizer,
lr=self.args.lr0,
momentum=self.args.momentum,
decay=weight_decay,
iterations=iterations)
# Scheduler
if self.args.cos_lr:
self.lf = one_cycle(1, self.args.lrf, self.epochs) # cosine 1->hyp['lrf']
else:
self.lf = lambda x: (1 - x / self.epochs) * (1.0 - self.args.lrf) + self.args.lrf # linear
self.scheduler = optim.lr_scheduler.LambdaLR(self.optimizer, lr_lambda=self.lf)
self.stopper, self.stop = EarlyStopping(patience=self.args.patience), False
self.resume_training(ckpt)
self.scheduler.last_epoch = self.start_epoch - 1 # do not move
self.run_callbacks('on_pretrain_routine_end')
def _do_train(self, world_size=1):
"""Train completed, evaluate and plot if specified by arguments."""
if world_size > 1:
self._setup_ddp(world_size)
self._setup_train(world_size)
self.epoch_time = None
self.epoch_time_start = time.time()
self.train_time_start = time.time()
nb = len(self.train_loader) # number of batches
nw = max(round(self.args.warmup_epochs * nb), 100) if self.args.warmup_epochs > 0 else -1 # warmup iterations
last_opt_step = -1
self.run_callbacks('on_train_start')
LOGGER.info(f'Image sizes {self.args.imgsz} train, {self.args.imgsz} val\n'
f'Using {self.train_loader.num_workers * (world_size or 1)} dataloader workers\n'
f"Logging results to {colorstr('bold', self.save_dir)}\n"
f'Starting training for {self.epochs} epochs...')
if self.args.close_mosaic:
base_idx = (self.epochs - self.args.close_mosaic) * nb
self.plot_idx.extend([base_idx, base_idx + 1, base_idx + 2])
epoch = self.epochs # predefine for resume fully trained model edge cases
for epoch in range(self.start_epoch, self.epochs):
self.epoch = epoch
self.run_callbacks('on_train_epoch_start')
self.model.train()
if RANK != -1:
self.train_loader.sampler.set_epoch(epoch)
pbar = enumerate(self.train_loader)
# Update dataloader attributes (optional)
if epoch == (self.epochs - self.args.close_mosaic):
LOGGER.info('Closing dataloader mosaic')
if hasattr(self.train_loader.dataset, 'mosaic'):
self.train_loader.dataset.mosaic = False
if hasattr(self.train_loader.dataset, 'close_mosaic'):
self.train_loader.dataset.close_mosaic(hyp=self.args)
self.train_loader.reset()
if RANK in (-1, 0):
LOGGER.info(self.progress_string())
pbar = TQDM(enumerate(self.train_loader), total=nb)
self.tloss = None
self.optimizer.zero_grad()
for i, batch in pbar:
self.run_callbacks('on_train_batch_start')
# Warmup
ni = i + nb * epoch
if ni <= nw:
xi = [0, nw] # x interp
self.accumulate = max(1, np.interp(ni, xi, [1, self.args.nbs / self.batch_size]).round())
for j, x in enumerate(self.optimizer.param_groups):
# Bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
x['lr'] = np.interp(
ni, xi, [self.args.warmup_bias_lr if j == 0 else 0.0, x['initial_lr'] * self.lf(epoch)])
if 'momentum' in x:
x['momentum'] = np.interp(ni, xi, [self.args.warmup_momentum, self.args.momentum])
# Forward
with torch.cuda.amp.autocast(self.amp):
batch = self.preprocess_batch(batch)
self.loss, self.loss_items = self.model(batch)
if RANK != -1:
self.loss *= world_size
self.tloss = (self.tloss * i + self.loss_items) / (i + 1) if self.tloss is not None \
else self.loss_items
# Backward
self.scaler.scale(self.loss).backward()
# Optimize - https://pytorch.org/docs/master/notes/amp_examples.html
if ni - last_opt_step >= self.accumulate:
self.optimizer_step()
last_opt_step = ni
# Log
mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G' # (GB)
loss_len = self.tloss.shape[0] if len(self.tloss.size()) else 1
losses = self.tloss if loss_len > 1 else torch.unsqueeze(self.tloss, 0)
if RANK in (-1, 0):
pbar.set_description(
('%11s' * 2 + '%11.4g' * (2 + loss_len)) %
(f'{epoch + 1}/{self.epochs}', mem, *losses, batch['cls'].shape[0], batch['img'].shape[-1]))
self.run_callbacks('on_batch_end')
if self.args.plots and ni in self.plot_idx:
self.plot_training_samples(batch, ni)
self.run_callbacks('on_train_batch_end')
self.lr = {f'lr/pg{ir}': x['lr'] for ir, x in enumerate(self.optimizer.param_groups)} # for loggers
with warnings.catch_warnings():
warnings.simplefilter('ignore') # suppress 'Detected lr_scheduler.step() before optimizer.step()'
self.scheduler.step()
self.run_callbacks('on_train_epoch_end')
if RANK in (-1, 0):
# Validation
self.ema.update_attr(self.model, include=['yaml', 'nc', 'args', 'names', 'stride', 'class_weights'])
final_epoch = (epoch + 1 == self.epochs) or self.stopper.possible_stop
if self.args.val or final_epoch:
self.metrics, self.fitness = self.validate()
self.save_metrics(metrics={**self.label_loss_items(self.tloss), **self.metrics, **self.lr})
self.stop = self.stopper(epoch + 1, self.fitness)
# Save model
if self.args.save or (epoch + 1 == self.epochs):
self.save_model()
self.run_callbacks('on_model_save')
tnow = time.time()
self.epoch_time = tnow - self.epoch_time_start
self.epoch_time_start = tnow
self.run_callbacks('on_fit_epoch_end')
torch.cuda.empty_cache() # clears GPU vRAM at end of epoch, can help with out of memory errors
# Early Stopping
if RANK != -1: # if DDP training
broadcast_list = [self.stop if RANK == 0 else None]
dist.broadcast_object_list(broadcast_list, 0) # broadcast 'stop' to all ranks
if RANK != 0:
self.stop = broadcast_list[0]
if self.stop:
break # must break all DDP ranks
if RANK in (-1, 0):
# Do final val with best.pt
LOGGER.info(f'\n{epoch - self.start_epoch + 1} epochs completed in '
f'{(time.time() - self.train_time_start) / 3600:.3f} hours.')
self.final_eval()
if self.args.plots:
self.plot_metrics()
self.run_callbacks('on_train_end')
torch.cuda.empty_cache()
self.run_callbacks('teardown')
def save_model(self):
"""Save model checkpoints based on various conditions."""
ckpt = {
'epoch': self.epoch,
'best_fitness': self.best_fitness,
'model': deepcopy(de_parallel(self.model)).half(),
'ema': deepcopy(self.ema.ema).half(),
'updates': self.ema.updates,
'optimizer': self.optimizer.state_dict(),
'train_args': vars(self.args), # save as dict
'date': datetime.now().isoformat(),
'version': __version__}
# Use dill (if exists) to serialize the lambda functions where pickle does not do this
try:
import dill as pickle
except ImportError:
import pickle
# Save last, best and delete
torch.save(ckpt, self.last, pickle_module=pickle)
if self.best_fitness == self.fitness:
torch.save(ckpt, self.best, pickle_module=pickle)
if (self.epoch > 0) and (self.save_period > 0) and (self.epoch % self.save_period == 0):
torch.save(ckpt, self.wdir / f'epoch{self.epoch}.pt', pickle_module=pickle)
del ckpt
@staticmethod
def get_dataset(data):
"""
Get train, val path from data dict if it exists. Returns None if data format is not recognized.
"""
return data['train'], data.get('val') or data.get('test')
def setup_model(self):
"""
load/create/download model for any task.
"""
if isinstance(self.model, torch.nn.Module): # if model is loaded beforehand. No setup needed
return
model, weights = self.model, None
ckpt = None
if str(model).endswith('.pt'):
weights, ckpt = attempt_load_one_weight(model)
cfg = ckpt['model'].yaml
else:
cfg = model
self.model = self.get_model(cfg=cfg, weights=weights, verbose=RANK == -1) # calls Model(cfg, weights)
return ckpt
def optimizer_step(self):
"""Perform a single step of the training optimizer with gradient clipping and EMA update."""
self.scaler.unscale_(self.optimizer) # unscale gradients
torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=10.0) # clip gradients
self.scaler.step(self.optimizer)
self.scaler.update()
self.optimizer.zero_grad()
if self.ema:
self.ema.update(self.model)
def preprocess_batch(self, batch):
"""
Allows custom preprocessing model inputs and ground truths depending on task type.
"""
return batch
def validate(self):
"""
Runs validation on test set using self.validator. The returned dict is expected to contain "fitness" key.
"""
metrics = self.validator(self)
fitness = metrics.pop('fitness', -self.loss.detach().cpu().numpy()) # use loss as fitness measure if not found
if not self.best_fitness or self.best_fitness < fitness:
self.best_fitness = fitness
return metrics, fitness
def get_model(self, cfg=None, weights=None, verbose=True):
"""Get model and raise NotImplementedError for loading cfg files."""
raise NotImplementedError("This task trainer doesn't support loading cfg files")
def get_validator(self):
"""Returns a NotImplementedError when the get_validator function is called."""
raise NotImplementedError('get_validator function not implemented in trainer')
def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode='train'):
"""
Returns dataloader derived from torch.data.Dataloader.
"""
raise NotImplementedError('get_dataloader function not implemented in trainer')
def build_dataset(self, img_path, mode='train', batch=None):
"""Build dataset"""
raise NotImplementedError('build_dataset function not implemented in trainer')
def label_loss_items(self, loss_items=None, prefix='train'):
"""
Returns a loss dict with labelled training loss items tensor
"""
# Not needed for classification but necessary for segmentation & detection
return {'loss': loss_items} if loss_items is not None else ['loss']
def set_model_attributes(self):
"""
To set or update model parameters before training.
"""
self.model.names = self.data['names']
def build_targets(self, preds, targets):
"""Builds target tensors for training YOLO model."""
pass
def progress_string(self):
"""Returns a string describing training progress."""
return ''
# TODO: may need to put these following functions into callback
def plot_training_samples(self, batch, ni):
"""Plots training samples during YOLOv5 training."""
pass
def plot_training_labels(self):
"""Plots training labels for YOLO model."""
pass
def save_metrics(self, metrics):
"""Saves training metrics to a CSV file."""
keys, vals = list(metrics.keys()), list(metrics.values())
n = len(metrics) + 1 # number of cols
s = '' if self.csv.exists() else (('%23s,' * n % tuple(['epoch'] + keys)).rstrip(',') + '\n') # header
with open(self.csv, 'a') as f:
f.write(s + ('%23.5g,' * n % tuple([self.epoch + 1] + vals)).rstrip(',') + '\n')
def plot_metrics(self):
"""Plot and display metrics visually."""
pass
def on_plot(self, name, data=None):
"""Registers plots (e.g. to be consumed in callbacks)"""
path = Path(name)
self.plots[path] = {'data': data, 'timestamp': time.time()}
def final_eval(self):
"""Performs final evaluation and validation for object detection YOLO model."""
for f in self.last, self.best:
if f.exists():
strip_optimizer(f) # strip optimizers
if f is self.best:
LOGGER.info(f'\nValidating {f}...')
self.validator.args.plots = self.args.plots
self.metrics = self.validator(model=f)
self.metrics.pop('fitness', None)
self.run_callbacks('on_fit_epoch_end')
def check_resume(self, overrides):
"""Check if resume checkpoint exists and update arguments accordingly."""
resume = self.args.resume
if resume:
try:
exists = isinstance(resume, (str, Path)) and Path(resume).exists()
last = Path(check_file(resume) if exists else get_latest_run())
# Check that resume data YAML exists, otherwise strip to force re-download of dataset
ckpt_args = attempt_load_weights(last).args
if not Path(ckpt_args['data']).exists():
ckpt_args['data'] = self.args.data
resume = True
self.args = get_cfg(ckpt_args)
self.args.model = str(last) # reinstate model
for k in 'imgsz', 'batch': # allow arg updates to reduce memory on resume if crashed due to CUDA OOM
if k in overrides:
setattr(self.args, k, overrides[k])
except Exception as e:
raise FileNotFoundError('Resume checkpoint not found. Please pass a valid checkpoint to resume from, '
"i.e. 'yolo train resume model=path/to/last.pt'") from e
self.resume = resume
def resume_training(self, ckpt):
"""Resume YOLO training from given epoch and best fitness."""
if ckpt is None:
return
best_fitness = 0.0
start_epoch = ckpt['epoch'] + 1
if ckpt['optimizer'] is not None:
self.optimizer.load_state_dict(ckpt['optimizer']) # optimizer
best_fitness = ckpt['best_fitness']
if self.ema and ckpt.get('ema'):
self.ema.ema.load_state_dict(ckpt['ema'].float().state_dict()) # EMA
self.ema.updates = ckpt['updates']
if self.resume:
assert start_epoch > 0, \
f'{self.args.model} training to {self.epochs} epochs is finished, nothing to resume.\n' \
f"Start a new training without resuming, i.e. 'yolo train model={self.args.model}'"
LOGGER.info(
f'Resuming training from {self.args.model} from epoch {start_epoch + 1} to {self.epochs} total epochs')
if self.epochs < start_epoch:
LOGGER.info(
f"{self.model} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {self.epochs} more epochs.")
self.epochs += ckpt['epoch'] # finetune additional epochs
self.best_fitness = best_fitness
self.start_epoch = start_epoch
if start_epoch > (self.epochs - self.args.close_mosaic):
LOGGER.info('Closing dataloader mosaic')
if hasattr(self.train_loader.dataset, 'mosaic'):
self.train_loader.dataset.mosaic = False
if hasattr(self.train_loader.dataset, 'close_mosaic'):
self.train_loader.dataset.close_mosaic(hyp=self.args)
def build_optimizer(self, model, name='auto', lr=0.001, momentum=0.9, decay=1e-5, iterations=1e5):
"""
Constructs an optimizer for the given model, based on the specified optimizer name, learning rate,
momentum, weight decay, and number of iterations.
Args:
model (torch.nn.Module): The model for which to build an optimizer.
name (str, optional): The name of the optimizer to use. If 'auto', the optimizer is selected
based on the number of iterations. Default: 'auto'.
lr (float, optional): The learning rate for the optimizer. Default: 0.001.
momentum (float, optional): The momentum factor for the optimizer. Default: 0.9.
decay (float, optional): The weight decay for the optimizer. Default: 1e-5.
iterations (float, optional): The number of iterations, which determines the optimizer if
name is 'auto'. Default: 1e5.
Returns:
(torch.optim.Optimizer): The constructed optimizer.
"""
g = [], [], [] # optimizer parameter groups
bn = tuple(v for k, v in nn.__dict__.items() if 'Norm' in k) # normalization layers, i.e. BatchNorm2d()
if name == 'auto':
nc = getattr(model, 'nc', 10) # number of classes
lr_fit = round(0.002 * 5 / (4 + nc), 6) # lr0 fit equation to 6 decimal places
name, lr, momentum = ('SGD', 0.01, 0.9) if iterations > 10000 else ('AdamW', lr_fit, 0.9)
self.args.warmup_bias_lr = 0.0 # no higher than 0.01 for Adam
for module_name, module in model.named_modules():
for param_name, param in module.named_parameters(recurse=False):
fullname = f'{module_name}.{param_name}' if module_name else param_name
if 'bias' in fullname: # bias (no decay)
g[2].append(param)
elif isinstance(module, bn): # weight (no decay)
g[1].append(param)
else: # weight (with decay)
g[0].append(param)
if name in ('Adam', 'Adamax', 'AdamW', 'NAdam', 'RAdam'):
optimizer = getattr(optim, name, optim.Adam)(g[2], lr=lr, betas=(momentum, 0.999), weight_decay=0.0)
elif name == 'RMSProp':
optimizer = optim.RMSprop(g[2], lr=lr, momentum=momentum)
elif name == 'SGD':
optimizer = optim.SGD(g[2], lr=lr, momentum=momentum, nesterov=True)
else:
raise NotImplementedError(
f"Optimizer '{name}' not found in list of available optimizers "
f'[Adam, AdamW, NAdam, RAdam, RMSProp, SGD, auto].'
'To request support for addition optimizers please visit https://github.com/ultralytics/ultralytics.')
optimizer.add_param_group({'params': g[0], 'weight_decay': decay}) # add g0 with weight_decay
optimizer.add_param_group({'params': g[1], 'weight_decay': 0.0}) # add g1 (BatchNorm2d weights)
LOGGER.info(
f"{colorstr('optimizer:')} {type(optimizer).__name__}(lr={lr}, momentum={momentum}) with parameter groups "
f'{len(g[1])} weight(decay=0.0), {len(g[0])} weight(decay={decay}), {len(g[2])} bias(decay=0.0)')
return optimizer

View File

@ -0,0 +1,205 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
"""
This module provides functionalities for hyperparameter tuning of the Ultralytics YOLO models for object detection,
instance segmentation, image classification, pose estimation, and multi-object tracking.
Hyperparameter tuning is the process of systematically searching for the optimal set of hyperparameters
that yield the best model performance. This is particularly crucial in deep learning models like YOLO,
where small changes in hyperparameters can lead to significant differences in model accuracy and efficiency.
Example:
Tune hyperparameters for YOLOv8n on COCO8 at imgsz=640 and epochs=30 for 300 tuning iterations.
```python
from ultralytics import YOLO
model = YOLO('yolov8n.pt')
model.tune(data='coco8.yaml', imgsz=640, epochs=100, iterations=10)
```
"""
import random
import time
from copy import deepcopy
import numpy as np
from ultralytics import YOLO
from ultralytics.cfg import get_cfg, get_save_dir
from ultralytics.utils import DEFAULT_CFG, LOGGER, callbacks, colorstr, yaml_print, yaml_save
class Tuner:
"""
Class responsible for hyperparameter tuning of YOLO models.
The class evolves YOLO model hyperparameters over a given number of iterations
by mutating them according to the search space and retraining the model to evaluate their performance.
Attributes:
space (dict): Hyperparameter search space containing bounds and scaling factors for mutation.
tune_dir (Path): Directory where evolution logs and results will be saved.
evolve_csv (Path): Path to the CSV file where evolution logs are saved.
Methods:
_mutate(hyp: dict) -> dict:
Mutates the given hyperparameters within the bounds specified in `self.space`.
__call__():
Executes the hyperparameter evolution across multiple iterations.
Example:
Tune hyperparameters for YOLOv8n on COCO8 at imgsz=640 and epochs=30 for 300 tuning iterations.
```python
from ultralytics import YOLO
model = YOLO('yolov8n.pt')
model.tune(data='coco8.yaml', imgsz=640, epochs=100, iterations=10, val=False, cache=True)
```
"""
def __init__(self, args=DEFAULT_CFG, _callbacks=None):
"""
Initialize the Tuner with configurations.
Args:
args (dict, optional): Configuration for hyperparameter evolution.
"""
self.args = get_cfg(overrides=args)
self.space = { # key: (min, max, gain(optionaL))
# 'optimizer': tune.choice(['SGD', 'Adam', 'AdamW', 'NAdam', 'RAdam', 'RMSProp']),
'lr0': (1e-5, 1e-1),
'lrf': (0.01, 1.0), # final OneCycleLR learning rate (lr0 * lrf)
'momentum': (0.6, 0.98, 0.3), # SGD momentum/Adam beta1
'weight_decay': (0.0, 0.001), # optimizer weight decay 5e-4
'warmup_epochs': (0.0, 5.0), # warmup epochs (fractions ok)
'warmup_momentum': (0.0, 0.95), # warmup initial momentum
'box': (0.02, 0.2), # box loss gain
'cls': (0.2, 4.0), # cls loss gain (scale with pixels)
'hsv_h': (0.0, 0.1), # image HSV-Hue augmentation (fraction)
'hsv_s': (0.0, 0.9), # image HSV-Saturation augmentation (fraction)
'hsv_v': (0.0, 0.9), # image HSV-Value augmentation (fraction)
'degrees': (0.0, 45.0), # image rotation (+/- deg)
'translate': (0.0, 0.9), # image translation (+/- fraction)
'scale': (0.0, 0.9), # image scale (+/- gain)
'shear': (0.0, 10.0), # image shear (+/- deg)
'perspective': (0.0, 0.001), # image perspective (+/- fraction), range 0-0.001
'flipud': (0.0, 1.0), # image flip up-down (probability)
'fliplr': (0.0, 1.0), # image flip left-right (probability)
'mosaic': (0.0, 1.0), # image mixup (probability)
'mixup': (0.0, 1.0), # image mixup (probability)
'copy_paste': (0.0, 1.0)} # segment copy-paste (probability)
self.tune_dir = get_save_dir(self.args, name='_tune')
self.evolve_csv = self.tune_dir / 'evolve.csv'
self.callbacks = _callbacks or callbacks.get_default_callbacks()
callbacks.add_integration_callbacks(self)
LOGGER.info(f"Initialized Tuner instance with 'tune_dir={self.tune_dir}'.")
def _mutate(self, parent='single', n=5, mutation=0.8, sigma=0.2):
"""
Mutates the hyperparameters based on bounds and scaling factors specified in `self.space`.
Args:
parent (str): Parent selection method: 'single' or 'weighted'.
n (int): Number of parents to consider.
mutation (float): Probability of a parameter mutation in any given iteration.
sigma (float): Standard deviation for Gaussian random number generator.
Returns:
(dict): A dictionary containing mutated hyperparameters.
"""
if self.evolve_csv.exists(): # if evolve.csv exists: select best hyps and mutate
# Select parent(s)
x = np.loadtxt(self.evolve_csv, ndmin=2, delimiter=',', skiprows=1)
fitness = x[:, 0] # first column
n = min(n, len(x)) # number of previous results to consider
x = x[np.argsort(-fitness)][:n] # top n mutations
w = x[:, 0] - x[:, 0].min() + 1E-6 # weights (sum > 0)
if parent == 'single' or len(x) == 1:
# x = x[random.randint(0, n - 1)] # random selection
x = x[random.choices(range(n), weights=w)[0]] # weighted selection
elif parent == 'weighted':
x = (x * w.reshape(n, 1)).sum(0) / w.sum() # weighted combination
# Mutate
r = np.random # method
r.seed(int(time.time()))
g = np.array([v[2] if len(v) == 3 else 1.0 for k, v in self.space.items()]) # gains 0-1
ng = len(self.space)
v = np.ones(ng)
while all(v == 1): # mutate until a change occurs (prevent duplicates)
v = (g * (r.random(ng) < mutation) * r.randn(ng) * r.random() * sigma + 1).clip(0.3, 3.0)
hyp = {k: float(x[i + 1] * v[i]) for i, k in enumerate(self.space.keys())}
else:
hyp = {k: getattr(self.args, k) for k in self.space.keys()}
# Constrain to limits
for k, v in self.space.items():
hyp[k] = max(hyp[k], v[0]) # lower limit
hyp[k] = min(hyp[k], v[1]) # upper limit
hyp[k] = round(hyp[k], 5) # significant digits
return hyp
def __call__(self, model=None, iterations=10, prefix=colorstr('Tuner:')):
"""
Executes the hyperparameter evolution process when the Tuner instance is called.
This method iterates through the number of iterations, performing the following steps in each iteration:
1. Load the existing hyperparameters or initialize new ones.
2. Mutate the hyperparameters using the `mutate` method.
3. Train a YOLO model with the mutated hyperparameters.
4. Log the fitness score and mutated hyperparameters to a CSV file.
Args:
model (Model): A pre-initialized YOLO model to be used for training.
iterations (int): The number of generations to run the evolution for.
Note:
The method utilizes the `self.evolve_csv` Path object to read and log hyperparameters and fitness scores.
Ensure this path is set correctly in the Tuner instance.
"""
t0 = time.time()
best_save_dir, best_metrics = None, None
self.tune_dir.mkdir(parents=True, exist_ok=True)
for i in range(iterations):
# Mutate hyperparameters
mutated_hyp = self._mutate()
LOGGER.info(f'{prefix} Starting iteration {i + 1}/{iterations} with hyperparameters: {mutated_hyp}')
try:
# Train YOLO model with mutated hyperparameters
train_args = {**vars(self.args), **mutated_hyp}
results = (deepcopy(model) or YOLO(self.args.model)).train(**train_args)
fitness = results.fitness
except Exception as e:
LOGGER.warning(f'WARNING ❌️ training failure for hyperparameter tuning iteration {i}\n{e}')
fitness = 0.0
# Save results and mutated_hyp to evolve_csv
log_row = [round(fitness, 5)] + [mutated_hyp[k] for k in self.space.keys()]
headers = '' if self.evolve_csv.exists() else (','.join(['fitness_score'] + list(self.space.keys())) + '\n')
with open(self.evolve_csv, 'a') as f:
f.write(headers + ','.join(map(str, log_row)) + '\n')
# Print tuning results
x = np.loadtxt(self.evolve_csv, ndmin=2, delimiter=',', skiprows=1)
fitness = x[:, 0] # first column
best_idx = fitness.argmax()
best_is_current = best_idx == i
if best_is_current:
best_save_dir = results.save_dir
best_metrics = {k: round(v, 5) for k, v in results.results_dict.items()}
header = (f'{prefix} {i + 1} iterations complete ✅ ({time.time() - t0:.2f}s)\n'
f'{prefix} Results saved to {colorstr("bold", self.tune_dir)}\n'
f'{prefix} Best fitness={fitness[best_idx]} observed at iteration {best_idx + 1}\n'
f'{prefix} Best fitness metrics are {best_metrics}\n'
f'{prefix} Best fitness model is {best_save_dir}\n'
f'{prefix} Best fitness hyperparameters are printed below.\n')
LOGGER.info('\n' + header)
# Save turning results
data = {k: float(x[0, i + 1]) for i, k in enumerate(self.space.keys())}
header = header.replace(prefix, '#').replace('/', '').replace('', '') + '\n'
yaml_save(self.tune_dir / 'best.yaml', data=data, header=header)
yaml_print(self.tune_dir / 'best.yaml')

View File

@ -0,0 +1,325 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
"""
Check a model's accuracy on a test or val split of a dataset.
Usage:
$ yolo mode=val model=yolov8n.pt data=coco128.yaml imgsz=640
Usage - formats:
$ yolo mode=val model=yolov8n.pt # PyTorch
yolov8n.torchscript # TorchScript
yolov8n.onnx # ONNX Runtime or OpenCV DNN with dnn=True
yolov8n_openvino_model # OpenVINO
yolov8n.engine # TensorRT
yolov8n.mlpackage # CoreML (macOS-only)
yolov8n_saved_model # TensorFlow SavedModel
yolov8n.pb # TensorFlow GraphDef
yolov8n.tflite # TensorFlow Lite
yolov8n_edgetpu.tflite # TensorFlow Edge TPU
yolov8n_paddle_model # PaddlePaddle
"""
import json
import time
from pathlib import Path
import numpy as np
import torch
from ultralytics.cfg import get_cfg, get_save_dir
from ultralytics.data.utils import check_cls_dataset, check_det_dataset
from ultralytics.nn.autobackend import AutoBackend
from ultralytics.utils import LOGGER, TQDM, callbacks, colorstr, emojis
from ultralytics.utils.checks import check_imgsz
from ultralytics.utils.ops import Profile
from ultralytics.utils.torch_utils import de_parallel, select_device, smart_inference_mode
class BaseValidator:
"""
BaseValidator
A base class for creating validators.
Attributes:
args (SimpleNamespace): Configuration for the validator.
dataloader (DataLoader): Dataloader to use for validation.
pbar (tqdm): Progress bar to update during validation.
model (nn.Module): Model to validate.
data (dict): Data dictionary.
device (torch.device): Device to use for validation.
batch_i (int): Current batch index.
training (bool): Whether the model is in training mode.
names (dict): Class names.
seen: Records the number of images seen so far during validation.
stats: Placeholder for statistics during validation.
confusion_matrix: Placeholder for a confusion matrix.
nc: Number of classes.
iouv: (torch.Tensor): IoU thresholds from 0.50 to 0.95 in spaces of 0.05.
jdict (dict): Dictionary to store JSON validation results.
speed (dict): Dictionary with keys 'preprocess', 'inference', 'loss', 'postprocess' and their respective
batch processing times in milliseconds.
save_dir (Path): Directory to save results.
plots (dict): Dictionary to store plots for visualization.
callbacks (dict): Dictionary to store various callback functions.
"""
def __init__(self, dataloader=None, save_dir=None, pbar=None, args=None, _callbacks=None):
"""
Initializes a BaseValidator instance.
Args:
dataloader (torch.utils.data.DataLoader): Dataloader to be used for validation.
save_dir (Path, optional): Directory to save results.
pbar (tqdm.tqdm): Progress bar for displaying progress.
args (SimpleNamespace): Configuration for the validator.
_callbacks (dict): Dictionary to store various callback functions.
"""
self.args = get_cfg(overrides=args)
self.dataloader = dataloader
self.pbar = pbar
self.model = None
self.data = None
self.device = None
self.batch_i = None
self.training = True
self.names = None
self.seen = None
self.stats = None
self.confusion_matrix = None
self.nc = None
self.iouv = None
self.jdict = None
self.speed = {'preprocess': 0.0, 'inference': 0.0, 'loss': 0.0, 'postprocess': 0.0}
self.save_dir = save_dir or get_save_dir(self.args)
(self.save_dir / 'labels' if self.args.save_txt else self.save_dir).mkdir(parents=True, exist_ok=True)
if self.args.conf is None:
self.args.conf = 0.001 # default conf=0.001
self.plots = {}
self.callbacks = _callbacks or callbacks.get_default_callbacks()
@smart_inference_mode()
def __call__(self, trainer=None, model=None):
"""
Supports validation of a pre-trained model if passed or a model being trained if trainer is passed (trainer
gets priority).
"""
self.training = trainer is not None
augment = self.args.augment and (not self.training)
if self.training:
self.device = trainer.device
self.data = trainer.data
self.args.half = self.device.type != 'cpu' # force FP16 val during training
model = trainer.ema.ema or trainer.model
model = model.half() if self.args.half else model.float()
# self.model = model
self.loss = torch.zeros_like(trainer.loss_items, device=trainer.device)
self.args.plots &= trainer.stopper.possible_stop or (trainer.epoch == trainer.epochs - 1)
model.eval()
else:
callbacks.add_integration_callbacks(self)
self.run_callbacks('on_val_start')
model = AutoBackend(model or self.args.model,
device=select_device(self.args.device, self.args.batch),
dnn=self.args.dnn,
data=self.args.data,
fp16=self.args.half)
# self.model = model
self.device = model.device # update device
self.args.half = model.fp16 # update half
stride, pt, jit, engine = model.stride, model.pt, model.jit, model.engine
imgsz = check_imgsz(self.args.imgsz, stride=stride)
if engine:
self.args.batch = model.batch_size
elif not pt and not jit:
self.args.batch = 1 # export.py models default to batch-size 1
LOGGER.info(f'Forcing batch=1 square inference (1,3,{imgsz},{imgsz}) for non-PyTorch models')
if isinstance(self.args.data, str) and self.args.data.split('.')[-1] in ('yaml', 'yml'):
self.data = check_det_dataset(self.args.data)
elif self.args.task == 'classify':
self.data = check_cls_dataset(self.args.data, split=self.args.split)
else:
raise FileNotFoundError(emojis(f"Dataset '{self.args.data}' for task={self.args.task} not found ❌"))
if self.device.type in ('cpu', 'mps'):
self.args.workers = 0 # faster CPU val as time dominated by inference, not dataloading
if not pt:
self.args.rect = False
self.dataloader = self.dataloader or self.get_dataloader(self.data.get(self.args.split), self.args.batch)
model.eval()
model.warmup(imgsz=(1 if pt else self.args.batch, 3, imgsz, imgsz)) # warmup
dt = Profile(), Profile(), Profile(), Profile()
bar = TQDM(self.dataloader, desc=self.get_desc(), total=len(self.dataloader))
self.init_metrics(de_parallel(model))
self.jdict = [] # empty before each val
for batch_i, batch in enumerate(bar):
self.run_callbacks('on_val_batch_start')
self.batch_i = batch_i
# Preprocess
with dt[0]:
batch = self.preprocess(batch)
# Inference
with dt[1]:
preds = model(batch['img'], augment=augment)
# Loss
with dt[2]:
if self.training:
self.loss += model.loss(batch, preds)[1]
# Postprocess
with dt[3]:
preds = self.postprocess(preds)
self.update_metrics(preds, batch)
if self.args.plots and batch_i < 3:
self.plot_val_samples(batch, batch_i)
self.plot_predictions(batch, preds, batch_i)
self.run_callbacks('on_val_batch_end')
stats = self.get_stats()
self.check_stats(stats)
self.speed = dict(zip(self.speed.keys(), (x.t / len(self.dataloader.dataset) * 1E3 for x in dt)))
self.finalize_metrics()
self.print_results()
self.run_callbacks('on_val_end')
if self.training:
model.float()
results = {**stats, **trainer.label_loss_items(self.loss.cpu() / len(self.dataloader), prefix='val')}
return {k: round(float(v), 5) for k, v in results.items()} # return results as 5 decimal place floats
else:
LOGGER.info('Speed: %.1fms preprocess, %.1fms inference, %.1fms loss, %.1fms postprocess per image' %
tuple(self.speed.values()))
if self.args.save_json and self.jdict:
with open(str(self.save_dir / 'predictions.json'), 'w') as f:
LOGGER.info(f'Saving {f.name}...')
json.dump(self.jdict, f) # flatten and save
stats = self.eval_json(stats) # update stats
if self.args.plots or self.args.save_json:
LOGGER.info(f"Results saved to {colorstr('bold', self.save_dir)}")
return stats
def match_predictions(self, pred_classes, true_classes, iou, use_scipy=False):
"""
Matches predictions to ground truth objects (pred_classes, true_classes) using IoU.
Args:
pred_classes (torch.Tensor): Predicted class indices of shape(N,).
true_classes (torch.Tensor): Target class indices of shape(M,).
iou (torch.Tensor): An NxM tensor containing the pairwise IoU values for predictions and ground of truth
use_scipy (bool): Whether to use scipy for matching (more precise).
Returns:
(torch.Tensor): Correct tensor of shape(N,10) for 10 IoU thresholds.
"""
# Dx10 matrix, where D - detections, 10 - IoU thresholds
correct = np.zeros((pred_classes.shape[0], self.iouv.shape[0])).astype(bool)
# LxD matrix where L - labels (rows), D - detections (columns)
correct_class = true_classes[:, None] == pred_classes
iou = iou * correct_class # zero out the wrong classes
iou = iou.cpu().numpy()
for i, threshold in enumerate(self.iouv.cpu().tolist()):
if use_scipy:
# WARNING: known issue that reduces mAP in https://github.com/ultralytics/ultralytics/pull/4708
import scipy # scope import to avoid importing for all commands
cost_matrix = iou * (iou >= threshold)
if cost_matrix.any():
labels_idx, detections_idx = scipy.optimize.linear_sum_assignment(cost_matrix, maximize=True)
valid = cost_matrix[labels_idx, detections_idx] > 0
if valid.any():
correct[detections_idx[valid], i] = True
else:
matches = np.nonzero(iou >= threshold) # IoU > threshold and classes match
matches = np.array(matches).T
if matches.shape[0]:
if matches.shape[0] > 1:
matches = matches[iou[matches[:, 0], matches[:, 1]].argsort()[::-1]]
matches = matches[np.unique(matches[:, 1], return_index=True)[1]]
# matches = matches[matches[:, 2].argsort()[::-1]]
matches = matches[np.unique(matches[:, 0], return_index=True)[1]]
correct[matches[:, 1].astype(int), i] = True
return torch.tensor(correct, dtype=torch.bool, device=pred_classes.device)
def add_callback(self, event: str, callback):
"""Appends the given callback."""
self.callbacks[event].append(callback)
def run_callbacks(self, event: str):
"""Runs all callbacks associated with a specified event."""
for callback in self.callbacks.get(event, []):
callback(self)
def get_dataloader(self, dataset_path, batch_size):
"""Get data loader from dataset path and batch size."""
raise NotImplementedError('get_dataloader function not implemented for this validator')
def build_dataset(self, img_path):
"""Build dataset"""
raise NotImplementedError('build_dataset function not implemented in validator')
def preprocess(self, batch):
"""Preprocesses an input batch."""
return batch
def postprocess(self, preds):
"""Describes and summarizes the purpose of 'postprocess()' but no details mentioned."""
return preds
def init_metrics(self, model):
"""Initialize performance metrics for the YOLO model."""
pass
def update_metrics(self, preds, batch):
"""Updates metrics based on predictions and batch."""
pass
def finalize_metrics(self, *args, **kwargs):
"""Finalizes and returns all metrics."""
pass
def get_stats(self):
"""Returns statistics about the model's performance."""
return {}
def check_stats(self, stats):
"""Checks statistics."""
pass
def print_results(self):
"""Prints the results of the model's predictions."""
pass
def get_desc(self):
"""Get description of the YOLO model."""
pass
@property
def metric_keys(self):
"""Returns the metric keys used in YOLO training/validation."""
return []
def on_plot(self, name, data=None):
"""Registers plots (e.g. to be consumed in callbacks)"""
self.plots[Path(name)] = {'data': data, 'timestamp': time.time()}
# TODO: may need to put these following functions into callback
def plot_val_samples(self, batch, ni):
"""Plots validation samples during training."""
pass
def plot_predictions(self, batch, preds, ni):
"""Plots YOLO model predictions on batch images."""
pass
def pred_to_json(self, preds, batch):
"""Convert predictions to JSON format."""
pass
def eval_json(self, stats):
"""Evaluate and return JSON format of prediction statistics."""
pass

View File

@ -0,0 +1,100 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import requests
from ultralytics.data.utils import HUBDatasetStats
from ultralytics.hub.auth import Auth
from ultralytics.hub.utils import HUB_API_ROOT, HUB_WEB_ROOT, PREFIX
from ultralytics.utils import LOGGER, SETTINGS
def login(api_key=''):
"""
Log in to the Ultralytics HUB API using the provided API key.
Args:
api_key (str, optional): May be an API key or a combination API key and model ID, i.e. key_id
Example:
```python
from ultralytics import hub
hub.login('API_KEY')
```
"""
Auth(api_key, verbose=True)
def logout():
"""
Log out of Ultralytics HUB by removing the API key from the settings file. To log in again, use 'yolo hub login'.
Example:
```python
from ultralytics import hub
hub.logout()
```
"""
SETTINGS['api_key'] = ''
SETTINGS.save()
LOGGER.info(f"{PREFIX}logged out ✅. To log in again, use 'yolo hub login'.")
def reset_model(model_id=''):
"""Reset a trained model to an untrained state."""
r = requests.post(f'{HUB_API_ROOT}/model-reset', json={'apiKey': Auth().api_key, 'modelId': model_id})
if r.status_code == 200:
LOGGER.info(f'{PREFIX}Model reset successfully')
return
LOGGER.warning(f'{PREFIX}Model reset failure {r.status_code} {r.reason}')
def export_fmts_hub():
"""Returns a list of HUB-supported export formats."""
from ultralytics.engine.exporter import export_formats
return list(export_formats()['Argument'][1:]) + ['ultralytics_tflite', 'ultralytics_coreml']
def export_model(model_id='', format='torchscript'):
"""Export a model to all formats."""
assert format in export_fmts_hub(), f"Unsupported export format '{format}', valid formats are {export_fmts_hub()}"
r = requests.post(f'{HUB_API_ROOT}/v1/models/{model_id}/export',
json={'format': format},
headers={'x-api-key': Auth().api_key})
assert r.status_code == 200, f'{PREFIX}{format} export failure {r.status_code} {r.reason}'
LOGGER.info(f'{PREFIX}{format} export started ✅')
def get_export(model_id='', format='torchscript'):
"""Get an exported model dictionary with download URL."""
assert format in export_fmts_hub(), f"Unsupported export format '{format}', valid formats are {export_fmts_hub()}"
r = requests.post(f'{HUB_API_ROOT}/get-export',
json={
'apiKey': Auth().api_key,
'modelId': model_id,
'format': format})
assert r.status_code == 200, f'{PREFIX}{format} get_export failure {r.status_code} {r.reason}'
return r.json()
def check_dataset(path='', task='detect'):
"""
Function for error-checking HUB dataset Zip file before upload. It checks a dataset for errors before it is
uploaded to the HUB. Usage examples are given below.
Args:
path (str, optional): Path to data.zip (with data.yaml inside data.zip). Defaults to ''.
task (str, optional): Dataset task. Options are 'detect', 'segment', 'pose', 'classify'. Defaults to 'detect'.
Example:
```python
from ultralytics.hub import check_dataset
check_dataset('path/to/coco8.zip', task='detect') # detect dataset
check_dataset('path/to/coco8-seg.zip', task='segment') # segment dataset
check_dataset('path/to/coco8-pose.zip', task='pose') # pose dataset
```
"""
HUBDatasetStats(path=path, task=task).get_json()
LOGGER.info(f'Checks completed correctly ✅. Upload this dataset to {HUB_WEB_ROOT}/datasets/.')

View File

@ -0,0 +1,119 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import requests
from ultralytics.hub.utils import HUB_API_ROOT, HUB_WEB_ROOT, PREFIX, request_with_credentials
from ultralytics.utils import LOGGER, SETTINGS, emojis, is_colab
API_KEY_URL = f'{HUB_WEB_ROOT}/settings?tab=api+keys'
class Auth:
id_token = api_key = model_key = False
def __init__(self, api_key='', verbose=False):
"""
Initialize the Auth class with an optional API key.
Args:
api_key (str, optional): May be an API key or a combination API key and model ID, i.e. key_id
"""
# Split the input API key in case it contains a combined key_model and keep only the API key part
api_key = api_key.split('_')[0]
# Set API key attribute as value passed or SETTINGS API key if none passed
self.api_key = api_key or SETTINGS.get('api_key', '')
# If an API key is provided
if self.api_key:
# If the provided API key matches the API key in the SETTINGS
if self.api_key == SETTINGS.get('api_key'):
# Log that the user is already logged in
if verbose:
LOGGER.info(f'{PREFIX}Authenticated ✅')
return
else:
# Attempt to authenticate with the provided API key
success = self.authenticate()
# If the API key is not provided and the environment is a Google Colab notebook
elif is_colab():
# Attempt to authenticate using browser cookies
success = self.auth_with_cookies()
else:
# Request an API key
success = self.request_api_key()
# Update SETTINGS with the new API key after successful authentication
if success:
SETTINGS.update({'api_key': self.api_key})
# Log that the new login was successful
if verbose:
LOGGER.info(f'{PREFIX}New authentication successful ✅')
elif verbose:
LOGGER.info(f'{PREFIX}Retrieve API key from {API_KEY_URL}')
def request_api_key(self, max_attempts=3):
"""
Prompt the user to input their API key. Returns the model ID.
"""
import getpass
for attempts in range(max_attempts):
LOGGER.info(f'{PREFIX}Login. Attempt {attempts + 1} of {max_attempts}')
input_key = getpass.getpass(f'Enter API key from {API_KEY_URL} ')
self.api_key = input_key.split('_')[0] # remove model id if present
if self.authenticate():
return True
raise ConnectionError(emojis(f'{PREFIX}Failed to authenticate ❌'))
def authenticate(self) -> bool:
"""
Attempt to authenticate with the server using either id_token or API key.
Returns:
bool: True if authentication is successful, False otherwise.
"""
try:
if header := self.get_auth_header():
r = requests.post(f'{HUB_API_ROOT}/v1/auth', headers=header)
if not r.json().get('success', False):
raise ConnectionError('Unable to authenticate.')
return True
raise ConnectionError('User has not authenticated locally.')
except ConnectionError:
self.id_token = self.api_key = False # reset invalid
LOGGER.warning(f'{PREFIX}Invalid API key ⚠️')
return False
def auth_with_cookies(self) -> bool:
"""
Attempt to fetch authentication via cookies and set id_token.
User must be logged in to HUB and running in a supported browser.
Returns:
bool: True if authentication is successful, False otherwise.
"""
if not is_colab():
return False # Currently only works with Colab
try:
authn = request_with_credentials(f'{HUB_API_ROOT}/v1/auth/auto')
if authn.get('success', False):
self.id_token = authn.get('data', {}).get('idToken', None)
self.authenticate()
return True
raise ConnectionError('Unable to fetch browser authentication details.')
except ConnectionError:
self.id_token = False # reset invalid
return False
def get_auth_header(self):
"""
Get the authentication header for making API requests.
Returns:
(dict): The authentication header if id_token or API key is set, None otherwise.
"""
if self.id_token:
return {'authorization': f'Bearer {self.id_token}'}
elif self.api_key:
return {'x-api-key': self.api_key}
# else returns None

View File

@ -0,0 +1,190 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import signal
import sys
from pathlib import Path
from time import sleep
import requests
from ultralytics.hub.utils import HUB_API_ROOT, HUB_WEB_ROOT, PREFIX, smart_request
from ultralytics.utils import LOGGER, __version__, checks, emojis, is_colab, threaded
from ultralytics.utils.errors import HUBModelError
AGENT_NAME = f'python-{__version__}-colab' if is_colab() else f'python-{__version__}-local'
class HUBTrainingSession:
"""
HUB training session for Ultralytics HUB YOLO models. Handles model initialization, heartbeats, and checkpointing.
Args:
url (str): Model identifier used to initialize the HUB training session.
Attributes:
agent_id (str): Identifier for the instance communicating with the server.
model_id (str): Identifier for the YOLOv5 model being trained.
model_url (str): URL for the model in Ultralytics HUB.
api_url (str): API URL for the model in Ultralytics HUB.
auth_header (dict): Authentication header for the Ultralytics HUB API requests.
rate_limits (dict): Rate limits for different API calls (in seconds).
timers (dict): Timers for rate limiting.
metrics_queue (dict): Queue for the model's metrics.
model (dict): Model data fetched from Ultralytics HUB.
alive (bool): Indicates if the heartbeat loop is active.
"""
def __init__(self, url):
"""
Initialize the HUBTrainingSession with the provided model identifier.
Args:
url (str): Model identifier used to initialize the HUB training session.
It can be a URL string or a model key with specific format.
Raises:
ValueError: If the provided model identifier is invalid.
ConnectionError: If connecting with global API key is not supported.
"""
from ultralytics.hub.auth import Auth
# Parse input
if url.startswith(f'{HUB_WEB_ROOT}/models/'):
url = url.split(f'{HUB_WEB_ROOT}/models/')[-1]
if [len(x) for x in url.split('_')] == [42, 20]:
key, model_id = url.split('_')
elif len(url) == 20:
key, model_id = '', url
else:
raise HUBModelError(f"model='{url}' not found. Check format is correct, i.e. "
f"model='{HUB_WEB_ROOT}/models/MODEL_ID' and try again.")
# Authorize
auth = Auth(key)
self.agent_id = None # identifies which instance is communicating with server
self.model_id = model_id
self.model_url = f'{HUB_WEB_ROOT}/models/{model_id}'
self.api_url = f'{HUB_API_ROOT}/v1/models/{model_id}'
self.auth_header = auth.get_auth_header()
self.rate_limits = {'metrics': 3.0, 'ckpt': 900.0, 'heartbeat': 300.0} # rate limits (seconds)
self.timers = {} # rate limit timers (seconds)
self.metrics_queue = {} # metrics queue
self.model = self._get_model()
self.alive = True
self._start_heartbeat() # start heartbeats
self._register_signal_handlers()
LOGGER.info(f'{PREFIX}View model at {self.model_url} 🚀')
def _register_signal_handlers(self):
"""Register signal handlers for SIGTERM and SIGINT signals to gracefully handle termination."""
signal.signal(signal.SIGTERM, self._handle_signal)
signal.signal(signal.SIGINT, self._handle_signal)
def _handle_signal(self, signum, frame):
"""
Handle kill signals and prevent heartbeats from being sent on Colab after termination.
This method does not use frame, it is included as it is passed by signal.
"""
if self.alive is True:
LOGGER.info(f'{PREFIX}Kill signal received! ❌')
self._stop_heartbeat()
sys.exit(signum)
def _stop_heartbeat(self):
"""Terminate the heartbeat loop."""
self.alive = False
def upload_metrics(self):
"""Upload model metrics to Ultralytics HUB."""
payload = {'metrics': self.metrics_queue.copy(), 'type': 'metrics'}
smart_request('post', self.api_url, json=payload, headers=self.auth_header, code=2)
def _get_model(self):
"""Fetch and return model data from Ultralytics HUB."""
api_url = f'{HUB_API_ROOT}/v1/models/{self.model_id}'
try:
response = smart_request('get', api_url, headers=self.auth_header, thread=False, code=0)
data = response.json().get('data', None)
if data.get('status', None) == 'trained':
raise ValueError(emojis(f'Model is already trained and uploaded to {self.model_url} 🚀'))
if not data.get('data', None):
raise ValueError('Dataset may still be processing. Please wait a minute and try again.') # RF fix
self.model_id = data['id']
if data['status'] == 'new': # new model to start training
self.train_args = {
# TODO: deprecate 'batch_size' key for 'batch' in 3Q23
'batch': data['batch' if ('batch' in data) else 'batch_size'],
'epochs': data['epochs'],
'imgsz': data['imgsz'],
'patience': data['patience'],
'device': data['device'],
'cache': data['cache'],
'data': data['data']}
self.model_file = data.get('cfg') or data.get('weights') # cfg for pretrained=False
self.model_file = checks.check_yolov5u_filename(self.model_file, verbose=False) # YOLOv5->YOLOv5u
elif data['status'] == 'training': # existing model to resume training
self.train_args = {'data': data['data'], 'resume': True}
self.model_file = data['resume']
return data
except requests.exceptions.ConnectionError as e:
raise ConnectionRefusedError('ERROR: The HUB server is not online. Please try again later.') from e
except Exception:
raise
def upload_model(self, epoch, weights, is_best=False, map=0.0, final=False):
"""
Upload a model checkpoint to Ultralytics HUB.
Args:
epoch (int): The current training epoch.
weights (str): Path to the model weights file.
is_best (bool): Indicates if the current model is the best one so far.
map (float): Mean average precision of the model.
final (bool): Indicates if the model is the final model after training.
"""
if Path(weights).is_file():
with open(weights, 'rb') as f:
file = f.read()
else:
LOGGER.warning(f'{PREFIX}WARNING ⚠️ Model upload issue. Missing model {weights}.')
file = None
url = f'{self.api_url}/upload'
# url = 'http://httpbin.org/post' # for debug
data = {'epoch': epoch}
if final:
data.update({'type': 'final', 'map': map})
smart_request('post',
url,
data=data,
files={'best.pt': file},
headers=self.auth_header,
retry=10,
timeout=3600,
thread=False,
progress=True,
code=4)
else:
data.update({'type': 'epoch', 'isBest': bool(is_best)})
smart_request('post', url, data=data, files={'last.pt': file}, headers=self.auth_header, code=3)
@threaded
def _start_heartbeat(self):
"""Begin a threaded heartbeat loop to report the agent's status to Ultralytics HUB."""
while self.alive:
r = smart_request('post',
f'{HUB_API_ROOT}/v1/agent/heartbeat/models/{self.model_id}',
json={
'agent': AGENT_NAME,
'agentId': self.agent_id},
headers=self.auth_header,
retry=0,
code=5,
thread=False) # already in a thread
self.agent_id = r.json().get('data', {}).get('agentId', None)
sleep(self.rate_limits['heartbeat'])

View File

@ -0,0 +1,222 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import os
import platform
import random
import sys
import threading
import time
from pathlib import Path
import requests
from ultralytics.utils import (ENVIRONMENT, LOGGER, ONLINE, RANK, SETTINGS, TESTS_RUNNING, TQDM, TryExcept, __version__,
colorstr, get_git_origin_url, is_colab, is_git_dir, is_pip_package)
from ultralytics.utils.downloads import GITHUB_ASSETS_NAMES
PREFIX = colorstr('Ultralytics HUB: ')
HELP_MSG = 'If this issue persists please visit https://github.com/ultralytics/hub/issues for assistance.'
HUB_API_ROOT = os.environ.get('ULTRALYTICS_HUB_API', 'https://api.ultralytics.com')
HUB_WEB_ROOT = os.environ.get('ULTRALYTICS_HUB_WEB', 'https://hub.ultralytics.com')
def request_with_credentials(url: str) -> any:
"""
Make an AJAX request with cookies attached in a Google Colab environment.
Args:
url (str): The URL to make the request to.
Returns:
(any): The response data from the AJAX request.
Raises:
OSError: If the function is not run in a Google Colab environment.
"""
if not is_colab():
raise OSError('request_with_credentials() must run in a Colab environment')
from google.colab import output # noqa
from IPython import display # noqa
display.display(
display.Javascript("""
window._hub_tmp = new Promise((resolve, reject) => {
const timeout = setTimeout(() => reject("Failed authenticating existing browser session"), 5000)
fetch("%s", {
method: 'POST',
credentials: 'include'
})
.then((response) => resolve(response.json()))
.then((json) => {
clearTimeout(timeout);
}).catch((err) => {
clearTimeout(timeout);
reject(err);
});
});
""" % url))
return output.eval_js('_hub_tmp')
def requests_with_progress(method, url, **kwargs):
"""
Make an HTTP request using the specified method and URL, with an optional progress bar.
Args:
method (str): The HTTP method to use (e.g. 'GET', 'POST').
url (str): The URL to send the request to.
**kwargs (dict): Additional keyword arguments to pass to the underlying `requests.request` function.
Returns:
(requests.Response): The response object from the HTTP request.
Note:
If 'progress' is set to True, the progress bar will display the download progress
for responses with a known content length.
"""
progress = kwargs.pop('progress', False)
if not progress:
return requests.request(method, url, **kwargs)
response = requests.request(method, url, stream=True, **kwargs)
total = int(response.headers.get('content-length', 0)) # total size
try:
pbar = TQDM(total=total, unit='B', unit_scale=True, unit_divisor=1024)
for data in response.iter_content(chunk_size=1024):
pbar.update(len(data))
pbar.close()
except requests.exceptions.ChunkedEncodingError: # avoid 'Connection broken: IncompleteRead' warnings
response.close()
return response
def smart_request(method, url, retry=3, timeout=30, thread=True, code=-1, verbose=True, progress=False, **kwargs):
"""
Makes an HTTP request using the 'requests' library, with exponential backoff retries up to a specified timeout.
Args:
method (str): The HTTP method to use for the request. Choices are 'post' and 'get'.
url (str): The URL to make the request to.
retry (int, optional): Number of retries to attempt before giving up. Default is 3.
timeout (int, optional): Timeout in seconds after which the function will give up retrying. Default is 30.
thread (bool, optional): Whether to execute the request in a separate daemon thread. Default is True.
code (int, optional): An identifier for the request, used for logging purposes. Default is -1.
verbose (bool, optional): A flag to determine whether to print out to console or not. Default is True.
progress (bool, optional): Whether to show a progress bar during the request. Default is False.
**kwargs (dict): Keyword arguments to be passed to the requests function specified in method.
Returns:
(requests.Response): The HTTP response object. If the request is executed in a separate thread, returns None.
"""
retry_codes = (408, 500) # retry only these codes
@TryExcept(verbose=verbose)
def func(func_method, func_url, **func_kwargs):
"""Make HTTP requests with retries and timeouts, with optional progress tracking."""
r = None # response
t0 = time.time() # initial time for timer
for i in range(retry + 1):
if (time.time() - t0) > timeout:
break
r = requests_with_progress(func_method, func_url, **func_kwargs) # i.e. get(url, data, json, files)
if r.status_code < 300: # return codes in the 2xx range are generally considered "good" or "successful"
break
try:
m = r.json().get('message', 'No JSON message.')
except AttributeError:
m = 'Unable to read JSON.'
if i == 0:
if r.status_code in retry_codes:
m += f' Retrying {retry}x for {timeout}s.' if retry else ''
elif r.status_code == 429: # rate limit
h = r.headers # response headers
m = f"Rate limit reached ({h['X-RateLimit-Remaining']}/{h['X-RateLimit-Limit']}). " \
f"Please retry after {h['Retry-After']}s."
if verbose:
LOGGER.warning(f'{PREFIX}{m} {HELP_MSG} ({r.status_code} #{code})')
if r.status_code not in retry_codes:
return r
time.sleep(2 ** i) # exponential standoff
return r
args = method, url
kwargs['progress'] = progress
if thread:
threading.Thread(target=func, args=args, kwargs=kwargs, daemon=True).start()
else:
return func(*args, **kwargs)
class Events:
"""
A class for collecting anonymous event analytics. Event analytics are enabled when sync=True in settings and
disabled when sync=False. Run 'yolo settings' to see and update settings YAML file.
Attributes:
url (str): The URL to send anonymous events.
rate_limit (float): The rate limit in seconds for sending events.
metadata (dict): A dictionary containing metadata about the environment.
enabled (bool): A flag to enable or disable Events based on certain conditions.
"""
url = 'https://www.google-analytics.com/mp/collect?measurement_id=G-X8NCJYTQXM&api_secret=QLQrATrNSwGRFRLE-cbHJw'
def __init__(self):
"""
Initializes the Events object with default values for events, rate_limit, and metadata.
"""
self.events = [] # events list
self.rate_limit = 60.0 # rate limit (seconds)
self.t = 0.0 # rate limit timer (seconds)
self.metadata = {
'cli': Path(sys.argv[0]).name == 'yolo',
'install': 'git' if is_git_dir() else 'pip' if is_pip_package() else 'other',
'python': '.'.join(platform.python_version_tuple()[:2]), # i.e. 3.10
'version': __version__,
'env': ENVIRONMENT,
'session_id': round(random.random() * 1E15),
'engagement_time_msec': 1000}
self.enabled = \
SETTINGS['sync'] and \
RANK in (-1, 0) and \
not TESTS_RUNNING and \
ONLINE and \
(is_pip_package() or get_git_origin_url() == 'https://github.com/ultralytics/ultralytics.git')
def __call__(self, cfg):
"""
Attempts to add a new event to the events list and send events if the rate limit is reached.
Args:
cfg (IterableSimpleNamespace): The configuration object containing mode and task information.
"""
if not self.enabled:
# Events disabled, do nothing
return
# Attempt to add to events
if len(self.events) < 25: # Events list limited to 25 events (drop any events past this)
params = {
**self.metadata, 'task': cfg.task,
'model': cfg.model if cfg.model in GITHUB_ASSETS_NAMES else 'custom'}
if cfg.mode == 'export':
params['format'] = cfg.format
self.events.append({'name': cfg.mode, 'params': params})
# Check rate limit
t = time.time()
if (t - self.t) < self.rate_limit:
# Time is under rate limiter, wait to send
return
# Time is over rate limiter, send now
data = {'client_id': SETTINGS['uuid'], 'events': self.events} # SHA-256 anonymized UUID hash and events list
# POST equivalent to requests.post(self.url, json=data)
smart_request('post', self.url, json=data, retry=0, verbose=False)
# Reset events and rate limit timer
self.events = []
self.t = t
# Run below code on hub/utils init -------------------------------------------------------------------------------------
events = Events()

View File

@ -0,0 +1,7 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from .rtdetr import RTDETR
from .sam import SAM
from .yolo import YOLO
__all__ = 'YOLO', 'RTDETR', 'SAM' # allow simpler import

View File

@ -0,0 +1,8 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from .model import FastSAM
from .predict import FastSAMPredictor
from .prompt import FastSAMPrompt
from .val import FastSAMValidator
__all__ = 'FastSAMPredictor', 'FastSAM', 'FastSAMPrompt', 'FastSAMValidator'

View File

@ -0,0 +1,33 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from pathlib import Path
from ultralytics.engine.model import Model
from .predict import FastSAMPredictor
from .val import FastSAMValidator
class FastSAM(Model):
"""
FastSAM model interface.
Example:
```python
from ultralytics import FastSAM
model = FastSAM('last.pt')
results = model.predict('ultralytics/assets/bus.jpg')
```
"""
def __init__(self, model='FastSAM-x.pt'):
"""Call the __init__ method of the parent class (YOLO) with the updated default model"""
if str(model) == 'FastSAM.pt':
model = 'FastSAM-x.pt'
assert Path(model).suffix not in ('.yaml', '.yml'), 'FastSAM models only support pre-trained models.'
super().__init__(model=model, task='segment')
@property
def task_map(self):
return {'segment': {'predictor': FastSAMPredictor, 'validator': FastSAMValidator}}

View File

@ -0,0 +1,51 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import torch
from ultralytics.engine.results import Results
from ultralytics.models.fastsam.utils import bbox_iou
from ultralytics.models.yolo.detect.predict import DetectionPredictor
from ultralytics.utils import DEFAULT_CFG, ops
class FastSAMPredictor(DetectionPredictor):
def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
super().__init__(cfg, overrides, _callbacks)
self.args.task = 'segment'
def postprocess(self, preds, img, orig_imgs):
p = ops.non_max_suppression(preds[0],
self.args.conf,
self.args.iou,
agnostic=self.args.agnostic_nms,
max_det=self.args.max_det,
nc=len(self.model.names),
classes=self.args.classes)
full_box = torch.zeros(p[0].shape[1], device=p[0].device)
full_box[2], full_box[3], full_box[4], full_box[6:] = img.shape[3], img.shape[2], 1.0, 1.0
full_box = full_box.view(1, -1)
critical_iou_index = bbox_iou(full_box[0][:4], p[0][:, :4], iou_thres=0.9, image_shape=img.shape[2:])
if critical_iou_index.numel() != 0:
full_box[0][4] = p[0][critical_iou_index][:, 4]
full_box[0][6:] = p[0][critical_iou_index][:, 6:]
p[0][critical_iou_index] = full_box
if not isinstance(orig_imgs, list): # input images are a torch.Tensor, not a list
orig_imgs = ops.convert_torch2numpy_batch(orig_imgs)
results = []
proto = preds[1][-1] if len(preds[1]) == 3 else preds[1] # second output is len 3 if pt, but only 1 if exported
for i, pred in enumerate(p):
orig_img = orig_imgs[i]
img_path = self.batch[0][i]
if not len(pred): # save empty boxes
masks = None
elif self.args.retina_masks:
pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
masks = ops.process_mask_native(proto[i], pred[:, 6:], pred[:, :4], orig_img.shape[:2]) # HWC
else:
masks = ops.process_mask(proto[i], pred[:, 6:], pred[:, :4], img.shape[2:], upsample=True) # HWC
pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
results.append(Results(orig_img, path=img_path, names=self.model.names, boxes=pred[:, :6], masks=masks))
return results

View File

@ -0,0 +1,307 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import os
from pathlib import Path
import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch
from PIL import Image
from ultralytics.utils import TQDM
class FastSAMPrompt:
def __init__(self, source, results, device='cuda') -> None:
self.device = device
self.results = results
self.source = source
# Import and assign clip
try:
import clip # for linear_assignment
except ImportError:
from ultralytics.utils.checks import check_requirements
check_requirements('git+https://github.com/openai/CLIP.git')
import clip
self.clip = clip
@staticmethod
def _segment_image(image, bbox):
image_array = np.array(image)
segmented_image_array = np.zeros_like(image_array)
x1, y1, x2, y2 = bbox
segmented_image_array[y1:y2, x1:x2] = image_array[y1:y2, x1:x2]
segmented_image = Image.fromarray(segmented_image_array)
black_image = Image.new('RGB', image.size, (255, 255, 255))
# transparency_mask = np.zeros_like((), dtype=np.uint8)
transparency_mask = np.zeros((image_array.shape[0], image_array.shape[1]), dtype=np.uint8)
transparency_mask[y1:y2, x1:x2] = 255
transparency_mask_image = Image.fromarray(transparency_mask, mode='L')
black_image.paste(segmented_image, mask=transparency_mask_image)
return black_image
@staticmethod
def _format_results(result, filter=0):
annotations = []
n = len(result.masks.data) if result.masks is not None else 0
for i in range(n):
mask = result.masks.data[i] == 1.0
if torch.sum(mask) >= filter:
annotation = {
'id': i,
'segmentation': mask.cpu().numpy(),
'bbox': result.boxes.data[i],
'score': result.boxes.conf[i]}
annotation['area'] = annotation['segmentation'].sum()
annotations.append(annotation)
return annotations
@staticmethod
def _get_bbox_from_mask(mask):
mask = mask.astype(np.uint8)
contours, hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
x1, y1, w, h = cv2.boundingRect(contours[0])
x2, y2 = x1 + w, y1 + h
if len(contours) > 1:
for b in contours:
x_t, y_t, w_t, h_t = cv2.boundingRect(b)
x1 = min(x1, x_t)
y1 = min(y1, y_t)
x2 = max(x2, x_t + w_t)
y2 = max(y2, y_t + h_t)
return [x1, y1, x2, y2]
def plot(self,
annotations,
output,
bbox=None,
points=None,
point_label=None,
mask_random_color=True,
better_quality=True,
retina=False,
with_contours=True):
pbar = TQDM(annotations, total=len(annotations))
for ann in pbar:
result_name = os.path.basename(ann.path)
image = ann.orig_img
original_h, original_w = ann.orig_shape
# for macOS only
# plt.switch_backend('TkAgg')
plt.figure(figsize=(original_w / 100, original_h / 100))
# Add subplot with no margin.
plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
plt.margins(0, 0)
plt.gca().xaxis.set_major_locator(plt.NullLocator())
plt.gca().yaxis.set_major_locator(plt.NullLocator())
plt.imshow(image)
if ann.masks is not None:
masks = ann.masks.data
if better_quality:
if isinstance(masks[0], torch.Tensor):
masks = np.array(masks.cpu())
for i, mask in enumerate(masks):
mask = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8))
masks[i] = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_OPEN, np.ones((8, 8), np.uint8))
self.fast_show_mask(
masks,
plt.gca(),
random_color=mask_random_color,
bbox=bbox,
points=points,
pointlabel=point_label,
retinamask=retina,
target_height=original_h,
target_width=original_w,
)
if with_contours:
contour_all = []
temp = np.zeros((original_h, original_w, 1))
for i, mask in enumerate(masks):
mask = mask.astype(np.uint8)
if not retina:
mask = cv2.resize(mask, (original_w, original_h), interpolation=cv2.INTER_NEAREST)
contours, _ = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
contour_all.extend(iter(contours))
cv2.drawContours(temp, contour_all, -1, (255, 255, 255), 2)
color = np.array([0 / 255, 0 / 255, 1.0, 0.8])
contour_mask = temp / 255 * color.reshape(1, 1, -1)
plt.imshow(contour_mask)
plt.axis('off')
fig = plt.gcf()
# Check if the canvas has been drawn
if fig.canvas.get_renderer() is None: # macOS requires this or tests fail
fig.canvas.draw()
save_path = Path(output) / result_name
save_path.parent.mkdir(exist_ok=True, parents=True)
image = Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
image.save(save_path)
plt.close()
pbar.set_description(f'Saving {result_name} to {save_path}')
@staticmethod
def fast_show_mask(
annotation,
ax,
random_color=False,
bbox=None,
points=None,
pointlabel=None,
retinamask=True,
target_height=960,
target_width=960,
):
n, h, w = annotation.shape # batch, height, width
areas = np.sum(annotation, axis=(1, 2))
annotation = annotation[np.argsort(areas)]
index = (annotation != 0).argmax(axis=0)
if random_color:
color = np.random.random((n, 1, 1, 3))
else:
color = np.ones((n, 1, 1, 3)) * np.array([30 / 255, 144 / 255, 1.0])
transparency = np.ones((n, 1, 1, 1)) * 0.6
visual = np.concatenate([color, transparency], axis=-1)
mask_image = np.expand_dims(annotation, -1) * visual
show = np.zeros((h, w, 4))
h_indices, w_indices = np.meshgrid(np.arange(h), np.arange(w), indexing='ij')
indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
show[h_indices, w_indices, :] = mask_image[indices]
if bbox is not None:
x1, y1, x2, y2 = bbox
ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1))
# Draw point
if points is not None:
plt.scatter(
[point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
[point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
s=20,
c='y',
)
plt.scatter(
[point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
[point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
s=20,
c='m',
)
if not retinamask:
show = cv2.resize(show, (target_width, target_height), interpolation=cv2.INTER_NEAREST)
ax.imshow(show)
@torch.no_grad()
def retrieve(self, model, preprocess, elements, search_text: str, device) -> int:
preprocessed_images = [preprocess(image).to(device) for image in elements]
tokenized_text = self.clip.tokenize([search_text]).to(device)
stacked_images = torch.stack(preprocessed_images)
image_features = model.encode_image(stacked_images)
text_features = model.encode_text(tokenized_text)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
probs = 100.0 * image_features @ text_features.T
return probs[:, 0].softmax(dim=0)
def _crop_image(self, format_results):
if os.path.isdir(self.source):
raise ValueError(f"'{self.source}' is a directory, not a valid source for this function.")
image = Image.fromarray(cv2.cvtColor(self.results[0].orig_img, cv2.COLOR_BGR2RGB))
ori_w, ori_h = image.size
annotations = format_results
mask_h, mask_w = annotations[0]['segmentation'].shape
if ori_w != mask_w or ori_h != mask_h:
image = image.resize((mask_w, mask_h))
cropped_boxes = []
cropped_images = []
not_crop = []
filter_id = []
for _, mask in enumerate(annotations):
if np.sum(mask['segmentation']) <= 100:
filter_id.append(_)
continue
bbox = self._get_bbox_from_mask(mask['segmentation']) # mask 的 bbox
cropped_boxes.append(self._segment_image(image, bbox)) # 保存裁剪的图片
cropped_images.append(bbox) # 保存裁剪的图片的bbox
return cropped_boxes, cropped_images, not_crop, filter_id, annotations
def box_prompt(self, bbox):
if self.results[0].masks is not None:
assert (bbox[2] != 0 and bbox[3] != 0)
if os.path.isdir(self.source):
raise ValueError(f"'{self.source}' is a directory, not a valid source for this function.")
masks = self.results[0].masks.data
target_height, target_width = self.results[0].orig_shape
h = masks.shape[1]
w = masks.shape[2]
if h != target_height or w != target_width:
bbox = [
int(bbox[0] * w / target_width),
int(bbox[1] * h / target_height),
int(bbox[2] * w / target_width),
int(bbox[3] * h / target_height), ]
bbox[0] = max(round(bbox[0]), 0)
bbox[1] = max(round(bbox[1]), 0)
bbox[2] = min(round(bbox[2]), w)
bbox[3] = min(round(bbox[3]), h)
# IoUs = torch.zeros(len(masks), dtype=torch.float32)
bbox_area = (bbox[3] - bbox[1]) * (bbox[2] - bbox[0])
masks_area = torch.sum(masks[:, bbox[1]:bbox[3], bbox[0]:bbox[2]], dim=(1, 2))
orig_masks_area = torch.sum(masks, dim=(1, 2))
union = bbox_area + orig_masks_area - masks_area
IoUs = masks_area / union
max_iou_index = torch.argmax(IoUs)
self.results[0].masks.data = torch.tensor(np.array([masks[max_iou_index].cpu().numpy()]))
return self.results
def point_prompt(self, points, pointlabel): # numpy 处理
if self.results[0].masks is not None:
if os.path.isdir(self.source):
raise ValueError(f"'{self.source}' is a directory, not a valid source for this function.")
masks = self._format_results(self.results[0], 0)
target_height, target_width = self.results[0].orig_shape
h = masks[0]['segmentation'].shape[0]
w = masks[0]['segmentation'].shape[1]
if h != target_height or w != target_width:
points = [[int(point[0] * w / target_width), int(point[1] * h / target_height)] for point in points]
onemask = np.zeros((h, w))
for annotation in masks:
mask = annotation['segmentation'] if isinstance(annotation, dict) else annotation
for i, point in enumerate(points):
if mask[point[1], point[0]] == 1 and pointlabel[i] == 1:
onemask += mask
if mask[point[1], point[0]] == 1 and pointlabel[i] == 0:
onemask -= mask
onemask = onemask >= 1
self.results[0].masks.data = torch.tensor(np.array([onemask]))
return self.results
def text_prompt(self, text):
if self.results[0].masks is not None:
format_results = self._format_results(self.results[0], 0)
cropped_boxes, cropped_images, not_crop, filter_id, annotations = self._crop_image(format_results)
clip_model, preprocess = self.clip.load('ViT-B/32', device=self.device)
scores = self.retrieve(clip_model, preprocess, cropped_boxes, text, device=self.device)
max_idx = scores.argsort()
max_idx = max_idx[-1]
max_idx += sum(np.array(filter_id) <= int(max_idx))
self.results[0].masks.data = torch.tensor(np.array([ann['segmentation'] for ann in annotations]))
return self.results
def everything_prompt(self):
return self.results

View File

@ -0,0 +1,67 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import torch
def adjust_bboxes_to_image_border(boxes, image_shape, threshold=20):
"""
Adjust bounding boxes to stick to image border if they are within a certain threshold.
Args:
boxes (torch.Tensor): (n, 4)
image_shape (tuple): (height, width)
threshold (int): pixel threshold
Returns:
adjusted_boxes (torch.Tensor): adjusted bounding boxes
"""
# Image dimensions
h, w = image_shape
# Adjust boxes
boxes[boxes[:, 0] < threshold, 0] = 0 # x1
boxes[boxes[:, 1] < threshold, 1] = 0 # y1
boxes[boxes[:, 2] > w - threshold, 2] = w # x2
boxes[boxes[:, 3] > h - threshold, 3] = h # y2
return boxes
def bbox_iou(box1, boxes, iou_thres=0.9, image_shape=(640, 640), raw_output=False):
"""
Compute the Intersection-Over-Union of a bounding box with respect to an array of other bounding boxes.
Args:
box1 (torch.Tensor): (4, )
boxes (torch.Tensor): (n, 4)
iou_thres (float): IoU threshold
image_shape (tuple): (height, width)
raw_output (bool): If True, return the raw IoU values instead of the indices
Returns:
high_iou_indices (torch.Tensor): Indices of boxes with IoU > thres
"""
boxes = adjust_bboxes_to_image_border(boxes, image_shape)
# obtain coordinates for intersections
x1 = torch.max(box1[0], boxes[:, 0])
y1 = torch.max(box1[1], boxes[:, 1])
x2 = torch.min(box1[2], boxes[:, 2])
y2 = torch.min(box1[3], boxes[:, 3])
# compute the area of intersection
intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
# compute the area of both individual boxes
box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
box2_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
# compute the area of union
union = box1_area + box2_area - intersection
# compute the IoU
iou = intersection / union # Should be shape (n, )
if raw_output:
return 0 if iou.numel() == 0 else iou
# return indices of boxes with IoU > thres
return torch.nonzero(iou > iou_thres).flatten()

View File

@ -0,0 +1,14 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from ultralytics.models.yolo.segment import SegmentationValidator
from ultralytics.utils.metrics import SegmentMetrics
class FastSAMValidator(SegmentationValidator):
def __init__(self, dataloader=None, save_dir=None, pbar=None, args=None, _callbacks=None):
"""Initialize SegmentationValidator and set task to 'segment', metrics to SegmentMetrics."""
super().__init__(dataloader, save_dir, pbar, args, _callbacks)
self.args.task = 'segment'
self.args.plots = False # disable ConfusionMatrix and other plots to avoid errors
self.metrics = SegmentMetrics(save_dir=self.save_dir, on_plot=self.on_plot)

View File

@ -0,0 +1,7 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from .model import NAS
from .predict import NASPredictor
from .val import NASValidator
__all__ = 'NASPredictor', 'NASValidator', 'NAS'

View File

@ -0,0 +1,61 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
"""
YOLO-NAS model interface.
Example:
```python
from ultralytics import NAS
model = NAS('yolo_nas_s')
results = model.predict('ultralytics/assets/bus.jpg')
```
"""
from pathlib import Path
import torch
from ultralytics.engine.model import Model
from ultralytics.utils.torch_utils import model_info, smart_inference_mode
from .predict import NASPredictor
from .val import NASValidator
class NAS(Model):
def __init__(self, model='yolo_nas_s.pt') -> None:
assert Path(model).suffix not in ('.yaml', '.yml'), 'YOLO-NAS models only support pre-trained models.'
super().__init__(model, task='detect')
@smart_inference_mode()
def _load(self, weights: str, task: str):
# Load or create new NAS model
import super_gradients
suffix = Path(weights).suffix
if suffix == '.pt':
self.model = torch.load(weights)
elif suffix == '':
self.model = super_gradients.training.models.get(weights, pretrained_weights='coco')
# Standardize model
self.model.fuse = lambda verbose=True: self.model
self.model.stride = torch.tensor([32])
self.model.names = dict(enumerate(self.model._class_names))
self.model.is_fused = lambda: False # for info()
self.model.yaml = {} # for info()
self.model.pt_path = weights # for export()
self.model.task = 'detect' # for export()
def info(self, detailed=False, verbose=True):
"""
Logs model info.
Args:
detailed (bool): Show detailed information about model.
verbose (bool): Controls verbosity.
"""
return model_info(self.model, detailed=detailed, verbose=verbose, imgsz=640)
@property
def task_map(self):
return {'detect': {'predictor': NASPredictor, 'validator': NASValidator}}

View File

@ -0,0 +1,35 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import torch
from ultralytics.engine.predictor import BasePredictor
from ultralytics.engine.results import Results
from ultralytics.utils import ops
class NASPredictor(BasePredictor):
def postprocess(self, preds_in, img, orig_imgs):
"""Postprocess predictions and returns a list of Results objects."""
# Cat boxes and class scores
boxes = ops.xyxy2xywh(preds_in[0][0])
preds = torch.cat((boxes, preds_in[0][1]), -1).permute(0, 2, 1)
preds = ops.non_max_suppression(preds,
self.args.conf,
self.args.iou,
agnostic=self.args.agnostic_nms,
max_det=self.args.max_det,
classes=self.args.classes)
if not isinstance(orig_imgs, list): # input images are a torch.Tensor, not a list
orig_imgs = ops.convert_torch2numpy_batch(orig_imgs)
results = []
for i, pred in enumerate(preds):
orig_img = orig_imgs[i]
pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
img_path = self.batch[0][i]
results.append(Results(orig_img, path=img_path, names=self.model.names, boxes=pred))
return results

View File

@ -0,0 +1,24 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import torch
from ultralytics.models.yolo.detect import DetectionValidator
from ultralytics.utils import ops
__all__ = ['NASValidator']
class NASValidator(DetectionValidator):
def postprocess(self, preds_in):
"""Apply Non-maximum suppression to prediction outputs."""
boxes = ops.xyxy2xywh(preds_in[0][0])
preds = torch.cat((boxes, preds_in[0][1]), -1).permute(0, 2, 1)
return ops.non_max_suppression(preds,
self.args.conf,
self.args.iou,
labels=self.lb,
multi_label=False,
agnostic=self.args.single_cls,
max_det=self.args.max_det,
max_time_img=0.5)

View File

@ -0,0 +1,7 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from .model import RTDETR
from .predict import RTDETRPredictor
from .val import RTDETRValidator
__all__ = 'RTDETRPredictor', 'RTDETRValidator', 'RTDETR'

View File

@ -0,0 +1,30 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
"""
RT-DETR model interface
"""
from ultralytics.engine.model import Model
from ultralytics.nn.tasks import RTDETRDetectionModel
from .predict import RTDETRPredictor
from .train import RTDETRTrainer
from .val import RTDETRValidator
class RTDETR(Model):
"""
RTDETR model interface.
"""
def __init__(self, model='rtdetr-l.pt') -> None:
if model and model.split('.')[-1] not in ('pt', 'yaml', 'yml'):
raise NotImplementedError('RT-DETR only supports creating from *.pt file or *.yaml file.')
super().__init__(model=model, task='detect')
@property
def task_map(self):
return {
'detect': {
'predictor': RTDETRPredictor,
'validator': RTDETRValidator,
'trainer': RTDETRTrainer,
'model': RTDETRDetectionModel}}

View File

@ -0,0 +1,62 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import torch
from ultralytics.data.augment import LetterBox
from ultralytics.engine.predictor import BasePredictor
from ultralytics.engine.results import Results
from ultralytics.utils import ops
class RTDETRPredictor(BasePredictor):
"""
A class extending the BasePredictor class for prediction based on an RT-DETR detection model.
Example:
```python
from ultralytics.utils import ASSETS
from ultralytics.models.rtdetr import RTDETRPredictor
args = dict(model='rtdetr-l.pt', source=ASSETS)
predictor = RTDETRPredictor(overrides=args)
predictor.predict_cli()
```
"""
def postprocess(self, preds, img, orig_imgs):
"""Postprocess predictions and returns a list of Results objects."""
nd = preds[0].shape[-1]
bboxes, scores = preds[0].split((4, nd - 4), dim=-1)
if not isinstance(orig_imgs, list): # input images are a torch.Tensor, not a list
orig_imgs = ops.convert_torch2numpy_batch(orig_imgs)
results = []
for i, bbox in enumerate(bboxes): # (300, 4)
bbox = ops.xywh2xyxy(bbox)
score, cls = scores[i].max(-1, keepdim=True) # (300, 1)
idx = score.squeeze(-1) > self.args.conf # (300, )
if self.args.classes is not None:
idx = (cls == torch.tensor(self.args.classes, device=cls.device)).any(1) & idx
pred = torch.cat([bbox, score, cls], dim=-1)[idx] # filter
orig_img = orig_imgs[i]
oh, ow = orig_img.shape[:2]
pred[..., [0, 2]] *= ow
pred[..., [1, 3]] *= oh
img_path = self.batch[0][i]
results.append(Results(orig_img, path=img_path, names=self.model.names, boxes=pred))
return results
def pre_transform(self, im):
"""Pre-transform input image before inference.
Args:
im (List(np.ndarray)): (N, 3, h, w) for tensor, [(h, w, 3) x N] for list.
Notes: The size must be square(640) and scaleFilled.
Returns:
(list): A list of transformed imgs.
"""
letterbox = LetterBox(self.imgsz, auto=False, scaleFill=True)
return [letterbox(image=x) for x in im]

View File

@ -0,0 +1,72 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from copy import copy
import torch
from ultralytics.models.yolo.detect import DetectionTrainer
from ultralytics.nn.tasks import RTDETRDetectionModel
from ultralytics.utils import RANK, colorstr
from .val import RTDETRDataset, RTDETRValidator
class RTDETRTrainer(DetectionTrainer):
"""
A class extending the DetectionTrainer class for training based on an RT-DETR detection model.
Notes:
- F.grid_sample used in rt-detr does not support the `deterministic=True` argument.
- AMP training can lead to NaN outputs and may produce errors during bipartite graph matching.
Example:
```python
from ultralytics.models.rtdetr.train import RTDETRTrainer
args = dict(model='rtdetr-l.yaml', data='coco8.yaml', imgsz=640, epochs=3)
trainer = RTDETRTrainer(overrides=args)
trainer.train()
```
"""
def get_model(self, cfg=None, weights=None, verbose=True):
"""Return a YOLO detection model."""
model = RTDETRDetectionModel(cfg, nc=self.data['nc'], verbose=verbose and RANK == -1)
if weights:
model.load(weights)
return model
def build_dataset(self, img_path, mode='val', batch=None):
"""Build RTDETR Dataset
Args:
img_path (str): Path to the folder containing images.
mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
batch (int, optional): Size of batches, this is for `rect`. Defaults to None.
"""
return RTDETRDataset(
img_path=img_path,
imgsz=self.args.imgsz,
batch_size=batch,
augment=mode == 'train', # no augmentation
hyp=self.args,
rect=False, # no rect
cache=self.args.cache or None,
prefix=colorstr(f'{mode}: '),
data=self.data)
def get_validator(self):
"""Returns a DetectionValidator for RTDETR model validation."""
self.loss_names = 'giou_loss', 'cls_loss', 'l1_loss'
return RTDETRValidator(self.test_loader, save_dir=self.save_dir, args=copy(self.args))
def preprocess_batch(self, batch):
"""Preprocesses a batch of images by scaling and converting to float."""
batch = super().preprocess_batch(batch)
bs = len(batch['img'])
batch_idx = batch['batch_idx']
gt_bbox, gt_class = [], []
for i in range(bs):
gt_bbox.append(batch['bboxes'][batch_idx == i].to(batch_idx.device))
gt_class.append(batch['cls'][batch_idx == i].to(device=batch_idx.device, dtype=torch.long))
return batch

View File

@ -0,0 +1,141 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from pathlib import Path
import torch
from ultralytics.data import YOLODataset
from ultralytics.data.augment import Compose, Format, v8_transforms
from ultralytics.models.yolo.detect import DetectionValidator
from ultralytics.utils import colorstr, ops
__all__ = 'RTDETRValidator', # tuple or list
# TODO: Temporarily RT-DETR does not need padding.
class RTDETRDataset(YOLODataset):
def __init__(self, *args, data=None, **kwargs):
super().__init__(*args, data=data, use_segments=False, use_keypoints=False, **kwargs)
# NOTE: add stretch version load_image for rtdetr mosaic
def load_image(self, i, rect_mode=False):
"""Loads 1 image from dataset index 'i', returns (im, resized hw)."""
return super().load_image(i=i, rect_mode=rect_mode)
def build_transforms(self, hyp=None):
"""Temporary, only for evaluation."""
if self.augment:
hyp.mosaic = hyp.mosaic if self.augment and not self.rect else 0.0
hyp.mixup = hyp.mixup if self.augment and not self.rect else 0.0
transforms = v8_transforms(self, self.imgsz, hyp, stretch=True)
else:
# transforms = Compose([LetterBox(new_shape=(self.imgsz, self.imgsz), auto=False, scaleFill=True)])
transforms = Compose([])
transforms.append(
Format(bbox_format='xywh',
normalize=True,
return_mask=self.use_segments,
return_keypoint=self.use_keypoints,
batch_idx=True,
mask_ratio=hyp.mask_ratio,
mask_overlap=hyp.overlap_mask))
return transforms
class RTDETRValidator(DetectionValidator):
"""
A class extending the DetectionValidator class for validation based on an RT-DETR detection model.
Example:
```python
from ultralytics.models.rtdetr import RTDETRValidator
args = dict(model='rtdetr-l.pt', data='coco8.yaml')
validator = RTDETRValidator(args=args)
validator()
```
"""
def build_dataset(self, img_path, mode='val', batch=None):
"""
Build an RTDETR Dataset.
Args:
img_path (str): Path to the folder containing images.
mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
batch (int, optional): Size of batches, this is for `rect`. Defaults to None.
"""
return RTDETRDataset(
img_path=img_path,
imgsz=self.args.imgsz,
batch_size=batch,
augment=False, # no augmentation
hyp=self.args,
rect=False, # no rect
cache=self.args.cache or None,
prefix=colorstr(f'{mode}: '),
data=self.data)
def postprocess(self, preds):
"""Apply Non-maximum suppression to prediction outputs."""
bs, _, nd = preds[0].shape
bboxes, scores = preds[0].split((4, nd - 4), dim=-1)
bboxes *= self.args.imgsz
outputs = [torch.zeros((0, 6), device=bboxes.device)] * bs
for i, bbox in enumerate(bboxes): # (300, 4)
bbox = ops.xywh2xyxy(bbox)
score, cls = scores[i].max(-1) # (300, )
# Do not need threshold for evaluation as only got 300 boxes here.
# idx = score > self.args.conf
pred = torch.cat([bbox, score[..., None], cls[..., None]], dim=-1) # filter
# sort by confidence to correctly get internal metrics.
pred = pred[score.argsort(descending=True)]
outputs[i] = pred # [idx]
return outputs
def update_metrics(self, preds, batch):
"""Metrics."""
for si, pred in enumerate(preds):
idx = batch['batch_idx'] == si
cls = batch['cls'][idx]
bbox = batch['bboxes'][idx]
nl, npr = cls.shape[0], pred.shape[0] # number of labels, predictions
shape = batch['ori_shape'][si]
correct_bboxes = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device) # init
self.seen += 1
if npr == 0:
if nl:
self.stats.append((correct_bboxes, *torch.zeros((2, 0), device=self.device), cls.squeeze(-1)))
if self.args.plots:
self.confusion_matrix.process_batch(detections=None, labels=cls.squeeze(-1))
continue
# Predictions
if self.args.single_cls:
pred[:, 5] = 0
predn = pred.clone()
predn[..., [0, 2]] *= shape[1] / self.args.imgsz # native-space pred
predn[..., [1, 3]] *= shape[0] / self.args.imgsz # native-space pred
# Evaluate
if nl:
tbox = ops.xywh2xyxy(bbox) # target boxes
tbox[..., [0, 2]] *= shape[1] # native-space pred
tbox[..., [1, 3]] *= shape[0] # native-space pred
labelsn = torch.cat((cls, tbox), 1) # native-space labels
# NOTE: To get correct metrics, the inputs of `_process_batch` should always be float32 type.
correct_bboxes = self._process_batch(predn.float(), labelsn)
# TODO: maybe remove these `self.` arguments as they already are member variable
if self.args.plots:
self.confusion_matrix.process_batch(predn, labelsn)
self.stats.append((correct_bboxes, pred[:, 4], pred[:, 5], cls.squeeze(-1))) # (conf, pcls, tcls)
# Save
if self.args.save_json:
self.pred_to_json(predn, batch['im_file'][si])
if self.args.save_txt:
file = self.save_dir / 'labels' / f'{Path(batch["im_file"][si]).stem}.txt'
self.save_one_txt(predn, self.args.save_conf, shape, file)

View File

@ -0,0 +1,8 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from .model import SAM
from .predict import Predictor
# from .build import build_sam
__all__ = 'SAM', 'Predictor' # tuple or list

View File

@ -0,0 +1,180 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import math
from itertools import product
from typing import Any, Generator, List, Tuple
import numpy as np
import torch
def is_box_near_crop_edge(boxes: torch.Tensor,
crop_box: List[int],
orig_box: List[int],
atol: float = 20.0) -> torch.Tensor:
"""Return a boolean tensor indicating if boxes are near the crop edge."""
crop_box_torch = torch.as_tensor(crop_box, dtype=torch.float, device=boxes.device)
orig_box_torch = torch.as_tensor(orig_box, dtype=torch.float, device=boxes.device)
boxes = uncrop_boxes_xyxy(boxes, crop_box).float()
near_crop_edge = torch.isclose(boxes, crop_box_torch[None, :], atol=atol, rtol=0)
near_image_edge = torch.isclose(boxes, orig_box_torch[None, :], atol=atol, rtol=0)
near_crop_edge = torch.logical_and(near_crop_edge, ~near_image_edge)
return torch.any(near_crop_edge, dim=1)
def batch_iterator(batch_size: int, *args) -> Generator[List[Any], None, None]:
"""Yield batches of data from the input arguments."""
assert args and all(len(a) == len(args[0]) for a in args), 'Batched iteration must have same-size inputs.'
n_batches = len(args[0]) // batch_size + int(len(args[0]) % batch_size != 0)
for b in range(n_batches):
yield [arg[b * batch_size:(b + 1) * batch_size] for arg in args]
def calculate_stability_score(masks: torch.Tensor, mask_threshold: float, threshold_offset: float) -> torch.Tensor:
"""
Computes the stability score for a batch of masks. The stability
score is the IoU between the binary masks obtained by thresholding
the predicted mask logits at high and low values.
"""
# One mask is always contained inside the other.
# Save memory by preventing unnecessary cast to torch.int64
intersections = ((masks > (mask_threshold + threshold_offset)).sum(-1, dtype=torch.int16).sum(-1,
dtype=torch.int32))
unions = ((masks > (mask_threshold - threshold_offset)).sum(-1, dtype=torch.int16).sum(-1, dtype=torch.int32))
return intersections / unions
def build_point_grid(n_per_side: int) -> np.ndarray:
"""Generate a 2D grid of evenly spaced points in the range [0,1]x[0,1]."""
offset = 1 / (2 * n_per_side)
points_one_side = np.linspace(offset, 1 - offset, n_per_side)
points_x = np.tile(points_one_side[None, :], (n_per_side, 1))
points_y = np.tile(points_one_side[:, None], (1, n_per_side))
return np.stack([points_x, points_y], axis=-1).reshape(-1, 2)
def build_all_layer_point_grids(n_per_side: int, n_layers: int, scale_per_layer: int) -> List[np.ndarray]:
"""Generate point grids for all crop layers."""
return [build_point_grid(int(n_per_side / (scale_per_layer ** i))) for i in range(n_layers + 1)]
def generate_crop_boxes(im_size: Tuple[int, ...], n_layers: int,
overlap_ratio: float) -> Tuple[List[List[int]], List[int]]:
"""Generates a list of crop boxes of different sizes. Each layer has (2**i)**2 boxes for the ith layer."""
crop_boxes, layer_idxs = [], []
im_h, im_w = im_size
short_side = min(im_h, im_w)
# Original image
crop_boxes.append([0, 0, im_w, im_h])
layer_idxs.append(0)
def crop_len(orig_len, n_crops, overlap):
"""Crops bounding boxes to the size of the input image."""
return int(math.ceil((overlap * (n_crops - 1) + orig_len) / n_crops))
for i_layer in range(n_layers):
n_crops_per_side = 2 ** (i_layer + 1)
overlap = int(overlap_ratio * short_side * (2 / n_crops_per_side))
crop_w = crop_len(im_w, n_crops_per_side, overlap)
crop_h = crop_len(im_h, n_crops_per_side, overlap)
crop_box_x0 = [int((crop_w - overlap) * i) for i in range(n_crops_per_side)]
crop_box_y0 = [int((crop_h - overlap) * i) for i in range(n_crops_per_side)]
# Crops in XYWH format
for x0, y0 in product(crop_box_x0, crop_box_y0):
box = [x0, y0, min(x0 + crop_w, im_w), min(y0 + crop_h, im_h)]
crop_boxes.append(box)
layer_idxs.append(i_layer + 1)
return crop_boxes, layer_idxs
def uncrop_boxes_xyxy(boxes: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
"""Uncrop bounding boxes by adding the crop box offset."""
x0, y0, _, _ = crop_box
offset = torch.tensor([[x0, y0, x0, y0]], device=boxes.device)
# Check if boxes has a channel dimension
if len(boxes.shape) == 3:
offset = offset.unsqueeze(1)
return boxes + offset
def uncrop_points(points: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
"""Uncrop points by adding the crop box offset."""
x0, y0, _, _ = crop_box
offset = torch.tensor([[x0, y0]], device=points.device)
# Check if points has a channel dimension
if len(points.shape) == 3:
offset = offset.unsqueeze(1)
return points + offset
def uncrop_masks(masks: torch.Tensor, crop_box: List[int], orig_h: int, orig_w: int) -> torch.Tensor:
"""Uncrop masks by padding them to the original image size."""
x0, y0, x1, y1 = crop_box
if x0 == 0 and y0 == 0 and x1 == orig_w and y1 == orig_h:
return masks
# Coordinate transform masks
pad_x, pad_y = orig_w - (x1 - x0), orig_h - (y1 - y0)
pad = (x0, pad_x - x0, y0, pad_y - y0)
return torch.nn.functional.pad(masks, pad, value=0)
def remove_small_regions(mask: np.ndarray, area_thresh: float, mode: str) -> Tuple[np.ndarray, bool]:
"""Remove small disconnected regions or holes in a mask, returning the mask and a modification indicator."""
import cv2 # type: ignore
assert mode in {'holes', 'islands'}
correct_holes = mode == 'holes'
working_mask = (correct_holes ^ mask).astype(np.uint8)
n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
sizes = stats[:, -1][1:] # Row 0 is background label
small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
if not small_regions:
return mask, False
fill_labels = [0] + small_regions
if not correct_holes:
# If every region is below threshold, keep largest
fill_labels = [i for i in range(n_labels) if i not in fill_labels] or [int(np.argmax(sizes)) + 1]
mask = np.isin(regions, fill_labels)
return mask, True
def batched_mask_to_box(masks: torch.Tensor) -> torch.Tensor:
"""
Calculates boxes in XYXY format around masks. Return [0,0,0,0] for
an empty mask. For input shape C1xC2x...xHxW, the output shape is C1xC2x...x4.
"""
# torch.max below raises an error on empty inputs, just skip in this case
if torch.numel(masks) == 0:
return torch.zeros(*masks.shape[:-2], 4, device=masks.device)
# Normalize shape to CxHxW
shape = masks.shape
h, w = shape[-2:]
masks = masks.flatten(0, -3) if len(shape) > 2 else masks.unsqueeze(0)
# Get top and bottom edges
in_height, _ = torch.max(masks, dim=-1)
in_height_coords = in_height * torch.arange(h, device=in_height.device)[None, :]
bottom_edges, _ = torch.max(in_height_coords, dim=-1)
in_height_coords = in_height_coords + h * (~in_height)
top_edges, _ = torch.min(in_height_coords, dim=-1)
# Get left and right edges
in_width, _ = torch.max(masks, dim=-2)
in_width_coords = in_width * torch.arange(w, device=in_width.device)[None, :]
right_edges, _ = torch.max(in_width_coords, dim=-1)
in_width_coords = in_width_coords + w * (~in_width)
left_edges, _ = torch.min(in_width_coords, dim=-1)
# If the mask is empty the right edge will be to the left of the left edge.
# Replace these boxes with [0, 0, 0, 0]
empty_filter = (right_edges < left_edges) | (bottom_edges < top_edges)
out = torch.stack([left_edges, top_edges, right_edges, bottom_edges], dim=-1)
out = out * (~empty_filter).unsqueeze(-1)
# Return to original shape
return out.reshape(*shape[:-2], 4) if len(shape) > 2 else out[0]

View File

@ -0,0 +1,158 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from functools import partial
import torch
from ultralytics.utils.downloads import attempt_download_asset
from .modules.decoders import MaskDecoder
from .modules.encoders import ImageEncoderViT, PromptEncoder
from .modules.sam import Sam
from .modules.tiny_encoder import TinyViT
from .modules.transformer import TwoWayTransformer
def build_sam_vit_h(checkpoint=None):
"""Build and return a Segment Anything Model (SAM) h-size model."""
return _build_sam(
encoder_embed_dim=1280,
encoder_depth=32,
encoder_num_heads=16,
encoder_global_attn_indexes=[7, 15, 23, 31],
checkpoint=checkpoint,
)
def build_sam_vit_l(checkpoint=None):
"""Build and return a Segment Anything Model (SAM) l-size model."""
return _build_sam(
encoder_embed_dim=1024,
encoder_depth=24,
encoder_num_heads=16,
encoder_global_attn_indexes=[5, 11, 17, 23],
checkpoint=checkpoint,
)
def build_sam_vit_b(checkpoint=None):
"""Build and return a Segment Anything Model (SAM) b-size model."""
return _build_sam(
encoder_embed_dim=768,
encoder_depth=12,
encoder_num_heads=12,
encoder_global_attn_indexes=[2, 5, 8, 11],
checkpoint=checkpoint,
)
def build_mobile_sam(checkpoint=None):
"""Build and return Mobile Segment Anything Model (Mobile-SAM)."""
return _build_sam(
encoder_embed_dim=[64, 128, 160, 320],
encoder_depth=[2, 2, 6, 2],
encoder_num_heads=[2, 4, 5, 10],
encoder_global_attn_indexes=None,
mobile_sam=True,
checkpoint=checkpoint,
)
def _build_sam(encoder_embed_dim,
encoder_depth,
encoder_num_heads,
encoder_global_attn_indexes,
checkpoint=None,
mobile_sam=False):
"""Builds the selected SAM model architecture."""
prompt_embed_dim = 256
image_size = 1024
vit_patch_size = 16
image_embedding_size = image_size // vit_patch_size
image_encoder = (TinyViT(
img_size=1024,
in_chans=3,
num_classes=1000,
embed_dims=encoder_embed_dim,
depths=encoder_depth,
num_heads=encoder_num_heads,
window_sizes=[7, 7, 14, 7],
mlp_ratio=4.0,
drop_rate=0.0,
drop_path_rate=0.0,
use_checkpoint=False,
mbconv_expand_ratio=4.0,
local_conv_size=3,
layer_lr_decay=0.8,
) if mobile_sam else ImageEncoderViT(
depth=encoder_depth,
embed_dim=encoder_embed_dim,
img_size=image_size,
mlp_ratio=4,
norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
num_heads=encoder_num_heads,
patch_size=vit_patch_size,
qkv_bias=True,
use_rel_pos=True,
global_attn_indexes=encoder_global_attn_indexes,
window_size=14,
out_chans=prompt_embed_dim,
))
sam = Sam(
image_encoder=image_encoder,
prompt_encoder=PromptEncoder(
embed_dim=prompt_embed_dim,
image_embedding_size=(image_embedding_size, image_embedding_size),
input_image_size=(image_size, image_size),
mask_in_chans=16,
),
mask_decoder=MaskDecoder(
num_multimask_outputs=3,
transformer=TwoWayTransformer(
depth=2,
embedding_dim=prompt_embed_dim,
mlp_dim=2048,
num_heads=8,
),
transformer_dim=prompt_embed_dim,
iou_head_depth=3,
iou_head_hidden_dim=256,
),
pixel_mean=[123.675, 116.28, 103.53],
pixel_std=[58.395, 57.12, 57.375],
)
if checkpoint is not None:
checkpoint = attempt_download_asset(checkpoint)
with open(checkpoint, 'rb') as f:
state_dict = torch.load(f)
sam.load_state_dict(state_dict)
sam.eval()
# sam.load_state_dict(torch.load(checkpoint), strict=True)
# sam.eval()
return sam
sam_model_map = {
'sam_h.pt': build_sam_vit_h,
'sam_l.pt': build_sam_vit_l,
'sam_b.pt': build_sam_vit_b,
'mobile_sam.pt': build_mobile_sam, }
def build_sam(ckpt='sam_b.pt'):
"""Build a SAM model specified by ckpt."""
model_builder = None
for k in sam_model_map.keys():
if ckpt.endswith(k):
model_builder = sam_model_map.get(k)
if not model_builder:
raise FileNotFoundError(f'{ckpt} is not a supported sam model. Available models are: \n {sam_model_map.keys()}')
return model_builder(ckpt)

View File

@ -0,0 +1,51 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
"""
SAM model interface
"""
from pathlib import Path
from ultralytics.engine.model import Model
from ultralytics.utils.torch_utils import model_info
from .build import build_sam
from .predict import Predictor
class SAM(Model):
"""
SAM model interface.
"""
def __init__(self, model='sam_b.pt') -> None:
if model and Path(model).suffix not in ('.pt', '.pth'):
raise NotImplementedError('SAM prediction requires pre-trained *.pt or *.pth model.')
super().__init__(model=model, task='segment')
def _load(self, weights: str, task=None):
self.model = build_sam(weights)
def predict(self, source, stream=False, bboxes=None, points=None, labels=None, **kwargs):
"""Predicts and returns segmentation masks for given image or video source."""
overrides = dict(conf=0.25, task='segment', mode='predict', imgsz=1024)
kwargs.update(overrides)
prompts = dict(bboxes=bboxes, points=points, labels=labels)
return super().predict(source, stream, prompts=prompts, **kwargs)
def __call__(self, source=None, stream=False, bboxes=None, points=None, labels=None, **kwargs):
"""Calls the 'predict' function with given arguments to perform object detection."""
return self.predict(source, stream, bboxes, points, labels, **kwargs)
def info(self, detailed=False, verbose=True):
"""
Logs model info.
Args:
detailed (bool): Show detailed information about model.
verbose (bool): Controls verbosity.
"""
return model_info(self.model, detailed=detailed, verbose=verbose)
@property
def task_map(self):
return {'segment': {'predictor': Predictor}}

View File

@ -0,0 +1 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

View File

@ -0,0 +1,159 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from typing import List, Tuple, Type
import torch
from torch import nn
from torch.nn import functional as F
from ultralytics.nn.modules import LayerNorm2d
class MaskDecoder(nn.Module):
def __init__(
self,
*,
transformer_dim: int,
transformer: nn.Module,
num_multimask_outputs: int = 3,
activation: Type[nn.Module] = nn.GELU,
iou_head_depth: int = 3,
iou_head_hidden_dim: int = 256,
) -> None:
"""
Predicts masks given an image and prompt embeddings, using a transformer architecture.
Args:
transformer_dim (int): the channel dimension of the transformer module
transformer (nn.Module): the transformer used to predict masks
num_multimask_outputs (int): the number of masks to predict when disambiguating masks
activation (nn.Module): the type of activation to use when upscaling masks
iou_head_depth (int): the depth of the MLP used to predict mask quality
iou_head_hidden_dim (int): the hidden dimension of the MLP used to predict mask quality
"""
super().__init__()
self.transformer_dim = transformer_dim
self.transformer = transformer
self.num_multimask_outputs = num_multimask_outputs
self.iou_token = nn.Embedding(1, transformer_dim)
self.num_mask_tokens = num_multimask_outputs + 1
self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)
self.output_upscaling = nn.Sequential(
nn.ConvTranspose2d(transformer_dim, transformer_dim // 4, kernel_size=2, stride=2),
LayerNorm2d(transformer_dim // 4),
activation(),
nn.ConvTranspose2d(transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2),
activation(),
)
self.output_hypernetworks_mlps = nn.ModuleList([
MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3) for _ in range(self.num_mask_tokens)])
self.iou_prediction_head = MLP(transformer_dim, iou_head_hidden_dim, self.num_mask_tokens, iou_head_depth)
def forward(
self,
image_embeddings: torch.Tensor,
image_pe: torch.Tensor,
sparse_prompt_embeddings: torch.Tensor,
dense_prompt_embeddings: torch.Tensor,
multimask_output: bool,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Predict masks given image and prompt embeddings.
Args:
image_embeddings (torch.Tensor): the embeddings from the image encoder
image_pe (torch.Tensor): positional encoding with the shape of image_embeddings
sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes
dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs
multimask_output (bool): Whether to return multiple masks or a single mask.
Returns:
torch.Tensor: batched predicted masks
torch.Tensor: batched predictions of mask quality
"""
masks, iou_pred = self.predict_masks(
image_embeddings=image_embeddings,
image_pe=image_pe,
sparse_prompt_embeddings=sparse_prompt_embeddings,
dense_prompt_embeddings=dense_prompt_embeddings,
)
# Select the correct mask or masks for output
mask_slice = slice(1, None) if multimask_output else slice(0, 1)
masks = masks[:, mask_slice, :, :]
iou_pred = iou_pred[:, mask_slice]
# Prepare output
return masks, iou_pred
def predict_masks(
self,
image_embeddings: torch.Tensor,
image_pe: torch.Tensor,
sparse_prompt_embeddings: torch.Tensor,
dense_prompt_embeddings: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""Predicts masks. See 'forward' for more details."""
# Concatenate output tokens
output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
output_tokens = output_tokens.unsqueeze(0).expand(sparse_prompt_embeddings.size(0), -1, -1)
tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1)
# Expand per-image data in batch direction to be per-mask
src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0)
src = src + dense_prompt_embeddings
pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0)
b, c, h, w = src.shape
# Run the transformer
hs, src = self.transformer(src, pos_src, tokens)
iou_token_out = hs[:, 0, :]
mask_tokens_out = hs[:, 1:(1 + self.num_mask_tokens), :]
# Upscale mask embeddings and predict masks using the mask tokens
src = src.transpose(1, 2).view(b, c, h, w)
upscaled_embedding = self.output_upscaling(src)
hyper_in_list: List[torch.Tensor] = [
self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]) for i in range(self.num_mask_tokens)]
hyper_in = torch.stack(hyper_in_list, dim=1)
b, c, h, w = upscaled_embedding.shape
masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w)
# Generate mask quality predictions
iou_pred = self.iou_prediction_head(iou_token_out)
return masks, iou_pred
class MLP(nn.Module):
"""
Lightly adapted from
https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py
"""
def __init__(
self,
input_dim: int,
hidden_dim: int,
output_dim: int,
num_layers: int,
sigmoid_output: bool = False,
) -> None:
super().__init__()
self.num_layers = num_layers
h = [hidden_dim] * (num_layers - 1)
self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
self.sigmoid_output = sigmoid_output
def forward(self, x):
"""Executes feedforward within the neural network module and applies activation."""
for i, layer in enumerate(self.layers):
x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
if self.sigmoid_output:
x = torch.sigmoid(x)
return x

View File

@ -0,0 +1,568 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from typing import Any, Optional, Tuple, Type
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from ultralytics.nn.modules import LayerNorm2d, MLPBlock
# This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa
class ImageEncoderViT(nn.Module):
def __init__(
self,
img_size: int = 1024,
patch_size: int = 16,
in_chans: int = 3,
embed_dim: int = 768,
depth: int = 12,
num_heads: int = 12,
mlp_ratio: float = 4.0,
out_chans: int = 256,
qkv_bias: bool = True,
norm_layer: Type[nn.Module] = nn.LayerNorm,
act_layer: Type[nn.Module] = nn.GELU,
use_abs_pos: bool = True,
use_rel_pos: bool = False,
rel_pos_zero_init: bool = True,
window_size: int = 0,
global_attn_indexes: Tuple[int, ...] = (),
) -> None:
"""
Args:
img_size (int): Input image size.
patch_size (int): Patch size.
in_chans (int): Number of input image channels.
embed_dim (int): Patch embedding dimension.
depth (int): Depth of ViT.
num_heads (int): Number of attention heads in each ViT block.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool): If True, add a learnable bias to query, key, value.
norm_layer (nn.Module): Normalization layer.
act_layer (nn.Module): Activation layer.
use_abs_pos (bool): If True, use absolute positional embeddings.
use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
window_size (int): Window size for window attention blocks.
global_attn_indexes (list): Indexes for blocks using global attention.
"""
super().__init__()
self.img_size = img_size
self.patch_embed = PatchEmbed(
kernel_size=(patch_size, patch_size),
stride=(patch_size, patch_size),
in_chans=in_chans,
embed_dim=embed_dim,
)
self.pos_embed: Optional[nn.Parameter] = None
if use_abs_pos:
# Initialize absolute positional embedding with pretrain image size.
self.pos_embed = nn.Parameter(torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim))
self.blocks = nn.ModuleList()
for i in range(depth):
block = Block(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
norm_layer=norm_layer,
act_layer=act_layer,
use_rel_pos=use_rel_pos,
rel_pos_zero_init=rel_pos_zero_init,
window_size=window_size if i not in global_attn_indexes else 0,
input_size=(img_size // patch_size, img_size // patch_size),
)
self.blocks.append(block)
self.neck = nn.Sequential(
nn.Conv2d(
embed_dim,
out_chans,
kernel_size=1,
bias=False,
),
LayerNorm2d(out_chans),
nn.Conv2d(
out_chans,
out_chans,
kernel_size=3,
padding=1,
bias=False,
),
LayerNorm2d(out_chans),
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.patch_embed(x)
if self.pos_embed is not None:
x = x + self.pos_embed
for blk in self.blocks:
x = blk(x)
return self.neck(x.permute(0, 3, 1, 2))
class PromptEncoder(nn.Module):
def __init__(
self,
embed_dim: int,
image_embedding_size: Tuple[int, int],
input_image_size: Tuple[int, int],
mask_in_chans: int,
activation: Type[nn.Module] = nn.GELU,
) -> None:
"""
Encodes prompts for input to SAM's mask decoder.
Args:
embed_dim (int): The prompts' embedding dimension
image_embedding_size (tuple(int, int)): The spatial size of the
image embedding, as (H, W).
input_image_size (int): The padded size of the image as input
to the image encoder, as (H, W).
mask_in_chans (int): The number of hidden channels used for
encoding input masks.
activation (nn.Module): The activation to use when encoding
input masks.
"""
super().__init__()
self.embed_dim = embed_dim
self.input_image_size = input_image_size
self.image_embedding_size = image_embedding_size
self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)
self.num_point_embeddings: int = 4 # pos/neg point + 2 box corners
point_embeddings = [nn.Embedding(1, embed_dim) for _ in range(self.num_point_embeddings)]
self.point_embeddings = nn.ModuleList(point_embeddings)
self.not_a_point_embed = nn.Embedding(1, embed_dim)
self.mask_input_size = (4 * image_embedding_size[0], 4 * image_embedding_size[1])
self.mask_downscaling = nn.Sequential(
nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2),
LayerNorm2d(mask_in_chans // 4),
activation(),
nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=2, stride=2),
LayerNorm2d(mask_in_chans),
activation(),
nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1),
)
self.no_mask_embed = nn.Embedding(1, embed_dim)
def get_dense_pe(self) -> torch.Tensor:
"""
Returns the positional encoding used to encode point prompts,
applied to a dense set of points the shape of the image encoding.
Returns:
torch.Tensor: Positional encoding with shape 1x(embed_dim)x(embedding_h)x(embedding_w)
"""
return self.pe_layer(self.image_embedding_size).unsqueeze(0)
def _embed_points(
self,
points: torch.Tensor,
labels: torch.Tensor,
pad: bool,
) -> torch.Tensor:
"""Embeds point prompts."""
points = points + 0.5 # Shift to center of pixel
if pad:
padding_point = torch.zeros((points.shape[0], 1, 2), device=points.device)
padding_label = -torch.ones((labels.shape[0], 1), device=labels.device)
points = torch.cat([points, padding_point], dim=1)
labels = torch.cat([labels, padding_label], dim=1)
point_embedding = self.pe_layer.forward_with_coords(points, self.input_image_size)
point_embedding[labels == -1] = 0.0
point_embedding[labels == -1] += self.not_a_point_embed.weight
point_embedding[labels == 0] += self.point_embeddings[0].weight
point_embedding[labels == 1] += self.point_embeddings[1].weight
return point_embedding
def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
"""Embeds box prompts."""
boxes = boxes + 0.5 # Shift to center of pixel
coords = boxes.reshape(-1, 2, 2)
corner_embedding = self.pe_layer.forward_with_coords(coords, self.input_image_size)
corner_embedding[:, 0, :] += self.point_embeddings[2].weight
corner_embedding[:, 1, :] += self.point_embeddings[3].weight
return corner_embedding
def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor:
"""Embeds mask inputs."""
return self.mask_downscaling(masks)
def _get_batch_size(
self,
points: Optional[Tuple[torch.Tensor, torch.Tensor]],
boxes: Optional[torch.Tensor],
masks: Optional[torch.Tensor],
) -> int:
"""
Gets the batch size of the output given the batch size of the input prompts.
"""
if points is not None:
return points[0].shape[0]
elif boxes is not None:
return boxes.shape[0]
elif masks is not None:
return masks.shape[0]
else:
return 1
def _get_device(self) -> torch.device:
return self.point_embeddings[0].weight.device
def forward(
self,
points: Optional[Tuple[torch.Tensor, torch.Tensor]],
boxes: Optional[torch.Tensor],
masks: Optional[torch.Tensor],
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Embeds different types of prompts, returning both sparse and dense embeddings.
Args:
points (tuple(torch.Tensor, torch.Tensor), None): point coordinates and labels to embed.
boxes (torch.Tensor, None): boxes to embed
masks (torch.Tensor, None): masks to embed
Returns:
torch.Tensor: sparse embeddings for the points and boxes, with shape BxNx(embed_dim), where N is determined
by the number of input points and boxes.
torch.Tensor: dense embeddings for the masks, in the shape Bx(embed_dim)x(embed_H)x(embed_W)
"""
bs = self._get_batch_size(points, boxes, masks)
sparse_embeddings = torch.empty((bs, 0, self.embed_dim), device=self._get_device())
if points is not None:
coords, labels = points
point_embeddings = self._embed_points(coords, labels, pad=(boxes is None))
sparse_embeddings = torch.cat([sparse_embeddings, point_embeddings], dim=1)
if boxes is not None:
box_embeddings = self._embed_boxes(boxes)
sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=1)
if masks is not None:
dense_embeddings = self._embed_masks(masks)
else:
dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1,
1).expand(bs, -1, self.image_embedding_size[0],
self.image_embedding_size[1])
return sparse_embeddings, dense_embeddings
class PositionEmbeddingRandom(nn.Module):
"""
Positional encoding using random spatial frequencies.
"""
def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
super().__init__()
if scale is None or scale <= 0.0:
scale = 1.0
self.register_buffer('positional_encoding_gaussian_matrix', scale * torch.randn((2, num_pos_feats)))
# Set non-deterministic for forward() error 'cumsum_cuda_kernel does not have a deterministic implementation'
torch.use_deterministic_algorithms(False)
torch.backends.cudnn.deterministic = False
def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
"""Positionally encode points that are normalized to [0,1]."""
# assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
coords = 2 * coords - 1
coords = coords @ self.positional_encoding_gaussian_matrix
coords = 2 * np.pi * coords
# outputs d_1 x ... x d_n x C shape
return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)
def forward(self, size: Tuple[int, int]) -> torch.Tensor:
"""Generate positional encoding for a grid of the specified size."""
h, w = size
device: Any = self.positional_encoding_gaussian_matrix.device
grid = torch.ones((h, w), device=device, dtype=torch.float32)
y_embed = grid.cumsum(dim=0) - 0.5
x_embed = grid.cumsum(dim=1) - 0.5
y_embed = y_embed / h
x_embed = x_embed / w
pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
return pe.permute(2, 0, 1) # C x H x W
def forward_with_coords(self, coords_input: torch.Tensor, image_size: Tuple[int, int]) -> torch.Tensor:
"""Positionally encode points that are not normalized to [0,1]."""
coords = coords_input.clone()
coords[:, :, 0] = coords[:, :, 0] / image_size[1]
coords[:, :, 1] = coords[:, :, 1] / image_size[0]
return self._pe_encoding(coords.to(torch.float)) # B x N x C
class Block(nn.Module):
"""Transformer blocks with support of window attention and residual propagation blocks"""
def __init__(
self,
dim: int,
num_heads: int,
mlp_ratio: float = 4.0,
qkv_bias: bool = True,
norm_layer: Type[nn.Module] = nn.LayerNorm,
act_layer: Type[nn.Module] = nn.GELU,
use_rel_pos: bool = False,
rel_pos_zero_init: bool = True,
window_size: int = 0,
input_size: Optional[Tuple[int, int]] = None,
) -> None:
"""
Args:
dim (int): Number of input channels.
num_heads (int): Number of attention heads in each ViT block.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool): If True, add a learnable bias to query, key, value.
norm_layer (nn.Module): Normalization layer.
act_layer (nn.Module): Activation layer.
use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
window_size (int): Window size for window attention blocks. If it equals 0, then
use global attention.
input_size (tuple(int, int), None): Input resolution for calculating the relative
positional parameter size.
"""
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
use_rel_pos=use_rel_pos,
rel_pos_zero_init=rel_pos_zero_init,
input_size=input_size if window_size == 0 else (window_size, window_size),
)
self.norm2 = norm_layer(dim)
self.mlp = MLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer)
self.window_size = window_size
def forward(self, x: torch.Tensor) -> torch.Tensor:
shortcut = x
x = self.norm1(x)
# Window partition
if self.window_size > 0:
H, W = x.shape[1], x.shape[2]
x, pad_hw = window_partition(x, self.window_size)
x = self.attn(x)
# Reverse window partition
if self.window_size > 0:
x = window_unpartition(x, self.window_size, pad_hw, (H, W))
x = shortcut + x
return x + self.mlp(self.norm2(x))
class Attention(nn.Module):
"""Multi-head Attention block with relative position embeddings."""
def __init__(
self,
dim: int,
num_heads: int = 8,
qkv_bias: bool = True,
use_rel_pos: bool = False,
rel_pos_zero_init: bool = True,
input_size: Optional[Tuple[int, int]] = None,
) -> None:
"""
Args:
dim (int): Number of input channels.
num_heads (int): Number of attention heads.
qkv_bias (bool): If True, add a learnable bias to query, key, value.
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
input_size (tuple(int, int), None): Input resolution for calculating the relative
positional parameter size.
"""
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = head_dim ** -0.5
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.proj = nn.Linear(dim, dim)
self.use_rel_pos = use_rel_pos
if self.use_rel_pos:
assert (input_size is not None), 'Input size must be provided if using relative positional encoding.'
# initialize relative positional embeddings
self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
def forward(self, x: torch.Tensor) -> torch.Tensor:
B, H, W, _ = x.shape
# qkv with shape (3, B, nHead, H * W, C)
qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
# q, k, v with shape (B * nHead, H * W, C)
q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)
attn = (q * self.scale) @ k.transpose(-2, -1)
if self.use_rel_pos:
attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))
attn = attn.softmax(dim=-1)
x = (attn @ v).view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
return self.proj(x)
def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]:
"""
Partition into non-overlapping windows with padding if needed.
Args:
x (tensor): input tokens with [B, H, W, C].
window_size (int): window size.
Returns:
windows: windows after partition with [B * num_windows, window_size, window_size, C].
(Hp, Wp): padded height and width before partition
"""
B, H, W, C = x.shape
pad_h = (window_size - H % window_size) % window_size
pad_w = (window_size - W % window_size) % window_size
if pad_h > 0 or pad_w > 0:
x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
Hp, Wp = H + pad_h, W + pad_w
x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
return windows, (Hp, Wp)
def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int],
hw: Tuple[int, int]) -> torch.Tensor:
"""
Window unpartition into original sequences and removing padding.
Args:
windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
window_size (int): window size.
pad_hw (Tuple): padded height and width (Hp, Wp).
hw (Tuple): original height and width (H, W) before padding.
Returns:
x: unpartitioned sequences with [B, H, W, C].
"""
Hp, Wp = pad_hw
H, W = hw
B = windows.shape[0] // (Hp * Wp // window_size // window_size)
x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
if Hp > H or Wp > W:
x = x[:, :H, :W, :].contiguous()
return x
def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
"""
Get relative positional embeddings according to the relative positions of
query and key sizes.
Args:
q_size (int): size of query q.
k_size (int): size of key k.
rel_pos (Tensor): relative position embeddings (L, C).
Returns:
Extracted positional embeddings according to relative positions.
"""
max_rel_dist = int(2 * max(q_size, k_size) - 1)
# Interpolate rel pos if needed.
if rel_pos.shape[0] != max_rel_dist:
# Interpolate rel pos.
rel_pos_resized = F.interpolate(
rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
size=max_rel_dist,
mode='linear',
)
rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
else:
rel_pos_resized = rel_pos
# Scale the coords with short length if shapes for q and k are different.
q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
return rel_pos_resized[relative_coords.long()]
def add_decomposed_rel_pos(
attn: torch.Tensor,
q: torch.Tensor,
rel_pos_h: torch.Tensor,
rel_pos_w: torch.Tensor,
q_size: Tuple[int, int],
k_size: Tuple[int, int],
) -> torch.Tensor:
"""
Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950
Args:
attn (Tensor): attention map.
q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
Returns:
attn (Tensor): attention map with added relative positional embeddings.
"""
q_h, q_w = q_size
k_h, k_w = k_size
Rh = get_rel_pos(q_h, k_h, rel_pos_h)
Rw = get_rel_pos(q_w, k_w, rel_pos_w)
B, _, dim = q.shape
r_q = q.reshape(B, q_h, q_w, dim)
rel_h = torch.einsum('bhwc,hkc->bhwk', r_q, Rh)
rel_w = torch.einsum('bhwc,wkc->bhwk', r_q, Rw)
attn = (attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]).view(
B, q_h * q_w, k_h * k_w)
return attn
class PatchEmbed(nn.Module):
"""
Image to Patch Embedding.
"""
def __init__(
self,
kernel_size: Tuple[int, int] = (16, 16),
stride: Tuple[int, int] = (16, 16),
padding: Tuple[int, int] = (0, 0),
in_chans: int = 3,
embed_dim: int = 768,
) -> None:
"""
Args:
kernel_size (Tuple): kernel size of the projection layer.
stride (Tuple): stride of the projection layer.
padding (Tuple): padding size of the projection layer.
in_chans (int): Number of input image channels.
embed_dim (int): Patch embedding dimension.
"""
super().__init__()
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding)
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.proj(x).permute(0, 2, 3, 1) # B C H W -> B H W C

View File

@ -0,0 +1,49 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from typing import List
import torch
from torch import nn
from .decoders import MaskDecoder
from .encoders import ImageEncoderViT, PromptEncoder
class Sam(nn.Module):
mask_threshold: float = 0.0
image_format: str = 'RGB'
def __init__(
self,
image_encoder: ImageEncoderViT,
prompt_encoder: PromptEncoder,
mask_decoder: MaskDecoder,
pixel_mean: List[float] = (123.675, 116.28, 103.53),
pixel_std: List[float] = (58.395, 57.12, 57.375)
) -> None:
"""
SAM predicts object masks from an image and input prompts.
Note:
All forward() operations moved to SAMPredictor.
Args:
image_encoder (ImageEncoderViT): The backbone used to encode the image into image embeddings that allow for
efficient mask prediction.
prompt_encoder (PromptEncoder): Encodes various types of input prompts.
mask_decoder (MaskDecoder): Predicts masks from the image embeddings and encoded prompts.
pixel_mean (list(float)): Mean values for normalizing pixels in the input image.
pixel_std (list(float)): Std values for normalizing pixels in the input image.
"""
super().__init__()
self.image_encoder = image_encoder
self.prompt_encoder = prompt_encoder
self.mask_decoder = mask_decoder
self.register_buffer('pixel_mean', torch.Tensor(pixel_mean).view(-1, 1, 1), False)
self.register_buffer('pixel_std', torch.Tensor(pixel_std).view(-1, 1, 1), False)

View File

@ -0,0 +1,582 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# --------------------------------------------------------
# TinyViT Model Architecture
# Copyright (c) 2022 Microsoft
# Adapted from LeViT and Swin Transformer
# LeViT: (https://github.com/facebookresearch/levit)
# Swin: (https://github.com/microsoft/swin-transformer)
# Build the TinyViT Model
# --------------------------------------------------------
import itertools
from typing import Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from ultralytics.utils.instance import to_2tuple
class Conv2d_BN(torch.nn.Sequential):
def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1, groups=1, bn_weight_init=1):
super().__init__()
self.add_module('c', torch.nn.Conv2d(a, b, ks, stride, pad, dilation, groups, bias=False))
bn = torch.nn.BatchNorm2d(b)
torch.nn.init.constant_(bn.weight, bn_weight_init)
torch.nn.init.constant_(bn.bias, 0)
self.add_module('bn', bn)
class PatchEmbed(nn.Module):
def __init__(self, in_chans, embed_dim, resolution, activation):
super().__init__()
img_size: Tuple[int, int] = to_2tuple(resolution)
self.patches_resolution = (img_size[0] // 4, img_size[1] // 4)
self.num_patches = self.patches_resolution[0] * self.patches_resolution[1]
self.in_chans = in_chans
self.embed_dim = embed_dim
n = embed_dim
self.seq = nn.Sequential(
Conv2d_BN(in_chans, n // 2, 3, 2, 1),
activation(),
Conv2d_BN(n // 2, n, 3, 2, 1),
)
def forward(self, x):
return self.seq(x)
class MBConv(nn.Module):
def __init__(self, in_chans, out_chans, expand_ratio, activation, drop_path):
super().__init__()
self.in_chans = in_chans
self.hidden_chans = int(in_chans * expand_ratio)
self.out_chans = out_chans
self.conv1 = Conv2d_BN(in_chans, self.hidden_chans, ks=1)
self.act1 = activation()
self.conv2 = Conv2d_BN(self.hidden_chans, self.hidden_chans, ks=3, stride=1, pad=1, groups=self.hidden_chans)
self.act2 = activation()
self.conv3 = Conv2d_BN(self.hidden_chans, out_chans, ks=1, bn_weight_init=0.0)
self.act3 = activation()
# NOTE: `DropPath` is needed only for training.
# self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
self.drop_path = nn.Identity()
def forward(self, x):
shortcut = x
x = self.conv1(x)
x = self.act1(x)
x = self.conv2(x)
x = self.act2(x)
x = self.conv3(x)
x = self.drop_path(x)
x += shortcut
return self.act3(x)
class PatchMerging(nn.Module):
def __init__(self, input_resolution, dim, out_dim, activation):
super().__init__()
self.input_resolution = input_resolution
self.dim = dim
self.out_dim = out_dim
self.act = activation()
self.conv1 = Conv2d_BN(dim, out_dim, 1, 1, 0)
stride_c = 1 if out_dim in [320, 448, 576] else 2
self.conv2 = Conv2d_BN(out_dim, out_dim, 3, stride_c, 1, groups=out_dim)
self.conv3 = Conv2d_BN(out_dim, out_dim, 1, 1, 0)
def forward(self, x):
if x.ndim == 3:
H, W = self.input_resolution
B = len(x)
# (B, C, H, W)
x = x.view(B, H, W, -1).permute(0, 3, 1, 2)
x = self.conv1(x)
x = self.act(x)
x = self.conv2(x)
x = self.act(x)
x = self.conv3(x)
return x.flatten(2).transpose(1, 2)
class ConvLayer(nn.Module):
def __init__(
self,
dim,
input_resolution,
depth,
activation,
drop_path=0.,
downsample=None,
use_checkpoint=False,
out_dim=None,
conv_expand_ratio=4.,
):
super().__init__()
self.dim = dim
self.input_resolution = input_resolution
self.depth = depth
self.use_checkpoint = use_checkpoint
# build blocks
self.blocks = nn.ModuleList([
MBConv(
dim,
dim,
conv_expand_ratio,
activation,
drop_path[i] if isinstance(drop_path, list) else drop_path,
) for i in range(depth)])
# patch merging layer
self.downsample = None if downsample is None else downsample(
input_resolution, dim=dim, out_dim=out_dim, activation=activation)
def forward(self, x):
for blk in self.blocks:
x = checkpoint.checkpoint(blk, x) if self.use_checkpoint else blk(x)
return x if self.downsample is None else self.downsample(x)
class Mlp(nn.Module):
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.norm = nn.LayerNorm(in_features)
self.fc1 = nn.Linear(in_features, hidden_features)
self.fc2 = nn.Linear(hidden_features, out_features)
self.act = act_layer()
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.norm(x)
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
return self.drop(x)
class Attention(torch.nn.Module):
def __init__(
self,
dim,
key_dim,
num_heads=8,
attn_ratio=4,
resolution=(14, 14),
):
super().__init__()
# (h, w)
assert isinstance(resolution, tuple) and len(resolution) == 2
self.num_heads = num_heads
self.scale = key_dim ** -0.5
self.key_dim = key_dim
self.nh_kd = nh_kd = key_dim * num_heads
self.d = int(attn_ratio * key_dim)
self.dh = int(attn_ratio * key_dim) * num_heads
self.attn_ratio = attn_ratio
h = self.dh + nh_kd * 2
self.norm = nn.LayerNorm(dim)
self.qkv = nn.Linear(dim, h)
self.proj = nn.Linear(self.dh, dim)
points = list(itertools.product(range(resolution[0]), range(resolution[1])))
N = len(points)
attention_offsets = {}
idxs = []
for p1 in points:
for p2 in points:
offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
if offset not in attention_offsets:
attention_offsets[offset] = len(attention_offsets)
idxs.append(attention_offsets[offset])
self.attention_biases = torch.nn.Parameter(torch.zeros(num_heads, len(attention_offsets)))
self.register_buffer('attention_bias_idxs', torch.LongTensor(idxs).view(N, N), persistent=False)
@torch.no_grad()
def train(self, mode=True):
super().train(mode)
if mode and hasattr(self, 'ab'):
del self.ab
else:
self.ab = self.attention_biases[:, self.attention_bias_idxs]
def forward(self, x): # x (B,N,C)
B, N, _ = x.shape
# Normalization
x = self.norm(x)
qkv = self.qkv(x)
# (B, N, num_heads, d)
q, k, v = qkv.view(B, N, self.num_heads, -1).split([self.key_dim, self.key_dim, self.d], dim=3)
# (B, num_heads, N, d)
q = q.permute(0, 2, 1, 3)
k = k.permute(0, 2, 1, 3)
v = v.permute(0, 2, 1, 3)
self.ab = self.ab.to(self.attention_biases.device)
attn = ((q @ k.transpose(-2, -1)) * self.scale +
(self.attention_biases[:, self.attention_bias_idxs] if self.training else self.ab))
attn = attn.softmax(dim=-1)
x = (attn @ v).transpose(1, 2).reshape(B, N, self.dh)
return self.proj(x)
class TinyViTBlock(nn.Module):
"""
TinyViT Block.
Args:
dim (int): Number of input channels.
input_resolution (tuple[int, int]): Input resolution.
num_heads (int): Number of attention heads.
window_size (int): Window size.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
drop (float, optional): Dropout rate. Default: 0.0
drop_path (float, optional): Stochastic depth rate. Default: 0.0
local_conv_size (int): the kernel size of the convolution between Attention and MLP. Default: 3
activation (torch.nn): the activation function. Default: nn.GELU
"""
def __init__(
self,
dim,
input_resolution,
num_heads,
window_size=7,
mlp_ratio=4.,
drop=0.,
drop_path=0.,
local_conv_size=3,
activation=nn.GELU,
):
super().__init__()
self.dim = dim
self.input_resolution = input_resolution
self.num_heads = num_heads
assert window_size > 0, 'window_size must be greater than 0'
self.window_size = window_size
self.mlp_ratio = mlp_ratio
# NOTE: `DropPath` is needed only for training.
# self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
self.drop_path = nn.Identity()
assert dim % num_heads == 0, 'dim must be divisible by num_heads'
head_dim = dim // num_heads
window_resolution = (window_size, window_size)
self.attn = Attention(dim, head_dim, num_heads, attn_ratio=1, resolution=window_resolution)
mlp_hidden_dim = int(dim * mlp_ratio)
mlp_activation = activation
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=mlp_activation, drop=drop)
pad = local_conv_size // 2
self.local_conv = Conv2d_BN(dim, dim, ks=local_conv_size, stride=1, pad=pad, groups=dim)
def forward(self, x):
H, W = self.input_resolution
B, L, C = x.shape
assert L == H * W, 'input feature has wrong size'
res_x = x
if H == self.window_size and W == self.window_size:
x = self.attn(x)
else:
x = x.view(B, H, W, C)
pad_b = (self.window_size - H % self.window_size) % self.window_size
pad_r = (self.window_size - W % self.window_size) % self.window_size
padding = pad_b > 0 or pad_r > 0
if padding:
x = F.pad(x, (0, 0, 0, pad_r, 0, pad_b))
pH, pW = H + pad_b, W + pad_r
nH = pH // self.window_size
nW = pW // self.window_size
# window partition
x = x.view(B, nH, self.window_size, nW, self.window_size,
C).transpose(2, 3).reshape(B * nH * nW, self.window_size * self.window_size, C)
x = self.attn(x)
# window reverse
x = x.view(B, nH, nW, self.window_size, self.window_size, C).transpose(2, 3).reshape(B, pH, pW, C)
if padding:
x = x[:, :H, :W].contiguous()
x = x.view(B, L, C)
x = res_x + self.drop_path(x)
x = x.transpose(1, 2).reshape(B, C, H, W)
x = self.local_conv(x)
x = x.view(B, C, L).transpose(1, 2)
return x + self.drop_path(self.mlp(x))
def extra_repr(self) -> str:
return f'dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, ' \
f'window_size={self.window_size}, mlp_ratio={self.mlp_ratio}'
class BasicLayer(nn.Module):
"""
A basic TinyViT layer for one stage.
Args:
dim (int): Number of input channels.
input_resolution (tuple[int]): Input resolution.
depth (int): Number of blocks.
num_heads (int): Number of attention heads.
window_size (int): Local window size.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
drop (float, optional): Dropout rate. Default: 0.0
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
local_conv_size (int): the kernel size of the depthwise convolution between attention and MLP. Default: 3
activation (torch.nn): the activation function. Default: nn.GELU
out_dim (int | optional): the output dimension of the layer. Default: None
"""
def __init__(
self,
dim,
input_resolution,
depth,
num_heads,
window_size,
mlp_ratio=4.,
drop=0.,
drop_path=0.,
downsample=None,
use_checkpoint=False,
local_conv_size=3,
activation=nn.GELU,
out_dim=None,
):
super().__init__()
self.dim = dim
self.input_resolution = input_resolution
self.depth = depth
self.use_checkpoint = use_checkpoint
# build blocks
self.blocks = nn.ModuleList([
TinyViTBlock(
dim=dim,
input_resolution=input_resolution,
num_heads=num_heads,
window_size=window_size,
mlp_ratio=mlp_ratio,
drop=drop,
drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
local_conv_size=local_conv_size,
activation=activation,
) for i in range(depth)])
# patch merging layer
self.downsample = None if downsample is None else downsample(
input_resolution, dim=dim, out_dim=out_dim, activation=activation)
def forward(self, x):
for blk in self.blocks:
x = checkpoint.checkpoint(blk, x) if self.use_checkpoint else blk(x)
return x if self.downsample is None else self.downsample(x)
def extra_repr(self) -> str:
return f'dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}'
class LayerNorm2d(nn.Module):
def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
super().__init__()
self.weight = nn.Parameter(torch.ones(num_channels))
self.bias = nn.Parameter(torch.zeros(num_channels))
self.eps = eps
def forward(self, x: torch.Tensor) -> torch.Tensor:
u = x.mean(1, keepdim=True)
s = (x - u).pow(2).mean(1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.eps)
return self.weight[:, None, None] * x + self.bias[:, None, None]
class TinyViT(nn.Module):
def __init__(
self,
img_size=224,
in_chans=3,
num_classes=1000,
embed_dims=[96, 192, 384, 768],
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_sizes=[7, 7, 14, 7],
mlp_ratio=4.,
drop_rate=0.,
drop_path_rate=0.1,
use_checkpoint=False,
mbconv_expand_ratio=4.0,
local_conv_size=3,
layer_lr_decay=1.0,
):
super().__init__()
self.img_size = img_size
self.num_classes = num_classes
self.depths = depths
self.num_layers = len(depths)
self.mlp_ratio = mlp_ratio
activation = nn.GELU
self.patch_embed = PatchEmbed(in_chans=in_chans,
embed_dim=embed_dims[0],
resolution=img_size,
activation=activation)
patches_resolution = self.patch_embed.patches_resolution
self.patches_resolution = patches_resolution
# stochastic depth
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
# build layers
self.layers = nn.ModuleList()
for i_layer in range(self.num_layers):
kwargs = dict(
dim=embed_dims[i_layer],
input_resolution=(patches_resolution[0] // (2 ** (i_layer - 1 if i_layer == 3 else i_layer)),
patches_resolution[1] // (2 ** (i_layer - 1 if i_layer == 3 else i_layer))),
# input_resolution=(patches_resolution[0] // (2 ** i_layer),
# patches_resolution[1] // (2 ** i_layer)),
depth=depths[i_layer],
drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
use_checkpoint=use_checkpoint,
out_dim=embed_dims[min(i_layer + 1,
len(embed_dims) - 1)],
activation=activation,
)
if i_layer == 0:
layer = ConvLayer(conv_expand_ratio=mbconv_expand_ratio, **kwargs)
else:
layer = BasicLayer(num_heads=num_heads[i_layer],
window_size=window_sizes[i_layer],
mlp_ratio=self.mlp_ratio,
drop=drop_rate,
local_conv_size=local_conv_size,
**kwargs)
self.layers.append(layer)
# Classifier head
self.norm_head = nn.LayerNorm(embed_dims[-1])
self.head = nn.Linear(embed_dims[-1], num_classes) if num_classes > 0 else torch.nn.Identity()
# init weights
self.apply(self._init_weights)
self.set_layer_lr_decay(layer_lr_decay)
self.neck = nn.Sequential(
nn.Conv2d(
embed_dims[-1],
256,
kernel_size=1,
bias=False,
),
LayerNorm2d(256),
nn.Conv2d(
256,
256,
kernel_size=3,
padding=1,
bias=False,
),
LayerNorm2d(256),
)
def set_layer_lr_decay(self, layer_lr_decay):
decay_rate = layer_lr_decay
# layers -> blocks (depth)
depth = sum(self.depths)
lr_scales = [decay_rate ** (depth - i - 1) for i in range(depth)]
def _set_lr_scale(m, scale):
for p in m.parameters():
p.lr_scale = scale
self.patch_embed.apply(lambda x: _set_lr_scale(x, lr_scales[0]))
i = 0
for layer in self.layers:
for block in layer.blocks:
block.apply(lambda x: _set_lr_scale(x, lr_scales[i]))
i += 1
if layer.downsample is not None:
layer.downsample.apply(lambda x: _set_lr_scale(x, lr_scales[i - 1]))
assert i == depth
for m in [self.norm_head, self.head]:
m.apply(lambda x: _set_lr_scale(x, lr_scales[-1]))
for k, p in self.named_parameters():
p.param_name = k
def _check_lr_scale(m):
for p in m.parameters():
assert hasattr(p, 'lr_scale'), p.param_name
self.apply(_check_lr_scale)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
# NOTE: This initialization is needed only for training.
# trunc_normal_(m.weight, std=.02)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
@torch.jit.ignore
def no_weight_decay_keywords(self):
return {'attention_biases'}
def forward_features(self, x):
# x: (N, C, H, W)
x = self.patch_embed(x)
x = self.layers[0](x)
start_i = 1
for i in range(start_i, len(self.layers)):
layer = self.layers[i]
x = layer(x)
B, _, C = x.size()
x = x.view(B, 64, 64, C)
x = x.permute(0, 3, 1, 2)
return self.neck(x)
def forward(self, x):
return self.forward_features(x)

View File

@ -0,0 +1,230 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import math
from typing import Tuple, Type
import torch
from torch import Tensor, nn
from ultralytics.nn.modules import MLPBlock
class TwoWayTransformer(nn.Module):
def __init__(
self,
depth: int,
embedding_dim: int,
num_heads: int,
mlp_dim: int,
activation: Type[nn.Module] = nn.ReLU,
attention_downsample_rate: int = 2,
) -> None:
"""
A transformer decoder that attends to an input image using
queries whose positional embedding is supplied.
Args:
depth (int): number of layers in the transformer
embedding_dim (int): the channel dimension for the input embeddings
num_heads (int): the number of heads for multihead attention. Must
divide embedding_dim
mlp_dim (int): the channel dimension internal to the MLP block
activation (nn.Module): the activation to use in the MLP block
"""
super().__init__()
self.depth = depth
self.embedding_dim = embedding_dim
self.num_heads = num_heads
self.mlp_dim = mlp_dim
self.layers = nn.ModuleList()
for i in range(depth):
self.layers.append(
TwoWayAttentionBlock(
embedding_dim=embedding_dim,
num_heads=num_heads,
mlp_dim=mlp_dim,
activation=activation,
attention_downsample_rate=attention_downsample_rate,
skip_first_layer_pe=(i == 0),
))
self.final_attn_token_to_image = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate)
self.norm_final_attn = nn.LayerNorm(embedding_dim)
def forward(
self,
image_embedding: Tensor,
image_pe: Tensor,
point_embedding: Tensor,
) -> Tuple[Tensor, Tensor]:
"""
Args:
image_embedding (torch.Tensor): image to attend to. Should be shape B x embedding_dim x h x w for any h and w.
image_pe (torch.Tensor): the positional encoding to add to the image. Must have same shape as image_embedding.
point_embedding (torch.Tensor): the embedding to add to the query points.
Must have shape B x N_points x embedding_dim for any N_points.
Returns:
(torch.Tensor): the processed point_embedding
(torch.Tensor): the processed image_embedding
"""
# BxCxHxW -> BxHWxC == B x N_image_tokens x C
bs, c, h, w = image_embedding.shape
image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
image_pe = image_pe.flatten(2).permute(0, 2, 1)
# Prepare queries
queries = point_embedding
keys = image_embedding
# Apply transformer blocks and final layernorm
for layer in self.layers:
queries, keys = layer(
queries=queries,
keys=keys,
query_pe=point_embedding,
key_pe=image_pe,
)
# Apply the final attention layer from the points to the image
q = queries + point_embedding
k = keys + image_pe
attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys)
queries = queries + attn_out
queries = self.norm_final_attn(queries)
return queries, keys
class TwoWayAttentionBlock(nn.Module):
def __init__(
self,
embedding_dim: int,
num_heads: int,
mlp_dim: int = 2048,
activation: Type[nn.Module] = nn.ReLU,
attention_downsample_rate: int = 2,
skip_first_layer_pe: bool = False,
) -> None:
"""
A transformer block with four layers: (1) self-attention of sparse inputs, (2) cross attention of sparse
inputs to dense inputs, (3) mlp block on sparse inputs, and (4) cross attention of dense inputs to sparse
inputs.
Args:
embedding_dim (int): the channel dimension of the embeddings
num_heads (int): the number of heads in the attention layers
mlp_dim (int): the hidden dimension of the mlp block
activation (nn.Module): the activation of the mlp block
skip_first_layer_pe (bool): skip the PE on the first layer
"""
super().__init__()
self.self_attn = Attention(embedding_dim, num_heads)
self.norm1 = nn.LayerNorm(embedding_dim)
self.cross_attn_token_to_image = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate)
self.norm2 = nn.LayerNorm(embedding_dim)
self.mlp = MLPBlock(embedding_dim, mlp_dim, activation)
self.norm3 = nn.LayerNorm(embedding_dim)
self.norm4 = nn.LayerNorm(embedding_dim)
self.cross_attn_image_to_token = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate)
self.skip_first_layer_pe = skip_first_layer_pe
def forward(self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor) -> Tuple[Tensor, Tensor]:
"""Apply self-attention and cross-attention to queries and keys and return the processed embeddings."""
# Self attention block
if self.skip_first_layer_pe:
queries = self.self_attn(q=queries, k=queries, v=queries)
else:
q = queries + query_pe
attn_out = self.self_attn(q=q, k=q, v=queries)
queries = queries + attn_out
queries = self.norm1(queries)
# Cross attention block, tokens attending to image embedding
q = queries + query_pe
k = keys + key_pe
attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys)
queries = queries + attn_out
queries = self.norm2(queries)
# MLP block
mlp_out = self.mlp(queries)
queries = queries + mlp_out
queries = self.norm3(queries)
# Cross attention block, image embedding attending to tokens
q = queries + query_pe
k = keys + key_pe
attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries)
keys = keys + attn_out
keys = self.norm4(keys)
return queries, keys
class Attention(nn.Module):
"""
An attention layer that allows for downscaling the size of the embedding after projection to queries, keys, and
values.
"""
def __init__(
self,
embedding_dim: int,
num_heads: int,
downsample_rate: int = 1,
) -> None:
super().__init__()
self.embedding_dim = embedding_dim
self.internal_dim = embedding_dim // downsample_rate
self.num_heads = num_heads
assert self.internal_dim % num_heads == 0, 'num_heads must divide embedding_dim.'
self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
self.k_proj = nn.Linear(embedding_dim, self.internal_dim)
self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
self.out_proj = nn.Linear(self.internal_dim, embedding_dim)
def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
"""Separate the input tensor into the specified number of attention heads."""
b, n, c = x.shape
x = x.reshape(b, n, num_heads, c // num_heads)
return x.transpose(1, 2) # B x N_heads x N_tokens x C_per_head
def _recombine_heads(self, x: Tensor) -> Tensor:
"""Recombine the separated attention heads into a single tensor."""
b, n_heads, n_tokens, c_per_head = x.shape
x = x.transpose(1, 2)
return x.reshape(b, n_tokens, n_heads * c_per_head) # B x N_tokens x C
def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
"""Compute the attention output given the input query, key, and value tensors."""
# Input projections
q = self.q_proj(q)
k = self.k_proj(k)
v = self.v_proj(v)
# Separate into heads
q = self._separate_heads(q, self.num_heads)
k = self._separate_heads(k, self.num_heads)
v = self._separate_heads(v, self.num_heads)
# Attention
_, _, _, c_per_head = q.shape
attn = q @ k.permute(0, 1, 3, 2) # B x N_heads x N_tokens x N_tokens
attn = attn / math.sqrt(c_per_head)
attn = torch.softmax(attn, dim=-1)
# Get output
out = attn @ v
out = self._recombine_heads(out)
return self.out_proj(out)

View File

@ -0,0 +1,408 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import numpy as np
import torch
import torch.nn.functional as F
import torchvision
from ultralytics.data.augment import LetterBox
from ultralytics.engine.predictor import BasePredictor
from ultralytics.engine.results import Results
from ultralytics.utils import DEFAULT_CFG, ops
from ultralytics.utils.torch_utils import select_device
from .amg import (batch_iterator, batched_mask_to_box, build_all_layer_point_grids, calculate_stability_score,
generate_crop_boxes, is_box_near_crop_edge, remove_small_regions, uncrop_boxes_xyxy, uncrop_masks)
from .build import build_sam
class Predictor(BasePredictor):
def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
if overrides is None:
overrides = {}
overrides.update(dict(task='segment', mode='predict', imgsz=1024))
super().__init__(cfg, overrides, _callbacks)
# SAM needs retina_masks=True, or the results would be a mess.
self.args.retina_masks = True
# Args for set_image
self.im = None
self.features = None
# Args for set_prompts
self.prompts = {}
# Args for segment everything
self.segment_all = False
def preprocess(self, im):
"""Prepares input image before inference.
Args:
im (torch.Tensor | List(np.ndarray)): BCHW for tensor, [(HWC) x B] for list.
"""
if self.im is not None:
return self.im
not_tensor = not isinstance(im, torch.Tensor)
if not_tensor:
im = np.stack(self.pre_transform(im))
im = im[..., ::-1].transpose((0, 3, 1, 2)) # BGR to RGB, BHWC to BCHW, (n, 3, h, w)
im = np.ascontiguousarray(im) # contiguous
im = torch.from_numpy(im)
im = im.to(self.device)
im = im.half() if self.model.fp16 else im.float() # uint8 to fp16/32
if not_tensor:
im = (im - self.mean) / self.std
return im
def pre_transform(self, im):
"""
Pre-transform input image before inference.
Args:
im (List(np.ndarray)): (N, 3, h, w) for tensor, [(h, w, 3) x N] for list.
Returns:
(list): A list of transformed images.
"""
assert len(im) == 1, 'SAM model does not currently support batched inference'
letterbox = LetterBox(self.args.imgsz, auto=False, center=False)
return [letterbox(image=x) for x in im]
def inference(self, im, bboxes=None, points=None, labels=None, masks=None, multimask_output=False, *args, **kwargs):
"""
Predict masks for the given input prompts, using the currently set image.
Args:
im (torch.Tensor): The preprocessed image, (N, C, H, W).
bboxes (np.ndarray | List, None): (N, 4), in XYXY format.
points (np.ndarray | List, None): (N, 2), Each point is in (X,Y) in pixels.
labels (np.ndarray | List, None): (N, ), labels for the point prompts.
1 indicates a foreground point and 0 indicates a background point.
masks (np.ndarray, None): A low resolution mask input to the model, typically
coming from a previous prediction iteration. Has form (N, H, W), where
for SAM, H=W=256.
multimask_output (bool): If true, the model will return three masks.
For ambiguous input prompts (such as a single click), this will often
produce better masks than a single prediction. If only a single
mask is needed, the model's predicted quality score can be used
to select the best mask. For non-ambiguous prompts, such as multiple
input prompts, multimask_output=False can give better results.
Returns:
(np.ndarray): The output masks in CxHxW format, where C is the
number of masks, and (H, W) is the original image size.
(np.ndarray): An array of length C containing the model's
predictions for the quality of each mask.
(np.ndarray): An array of shape CxHxW, where C is the number
of masks and H=W=256. These low resolution logits can be passed to
a subsequent iteration as mask input.
"""
# Get prompts from self.prompts first
bboxes = self.prompts.pop('bboxes', bboxes)
points = self.prompts.pop('points', points)
masks = self.prompts.pop('masks', masks)
if all(i is None for i in [bboxes, points, masks]):
return self.generate(im, *args, **kwargs)
return self.prompt_inference(im, bboxes, points, labels, masks, multimask_output)
def prompt_inference(self, im, bboxes=None, points=None, labels=None, masks=None, multimask_output=False):
"""
Predict masks for the given input prompts, using the currently set image.
Args:
im (torch.Tensor): The preprocessed image, (N, C, H, W).
bboxes (np.ndarray | List, None): (N, 4), in XYXY format.
points (np.ndarray | List, None): (N, 2), Each point is in (X,Y) in pixels.
labels (np.ndarray | List, None): (N, ), labels for the point prompts.
1 indicates a foreground point and 0 indicates a background point.
masks (np.ndarray, None): A low resolution mask input to the model, typically
coming from a previous prediction iteration. Has form (N, H, W), where
for SAM, H=W=256.
multimask_output (bool): If true, the model will return three masks.
For ambiguous input prompts (such as a single click), this will often
produce better masks than a single prediction. If only a single
mask is needed, the model's predicted quality score can be used
to select the best mask. For non-ambiguous prompts, such as multiple
input prompts, multimask_output=False can give better results.
Returns:
(np.ndarray): The output masks in CxHxW format, where C is the
number of masks, and (H, W) is the original image size.
(np.ndarray): An array of length C containing the model's
predictions for the quality of each mask.
(np.ndarray): An array of shape CxHxW, where C is the number
of masks and H=W=256. These low resolution logits can be passed to
a subsequent iteration as mask input.
"""
features = self.model.image_encoder(im) if self.features is None else self.features
src_shape, dst_shape = self.batch[1][0].shape[:2], im.shape[2:]
r = 1.0 if self.segment_all else min(dst_shape[0] / src_shape[0], dst_shape[1] / src_shape[1])
# Transform input prompts
if points is not None:
points = torch.as_tensor(points, dtype=torch.float32, device=self.device)
points = points[None] if points.ndim == 1 else points
# Assuming labels are all positive if users don't pass labels.
if labels is None:
labels = np.ones(points.shape[0])
labels = torch.as_tensor(labels, dtype=torch.int32, device=self.device)
points *= r
# (N, 2) --> (N, 1, 2), (N, ) --> (N, 1)
points, labels = points[:, None, :], labels[:, None]
if bboxes is not None:
bboxes = torch.as_tensor(bboxes, dtype=torch.float32, device=self.device)
bboxes = bboxes[None] if bboxes.ndim == 1 else bboxes
bboxes *= r
if masks is not None:
masks = torch.as_tensor(masks, dtype=torch.float32, device=self.device).unsqueeze(1)
points = (points, labels) if points is not None else None
# Embed prompts
sparse_embeddings, dense_embeddings = self.model.prompt_encoder(
points=points,
boxes=bboxes,
masks=masks,
)
# Predict masks
pred_masks, pred_scores = self.model.mask_decoder(
image_embeddings=features,
image_pe=self.model.prompt_encoder.get_dense_pe(),
sparse_prompt_embeddings=sparse_embeddings,
dense_prompt_embeddings=dense_embeddings,
multimask_output=multimask_output,
)
# (N, d, H, W) --> (N*d, H, W), (N, d) --> (N*d, )
# `d` could be 1 or 3 depends on `multimask_output`.
return pred_masks.flatten(0, 1), pred_scores.flatten(0, 1)
def generate(self,
im,
crop_n_layers=0,
crop_overlap_ratio=512 / 1500,
crop_downscale_factor=1,
point_grids=None,
points_stride=32,
points_batch_size=64,
conf_thres=0.88,
stability_score_thresh=0.95,
stability_score_offset=0.95,
crop_nms_thresh=0.7):
"""Segment the whole image.
Args:
im (torch.Tensor): The preprocessed image, (N, C, H, W).
crop_n_layers (int): If >0, mask prediction will be run again on
crops of the image. Sets the number of layers to run, where each
layer has 2**i_layer number of image crops.
crop_overlap_ratio (float): Sets the degree to which crops overlap.
In the first crop layer, crops will overlap by this fraction of
the image length. Later layers with more crops scale down this overlap.
crop_downscale_factor (int): The number of points-per-side
sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
point_grids (list(np.ndarray), None): A list over explicit grids
of points used for sampling, normalized to [0,1]. The nth grid in the
list is used in the nth crop layer. Exclusive with points_per_side.
points_stride (int, None): The number of points to be sampled
along one side of the image. The total number of points is
points_per_side**2. If None, 'point_grids' must provide explicit
point sampling.
points_batch_size (int): Sets the number of points run simultaneously
by the model. Higher numbers may be faster but use more GPU memory.
conf_thres (float): A filtering threshold in [0,1], using the
model's predicted mask quality.
stability_score_thresh (float): A filtering threshold in [0,1], using
the stability of the mask under changes to the cutoff used to binarize
the model's mask predictions.
stability_score_offset (float): The amount to shift the cutoff when
calculated the stability score.
crop_nms_thresh (float): The box IoU cutoff used by non-maximal
suppression to filter duplicate masks between different crops.
"""
self.segment_all = True
ih, iw = im.shape[2:]
crop_regions, layer_idxs = generate_crop_boxes((ih, iw), crop_n_layers, crop_overlap_ratio)
if point_grids is None:
point_grids = build_all_layer_point_grids(
points_stride,
crop_n_layers,
crop_downscale_factor,
)
pred_masks, pred_scores, pred_bboxes, region_areas = [], [], [], []
for crop_region, layer_idx in zip(crop_regions, layer_idxs):
x1, y1, x2, y2 = crop_region
w, h = x2 - x1, y2 - y1
area = torch.tensor(w * h, device=im.device)
points_scale = np.array([[w, h]]) # w, h
# Crop image and interpolate to input size
crop_im = F.interpolate(im[..., y1:y2, x1:x2], (ih, iw), mode='bilinear', align_corners=False)
# (num_points, 2)
points_for_image = point_grids[layer_idx] * points_scale
crop_masks, crop_scores, crop_bboxes = [], [], []
for (points, ) in batch_iterator(points_batch_size, points_for_image):
pred_mask, pred_score = self.prompt_inference(crop_im, points=points, multimask_output=True)
# Interpolate predicted masks to input size
pred_mask = F.interpolate(pred_mask[None], (h, w), mode='bilinear', align_corners=False)[0]
idx = pred_score > conf_thres
pred_mask, pred_score = pred_mask[idx], pred_score[idx]
stability_score = calculate_stability_score(pred_mask, self.model.mask_threshold,
stability_score_offset)
idx = stability_score > stability_score_thresh
pred_mask, pred_score = pred_mask[idx], pred_score[idx]
# Bool type is much more memory-efficient.
pred_mask = pred_mask > self.model.mask_threshold
# (N, 4)
pred_bbox = batched_mask_to_box(pred_mask).float()
keep_mask = ~is_box_near_crop_edge(pred_bbox, crop_region, [0, 0, iw, ih])
if not torch.all(keep_mask):
pred_bbox, pred_mask, pred_score = pred_bbox[keep_mask], pred_mask[keep_mask], pred_score[keep_mask]
crop_masks.append(pred_mask)
crop_bboxes.append(pred_bbox)
crop_scores.append(pred_score)
# Do nms within this crop
crop_masks = torch.cat(crop_masks)
crop_bboxes = torch.cat(crop_bboxes)
crop_scores = torch.cat(crop_scores)
keep = torchvision.ops.nms(crop_bboxes, crop_scores, self.args.iou) # NMS
crop_bboxes = uncrop_boxes_xyxy(crop_bboxes[keep], crop_region)
crop_masks = uncrop_masks(crop_masks[keep], crop_region, ih, iw)
crop_scores = crop_scores[keep]
pred_masks.append(crop_masks)
pred_bboxes.append(crop_bboxes)
pred_scores.append(crop_scores)
region_areas.append(area.expand(len(crop_masks)))
pred_masks = torch.cat(pred_masks)
pred_bboxes = torch.cat(pred_bboxes)
pred_scores = torch.cat(pred_scores)
region_areas = torch.cat(region_areas)
# Remove duplicate masks between crops
if len(crop_regions) > 1:
scores = 1 / region_areas
keep = torchvision.ops.nms(pred_bboxes, scores, crop_nms_thresh)
pred_masks, pred_bboxes, pred_scores = pred_masks[keep], pred_bboxes[keep], pred_scores[keep]
return pred_masks, pred_scores, pred_bboxes
def setup_model(self, model, verbose=True):
"""Set up YOLO model with specified thresholds and device."""
device = select_device(self.args.device, verbose=verbose)
if model is None:
model = build_sam(self.args.model)
model.eval()
self.model = model.to(device)
self.device = device
self.mean = torch.tensor([123.675, 116.28, 103.53]).view(-1, 1, 1).to(device)
self.std = torch.tensor([58.395, 57.12, 57.375]).view(-1, 1, 1).to(device)
# TODO: Temporary settings for compatibility
self.model.pt = False
self.model.triton = False
self.model.stride = 32
self.model.fp16 = False
self.done_warmup = True
def postprocess(self, preds, img, orig_imgs):
"""Post-processes inference output predictions to create detection masks for objects."""
# (N, 1, H, W), (N, 1)
pred_masks, pred_scores = preds[:2]
pred_bboxes = preds[2] if self.segment_all else None
names = dict(enumerate(str(i) for i in range(len(pred_masks))))
if not isinstance(orig_imgs, list): # input images are a torch.Tensor, not a list
orig_imgs = ops.convert_torch2numpy_batch(orig_imgs)
results = []
for i, masks in enumerate([pred_masks]):
orig_img = orig_imgs[i]
if pred_bboxes is not None:
pred_bboxes = ops.scale_boxes(img.shape[2:], pred_bboxes.float(), orig_img.shape, padding=False)
cls = torch.arange(len(pred_masks), dtype=torch.int32, device=pred_masks.device)
pred_bboxes = torch.cat([pred_bboxes, pred_scores[:, None], cls[:, None]], dim=-1)
masks = ops.scale_masks(masks[None].float(), orig_img.shape[:2], padding=False)[0]
masks = masks > self.model.mask_threshold # to bool
img_path = self.batch[0][i]
results.append(Results(orig_img, path=img_path, names=names, masks=masks, boxes=pred_bboxes))
# Reset segment-all mode.
self.segment_all = False
return results
def setup_source(self, source):
"""Sets up source and inference mode."""
if source is not None:
super().setup_source(source)
def set_image(self, image):
"""Set image in advance.
Args:
image (str | np.ndarray): image file path or np.ndarray image by cv2.
"""
if self.model is None:
model = build_sam(self.args.model)
self.setup_model(model)
self.setup_source(image)
assert len(self.dataset) == 1, '`set_image` only supports setting one image!'
for batch in self.dataset:
im = self.preprocess(batch[1])
self.features = self.model.image_encoder(im)
self.im = im
break
def set_prompts(self, prompts):
"""Set prompts in advance."""
self.prompts = prompts
def reset_image(self):
self.im = None
self.features = None
@staticmethod
def remove_small_regions(masks, min_area=0, nms_thresh=0.7):
"""
Removes small disconnected regions and holes in masks, then reruns
box NMS to remove any new duplicates. Requires open-cv as a dependency.
Args:
masks (torch.Tensor): Masks, (N, H, W).
min_area (int): Minimum area threshold.
nms_thresh (float): NMS threshold.
Returns:
new_masks (torch.Tensor): New Masks, (N, H, W).
keep (List[int]): The indices of the new masks, which can be used to filter
the corresponding boxes.
"""
if len(masks) == 0:
return masks
# Filter small disconnected regions and holes
new_masks = []
scores = []
for mask in masks:
mask = mask.cpu().numpy().astype(np.uint8)
mask, changed = remove_small_regions(mask, min_area, mode='holes')
unchanged = not changed
mask, changed = remove_small_regions(mask, min_area, mode='islands')
unchanged = unchanged and not changed
new_masks.append(torch.as_tensor(mask).unsqueeze(0))
# Give score=0 to changed masks and score=1 to unchanged masks
# so NMS will prefer ones that didn't need postprocessing
scores.append(float(unchanged))
# Recalculate boxes and remove any new duplicates
new_masks = torch.cat(new_masks, dim=0)
boxes = batched_mask_to_box(new_masks)
keep = torchvision.ops.nms(
boxes.float(),
torch.as_tensor(scores),
nms_thresh,
)
return new_masks[keep].to(device=masks.device, dtype=masks.dtype), keep

View File

@ -0,0 +1 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

View File

@ -0,0 +1,297 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import torch
import torch.nn as nn
import torch.nn.functional as F
from ultralytics.utils.loss import FocalLoss, VarifocalLoss
from ultralytics.utils.metrics import bbox_iou
from .ops import HungarianMatcher
class DETRLoss(nn.Module):
def __init__(self,
nc=80,
loss_gain=None,
aux_loss=True,
use_fl=True,
use_vfl=False,
use_uni_match=False,
uni_match_ind=0):
"""
DETR loss function.
Args:
nc (int): The number of classes.
loss_gain (dict): The coefficient of loss.
aux_loss (bool): If 'aux_loss = True', loss at each decoder layer are to be used.
use_vfl (bool): Use VarifocalLoss or not.
use_uni_match (bool): Whether to use a fixed layer to assign labels for auxiliary branch.
uni_match_ind (int): The fixed indices of a layer.
"""
super().__init__()
if loss_gain is None:
loss_gain = {'class': 1, 'bbox': 5, 'giou': 2, 'no_object': 0.1, 'mask': 1, 'dice': 1}
self.nc = nc
self.matcher = HungarianMatcher(cost_gain={'class': 2, 'bbox': 5, 'giou': 2})
self.loss_gain = loss_gain
self.aux_loss = aux_loss
self.fl = FocalLoss() if use_fl else None
self.vfl = VarifocalLoss() if use_vfl else None
self.use_uni_match = use_uni_match
self.uni_match_ind = uni_match_ind
self.device = None
def _get_loss_class(self, pred_scores, targets, gt_scores, num_gts, postfix=''):
# logits: [b, query, num_classes], gt_class: list[[n, 1]]
name_class = f'loss_class{postfix}'
bs, nq = pred_scores.shape[:2]
# one_hot = F.one_hot(targets, self.nc + 1)[..., :-1] # (bs, num_queries, num_classes)
one_hot = torch.zeros((bs, nq, self.nc + 1), dtype=torch.int64, device=targets.device)
one_hot.scatter_(2, targets.unsqueeze(-1), 1)
one_hot = one_hot[..., :-1]
gt_scores = gt_scores.view(bs, nq, 1) * one_hot
if self.fl:
if num_gts and self.vfl:
loss_cls = self.vfl(pred_scores, gt_scores, one_hot)
else:
loss_cls = self.fl(pred_scores, one_hot.float())
loss_cls /= max(num_gts, 1) / nq
else:
loss_cls = nn.BCEWithLogitsLoss(reduction='none')(pred_scores, gt_scores).mean(1).sum() # YOLO CLS loss
return {name_class: loss_cls.squeeze() * self.loss_gain['class']}
def _get_loss_bbox(self, pred_bboxes, gt_bboxes, postfix=''):
# boxes: [b, query, 4], gt_bbox: list[[n, 4]]
name_bbox = f'loss_bbox{postfix}'
name_giou = f'loss_giou{postfix}'
loss = {}
if len(gt_bboxes) == 0:
loss[name_bbox] = torch.tensor(0., device=self.device)
loss[name_giou] = torch.tensor(0., device=self.device)
return loss
loss[name_bbox] = self.loss_gain['bbox'] * F.l1_loss(pred_bboxes, gt_bboxes, reduction='sum') / len(gt_bboxes)
loss[name_giou] = 1.0 - bbox_iou(pred_bboxes, gt_bboxes, xywh=True, GIoU=True)
loss[name_giou] = loss[name_giou].sum() / len(gt_bboxes)
loss[name_giou] = self.loss_gain['giou'] * loss[name_giou]
return {k: v.squeeze() for k, v in loss.items()}
# This function is for future RT-DETR Segment models
# def _get_loss_mask(self, masks, gt_mask, match_indices, postfix=''):
# # masks: [b, query, h, w], gt_mask: list[[n, H, W]]
# name_mask = f'loss_mask{postfix}'
# name_dice = f'loss_dice{postfix}'
#
# loss = {}
# if sum(len(a) for a in gt_mask) == 0:
# loss[name_mask] = torch.tensor(0., device=self.device)
# loss[name_dice] = torch.tensor(0., device=self.device)
# return loss
#
# num_gts = len(gt_mask)
# src_masks, target_masks = self._get_assigned_bboxes(masks, gt_mask, match_indices)
# src_masks = F.interpolate(src_masks.unsqueeze(0), size=target_masks.shape[-2:], mode='bilinear')[0]
# # TODO: torch does not have `sigmoid_focal_loss`, but it's not urgent since we don't use mask branch for now.
# loss[name_mask] = self.loss_gain['mask'] * F.sigmoid_focal_loss(src_masks, target_masks,
# torch.tensor([num_gts], dtype=torch.float32))
# loss[name_dice] = self.loss_gain['dice'] * self._dice_loss(src_masks, target_masks, num_gts)
# return loss
# This function is for future RT-DETR Segment models
# @staticmethod
# def _dice_loss(inputs, targets, num_gts):
# inputs = F.sigmoid(inputs).flatten(1)
# targets = targets.flatten(1)
# numerator = 2 * (inputs * targets).sum(1)
# denominator = inputs.sum(-1) + targets.sum(-1)
# loss = 1 - (numerator + 1) / (denominator + 1)
# return loss.sum() / num_gts
def _get_loss_aux(self,
pred_bboxes,
pred_scores,
gt_bboxes,
gt_cls,
gt_groups,
match_indices=None,
postfix='',
masks=None,
gt_mask=None):
"""Get auxiliary losses"""
# NOTE: loss class, bbox, giou, mask, dice
loss = torch.zeros(5 if masks is not None else 3, device=pred_bboxes.device)
if match_indices is None and self.use_uni_match:
match_indices = self.matcher(pred_bboxes[self.uni_match_ind],
pred_scores[self.uni_match_ind],
gt_bboxes,
gt_cls,
gt_groups,
masks=masks[self.uni_match_ind] if masks is not None else None,
gt_mask=gt_mask)
for i, (aux_bboxes, aux_scores) in enumerate(zip(pred_bboxes, pred_scores)):
aux_masks = masks[i] if masks is not None else None
loss_ = self._get_loss(aux_bboxes,
aux_scores,
gt_bboxes,
gt_cls,
gt_groups,
masks=aux_masks,
gt_mask=gt_mask,
postfix=postfix,
match_indices=match_indices)
loss[0] += loss_[f'loss_class{postfix}']
loss[1] += loss_[f'loss_bbox{postfix}']
loss[2] += loss_[f'loss_giou{postfix}']
# if masks is not None and gt_mask is not None:
# loss_ = self._get_loss_mask(aux_masks, gt_mask, match_indices, postfix)
# loss[3] += loss_[f'loss_mask{postfix}']
# loss[4] += loss_[f'loss_dice{postfix}']
loss = {
f'loss_class_aux{postfix}': loss[0],
f'loss_bbox_aux{postfix}': loss[1],
f'loss_giou_aux{postfix}': loss[2]}
# if masks is not None and gt_mask is not None:
# loss[f'loss_mask_aux{postfix}'] = loss[3]
# loss[f'loss_dice_aux{postfix}'] = loss[4]
return loss
@staticmethod
def _get_index(match_indices):
batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(match_indices)])
src_idx = torch.cat([src for (src, _) in match_indices])
dst_idx = torch.cat([dst for (_, dst) in match_indices])
return (batch_idx, src_idx), dst_idx
def _get_assigned_bboxes(self, pred_bboxes, gt_bboxes, match_indices):
pred_assigned = torch.cat([
t[I] if len(I) > 0 else torch.zeros(0, t.shape[-1], device=self.device)
for t, (I, _) in zip(pred_bboxes, match_indices)])
gt_assigned = torch.cat([
t[J] if len(J) > 0 else torch.zeros(0, t.shape[-1], device=self.device)
for t, (_, J) in zip(gt_bboxes, match_indices)])
return pred_assigned, gt_assigned
def _get_loss(self,
pred_bboxes,
pred_scores,
gt_bboxes,
gt_cls,
gt_groups,
masks=None,
gt_mask=None,
postfix='',
match_indices=None):
"""Get losses"""
if match_indices is None:
match_indices = self.matcher(pred_bboxes,
pred_scores,
gt_bboxes,
gt_cls,
gt_groups,
masks=masks,
gt_mask=gt_mask)
idx, gt_idx = self._get_index(match_indices)
pred_bboxes, gt_bboxes = pred_bboxes[idx], gt_bboxes[gt_idx]
bs, nq = pred_scores.shape[:2]
targets = torch.full((bs, nq), self.nc, device=pred_scores.device, dtype=gt_cls.dtype)
targets[idx] = gt_cls[gt_idx]
gt_scores = torch.zeros([bs, nq], device=pred_scores.device)
if len(gt_bboxes):
gt_scores[idx] = bbox_iou(pred_bboxes.detach(), gt_bboxes, xywh=True).squeeze(-1)
loss = {}
loss.update(self._get_loss_class(pred_scores, targets, gt_scores, len(gt_bboxes), postfix))
loss.update(self._get_loss_bbox(pred_bboxes, gt_bboxes, postfix))
# if masks is not None and gt_mask is not None:
# loss.update(self._get_loss_mask(masks, gt_mask, match_indices, postfix))
return loss
def forward(self, pred_bboxes, pred_scores, batch, postfix='', **kwargs):
"""
Args:
pred_bboxes (torch.Tensor): [l, b, query, 4]
pred_scores (torch.Tensor): [l, b, query, num_classes]
batch (dict): A dict includes:
gt_cls (torch.Tensor) with shape [num_gts, ],
gt_bboxes (torch.Tensor): [num_gts, 4],
gt_groups (List(int)): a list of batch size length includes the number of gts of each image.
postfix (str): postfix of loss name.
"""
self.device = pred_bboxes.device
match_indices = kwargs.get('match_indices', None)
gt_cls, gt_bboxes, gt_groups = batch['cls'], batch['bboxes'], batch['gt_groups']
total_loss = self._get_loss(pred_bboxes[-1],
pred_scores[-1],
gt_bboxes,
gt_cls,
gt_groups,
postfix=postfix,
match_indices=match_indices)
if self.aux_loss:
total_loss.update(
self._get_loss_aux(pred_bboxes[:-1], pred_scores[:-1], gt_bboxes, gt_cls, gt_groups, match_indices,
postfix))
return total_loss
class RTDETRDetectionLoss(DETRLoss):
def forward(self, preds, batch, dn_bboxes=None, dn_scores=None, dn_meta=None):
pred_bboxes, pred_scores = preds
total_loss = super().forward(pred_bboxes, pred_scores, batch)
if dn_meta is not None:
dn_pos_idx, dn_num_group = dn_meta['dn_pos_idx'], dn_meta['dn_num_group']
assert len(batch['gt_groups']) == len(dn_pos_idx)
# Denoising match indices
match_indices = self.get_dn_match_indices(dn_pos_idx, dn_num_group, batch['gt_groups'])
# Compute denoising training loss
dn_loss = super().forward(dn_bboxes, dn_scores, batch, postfix='_dn', match_indices=match_indices)
total_loss.update(dn_loss)
else:
total_loss.update({f'{k}_dn': torch.tensor(0., device=self.device) for k in total_loss.keys()})
return total_loss
@staticmethod
def get_dn_match_indices(dn_pos_idx, dn_num_group, gt_groups):
"""
Get the match indices for denoising.
Args:
dn_pos_idx (List[torch.Tensor]): A list includes positive indices of denoising.
dn_num_group (int): The number of groups of denoising.
gt_groups (List(int)): a list of batch size length includes the number of gts of each image.
Returns:
dn_match_indices (List(tuple)): Matched indices.
"""
dn_match_indices = []
idx_groups = torch.as_tensor([0, *gt_groups[:-1]]).cumsum_(0)
for i, num_gt in enumerate(gt_groups):
if num_gt > 0:
gt_idx = torch.arange(end=num_gt, dtype=torch.long) + idx_groups[i]
gt_idx = gt_idx.repeat(dn_num_group)
assert len(dn_pos_idx[i]) == len(gt_idx), 'Expected the same length, '
f'but got {len(dn_pos_idx[i])} and {len(gt_idx)} respectively.'
dn_match_indices.append((dn_pos_idx[i], gt_idx))
else:
dn_match_indices.append((torch.zeros([0], dtype=torch.long), torch.zeros([0], dtype=torch.long)))
return dn_match_indices

View File

@ -0,0 +1,261 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy.optimize import linear_sum_assignment
from ultralytics.utils.metrics import bbox_iou
from ultralytics.utils.ops import xywh2xyxy, xyxy2xywh
class HungarianMatcher(nn.Module):
"""
A module implementing the HungarianMatcher, which is a differentiable module to solve the assignment problem in
an end-to-end fashion.
HungarianMatcher performs optimal assignment over the predicted and ground truth bounding boxes using a cost
function that considers classification scores, bounding box coordinates, and optionally, mask predictions.
Attributes:
cost_gain (dict): Dictionary of cost coefficients: 'class', 'bbox', 'giou', 'mask', and 'dice'.
use_fl (bool): Indicates whether to use Focal Loss for the classification cost calculation.
with_mask (bool): Indicates whether the model makes mask predictions.
num_sample_points (int): The number of sample points used in mask cost calculation.
alpha (float): The alpha factor in Focal Loss calculation.
gamma (float): The gamma factor in Focal Loss calculation.
Methods:
forward(pred_bboxes, pred_scores, gt_bboxes, gt_cls, gt_groups, masks=None, gt_mask=None): Computes the
assignment between predictions and ground truths for a batch.
_cost_mask(bs, num_gts, masks=None, gt_mask=None): Computes the mask cost and dice cost if masks are predicted.
"""
def __init__(self, cost_gain=None, use_fl=True, with_mask=False, num_sample_points=12544, alpha=0.25, gamma=2.0):
super().__init__()
if cost_gain is None:
cost_gain = {'class': 1, 'bbox': 5, 'giou': 2, 'mask': 1, 'dice': 1}
self.cost_gain = cost_gain
self.use_fl = use_fl
self.with_mask = with_mask
self.num_sample_points = num_sample_points
self.alpha = alpha
self.gamma = gamma
def forward(self, pred_bboxes, pred_scores, gt_bboxes, gt_cls, gt_groups, masks=None, gt_mask=None):
"""
Forward pass for HungarianMatcher. This function computes costs based on prediction and ground truth
(classification cost, L1 cost between boxes and GIoU cost between boxes) and finds the optimal matching
between predictions and ground truth based on these costs.
Args:
pred_bboxes (Tensor): Predicted bounding boxes with shape [batch_size, num_queries, 4].
pred_scores (Tensor): Predicted scores with shape [batch_size, num_queries, num_classes].
gt_cls (torch.Tensor): Ground truth classes with shape [num_gts, ].
gt_bboxes (torch.Tensor): Ground truth bounding boxes with shape [num_gts, 4].
gt_groups (List[int]): List of length equal to batch size, containing the number of ground truths for
each image.
masks (Tensor, optional): Predicted masks with shape [batch_size, num_queries, height, width].
Defaults to None.
gt_mask (List[Tensor], optional): List of ground truth masks, each with shape [num_masks, Height, Width].
Defaults to None.
Returns:
(List[Tuple[Tensor, Tensor]]): A list of size batch_size, each element is a tuple (index_i, index_j), where:
- index_i is the tensor of indices of the selected predictions (in order)
- index_j is the tensor of indices of the corresponding selected ground truth targets (in order)
For each batch element, it holds:
len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
"""
bs, nq, nc = pred_scores.shape
if sum(gt_groups) == 0:
return [(torch.tensor([], dtype=torch.long), torch.tensor([], dtype=torch.long)) for _ in range(bs)]
# We flatten to compute the cost matrices in a batch
# [batch_size * num_queries, num_classes]
pred_scores = pred_scores.detach().view(-1, nc)
pred_scores = F.sigmoid(pred_scores) if self.use_fl else F.softmax(pred_scores, dim=-1)
# [batch_size * num_queries, 4]
pred_bboxes = pred_bboxes.detach().view(-1, 4)
# Compute the classification cost
pred_scores = pred_scores[:, gt_cls]
if self.use_fl:
neg_cost_class = (1 - self.alpha) * (pred_scores ** self.gamma) * (-(1 - pred_scores + 1e-8).log())
pos_cost_class = self.alpha * ((1 - pred_scores) ** self.gamma) * (-(pred_scores + 1e-8).log())
cost_class = pos_cost_class - neg_cost_class
else:
cost_class = -pred_scores
# Compute the L1 cost between boxes
cost_bbox = (pred_bboxes.unsqueeze(1) - gt_bboxes.unsqueeze(0)).abs().sum(-1) # (bs*num_queries, num_gt)
# Compute the GIoU cost between boxes, (bs*num_queries, num_gt)
cost_giou = 1.0 - bbox_iou(pred_bboxes.unsqueeze(1), gt_bboxes.unsqueeze(0), xywh=True, GIoU=True).squeeze(-1)
# Final cost matrix
C = self.cost_gain['class'] * cost_class + \
self.cost_gain['bbox'] * cost_bbox + \
self.cost_gain['giou'] * cost_giou
# Compute the mask cost and dice cost
if self.with_mask:
C += self._cost_mask(bs, gt_groups, masks, gt_mask)
C = C.view(bs, nq, -1).cpu()
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(gt_groups, -1))]
gt_groups = torch.as_tensor([0, *gt_groups[:-1]]).cumsum_(0)
# (idx for queries, idx for gt)
return [(torch.tensor(i, dtype=torch.long), torch.tensor(j, dtype=torch.long) + gt_groups[k])
for k, (i, j) in enumerate(indices)]
# This function is for future RT-DETR Segment models
# def _cost_mask(self, bs, num_gts, masks=None, gt_mask=None):
# assert masks is not None and gt_mask is not None, 'Make sure the input has `mask` and `gt_mask`'
# # all masks share the same set of points for efficient matching
# sample_points = torch.rand([bs, 1, self.num_sample_points, 2])
# sample_points = 2.0 * sample_points - 1.0
#
# out_mask = F.grid_sample(masks.detach(), sample_points, align_corners=False).squeeze(-2)
# out_mask = out_mask.flatten(0, 1)
#
# tgt_mask = torch.cat(gt_mask).unsqueeze(1)
# sample_points = torch.cat([a.repeat(b, 1, 1, 1) for a, b in zip(sample_points, num_gts) if b > 0])
# tgt_mask = F.grid_sample(tgt_mask, sample_points, align_corners=False).squeeze([1, 2])
#
# with torch.cuda.amp.autocast(False):
# # binary cross entropy cost
# pos_cost_mask = F.binary_cross_entropy_with_logits(out_mask, torch.ones_like(out_mask), reduction='none')
# neg_cost_mask = F.binary_cross_entropy_with_logits(out_mask, torch.zeros_like(out_mask), reduction='none')
# cost_mask = torch.matmul(pos_cost_mask, tgt_mask.T) + torch.matmul(neg_cost_mask, 1 - tgt_mask.T)
# cost_mask /= self.num_sample_points
#
# # dice cost
# out_mask = F.sigmoid(out_mask)
# numerator = 2 * torch.matmul(out_mask, tgt_mask.T)
# denominator = out_mask.sum(-1, keepdim=True) + tgt_mask.sum(-1).unsqueeze(0)
# cost_dice = 1 - (numerator + 1) / (denominator + 1)
#
# C = self.cost_gain['mask'] * cost_mask + self.cost_gain['dice'] * cost_dice
# return C
def get_cdn_group(batch,
num_classes,
num_queries,
class_embed,
num_dn=100,
cls_noise_ratio=0.5,
box_noise_scale=1.0,
training=False):
"""
Get contrastive denoising training group. This function creates a contrastive denoising training group with
positive and negative samples from the ground truths (gt). It applies noise to the class labels and bounding
box coordinates, and returns the modified labels, bounding boxes, attention mask and meta information.
Args:
batch (dict): A dict that includes 'gt_cls' (torch.Tensor with shape [num_gts, ]), 'gt_bboxes'
(torch.Tensor with shape [num_gts, 4]), 'gt_groups' (List(int)) which is a list of batch size length
indicating the number of gts of each image.
num_classes (int): Number of classes.
num_queries (int): Number of queries.
class_embed (torch.Tensor): Embedding weights to map class labels to embedding space.
num_dn (int, optional): Number of denoising. Defaults to 100.
cls_noise_ratio (float, optional): Noise ratio for class labels. Defaults to 0.5.
box_noise_scale (float, optional): Noise scale for bounding box coordinates. Defaults to 1.0.
training (bool, optional): If it's in training mode. Defaults to False.
Returns:
(Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor], Optional[Dict]]): The modified class embeddings,
bounding boxes, attention mask and meta information for denoising. If not in training mode or 'num_dn'
is less than or equal to 0, the function returns None for all elements in the tuple.
"""
if (not training) or num_dn <= 0:
return None, None, None, None
gt_groups = batch['gt_groups']
total_num = sum(gt_groups)
max_nums = max(gt_groups)
if max_nums == 0:
return None, None, None, None
num_group = num_dn // max_nums
num_group = 1 if num_group == 0 else num_group
# pad gt to max_num of a batch
bs = len(gt_groups)
gt_cls = batch['cls'] # (bs*num, )
gt_bbox = batch['bboxes'] # bs*num, 4
b_idx = batch['batch_idx']
# each group has positive and negative queries.
dn_cls = gt_cls.repeat(2 * num_group) # (2*num_group*bs*num, )
dn_bbox = gt_bbox.repeat(2 * num_group, 1) # 2*num_group*bs*num, 4
dn_b_idx = b_idx.repeat(2 * num_group).view(-1) # (2*num_group*bs*num, )
# positive and negative mask
# (bs*num*num_group, ), the second total_num*num_group part as negative samples
neg_idx = torch.arange(total_num * num_group, dtype=torch.long, device=gt_bbox.device) + num_group * total_num
if cls_noise_ratio > 0:
# half of bbox prob
mask = torch.rand(dn_cls.shape) < (cls_noise_ratio * 0.5)
idx = torch.nonzero(mask).squeeze(-1)
# randomly put a new one here
new_label = torch.randint_like(idx, 0, num_classes, dtype=dn_cls.dtype, device=dn_cls.device)
dn_cls[idx] = new_label
if box_noise_scale > 0:
known_bbox = xywh2xyxy(dn_bbox)
diff = (dn_bbox[..., 2:] * 0.5).repeat(1, 2) * box_noise_scale # 2*num_group*bs*num, 4
rand_sign = torch.randint_like(dn_bbox, 0, 2) * 2.0 - 1.0
rand_part = torch.rand_like(dn_bbox)
rand_part[neg_idx] += 1.0
rand_part *= rand_sign
known_bbox += rand_part * diff
known_bbox.clip_(min=0.0, max=1.0)
dn_bbox = xyxy2xywh(known_bbox)
dn_bbox = inverse_sigmoid(dn_bbox)
# total denoising queries
num_dn = int(max_nums * 2 * num_group)
# class_embed = torch.cat([class_embed, torch.zeros([1, class_embed.shape[-1]], device=class_embed.device)])
dn_cls_embed = class_embed[dn_cls] # bs*num * 2 * num_group, 256
padding_cls = torch.zeros(bs, num_dn, dn_cls_embed.shape[-1], device=gt_cls.device)
padding_bbox = torch.zeros(bs, num_dn, 4, device=gt_bbox.device)
map_indices = torch.cat([torch.tensor(range(num), dtype=torch.long) for num in gt_groups])
pos_idx = torch.stack([map_indices + max_nums * i for i in range(num_group)], dim=0)
map_indices = torch.cat([map_indices + max_nums * i for i in range(2 * num_group)])
padding_cls[(dn_b_idx, map_indices)] = dn_cls_embed
padding_bbox[(dn_b_idx, map_indices)] = dn_bbox
tgt_size = num_dn + num_queries
attn_mask = torch.zeros([tgt_size, tgt_size], dtype=torch.bool)
# match query cannot see the reconstruct
attn_mask[num_dn:, :num_dn] = True
# reconstruct cannot see each other
for i in range(num_group):
if i == 0:
attn_mask[max_nums * 2 * i:max_nums * 2 * (i + 1), max_nums * 2 * (i + 1):num_dn] = True
if i == num_group - 1:
attn_mask[max_nums * 2 * i:max_nums * 2 * (i + 1), :max_nums * i * 2] = True
else:
attn_mask[max_nums * 2 * i:max_nums * 2 * (i + 1), max_nums * 2 * (i + 1):num_dn] = True
attn_mask[max_nums * 2 * i:max_nums * 2 * (i + 1), :max_nums * 2 * i] = True
dn_meta = {
'dn_pos_idx': [p.reshape(-1) for p in pos_idx.cpu().split(list(gt_groups), dim=1)],
'dn_num_group': num_group,
'dn_num_split': [num_dn, num_queries]}
return padding_cls.to(class_embed.device), padding_bbox.to(class_embed.device), attn_mask.to(
class_embed.device), dn_meta
def inverse_sigmoid(x, eps=1e-6):
"""Inverse sigmoid function."""
x = x.clip(min=0., max=1.)
return torch.log(x / (1 - x + eps) + eps)

View File

@ -0,0 +1,7 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from ultralytics.models.yolo import classify, detect, pose, segment
from .model import YOLO
__all__ = 'classify', 'segment', 'detect', 'pose', 'YOLO'

View File

@ -0,0 +1,7 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from ultralytics.models.yolo.classify.predict import ClassificationPredictor
from ultralytics.models.yolo.classify.train import ClassificationTrainer
from ultralytics.models.yolo.classify.val import ClassificationValidator
__all__ = 'ClassificationPredictor', 'ClassificationTrainer', 'ClassificationValidator'

View File

@ -0,0 +1,49 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import torch
from ultralytics.engine.predictor import BasePredictor
from ultralytics.engine.results import Results
from ultralytics.utils import DEFAULT_CFG, ops
class ClassificationPredictor(BasePredictor):
"""
A class extending the BasePredictor class for prediction based on a classification model.
Notes:
- Torchvision classification models can also be passed to the 'model' argument, i.e. model='resnet18'.
Example:
```python
from ultralytics.utils import ASSETS
from ultralytics.models.yolo.classify import ClassificationPredictor
args = dict(model='yolov8n-cls.pt', source=ASSETS)
predictor = ClassificationPredictor(overrides=args)
predictor.predict_cli()
```
"""
def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
super().__init__(cfg, overrides, _callbacks)
self.args.task = 'classify'
def preprocess(self, img):
"""Converts input image to model-compatible data type."""
if not isinstance(img, torch.Tensor):
img = torch.stack([self.transforms(im) for im in img], dim=0)
img = (img if isinstance(img, torch.Tensor) else torch.from_numpy(img)).to(self.model.device)
return img.half() if self.model.fp16 else img.float() # uint8 to fp16/32
def postprocess(self, preds, img, orig_imgs):
"""Post-processes predictions to return Results objects."""
if not isinstance(orig_imgs, list): # input images are a torch.Tensor, not a list
orig_imgs = ops.convert_torch2numpy_batch(orig_imgs)
results = []
for i, pred in enumerate(preds):
orig_img = orig_imgs[i]
img_path = self.batch[0][i]
results.append(Results(orig_img, path=img_path, names=self.model.names, probs=pred))
return results

View File

@ -0,0 +1,150 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import torch
import torchvision
from ultralytics.data import ClassificationDataset, build_dataloader
from ultralytics.engine.trainer import BaseTrainer
from ultralytics.models import yolo
from ultralytics.nn.tasks import ClassificationModel, attempt_load_one_weight
from ultralytics.utils import DEFAULT_CFG, LOGGER, RANK, colorstr
from ultralytics.utils.plotting import plot_images, plot_results
from ultralytics.utils.torch_utils import is_parallel, strip_optimizer, torch_distributed_zero_first
class ClassificationTrainer(BaseTrainer):
"""
A class extending the BaseTrainer class for training based on a classification model.
Notes:
- Torchvision classification models can also be passed to the 'model' argument, i.e. model='resnet18'.
Example:
```python
from ultralytics.models.yolo.classify import ClassificationTrainer
args = dict(model='yolov8n-cls.pt', data='imagenet10', epochs=3)
trainer = ClassificationTrainer(overrides=args)
trainer.train()
```
"""
def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
"""Initialize a ClassificationTrainer object with optional configuration overrides and callbacks."""
if overrides is None:
overrides = {}
overrides['task'] = 'classify'
if overrides.get('imgsz') is None:
overrides['imgsz'] = 224
super().__init__(cfg, overrides, _callbacks)
def set_model_attributes(self):
"""Set the YOLO model's class names from the loaded dataset."""
self.model.names = self.data['names']
def get_model(self, cfg=None, weights=None, verbose=True):
"""Returns a modified PyTorch model configured for training YOLO."""
model = ClassificationModel(cfg, nc=self.data['nc'], verbose=verbose and RANK == -1)
if weights:
model.load(weights)
for m in model.modules():
if not self.args.pretrained and hasattr(m, 'reset_parameters'):
m.reset_parameters()
if isinstance(m, torch.nn.Dropout) and self.args.dropout:
m.p = self.args.dropout # set dropout
for p in model.parameters():
p.requires_grad = True # for training
return model
def setup_model(self):
"""Load, create or download model for any task."""
if isinstance(self.model, torch.nn.Module): # if model is loaded beforehand. No setup needed
return
model, ckpt = str(self.model), None
# Load a YOLO model locally, from torchvision, or from Ultralytics assets
if model.endswith('.pt'):
self.model, ckpt = attempt_load_one_weight(model, device='cpu')
for p in self.model.parameters():
p.requires_grad = True # for training
elif model.split('.')[-1] in ('yaml', 'yml'):
self.model = self.get_model(cfg=model)
elif model in torchvision.models.__dict__:
self.model = torchvision.models.__dict__[model](weights='IMAGENET1K_V1' if self.args.pretrained else None)
else:
FileNotFoundError(f'ERROR: model={model} not found locally or online. Please check model name.')
ClassificationModel.reshape_outputs(self.model, self.data['nc'])
return ckpt
def build_dataset(self, img_path, mode='train', batch=None):
return ClassificationDataset(root=img_path, args=self.args, augment=mode == 'train', prefix=mode)
def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode='train'):
"""Returns PyTorch DataLoader with transforms to preprocess images for inference."""
with torch_distributed_zero_first(rank): # init dataset *.cache only once if DDP
dataset = self.build_dataset(dataset_path, mode)
loader = build_dataloader(dataset, batch_size, self.args.workers, rank=rank)
# Attach inference transforms
if mode != 'train':
if is_parallel(self.model):
self.model.module.transforms = loader.dataset.torch_transforms
else:
self.model.transforms = loader.dataset.torch_transforms
return loader
def preprocess_batch(self, batch):
"""Preprocesses a batch of images and classes."""
batch['img'] = batch['img'].to(self.device)
batch['cls'] = batch['cls'].to(self.device)
return batch
def progress_string(self):
"""Returns a formatted string showing training progress."""
return ('\n' + '%11s' * (4 + len(self.loss_names))) % \
('Epoch', 'GPU_mem', *self.loss_names, 'Instances', 'Size')
def get_validator(self):
"""Returns an instance of ClassificationValidator for validation."""
self.loss_names = ['loss']
return yolo.classify.ClassificationValidator(self.test_loader, self.save_dir)
def label_loss_items(self, loss_items=None, prefix='train'):
"""
Returns a loss dict with labelled training loss items tensor. Not needed for classification but necessary for
segmentation & detection
"""
keys = [f'{prefix}/{x}' for x in self.loss_names]
if loss_items is None:
return keys
loss_items = [round(float(loss_items), 5)]
return dict(zip(keys, loss_items))
def plot_metrics(self):
"""Plots metrics from a CSV file."""
plot_results(file=self.csv, classify=True, on_plot=self.on_plot) # save results.png
def final_eval(self):
"""Evaluate trained model and save validation results."""
for f in self.last, self.best:
if f.exists():
strip_optimizer(f) # strip optimizers
if f is self.best:
LOGGER.info(f'\nValidating {f}...')
self.validator.args.data = self.args.data
self.validator.args.plots = self.args.plots
self.metrics = self.validator(model=f)
self.metrics.pop('fitness', None)
self.run_callbacks('on_fit_epoch_end')
LOGGER.info(f"Results saved to {colorstr('bold', self.save_dir)}")
def plot_training_samples(self, batch, ni):
"""Plots training samples with their annotations."""
plot_images(
images=batch['img'],
batch_idx=torch.arange(len(batch['img'])),
cls=batch['cls'].view(-1), # warning: use .view(), not .squeeze() for Classify models
fname=self.save_dir / f'train_batch{ni}.jpg',
on_plot=self.on_plot)

View File

@ -0,0 +1,110 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import torch
from ultralytics.data import ClassificationDataset, build_dataloader
from ultralytics.engine.validator import BaseValidator
from ultralytics.utils import LOGGER
from ultralytics.utils.metrics import ClassifyMetrics, ConfusionMatrix
from ultralytics.utils.plotting import plot_images
class ClassificationValidator(BaseValidator):
"""
A class extending the BaseValidator class for validation based on a classification model.
Notes:
- Torchvision classification models can also be passed to the 'model' argument, i.e. model='resnet18'.
Example:
```python
from ultralytics.models.yolo.classify import ClassificationValidator
args = dict(model='yolov8n-cls.pt', data='imagenet10')
validator = ClassificationValidator(args=args)
validator()
```
"""
def __init__(self, dataloader=None, save_dir=None, pbar=None, args=None, _callbacks=None):
"""Initializes ClassificationValidator instance with args, dataloader, save_dir, and progress bar."""
super().__init__(dataloader, save_dir, pbar, args, _callbacks)
self.targets = None
self.pred = None
self.args.task = 'classify'
self.metrics = ClassifyMetrics()
def get_desc(self):
"""Returns a formatted string summarizing classification metrics."""
return ('%22s' + '%11s' * 2) % ('classes', 'top1_acc', 'top5_acc')
def init_metrics(self, model):
"""Initialize confusion matrix, class names, and top-1 and top-5 accuracy."""
self.names = model.names
self.nc = len(model.names)
self.confusion_matrix = ConfusionMatrix(nc=self.nc, conf=self.args.conf, task='classify')
self.pred = []
self.targets = []
def preprocess(self, batch):
"""Preprocesses input batch and returns it."""
batch['img'] = batch['img'].to(self.device, non_blocking=True)
batch['img'] = batch['img'].half() if self.args.half else batch['img'].float()
batch['cls'] = batch['cls'].to(self.device)
return batch
def update_metrics(self, preds, batch):
"""Updates running metrics with model predictions and batch targets."""
n5 = min(len(self.names), 5)
self.pred.append(preds.argsort(1, descending=True)[:, :n5])
self.targets.append(batch['cls'])
def finalize_metrics(self, *args, **kwargs):
"""Finalizes metrics of the model such as confusion_matrix and speed."""
self.confusion_matrix.process_cls_preds(self.pred, self.targets)
if self.args.plots:
for normalize in True, False:
self.confusion_matrix.plot(save_dir=self.save_dir,
names=self.names.values(),
normalize=normalize,
on_plot=self.on_plot)
self.metrics.speed = self.speed
self.metrics.confusion_matrix = self.confusion_matrix
self.metrics.save_dir = self.save_dir
def get_stats(self):
"""Returns a dictionary of metrics obtained by processing targets and predictions."""
self.metrics.process(self.targets, self.pred)
return self.metrics.results_dict
def build_dataset(self, img_path):
return ClassificationDataset(root=img_path, args=self.args, augment=False, prefix=self.args.split)
def get_dataloader(self, dataset_path, batch_size):
"""Builds and returns a data loader for classification tasks with given parameters."""
dataset = self.build_dataset(dataset_path)
return build_dataloader(dataset, batch_size, self.args.workers, rank=-1)
def print_results(self):
"""Prints evaluation metrics for YOLO object detection model."""
pf = '%22s' + '%11.3g' * len(self.metrics.keys) # print format
LOGGER.info(pf % ('all', self.metrics.top1, self.metrics.top5))
def plot_val_samples(self, batch, ni):
"""Plot validation image samples."""
plot_images(
images=batch['img'],
batch_idx=torch.arange(len(batch['img'])),
cls=batch['cls'].view(-1), # warning: use .view(), not .squeeze() for Classify models
fname=self.save_dir / f'val_batch{ni}_labels.jpg',
names=self.names,
on_plot=self.on_plot)
def plot_predictions(self, batch, preds, ni):
"""Plots predicted bounding boxes on input images and saves the result."""
plot_images(batch['img'],
batch_idx=torch.arange(len(batch['img'])),
cls=torch.argmax(preds, dim=1),
fname=self.save_dir / f'val_batch{ni}_pred.jpg',
names=self.names,
on_plot=self.on_plot) # pred

View File

@ -0,0 +1,7 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from .predict import DetectionPredictor
from .train import DetectionTrainer
from .val import DetectionValidator
__all__ = 'DetectionPredictor', 'DetectionTrainer', 'DetectionValidator'

View File

@ -0,0 +1,41 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from ultralytics.engine.predictor import BasePredictor
from ultralytics.engine.results import Results
from ultralytics.utils import ops
class DetectionPredictor(BasePredictor):
"""
A class extending the BasePredictor class for prediction based on a detection model.
Example:
```python
from ultralytics.utils import ASSETS
from ultralytics.models.yolo.detect import DetectionPredictor
args = dict(model='yolov8n.pt', source=ASSETS)
predictor = DetectionPredictor(overrides=args)
predictor.predict_cli()
```
"""
def postprocess(self, preds, img, orig_imgs):
"""Post-processes predictions and returns a list of Results objects."""
preds = ops.non_max_suppression(preds,
self.args.conf,
self.args.iou,
agnostic=self.args.agnostic_nms,
max_det=self.args.max_det,
classes=self.args.classes)
if not isinstance(orig_imgs, list): # input images are a torch.Tensor, not a list
orig_imgs = ops.convert_torch2numpy_batch(orig_imgs)
results = []
for i, pred in enumerate(preds):
orig_img = orig_imgs[i]
pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
img_path = self.batch[0][i]
results.append(Results(orig_img, path=img_path, names=self.model.names, boxes=pred))
return results

View File

@ -0,0 +1,116 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from copy import copy
import numpy as np
from ultralytics.data import build_dataloader, build_yolo_dataset
from ultralytics.engine.trainer import BaseTrainer
from ultralytics.models import yolo
from ultralytics.nn.tasks import DetectionModel
from ultralytics.utils import LOGGER, RANK
from ultralytics.utils.plotting import plot_images, plot_labels, plot_results
from ultralytics.utils.torch_utils import de_parallel, torch_distributed_zero_first
class DetectionTrainer(BaseTrainer):
"""
A class extending the BaseTrainer class for training based on a detection model.
Example:
```python
from ultralytics.models.yolo.detect import DetectionTrainer
args = dict(model='yolov8n.pt', data='coco8.yaml', epochs=3)
trainer = DetectionTrainer(overrides=args)
trainer.train()
```
"""
def build_dataset(self, img_path, mode='train', batch=None):
"""
Build YOLO Dataset.
Args:
img_path (str): Path to the folder containing images.
mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
batch (int, optional): Size of batches, this is for `rect`. Defaults to None.
"""
gs = max(int(de_parallel(self.model).stride.max() if self.model else 0), 32)
return build_yolo_dataset(self.args, img_path, batch, self.data, mode=mode, rect=mode == 'val', stride=gs)
def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode='train'):
"""Construct and return dataloader."""
assert mode in ['train', 'val']
with torch_distributed_zero_first(rank): # init dataset *.cache only once if DDP
dataset = self.build_dataset(dataset_path, mode, batch_size)
shuffle = mode == 'train'
if getattr(dataset, 'rect', False) and shuffle:
LOGGER.warning("WARNING ⚠️ 'rect=True' is incompatible with DataLoader shuffle, setting shuffle=False")
shuffle = False
workers = self.args.workers if mode == 'train' else self.args.workers * 2
return build_dataloader(dataset, batch_size, workers, shuffle, rank) # return dataloader
def preprocess_batch(self, batch):
"""Preprocesses a batch of images by scaling and converting to float."""
batch['img'] = batch['img'].to(self.device, non_blocking=True).float() / 255
return batch
def set_model_attributes(self):
"""nl = de_parallel(self.model).model[-1].nl # number of detection layers (to scale hyps)."""
# self.args.box *= 3 / nl # scale to layers
# self.args.cls *= self.data["nc"] / 80 * 3 / nl # scale to classes and layers
# self.args.cls *= (self.args.imgsz / 640) ** 2 * 3 / nl # scale to image size and layers
self.model.nc = self.data['nc'] # attach number of classes to model
self.model.names = self.data['names'] # attach class names to model
self.model.args = self.args # attach hyperparameters to model
# TODO: self.model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc
def get_model(self, cfg=None, weights=None, verbose=True):
"""Return a YOLO detection model."""
model = DetectionModel(cfg, nc=self.data['nc'], verbose=verbose and RANK == -1)
if weights:
model.load(weights)
return model
def get_validator(self):
"""Returns a DetectionValidator for YOLO model validation."""
self.loss_names = 'box_loss', 'cls_loss', 'dfl_loss'
return yolo.detect.DetectionValidator(self.test_loader, save_dir=self.save_dir, args=copy(self.args))
def label_loss_items(self, loss_items=None, prefix='train'):
"""
Returns a loss dict with labelled training loss items tensor. Not needed for classification but necessary for
segmentation & detection
"""
keys = [f'{prefix}/{x}' for x in self.loss_names]
if loss_items is not None:
loss_items = [round(float(x), 5) for x in loss_items] # convert tensors to 5 decimal place floats
return dict(zip(keys, loss_items))
else:
return keys
def progress_string(self):
"""Returns a formatted string of training progress with epoch, GPU memory, loss, instances and size."""
return ('\n' + '%11s' *
(4 + len(self.loss_names))) % ('Epoch', 'GPU_mem', *self.loss_names, 'Instances', 'Size')
def plot_training_samples(self, batch, ni):
"""Plots training samples with their annotations."""
plot_images(images=batch['img'],
batch_idx=batch['batch_idx'],
cls=batch['cls'].squeeze(-1),
bboxes=batch['bboxes'],
paths=batch['im_file'],
fname=self.save_dir / f'train_batch{ni}.jpg',
on_plot=self.on_plot)
def plot_metrics(self):
"""Plots metrics from a CSV file."""
plot_results(file=self.csv, on_plot=self.on_plot) # save results.png
def plot_training_labels(self):
"""Create a labeled training plot of the YOLO model."""
boxes = np.concatenate([lb['bboxes'] for lb in self.train_loader.dataset.labels], 0)
cls = np.concatenate([lb['cls'] for lb in self.train_loader.dataset.labels], 0)
plot_labels(boxes, cls.squeeze(), names=self.data['names'], save_dir=self.save_dir, on_plot=self.on_plot)

View File

@ -0,0 +1,268 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import os
from pathlib import Path
import numpy as np
import torch
from ultralytics.data import build_dataloader, build_yolo_dataset, converter
from ultralytics.engine.validator import BaseValidator
from ultralytics.utils import LOGGER, ops
from ultralytics.utils.checks import check_requirements
from ultralytics.utils.metrics import ConfusionMatrix, DetMetrics, box_iou
from ultralytics.utils.plotting import output_to_target, plot_images
from ultralytics.utils.torch_utils import de_parallel
class DetectionValidator(BaseValidator):
"""
A class extending the BaseValidator class for validation based on a detection model.
Example:
```python
from ultralytics.models.yolo.detect import DetectionValidator
args = dict(model='yolov8n.pt', data='coco8.yaml')
validator = DetectionValidator(args=args)
validator()
```
"""
def __init__(self, dataloader=None, save_dir=None, pbar=None, args=None, _callbacks=None):
"""Initialize detection model with necessary variables and settings."""
super().__init__(dataloader, save_dir, pbar, args, _callbacks)
self.nt_per_class = None
self.is_coco = False
self.class_map = None
self.args.task = 'detect'
self.metrics = DetMetrics(save_dir=self.save_dir, on_plot=self.on_plot)
self.iouv = torch.linspace(0.5, 0.95, 10) # iou vector for mAP@0.5:0.95
self.niou = self.iouv.numel()
self.lb = [] # for autolabelling
def preprocess(self, batch):
"""Preprocesses batch of images for YOLO training."""
batch['img'] = batch['img'].to(self.device, non_blocking=True)
batch['img'] = (batch['img'].half() if self.args.half else batch['img'].float()) / 255
for k in ['batch_idx', 'cls', 'bboxes']:
batch[k] = batch[k].to(self.device)
if self.args.save_hybrid:
height, width = batch['img'].shape[2:]
nb = len(batch['img'])
bboxes = batch['bboxes'] * torch.tensor((width, height, width, height), device=self.device)
self.lb = [
torch.cat([batch['cls'][batch['batch_idx'] == i], bboxes[batch['batch_idx'] == i]], dim=-1)
for i in range(nb)] if self.args.save_hybrid else [] # for autolabelling
return batch
def init_metrics(self, model):
"""Initialize evaluation metrics for YOLO."""
val = self.data.get(self.args.split, '') # validation path
self.is_coco = isinstance(val, str) and 'coco' in val and val.endswith(f'{os.sep}val2017.txt') # is COCO
self.class_map = converter.coco80_to_coco91_class() if self.is_coco else list(range(1000))
self.args.save_json |= self.is_coco and not self.training # run on final val if training COCO
self.names = model.names
self.nc = len(model.names)
self.metrics.names = self.names
self.metrics.plot = self.args.plots
self.confusion_matrix = ConfusionMatrix(nc=self.nc, conf=self.args.conf)
self.seen = 0
self.jdict = []
self.stats = []
def get_desc(self):
"""Return a formatted string summarizing class metrics of YOLO model."""
return ('%22s' + '%11s' * 6) % ('Class', 'Images', 'Instances', 'Box(P', 'R', 'mAP50', 'mAP50-95)')
def postprocess(self, preds):
"""Apply Non-maximum suppression to prediction outputs."""
return ops.non_max_suppression(preds,
self.args.conf,
self.args.iou,
labels=self.lb,
multi_label=True,
agnostic=self.args.single_cls,
max_det=self.args.max_det)
def update_metrics(self, preds, batch):
"""Metrics."""
for si, pred in enumerate(preds):
idx = batch['batch_idx'] == si
cls = batch['cls'][idx]
bbox = batch['bboxes'][idx]
nl, npr = cls.shape[0], pred.shape[0] # number of labels, predictions
shape = batch['ori_shape'][si]
correct_bboxes = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device) # init
self.seen += 1
if npr == 0:
if nl:
self.stats.append((correct_bboxes, *torch.zeros((2, 0), device=self.device), cls.squeeze(-1)))
if self.args.plots:
self.confusion_matrix.process_batch(detections=None, labels=cls.squeeze(-1))
continue
# Predictions
if self.args.single_cls:
pred[:, 5] = 0
predn = pred.clone()
ops.scale_boxes(batch['img'][si].shape[1:], predn[:, :4], shape,
ratio_pad=batch['ratio_pad'][si]) # native-space pred
# Evaluate
if nl:
height, width = batch['img'].shape[2:]
tbox = ops.xywh2xyxy(bbox) * torch.tensor(
(width, height, width, height), device=self.device) # target boxes
ops.scale_boxes(batch['img'][si].shape[1:], tbox, shape,
ratio_pad=batch['ratio_pad'][si]) # native-space labels
labelsn = torch.cat((cls, tbox), 1) # native-space labels
correct_bboxes = self._process_batch(predn, labelsn)
# TODO: maybe remove these `self.` arguments as they already are member variable
if self.args.plots:
self.confusion_matrix.process_batch(predn, labelsn)
self.stats.append((correct_bboxes, pred[:, 4], pred[:, 5], cls.squeeze(-1))) # (conf, pcls, tcls)
# Save
if self.args.save_json:
self.pred_to_json(predn, batch['im_file'][si])
if self.args.save_txt:
file = self.save_dir / 'labels' / f'{Path(batch["im_file"][si]).stem}.txt'
self.save_one_txt(predn, self.args.save_conf, shape, file)
def finalize_metrics(self, *args, **kwargs):
"""Set final values for metrics speed and confusion matrix."""
self.metrics.speed = self.speed
self.metrics.confusion_matrix = self.confusion_matrix
def get_stats(self):
"""Returns metrics statistics and results dictionary."""
stats = [torch.cat(x, 0).cpu().numpy() for x in zip(*self.stats)] # to numpy
if len(stats) and stats[0].any():
self.metrics.process(*stats)
self.nt_per_class = np.bincount(stats[-1].astype(int), minlength=self.nc) # number of targets per class
return self.metrics.results_dict
def print_results(self):
"""Prints training/validation set metrics per class."""
pf = '%22s' + '%11i' * 2 + '%11.3g' * len(self.metrics.keys) # print format
LOGGER.info(pf % ('all', self.seen, self.nt_per_class.sum(), *self.metrics.mean_results()))
if self.nt_per_class.sum() == 0:
LOGGER.warning(
f'WARNING ⚠️ no labels found in {self.args.task} set, can not compute metrics without labels')
# Print results per class
if self.args.verbose and not self.training and self.nc > 1 and len(self.stats):
for i, c in enumerate(self.metrics.ap_class_index):
LOGGER.info(pf % (self.names[c], self.seen, self.nt_per_class[c], *self.metrics.class_result(i)))
if self.args.plots:
for normalize in True, False:
self.confusion_matrix.plot(save_dir=self.save_dir,
names=self.names.values(),
normalize=normalize,
on_plot=self.on_plot)
def _process_batch(self, detections, labels):
"""
Return correct prediction matrix.
Args:
detections (torch.Tensor): Tensor of shape [N, 6] representing detections.
Each detection is of the format: x1, y1, x2, y2, conf, class.
labels (torch.Tensor): Tensor of shape [M, 5] representing labels.
Each label is of the format: class, x1, y1, x2, y2.
Returns:
(torch.Tensor): Correct prediction matrix of shape [N, 10] for 10 IoU levels.
"""
iou = box_iou(labels[:, 1:], detections[:, :4])
return self.match_predictions(detections[:, 5], labels[:, 0], iou)
def build_dataset(self, img_path, mode='val', batch=None):
"""
Build YOLO Dataset.
Args:
img_path (str): Path to the folder containing images.
mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
batch (int, optional): Size of batches, this is for `rect`. Defaults to None.
"""
gs = max(int(de_parallel(self.model).stride if self.model else 0), 32)
return build_yolo_dataset(self.args, img_path, batch, self.data, mode=mode, stride=gs)
def get_dataloader(self, dataset_path, batch_size):
"""Construct and return dataloader."""
dataset = self.build_dataset(dataset_path, batch=batch_size, mode='val')
return build_dataloader(dataset, batch_size, self.args.workers, shuffle=False, rank=-1) # return dataloader
def plot_val_samples(self, batch, ni):
"""Plot validation image samples."""
plot_images(batch['img'],
batch['batch_idx'],
batch['cls'].squeeze(-1),
batch['bboxes'],
paths=batch['im_file'],
fname=self.save_dir / f'val_batch{ni}_labels.jpg',
names=self.names,
on_plot=self.on_plot)
def plot_predictions(self, batch, preds, ni):
"""Plots predicted bounding boxes on input images and saves the result."""
plot_images(batch['img'],
*output_to_target(preds, max_det=self.args.max_det),
paths=batch['im_file'],
fname=self.save_dir / f'val_batch{ni}_pred.jpg',
names=self.names,
on_plot=self.on_plot) # pred
def save_one_txt(self, predn, save_conf, shape, file):
"""Save YOLO detections to a txt file in normalized coordinates in a specific format."""
gn = torch.tensor(shape)[[1, 0, 1, 0]] # normalization gain whwh
for *xyxy, conf, cls in predn.tolist():
xywh = (ops.xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist() # normalized xywh
line = (cls, *xywh, conf) if save_conf else (cls, *xywh) # label format
with open(file, 'a') as f:
f.write(('%g ' * len(line)).rstrip() % line + '\n')
def pred_to_json(self, predn, filename):
"""Serialize YOLO predictions to COCO json format."""
stem = Path(filename).stem
image_id = int(stem) if stem.isnumeric() else stem
box = ops.xyxy2xywh(predn[:, :4]) # xywh
box[:, :2] -= box[:, 2:] / 2 # xy center to top-left corner
for p, b in zip(predn.tolist(), box.tolist()):
self.jdict.append({
'image_id': image_id,
'category_id': self.class_map[int(p[5])],
'bbox': [round(x, 3) for x in b],
'score': round(p[4], 5)})
def eval_json(self, stats):
"""Evaluates YOLO output in JSON format and returns performance statistics."""
if self.args.save_json and self.is_coco and len(self.jdict):
anno_json = self.data['path'] / 'annotations/instances_val2017.json' # annotations
pred_json = self.save_dir / 'predictions.json' # predictions
LOGGER.info(f'\nEvaluating pycocotools mAP using {pred_json} and {anno_json}...')
try: # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb
check_requirements('pycocotools>=2.0.6')
from pycocotools.coco import COCO # noqa
from pycocotools.cocoeval import COCOeval # noqa
for x in anno_json, pred_json:
assert x.is_file(), f'{x} file not found'
anno = COCO(str(anno_json)) # init annotations api
pred = anno.loadRes(str(pred_json)) # init predictions api (must pass string, not Path)
eval = COCOeval(anno, pred, 'bbox')
if self.is_coco:
eval.params.imgIds = [int(Path(x).stem) for x in self.dataloader.dataset.im_files] # images to eval
eval.evaluate()
eval.accumulate()
eval.summarize()
stats[self.metrics.keys[-1]], stats[self.metrics.keys[-2]] = eval.stats[:2] # update mAP50-95 and mAP50
except Exception as e:
LOGGER.warning(f'pycocotools unable to run: {e}')
return stats

View File

@ -0,0 +1,36 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from ultralytics.engine.model import Model
from ultralytics.models import yolo # noqa
from ultralytics.nn.tasks import ClassificationModel, DetectionModel, PoseModel, SegmentationModel
class YOLO(Model):
"""
YOLO (You Only Look Once) object detection model.
"""
@property
def task_map(self):
"""Map head to model, trainer, validator, and predictor classes"""
return {
'classify': {
'model': ClassificationModel,
'trainer': yolo.classify.ClassificationTrainer,
'validator': yolo.classify.ClassificationValidator,
'predictor': yolo.classify.ClassificationPredictor, },
'detect': {
'model': DetectionModel,
'trainer': yolo.detect.DetectionTrainer,
'validator': yolo.detect.DetectionValidator,
'predictor': yolo.detect.DetectionPredictor, },
'segment': {
'model': SegmentationModel,
'trainer': yolo.segment.SegmentationTrainer,
'validator': yolo.segment.SegmentationValidator,
'predictor': yolo.segment.SegmentationPredictor, },
'pose': {
'model': PoseModel,
'trainer': yolo.pose.PoseTrainer,
'validator': yolo.pose.PoseValidator,
'predictor': yolo.pose.PosePredictor, }, }

View File

@ -0,0 +1,7 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from .predict import PosePredictor
from .train import PoseTrainer
from .val import PoseValidator
__all__ = 'PoseTrainer', 'PoseValidator', 'PosePredictor'

View File

@ -0,0 +1,52 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from ultralytics.engine.results import Results
from ultralytics.models.yolo.detect.predict import DetectionPredictor
from ultralytics.utils import DEFAULT_CFG, LOGGER, ops
class PosePredictor(DetectionPredictor):
"""
A class extending the DetectionPredictor class for prediction based on a pose model.
Example:
```python
from ultralytics.utils import ASSETS
from ultralytics.models.yolo.pose import PosePredictor
args = dict(model='yolov8n-pose.pt', source=ASSETS)
predictor = PosePredictor(overrides=args)
predictor.predict_cli()
```
"""
def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
super().__init__(cfg, overrides, _callbacks)
self.args.task = 'pose'
if isinstance(self.args.device, str) and self.args.device.lower() == 'mps':
LOGGER.warning("WARNING ⚠️ Apple MPS known Pose bug. Recommend 'device=cpu' for Pose models. "
'See https://github.com/ultralytics/ultralytics/issues/4031.')
def postprocess(self, preds, img, orig_imgs):
"""Return detection results for a given input image or list of images."""
preds = ops.non_max_suppression(preds,
self.args.conf,
self.args.iou,
agnostic=self.args.agnostic_nms,
max_det=self.args.max_det,
classes=self.args.classes,
nc=len(self.model.names))
if not isinstance(orig_imgs, list): # input images are a torch.Tensor, not a list
orig_imgs = ops.convert_torch2numpy_batch(orig_imgs)
results = []
for i, pred in enumerate(preds):
orig_img = orig_imgs[i]
pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape).round()
pred_kpts = pred[:, 6:].view(len(pred), *self.model.kpt_shape) if len(pred) else pred[:, 6:]
pred_kpts = ops.scale_coords(img.shape[2:], pred_kpts, orig_img.shape)
img_path = self.batch[0][i]
results.append(
Results(orig_img, path=img_path, names=self.model.names, boxes=pred[:, :6], keypoints=pred_kpts))
return results

View File

@ -0,0 +1,73 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from copy import copy
from ultralytics.models import yolo
from ultralytics.nn.tasks import PoseModel
from ultralytics.utils import DEFAULT_CFG, LOGGER
from ultralytics.utils.plotting import plot_images, plot_results
class PoseTrainer(yolo.detect.DetectionTrainer):
"""
A class extending the DetectionTrainer class for training based on a pose model.
Example:
```python
from ultralytics.models.yolo.pose import PoseTrainer
args = dict(model='yolov8n-pose.pt', data='coco8-pose.yaml', epochs=3)
trainer = PoseTrainer(overrides=args)
trainer.train()
```
"""
def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
"""Initialize a PoseTrainer object with specified configurations and overrides."""
if overrides is None:
overrides = {}
overrides['task'] = 'pose'
super().__init__(cfg, overrides, _callbacks)
if isinstance(self.args.device, str) and self.args.device.lower() == 'mps':
LOGGER.warning("WARNING ⚠️ Apple MPS known Pose bug. Recommend 'device=cpu' for Pose models. "
'See https://github.com/ultralytics/ultralytics/issues/4031.')
def get_model(self, cfg=None, weights=None, verbose=True):
"""Get pose estimation model with specified configuration and weights."""
model = PoseModel(cfg, ch=3, nc=self.data['nc'], data_kpt_shape=self.data['kpt_shape'], verbose=verbose)
if weights:
model.load(weights)
return model
def set_model_attributes(self):
"""Sets keypoints shape attribute of PoseModel."""
super().set_model_attributes()
self.model.kpt_shape = self.data['kpt_shape']
def get_validator(self):
"""Returns an instance of the PoseValidator class for validation."""
self.loss_names = 'box_loss', 'pose_loss', 'kobj_loss', 'cls_loss', 'dfl_loss'
return yolo.pose.PoseValidator(self.test_loader, save_dir=self.save_dir, args=copy(self.args))
def plot_training_samples(self, batch, ni):
"""Plot a batch of training samples with annotated class labels, bounding boxes, and keypoints."""
images = batch['img']
kpts = batch['keypoints']
cls = batch['cls'].squeeze(-1)
bboxes = batch['bboxes']
paths = batch['im_file']
batch_idx = batch['batch_idx']
plot_images(images,
batch_idx,
cls,
bboxes,
kpts=kpts,
paths=paths,
fname=self.save_dir / f'train_batch{ni}.jpg',
on_plot=self.on_plot)
def plot_metrics(self):
"""Plots training/val metrics."""
plot_results(file=self.csv, pose=True, on_plot=self.on_plot) # save results.png

View File

@ -0,0 +1,215 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from pathlib import Path
import numpy as np
import torch
from ultralytics.models.yolo.detect import DetectionValidator
from ultralytics.utils import LOGGER, ops
from ultralytics.utils.checks import check_requirements
from ultralytics.utils.metrics import OKS_SIGMA, PoseMetrics, box_iou, kpt_iou
from ultralytics.utils.plotting import output_to_target, plot_images
class PoseValidator(DetectionValidator):
"""
A class extending the DetectionValidator class for validation based on a pose model.
Example:
```python
from ultralytics.models.yolo.pose import PoseValidator
args = dict(model='yolov8n-pose.pt', data='coco8-pose.yaml')
validator = PoseValidator(args=args)
validator()
```
"""
def __init__(self, dataloader=None, save_dir=None, pbar=None, args=None, _callbacks=None):
"""Initialize a 'PoseValidator' object with custom parameters and assigned attributes."""
super().__init__(dataloader, save_dir, pbar, args, _callbacks)
self.sigma = None
self.kpt_shape = None
self.args.task = 'pose'
self.metrics = PoseMetrics(save_dir=self.save_dir, on_plot=self.on_plot)
if isinstance(self.args.device, str) and self.args.device.lower() == 'mps':
LOGGER.warning("WARNING ⚠️ Apple MPS known Pose bug. Recommend 'device=cpu' for Pose models. "
'See https://github.com/ultralytics/ultralytics/issues/4031.')
def preprocess(self, batch):
"""Preprocesses the batch by converting the 'keypoints' data into a float and moving it to the device."""
batch = super().preprocess(batch)
batch['keypoints'] = batch['keypoints'].to(self.device).float()
return batch
def get_desc(self):
"""Returns description of evaluation metrics in string format."""
return ('%22s' + '%11s' * 10) % ('Class', 'Images', 'Instances', 'Box(P', 'R', 'mAP50', 'mAP50-95)', 'Pose(P',
'R', 'mAP50', 'mAP50-95)')
def postprocess(self, preds):
"""Apply non-maximum suppression and return detections with high confidence scores."""
return ops.non_max_suppression(preds,
self.args.conf,
self.args.iou,
labels=self.lb,
multi_label=True,
agnostic=self.args.single_cls,
max_det=self.args.max_det,
nc=self.nc)
def init_metrics(self, model):
"""Initiate pose estimation metrics for YOLO model."""
super().init_metrics(model)
self.kpt_shape = self.data['kpt_shape']
is_pose = self.kpt_shape == [17, 3]
nkpt = self.kpt_shape[0]
self.sigma = OKS_SIGMA if is_pose else np.ones(nkpt) / nkpt
def update_metrics(self, preds, batch):
"""Metrics."""
for si, pred in enumerate(preds):
idx = batch['batch_idx'] == si
cls = batch['cls'][idx]
bbox = batch['bboxes'][idx]
kpts = batch['keypoints'][idx]
nl, npr = cls.shape[0], pred.shape[0] # number of labels, predictions
nk = kpts.shape[1] # number of keypoints
shape = batch['ori_shape'][si]
correct_kpts = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device) # init
correct_bboxes = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device) # init
self.seen += 1
if npr == 0:
if nl:
self.stats.append((correct_bboxes, correct_kpts, *torch.zeros(
(2, 0), device=self.device), cls.squeeze(-1)))
if self.args.plots:
self.confusion_matrix.process_batch(detections=None, labels=cls.squeeze(-1))
continue
# Predictions
if self.args.single_cls:
pred[:, 5] = 0
predn = pred.clone()
ops.scale_boxes(batch['img'][si].shape[1:], predn[:, :4], shape,
ratio_pad=batch['ratio_pad'][si]) # native-space pred
pred_kpts = predn[:, 6:].view(npr, nk, -1)
ops.scale_coords(batch['img'][si].shape[1:], pred_kpts, shape, ratio_pad=batch['ratio_pad'][si])
# Evaluate
if nl:
height, width = batch['img'].shape[2:]
tbox = ops.xywh2xyxy(bbox) * torch.tensor(
(width, height, width, height), device=self.device) # target boxes
ops.scale_boxes(batch['img'][si].shape[1:], tbox, shape,
ratio_pad=batch['ratio_pad'][si]) # native-space labels
tkpts = kpts.clone()
tkpts[..., 0] *= width
tkpts[..., 1] *= height
tkpts = ops.scale_coords(batch['img'][si].shape[1:], tkpts, shape, ratio_pad=batch['ratio_pad'][si])
labelsn = torch.cat((cls, tbox), 1) # native-space labels
correct_bboxes = self._process_batch(predn[:, :6], labelsn)
correct_kpts = self._process_batch(predn[:, :6], labelsn, pred_kpts, tkpts)
if self.args.plots:
self.confusion_matrix.process_batch(predn, labelsn)
# Append correct_masks, correct_boxes, pconf, pcls, tcls
self.stats.append((correct_bboxes, correct_kpts, pred[:, 4], pred[:, 5], cls.squeeze(-1)))
# Save
if self.args.save_json:
self.pred_to_json(predn, batch['im_file'][si])
# if self.args.save_txt:
# save_one_txt(predn, save_conf, shape, file=save_dir / 'labels' / f'{path.stem}.txt')
def _process_batch(self, detections, labels, pred_kpts=None, gt_kpts=None):
"""
Return correct prediction matrix.
Args:
detections (torch.Tensor): Tensor of shape [N, 6] representing detections.
Each detection is of the format: x1, y1, x2, y2, conf, class.
labels (torch.Tensor): Tensor of shape [M, 5] representing labels.
Each label is of the format: class, x1, y1, x2, y2.
pred_kpts (torch.Tensor, optional): Tensor of shape [N, 51] representing predicted keypoints.
51 corresponds to 17 keypoints each with 3 values.
gt_kpts (torch.Tensor, optional): Tensor of shape [N, 51] representing ground truth keypoints.
Returns:
torch.Tensor: Correct prediction matrix of shape [N, 10] for 10 IoU levels.
"""
if pred_kpts is not None and gt_kpts is not None:
# `0.53` is from https://github.com/jin-s13/xtcocoapi/blob/master/xtcocotools/cocoeval.py#L384
area = ops.xyxy2xywh(labels[:, 1:])[:, 2:].prod(1) * 0.53
iou = kpt_iou(gt_kpts, pred_kpts, sigma=self.sigma, area=area)
else: # boxes
iou = box_iou(labels[:, 1:], detections[:, :4])
return self.match_predictions(detections[:, 5], labels[:, 0], iou)
def plot_val_samples(self, batch, ni):
"""Plots and saves validation set samples with predicted bounding boxes and keypoints."""
plot_images(batch['img'],
batch['batch_idx'],
batch['cls'].squeeze(-1),
batch['bboxes'],
kpts=batch['keypoints'],
paths=batch['im_file'],
fname=self.save_dir / f'val_batch{ni}_labels.jpg',
names=self.names,
on_plot=self.on_plot)
def plot_predictions(self, batch, preds, ni):
"""Plots predictions for YOLO model."""
pred_kpts = torch.cat([p[:, 6:].view(-1, *self.kpt_shape) for p in preds], 0)
plot_images(batch['img'],
*output_to_target(preds, max_det=self.args.max_det),
kpts=pred_kpts,
paths=batch['im_file'],
fname=self.save_dir / f'val_batch{ni}_pred.jpg',
names=self.names,
on_plot=self.on_plot) # pred
def pred_to_json(self, predn, filename):
"""Converts YOLO predictions to COCO JSON format."""
stem = Path(filename).stem
image_id = int(stem) if stem.isnumeric() else stem
box = ops.xyxy2xywh(predn[:, :4]) # xywh
box[:, :2] -= box[:, 2:] / 2 # xy center to top-left corner
for p, b in zip(predn.tolist(), box.tolist()):
self.jdict.append({
'image_id': image_id,
'category_id': self.class_map[int(p[5])],
'bbox': [round(x, 3) for x in b],
'keypoints': p[6:],
'score': round(p[4], 5)})
def eval_json(self, stats):
"""Evaluates object detection model using COCO JSON format."""
if self.args.save_json and self.is_coco and len(self.jdict):
anno_json = self.data['path'] / 'annotations/person_keypoints_val2017.json' # annotations
pred_json = self.save_dir / 'predictions.json' # predictions
LOGGER.info(f'\nEvaluating pycocotools mAP using {pred_json} and {anno_json}...')
try: # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb
check_requirements('pycocotools>=2.0.6')
from pycocotools.coco import COCO # noqa
from pycocotools.cocoeval import COCOeval # noqa
for x in anno_json, pred_json:
assert x.is_file(), f'{x} file not found'
anno = COCO(str(anno_json)) # init annotations api
pred = anno.loadRes(str(pred_json)) # init predictions api (must pass string, not Path)
for i, eval in enumerate([COCOeval(anno, pred, 'bbox'), COCOeval(anno, pred, 'keypoints')]):
if self.is_coco:
eval.params.imgIds = [int(Path(x).stem) for x in self.dataloader.dataset.im_files] # im to eval
eval.evaluate()
eval.accumulate()
eval.summarize()
idx = i * 4 + 2
stats[self.metrics.keys[idx + 1]], stats[
self.metrics.keys[idx]] = eval.stats[:2] # update mAP50-95 and mAP50
except Exception as e:
LOGGER.warning(f'pycocotools unable to run: {e}')
return stats

View File

@ -0,0 +1,7 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from .predict import SegmentationPredictor
from .train import SegmentationTrainer
from .val import SegmentationValidator
__all__ = 'SegmentationPredictor', 'SegmentationTrainer', 'SegmentationValidator'

View File

@ -0,0 +1,53 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from ultralytics.engine.results import Results
from ultralytics.models.yolo.detect.predict import DetectionPredictor
from ultralytics.utils import DEFAULT_CFG, ops
class SegmentationPredictor(DetectionPredictor):
"""
A class extending the DetectionPredictor class for prediction based on a segmentation model.
Example:
```python
from ultralytics.utils import ASSETS
from ultralytics.models.yolo.segment import SegmentationPredictor
args = dict(model='yolov8n-seg.pt', source=ASSETS)
predictor = SegmentationPredictor(overrides=args)
predictor.predict_cli()
```
"""
def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
super().__init__(cfg, overrides, _callbacks)
self.args.task = 'segment'
def postprocess(self, preds, img, orig_imgs):
p = ops.non_max_suppression(preds[0],
self.args.conf,
self.args.iou,
agnostic=self.args.agnostic_nms,
max_det=self.args.max_det,
nc=len(self.model.names),
classes=self.args.classes)
if not isinstance(orig_imgs, list): # input images are a torch.Tensor, not a list
orig_imgs = ops.convert_torch2numpy_batch(orig_imgs)
results = []
proto = preds[1][-1] if len(preds[1]) == 3 else preds[1] # second output is len 3 if pt, but only 1 if exported
for i, pred in enumerate(p):
orig_img = orig_imgs[i]
img_path = self.batch[0][i]
if not len(pred): # save empty boxes
masks = None
elif self.args.retina_masks:
pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
masks = ops.process_mask_native(proto[i], pred[:, 6:], pred[:, :4], orig_img.shape[:2]) # HWC
else:
masks = ops.process_mask(proto[i], pred[:, 6:], pred[:, :4], img.shape[2:], upsample=True) # HWC
pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
results.append(Results(orig_img, path=img_path, names=self.model.names, boxes=pred[:, :6], masks=masks))
return results

View File

@ -0,0 +1,58 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from copy import copy
from ultralytics.models import yolo
from ultralytics.nn.tasks import SegmentationModel
from ultralytics.utils import DEFAULT_CFG, RANK
from ultralytics.utils.plotting import plot_images, plot_results
class SegmentationTrainer(yolo.detect.DetectionTrainer):
"""
A class extending the DetectionTrainer class for training based on a segmentation model.
Example:
```python
from ultralytics.models.yolo.segment import SegmentationTrainer
args = dict(model='yolov8n-seg.pt', data='coco8-seg.yaml', epochs=3)
trainer = SegmentationTrainer(overrides=args)
trainer.train()
```
"""
def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
"""Initialize a SegmentationTrainer object with given arguments."""
if overrides is None:
overrides = {}
overrides['task'] = 'segment'
super().__init__(cfg, overrides, _callbacks)
def get_model(self, cfg=None, weights=None, verbose=True):
"""Return SegmentationModel initialized with specified config and weights."""
model = SegmentationModel(cfg, ch=3, nc=self.data['nc'], verbose=verbose and RANK == -1)
if weights:
model.load(weights)
return model
def get_validator(self):
"""Return an instance of SegmentationValidator for validation of YOLO model."""
self.loss_names = 'box_loss', 'seg_loss', 'cls_loss', 'dfl_loss'
return yolo.segment.SegmentationValidator(self.test_loader, save_dir=self.save_dir, args=copy(self.args))
def plot_training_samples(self, batch, ni):
"""Creates a plot of training sample images with labels and box coordinates."""
plot_images(batch['img'],
batch['batch_idx'],
batch['cls'].squeeze(-1),
batch['bboxes'],
batch['masks'],
paths=batch['im_file'],
fname=self.save_dir / f'train_batch{ni}.jpg',
on_plot=self.on_plot)
def plot_metrics(self):
"""Plots training/val metrics."""
plot_results(file=self.csv, segment=True, on_plot=self.on_plot) # save results.png

View File

@ -0,0 +1,247 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from multiprocessing.pool import ThreadPool
from pathlib import Path
import numpy as np
import torch
import torch.nn.functional as F
from ultralytics.models.yolo.detect import DetectionValidator
from ultralytics.utils import LOGGER, NUM_THREADS, ops
from ultralytics.utils.checks import check_requirements
from ultralytics.utils.metrics import SegmentMetrics, box_iou, mask_iou
from ultralytics.utils.plotting import output_to_target, plot_images
class SegmentationValidator(DetectionValidator):
"""
A class extending the DetectionValidator class for validation based on a segmentation model.
Example:
```python
from ultralytics.models.yolo.segment import SegmentationValidator
args = dict(model='yolov8n-seg.pt', data='coco8-seg.yaml')
validator = SegmentationValidator(args=args)
validator()
```
"""
def __init__(self, dataloader=None, save_dir=None, pbar=None, args=None, _callbacks=None):
"""Initialize SegmentationValidator and set task to 'segment', metrics to SegmentMetrics."""
super().__init__(dataloader, save_dir, pbar, args, _callbacks)
self.plot_masks = None
self.process = None
self.args.task = 'segment'
self.metrics = SegmentMetrics(save_dir=self.save_dir, on_plot=self.on_plot)
def preprocess(self, batch):
"""Preprocesses batch by converting masks to float and sending to device."""
batch = super().preprocess(batch)
batch['masks'] = batch['masks'].to(self.device).float()
return batch
def init_metrics(self, model):
"""Initialize metrics and select mask processing function based on save_json flag."""
super().init_metrics(model)
self.plot_masks = []
if self.args.save_json:
check_requirements('pycocotools>=2.0.6')
self.process = ops.process_mask_upsample # more accurate
else:
self.process = ops.process_mask # faster
def get_desc(self):
"""Return a formatted description of evaluation metrics."""
return ('%22s' + '%11s' * 10) % ('Class', 'Images', 'Instances', 'Box(P', 'R', 'mAP50', 'mAP50-95)', 'Mask(P',
'R', 'mAP50', 'mAP50-95)')
def postprocess(self, preds):
"""Post-processes YOLO predictions and returns output detections with proto."""
p = ops.non_max_suppression(preds[0],
self.args.conf,
self.args.iou,
labels=self.lb,
multi_label=True,
agnostic=self.args.single_cls,
max_det=self.args.max_det,
nc=self.nc)
proto = preds[1][-1] if len(preds[1]) == 3 else preds[1] # second output is len 3 if pt, but only 1 if exported
return p, proto
def update_metrics(self, preds, batch):
"""Metrics."""
for si, (pred, proto) in enumerate(zip(preds[0], preds[1])):
idx = batch['batch_idx'] == si
cls = batch['cls'][idx]
bbox = batch['bboxes'][idx]
nl, npr = cls.shape[0], pred.shape[0] # number of labels, predictions
shape = batch['ori_shape'][si]
correct_masks = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device) # init
correct_bboxes = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device) # init
self.seen += 1
if npr == 0:
if nl:
self.stats.append((correct_bboxes, correct_masks, *torch.zeros(
(2, 0), device=self.device), cls.squeeze(-1)))
if self.args.plots:
self.confusion_matrix.process_batch(detections=None, labels=cls.squeeze(-1))
continue
# Masks
midx = [si] if self.args.overlap_mask else idx
gt_masks = batch['masks'][midx]
pred_masks = self.process(proto, pred[:, 6:], pred[:, :4], shape=batch['img'][si].shape[1:])
# Predictions
if self.args.single_cls:
pred[:, 5] = 0
predn = pred.clone()
ops.scale_boxes(batch['img'][si].shape[1:], predn[:, :4], shape,
ratio_pad=batch['ratio_pad'][si]) # native-space pred
# Evaluate
if nl:
height, width = batch['img'].shape[2:]
tbox = ops.xywh2xyxy(bbox) * torch.tensor(
(width, height, width, height), device=self.device) # target boxes
ops.scale_boxes(batch['img'][si].shape[1:], tbox, shape,
ratio_pad=batch['ratio_pad'][si]) # native-space labels
labelsn = torch.cat((cls, tbox), 1) # native-space labels
correct_bboxes = self._process_batch(predn, labelsn)
# TODO: maybe remove these `self.` arguments as they already are member variable
correct_masks = self._process_batch(predn,
labelsn,
pred_masks,
gt_masks,
overlap=self.args.overlap_mask,
masks=True)
if self.args.plots:
self.confusion_matrix.process_batch(predn, labelsn)
# Append correct_masks, correct_boxes, pconf, pcls, tcls
self.stats.append((correct_bboxes, correct_masks, pred[:, 4], pred[:, 5], cls.squeeze(-1)))
pred_masks = torch.as_tensor(pred_masks, dtype=torch.uint8)
if self.args.plots and self.batch_i < 3:
self.plot_masks.append(pred_masks[:15].cpu()) # filter top 15 to plot
# Save
if self.args.save_json:
pred_masks = ops.scale_image(pred_masks.permute(1, 2, 0).contiguous().cpu().numpy(),
shape,
ratio_pad=batch['ratio_pad'][si])
self.pred_to_json(predn, batch['im_file'][si], pred_masks)
# if self.args.save_txt:
# save_one_txt(predn, save_conf, shape, file=save_dir / 'labels' / f'{path.stem}.txt')
def finalize_metrics(self, *args, **kwargs):
"""Sets speed and confusion matrix for evaluation metrics."""
self.metrics.speed = self.speed
self.metrics.confusion_matrix = self.confusion_matrix
def _process_batch(self, detections, labels, pred_masks=None, gt_masks=None, overlap=False, masks=False):
"""
Return correct prediction matrix
Args:
detections (array[N, 6]), x1, y1, x2, y2, conf, class
labels (array[M, 5]), class, x1, y1, x2, y2
Returns:
correct (array[N, 10]), for 10 IoU levels
"""
if masks:
if overlap:
nl = len(labels)
index = torch.arange(nl, device=gt_masks.device).view(nl, 1, 1) + 1
gt_masks = gt_masks.repeat(nl, 1, 1) # shape(1,640,640) -> (n,640,640)
gt_masks = torch.where(gt_masks == index, 1.0, 0.0)
if gt_masks.shape[1:] != pred_masks.shape[1:]:
gt_masks = F.interpolate(gt_masks[None], pred_masks.shape[1:], mode='bilinear', align_corners=False)[0]
gt_masks = gt_masks.gt_(0.5)
iou = mask_iou(gt_masks.view(gt_masks.shape[0], -1), pred_masks.view(pred_masks.shape[0], -1))
else: # boxes
iou = box_iou(labels[:, 1:], detections[:, :4])
return self.match_predictions(detections[:, 5], labels[:, 0], iou)
def plot_val_samples(self, batch, ni):
"""Plots validation samples with bounding box labels."""
plot_images(batch['img'],
batch['batch_idx'],
batch['cls'].squeeze(-1),
batch['bboxes'],
batch['masks'],
paths=batch['im_file'],
fname=self.save_dir / f'val_batch{ni}_labels.jpg',
names=self.names,
on_plot=self.on_plot)
def plot_predictions(self, batch, preds, ni):
"""Plots batch predictions with masks and bounding boxes."""
plot_images(
batch['img'],
*output_to_target(preds[0], max_det=15), # not set to self.args.max_det due to slow plotting speed
torch.cat(self.plot_masks, dim=0) if len(self.plot_masks) else self.plot_masks,
paths=batch['im_file'],
fname=self.save_dir / f'val_batch{ni}_pred.jpg',
names=self.names,
on_plot=self.on_plot) # pred
self.plot_masks.clear()
def pred_to_json(self, predn, filename, pred_masks):
"""Save one JSON result."""
# Example result = {"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236}
from pycocotools.mask import encode # noqa
def single_encode(x):
"""Encode predicted masks as RLE and append results to jdict."""
rle = encode(np.asarray(x[:, :, None], order='F', dtype='uint8'))[0]
rle['counts'] = rle['counts'].decode('utf-8')
return rle
stem = Path(filename).stem
image_id = int(stem) if stem.isnumeric() else stem
box = ops.xyxy2xywh(predn[:, :4]) # xywh
box[:, :2] -= box[:, 2:] / 2 # xy center to top-left corner
pred_masks = np.transpose(pred_masks, (2, 0, 1))
with ThreadPool(NUM_THREADS) as pool:
rles = pool.map(single_encode, pred_masks)
for i, (p, b) in enumerate(zip(predn.tolist(), box.tolist())):
self.jdict.append({
'image_id': image_id,
'category_id': self.class_map[int(p[5])],
'bbox': [round(x, 3) for x in b],
'score': round(p[4], 5),
'segmentation': rles[i]})
def eval_json(self, stats):
"""Return COCO-style object detection evaluation metrics."""
if self.args.save_json and self.is_coco and len(self.jdict):
anno_json = self.data['path'] / 'annotations/instances_val2017.json' # annotations
pred_json = self.save_dir / 'predictions.json' # predictions
LOGGER.info(f'\nEvaluating pycocotools mAP using {pred_json} and {anno_json}...')
try: # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb
check_requirements('pycocotools>=2.0.6')
from pycocotools.coco import COCO # noqa
from pycocotools.cocoeval import COCOeval # noqa
for x in anno_json, pred_json:
assert x.is_file(), f'{x} file not found'
anno = COCO(str(anno_json)) # init annotations api
pred = anno.loadRes(str(pred_json)) # init predictions api (must pass string, not Path)
for i, eval in enumerate([COCOeval(anno, pred, 'bbox'), COCOeval(anno, pred, 'segm')]):
if self.is_coco:
eval.params.imgIds = [int(Path(x).stem) for x in self.dataloader.dataset.im_files] # im to eval
eval.evaluate()
eval.accumulate()
eval.summarize()
idx = i * 4 + 2
stats[self.metrics.keys[idx + 1]], stats[
self.metrics.keys[idx]] = eval.stats[:2] # update mAP50-95 and mAP50
except Exception as e:
LOGGER.warning(f'pycocotools unable to run: {e}')
return stats

Some files were not shown because too many files have changed in this diff Show More