Add YOLOv10 and modify pipeline

王庆刚
2025-03-28 13:19:54 +08:00
parent 183299c06b
commit 798c596acc
471 changed files with 19109 additions and 7342 deletions


@@ -4,4 +4,4 @@ from .model import RTDETR
from .predict import RTDETRPredictor
from .val import RTDETRValidator
__all__ = 'RTDETRPredictor', 'RTDETRValidator', 'RTDETR'
__all__ = "RTDETRPredictor", "RTDETRValidator", "RTDETR"


@@ -1,7 +1,12 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
"""
RT-DETR model interface
Interface for Baidu's RT-DETR, a Vision Transformer-based real-time object detector. RT-DETR offers real-time
performance and high accuracy, excelling in accelerated backends like CUDA with TensorRT. It features an efficient
hybrid encoder and IoU-aware query selection for enhanced detection accuracy.
For more information on RT-DETR, visit: https://arxiv.org/pdf/2304.08069.pdf
"""
from ultralytics.engine.model import Model
from ultralytics.nn.tasks import RTDETRDetectionModel
@@ -12,19 +17,38 @@ from .val import RTDETRValidator
class RTDETR(Model):
"""
RTDETR model interface.
Interface for Baidu's RT-DETR model. This Vision Transformer-based object detector provides real-time performance
with high accuracy. It supports efficient hybrid encoding, IoU-aware query selection, and adaptable inference speed.
Attributes:
model (str): Path to the pre-trained model. Defaults to 'rtdetr-l.pt'.
"""
def __init__(self, model='rtdetr-l.pt') -> None:
if model and model.split('.')[-1] not in ('pt', 'yaml', 'yml'):
raise NotImplementedError('RT-DETR only supports creating from *.pt file or *.yaml file.')
super().__init__(model=model, task='detect')
def __init__(self, model="rtdetr-l.pt") -> None:
"""
Initializes the RT-DETR model with the given pre-trained model file. Supports .pt and .yaml formats.
Args:
model (str): Path to the pre-trained model. Defaults to 'rtdetr-l.pt'.
Raises:
NotImplementedError: If the model file extension is not 'pt', 'yaml', or 'yml'.
"""
super().__init__(model=model, task="detect")
@property
def task_map(self):
def task_map(self) -> dict:
"""
Returns a task map for RT-DETR, associating tasks with corresponding Ultralytics classes.
Returns:
dict: A dictionary mapping task names to Ultralytics task classes for the RT-DETR model.
"""
return {
'detect': {
'predictor': RTDETRPredictor,
'validator': RTDETRValidator,
'trainer': RTDETRTrainer,
'model': RTDETRDetectionModel}}
"detect": {
"predictor": RTDETRPredictor,
"validator": RTDETRValidator,
"trainer": RTDETRTrainer,
"model": RTDETRDetectionModel,
}
}
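A minimal usage sketch for the interface documented above (the weights file and image path are placeholders; `RTDETR` is re-exported from the top-level `ultralytics` package in recent releases):

```python
from ultralytics import RTDETR

# Load a pre-trained RT-DETR-L checkpoint (a .yaml model config also works)
model = RTDETR("rtdetr-l.pt")

# The 'detect' entry in task_map routes this call through RTDETRPredictor
results = model("path/to/image.jpg")

for r in results:
    print(r.boxes.xyxy, r.boxes.conf, r.boxes.cls)  # decoded detections
```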


@@ -10,7 +10,11 @@ from ultralytics.utils import ops
class RTDETRPredictor(BasePredictor):
"""
A class extending the BasePredictor class for prediction based on an RT-DETR detection model.
RT-DETR (Real-Time Detection Transformer) Predictor extending the BasePredictor class for making predictions using
Baidu's RT-DETR model.
This class leverages the power of Vision Transformers to provide real-time object detection while maintaining
high accuracy. It supports key features like efficient hybrid encoding and IoU-aware query selection.
Example:
```python
@@ -21,10 +25,30 @@ class RTDETRPredictor(BasePredictor):
predictor = RTDETRPredictor(overrides=args)
predictor.predict_cli()
```
Attributes:
imgsz (int): Image size for inference (must be square and scale-filled).
args (dict): Argument overrides for the predictor.
"""
def postprocess(self, preds, img, orig_imgs):
"""Postprocess predictions and returns a list of Results objects."""
"""
Postprocess the raw predictions from the model to generate bounding boxes and confidence scores.
The method filters detections based on confidence and class if specified in `self.args`.
Args:
preds (list): List of [predictions, extra] from the model.
img (torch.Tensor): Processed input images.
orig_imgs (list or torch.Tensor): Original, unprocessed images.
Returns:
(list[Results]): A list of Results objects containing the post-processed bounding boxes, confidence scores,
and class labels.
"""
if not isinstance(preds, (list, tuple)): # list for PyTorch inference but list[0] Tensor for export inference
preds = [preds, None]
nd = preds[0].shape[-1]
bboxes, scores = preds[0].split((4, nd - 4), dim=-1)
@@ -48,15 +72,15 @@ class RTDETRPredictor(BasePredictor):
return results
def pre_transform(self, im):
"""Pre-transform input image before inference.
"""
Pre-transforms the input images before feeding them into the model for inference. The input images are
letterboxed to a square size (640) and scale-filled, since RT-DETR expects square, scale-filled inputs.
Args:
im (List(np.ndarray)): (N, 3, h, w) for tensor, [(h, w, 3) x N] for list.
Notes: The size must be square(640) and scaleFilled.
im (list[np.ndarray] | torch.Tensor): Input images of shape (N, 3, h, w) for tensor, [(h, w, 3) x N] for list.
Returns:
(list): A list of transformed imgs.
(list): List of pre-transformed images ready for model inference.
"""
letterbox = LetterBox(self.imgsz, auto=False, scaleFill=True)
return [letterbox(image=x) for x in im]
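To make the decode path in `postprocess` concrete, here is a self-contained sketch with a random tensor standing in for real model output; the query count (300), class count, and 0.25 threshold are illustrative stand-ins for the model head and `self.args.conf`:

```python
import torch

nc = 80                                   # assumed number of classes
preds = torch.rand(1, 300, 4 + nc)        # fake head output: (bs, queries, 4 + nc)

nd = preds.shape[-1]
bboxes, scores = preds.split((4, nd - 4), dim=-1)  # (1, 300, 4) and (1, 300, nc)

# Boxes are normalized xywh; pick the best class per query, then filter by confidence
score, cls = scores[0].max(-1)            # (300,), (300,)
keep = score > 0.25                       # stand-in for self.args.conf
dets = torch.cat([bboxes[0][keep], score[keep, None], cls[keep, None].float()], dim=-1)
print(dets.shape)                         # (n_kept, 6): xywh + conf + class
```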


@@ -7,16 +7,17 @@ import torch
from ultralytics.models.yolo.detect import DetectionTrainer
from ultralytics.nn.tasks import RTDETRDetectionModel
from ultralytics.utils import RANK, colorstr
from .val import RTDETRDataset, RTDETRValidator
class RTDETRTrainer(DetectionTrainer):
"""
A class extending the DetectionTrainer class for training based on an RT-DETR detection model.
Trainer class for the RT-DETR model developed by Baidu for real-time object detection. Extends the DetectionTrainer
class for YOLO to adapt to the specific features and architecture of RT-DETR. This model leverages Vision
Transformers and has capabilities like IoU-aware query selection and adaptable inference speed.
Notes:
- F.grid_sample used in rt-detr does not support the `deterministic=True` argument.
- F.grid_sample used in RT-DETR does not support the `deterministic=True` argument.
- AMP training can lead to NaN outputs and may produce errors during bipartite graph matching.
Example:
@@ -30,43 +31,71 @@ class RTDETRTrainer(DetectionTrainer):
"""
def get_model(self, cfg=None, weights=None, verbose=True):
"""Return a YOLO detection model."""
model = RTDETRDetectionModel(cfg, nc=self.data['nc'], verbose=verbose and RANK == -1)
"""
Initialize and return an RT-DETR model for object detection tasks.
Args:
cfg (dict, optional): Model configuration. Defaults to None.
weights (str, optional): Path to pre-trained model weights. Defaults to None.
verbose (bool): Verbose logging if True. Defaults to True.
Returns:
(RTDETRDetectionModel): Initialized model.
"""
model = RTDETRDetectionModel(cfg, nc=self.data["nc"], verbose=verbose and RANK == -1)
if weights:
model.load(weights)
return model
def build_dataset(self, img_path, mode='val', batch=None):
"""Build RTDETR Dataset
def build_dataset(self, img_path, mode="val", batch=None):
"""
Build and return an RT-DETR dataset for training or validation.
Args:
img_path (str): Path to the folder containing images.
mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
batch (int, optional): Size of batches, this is for `rect`. Defaults to None.
mode (str): Dataset mode, either 'train' or 'val'.
batch (int, optional): Batch size for rectangle training. Defaults to None.
Returns:
(RTDETRDataset): Dataset object for the specific mode.
"""
return RTDETRDataset(
img_path=img_path,
imgsz=self.args.imgsz,
batch_size=batch,
augment=mode == 'train', # no augmentation
augment=mode == "train",
hyp=self.args,
rect=False, # no rect
rect=False,
cache=self.args.cache or None,
prefix=colorstr(f'{mode}: '),
data=self.data)
prefix=colorstr(f"{mode}: "),
data=self.data,
)
def get_validator(self):
"""Returns a DetectionValidator for RTDETR model validation."""
self.loss_names = 'giou_loss', 'cls_loss', 'l1_loss'
"""
Returns a DetectionValidator suitable for RT-DETR model validation.
Returns:
(RTDETRValidator): Validator object for model validation.
"""
self.loss_names = "giou_loss", "cls_loss", "l1_loss"
return RTDETRValidator(self.test_loader, save_dir=self.save_dir, args=copy(self.args))
def preprocess_batch(self, batch):
"""Preprocesses a batch of images by scaling and converting to float."""
"""
Preprocess a batch of images. Scales and converts the images to float format.
Args:
batch (dict): Dictionary containing a batch of images, bboxes, and labels.
Returns:
(dict): Preprocessed batch.
"""
batch = super().preprocess_batch(batch)
bs = len(batch['img'])
batch_idx = batch['batch_idx']
bs = len(batch["img"])
batch_idx = batch["batch_idx"]
gt_bbox, gt_class = [], []
for i in range(bs):
gt_bbox.append(batch['bboxes'][batch_idx == i].to(batch_idx.device))
gt_class.append(batch['cls'][batch_idx == i].to(device=batch_idx.device, dtype=torch.long))
gt_bbox.append(batch["bboxes"][batch_idx == i].to(batch_idx.device))
gt_class.append(batch["cls"][batch_idx == i].to(device=batch_idx.device, dtype=torch.long))
return batch
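A standalone sketch of the target regrouping done in `preprocess_batch`, using toy tensors in place of dataloader output (shapes and values are made up):

```python
import torch

# Toy collated batch: 5 ground-truth boxes spread across a batch of 2 images
batch_idx = torch.tensor([0.0, 0.0, 1.0, 1.0, 1.0])  # owning image per box
bboxes = torch.rand(5, 4)                            # normalized xywh targets
cls = torch.randint(0, 80, (5, 1)).float()           # class label per box

# Group targets per image, as the RT-DETR loss expects lists indexed by image
bs = 2
gt_bbox = [bboxes[batch_idx == i] for i in range(bs)]
gt_class = [cls[batch_idx == i].to(dtype=torch.long) for i in range(bs)]
print([b.shape[0] for b in gt_bbox])  # [2, 3] boxes for image 0 and image 1
```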


@@ -1,7 +1,5 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from pathlib import Path
import torch
from ultralytics.data import YOLODataset
@@ -9,16 +7,22 @@ from ultralytics.data.augment import Compose, Format, v8_transforms
from ultralytics.models.yolo.detect import DetectionValidator
from ultralytics.utils import colorstr, ops
__all__ = 'RTDETRValidator', # tuple or list
__all__ = ("RTDETRValidator",) # tuple or list
# TODO: For now, RT-DETR does not require padding.
class RTDETRDataset(YOLODataset):
"""
Real-Time DEtection TRansformer (RT-DETR) dataset class extending the base YOLODataset class.
This specialized dataset class is designed for use with the RT-DETR object detection model and is optimized for
real-time detection and tracking tasks.
"""
def __init__(self, *args, data=None, **kwargs):
super().__init__(*args, data=data, use_segments=False, use_keypoints=False, **kwargs)
"""Initialize the RTDETRDataset class by inheriting from the YOLODataset class."""
super().__init__(*args, data=data, **kwargs)
# NOTE: add stretch version load_image for rtdetr mosaic
# NOTE: add stretch version load_image for RTDETR mosaic
def load_image(self, i, rect_mode=False):
"""Loads 1 image from dataset index 'i', returns (im, resized hw)."""
return super().load_image(i=i, rect_mode=rect_mode)
@@ -33,19 +37,26 @@ class RTDETRDataset(YOLODataset):
# transforms = Compose([LetterBox(new_shape=(self.imgsz, self.imgsz), auto=False, scaleFill=True)])
transforms = Compose([])
transforms.append(
Format(bbox_format='xywh',
normalize=True,
return_mask=self.use_segments,
return_keypoint=self.use_keypoints,
batch_idx=True,
mask_ratio=hyp.mask_ratio,
mask_overlap=hyp.overlap_mask))
Format(
bbox_format="xywh",
normalize=True,
return_mask=self.use_segments,
return_keypoint=self.use_keypoints,
batch_idx=True,
mask_ratio=hyp.mask_ratio,
mask_overlap=hyp.overlap_mask,
)
)
return transforms
class RTDETRValidator(DetectionValidator):
"""
A class extending the DetectionValidator class for validation based on an RT-DETR detection model.
RTDETRValidator extends the DetectionValidator class to provide validation capabilities specifically tailored for
the RT-DETR (Real-Time DETR) object detection model.
The class allows building of an RTDETR-specific dataset for validation, applies Non-maximum suppression for
post-processing, and updates evaluation metrics accordingly.
Example:
```python
@@ -55,9 +66,12 @@ class RTDETRValidator(DetectionValidator):
validator = RTDETRValidator(args=args)
validator()
```
Note:
For further details on the attributes and methods, refer to the parent DetectionValidator class.
"""
def build_dataset(self, img_path, mode='val', batch=None):
def build_dataset(self, img_path, mode="val", batch=None):
"""
Build an RTDETR Dataset.
@@ -74,11 +88,15 @@ class RTDETRValidator(DetectionValidator):
hyp=self.args,
rect=False, # no rect
cache=self.args.cache or None,
prefix=colorstr(f'{mode}: '),
data=self.data)
prefix=colorstr(f"{mode}: "),
data=self.data,
)
def postprocess(self, preds):
"""Apply Non-maximum suppression to prediction outputs."""
if not isinstance(preds, (list, tuple)): # list for PyTorch inference but list[0] Tensor for export inference
preds = [preds, None]
bs, _, nd = preds[0].shape
bboxes, scores = preds[0].split((4, nd - 4), dim=-1)
bboxes *= self.args.imgsz
@@ -86,56 +104,32 @@
for i, bbox in enumerate(bboxes): # (300, 4)
bbox = ops.xywh2xyxy(bbox)
score, cls = scores[i].max(-1) # (300, )
# Do not need threshold for evaluation as only got 300 boxes here.
# No threshold needed for evaluation as there are only 300 boxes here
# idx = score > self.args.conf
pred = torch.cat([bbox, score[..., None], cls[..., None]], dim=-1) # filter
# sort by confidence to correctly get internal metrics.
# Sort by confidence to correctly get internal metrics
pred = pred[score.argsort(descending=True)]
outputs[i] = pred # [idx]
return outputs
def update_metrics(self, preds, batch):
"""Metrics."""
for si, pred in enumerate(preds):
idx = batch['batch_idx'] == si
cls = batch['cls'][idx]
bbox = batch['bboxes'][idx]
nl, npr = cls.shape[0], pred.shape[0] # number of labels, predictions
shape = batch['ori_shape'][si]
correct_bboxes = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device) # init
self.seen += 1
def _prepare_batch(self, si, batch):
"""Prepares a batch for training or inference by applying transformations."""
idx = batch["batch_idx"] == si
cls = batch["cls"][idx].squeeze(-1)
bbox = batch["bboxes"][idx]
ori_shape = batch["ori_shape"][si]
imgsz = batch["img"].shape[2:]
ratio_pad = batch["ratio_pad"][si]
if len(cls):
bbox = ops.xywh2xyxy(bbox) # target boxes
bbox[..., [0, 2]] *= ori_shape[1] # native-space labels
bbox[..., [1, 3]] *= ori_shape[0] # native-space labels
return dict(cls=cls, bbox=bbox, ori_shape=ori_shape, imgsz=imgsz, ratio_pad=ratio_pad)
if npr == 0:
if nl:
self.stats.append((correct_bboxes, *torch.zeros((2, 0), device=self.device), cls.squeeze(-1)))
if self.args.plots:
self.confusion_matrix.process_batch(detections=None, labels=cls.squeeze(-1))
continue
# Predictions
if self.args.single_cls:
pred[:, 5] = 0
predn = pred.clone()
predn[..., [0, 2]] *= shape[1] / self.args.imgsz # native-space pred
predn[..., [1, 3]] *= shape[0] / self.args.imgsz # native-space pred
# Evaluate
if nl:
tbox = ops.xywh2xyxy(bbox) # target boxes
tbox[..., [0, 2]] *= shape[1] # native-space pred
tbox[..., [1, 3]] *= shape[0] # native-space pred
labelsn = torch.cat((cls, tbox), 1) # native-space labels
# NOTE: To get correct metrics, the inputs of `_process_batch` should always be float32 type.
correct_bboxes = self._process_batch(predn.float(), labelsn)
# TODO: maybe remove these `self.` arguments as they already are member variable
if self.args.plots:
self.confusion_matrix.process_batch(predn, labelsn)
self.stats.append((correct_bboxes, pred[:, 4], pred[:, 5], cls.squeeze(-1))) # (conf, pcls, tcls)
# Save
if self.args.save_json:
self.pred_to_json(predn, batch['im_file'][si])
if self.args.save_txt:
file = self.save_dir / 'labels' / f'{Path(batch["im_file"][si]).stem}.txt'
self.save_one_txt(predn, self.args.save_conf, shape, file)
def _prepare_pred(self, pred, pbatch):
"""Prepares and returns a batch with transformed bounding boxes and class labels."""
predn = pred.clone()
predn[..., [0, 2]] *= pbatch["ori_shape"][1] / self.args.imgsz # native-space pred
predn[..., [1, 3]] *= pbatch["ori_shape"][0] / self.args.imgsz # native-space pred
return predn.float()
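Finally, a worked sketch of the native-space rescaling in `_prepare_pred`, with made-up numbers (RT-DETR inputs are scale-filled, so there is no letterbox padding to subtract, only a per-axis scale):

```python
import torch

imgsz = 640                   # square inference size
ori_shape = (480, 640)        # (h, w) of the original image

# One prediction in model space: x1, y1, x2, y2, conf, cls
pred = torch.tensor([[320.0, 160.0, 480.0, 320.0, 0.9, 0.0]])

predn = pred.clone()
predn[..., [0, 2]] *= ori_shape[1] / imgsz  # x coords -> native width
predn[..., [1, 3]] *= ori_shape[0] / imgsz  # y coords -> native height
print(predn[0, :4].tolist())                # [320.0, 120.0, 480.0, 240.0]
```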