commit e474ab5f9f
Author: lee
Date: 2025-06-18 14:35:43 +08:00

529 changed files with 80523 additions and 0 deletions


@@ -0,0 +1,15 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from .base import BaseDataset
from .build import build_dataloader, build_yolo_dataset, load_inference_source
from .dataset import ClassificationDataset, SemanticDataset, YOLODataset
__all__ = (
"BaseDataset",
"ClassificationDataset",
"SemanticDataset",
"YOLODataset",
"build_yolo_dataset",
"build_dataloader",
"load_inference_source",
)


@@ -0,0 +1,50 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from pathlib import Path
from ultralytics import SAM, YOLO
def auto_annotate(data, det_model="yolov8x.pt", sam_model="sam_b.pt", device="", output_dir=None):
"""
Automatically annotates images using a YOLO object detection model and a SAM segmentation model.
Args:
data (str): Path to a folder containing images to be annotated.
det_model (str, optional): Pre-trained YOLO detection model. Defaults to 'yolov8x.pt'.
sam_model (str, optional): Pre-trained SAM segmentation model. Defaults to 'sam_b.pt'.
device (str, optional): Device to run the models on. Defaults to an empty string (CPU or GPU, if available).
output_dir (str | None, optional): Directory to save the annotated results.
Defaults to a folder named '{data stem}_auto_annotate_labels' in the parent directory of 'data'.
Example:
```python
from ultralytics.data.annotator import auto_annotate
auto_annotate(data='ultralytics/assets', det_model='yolov8n.pt', sam_model='mobile_sam.pt')
```
"""
det_model = YOLO(det_model)
sam_model = SAM(sam_model)
data = Path(data)
if not output_dir:
output_dir = data.parent / f"{data.stem}_auto_annotate_labels"
Path(output_dir).mkdir(exist_ok=True, parents=True)
det_results = det_model(data, stream=True, device=device)
for result in det_results:
class_ids = result.boxes.cls.int().tolist() # noqa
if len(class_ids):
boxes = result.boxes.xyxy # Boxes object for bbox outputs
sam_results = sam_model(result.orig_img, bboxes=boxes, verbose=False, save=False, device=device)
segments = sam_results[0].masks.xyn # noqa
with open(f"{Path(output_dir) / Path(result.path).stem}.txt", "w") as f:
for i in range(len(segments)):
s = segments[i]
if len(s) == 0:
continue
segment = map(str, segments[i].reshape(-1).tolist())
f.write(f"{class_ids[i]} " + " ".join(segment) + "\n")

ultralytics/data/augment.py (new file, 1254 lines)

File diff suppressed because it is too large.

ultralytics/data/base.py (new file, 311 lines)

@@ -0,0 +1,311 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import glob
import math
import os
import random
from copy import deepcopy
from multiprocessing.pool import ThreadPool
from pathlib import Path
from typing import Optional
import cv2
import numpy as np
import psutil
from torch.utils.data import Dataset
from ultralytics.utils import DEFAULT_CFG, LOCAL_RANK, LOGGER, NUM_THREADS, TQDM
from .utils import HELP_URL, IMG_FORMATS
class BaseDataset(Dataset):
"""
Base dataset class for loading and processing image data.
Args:
img_path (str): Path to the folder containing images.
imgsz (int, optional): Image size. Defaults to 640.
cache (bool, optional): Cache images to RAM or disk during training. Defaults to False.
augment (bool, optional): If True, data augmentation is applied. Defaults to True.
hyp (dict, optional): Hyperparameters to apply data augmentation. Defaults to None.
prefix (str, optional): Prefix to print in log messages. Defaults to ''.
rect (bool, optional): If True, rectangular training is used. Defaults to False.
batch_size (int, optional): Size of batches. Defaults to None.
stride (int, optional): Stride. Defaults to 32.
pad (float, optional): Padding. Defaults to 0.0.
single_cls (bool, optional): If True, single class training is used. Defaults to False.
classes (list): List of included classes. Default is None.
fraction (float): Fraction of dataset to utilize. Default is 1.0 (use all data).
Attributes:
im_files (list): List of image file paths.
labels (list): List of label data dictionaries.
ni (int): Number of images in the dataset.
ims (list): List of loaded images.
npy_files (list): List of numpy file paths.
transforms (callable): Image transformation function.
"""
def __init__(
self,
img_path,
imgsz=640,
cache=False,
augment=True,
hyp=DEFAULT_CFG,
prefix="",
rect=False,
batch_size=16,
stride=32,
pad=0.5,
single_cls=False,
classes=None,
fraction=1.0,
):
"""Initialize BaseDataset with given configuration and options."""
super().__init__()
self.img_path = img_path
self.imgsz = imgsz
self.augment = augment
self.single_cls = single_cls
self.prefix = prefix
self.fraction = fraction
self.im_files = self.get_img_files(self.img_path)
self.labels = self.get_labels()
self.update_labels(include_class=classes) # single_cls and include_class
self.ni = len(self.labels) # number of images
self.rect = rect
self.batch_size = batch_size
self.stride = stride
self.pad = pad
if self.rect:
assert self.batch_size is not None
self.set_rectangle()
# Buffer thread for mosaic images
self.buffer = [] # buffer size = batch size
self.max_buffer_length = min((self.ni, self.batch_size * 8, 1000)) if self.augment else 0
# Cache images
if cache == "ram" and not self.check_cache_ram():
cache = False
self.ims, self.im_hw0, self.im_hw = [None] * self.ni, [None] * self.ni, [None] * self.ni
self.npy_files = [Path(f).with_suffix(".npy") for f in self.im_files]
if cache:
self.cache_images(cache)
# Transforms
self.transforms = self.build_transforms(hyp=hyp)
def get_img_files(self, img_path):
"""Read image files."""
try:
f = [] # image files
for p in img_path if isinstance(img_path, list) else [img_path]:
p = Path(p) # os-agnostic
if p.is_dir(): # dir
f += glob.glob(str(p / "**" / "*.*"), recursive=True)
# F = list(p.rglob('*.*')) # pathlib
elif p.is_file(): # file
with open(p) as t:
t = t.read().strip().splitlines()
parent = str(p.parent) + os.sep
f += [x.replace("./", parent) if x.startswith("./") else x for x in t] # local to global path
# F += [p.parent / x.lstrip(os.sep) for x in t] # local to global path (pathlib)
else:
raise FileNotFoundError(f"{self.prefix}{p} does not exist")
im_files = sorted(x.replace("/", os.sep) for x in f if x.split(".")[-1].lower() in IMG_FORMATS)
# self.img_files = sorted([x for x in f if x.suffix[1:].lower() in IMG_FORMATS]) # pathlib
assert im_files, f"{self.prefix}No images found in {img_path}"
except Exception as e:
raise FileNotFoundError(f"{self.prefix}Error loading data from {img_path}\n{HELP_URL}") from e
if self.fraction < 1:
# im_files = im_files[: round(len(im_files) * self.fraction)]
num_elements_to_select = round(len(im_files) * self.fraction)
im_files = random.sample(im_files, num_elements_to_select)
return im_files
def update_labels(self, include_class: Optional[list]):
"""Update labels to include only these classes (optional)."""
include_class_array = np.array(include_class).reshape(1, -1)
for i in range(len(self.labels)):
if include_class is not None:
cls = self.labels[i]["cls"]
bboxes = self.labels[i]["bboxes"]
segments = self.labels[i]["segments"]
keypoints = self.labels[i]["keypoints"]
j = (cls == include_class_array).any(1)
self.labels[i]["cls"] = cls[j]
self.labels[i]["bboxes"] = bboxes[j]
if segments:
self.labels[i]["segments"] = [segments[si] for si, idx in enumerate(j) if idx]
if keypoints is not None:
self.labels[i]["keypoints"] = keypoints[j]
if self.single_cls:
self.labels[i]["cls"][:, 0] = 0
def load_image(self, i, rect_mode=True):
"""Loads 1 image from dataset index 'i', returns (im, resized hw)."""
im, f, fn = self.ims[i], self.im_files[i], self.npy_files[i]
if im is None: # not cached in RAM
if fn.exists(): # load npy
try:
im = np.load(fn)
except Exception as e:
LOGGER.warning(f"{self.prefix}WARNING ⚠️ Removing corrupt *.npy image file {fn} due to: {e}")
Path(fn).unlink(missing_ok=True)
im = cv2.imread(f) # BGR
else: # read image
im = cv2.imread(f) # BGR
if im is None:
raise FileNotFoundError(f"Image Not Found {f}")
h0, w0 = im.shape[:2] # orig hw
if rect_mode: # resize long side to imgsz while maintaining aspect ratio
r = self.imgsz / max(h0, w0) # ratio
if r != 1: # if sizes are not equal
w, h = (min(math.ceil(w0 * r), self.imgsz), min(math.ceil(h0 * r), self.imgsz))
im = cv2.resize(im, (w, h), interpolation=cv2.INTER_LINEAR)
elif not (h0 == w0 == self.imgsz): # resize by stretching image to square imgsz
im = cv2.resize(im, (self.imgsz, self.imgsz), interpolation=cv2.INTER_LINEAR)
# Add to buffer if training with augmentations
if self.augment:
self.ims[i], self.im_hw0[i], self.im_hw[i] = im, (h0, w0), im.shape[:2] # im, hw_original, hw_resized
self.buffer.append(i)
if len(self.buffer) >= self.max_buffer_length:
j = self.buffer.pop(0)
self.ims[j], self.im_hw0[j], self.im_hw[j] = None, None, None
return im, (h0, w0), im.shape[:2]
return self.ims[i], self.im_hw0[i], self.im_hw[i]
def cache_images(self, cache):
"""Cache images to memory or disk."""
b, gb = 0, 1 << 30 # bytes of cached images, bytes per gigabyte
fcn = self.cache_images_to_disk if cache == "disk" else self.load_image
with ThreadPool(NUM_THREADS) as pool:
results = pool.imap(fcn, range(self.ni))
pbar = TQDM(enumerate(results), total=self.ni, disable=LOCAL_RANK > 0)
for i, x in pbar:
if cache == "disk":
b += self.npy_files[i].stat().st_size
else: # 'ram'
self.ims[i], self.im_hw0[i], self.im_hw[i] = x # im, hw_orig, hw_resized = load_image(self, i)
b += self.ims[i].nbytes
pbar.desc = f"{self.prefix}Caching images ({b / gb:.1f}GB {cache})"
pbar.close()
def cache_images_to_disk(self, i):
"""Saves an image as an *.npy file for faster loading."""
f = self.npy_files[i]
if not f.exists():
np.save(f.as_posix(), cv2.imread(self.im_files[i]), allow_pickle=False)
def check_cache_ram(self, safety_margin=0.5):
"""Check image caching requirements vs available memory."""
b, gb = 0, 1 << 30 # bytes of cached images, bytes per gigabyte
n = min(self.ni, 30) # extrapolate from 30 random images
for _ in range(n):
im = cv2.imread(random.choice(self.im_files)) # sample image
ratio = self.imgsz / max(im.shape[0], im.shape[1]) # max(h, w) # ratio
b += im.nbytes * ratio**2
mem_required = b * self.ni / n * (1 + safety_margin) # bytes required to cache dataset into RAM
mem = psutil.virtual_memory()
cache = mem_required < mem.available # to cache or not to cache, that is the question
if not cache:
LOGGER.info(
f'{self.prefix}{mem_required / gb:.1f}GB RAM required to cache images '
f'with {int(safety_margin * 100)}% safety margin but only '
f'{mem.available / gb:.1f}/{mem.total / gb:.1f}GB available, '
f"{'caching images ✅' if cache else 'not caching images ⚠️'}"
)
return cache
def set_rectangle(self):
"""Sets the shape of bounding boxes for YOLO detections as rectangles."""
bi = np.floor(np.arange(self.ni) / self.batch_size).astype(int) # batch index
nb = bi[-1] + 1 # number of batches
s = np.array([x.pop("shape") for x in self.labels]) # hw
ar = s[:, 0] / s[:, 1] # aspect ratio
irect = ar.argsort()
self.im_files = [self.im_files[i] for i in irect]
self.labels = [self.labels[i] for i in irect]
ar = ar[irect]
# Set training image shapes
shapes = [[1, 1]] * nb
for i in range(nb):
ari = ar[bi == i]
mini, maxi = ari.min(), ari.max()
if maxi < 1:
shapes[i] = [maxi, 1]
elif mini > 1:
shapes[i] = [1, 1 / mini]
self.batch_shapes = np.ceil(np.array(shapes) * self.imgsz / self.stride + self.pad).astype(int) * self.stride
self.batch = bi # batch index of image
def __getitem__(self, index):
"""Returns transformed label information for given index."""
return self.transforms(self.get_image_and_label(index))
def get_image_and_label(self, index):
"""Get and return label information from the dataset."""
label = deepcopy(self.labels[index]) # requires deepcopy() https://github.com/ultralytics/ultralytics/pull/1948
label.pop("shape", None) # shape is for rect, remove it
label["img"], label["ori_shape"], label["resized_shape"] = self.load_image(index)
label["ratio_pad"] = (
label["resized_shape"][0] / label["ori_shape"][0],
label["resized_shape"][1] / label["ori_shape"][1],
) # for evaluation
if self.rect:
label["rect_shape"] = self.batch_shapes[self.batch[index]]
return self.update_labels_info(label)
def __len__(self):
"""Returns the length of the labels list for the dataset."""
return len(self.labels)
def update_labels_info(self, label):
"""Custom your label format here."""
return label
def build_transforms(self, hyp=None):
"""
Users can customize augmentations here.
Example:
```python
if self.augment:
# Training transforms
return Compose([])
else:
# Val transforms
return Compose([])
```
"""
raise NotImplementedError
def get_labels(self):
"""
Users can customize their own format here.
Note:
Ensure output is a dictionary with the following keys:
```python
dict(
im_file=im_file,
shape=shape, # format: (height, width)
cls=cls,
bboxes=bboxes, # xywh
segments=segments, # xy
keypoints=keypoints, # xy
normalized=True, # or False
bbox_format="xyxy", # or xywh, ltwh
)
```
"""
raise NotImplementedError
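A minimal sketch of how the two `NotImplementedError` hooks above are typically filled in by a subclass; the fixed image shape, the placeholder box, and the "my_images/" path are assumptions for illustration only, and `Instances` is imported from `ultralytics.data.augment` as in dataset.py below.

```python
import numpy as np

from ultralytics.data.augment import Compose, Format, Instances, LetterBox
from ultralytics.data.base import BaseDataset


class MinimalDataset(BaseDataset):
    def get_labels(self):
        """Return one label dict per image with the keys described in BaseDataset.get_labels()."""
        labels = []
        for im_file in self.im_files:
            labels.append(
                dict(
                    im_file=im_file,
                    shape=(480, 640),  # (height, width); assumed constant here for simplicity
                    cls=np.zeros((1, 1), dtype=np.float32),  # one placeholder object of class 0
                    bboxes=np.array([[0.5, 0.5, 0.2, 0.2]], dtype=np.float32),  # normalized xywh
                    segments=[],
                    keypoints=None,
                    normalized=True,
                    bbox_format="xywh",
                )
            )
        return labels

    def update_labels_info(self, label):
        """Wrap boxes in an Instances object so downstream transforms can operate on them."""
        label.pop("segments", None)
        label.pop("keypoints", None)
        bboxes = label.pop("bboxes")
        bbox_format = label.pop("bbox_format")
        normalized = label.pop("normalized")
        segments = np.zeros((0, 1000, 2), dtype=np.float32)  # this toy dataset has no segments
        label["instances"] = Instances(bboxes, segments, None, bbox_format=bbox_format, normalized=normalized)
        return label

    def build_transforms(self, hyp=None):
        """Letterbox to a square imgsz and format the sample as tensors."""
        return Compose(
            [
                LetterBox(new_shape=(self.imgsz, self.imgsz), scaleup=False),
                Format(bbox_format="xywh", normalize=True, batch_idx=True),
            ]
        )


# dataset = MinimalDataset(img_path="my_images/", imgsz=640, augment=False)  # hypothetical folder
```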

ultralytics/data/build.py (new file, 186 lines)

@@ -0,0 +1,186 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import os
import random
from pathlib import Path
import numpy as np
import torch
from PIL import Image
from torch.utils.data import dataloader, distributed
from ultralytics.data.loaders import (
LOADERS,
LoadImagesAndVideos,
LoadPilAndNumpy,
LoadScreenshots,
LoadStreams,
LoadTensor,
SourceTypes,
autocast_list,
)
from ultralytics.data.utils import IMG_FORMATS, VID_FORMATS
from ultralytics.utils import RANK, colorstr
from ultralytics.utils.checks import check_file
from .dataset import YOLODataset
from .utils import PIN_MEMORY
class InfiniteDataLoader(dataloader.DataLoader):
"""
Dataloader that reuses workers.
Uses same syntax as vanilla DataLoader.
"""
def __init__(self, *args, **kwargs):
"""Dataloader that infinitely recycles workers, inherits from DataLoader."""
super().__init__(*args, **kwargs)
object.__setattr__(self, "batch_sampler", _RepeatSampler(self.batch_sampler))
self.iterator = super().__iter__()
def __len__(self):
"""Returns the length of the batch sampler's sampler."""
return len(self.batch_sampler.sampler)
def __iter__(self):
"""Creates a sampler that repeats indefinitely."""
for _ in range(len(self)):
yield next(self.iterator)
def reset(self):
"""
Reset iterator.
This is useful when we want to modify settings of dataset while training.
"""
self.iterator = self._get_iterator()
class _RepeatSampler:
"""
Sampler that repeats forever.
Args:
sampler (Dataset.sampler): The sampler to repeat.
"""
def __init__(self, sampler):
"""Initializes an object that repeats a given sampler indefinitely."""
self.sampler = sampler
def __iter__(self):
"""Iterates over the 'sampler' and yields its contents."""
while True:
yield from iter(self.sampler)
def seed_worker(worker_id): # noqa
"""Set dataloader worker seed https://pytorch.org/docs/stable/notes/randomness.html#dataloader."""
worker_seed = torch.initial_seed() % 2**32
np.random.seed(worker_seed)
random.seed(worker_seed)
def build_yolo_dataset(cfg, img_path, batch, data, mode="train", rect=False, stride=32):
"""Build YOLO Dataset."""
return YOLODataset(
img_path=img_path,
imgsz=cfg.imgsz,
batch_size=batch,
augment=mode == "train", # augmentation
hyp=cfg, # TODO: probably add a get_hyps_from_cfg function
rect=cfg.rect or rect, # rectangular batches
cache=cfg.cache or None,
single_cls=cfg.single_cls or False,
stride=int(stride),
pad=0.0 if mode == "train" else 0.5,
prefix=colorstr(f"{mode}: "),
task=cfg.task,
classes=cfg.classes,
data=data,
fraction=cfg.fraction if mode == "train" else 1.0,
)
def build_dataloader(dataset, batch, workers, shuffle=True, rank=-1):
"""Return an InfiniteDataLoader or DataLoader for training or validation set."""
batch = min(batch, len(dataset))
nd = torch.cuda.device_count() # number of CUDA devices
nw = min([os.cpu_count() // max(nd, 1), workers]) # number of workers
sampler = None if rank == -1 else distributed.DistributedSampler(dataset, shuffle=shuffle)
generator = torch.Generator()
generator.manual_seed(6148914691236517205 + RANK)
return InfiniteDataLoader(
dataset=dataset,
batch_size=batch,
shuffle=shuffle and sampler is None,
num_workers=nw,
sampler=sampler,
pin_memory=PIN_MEMORY,
collate_fn=getattr(dataset, "collate_fn", None),
worker_init_fn=seed_worker,
generator=generator,
)
def check_source(source):
"""Check source type and return corresponding flag values."""
webcam, screenshot, from_img, in_memory, tensor = False, False, False, False, False
if isinstance(source, (str, int, Path)): # int for local usb camera
source = str(source)
is_file = Path(source).suffix[1:] in (IMG_FORMATS | VID_FORMATS)
is_url = source.lower().startswith(("https://", "http://", "rtsp://", "rtmp://", "tcp://"))
webcam = source.isnumeric() or source.endswith(".streams") or (is_url and not is_file)
screenshot = source.lower() == "screen"
if is_url and is_file:
source = check_file(source) # download
elif isinstance(source, LOADERS):
in_memory = True
elif isinstance(source, (list, tuple)):
source = autocast_list(source) # convert all list elements to PIL or np arrays
from_img = True
elif isinstance(source, (Image.Image, np.ndarray)):
from_img = True
elif isinstance(source, torch.Tensor):
tensor = True
else:
raise TypeError("Unsupported image type. For supported types see https://docs.ultralytics.com/modes/predict")
return source, webcam, screenshot, from_img, in_memory, tensor
def load_inference_source(source=None, batch=1, vid_stride=1, buffer=False):
"""
Loads an inference source for object detection and applies necessary transformations.
Args:
source (str, Path, Tensor, PIL.Image, np.ndarray): The input source for inference.
batch (int, optional): Batch size for dataloaders. Default is 1.
vid_stride (int, optional): The frame interval for video sources. Default is 1.
buffer (bool, optional): Determines whether stream frames will be buffered. Default is False.
Returns:
dataset (Dataset): A dataset object for the specified input source.
"""
source, stream, screenshot, from_img, in_memory, tensor = check_source(source)
source_type = source.source_type if in_memory else SourceTypes(stream, screenshot, from_img, tensor)
# Dataloader
if tensor:
dataset = LoadTensor(source)
elif in_memory:
dataset = source
elif stream:
dataset = LoadStreams(source, vid_stride=vid_stride, buffer=buffer)
elif screenshot:
dataset = LoadScreenshots(source)
elif from_img:
dataset = LoadPilAndNumpy(source)
else:
dataset = LoadImagesAndVideos(source, batch=batch, vid_stride=vid_stride)
# Attach source types to the dataset
setattr(dataset, "source_type", source_type)
return dataset
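For orientation, a hedged usage sketch of the helpers above; `get_cfg()` and `check_det_dataset()` come from the wider Ultralytics package, and the "coco8.yaml" dataset name and the image URL are illustrative values.

```python
from ultralytics.cfg import get_cfg
from ultralytics.data.build import build_dataloader, build_yolo_dataset, load_inference_source
from ultralytics.data.utils import check_det_dataset

# Inference path: check_source() routes the input to the matching loader class.
source = load_inference_source("https://ultralytics.com/images/zidane.jpg", batch=1)
print(type(source).__name__, source.source_type)

# Training path: a YOLODataset wrapped in an InfiniteDataLoader that reuses its workers.
cfg = get_cfg()  # default hyperparameters (imgsz, rect, cache, task, ...)
data = check_det_dataset("coco8.yaml")  # resolves the dataset YAML to a dict (may download it)
dataset = build_yolo_dataset(cfg, img_path=data["train"], batch=4, data=data, mode="train")
loader = build_dataloader(dataset, batch=4, workers=0, shuffle=True, rank=-1)
```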


@@ -0,0 +1,542 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import json
from collections import defaultdict
from pathlib import Path
import cv2
import numpy as np
from ultralytics.utils import LOGGER, TQDM
from ultralytics.utils.files import increment_path
def coco91_to_coco80_class():
"""
Converts 91-index COCO class IDs to 80-index COCO class IDs.
Returns:
(list): A list of 91 entries where position i corresponds to 91-index class ID i+1 and the value is the
matching 80-index class ID, or None if that class has no 80-index equivalent.
"""
return [
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
None,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
None,
24,
25,
None,
None,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
None,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
None,
60,
None,
None,
61,
None,
62,
63,
64,
65,
66,
67,
68,
69,
70,
71,
72,
None,
73,
74,
75,
76,
77,
78,
79,
None,
]
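A quick illustration of how this lookup table is consumed (as in convert_coco below): a raw COCO `category_id` is 1-based in the 91-class numbering, so it is offset by one before indexing.

```python
from ultralytics.data.converter import coco91_to_coco80_class

coco80 = coco91_to_coco80_class()
category_id = 27                # 'backpack' in the 91-index paper numbering
print(coco80[category_id - 1])  # -> 24, the contiguous 80-index class ID used in YOLO labels
```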
def coco80_to_coco91_class():
"""
Converts 80-index (val2014) to 91-index (paper).
For details see https://tech.amikelive.com/node-718/what-object-categories-labels-are-in-coco-dataset/.
Example:
```python
import numpy as np
a = np.loadtxt('data/coco.names', dtype='str', delimiter='\n')
b = np.loadtxt('data/coco_paper.names', dtype='str', delimiter='\n')
x1 = [list(a[i] == b).index(True) + 1 for i in range(80)] # darknet to coco
x2 = [list(b[i] == a).index(True) if any(b[i] == a) else None for i in range(91)] # coco to darknet
```
"""
return [
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
27,
28,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63,
64,
65,
67,
70,
72,
73,
74,
75,
76,
77,
78,
79,
80,
81,
82,
84,
85,
86,
87,
88,
89,
90,
]
def convert_coco(
labels_dir="../coco/annotations/",
save_dir="coco_converted/",
use_segments=False,
use_keypoints=False,
cls91to80=True,
):
"""
Converts COCO dataset annotations to a YOLO annotation format suitable for training YOLO models.
Args:
labels_dir (str, optional): Path to directory containing COCO dataset annotation files.
save_dir (str, optional): Path to directory to save results to.
use_segments (bool, optional): Whether to include segmentation masks in the output.
use_keypoints (bool, optional): Whether to include keypoint annotations in the output.
cls91to80 (bool, optional): Whether to map 91 COCO class IDs to the corresponding 80 COCO class IDs.
Example:
```python
from ultralytics.data.converter import convert_coco
convert_coco('../datasets/coco/annotations/', use_segments=True, use_keypoints=False, cls91to80=True)
```
Output:
Generates output files in the specified output directory.
"""
# Create dataset directory
save_dir = increment_path(save_dir) # increment if save directory already exists
for p in save_dir / "labels", save_dir / "images":
p.mkdir(parents=True, exist_ok=True) # make dir
# Convert classes
coco80 = coco91_to_coco80_class()
# Import json
for json_file in sorted(Path(labels_dir).resolve().glob("*.json")):
fn = Path(save_dir) / "labels" / json_file.stem.replace("instances_", "") # folder name
fn.mkdir(parents=True, exist_ok=True)
with open(json_file) as f:
data = json.load(f)
# Create image dict
images = {f'{x["id"]:d}': x for x in data["images"]}
# Create image-annotations dict
imgToAnns = defaultdict(list)
for ann in data["annotations"]:
imgToAnns[ann["image_id"]].append(ann)
# Write labels file
for img_id, anns in TQDM(imgToAnns.items(), desc=f"Annotations {json_file}"):
img = images[f"{img_id:d}"]
h, w, f = img["height"], img["width"], img["file_name"]
bboxes = []
segments = []
keypoints = []
for ann in anns:
if ann["iscrowd"]:
continue
# The COCO box format is [top left x, top left y, width, height]
box = np.array(ann["bbox"], dtype=np.float64)
box[:2] += box[2:] / 2 # xy top-left corner to center
box[[0, 2]] /= w # normalize x
box[[1, 3]] /= h # normalize y
if box[2] <= 0 or box[3] <= 0: # if w <= 0 or h <= 0
continue
cls = coco80[ann["category_id"] - 1] if cls91to80 else ann["category_id"] - 1 # class
box = [cls] + box.tolist()
if box not in bboxes:
bboxes.append(box)
if use_segments and ann.get("segmentation") is not None:
if len(ann["segmentation"]) == 0:
segments.append([])
continue
elif len(ann["segmentation"]) > 1:
s = merge_multi_segment(ann["segmentation"])
s = (np.concatenate(s, axis=0) / np.array([w, h])).reshape(-1).tolist()
else:
s = [j for i in ann["segmentation"] for j in i] # all segments concatenated
s = (np.array(s).reshape(-1, 2) / np.array([w, h])).reshape(-1).tolist()
s = [cls] + s
segments.append(s)
if use_keypoints and ann.get("keypoints") is not None:
keypoints.append(
box + (np.array(ann["keypoints"]).reshape(-1, 3) / np.array([w, h, 1])).reshape(-1).tolist()
)
# Write
with open((fn / f).with_suffix(".txt"), "a") as file:
for i in range(len(bboxes)):
if use_keypoints:
line = (*(keypoints[i]),) # cls, box, keypoints
else:
line = (
*(segments[i] if use_segments and len(segments[i]) > 0 else bboxes[i]),
) # cls, box or segments
file.write(("%g " * len(line)).rstrip() % line + "\n")
LOGGER.info(f"COCO data converted successfully.\nResults saved to {save_dir.resolve()}")
def convert_dota_to_yolo_obb(dota_root_path: str):
"""
Converts DOTA dataset annotations to YOLO OBB (Oriented Bounding Box) format.
The function processes images in the 'train' and 'val' folders of the DOTA dataset. For each image, it reads the
associated label from the original labels directory and writes new labels in YOLO OBB format to a new directory.
Args:
dota_root_path (str): The root directory path of the DOTA dataset.
Example:
```python
from ultralytics.data.converter import convert_dota_to_yolo_obb
convert_dota_to_yolo_obb('path/to/DOTA')
```
Notes:
The directory structure assumed for the DOTA dataset:
- DOTA
├─ images
│ ├─ train
│ └─ val
└─ labels
├─ train_original
└─ val_original
After execution, the function will organize the labels into:
- DOTA
└─ labels
├─ train
└─ val
"""
dota_root_path = Path(dota_root_path)
# Class names to indices mapping
class_mapping = {
"plane": 0,
"ship": 1,
"storage-tank": 2,
"baseball-diamond": 3,
"tennis-court": 4,
"basketball-court": 5,
"ground-track-field": 6,
"harbor": 7,
"bridge": 8,
"large-vehicle": 9,
"small-vehicle": 10,
"helicopter": 11,
"roundabout": 12,
"soccer-ball-field": 13,
"swimming-pool": 14,
"container-crane": 15,
"airport": 16,
"helipad": 17,
}
def convert_label(image_name, image_width, image_height, orig_label_dir, save_dir):
"""Converts a single image's DOTA annotation to YOLO OBB format and saves it to a specified directory."""
orig_label_path = orig_label_dir / f"{image_name}.txt"
save_path = save_dir / f"{image_name}.txt"
with orig_label_path.open("r") as f, save_path.open("w") as g:
lines = f.readlines()
for line in lines:
parts = line.strip().split()
if len(parts) < 9:
continue
class_name = parts[8]
class_idx = class_mapping[class_name]
coords = [float(p) for p in parts[:8]]
normalized_coords = [
coords[i] / image_width if i % 2 == 0 else coords[i] / image_height for i in range(8)
]
formatted_coords = ["{:.6g}".format(coord) for coord in normalized_coords]
g.write(f"{class_idx} {' '.join(formatted_coords)}\n")
for phase in ["train", "val"]:
image_dir = dota_root_path / "images" / phase
orig_label_dir = dota_root_path / "labels" / f"{phase}_original"
save_dir = dota_root_path / "labels" / phase
save_dir.mkdir(parents=True, exist_ok=True)
image_paths = list(image_dir.iterdir())
for image_path in TQDM(image_paths, desc=f"Processing {phase} images"):
if image_path.suffix != ".png":
continue
image_name_without_ext = image_path.stem
img = cv2.imread(str(image_path))
h, w = img.shape[:2]
convert_label(image_name_without_ext, w, h, orig_label_dir, save_dir)
def min_index(arr1, arr2):
"""
Find a pair of indexes with the shortest distance between two arrays of 2D points.
Args:
arr1 (np.ndarray): A NumPy array of shape (N, 2) representing N 2D points.
arr2 (np.ndarray): A NumPy array of shape (M, 2) representing M 2D points.
Returns:
(tuple): A tuple containing the indexes of the points with the shortest distance in arr1 and arr2 respectively.
"""
dis = ((arr1[:, None, :] - arr2[None, :, :]) ** 2).sum(-1)
return np.unravel_index(np.argmin(dis, axis=None), dis.shape)
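A tiny sanity check of min_index() on two toy point sets; the coordinate values are made up for illustration.

```python
import numpy as np

from ultralytics.data.converter import min_index

arr1 = np.array([[0.0, 0.0], [1.0, 1.0]])
arr2 = np.array([[1.1, 1.0], [5.0, 5.0]])
print(min_index(arr1, arr2))  # -> (1, 0): arr1[1] and arr2[0] are the closest pair
```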
def merge_multi_segment(segments):
"""
Merge multiple segments into one list by connecting the coordinates with the minimum distance between each segment.
This function connects these coordinates with a thin line to merge all segments into one.
Args:
segments (List[List]): Original segmentations in COCO's JSON file.
Each element is a list of coordinates, like [segmentation1, segmentation2,...].
Returns:
s (List[np.ndarray]): A list of connected segments represented as NumPy arrays.
"""
s = []
segments = [np.array(i).reshape(-1, 2) for i in segments]
idx_list = [[] for _ in range(len(segments))]
# Record the indexes with min distance between each segment
for i in range(1, len(segments)):
idx1, idx2 = min_index(segments[i - 1], segments[i])
idx_list[i - 1].append(idx1)
idx_list[i].append(idx2)
# Use two rounds to connect all the segments
for k in range(2):
# Forward connection
if k == 0:
for i, idx in enumerate(idx_list):
# Middle segments have two indexes, reverse the index of middle segments
if len(idx) == 2 and idx[0] > idx[1]:
idx = idx[::-1]
segments[i] = segments[i][::-1, :]
segments[i] = np.roll(segments[i], -idx[0], axis=0)
segments[i] = np.concatenate([segments[i], segments[i][:1]])
# Deal with the first segment and the last one
if i in [0, len(idx_list) - 1]:
s.append(segments[i])
else:
idx = [0, idx[1] - idx[0]]
s.append(segments[i][idx[0] : idx[1] + 1])
else:
for i in range(len(idx_list) - 1, -1, -1):
if i not in [0, len(idx_list) - 1]:
idx = idx_list[i]
nidx = abs(idx[1] - idx[0])
s.append(segments[i][nidx:])
return s
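And a small worked example of merge_multi_segment() joining two disjoint COCO-style polygons into one contour; the toy squares below are illustrative values.

```python
import numpy as np

from ultralytics.data.converter import merge_multi_segment

seg_a = [0, 0, 2, 0, 2, 2, 0, 2]  # square near the origin (flat x1, y1, x2, y2, ... list)
seg_b = [5, 0, 7, 0, 7, 2, 5, 2]  # square to its right
merged = np.concatenate(merge_multi_segment([seg_a, seg_b]), axis=0)
print(merged.shape)  # one connected (N, 2) contour covering both squares
```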
def yolo_bbox2segment(im_dir, save_dir=None, sam_model="sam_b.pt"):
"""
Converts an existing object detection dataset (bounding boxes) to a segmentation or oriented bounding box (OBB)
dataset in YOLO format. Generates segmentation data using a SAM auto-annotator as needed.
Args:
im_dir (str | Path): Path to the image directory to convert.
save_dir (str | Path): Path to save the generated labels. If None, labels are saved into a `labels-segment`
folder at the same directory level as `im_dir`. Default: None.
sam_model (str): Segmentation model to use for intermediate segmentation data; optional.
Notes:
The input directory structure assumed for dataset:
- im_dir
├─ 001.jpg
├─ ..
└─ NNN.jpg
- labels
├─ 001.txt
├─ ..
└─ NNN.txt
"""
from ultralytics.data import YOLODataset
from ultralytics.utils.ops import xywh2xyxy
from ultralytics.utils import LOGGER
from ultralytics import SAM
from tqdm import tqdm
# NOTE: add placeholder to pass class index check
dataset = YOLODataset(im_dir, data=dict(names=list(range(1000))))
if len(dataset.labels[0]["segments"]) > 0: # if it's segment data
LOGGER.info("Segmentation labels detected, no need to generate new ones!")
return
LOGGER.info("Detection labels detected, generating segment labels by SAM model!")
sam_model = SAM(sam_model)
for l in tqdm(dataset.labels, total=len(dataset.labels), desc="Generating segment labels"):
h, w = l["shape"]
boxes = l["bboxes"]
if len(boxes) == 0: # skip empty labels
continue
boxes[:, [0, 2]] *= w
boxes[:, [1, 3]] *= h
im = cv2.imread(l["im_file"])
sam_results = sam_model(im, bboxes=xywh2xyxy(boxes), verbose=False, save=False)
l["segments"] = sam_results[0].masks.xyn
save_dir = Path(save_dir) if save_dir else Path(im_dir).parent / "labels-segment"
save_dir.mkdir(parents=True, exist_ok=True)
for l in dataset.labels:
texts = []
lb_name = Path(l["im_file"]).with_suffix(".txt").name
txt_file = save_dir / lb_name
cls = l["cls"]
for i, s in enumerate(l["segments"]):
line = (int(cls[i]), *s.reshape(-1))
texts.append(("%g " * len(line)).rstrip() % line)
if texts:
with open(txt_file, "a") as f:
f.writelines(text + "\n" for text in texts)
LOGGER.info(f"Generated segment labels saved in {save_dir}")

ultralytics/data/dataset.py (new file, 383 lines)

@@ -0,0 +1,383 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import contextlib
from itertools import repeat
from multiprocessing.pool import ThreadPool
from pathlib import Path
import cv2
import numpy as np
import torch
import torchvision
from PIL import Image
from ultralytics.utils import LOCAL_RANK, NUM_THREADS, TQDM, colorstr, is_dir_writeable
from ultralytics.utils.ops import resample_segments
from .augment import Compose, Format, Instances, LetterBox, classify_augmentations, classify_transforms, v8_transforms
from .base import BaseDataset
from .utils import HELP_URL, LOGGER, get_hash, img2label_paths, verify_image, verify_image_label
# Ultralytics dataset *.cache version, >= 1.0.0 for YOLOv8
DATASET_CACHE_VERSION = "1.0.3"
class YOLODataset(BaseDataset):
"""
Dataset class for loading object detection and/or segmentation labels in YOLO format.
Args:
data (dict, optional): A dataset YAML dictionary. Defaults to None.
task (str): Explicit task to load labels for: 'detect', 'segment', 'pose' or 'obb'. Defaults to 'detect'.
Returns:
(torch.utils.data.Dataset): A PyTorch dataset object that can be used for training an object detection model.
"""
def __init__(self, *args, data=None, task="detect", **kwargs):
"""Initializes the YOLODataset with optional configurations for segments and keypoints."""
self.use_segments = task == "segment"
self.use_keypoints = task == "pose"
self.use_obb = task == "obb"
self.data = data
assert not (self.use_segments and self.use_keypoints), "Can not use both segments and keypoints."
super().__init__(*args, **kwargs)
def cache_labels(self, path=Path("./labels.cache")):
"""
Cache dataset labels, check images and read shapes.
Args:
path (Path): Path where to save the cache file. Default is Path('./labels.cache').
Returns:
(dict): labels.
"""
x = {"labels": []}
nm, nf, ne, nc, msgs = 0, 0, 0, 0, [] # number missing, found, empty, corrupt, messages
desc = f"{self.prefix}Scanning {path.parent / path.stem}..."
total = len(self.im_files)
nkpt, ndim = self.data.get("kpt_shape", (0, 0))
if self.use_keypoints and (nkpt <= 0 or ndim not in (2, 3)):
raise ValueError(
"'kpt_shape' in data.yaml missing or incorrect. Should be a list with [number of "
"keypoints, number of dims (2 for x,y or 3 for x,y,visible)], i.e. 'kpt_shape: [17, 3]'"
)
with ThreadPool(NUM_THREADS) as pool:
results = pool.imap(
func=verify_image_label,
iterable=zip(
self.im_files,
self.label_files,
repeat(self.prefix),
repeat(self.use_keypoints),
repeat(len(self.data["names"])),
repeat(nkpt),
repeat(ndim),
),
)
pbar = TQDM(results, desc=desc, total=total)
for im_file, lb, shape, segments, keypoint, nm_f, nf_f, ne_f, nc_f, msg in pbar:
nm += nm_f
nf += nf_f
ne += ne_f
nc += nc_f
if im_file:
x["labels"].append(
dict(
im_file=im_file,
shape=shape,
cls=lb[:, 0:1], # n, 1
bboxes=lb[:, 1:], # n, 4
segments=segments,
keypoints=keypoint,
normalized=True,
bbox_format="xywh",
)
)
if msg:
msgs.append(msg)
pbar.desc = f"{desc} {nf} images, {nm + ne} backgrounds, {nc} corrupt"
pbar.close()
if msgs:
LOGGER.info("\n".join(msgs))
if nf == 0:
LOGGER.warning(f"{self.prefix}WARNING ⚠️ No labels found in {path}. {HELP_URL}")
x["hash"] = get_hash(self.label_files + self.im_files)
x["results"] = nf, nm, ne, nc, len(self.im_files)
x["msgs"] = msgs # warnings
save_dataset_cache_file(self.prefix, path, x)
return x
def get_labels(self):
"""Returns dictionary of labels for YOLO training."""
self.label_files = img2label_paths(self.im_files)
cache_path = Path(self.label_files[0]).parent.with_suffix(".cache")
try:
cache, exists = load_dataset_cache_file(cache_path), True # attempt to load a *.cache file
assert cache["version"] == DATASET_CACHE_VERSION # matches current version
assert cache["hash"] == get_hash(self.label_files + self.im_files) # identical hash
except (FileNotFoundError, AssertionError, AttributeError):
cache, exists = self.cache_labels(cache_path), False # run cache ops
# Display cache
nf, nm, ne, nc, n = cache.pop("results") # found, missing, empty, corrupt, total
if exists and LOCAL_RANK in (-1, 0):
d = f"Scanning {cache_path}... {nf} images, {nm + ne} backgrounds, {nc} corrupt"
TQDM(None, desc=self.prefix + d, total=n, initial=n) # display results
if cache["msgs"]:
LOGGER.info("\n".join(cache["msgs"])) # display warnings
# Read cache
[cache.pop(k) for k in ("hash", "version", "msgs")] # remove items
labels = cache["labels"]
if not labels:
LOGGER.warning(f"WARNING ⚠️ No images found in {cache_path}, training may not work correctly. {HELP_URL}")
self.im_files = [lb["im_file"] for lb in labels] # update im_files
# Check if the dataset is all boxes or all segments
lengths = ((len(lb["cls"]), len(lb["bboxes"]), len(lb["segments"])) for lb in labels)
len_cls, len_boxes, len_segments = (sum(x) for x in zip(*lengths))
if len_segments and len_boxes != len_segments:
LOGGER.warning(
f"WARNING ⚠️ Box and segment counts should be equal, but got len(segments) = {len_segments}, "
f"len(boxes) = {len_boxes}. To resolve this only boxes will be used and all segments will be removed. "
"To avoid this please supply either a detect or segment dataset, not a detect-segment mixed dataset."
)
for lb in labels:
lb["segments"] = []
if len_cls == 0:
LOGGER.warning(f"WARNING ⚠️ No labels found in {cache_path}, training may not work correctly. {HELP_URL}")
return labels
def build_transforms(self, hyp=None):
"""Builds and appends transforms to the list."""
if self.augment:
hyp.mosaic = hyp.mosaic if self.augment and not self.rect else 0.0
hyp.mixup = hyp.mixup if self.augment and not self.rect else 0.0
transforms = v8_transforms(self, self.imgsz, hyp)
else:
transforms = Compose([LetterBox(new_shape=(self.imgsz, self.imgsz), scaleup=False)])
transforms.append(
Format(
bbox_format="xywh",
normalize=True,
return_mask=self.use_segments,
return_keypoint=self.use_keypoints,
return_obb=self.use_obb,
batch_idx=True,
mask_ratio=hyp.mask_ratio,
mask_overlap=hyp.overlap_mask,
bgr=hyp.bgr if self.augment else 0.0, # only affect training.
)
)
return transforms
def close_mosaic(self, hyp):
"""Sets mosaic, copy_paste and mixup options to 0.0 and builds transformations."""
hyp.mosaic = 0.0 # set mosaic ratio=0.0
hyp.copy_paste = 0.0 # keep the same behavior as previous v8 close-mosaic
hyp.mixup = 0.0 # keep the same behavior as previous v8 close-mosaic
self.transforms = self.build_transforms(hyp)
def update_labels_info(self, label):
"""
Customize your label format here.
Note:
cls is no longer stored together with bboxes; classification and semantic segmentation need an independent cls label.
Classification and semantic segmentation can also be supported by adding or removing dict keys here.
"""
bboxes = label.pop("bboxes")
segments = label.pop("segments", [])
keypoints = label.pop("keypoints", None)
bbox_format = label.pop("bbox_format")
normalized = label.pop("normalized")
# NOTE: do NOT resample oriented boxes
segment_resamples = 100 if self.use_obb else 1000
if len(segments) > 0:
# list[np.array(1000, 2)] * num_samples
# (N, 1000, 2)
segments = np.stack(resample_segments(segments, n=segment_resamples), axis=0)
else:
segments = np.zeros((0, segment_resamples, 2), dtype=np.float32)
label["instances"] = Instances(bboxes, segments, keypoints, bbox_format=bbox_format, normalized=normalized)
return label
@staticmethod
def collate_fn(batch):
"""Collates data samples into batches."""
new_batch = {}
keys = batch[0].keys()
values = list(zip(*[list(b.values()) for b in batch]))
for i, k in enumerate(keys):
value = values[i]
if k == "img":
value = torch.stack(value, 0)
if k in ["masks", "keypoints", "bboxes", "cls", "segments", "obb"]:
value = torch.cat(value, 0)
new_batch[k] = value
new_batch["batch_idx"] = list(new_batch["batch_idx"])
for i in range(len(new_batch["batch_idx"])):
new_batch["batch_idx"][i] += i # add target image index for build_targets()
new_batch["batch_idx"] = torch.cat(new_batch["batch_idx"], 0)
return new_batch
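A toy illustration of what collate_fn() produces, in particular how batch_idx ends up assigning every box to its image; the tensor shapes below are assumptions chosen for brevity.

```python
import torch

from ultralytics.data.dataset import YOLODataset

sample0 = {"img": torch.zeros(3, 4, 4), "cls": torch.zeros(3, 1), "bboxes": torch.rand(3, 4), "batch_idx": torch.zeros(3)}
sample1 = {"img": torch.zeros(3, 4, 4), "cls": torch.zeros(2, 1), "bboxes": torch.rand(2, 4), "batch_idx": torch.zeros(2)}
batch = YOLODataset.collate_fn([sample0, sample1])
print(batch["img"].shape)  # torch.Size([2, 3, 4, 4])  (stacked images)
print(batch["batch_idx"])  # tensor([0., 0., 0., 1., 1.])  -> image index of every box
```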
# Classification dataloaders -------------------------------------------------------------------------------------------
class ClassificationDataset(torchvision.datasets.ImageFolder):
"""
Extends torchvision ImageFolder to support YOLO classification tasks, offering functionalities like image
augmentation, caching, and verification. It's designed to efficiently handle large datasets for training deep
learning models, with optional image transformations and caching mechanisms to speed up training.
This class allows for augmentations using both torchvision and Albumentations libraries, and supports caching images
in RAM or on disk to reduce IO overhead during training. Additionally, it implements a robust verification process
to ensure data integrity and consistency.
Attributes:
cache_ram (bool): Indicates if caching in RAM is enabled.
cache_disk (bool): Indicates if caching on disk is enabled.
samples (list): A list of tuples, each containing the path to an image, its class index, path to its .npy cache
file (if caching on disk), and optionally the loaded image array (if caching in RAM).
torch_transforms (callable): PyTorch transforms to be applied to the images.
"""
def __init__(self, root, args, augment=False, prefix=""):
"""
Initialize ClassificationDataset with root directory, arguments, augmentations, and cache settings.
Args:
root (str): Path to the dataset directory where images are stored in a class-specific folder structure.
args (Namespace): Configuration containing dataset-related settings such as image size, augmentation
parameters, and cache settings. It includes attributes like `imgsz` (image size), `fraction` (fraction
of data to use), `scale`, `fliplr`, `flipud`, `cache` (disk or RAM caching for faster training),
`auto_augment`, `hsv_h`, `hsv_s`, `hsv_v`, and `crop_fraction`.
augment (bool, optional): Whether to apply augmentations to the dataset. Default is False.
prefix (str, optional): Prefix for logging and cache filenames, aiding in dataset identification and
debugging. Default is an empty string.
"""
super().__init__(root=root)
if augment and args.fraction < 1.0: # reduce training fraction
self.samples = self.samples[: round(len(self.samples) * args.fraction)]
self.prefix = colorstr(f"{prefix}: ") if prefix else ""
self.cache_ram = args.cache is True or args.cache == "ram" # cache images into RAM
self.cache_disk = args.cache == "disk" # cache images on hard drive as uncompressed *.npy files
self.samples = self.verify_images() # filter out bad images
self.samples = [list(x) + [Path(x[0]).with_suffix(".npy"), None] for x in self.samples] # file, index, npy, im
scale = (1.0 - args.scale, 1.0) # (0.08, 1.0)
self.torch_transforms = (
classify_augmentations(
size=args.imgsz,
scale=scale,
hflip=args.fliplr,
vflip=args.flipud,
erasing=args.erasing,
auto_augment=args.auto_augment,
hsv_h=args.hsv_h,
hsv_s=args.hsv_s,
hsv_v=args.hsv_v,
)
if augment
else classify_transforms(size=args.imgsz, crop_fraction=args.crop_fraction)
)
def __getitem__(self, i):
"""Returns subset of data and targets corresponding to given indices."""
f, j, fn, im = self.samples[i] # filename, index, filename.with_suffix('.npy'), image
if self.cache_ram and im is None:
im = self.samples[i][3] = cv2.imread(f)
elif self.cache_disk:
if not fn.exists(): # save *.npy cache if missing
np.save(fn.as_posix(), cv2.imread(f), allow_pickle=False)
im = np.load(fn)
else: # read image
im = cv2.imread(f) # BGR
# Convert NumPy array to PIL image
im = Image.fromarray(cv2.cvtColor(im, cv2.COLOR_BGR2RGB))
sample = self.torch_transforms(im)
return {"img": sample, "cls": j}
def __len__(self) -> int:
"""Return the total number of samples in the dataset."""
return len(self.samples)
def verify_images(self):
"""Verify all images in dataset."""
desc = f"{self.prefix}Scanning {self.root}..."
path = Path(self.root).with_suffix(".cache") # *.cache file path
with contextlib.suppress(FileNotFoundError, AssertionError, AttributeError):
cache = load_dataset_cache_file(path) # attempt to load a *.cache file
assert cache["version"] == DATASET_CACHE_VERSION # matches current version
assert cache["hash"] == get_hash([x[0] for x in self.samples]) # identical hash
nf, nc, n, samples = cache.pop("results") # found, corrupt, total, samples
if LOCAL_RANK in (-1, 0):
d = f"{desc} {nf} images, {nc} corrupt"
TQDM(None, desc=d, total=n, initial=n)
if cache["msgs"]:
LOGGER.info("\n".join(cache["msgs"])) # display warnings
return samples
# Run scan if *.cache retrieval failed
nf, nc, msgs, samples, x = 0, 0, [], [], {}
with ThreadPool(NUM_THREADS) as pool:
results = pool.imap(func=verify_image, iterable=zip(self.samples, repeat(self.prefix)))
pbar = TQDM(results, desc=desc, total=len(self.samples))
for sample, nf_f, nc_f, msg in pbar:
if nf_f:
samples.append(sample)
if msg:
msgs.append(msg)
nf += nf_f
nc += nc_f
pbar.desc = f"{desc} {nf} images, {nc} corrupt"
pbar.close()
if msgs:
LOGGER.info("\n".join(msgs))
x["hash"] = get_hash([x[0] for x in self.samples])
x["results"] = nf, nc, len(samples), samples
x["msgs"] = msgs # warnings
save_dataset_cache_file(self.prefix, path, x)
return samples
def load_dataset_cache_file(path):
"""Load an Ultralytics *.cache dictionary from path."""
import gc
gc.disable() # reduce pickle load time https://github.com/ultralytics/ultralytics/pull/1585
cache = np.load(str(path), allow_pickle=True).item() # load dict
gc.enable()
return cache
def save_dataset_cache_file(prefix, path, x):
"""Save an Ultralytics dataset *.cache dictionary x to path."""
x["version"] = DATASET_CACHE_VERSION # add cache version
if is_dir_writeable(path.parent):
if path.exists():
path.unlink() # remove *.cache file if exists
np.save(str(path), x) # save cache for next time
path.with_suffix(".cache.npy").rename(path) # remove .npy suffix
LOGGER.info(f"{prefix}New cache created: {path}")
else:
LOGGER.warning(f"{prefix}WARNING ⚠️ Cache directory {path.parent} is not writeable, cache not saved.")
# TODO: support semantic segmentation
class SemanticDataset(BaseDataset):
"""
Semantic Segmentation Dataset.
This class is responsible for handling datasets used for semantic segmentation tasks. It inherits functionalities
from the BaseDataset class.
Note:
This class is currently a placeholder and needs to be populated with methods and attributes for supporting
semantic segmentation tasks.
"""
def __init__(self):
"""Initialize a SemanticDataset object."""
super().__init__()
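To round off this file, a hedged sketch of using ClassificationDataset directly; get_cfg() is used here only to supply the many args attributes the constructor reads (imgsz, fraction, scale, cache, ...), and the folder path is an assumption in torchvision ImageFolder layout (root/class_x/*.jpg).

```python
from torch.utils.data import DataLoader

from ultralytics.cfg import get_cfg
from ultralytics.data.dataset import ClassificationDataset

args = get_cfg()  # default config namespace with imgsz, fraction, scale, fliplr, flipud, cache, ...
dataset = ClassificationDataset(root="datasets/my_cls/train", args=args, augment=True, prefix="train")
loader = DataLoader(dataset, batch_size=16, shuffle=True)
batch = next(iter(loader))  # {"img": FloatTensor[B, 3, imgsz, imgsz], "cls": LongTensor[B]}
```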


@@ -0,0 +1,5 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from .utils import plot_query_result
__all__ = ["plot_query_result"]


@@ -0,0 +1,472 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from io import BytesIO
from pathlib import Path
from typing import Any, List, Tuple, Union
import cv2
import numpy as np
import torch
from PIL import Image
from matplotlib import pyplot as plt
from pandas import DataFrame
from tqdm import tqdm
from ultralytics.data.augment import Format
from ultralytics.data.dataset import YOLODataset
from ultralytics.data.utils import check_det_dataset
from ultralytics.models.yolo.model import YOLO
from ultralytics.utils import LOGGER, IterableSimpleNamespace, checks, USER_CONFIG_DIR
from .utils import get_sim_index_schema, get_table_schema, plot_query_result, prompt_sql_query, sanitize_batch
class ExplorerDataset(YOLODataset):
def __init__(self, *args, data: dict = None, **kwargs) -> None:
super().__init__(*args, data=data, **kwargs)
def load_image(self, i: int) -> Union[Tuple[np.ndarray, Tuple[int, int], Tuple[int, int]], Tuple[None, None, None]]:
"""Loads 1 image from dataset index 'i' without any resize ops."""
im, f, fn = self.ims[i], self.im_files[i], self.npy_files[i]
if im is None: # not cached in RAM
if fn.exists(): # load npy
im = np.load(fn)
else: # read image
im = cv2.imread(f) # BGR
if im is None:
raise FileNotFoundError(f"Image Not Found {f}")
h0, w0 = im.shape[:2] # orig hw
return im, (h0, w0), im.shape[:2]
return self.ims[i], self.im_hw0[i], self.im_hw[i]
def build_transforms(self, hyp: IterableSimpleNamespace = None):
"""Creates transforms for dataset images without resizing."""
return Format(
bbox_format="xyxy",
normalize=False,
return_mask=self.use_segments,
return_keypoint=self.use_keypoints,
batch_idx=True,
mask_ratio=hyp.mask_ratio,
mask_overlap=hyp.overlap_mask,
)
class Explorer:
def __init__(
self,
data: Union[str, Path] = "coco128.yaml",
model: str = "yolov8n.pt",
uri: str = USER_CONFIG_DIR / "explorer",
) -> None:
# Note duckdb==0.10.0 bug https://github.com/ultralytics/ultralytics/pull/8181
checks.check_requirements(["lancedb>=0.4.3", "duckdb<=0.9.2"])
import lancedb
self.connection = lancedb.connect(uri)
self.table_name = Path(data).name.lower() + "_" + model.lower()
self.sim_idx_base_name = (
f"{self.table_name}_sim_idx".lower()
) # Use this name and append thres and top_k to reuse the table
self.model = YOLO(model)
self.data = data # None
self.choice_set = None
self.table = None
self.progress = 0
def create_embeddings_table(self, force: bool = False, split: str = "train") -> None:
"""
Create LanceDB table containing the embeddings of the images in the dataset. The table will be reused if it
already exists. Pass force=True to overwrite the existing table.
Args:
force (bool): Whether to overwrite the existing table or not. Defaults to False.
split (str): Split of the dataset to use. Defaults to 'train'.
Example:
```python
exp = Explorer()
exp.create_embeddings_table()
```
"""
if self.table is not None and not force:
LOGGER.info("Table already exists. Reusing it. Pass force=True to overwrite it.")
return
if self.table_name in self.connection.table_names() and not force:
LOGGER.info(f"Table {self.table_name} already exists. Reusing it. Pass force=True to overwrite it.")
self.table = self.connection.open_table(self.table_name)
self.progress = 1
return
if self.data is None:
raise ValueError("Data must be provided to create embeddings table")
data_info = check_det_dataset(self.data)
if split not in data_info:
raise ValueError(
f"Split {split} is not found in the dataset. Available keys in the dataset are {list(data_info.keys())}"
)
choice_set = data_info[split]
choice_set = choice_set if isinstance(choice_set, list) else [choice_set]
self.choice_set = choice_set
dataset = ExplorerDataset(img_path=choice_set, data=data_info, augment=False, cache=False, task=self.model.task)
# Create the table schema
batch = dataset[0]
vector_size = self.model.embed(batch["im_file"], verbose=False)[0].shape[0]
table = self.connection.create_table(self.table_name, schema=get_table_schema(vector_size), mode="overwrite")
table.add(
self._yield_batches(
dataset,
data_info,
self.model,
exclude_keys=["img", "ratio_pad", "resized_shape", "ori_shape", "batch_idx"],
)
)
self.table = table
def _yield_batches(self, dataset: ExplorerDataset, data_info: dict, model: YOLO, exclude_keys: List[str]):
"""Generates batches of data for embedding, excluding specified keys."""
for i in tqdm(range(len(dataset))):
self.progress = float(i + 1) / len(dataset)
batch = dataset[i]
for k in exclude_keys:
batch.pop(k, None)
batch = sanitize_batch(batch, data_info)
batch["vector"] = model.embed(batch["im_file"], verbose=False)[0].detach().tolist()
yield [batch]
def query(
self, imgs: Union[str, np.ndarray, List[str], List[np.ndarray]] = None, limit: int = 25
) -> Any: # pyarrow.Table
"""
Query the table for similar images. Accepts a single image or a list of images.
Args:
imgs (str or list): Path to the image or a list of paths to the images.
limit (int): Number of results to return.
Returns:
(pyarrow.Table): An arrow table containing the results. Supports converting to:
- pandas dataframe: `result.to_pandas()`
- dict of lists: `result.to_pydict()`
Example:
```python
exp = Explorer()
exp.create_embeddings_table()
similar = exp.query(imgs='https://ultralytics.com/images/zidane.jpg')
```
"""
if self.table is None:
raise ValueError("Table is not created. Please create the table first.")
if isinstance(imgs, str):
imgs = [imgs]
assert isinstance(imgs, list), f"img must be a string or a list of strings. Got {type(imgs)}"
embeds = self.model.embed(imgs)
# Get avg if multiple images are passed (len > 1)
embeds = torch.mean(torch.stack(embeds), 0).cpu().numpy() if len(embeds) > 1 else embeds[0].cpu().numpy()
return self.table.search(embeds).limit(limit).to_arrow()
def sql_query(
self, query: str, return_type: str = "pandas"
) -> Union[DataFrame, Any, None]: # pandas.dataframe or pyarrow.Table
"""
Run a SQL-Like query on the table. Utilizes LanceDB predicate pushdown.
Args:
query (str): SQL query to run.
return_type (str): Type of the result to return. Can be either 'pandas' or 'arrow'. Defaults to 'pandas'.
Returns:
(pyarrow.Table): An arrow table containing the results.
Example:
```python
exp = Explorer()
exp.create_embeddings_table()
query = "SELECT * FROM 'table' WHERE labels LIKE '%person%'"
result = exp.sql_query(query)
```
"""
assert return_type in {
"pandas",
"arrow",
}, f"Return type should be either `pandas` or `arrow`, but got {return_type}"
import duckdb
if self.table is None:
raise ValueError("Table is not created. Please create the table first.")
# Note: using filter pushdown would be a better long term solution. Temporarily using duckdb for this.
table = self.table.to_arrow() # noqa NOTE: Don't comment this. This line is used by DuckDB
if not query.startswith("SELECT") and not query.startswith("WHERE"):
raise ValueError(
f"Query must start with SELECT or WHERE. You can either pass the entire query or just the WHERE clause. found {query}"
)
if query.startswith("WHERE"):
query = f"SELECT * FROM 'table' {query}"
LOGGER.info(f"Running query: {query}")
rs = duckdb.sql(query)
if return_type == "arrow":
return rs.arrow()
elif return_type == "pandas":
return rs.df()
def plot_sql_query(self, query: str, labels: bool = True) -> Image.Image:
"""
Plot the results of a SQL-Like query on the table.
Args:
query (str): SQL query to run.
labels (bool): Whether to plot the labels or not.
Returns:
(PIL.Image): Image containing the plot.
Example:
```python
exp = Explorer()
exp.create_embeddings_table()
query = "SELECT * FROM 'table' WHERE labels LIKE '%person%'"
result = exp.plot_sql_query(query)
```
"""
result = self.sql_query(query, return_type="arrow")
if len(result) == 0:
LOGGER.info("No results found.")
return None
img = plot_query_result(result, plot_labels=labels)
return Image.fromarray(img)
def get_similar(
self,
img: Union[str, np.ndarray, List[str], List[np.ndarray]] = None,
idx: Union[int, List[int]] = None,
limit: int = 25,
return_type: str = "pandas",
) -> Union[DataFrame, Any]: # pandas.dataframe or pyarrow.Table
"""
Query the table for similar images. Accepts a single image or a list of images.
Args:
img (str or list): Path to the image or a list of paths to the images.
idx (int or list): Index of the image in the table or a list of indexes.
limit (int): Number of results to return. Defaults to 25.
return_type (str): Type of the result to return. Can be either 'pandas' or 'arrow'. Defaults to 'pandas'.
Returns:
(pandas.DataFrame): A dataframe containing the results.
Example:
```python
exp = Explorer()
exp.create_embeddings_table()
similar = exp.get_similar(img='https://ultralytics.com/images/zidane.jpg')
```
"""
assert return_type in {
"pandas",
"arrow",
}, f"Return type should be either `pandas` or `arrow`, but got {return_type}"
img = self._check_imgs_or_idxs(img, idx)
similar = self.query(img, limit=limit)
if return_type == "arrow":
return similar
elif return_type == "pandas":
return similar.to_pandas()
def plot_similar(
self,
img: Union[str, np.ndarray, List[str], List[np.ndarray]] = None,
idx: Union[int, List[int]] = None,
limit: int = 25,
labels: bool = True,
) -> Image.Image:
"""
Plot the similar images. Accepts images or indexes.
Args:
img (str or list): Path to the image or a list of paths to the images.
idx (int or list): Index of the image in the table or a list of indexes.
labels (bool): Whether to plot the labels or not.
limit (int): Number of results to return. Defaults to 25.
Returns:
(PIL.Image): Image containing the plot.
Example:
```python
exp = Explorer()
exp.create_embeddings_table()
similar = exp.plot_similar(img='https://ultralytics.com/images/zidane.jpg')
```
"""
similar = self.get_similar(img, idx, limit, return_type="arrow")
if len(similar) == 0:
LOGGER.info("No results found.")
return None
img = plot_query_result(similar, plot_labels=labels)
return Image.fromarray(img)
def similarity_index(self, max_dist: float = 0.2, top_k: float = None, force: bool = False) -> DataFrame:
"""
Calculate the similarity index of all the images in the table. Here, the index will contain the data points that
are max_dist or closer to the image in the embedding space at a given index.
Args:
max_dist (float): maximum L2 distance between the embeddings to consider. Defaults to 0.2.
top_k (float): Percentage of the closest data points to consider when counting. Used to apply limit when running
vector search. Defaults: None.
force (bool): Whether to overwrite the existing similarity index or not. Defaults to False.
Returns:
(pandas.DataFrame): A dataframe containing the similarity index. Each row corresponds to an image, and columns
include indices of similar images and their respective distances.
Example:
```python
exp = Explorer()
exp.create_embeddings_table()
sim_idx = exp.similarity_index()
```
"""
if self.table is None:
raise ValueError("Table is not created. Please create the table first.")
sim_idx_table_name = f"{self.sim_idx_base_name}_thres_{max_dist}_top_{top_k}".lower()
if sim_idx_table_name in self.connection.table_names() and not force:
LOGGER.info("Similarity matrix already exists. Reusing it. Pass force=True to overwrite it.")
return self.connection.open_table(sim_idx_table_name).to_pandas()
if top_k and not (1.0 >= top_k >= 0.0):
raise ValueError(f"top_k must be between 0.0 and 1.0. Got {top_k}")
if max_dist < 0.0:
raise ValueError(f"max_dist must be greater than 0. Got {max_dist}")
top_k = int(top_k * len(self.table)) if top_k else len(self.table)
top_k = max(top_k, 1)
features = self.table.to_lance().to_table(columns=["vector", "im_file"]).to_pydict()
im_files = features["im_file"]
embeddings = features["vector"]
sim_table = self.connection.create_table(sim_idx_table_name, schema=get_sim_index_schema(), mode="overwrite")
def _yield_sim_idx():
"""Generates a dataframe with similarity indices and distances for images."""
for i in tqdm(range(len(embeddings))):
sim_idx = self.table.search(embeddings[i]).limit(top_k).to_pandas().query(f"_distance <= {max_dist}")
yield [
{
"idx": i,
"im_file": im_files[i],
"count": len(sim_idx),
"sim_im_files": sim_idx["im_file"].tolist(),
}
]
sim_table.add(_yield_sim_idx())
self.sim_index = sim_table
return sim_table.to_pandas()
def plot_similarity_index(self, max_dist: float = 0.2, top_k: float = None, force: bool = False) -> Image:
"""
        Plot the similarity index of all the images in the table. For each image, the index contains the data points
        whose embeddings lie within max_dist of that image's embedding.
Args:
max_dist (float): maximum L2 distance between the embeddings to consider. Defaults to 0.2.
            top_k (float): Percentage of closest data points to consider when counting. Used to apply a limit when
                running the vector search. Defaults to None.
            force (bool): Whether to overwrite the existing similarity index or not. Defaults to False.
Returns:
(PIL.Image): Image containing the plot.
Example:
```python
exp = Explorer()
exp.create_embeddings_table()
similarity_idx_plot = exp.plot_similarity_index()
similarity_idx_plot.show() # view image preview
similarity_idx_plot.save('path/to/save/similarity_index_plot.png') # save contents to file
```
"""
sim_idx = self.similarity_index(max_dist=max_dist, top_k=top_k, force=force)
sim_count = sim_idx["count"].tolist()
sim_count = np.array(sim_count)
indices = np.arange(len(sim_count))
# Create the bar plot
plt.bar(indices, sim_count)
# Customize the plot (optional)
plt.xlabel("data idx")
plt.ylabel("Count")
plt.title("Similarity Count")
buffer = BytesIO()
plt.savefig(buffer, format="png")
buffer.seek(0)
# Use Pillow to open the image from the buffer
return Image.fromarray(np.array(Image.open(buffer)))
def _check_imgs_or_idxs(
self, img: Union[str, np.ndarray, List[str], List[np.ndarray], None], idx: Union[None, int, List[int]]
) -> List[np.ndarray]:
if img is None and idx is None:
raise ValueError("Either img or idx must be provided.")
if img is not None and idx is not None:
raise ValueError("Only one of img or idx must be provided.")
if idx is not None:
idx = idx if isinstance(idx, list) else [idx]
img = self.table.to_lance().take(idx, columns=["im_file"]).to_pydict()["im_file"]
return img if isinstance(img, list) else [img]
def ask_ai(self, query):
"""
Ask AI a question.
Args:
query (str): Question to ask.
Returns:
(pandas.DataFrame): A dataframe containing filtered results to the SQL query.
Example:
```python
exp = Explorer()
exp.create_embeddings_table()
answer = exp.ask_ai('Show images with 1 person and 2 dogs')
```
"""
result = prompt_sql_query(query)
try:
df = self.sql_query(result)
except Exception as e:
LOGGER.error("AI generated query is not valid. Please try again with a different prompt")
LOGGER.error(e)
return None
return df
def visualize(self, result):
"""
Visualize the results of a query. TODO.
Args:
result (pyarrow.Table): Table containing the results of a query.
"""
pass
def generate_report(self, result):
"""
Generate a report of the dataset.
TODO
"""
pass
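# Hedged usage sketch (not part of the library): a minimal end-to-end Explorer workflow built only from the methods
# defined above. The default dataset/model and the example image URL are assumptions; adjust them for your environment.
if __name__ == "__main__":
    exp = Explorer()
    exp.create_embeddings_table()
    # Vector-similarity search, returned as a pandas DataFrame
    df = exp.get_similar(img="https://ultralytics.com/images/zidane.jpg", limit=10, return_type="pandas")
    print(df.head())
    # Per-image count of neighbors within an L2 distance of 0.2 in embedding space
    sim_idx = exp.similarity_index(max_dist=0.2)
    print(sim_idx["count"].describe())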

View File

@ -0,0 +1 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

View File

@ -0,0 +1,268 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import time
from threading import Thread
import pandas as pd
from ultralytics import Explorer
from ultralytics.utils import ROOT, SETTINGS
from ultralytics.utils.checks import check_requirements
check_requirements(("streamlit>=1.29.0", "streamlit-select>=0.3"))
import streamlit as st
from streamlit_select import image_select
def _get_explorer():
"""Initializes and returns an instance of the Explorer class."""
exp = Explorer(data=st.session_state.get("dataset"), model=st.session_state.get("model"))
thread = Thread(
target=exp.create_embeddings_table, kwargs={"force": st.session_state.get("force_recreate_embeddings")}
)
thread.start()
progress_bar = st.progress(0, text="Creating embeddings table...")
while exp.progress < 1:
time.sleep(0.1)
progress_bar.progress(exp.progress, text=f"Progress: {exp.progress * 100}%")
thread.join()
st.session_state["explorer"] = exp
progress_bar.empty()
def init_explorer_form():
"""Initializes an Explorer instance and creates embeddings table with progress tracking."""
datasets = ROOT / "cfg" / "datasets"
ds = [d.name for d in datasets.glob("*.yaml")]
models = [
"yolov8n.pt",
"yolov8s.pt",
"yolov8m.pt",
"yolov8l.pt",
"yolov8x.pt",
"yolov8n-seg.pt",
"yolov8s-seg.pt",
"yolov8m-seg.pt",
"yolov8l-seg.pt",
"yolov8x-seg.pt",
"yolov8n-pose.pt",
"yolov8s-pose.pt",
"yolov8m-pose.pt",
"yolov8l-pose.pt",
"yolov8x-pose.pt",
]
with st.form(key="explorer_init_form"):
col1, col2 = st.columns(2)
with col1:
st.selectbox("Select dataset", ds, key="dataset", index=ds.index("coco128.yaml"))
with col2:
st.selectbox("Select model", models, key="model")
st.checkbox("Force recreate embeddings", key="force_recreate_embeddings")
st.form_submit_button("Explore", on_click=_get_explorer)
def query_form():
"""Sets up a form in Streamlit to initialize Explorer with dataset and model selection."""
with st.form("query_form"):
col1, col2 = st.columns([0.8, 0.2])
with col1:
st.text_input(
"Query",
"WHERE labels LIKE '%person%' AND labels LIKE '%dog%'",
label_visibility="collapsed",
key="query",
)
with col2:
st.form_submit_button("Query", on_click=run_sql_query)
def ai_query_form():
"""Sets up a Streamlit form for user input to initialize Explorer with dataset and model selection."""
with st.form("ai_query_form"):
col1, col2 = st.columns([0.8, 0.2])
with col1:
st.text_input("Query", "Show images with 1 person and 1 dog", label_visibility="collapsed", key="ai_query")
with col2:
st.form_submit_button("Ask AI", on_click=run_ai_query)
def find_similar_imgs(imgs):
"""Initializes a Streamlit form for AI-based image querying with custom input."""
exp = st.session_state["explorer"]
similar = exp.get_similar(img=imgs, limit=st.session_state.get("limit"), return_type="arrow")
paths = similar.to_pydict()["im_file"]
st.session_state["imgs"] = paths
st.session_state["res"] = similar
def similarity_form(selected_imgs):
"""Initializes a form for AI-based image querying with custom input in Streamlit."""
st.write("Similarity Search")
with st.form("similarity_form"):
subcol1, subcol2 = st.columns([1, 1])
with subcol1:
st.number_input(
"limit", min_value=None, max_value=None, value=25, label_visibility="collapsed", key="limit"
)
with subcol2:
disabled = not len(selected_imgs)
st.write("Selected: ", len(selected_imgs))
st.form_submit_button(
"Search",
disabled=disabled,
on_click=find_similar_imgs,
args=(selected_imgs,),
)
if disabled:
st.error("Select at least one image to search.")
# def persist_reset_form():
# with st.form("persist_reset"):
# col1, col2 = st.columns([1, 1])
# with col1:
# st.form_submit_button("Reset", on_click=reset)
#
# with col2:
# st.form_submit_button("Persist", on_click=update_state, args=("PERSISTING", True))
def run_sql_query():
"""Executes an SQL query and returns the results."""
st.session_state["error"] = None
query = st.session_state.get("query")
    if query.strip():
exp = st.session_state["explorer"]
res = exp.sql_query(query, return_type="arrow")
st.session_state["imgs"] = res.to_pydict()["im_file"]
st.session_state["res"] = res
def run_ai_query():
"""Execute SQL query and update session state with query results."""
if not SETTINGS["openai_api_key"]:
st.session_state["error"] = (
'OpenAI API key not found in settings. Please run yolo settings openai_api_key="..."'
)
return
st.session_state["error"] = None
query = st.session_state.get("ai_query")
    if query.strip():
exp = st.session_state["explorer"]
res = exp.ask_ai(query)
if not isinstance(res, pd.DataFrame) or res.empty:
st.session_state["error"] = "No results found using AI generated query. Try another query or rerun it."
return
st.session_state["imgs"] = res["im_file"].to_list()
st.session_state["res"] = res
def reset_explorer():
"""Resets the explorer to its initial state by clearing session variables."""
st.session_state["explorer"] = None
st.session_state["imgs"] = None
st.session_state["error"] = None
def ultralytics_explorer_docs_callback():
    """Displays the Ultralytics Explorer API logo, a short description, and a link to the API docs."""
with st.container(border=True):
st.image(
"https://raw.githubusercontent.com/ultralytics/assets/main/logo/Ultralytics_Logotype_Original.svg",
width=100,
)
st.markdown(
"<p>This demo is built using Ultralytics Explorer API. Visit <a href='https://docs.ultralytics.com/datasets/explorer/'>API docs</a> to try examples & learn more</p>",
unsafe_allow_html=True,
help=None,
)
st.link_button("Ultrlaytics Explorer API", "https://docs.ultralytics.com/datasets/explorer/")
def layout():
"""Resets explorer session variables and provides documentation with a link to API docs."""
st.set_page_config(layout="wide", initial_sidebar_state="collapsed")
st.markdown("<h1 style='text-align: center;'>Ultralytics Explorer Demo</h1>", unsafe_allow_html=True)
if st.session_state.get("explorer") is None:
init_explorer_form()
return
st.button(":arrow_backward: Select Dataset", on_click=reset_explorer)
exp = st.session_state.get("explorer")
col1, col2 = st.columns([0.75, 0.25], gap="small")
imgs = []
if st.session_state.get("error"):
st.error(st.session_state["error"])
else:
if st.session_state.get("imgs"):
imgs = st.session_state.get("imgs")
else:
imgs = exp.table.to_lance().to_table(columns=["im_file"]).to_pydict()["im_file"]
st.session_state["res"] = exp.table.to_arrow()
total_imgs, selected_imgs = len(imgs), []
with col1:
subcol1, subcol2, subcol3, subcol4, subcol5 = st.columns(5)
with subcol1:
st.write("Max Images Displayed:")
with subcol2:
num = st.number_input(
"Max Images Displayed",
min_value=0,
max_value=total_imgs,
value=min(500, total_imgs),
key="num_imgs_displayed",
label_visibility="collapsed",
)
with subcol3:
st.write("Start Index:")
with subcol4:
start_idx = st.number_input(
"Start Index",
min_value=0,
max_value=total_imgs,
value=0,
key="start_index",
label_visibility="collapsed",
)
with subcol5:
reset = st.button("Reset", use_container_width=False, key="reset")
if reset:
st.session_state["imgs"] = None
st.experimental_rerun()
query_form()
ai_query_form()
if total_imgs:
labels, boxes, masks, kpts, classes = None, None, None, None, None
task = exp.model.task
if st.session_state.get("display_labels"):
labels = st.session_state.get("res").to_pydict()["labels"][start_idx : start_idx + num]
boxes = st.session_state.get("res").to_pydict()["bboxes"][start_idx : start_idx + num]
masks = st.session_state.get("res").to_pydict()["masks"][start_idx : start_idx + num]
kpts = st.session_state.get("res").to_pydict()["keypoints"][start_idx : start_idx + num]
classes = st.session_state.get("res").to_pydict()["cls"][start_idx : start_idx + num]
imgs_displayed = imgs[start_idx : start_idx + num]
selected_imgs = image_select(
f"Total samples: {total_imgs}",
images=imgs_displayed,
use_container_width=False,
# indices=[i for i in range(num)] if select_all else None,
labels=labels,
classes=classes,
bboxes=boxes,
masks=masks if task == "segment" else None,
kpts=kpts if task == "pose" else None,
)
with col2:
similarity_form(selected_imgs)
display_labels = st.checkbox("Labels", value=False, key="display_labels")
        ultralytics_explorer_docs_callback()
if __name__ == "__main__":
layout()
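# Note: this module is a Streamlit app, so it is normally launched with the Streamlit CLI rather than plain Python,
# e.g. `streamlit run <path to this dash.py file>` (the path is a placeholder for your installation). Depending on
# the installed Ultralytics version, a `yolo explorer` entry point may also start this dashboard.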

View File

@ -0,0 +1,166 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import getpass
from typing import List
import cv2
import numpy as np
import pandas as pd
from ultralytics.data.augment import LetterBox
from ultralytics.utils import LOGGER as logger
from ultralytics.utils import SETTINGS
from ultralytics.utils.checks import check_requirements
from ultralytics.utils.ops import xyxy2xywh
from ultralytics.utils.plotting import plot_images
def get_table_schema(vector_size):
"""Extracts and returns the schema of a database table."""
from lancedb.pydantic import LanceModel, Vector
class Schema(LanceModel):
im_file: str
labels: List[str]
cls: List[int]
bboxes: List[List[float]]
masks: List[List[List[int]]]
keypoints: List[List[List[float]]]
vector: Vector(vector_size)
return Schema
def get_sim_index_schema():
"""Returns a LanceModel schema for a database table with specified vector size."""
from lancedb.pydantic import LanceModel
class Schema(LanceModel):
idx: int
im_file: str
count: int
sim_im_files: List[str]
return Schema
def sanitize_batch(batch, dataset_info):
"""Sanitizes input batch for inference, ensuring correct format and dimensions."""
batch["cls"] = batch["cls"].flatten().int().tolist()
box_cls_pair = sorted(zip(batch["bboxes"].tolist(), batch["cls"]), key=lambda x: x[1])
batch["bboxes"] = [box for box, _ in box_cls_pair]
batch["cls"] = [cls for _, cls in box_cls_pair]
batch["labels"] = [dataset_info["names"][i] for i in batch["cls"]]
batch["masks"] = batch["masks"].tolist() if "masks" in batch else [[[]]]
batch["keypoints"] = batch["keypoints"].tolist() if "keypoints" in batch else [[[]]]
return batch
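def _sanitize_batch_example():
    """Illustrative sketch only (not part of the Explorer API): shows the input/output contract of sanitize_batch().
    The tensors and class names below are made-up assumptions."""
    import torch

    batch = {
        "cls": torch.tensor([[1.0], [0.0]]),  # raw class ids, float tensor of shape (n, 1)
        "bboxes": torch.tensor([[0.1, 0.1, 0.4, 0.4], [0.5, 0.5, 0.9, 0.9]]),  # normalized boxes
    }
    dataset_info = {"names": {0: "person", 1: "dog"}}
    # After sanitizing: cls/bboxes become plain lists sorted by class, labels become class names,
    # and masks/keypoints fall back to empty nested lists because they are absent here
    return sanitize_batch(batch, dataset_info)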
def plot_query_result(similar_set, plot_labels=True):
"""
Plot images from the similar set.
Args:
similar_set (list): Pyarrow or pandas object containing the similar data points
plot_labels (bool): Whether to plot labels or not
"""
similar_set = (
similar_set.to_dict(orient="list") if isinstance(similar_set, pd.DataFrame) else similar_set.to_pydict()
)
empty_masks = [[[]]]
empty_boxes = [[]]
images = similar_set.get("im_file", [])
bboxes = similar_set.get("bboxes", []) if similar_set.get("bboxes") is not empty_boxes else []
masks = similar_set.get("masks") if similar_set.get("masks")[0] != empty_masks else []
kpts = similar_set.get("keypoints") if similar_set.get("keypoints")[0] != empty_masks else []
cls = similar_set.get("cls", [])
plot_size = 640
imgs, batch_idx, plot_boxes, plot_masks, plot_kpts = [], [], [], [], []
for i, imf in enumerate(images):
im = cv2.imread(imf)
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
h, w = im.shape[:2]
r = min(plot_size / h, plot_size / w)
imgs.append(LetterBox(plot_size, center=False)(image=im).transpose(2, 0, 1))
if plot_labels:
if len(bboxes) > i and len(bboxes[i]) > 0:
box = np.array(bboxes[i], dtype=np.float32)
box[:, [0, 2]] *= r
box[:, [1, 3]] *= r
plot_boxes.append(box)
if len(masks) > i and len(masks[i]) > 0:
mask = np.array(masks[i], dtype=np.uint8)[0]
plot_masks.append(LetterBox(plot_size, center=False)(image=mask))
if len(kpts) > i and kpts[i] is not None:
kpt = np.array(kpts[i], dtype=np.float32)
kpt[:, :, :2] *= r
plot_kpts.append(kpt)
batch_idx.append(np.ones(len(np.array(bboxes[i], dtype=np.float32))) * i)
imgs = np.stack(imgs, axis=0)
masks = np.stack(plot_masks, axis=0) if plot_masks else np.zeros(0, dtype=np.uint8)
kpts = np.concatenate(plot_kpts, axis=0) if plot_kpts else np.zeros((0, 51), dtype=np.float32)
boxes = xyxy2xywh(np.concatenate(plot_boxes, axis=0)) if plot_boxes else np.zeros(0, dtype=np.float32)
batch_idx = np.concatenate(batch_idx, axis=0)
cls = np.concatenate([np.array(c, dtype=np.int32) for c in cls], axis=0)
return plot_images(
imgs, batch_idx, cls, bboxes=boxes, masks=masks, kpts=kpts, max_subplots=len(images), save=False, threaded=False
)
def prompt_sql_query(query):
"""Plots images with optional labels from a similar data set."""
check_requirements("openai>=1.6.1")
from openai import OpenAI
if not SETTINGS["openai_api_key"]:
logger.warning("OpenAI API key not found in settings. Please enter your API key below.")
openai_api_key = getpass.getpass("OpenAI API key: ")
SETTINGS.update({"openai_api_key": openai_api_key})
openai = OpenAI(api_key=SETTINGS["openai_api_key"])
messages = [
{
"role": "system",
"content": """
You are a helpful data scientist proficient in SQL. You need to output exactly one SQL query based on
the following schema and a user request. You only need to output the format with fixed selection
statement that selects everything from "'table'", like `SELECT * from 'table'`
Schema:
im_file: string not null
labels: list<item: string> not null
child 0, item: string
cls: list<item: int64> not null
child 0, item: int64
bboxes: list<item: list<item: double>> not null
child 0, item: list<item: double>
child 0, item: double
masks: list<item: list<item: list<item: int64>>> not null
child 0, item: list<item: list<item: int64>>
child 0, item: list<item: int64>
child 0, item: int64
keypoints: list<item: list<item: list<item: double>>> not null
child 0, item: list<item: list<item: double>>
child 0, item: list<item: double>
child 0, item: double
vector: fixed_size_list<item: float>[256] not null
child 0, item: float
Some details about the schema:
- the "labels" column contains the string values like 'person' and 'dog' for the respective objects
in each image
- the "cls" column contains the integer values on these classes that map them the labels
Example of a correct query:
request - Get all data points that contain 2 or more people and at least one dog
correct query-
SELECT * FROM 'table' WHERE ARRAY_LENGTH(cls) >= 2 AND ARRAY_LENGTH(FILTER(labels, x -> x = 'person')) >= 2 AND ARRAY_LENGTH(FILTER(labels, x -> x = 'dog')) >= 1;
""",
},
{"role": "user", "content": f"{query}"},
]
response = openai.chat.completions.create(model="gpt-3.5-turbo", messages=messages)
return response.choices[0].message.content
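# A minimal sketch, assuming the `lancedb` package is available, of how the schema helpers above can back a table.
# The database path and table name are hypothetical; the Explorer class normally manages this connection itself.
def _create_embeddings_table_example(vector_size=256, uri="~/.cache/lancedb_example"):
    """Illustrative only: create an empty LanceDB table using the Explorer embeddings schema."""
    import lancedb

    db = lancedb.connect(uri)
    return db.create_table("example_embeddings", schema=get_table_schema(vector_size), mode="overwrite")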

555
ultralytics/data/loaders.py Normal file
View File

@ -0,0 +1,555 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import glob
import math
import os
import time
from dataclasses import dataclass
from pathlib import Path
from threading import Thread
from urllib.parse import urlparse
import cv2
import numpy as np
import requests
import torch
from PIL import Image
from ultralytics.data.utils import IMG_FORMATS, VID_FORMATS
from ultralytics.utils import LOGGER, is_colab, is_kaggle, ops
from ultralytics.utils.checks import check_requirements
@dataclass
class SourceTypes:
"""Class to represent various types of input sources for predictions."""
stream: bool = False
screenshot: bool = False
from_img: bool = False
tensor: bool = False
class LoadStreams:
"""
    Stream Loader for various types of video streams. Supports RTSP, RTMP, HTTP, and TCP streams.
Attributes:
sources (str): The source input paths or URLs for the video streams.
vid_stride (int): Video frame-rate stride, defaults to 1.
buffer (bool): Whether to buffer input streams, defaults to False.
running (bool): Flag to indicate if the streaming thread is running.
mode (str): Set to 'stream' indicating real-time capture.
imgs (list): List of image frames for each stream.
fps (list): List of FPS for each stream.
frames (list): List of total frames for each stream.
threads (list): List of threads for each stream.
shape (list): List of shapes for each stream.
caps (list): List of cv2.VideoCapture objects for each stream.
bs (int): Batch size for processing.
Methods:
__init__: Initialize the stream loader.
update: Read stream frames in daemon thread.
close: Close stream loader and release resources.
__iter__: Returns an iterator object for the class.
__next__: Returns source paths, transformed, and original images for processing.
__len__: Return the length of the sources object.
Example:
```bash
yolo predict source='rtsp://example.com/media.mp4'
```
"""
def __init__(self, sources="file.streams", vid_stride=1, buffer=False):
"""Initialize instance variables and check for consistent input stream shapes."""
torch.backends.cudnn.benchmark = True # faster for fixed-size inference
self.buffer = buffer # buffer input streams
self.running = True # running flag for Thread
self.mode = "stream"
self.vid_stride = vid_stride # video frame-rate stride
sources = Path(sources).read_text().rsplit() if os.path.isfile(sources) else [sources]
n = len(sources)
self.bs = n
self.fps = [0] * n # frames per second
self.frames = [0] * n
self.threads = [None] * n
self.caps = [None] * n # video capture objects
self.imgs = [[] for _ in range(n)] # images
self.shape = [[] for _ in range(n)] # image shapes
self.sources = [ops.clean_str(x) for x in sources] # clean source names for later
for i, s in enumerate(sources): # index, source
# Start thread to read frames from video stream
st = f"{i + 1}/{n}: {s}... "
if urlparse(s).hostname in ("www.youtube.com", "youtube.com", "youtu.be"): # if source is YouTube video
# YouTube format i.e. 'https://www.youtube.com/watch?v=Zgi9g1ksQHc' or 'https://youtu.be/LNwODJXcvt4'
s = get_best_youtube_url(s)
s = eval(s) if s.isnumeric() else s # i.e. s = '0' local webcam
if s == 0 and (is_colab() or is_kaggle()):
raise NotImplementedError(
"'source=0' webcam not supported in Colab and Kaggle notebooks. "
"Try running 'source=0' in a local environment."
)
self.caps[i] = cv2.VideoCapture(s) # store video capture object
if not self.caps[i].isOpened():
raise ConnectionError(f"{st}Failed to open {s}")
w = int(self.caps[i].get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(self.caps[i].get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = self.caps[i].get(cv2.CAP_PROP_FPS) # warning: may return 0 or nan
self.frames[i] = max(int(self.caps[i].get(cv2.CAP_PROP_FRAME_COUNT)), 0) or float(
"inf"
) # infinite stream fallback
self.fps[i] = max((fps if math.isfinite(fps) else 0) % 100, 0) or 30 # 30 FPS fallback
success, im = self.caps[i].read() # guarantee first frame
if not success or im is None:
raise ConnectionError(f"{st}Failed to read images from {s}")
self.imgs[i].append(im)
self.shape[i] = im.shape
self.threads[i] = Thread(target=self.update, args=([i, self.caps[i], s]), daemon=True)
LOGGER.info(f"{st}Success ✅ ({self.frames[i]} frames of shape {w}x{h} at {self.fps[i]:.2f} FPS)")
self.threads[i].start()
LOGGER.info("") # newline
def update(self, i, cap, stream):
"""Read stream `i` frames in daemon thread."""
n, f = 0, self.frames[i] # frame number, frame array
while self.running and cap.isOpened() and n < (f - 1):
if len(self.imgs[i]) < 30: # keep a <=30-image buffer
n += 1
cap.grab() # .read() = .grab() followed by .retrieve()
if n % self.vid_stride == 0:
success, im = cap.retrieve()
if not success:
im = np.zeros(self.shape[i], dtype=np.uint8)
LOGGER.warning("WARNING ⚠️ Video stream unresponsive, please check your IP camera connection.")
cap.open(stream) # re-open stream if signal was lost
if self.buffer:
self.imgs[i].append(im)
else:
self.imgs[i] = [im]
else:
time.sleep(0.01) # wait until the buffer is empty
def close(self):
"""Close stream loader and release resources."""
self.running = False # stop flag for Thread
for thread in self.threads:
if thread.is_alive():
thread.join(timeout=5) # Add timeout
for cap in self.caps: # Iterate through the stored VideoCapture objects
try:
cap.release() # release video capture
except Exception as e:
LOGGER.warning(f"WARNING ⚠️ Could not release VideoCapture object: {e}")
cv2.destroyAllWindows()
def __iter__(self):
"""Iterates through YOLO image feed and re-opens unresponsive streams."""
self.count = -1
return self
def __next__(self):
"""Returns source paths, transformed and original images for processing."""
self.count += 1
images = []
for i, x in enumerate(self.imgs):
# Wait until a frame is available in each buffer
while not x:
if not self.threads[i].is_alive() or cv2.waitKey(1) == ord("q"): # q to quit
self.close()
raise StopIteration
time.sleep(1 / min(self.fps))
x = self.imgs[i]
if not x:
LOGGER.warning(f"WARNING ⚠️ Waiting for stream {i}")
# Get and remove the first frame from imgs buffer
if self.buffer:
images.append(x.pop(0))
# Get the last frame, and clear the rest from the imgs buffer
else:
images.append(x.pop(-1) if x else np.zeros(self.shape[i], dtype=np.uint8))
x.clear()
return self.sources, images, [""] * self.bs
def __len__(self):
"""Return the length of the sources object."""
return self.bs # 1E12 frames = 32 streams at 30 FPS for 30 years
class LoadScreenshots:
"""
YOLOv8 screenshot dataloader.
This class manages the loading of screenshot images for processing with YOLOv8.
Suitable for use with `yolo predict source=screen`.
Attributes:
source (str): The source input indicating which screen to capture.
screen (int): The screen number to capture.
left (int): The left coordinate for screen capture area.
top (int): The top coordinate for screen capture area.
width (int): The width of the screen capture area.
height (int): The height of the screen capture area.
mode (str): Set to 'stream' indicating real-time capture.
frame (int): Counter for captured frames.
sct (mss.mss): Screen capture object from `mss` library.
bs (int): Batch size, set to 1.
monitor (dict): Monitor configuration details.
Methods:
__iter__: Returns an iterator object.
__next__: Captures the next screenshot and returns it.
"""
def __init__(self, source):
"""Source = [screen_number left top width height] (pixels)."""
check_requirements("mss")
import mss # noqa
source, *params = source.split()
self.screen, left, top, width, height = 0, None, None, None, None # default to full screen 0
if len(params) == 1:
self.screen = int(params[0])
elif len(params) == 4:
left, top, width, height = (int(x) for x in params)
elif len(params) == 5:
self.screen, left, top, width, height = (int(x) for x in params)
self.mode = "stream"
self.frame = 0
self.sct = mss.mss()
self.bs = 1
self.fps = 30
# Parse monitor shape
monitor = self.sct.monitors[self.screen]
self.top = monitor["top"] if top is None else (monitor["top"] + top)
self.left = monitor["left"] if left is None else (monitor["left"] + left)
self.width = width or monitor["width"]
self.height = height or monitor["height"]
self.monitor = {"left": self.left, "top": self.top, "width": self.width, "height": self.height}
def __iter__(self):
"""Returns an iterator of the object."""
return self
def __next__(self):
"""mss screen capture: get raw pixels from the screen as np array."""
im0 = np.asarray(self.sct.grab(self.monitor))[:, :, :3] # BGRA to BGR
s = f"screen {self.screen} (LTWH): {self.left},{self.top},{self.width},{self.height}: "
self.frame += 1
return [str(self.screen)], [im0], [s] # screen, img, string
class LoadImagesAndVideos:
"""
YOLOv8 image/video dataloader.
This class manages the loading and pre-processing of image and video data for YOLOv8. It supports loading from
various formats, including single image files, video files, and lists of image and video paths.
Attributes:
files (list): List of image and video file paths.
nf (int): Total number of files (images and videos).
video_flag (list): Flags indicating whether a file is a video (True) or an image (False).
mode (str): Current mode, 'image' or 'video'.
vid_stride (int): Stride for video frame-rate, defaults to 1.
bs (int): Batch size, set to 1 for this class.
cap (cv2.VideoCapture): Video capture object for OpenCV.
frame (int): Frame counter for video.
frames (int): Total number of frames in the video.
count (int): Counter for iteration, initialized at 0 during `__iter__()`.
Methods:
_new_video(path): Create a new cv2.VideoCapture object for a given video path.
"""
def __init__(self, path, batch=1, vid_stride=1):
"""Initialize the Dataloader and raise FileNotFoundError if file not found."""
parent = None
if isinstance(path, str) and Path(path).suffix == ".txt": # *.txt file with img/vid/dir on each line
parent = Path(path).parent
path = Path(path).read_text().splitlines() # list of sources
files = []
for p in sorted(path) if isinstance(path, (list, tuple)) else [path]:
a = str(Path(p).absolute()) # do not use .resolve() https://github.com/ultralytics/ultralytics/issues/2912
if "*" in a:
files.extend(sorted(glob.glob(a, recursive=True))) # glob
elif os.path.isdir(a):
files.extend(sorted(glob.glob(os.path.join(a, "*.*")))) # dir
elif os.path.isfile(a):
files.append(a) # files (absolute or relative to CWD)
elif parent and (parent / p).is_file():
files.append(str((parent / p).absolute())) # files (relative to *.txt file parent)
else:
raise FileNotFoundError(f"{p} does not exist")
images = [x for x in files if x.split(".")[-1].lower() in IMG_FORMATS]
videos = [x for x in files if x.split(".")[-1].lower() in VID_FORMATS]
ni, nv = len(images), len(videos)
self.files = images + videos
self.nf = ni + nv # number of files
self.ni = ni # number of images
self.video_flag = [False] * ni + [True] * nv
self.mode = "image"
self.vid_stride = vid_stride # video frame-rate stride
self.bs = batch
if any(videos):
self._new_video(videos[0]) # new video
else:
self.cap = None
if self.nf == 0:
raise FileNotFoundError(
f"No images or videos found in {p}. "
f"Supported formats are:\nimages: {IMG_FORMATS}\nvideos: {VID_FORMATS}"
)
def __iter__(self):
"""Returns an iterator object for VideoStream or ImageFolder."""
self.count = 0
return self
def __next__(self):
"""Returns the next batch of images or video frames along with their paths and metadata."""
paths, imgs, info = [], [], []
while len(imgs) < self.bs:
if self.count >= self.nf: # end of file list
if len(imgs) > 0:
return paths, imgs, info # return last partial batch
else:
raise StopIteration
path = self.files[self.count]
if self.video_flag[self.count]:
self.mode = "video"
if not self.cap or not self.cap.isOpened():
self._new_video(path)
for _ in range(self.vid_stride):
success = self.cap.grab()
if not success:
break # end of video or failure
if success:
success, im0 = self.cap.retrieve()
if success:
self.frame += 1
paths.append(path)
imgs.append(im0)
info.append(f"video {self.count + 1}/{self.nf} (frame {self.frame}/{self.frames}) {path}: ")
if self.frame == self.frames: # end of video
self.count += 1
self.cap.release()
else:
# Move to the next file if the current video ended or failed to open
self.count += 1
if self.cap:
self.cap.release()
if self.count < self.nf:
self._new_video(self.files[self.count])
else:
self.mode = "image"
im0 = cv2.imread(path) # BGR
if im0 is None:
raise FileNotFoundError(f"Image Not Found {path}")
paths.append(path)
imgs.append(im0)
info.append(f"image {self.count + 1}/{self.nf} {path}: ")
self.count += 1 # move to the next file
if self.count >= self.ni: # end of image list
break
return paths, imgs, info
def _new_video(self, path):
"""Creates a new video capture object for the given path."""
self.frame = 0
self.cap = cv2.VideoCapture(path)
self.fps = int(self.cap.get(cv2.CAP_PROP_FPS))
if not self.cap.isOpened():
raise FileNotFoundError(f"Failed to open video {path}")
self.frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT) / self.vid_stride)
def __len__(self):
"""Returns the number of batches in the object."""
return math.ceil(self.nf / self.bs) # number of files
class LoadPilAndNumpy:
"""
Load images from PIL and Numpy arrays for batch processing.
This class is designed to manage loading and pre-processing of image data from both PIL and Numpy formats.
It performs basic validation and format conversion to ensure that the images are in the required format for
downstream processing.
Attributes:
paths (list): List of image paths or autogenerated filenames.
im0 (list): List of images stored as Numpy arrays.
mode (str): Type of data being processed, defaults to 'image'.
bs (int): Batch size, equivalent to the length of `im0`.
Methods:
_single_check(im): Validate and format a single image to a Numpy array.
"""
def __init__(self, im0):
"""Initialize PIL and Numpy Dataloader."""
if not isinstance(im0, list):
im0 = [im0]
self.paths = [getattr(im, "filename", f"image{i}.jpg") for i, im in enumerate(im0)]
self.im0 = [self._single_check(im) for im in im0]
self.mode = "image"
self.bs = len(self.im0)
@staticmethod
def _single_check(im):
"""Validate and format an image to numpy array."""
assert isinstance(im, (Image.Image, np.ndarray)), f"Expected PIL/np.ndarray image type, but got {type(im)}"
if isinstance(im, Image.Image):
if im.mode != "RGB":
im = im.convert("RGB")
im = np.asarray(im)[:, :, ::-1]
im = np.ascontiguousarray(im) # contiguous
return im
def __len__(self):
"""Returns the length of the 'im0' attribute."""
return len(self.im0)
def __next__(self):
"""Returns batch paths, images, processed images, None, ''."""
if self.count == 1: # loop only once as it's batch inference
raise StopIteration
self.count += 1
return self.paths, self.im0, [""] * self.bs
def __iter__(self):
"""Enables iteration for class LoadPilAndNumpy."""
self.count = 0
return self
class LoadTensor:
"""
Load images from torch.Tensor data.
This class manages the loading and pre-processing of image data from PyTorch tensors for further processing.
Attributes:
im0 (torch.Tensor): The input tensor containing the image(s).
bs (int): Batch size, inferred from the shape of `im0`.
mode (str): Current mode, set to 'image'.
paths (list): List of image paths or filenames.
count (int): Counter for iteration, initialized at 0 during `__iter__()`.
Methods:
_single_check(im, stride): Validate and possibly modify the input tensor.
"""
def __init__(self, im0) -> None:
"""Initialize Tensor Dataloader."""
self.im0 = self._single_check(im0)
self.bs = self.im0.shape[0]
self.mode = "image"
self.paths = [getattr(im, "filename", f"image{i}.jpg") for i, im in enumerate(im0)]
@staticmethod
def _single_check(im, stride=32):
"""Validate and format an image to torch.Tensor."""
s = (
f"WARNING ⚠️ torch.Tensor inputs should be BCHW i.e. shape(1, 3, 640, 640) "
f"divisible by stride {stride}. Input shape{tuple(im.shape)} is incompatible."
)
if len(im.shape) != 4:
if len(im.shape) != 3:
raise ValueError(s)
LOGGER.warning(s)
im = im.unsqueeze(0)
if im.shape[2] % stride or im.shape[3] % stride:
raise ValueError(s)
if im.max() > 1.0 + torch.finfo(im.dtype).eps: # torch.float32 eps is 1.2e-07
LOGGER.warning(
f"WARNING ⚠️ torch.Tensor inputs should be normalized 0.0-1.0 but max value is {im.max()}. "
f"Dividing input by 255."
)
im = im.float() / 255.0
return im
def __iter__(self):
"""Returns an iterator object."""
self.count = 0
return self
def __next__(self):
"""Return next item in the iterator."""
if self.count == 1:
raise StopIteration
self.count += 1
return self.paths, self.im0, [""] * self.bs
def __len__(self):
"""Returns the batch size."""
return self.bs
def autocast_list(source):
"""Merges a list of source of different types into a list of numpy arrays or PIL images."""
files = []
for im in source:
if isinstance(im, (str, Path)): # filename or uri
files.append(Image.open(requests.get(im, stream=True).raw if str(im).startswith("http") else im))
elif isinstance(im, (Image.Image, np.ndarray)): # PIL or np Image
files.append(im)
else:
raise TypeError(
f"type {type(im).__name__} is not a supported Ultralytics prediction source type. \n"
f"See https://docs.ultralytics.com/modes/predict for supported source types."
)
return files
def get_best_youtube_url(url, use_pafy=True):
"""
Retrieves the URL of the best quality MP4 video stream from a given YouTube video.
This function uses the pafy or yt_dlp library to extract the video info from YouTube. It then finds the highest
    quality MP4 format that has a video codec but no audio codec, and returns the URL of this video stream.
Args:
url (str): The URL of the YouTube video.
use_pafy (bool): Use the pafy package, default=True, otherwise use yt_dlp package.
Returns:
(str): The URL of the best quality MP4 video stream, or None if no suitable stream is found.
"""
if use_pafy:
check_requirements(("pafy", "youtube_dl==2020.12.2"))
import pafy # noqa
return pafy.new(url).getbestvideo(preftype="mp4").url
else:
check_requirements("yt-dlp")
import yt_dlp
with yt_dlp.YoutubeDL({"quiet": True}) as ydl:
info_dict = ydl.extract_info(url, download=False) # extract info
for f in reversed(info_dict.get("formats", [])): # reversed because best is usually last
                # Find a format with a video codec, no audio, an *.mp4 extension, and at least 1920x1080 size
good_size = (f.get("width") or 0) >= 1920 or (f.get("height") or 0) >= 1080
if good_size and f["vcodec"] != "none" and f["acodec"] == "none" and f["ext"] == "mp4":
return f.get("url")
# Define constants
LOADERS = (LoadStreams, LoadPilAndNumpy, LoadImagesAndVideos, LoadScreenshots)
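# Hedged usage sketch: iterating a local folder (or single file) with LoadImagesAndVideos. The path is a placeholder;
# each iteration yields batched lists of source paths, BGR numpy images, and per-item info strings.
if __name__ == "__main__":
    loader = LoadImagesAndVideos("path/to/images_or_videos", batch=1, vid_stride=1)
    for paths, imgs, info in loader:
        print(paths[0], imgs[0].shape, info[0])
        break  # inspect only the first batch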

View File

@ -0,0 +1,18 @@
#!/bin/bash
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Download latest models from https://github.com/ultralytics/assets/releases
# Example usage: bash ultralytics/data/scripts/download_weights.sh
# parent
# └── weights
# ├── yolov8n.pt ← downloads here
# ├── yolov8s.pt
# └── ...
python - <<EOF
from ultralytics.utils.downloads import attempt_download_asset
assets = [f'yolov8{size}{suffix}.pt' for size in 'nsmlx' for suffix in ('', '-cls', '-seg', '-pose')]
for x in assets:
attempt_download_asset(f'weights/{x}')
EOF

View File

@ -0,0 +1,60 @@
#!/bin/bash
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Download COCO 2017 dataset https://cocodataset.org
# Example usage: bash data/scripts/get_coco.sh
# parent
# ├── ultralytics
# └── datasets
# └── coco ← downloads here
# Arguments (optional) Usage: bash data/scripts/get_coco.sh --train --val --test --segments
if [ "$#" -gt 0 ]; then
for opt in "$@"; do
case "${opt}" in
--train) train=true ;;
--val) val=true ;;
--test) test=true ;;
--segments) segments=true ;;
--sama) sama=true ;;
esac
done
else
train=true
val=true
test=false
segments=false
sama=false
fi
# Download/unzip labels
d='../datasets' # unzip directory
url=https://github.com/ultralytics/yolov5/releases/download/v1.0/
if [ "$segments" == "true" ]; then
f='coco2017labels-segments.zip' # 169 MB
elif [ "$sama" == "true" ]; then
f='coco2017labels-segments-sama.zip' # 199 MB https://www.sama.com/sama-coco-dataset/
else
f='coco2017labels.zip' # 46 MB
fi
echo 'Downloading' $url$f ' ...'
curl -L $url$f -o $f -# && unzip -q $f -d $d && rm $f &
# Download/unzip images
d='../datasets/coco/images' # unzip directory
url=http://images.cocodataset.org/zips/
if [ "$train" == "true" ]; then
f='train2017.zip' # 19G, 118k images
echo 'Downloading' $url$f '...'
curl -L $url$f -o $f -# && unzip -q $f -d $d && rm $f &
fi
if [ "$val" == "true" ]; then
f='val2017.zip' # 1G, 5k images
echo 'Downloading' $url$f '...'
curl -L $url$f -o $f -# && unzip -q $f -d $d && rm $f &
fi
if [ "$test" == "true" ]; then
f='test2017.zip' # 7G, 41k images (optional)
echo 'Downloading' $url$f '...'
curl -L $url$f -o $f -# && unzip -q $f -d $d && rm $f &
fi
wait # finish background tasks

View File

@ -0,0 +1,17 @@
#!/bin/bash
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Download COCO128 dataset https://www.kaggle.com/ultralytics/coco128 (first 128 images from COCO train2017)
# Example usage: bash data/scripts/get_coco128.sh
# parent
# ├── ultralytics
# └── datasets
# └── coco128 ← downloads here
# Download/unzip images and labels
d='../datasets' # unzip directory
url=https://github.com/ultralytics/yolov5/releases/download/v1.0/
f='coco128.zip' # or 'coco128-segments.zip', 68 MB
echo 'Downloading' $url$f ' ...'
curl -L $url$f -o $f -# && unzip -q $f -d $d && rm $f &
wait # finish background tasks

View File

@ -0,0 +1,51 @@
#!/bin/bash
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Download ILSVRC2012 ImageNet dataset https://image-net.org
# Example usage: bash data/scripts/get_imagenet.sh
# parent
# ├── ultralytics
# └── datasets
# └── imagenet ← downloads here
# Arguments (optional) Usage: bash data/scripts/get_imagenet.sh --train --val
if [ "$#" -gt 0 ]; then
for opt in "$@"; do
case "${opt}" in
--train) train=true ;;
--val) val=true ;;
esac
done
else
train=true
val=true
fi
# Make dir
d='../datasets/imagenet' # unzip directory
mkdir -p $d && cd $d
# Download/unzip train
if [ "$train" == "true" ]; then
wget https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_train.tar # download 138G, 1281167 images
mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
tar -xf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
find . -name "*.tar" | while read NAME; do
mkdir -p "${NAME%.tar}"
tar -xf "${NAME}" -C "${NAME%.tar}"
rm -f "${NAME}"
done
cd ..
fi
# Download/unzip val
if [ "$val" == "true" ]; then
wget https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar # download 6.3G, 50000 images
mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xf ILSVRC2012_img_val.tar
wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash # move into subdirs
fi
# Delete corrupted image (optional: PNG under JPEG name that may cause dataloaders to fail)
# rm train/n04266014/n04266014_10835.JPEG
# TFRecords (optional)
# wget https://raw.githubusercontent.com/tensorflow/models/master/research/slim/datasets/imagenet_lsvrc_2015_synsets.txt

View File

@ -0,0 +1,288 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import itertools
from glob import glob
from math import ceil
from pathlib import Path
import cv2
import numpy as np
from PIL import Image
from tqdm import tqdm
from ultralytics.data.utils import exif_size, img2label_paths
from ultralytics.utils.checks import check_requirements
check_requirements("shapely")
from shapely.geometry import Polygon
def bbox_iof(polygon1, bbox2, eps=1e-6):
"""
    Calculate the iof (intersection over foreground) between polygon1 and bbox2.
    Args:
        polygon1 (np.ndarray): Polygon coordinates, (n, 8).
        bbox2 (np.ndarray): Bounding boxes, (n, 4).
"""
polygon1 = polygon1.reshape(-1, 4, 2)
lt_point = np.min(polygon1, axis=-2)
rb_point = np.max(polygon1, axis=-2)
bbox1 = np.concatenate([lt_point, rb_point], axis=-1)
lt = np.maximum(bbox1[:, None, :2], bbox2[..., :2])
rb = np.minimum(bbox1[:, None, 2:], bbox2[..., 2:])
wh = np.clip(rb - lt, 0, np.inf)
h_overlaps = wh[..., 0] * wh[..., 1]
l, t, r, b = (bbox2[..., i] for i in range(4))
polygon2 = np.stack([l, t, r, t, r, b, l, b], axis=-1).reshape(-1, 4, 2)
sg_polys1 = [Polygon(p) for p in polygon1]
sg_polys2 = [Polygon(p) for p in polygon2]
overlaps = np.zeros(h_overlaps.shape)
for p in zip(*np.nonzero(h_overlaps)):
overlaps[p] = sg_polys1[p[0]].intersection(sg_polys2[p[-1]]).area
unions = np.array([p.area for p in sg_polys1], dtype=np.float32)
unions = unions[..., None]
unions = np.clip(unions, eps, np.inf)
outputs = overlaps / unions
if outputs.ndim == 1:
outputs = outputs[..., None]
return outputs
def load_yolo_dota(data_root, split="train"):
"""
Load DOTA dataset.
Args:
data_root (str): Data root.
        split (str): The dataset split, either 'train' or 'val'.
Notes:
The directory structure assumed for the DOTA dataset:
- data_root
- images
- train
- val
- labels
- train
- val
"""
assert split in ["train", "val"]
im_dir = Path(data_root) / "images" / split
assert im_dir.exists(), f"Can't find {im_dir}, please check your data root."
im_files = glob(str(Path(data_root) / "images" / split / "*"))
lb_files = img2label_paths(im_files)
annos = []
for im_file, lb_file in zip(im_files, lb_files):
w, h = exif_size(Image.open(im_file))
with open(lb_file) as f:
lb = [x.split() for x in f.read().strip().splitlines() if len(x)]
lb = np.array(lb, dtype=np.float32)
annos.append(dict(ori_size=(h, w), label=lb, filepath=im_file))
return annos
def get_windows(im_size, crop_sizes=[1024], gaps=[200], im_rate_thr=0.6, eps=0.01):
"""
Get the coordinates of windows.
Args:
im_size (tuple): Original image size, (h, w).
crop_sizes (List(int)): Crop size of windows.
gaps (List(int)): Gap between crops.
        im_rate_thr (float): Threshold of the in-image window area divided by the total window area.
"""
h, w = im_size
windows = []
for crop_size, gap in zip(crop_sizes, gaps):
assert crop_size > gap, f"invalid crop_size gap pair [{crop_size} {gap}]"
step = crop_size - gap
xn = 1 if w <= crop_size else ceil((w - crop_size) / step + 1)
xs = [step * i for i in range(xn)]
if len(xs) > 1 and xs[-1] + crop_size > w:
xs[-1] = w - crop_size
yn = 1 if h <= crop_size else ceil((h - crop_size) / step + 1)
ys = [step * i for i in range(yn)]
if len(ys) > 1 and ys[-1] + crop_size > h:
ys[-1] = h - crop_size
start = np.array(list(itertools.product(xs, ys)), dtype=np.int64)
stop = start + crop_size
windows.append(np.concatenate([start, stop], axis=1))
windows = np.concatenate(windows, axis=0)
im_in_wins = windows.copy()
im_in_wins[:, 0::2] = np.clip(im_in_wins[:, 0::2], 0, w)
im_in_wins[:, 1::2] = np.clip(im_in_wins[:, 1::2], 0, h)
im_areas = (im_in_wins[:, 2] - im_in_wins[:, 0]) * (im_in_wins[:, 3] - im_in_wins[:, 1])
win_areas = (windows[:, 2] - windows[:, 0]) * (windows[:, 3] - windows[:, 1])
im_rates = im_areas / win_areas
if not (im_rates > im_rate_thr).any():
max_rate = im_rates.max()
im_rates[abs(im_rates - max_rate) < eps] = 1
return windows[im_rates > im_rate_thr]
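def _get_windows_example():
    """Illustrative sketch: generate crop windows for a hypothetical image of (h, w) = (1500, 2000).
    Each row of the result is an (x_start, y_start, x_stop, y_stop) window in pixels."""
    windows = get_windows((1500, 2000), crop_sizes=[1024], gaps=[200])
    return windows  # (n, 4) array; adjacent crops overlap by the 200-pixel gap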
def get_window_obj(anno, windows, iof_thr=0.7):
"""Get objects for each window."""
h, w = anno["ori_size"]
label = anno["label"]
if len(label):
label[:, 1::2] *= w
label[:, 2::2] *= h
iofs = bbox_iof(label[:, 1:], windows)
# Unnormalized and misaligned coordinates
return [(label[iofs[:, i] >= iof_thr]) for i in range(len(windows))] # window_anns
else:
return [np.zeros((0, 9), dtype=np.float32) for _ in range(len(windows))] # window_anns
def crop_and_save(anno, windows, window_objs, im_dir, lb_dir):
"""
Crop images and save new labels.
Args:
anno (dict): Annotation dict, including `filepath`, `label`, `ori_size` as its keys.
windows (list): A list of windows coordinates.
window_objs (list): A list of labels inside each window.
im_dir (str): The output directory path of images.
lb_dir (str): The output directory path of labels.
Notes:
The directory structure assumed for the DOTA dataset:
- data_root
- images
- train
- val
- labels
- train
- val
"""
im = cv2.imread(anno["filepath"])
name = Path(anno["filepath"]).stem
for i, window in enumerate(windows):
x_start, y_start, x_stop, y_stop = window.tolist()
new_name = f"{name}__{x_stop - x_start}__{x_start}___{y_start}"
patch_im = im[y_start:y_stop, x_start:x_stop]
ph, pw = patch_im.shape[:2]
cv2.imwrite(str(Path(im_dir) / f"{new_name}.jpg"), patch_im)
label = window_objs[i]
if len(label) == 0:
continue
label[:, 1::2] -= x_start
label[:, 2::2] -= y_start
label[:, 1::2] /= pw
label[:, 2::2] /= ph
with open(Path(lb_dir) / f"{new_name}.txt", "w") as f:
for lb in label:
formatted_coords = ["{:.6g}".format(coord) for coord in lb[1:]]
f.write(f"{int(lb[0])} {' '.join(formatted_coords)}\n")
def split_images_and_labels(data_root, save_dir, split="train", crop_sizes=[1024], gaps=[200]):
"""
Split both images and labels.
Notes:
The directory structure assumed for the DOTA dataset:
- data_root
- images
- split
- labels
- split
and the output directory structure is:
- save_dir
- images
- split
- labels
- split
"""
im_dir = Path(save_dir) / "images" / split
im_dir.mkdir(parents=True, exist_ok=True)
lb_dir = Path(save_dir) / "labels" / split
lb_dir.mkdir(parents=True, exist_ok=True)
annos = load_yolo_dota(data_root, split=split)
for anno in tqdm(annos, total=len(annos), desc=split):
windows = get_windows(anno["ori_size"], crop_sizes, gaps)
window_objs = get_window_obj(anno, windows)
crop_and_save(anno, windows, window_objs, str(im_dir), str(lb_dir))
def split_trainval(data_root, save_dir, crop_size=1024, gap=200, rates=[1.0]):
"""
Split train and val set of DOTA.
Notes:
The directory structure assumed for the DOTA dataset:
- data_root
- images
- train
- val
- labels
- train
- val
and the output directory structure is:
- save_dir
- images
- train
- val
- labels
- train
- val
"""
crop_sizes, gaps = [], []
for r in rates:
crop_sizes.append(int(crop_size / r))
gaps.append(int(gap / r))
for split in ["train", "val"]:
split_images_and_labels(data_root, save_dir, split, crop_sizes, gaps)
def split_test(data_root, save_dir, crop_size=1024, gap=200, rates=[1.0]):
"""
Split test set of DOTA, labels are not included within this set.
Notes:
The directory structure assumed for the DOTA dataset:
- data_root
- images
- test
and the output directory structure is:
- save_dir
- images
- test
"""
crop_sizes, gaps = [], []
for r in rates:
crop_sizes.append(int(crop_size / r))
gaps.append(int(gap / r))
save_dir = Path(save_dir) / "images" / "test"
save_dir.mkdir(parents=True, exist_ok=True)
im_dir = Path(data_root) / "images" / "test"
assert im_dir.exists(), f"Can't find {im_dir}, please check your data root."
im_files = glob(str(im_dir / "*"))
for im_file in tqdm(im_files, total=len(im_files), desc="test"):
w, h = exif_size(Image.open(im_file))
windows = get_windows((h, w), crop_sizes=crop_sizes, gaps=gaps)
im = cv2.imread(im_file)
name = Path(im_file).stem
for window in windows:
x_start, y_start, x_stop, y_stop = window.tolist()
new_name = f"{name}__{x_stop - x_start}__{x_start}___{y_start}"
patch_im = im[y_start:y_stop, x_start:x_stop]
cv2.imwrite(str(save_dir / f"{new_name}.jpg"), patch_im)
if __name__ == "__main__":
split_trainval(data_root="DOTAv2", save_dir="DOTAv2-split")
split_test(data_root="DOTAv2", save_dir="DOTAv2-split")

651
ultralytics/data/utils.py Normal file
View File

@ -0,0 +1,651 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import contextlib
import hashlib
import json
import os
import random
import subprocess
import time
import zipfile
from multiprocessing.pool import ThreadPool
from pathlib import Path
from tarfile import is_tarfile
import cv2
import numpy as np
from PIL import Image, ImageOps
from ultralytics.nn.autobackend import check_class_names
from ultralytics.utils import (
DATASETS_DIR,
LOGGER,
NUM_THREADS,
ROOT,
SETTINGS_YAML,
TQDM,
clean_url,
colorstr,
emojis,
yaml_load,
yaml_save,
)
from ultralytics.utils.checks import check_file, check_font, is_ascii
from ultralytics.utils.downloads import download, safe_download, unzip_file
from ultralytics.utils.ops import segments2boxes
HELP_URL = "See https://docs.ultralytics.com/datasets/detect for dataset formatting guidance."
IMG_FORMATS = {"bmp", "dng", "jpeg", "jpg", "mpo", "png", "tif", "tiff", "webp", "pfm"} # image suffixes
VID_FORMATS = {"asf", "avi", "gif", "m4v", "mkv", "mov", "mp4", "mpeg", "mpg", "ts", "wmv", "webm"} # video suffixes
PIN_MEMORY = str(os.getenv("PIN_MEMORY", True)).lower() == "true" # global pin_memory for dataloaders
def img2label_paths(img_paths):
"""Define label paths as a function of image paths."""
sa, sb = f"{os.sep}images{os.sep}", f"{os.sep}labels{os.sep}" # /images/, /labels/ substrings
return [sb.join(x.rsplit(sa, 1)).rsplit(".", 1)[0] + ".txt" for x in img_paths]
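def _img2label_paths_example():
    """Illustrative sketch with a hypothetical path: the last /images/ directory is swapped for /labels/ and the
    image suffix for .txt (shown here with POSIX separators)."""
    return img2label_paths(["datasets/coco8/images/train/000000000009.jpg"])
    # -> ['datasets/coco8/labels/train/000000000009.txt']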
def get_hash(paths):
"""Returns a single hash value of a list of paths (files or dirs)."""
size = sum(os.path.getsize(p) for p in paths if os.path.exists(p)) # sizes
h = hashlib.sha256(str(size).encode()) # hash sizes
h.update("".join(paths).encode()) # hash paths
return h.hexdigest() # return hash
def exif_size(img: Image.Image):
"""Returns exif-corrected PIL size."""
s = img.size # (width, height)
if img.format == "JPEG": # only support JPEG images
with contextlib.suppress(Exception):
exif = img.getexif()
if exif:
rotation = exif.get(274, None) # the EXIF key for the orientation tag is 274
if rotation in [6, 8]: # rotation 270 or 90
s = s[1], s[0]
return s
def verify_image(args):
"""Verify one image."""
(im_file, cls), prefix = args
# Number (found, corrupt), message
nf, nc, msg = 0, 0, ""
try:
im = Image.open(im_file)
im.verify() # PIL verify
shape = exif_size(im) # image size
shape = (shape[1], shape[0]) # hw
assert (shape[0] > 9) & (shape[1] > 9), f"image size {shape} <10 pixels"
assert im.format.lower() in IMG_FORMATS, f"invalid image format {im.format}"
if im.format.lower() in ("jpg", "jpeg"):
with open(im_file, "rb") as f:
f.seek(-2, 2)
if f.read() != b"\xff\xd9": # corrupt JPEG
ImageOps.exif_transpose(Image.open(im_file)).save(im_file, "JPEG", subsampling=0, quality=100)
msg = f"{prefix}WARNING ⚠️ {im_file}: corrupt JPEG restored and saved"
nf = 1
except Exception as e:
nc = 1
msg = f"{prefix}WARNING ⚠️ {im_file}: ignoring corrupt image/label: {e}"
return (im_file, cls), nf, nc, msg
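def _verify_image_example():
    """Illustrative sketch: verify_image() takes ((im_file, cls), prefix) and returns ((im_file, cls), nf, nc, msg),
    where nf/nc are found/corrupt flags. The file path is a placeholder."""
    (im_file, cls), nf, nc, msg = verify_image((("path/to/image.jpg", 0), ""))
    return nf == 1 and nc == 0  # True only when the image exists and is readable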
def verify_image_label(args):
"""Verify one image-label pair."""
im_file, lb_file, prefix, keypoint, num_cls, nkpt, ndim = args
# Number (missing, found, empty, corrupt), message, segments, keypoints
nm, nf, ne, nc, msg, segments, keypoints = 0, 0, 0, 0, "", [], None
try:
# Verify images
im = Image.open(im_file)
im.verify() # PIL verify
shape = exif_size(im) # image size
shape = (shape[1], shape[0]) # hw
assert (shape[0] > 9) & (shape[1] > 9), f"image size {shape} <10 pixels"
assert im.format.lower() in IMG_FORMATS, f"invalid image format {im.format}"
if im.format.lower() in ("jpg", "jpeg"):
with open(im_file, "rb") as f:
f.seek(-2, 2)
if f.read() != b"\xff\xd9": # corrupt JPEG
ImageOps.exif_transpose(Image.open(im_file)).save(im_file, "JPEG", subsampling=0, quality=100)
msg = f"{prefix}WARNING ⚠️ {im_file}: corrupt JPEG restored and saved"
# Verify labels
if os.path.isfile(lb_file):
nf = 1 # label found
with open(lb_file) as f:
lb = [x.split() for x in f.read().strip().splitlines() if len(x)]
if any(len(x) > 6 for x in lb) and (not keypoint): # is segment
classes = np.array([x[0] for x in lb], dtype=np.float32)
segments = [np.array(x[1:], dtype=np.float32).reshape(-1, 2) for x in lb] # (cls, xy1...)
lb = np.concatenate((classes.reshape(-1, 1), segments2boxes(segments)), 1) # (cls, xywh)
lb = np.array(lb, dtype=np.float32)
nl = len(lb)
if nl:
if keypoint:
assert lb.shape[1] == (5 + nkpt * ndim), f"labels require {(5 + nkpt * ndim)} columns each"
points = lb[:, 5:].reshape(-1, ndim)[:, :2]
else:
assert lb.shape[1] == 5, f"labels require 5 columns, {lb.shape[1]} columns detected"
points = lb[:, 1:]
assert points.max() <= 1, f"non-normalized or out of bounds coordinates {points[points > 1]}"
assert lb.min() >= 0, f"negative label values {lb[lb < 0]}"
# All labels
                max_cls = lb[:, 0].max()  # max class index in labels
assert max_cls <= num_cls, (
f"Label class {int(max_cls)} exceeds dataset class count {num_cls}. "
f"Possible class labels are 0-{num_cls - 1}"
)
_, i = np.unique(lb, axis=0, return_index=True)
if len(i) < nl: # duplicate row check
lb = lb[i] # remove duplicates
if segments:
segments = [segments[x] for x in i]
msg = f"{prefix}WARNING ⚠️ {im_file}: {nl - len(i)} duplicate labels removed"
else:
ne = 1 # label empty
lb = np.zeros((0, (5 + nkpt * ndim) if keypoint else 5), dtype=np.float32)
else:
nm = 1 # label missing
            lb = np.zeros((0, (5 + nkpt * ndim) if keypoint else 5), dtype=np.float32)
if keypoint:
keypoints = lb[:, 5:].reshape(-1, nkpt, ndim)
if ndim == 2:
kpt_mask = np.where((keypoints[..., 0] < 0) | (keypoints[..., 1] < 0), 0.0, 1.0).astype(np.float32)
keypoints = np.concatenate([keypoints, kpt_mask[..., None]], axis=-1) # (nl, nkpt, 3)
lb = lb[:, :5]
return im_file, lb, shape, segments, keypoints, nm, nf, ne, nc, msg
except Exception as e:
nc = 1
msg = f"{prefix}WARNING ⚠️ {im_file}: ignoring corrupt image/label: {e}"
return [None, None, None, None, None, nm, nf, ne, nc, msg]
def polygon2mask(imgsz, polygons, color=1, downsample_ratio=1):
"""
Convert a list of polygons to a binary mask of the specified image size.
Args:
imgsz (tuple): The size of the image as (height, width).
polygons (list[np.ndarray]): A list of polygons. Each polygon is an array with shape [N, M], where
N is the number of polygons, and M is the number of points such that M % 2 = 0.
color (int, optional): The color value to fill in the polygons on the mask. Defaults to 1.
downsample_ratio (int, optional): Factor by which to downsample the mask. Defaults to 1.
Returns:
(np.ndarray): A binary mask of the specified image size with the polygons filled in.
"""
mask = np.zeros(imgsz, dtype=np.uint8)
polygons = np.asarray(polygons, dtype=np.int32)
polygons = polygons.reshape((polygons.shape[0], -1, 2))
cv2.fillPoly(mask, polygons, color=color)
nh, nw = (imgsz[0] // downsample_ratio, imgsz[1] // downsample_ratio)
# Note: fillPoly first then resize is trying to keep the same loss calculation method when mask-ratio=1
return cv2.resize(mask, (nw, nh))
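def _polygon2mask_example():
    """Illustrative sketch with made-up coordinates: rasterize one flat (x1, y1, x2, y2, ...) polygon into a binary
    mask of size (h, w) = (120, 160), then downsample by 2."""
    polygon = np.array([10, 10, 150, 10, 150, 90, 10, 90], dtype=np.float32)  # a rectangle in pixel coordinates
    mask = polygon2mask((120, 160), [polygon], color=1, downsample_ratio=2)
    return mask.shape  # (60, 80) after the downsample_ratio=2 resize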
def polygons2masks(imgsz, polygons, color, downsample_ratio=1):
"""
Convert a list of polygons to a set of binary masks of the specified image size.
Args:
imgsz (tuple): The size of the image as (height, width).
polygons (list[np.ndarray]): A list of polygons. Each polygon is an array with shape [N, M], where
N is the number of polygons, and M is the number of points such that M % 2 = 0.
color (int): The color value to fill in the polygons on the masks.
downsample_ratio (int, optional): Factor by which to downsample each mask. Defaults to 1.
Returns:
(np.ndarray): A set of binary masks of the specified image size with the polygons filled in.
"""
return np.array([polygon2mask(imgsz, [x.reshape(-1)], color, downsample_ratio) for x in polygons])
def polygons2masks_overlap(imgsz, segments, downsample_ratio=1):
"""Return a (640, 640) overlap mask."""
masks = np.zeros(
(imgsz[0] // downsample_ratio, imgsz[1] // downsample_ratio),
dtype=np.int32 if len(segments) > 255 else np.uint8,
)
areas = []
ms = []
for si in range(len(segments)):
mask = polygon2mask(imgsz, [segments[si].reshape(-1)], downsample_ratio=downsample_ratio, color=1)
ms.append(mask)
areas.append(mask.sum())
areas = np.asarray(areas)
index = np.argsort(-areas)
ms = np.array(ms)[index]
for i in range(len(segments)):
mask = ms[i] * (i + 1)
masks = masks + mask
masks = np.clip(masks, a_min=0, a_max=i + 1)
return masks, index
def find_dataset_yaml(path: Path) -> Path:
"""
Find and return the YAML file associated with a Detect, Segment or Pose dataset.
This function searches for a YAML file at the root level of the provided directory first, and if not found, it
performs a recursive search. It prefers YAML files that have the same stem as the provided path. An AssertionError
is raised if no YAML file is found or if multiple YAML files are found.
Args:
path (Path): The directory path to search for the YAML file.
Returns:
(Path): The path of the found YAML file.
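Example:
An illustrative sketch, assuming '../datasets/coco8' exists and contains a single 'coco8.yaml' at its root:
```python
from pathlib import Path
from ultralytics.data.utils import find_dataset_yaml
yaml_file = find_dataset_yaml(Path("../datasets/coco8"))  # -> Path('../datasets/coco8/coco8.yaml')
```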
"""
files = list(path.glob("*.yaml")) or list(path.rglob("*.yaml")) # try root level first and then recursive
assert files, f"No YAML file found in '{path.resolve()}'"
if len(files) > 1:
files = [f for f in files if f.stem == path.stem] # prefer *.yaml files that match
assert len(files) == 1, f"Expected 1 YAML file in '{path.resolve()}', but found {len(files)}.\n{files}"
return files[0]
def check_det_dataset(dataset, autodownload=True):
"""
Download, verify, and/or unzip a dataset if not found locally.
This function checks the availability of a specified dataset, and if it is not found, it can optionally download and
unzip the dataset. It then reads and parses the accompanying YAML data, ensures key requirements are met, and
resolves paths related to the dataset.
Args:
dataset (str): Path to the dataset or dataset descriptor (like a YAML file).
autodownload (bool, optional): Whether to automatically download the dataset if not found. Defaults to True.
Returns:
(dict): Parsed dataset information and paths.
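Example:
An illustrative sketch, assuming 'coco8.yaml' can be resolved (it is downloaded if missing and autodownload=True):
```python
from ultralytics.data.utils import check_det_dataset
data = check_det_dataset("coco8.yaml", autodownload=True)
print(data["nc"], data["names"])  # number of classes and class names
print(data["train"], data["val"])  # resolved absolute split paths
```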
"""
file = check_file(dataset)
# Download (optional)
extract_dir = ""
if zipfile.is_zipfile(file) or is_tarfile(file):
new_dir = safe_download(file, dir=DATASETS_DIR, unzip=True, delete=False)
file = find_dataset_yaml(DATASETS_DIR / new_dir)
extract_dir, autodownload = file.parent, False
# Read YAML
data = yaml_load(file, append_filename=True) # dictionary
# Checks
for k in "train", "val":
if k not in data:
if k != "val" or "validation" not in data:
raise SyntaxError(
emojis(f"{dataset} '{k}:' key missing ❌.\n'train' and 'val' are required in all data YAMLs.")
)
LOGGER.info("WARNING ⚠️ renaming data YAML 'validation' key to 'val' to match YOLO format.")
data["val"] = data.pop("validation") # replace 'validation' key with 'val' key
if "names" not in data and "nc" not in data:
raise SyntaxError(emojis(f"{dataset} key missing ❌.\n either 'names' or 'nc' are required in all data YAMLs."))
if "names" in data and "nc" in data and len(data["names"]) != data["nc"]:
raise SyntaxError(emojis(f"{dataset} 'names' length {len(data['names'])} and 'nc: {data['nc']}' must match."))
if "names" not in data:
data["names"] = [f"class_{i}" for i in range(data["nc"])]
else:
data["nc"] = len(data["names"])
data["names"] = check_class_names(data["names"])
# Resolve paths
path = Path(extract_dir or data.get("path") or Path(data.get("yaml_file", "")).parent) # dataset root
if not path.is_absolute():
path = (DATASETS_DIR / path).resolve()
# Set paths
data["path"] = path # download scripts
for k in "train", "val", "test":
if data.get(k): # prepend path
if isinstance(data[k], str):
x = (path / data[k]).resolve()
if not x.exists() and data[k].startswith("../"):
x = (path / data[k][3:]).resolve()
data[k] = str(x)
else:
data[k] = [str((path / x).resolve()) for x in data[k]]
# Parse YAML
val, s = (data.get(x) for x in ("val", "download"))
if val:
val = [Path(x).resolve() for x in (val if isinstance(val, list) else [val])] # val path
if not all(x.exists() for x in val):
name = clean_url(dataset) # dataset name with URL auth stripped
m = f"\nDataset '{name}' images not found ⚠️, missing path '{[x for x in val if not x.exists()][0]}'"
if s and autodownload:
LOGGER.warning(m)
else:
m += f"\nNote dataset download directory is '{DATASETS_DIR}'. You can update this in '{SETTINGS_YAML}'"
raise FileNotFoundError(m)
t = time.time()
r = None # success
if s.startswith("http") and s.endswith(".zip"): # URL
safe_download(url=s, dir=DATASETS_DIR, delete=True)
elif s.startswith("bash "): # bash script
LOGGER.info(f"Running {s} ...")
r = os.system(s)
else: # python script
exec(s, {"yaml": data})
dt = f"({round(time.time() - t, 1)}s)"
s = f"success ✅ {dt}, saved to {colorstr('bold', DATASETS_DIR)}" if r in (0, None) else f"failure {dt}"
LOGGER.info(f"Dataset download {s}\n")
check_font("Arial.ttf" if is_ascii(data["names"]) else "Arial.Unicode.ttf") # download fonts
return data # dictionary
def check_cls_dataset(dataset, split=""):
"""
Checks a classification dataset such as ImageNet.
This function accepts a `dataset` name and attempts to retrieve the corresponding dataset information.
If the dataset is not found locally, it attempts to download the dataset from the internet and save it locally.
Args:
dataset (str | Path): The name of the dataset.
split (str, optional): The split of the dataset. Either 'val', 'test', or ''. Defaults to ''.
Returns:
(dict): A dictionary containing the following keys:
- 'train' (Path): The directory path containing the training set of the dataset.
- 'val' (Path): The directory path containing the validation set of the dataset.
- 'test' (Path): The directory path containing the test set of the dataset.
- 'nc' (int): The number of classes in the dataset.
- 'names' (dict): A dictionary of class names in the dataset.
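Example:
An illustrative sketch, assuming the 'imagenet10' classification dataset can be found locally or downloaded:
```python
from ultralytics.data.utils import check_cls_dataset
data = check_cls_dataset("imagenet10", split="val")  # 'imagenet10' assumed available as a downloadable dataset
print(data["nc"], data["names"])  # number of classes and class-name dictionary
print(data["train"], data["val"])  # split directories ('val' may be None if the split is absent)
```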
"""
# Download (optional if dataset=https://file.zip is passed directly)
if str(dataset).startswith(("http:/", "https:/")):
dataset = safe_download(dataset, dir=DATASETS_DIR, unzip=True, delete=False)
elif Path(dataset).suffix in (".zip", ".tar", ".gz"):
file = check_file(dataset)
dataset = safe_download(file, dir=DATASETS_DIR, unzip=True, delete=False)
dataset = Path(dataset)
data_dir = (dataset if dataset.is_dir() else (DATASETS_DIR / dataset)).resolve()
if not data_dir.is_dir():
LOGGER.warning(f"\nDataset not found ⚠️, missing path {data_dir}, attempting download...")
t = time.time()
if str(dataset) == "imagenet":
subprocess.run(f"bash {ROOT / 'data/scripts/get_imagenet.sh'}", shell=True, check=True)
else:
url = f"https://github.com/ultralytics/yolov5/releases/download/v1.0/{dataset}.zip"
download(url, dir=data_dir.parent)
s = f"Dataset download success ✅ ({time.time() - t:.1f}s), saved to {colorstr('bold', data_dir)}\n"
LOGGER.info(s)
train_set = data_dir / "train"
val_set = (
data_dir / "val"
if (data_dir / "val").exists()
else data_dir / "validation"
if (data_dir / "validation").exists()
else None
) # data/val or data/validation
test_set = data_dir / "test" if (data_dir / "test").exists() else None # optional data/test
if split == "val" and not val_set:
LOGGER.warning("WARNING ⚠️ Dataset 'split=val' not found, using 'split=test' instead.")
elif split == "test" and not test_set:
LOGGER.warning("WARNING ⚠️ Dataset 'split=test' not found, using 'split=val' instead.")
nc = len([x for x in (data_dir / "train").glob("*") if x.is_dir()]) # number of classes
names = [x.name for x in (data_dir / "train").iterdir() if x.is_dir()] # class names list
names = dict(enumerate(sorted(names)))
# Print to console
for k, v in {"train": train_set, "val": val_set, "test": test_set}.items():
prefix = f'{colorstr(f"{k}:")} {v}...'
if v is None:
LOGGER.info(prefix)
else:
files = [path for path in v.rglob("*.*") if path.suffix[1:].lower() in IMG_FORMATS]
nf = len(files) # number of files
nd = len({file.parent for file in files}) # number of directories
if nf == 0:
if k == "train":
raise FileNotFoundError(emojis(f"{dataset} '{k}:' no training images found ❌ "))
else:
LOGGER.warning(f"{prefix} found {nf} images in {nd} classes: WARNING ⚠️ no images found")
elif nd != nc:
LOGGER.warning(f"{prefix} found {nf} images in {nd} classes: ERROR ❌️ requires {nc} classes, not {nd}")
else:
LOGGER.info(f"{prefix} found {nf} images in {nd} classes ✅ ")
return {"train": train_set, "val": val_set, "test": test_set, "nc": nc, "names": names}
class HUBDatasetStats:
"""
A class for generating HUB dataset JSON and `-hub` dataset directory.
Args:
path (str): Path to data.yaml or data.zip (with data.yaml inside data.zip). Default is 'coco8.yaml'.
task (str): Dataset task. Options are 'detect', 'segment', 'pose', 'classify'. Default is 'detect'.
autodownload (bool): Attempt to download dataset if not found locally. Default is False.
Example:
Download *.zip files from https://github.com/ultralytics/hub/tree/main/example_datasets
i.e. https://github.com/ultralytics/hub/raw/main/example_datasets/coco8.zip for coco8.zip.
```python
from ultralytics.data.utils import HUBDatasetStats
stats = HUBDatasetStats('path/to/coco8.zip', task='detect') # detect dataset
stats = HUBDatasetStats('path/to/coco8-seg.zip', task='segment') # segment dataset
stats = HUBDatasetStats('path/to/coco8-pose.zip', task='pose') # pose dataset
stats = HUBDatasetStats('path/to/imagenet10.zip', task='classify') # classification dataset
stats.get_json(save=True)
stats.process_images()
```
"""
def __init__(self, path="coco8.yaml", task="detect", autodownload=False):
"""Initialize class."""
path = Path(path).resolve()
LOGGER.info(f"Starting HUB dataset checks for {path}....")
self.task = task # detect, segment, pose, classify
if self.task == "classify":
unzip_dir = unzip_file(path)
data = check_cls_dataset(unzip_dir)
data["path"] = unzip_dir
else: # detect, segment, pose
_, data_dir, yaml_path = self._unzip(Path(path))
try:
# Load YAML with checks
data = yaml_load(yaml_path)
data["path"] = "" # strip path since YAML should be in dataset root for all HUB datasets
yaml_save(yaml_path, data)
data = check_det_dataset(yaml_path, autodownload) # dict
data["path"] = data_dir # YAML path should be set to '' (relative) or parent (absolute)
except Exception as e:
raise Exception("error/HUB/dataset_stats/init") from e
self.hub_dir = Path(f'{data["path"]}-hub')
self.im_dir = self.hub_dir / "images"
self.stats = {"nc": len(data["names"]), "names": list(data["names"].values())} # statistics dictionary
self.data = data
@staticmethod
def _unzip(path):
"""Unzip data.zip."""
if not str(path).endswith(".zip"): # path is data.yaml
return False, None, path
unzip_dir = unzip_file(path, path=path.parent)
assert unzip_dir.is_dir(), (
f"Error unzipping {path}, {unzip_dir} not found. " f"path/to/abc.zip MUST unzip to path/to/abc/"
)
return True, str(unzip_dir), find_dataset_yaml(unzip_dir) # zipped, data_dir, yaml_path
def _hub_ops(self, f):
"""Saves a compressed image for HUB previews."""
compress_one_image(f, self.im_dir / Path(f).name) # save to dataset-hub
def get_json(self, save=False, verbose=False):
"""Return dataset JSON for Ultralytics HUB."""
def _round(labels):
"""Update labels to integer class and 4 decimal place floats."""
if self.task == "detect":
coordinates = labels["bboxes"]
elif self.task == "segment":
coordinates = [x.flatten() for x in labels["segments"]]
elif self.task == "pose":
n = labels["keypoints"].shape[0]
coordinates = np.concatenate((labels["bboxes"], labels["keypoints"].reshape(n, -1)), 1)
else:
raise ValueError("Undefined dataset task.")
zipped = zip(labels["cls"], coordinates)
return [[int(c[0]), *(round(float(x), 4) for x in points)] for c, points in zipped]
for split in "train", "val", "test":
self.stats[split] = None # predefine
path = self.data.get(split)
# Check split
if path is None: # no split
continue
files = [f for f in Path(path).rglob("*.*") if f.suffix[1:].lower() in IMG_FORMATS] # image files in split
if not files: # no images
continue
# Get dataset statistics
if self.task == "classify":
from torchvision.datasets import ImageFolder
dataset = ImageFolder(self.data[split])
x = np.zeros(len(dataset.classes)).astype(int)
for im in dataset.imgs:
x[im[1]] += 1
self.stats[split] = {
"instance_stats": {"total": len(dataset), "per_class": x.tolist()},
"image_stats": {"total": len(dataset), "unlabelled": 0, "per_class": x.tolist()},
"labels": [{Path(k).name: v} for k, v in dataset.imgs],
}
else:
from ultralytics.data import YOLODataset
dataset = YOLODataset(img_path=self.data[split], data=self.data, task=self.task)
x = np.array(
[
np.bincount(label["cls"].astype(int).flatten(), minlength=self.data["nc"])
for label in TQDM(dataset.labels, total=len(dataset), desc="Statistics")
]
) # shape (num_images, num_classes), e.g. (128, 80) for COCO128
self.stats[split] = {
"instance_stats": {"total": int(x.sum()), "per_class": x.sum(0).tolist()},
"image_stats": {
"total": len(dataset),
"unlabelled": int(np.all(x == 0, 1).sum()),
"per_class": (x > 0).sum(0).tolist(),
},
"labels": [{Path(k).name: _round(v)} for k, v in zip(dataset.im_files, dataset.labels)],
}
# Save, print and return
if save:
self.hub_dir.mkdir(parents=True, exist_ok=True) # makes dataset-hub/
stats_path = self.hub_dir / "stats.json"
LOGGER.info(f"Saving {stats_path.resolve()}...")
with open(stats_path, "w") as f:
json.dump(self.stats, f) # save stats.json
if verbose:
LOGGER.info(json.dumps(self.stats, indent=2, sort_keys=False))
return self.stats
def process_images(self):
"""Compress images for Ultralytics HUB."""
from ultralytics.data import YOLODataset # ClassificationDataset
self.im_dir.mkdir(parents=True, exist_ok=True) # makes dataset-hub/images/
for split in "train", "val", "test":
if self.data.get(split) is None:
continue
dataset = YOLODataset(img_path=self.data[split], data=self.data)
with ThreadPool(NUM_THREADS) as pool:
for _ in TQDM(pool.imap(self._hub_ops, dataset.im_files), total=len(dataset), desc=f"{split} images"):
pass
LOGGER.info(f"Done. All images saved to {self.im_dir}")
return self.im_dir
def compress_one_image(f, f_new=None, max_dim=1920, quality=50):
"""
Compresses a single image file to a reduced size while preserving its aspect ratio, using either the Python Imaging
Library (PIL) or the OpenCV library. If the input image is smaller than the maximum dimension, it is not resized.
Args:
f (str): The path to the input image file.
f_new (str, optional): The path to the output image file. If not specified, the input file will be overwritten.
max_dim (int, optional): The maximum dimension (width or height) of the output image. Default is 1920 pixels.
quality (int, optional): The image compression quality as a percentage. Default is 50%.
Example:
```python
from pathlib import Path
from ultralytics.data.utils import compress_one_image
for f in Path('path/to/dataset').rglob('*.jpg'):
compress_one_image(f)
```
"""
try: # use PIL
im = Image.open(f)
r = max_dim / max(im.height, im.width) # ratio
if r < 1.0: # image too large
im = im.resize((int(im.width * r), int(im.height * r)))
im.save(f_new or f, "JPEG", quality=quality, optimize=True) # save
except Exception as e: # use OpenCV
LOGGER.info(f"WARNING ⚠️ HUB ops PIL failure {f}: {e}")
im = cv2.imread(f)
im_height, im_width = im.shape[:2]
r = max_dim / max(im_height, im_width) # ratio
if r < 1.0: # image too large
im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
cv2.imwrite(str(f_new or f), im)
def autosplit(path=DATASETS_DIR / "coco8/images", weights=(0.9, 0.1, 0.0), annotated_only=False):
"""
Automatically split a dataset into train/val/test splits and save the resulting splits into autosplit_*.txt files.
Args:
path (Path, optional): Path to images directory. Defaults to DATASETS_DIR / 'coco8/images'.
weights (list | tuple, optional): Train, validation, and test split fractions. Defaults to (0.9, 0.1, 0.0).
annotated_only (bool, optional): If True, only images with an associated txt file are used. Defaults to False.
Example:
```python
from ultralytics.data.utils import autosplit
autosplit()
```
"""
path = Path(path) # images dir
files = sorted(x for x in path.rglob("*.*") if x.suffix[1:].lower() in IMG_FORMATS) # image files only
n = len(files) # number of files
random.seed(0) # for reproducibility
indices = random.choices([0, 1, 2], weights=weights, k=n) # assign each image to a split
txt = ["autosplit_train.txt", "autosplit_val.txt", "autosplit_test.txt"] # 3 txt files
for x in txt:
if (path.parent / x).exists():
(path.parent / x).unlink() # remove existing
LOGGER.info(f"Autosplitting images from {path}" + ", using *.txt labeled images only" * annotated_only)
for i, img in TQDM(zip(indices, files), total=n):
if not annotated_only or Path(img2label_paths([str(img)])[0]).exists(): # check label
with open(path.parent / txt[i], "a") as f:
f.write(f"./{img.relative_to(path.parent).as_posix()}" + "\n") # add image to txt file