initial project version!
tracking/trackers/README.md (new file, 94 lines)
@@ -0,0 +1,94 @@
# Tracker

## Supported Trackers

- [x] ByteTracker
- [x] BoT-SORT

## Usage

### Python interface

You can use the Python interface to track objects using the YOLO model.

```python
from ultralytics import YOLO

model = YOLO("yolov8n.pt")  # or a segmentation model, e.g. yolov8n-seg.pt
model.track(
    source="video/streams",
    stream=True,
    tracker="botsort.yaml",  # or 'bytetrack.yaml'
    show=True,
)
```
You can get the IDs of the tracked objects using the following code:

```python
from ultralytics import YOLO

model = YOLO("yolov8n.pt")

for result in model.track(source="video.mp4"):
    print(result.boxes.id.cpu().numpy().astype(int))  # print the IDs of the objects tracked in this frame
```
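Note that `result.boxes.id` can be `None` on frames where the tracker has not confirmed any object yet, so it is worth guarding the access. A minimal sketch of the same loop with that check added:

```python
from ultralytics import YOLO

model = YOLO("yolov8n.pt")

for result in model.track(source="video.mp4"):
    if result.boxes.id is None:  # no confirmed tracks on this frame
        continue
    print(result.boxes.id.cpu().numpy().astype(int))
```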
If you want to run the tracker on a folder of images, or when you loop over the video frames yourself, pass the `persist` argument to tell the model that consecutive calls belong to the same sequence, so the IDs stay fixed for the same objects. Otherwise the IDs differ from frame to frame, because each call creates a new tracking object; `persist=True` makes the model reuse the same one.

```python
import cv2

from ultralytics import YOLO

cap = cv2.VideoCapture("video.mp4")
model = YOLO("yolov8n.pt")
while True:
    ret, frame = cap.read()
    if not ret:
        break
    results = model.track(frame, persist=True)
    boxes = results[0].boxes.xyxy.cpu().numpy().astype(int)
    ids = results[0].boxes.id.cpu().numpy().astype(int)
    for box, id in zip(boxes, ids):
        cv2.rectangle(frame, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), 2)
        cv2.putText(
            frame,
            f"Id {id}",
            (box[0], box[1]),
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            (0, 0, 255),
            2,
        )
    cv2.imshow("frame", frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break
```
## Change tracker parameters

You can change the tracker parameters by editing the corresponding config file, e.g. `botsort.yaml` or `bytetrack.yaml`, in the `ultralytics/cfg/trackers` folder. An example follows below.
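For example, to change how long lost tracks are kept alive, copy `botsort.yaml` and adjust the values. The fields below mirror the default BoT-SORT config shipped in this commit (a sketch; the values shown are the defaults, tune them for your own scenes):

```yaml
tracker_type: botsort
track_high_thresh: 0.5   # first-association threshold
track_low_thresh: 0.1    # second-association threshold
new_track_thresh: 0.6    # confidence needed to start a new track
track_buffer: 30         # frames a lost track is kept before removal
match_thresh: 0.8        # matching threshold
gmc_method: sparseOptFlow
proximity_thresh: 0.5    # IoU gate before ReID is consulted
appearance_thresh: 0.25  # ReID appearance gate
with_reid: True
```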
## Command Line Interface (CLI)

You can also use the command line interface to track objects using the YOLO model.

```bash
yolo detect track source=... tracker=...
yolo segment track source=... tracker=...
yolo pose track source=... tracker=...
```
By default, trackers use the configuration in `ultralytics/cfg/trackers`. A modified tracker config file is also supported; refer to the existing config files in `ultralytics/cfg/trackers` for the available fields.
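For instance, with a copied and edited config saved as `custom_tracker.yaml` (the file name is illustrative):

```bash
yolo detect track source=video.mp4 tracker=custom_tracker.yaml
```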
## Contribute to Our Trackers Section

Are you proficient in multi-object tracking, and have you successfully implemented or adapted a tracking algorithm with Ultralytics YOLO? We invite you to contribute to our Trackers section! Your real-world applications and solutions could be invaluable for users working on tracking tasks.

By contributing to this section, you help expand the scope of tracking solutions available within the Ultralytics YOLO framework, adding another layer of functionality and utility for the community.

To initiate your contribution, please refer to our [Contributing Guide](https://docs.ultralytics.com/help/contributing) for comprehensive instructions on submitting a Pull Request (PR) 🛠️. We are excited to see what you bring to the table!

Together, let's enhance the tracking capabilities of the Ultralytics YOLO ecosystem 🙏!
tracking/trackers/__init__.py (new file, 10 lines)
@@ -0,0 +1,10 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

from .bot_sort import BOTSORT
from .byte_tracker import BYTETracker
from .track import register_tracker

__all__ = 'register_tracker', 'BOTSORT', 'BYTETracker'  # allow simpler import
BIN tracking/trackers/__pycache__/__init__.cpython-39.pyc (new file; binary file not shown)
BIN tracking/trackers/__pycache__/basetrack.cpython-39.pyc (new file; binary file not shown)
BIN tracking/trackers/__pycache__/bot_sort.cpython-39.pyc (new file; binary file not shown)
BIN tracking/trackers/__pycache__/byte_tracker.cpython-39.pyc (new file; binary file not shown)
BIN tracking/trackers/__pycache__/track.cpython-39.pyc (new file; binary file not shown)
tracking/trackers/basetrack.py (new file, 71 lines)
@@ -0,0 +1,71 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

from collections import OrderedDict

import numpy as np


class TrackState:
    """Enumeration of possible object tracking states."""

    New = 0
    Tracked = 1
    Lost = 2
    Removed = 3


class BaseTrack:
    """Base class for object tracking, handling basic track attributes and operations."""

    _count = 0

    track_id = 0
    is_activated = False
    state = TrackState.New

    history = OrderedDict()
    features = []
    curr_feature = None
    score = 0
    start_frame = 0
    frame_id = 0
    time_since_update = 0

    # Multi-camera
    location = (np.inf, np.inf)

    @property
    def end_frame(self):
        """Return the last frame ID of the track."""
        return self.frame_id

    @staticmethod
    def next_id():
        """Increment and return the global track ID counter."""
        BaseTrack._count += 1
        return BaseTrack._count

    def activate(self, *args):
        """Activate the track with the provided arguments."""
        raise NotImplementedError

    def predict(self):
        """Predict the next state of the track."""
        raise NotImplementedError

    def update(self, *args, **kwargs):
        """Update the track with new observations."""
        raise NotImplementedError

    def mark_lost(self):
        """Mark the track as lost."""
        self.state = TrackState.Lost

    def mark_removed(self):
        """Mark the track as removed."""
        self.state = TrackState.Removed

    @staticmethod
    def reset_id():
        """Reset the global track ID counter."""
        BaseTrack._count = 0
tracking/trackers/bot_sort.py (new file, 198 lines)
@@ -0,0 +1,198 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

from collections import deque

import numpy as np

from .basetrack import TrackState
from .byte_tracker import BYTETracker, STrack
from .utils import matching
# from .utils.gmc import GMC
from .utils.kalman_filter import KalmanFilterXYWH

from .reid.reid_interface import ReIDInterface
from .reid.config import config


class BOTrack(STrack):
    shared_kalman = KalmanFilterXYWH()

    def __init__(self, tlwh, score, cls, feat=None, feat_history=50):
        """Initialize YOLOv8 object with temporal parameters, such as feature history, alpha and current features."""
        super().__init__(tlwh, score, cls)

        self.smooth_feat = None
        self.curr_feat = None
        if feat is not None:
            self.update_features(feat)
        self.features = deque([], maxlen=feat_history)
        self.alpha = 0.9

    def update_features(self, feat):
        """Update features vector and smooth it using exponential moving average."""
        feat /= np.linalg.norm(feat)
        self.curr_feat = feat
        if self.smooth_feat is None:
            self.smooth_feat = feat
        else:
            self.smooth_feat = self.alpha * self.smooth_feat + (1 - self.alpha) * feat
        self.features.append(feat)
        self.smooth_feat /= np.linalg.norm(self.smooth_feat)

    def predict(self):
        """Predicts the mean and covariance using Kalman filter."""
        mean_state = self.mean.copy()
        if self.state != TrackState.Tracked:
            mean_state[6] = 0
            mean_state[7] = 0

        self.mean, self.covariance = self.kalman_filter.predict(mean_state, self.covariance)

    def re_activate(self, new_track, frame_id, new_id=False):
        """Reactivates a track with updated features and optionally assigns a new ID."""
        if new_track.curr_feat is not None:
            self.update_features(new_track.curr_feat)
        super().re_activate(new_track, frame_id, new_id)

    def update(self, new_track, frame_id):
        """Update the YOLOv8 instance with new track and frame ID."""
        if new_track.curr_feat is not None:
            self.update_features(new_track.curr_feat)
        super().update(new_track, frame_id)

    @property
    def tlwh(self):
        """Get current position in bounding box format `(top left x, top left y, width, height)`."""
        if self.mean is None:
            return self._tlwh.copy()
        ret = self.mean[:4].copy()
        ret[:2] -= ret[2:] / 2
        return ret

    @staticmethod
    def multi_predict(stracks):
        """Predicts the mean and covariance of multiple object tracks using shared Kalman filter."""
        if len(stracks) <= 0:
            return
        multi_mean = np.asarray([st.mean.copy() for st in stracks])
        multi_covariance = np.asarray([st.covariance for st in stracks])
        for i, st in enumerate(stracks):
            if st.state != TrackState.Tracked:
                multi_mean[i][6] = 0
                multi_mean[i][7] = 0
        multi_mean, multi_covariance = BOTrack.shared_kalman.multi_predict(multi_mean, multi_covariance)
        for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)):
            stracks[i].mean = mean
            stracks[i].covariance = cov

    def convert_coords(self, tlwh):
        """Converts Top-Left-Width-Height bounding box coordinates to X-Y-Width-Height format."""
        return self.tlwh_to_xywh(tlwh)

    @staticmethod
    def tlwh_to_xywh(tlwh):
        """Convert bounding box to format `(center x, center y, width, height)`."""
        ret = np.asarray(tlwh).copy()
        ret[:2] += ret[2:] / 2
        return ret


class BOTSORT(BYTETracker):

    def __init__(self, args, frame_rate=30):
        """Initialize YOLOv8 object with ReID module and GMC algorithm."""
        super().__init__(args, frame_rate)
        # ReID module
        self.proximity_thresh = args.proximity_thresh
        self.appearance_thresh = args.appearance_thresh

        if args.with_reid:
            # The original BoT-SORT ReID is not supported yet; a custom ReIDInterface is used instead
            self.encoder = ReIDInterface(config)

        # self.gmc = GMC(method=args.gmc_method)  # commented by WQG

    def get_kalmanfilter(self):
        """Returns an instance of KalmanFilterXYWH for object tracking."""
        return KalmanFilterXYWH()

    def init_track(self, dets, scores, cls, imgs):
        """Initialize track with detections, scores, and classes."""
        if len(dets) == 0:
            return []
        if self.args.with_reid and self.encoder is not None:
            features_keep = self.encoder.inference(imgs, dets)
            return [BOTrack(xyxy, s, c, f) for (xyxy, s, c, f) in zip(dets, scores, cls, features_keep)]  # detections
        else:
            return [BOTrack(xyxy, s, c) for (xyxy, s, c) in zip(dets, scores, cls)]  # detections

    def get_dists(self, tracks, detections):
        """Get distances between tracks and detections using IoU and (optionally) ReID embeddings."""
        dists = matching.iou_distance(tracks, detections)
        # proximity_thresh should be set fairly high: the ReID features are ignored
        # only when two boxes are far apart
        dists_mask = (dists > self.proximity_thresh)

        # TODO: mot20
        # if not self.args.mot20:
        dists = matching.fuse_score(dists, detections)

        if self.args.with_reid and self.encoder is not None:
            emb_dists = matching.embedding_distance(tracks, detections) / 2.0
            emb_dists[emb_dists > self.appearance_thresh] = 1.0
            emb_dists[dists_mask] = 1.0
            dists = np.minimum(dists, emb_dists)

        return dists

    def get_dists_1(self, tracks, detections):
        """Get distances between tracks and detections using IoU and (optionally) ReID embeddings."""
        iou_dists = matching.iou_distance(tracks, detections)
        iou_dists_mask = (iou_dists > 0.9)

        iou_dists = matching.fuse_score(iou_dists, detections)
        weight = 0.4  # only used by the commented-out weighted fusion below
        if self.args.with_reid and self.encoder is not None:
            emb_dists = matching.embedding_distance(tracks, detections)

            '''============ There are two strategies for fusing iou_dists and emb_dists ===========
            1. A ReID similarity threshold: two box crops below this value cannot be the same
               object, so a reasonably trustworthy threshold has to be chosen.
            2. The IoU constraint is a weak one, so the iou_dists mask threshold should be set high.
            '''
            emb_dists_mask = (emb_dists > 0.85)
            iou_dists[emb_dists_mask] = 1
            emb_dists[iou_dists_mask] = 1

            dists = np.minimum(iou_dists, emb_dists)
            '''Alternative strategy: weighted fusion of the two distances'''
            # dists = (1 - weight) * iou_dists + weight * emb_dists
        else:
            dists = iou_dists.copy()

        return dists

    def multi_predict(self, tracks):
        """Predict and track multiple objects with YOLOv8 model."""
        BOTrack.multi_predict(tracks)

    def get_result(self):
        '''written by WQG'''
        activate_tracks = np.asarray([x.tlbr.tolist() + [x.track_id, x.score, x.cls, x.idx]
                                      for x in self.tracked_stracks if x.is_activated], dtype=np.float32)

        track_features = []
        if self.args.with_reid and self.encoder is not None:
            track_features = np.asarray([x.curr_feat for x in self.tracked_stracks if x.is_activated],
                                        dtype=np.float32)

        return (activate_tracks, track_features)
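As a side note, the masked min-fusion in `get_dists_1` above can be illustrated in isolation. A minimal numpy sketch with made-up toy matrices (it skips the `fuse_score` step):

```python
import numpy as np

# Toy cost matrices (values are made up): rows = tracks, cols = detections
iou_dists = np.array([[0.2, 0.95], [0.5, 0.3]])  # 1 - IoU
emb_dists = np.array([[0.1, 0.5], [0.95, 0.2]])  # embedding (appearance) distance

iou_dists_mask = iou_dists > 0.9    # spatially implausible pairs
emb_dists_mask = emb_dists > 0.85   # visually implausible pairs

iou_dists[emb_dists_mask] = 1  # appearance veto blocks a spatial match
emb_dists[iou_dists_mask] = 1  # spatial veto blocks an appearance match

dists = np.minimum(iou_dists, emb_dists)  # keep the stronger surviving cue
print(dists)  # [[0.1  0.95] [0.95 0.2 ]]
```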
tracking/trackers/byte_tracker.py (new file, 464 lines)
@@ -0,0 +1,464 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

import numpy as np

from .basetrack import BaseTrack, TrackState
from .utils import matching
from .utils.kalman_filter import KalmanFilterXYAH


def dists_update(dists, strack_pool, detections):
    '''written by WQG: suppress matches between a track and a detection of different classes.'''

    if len(strack_pool) and len(detections):
        # alabel = np.array([int(stack.cls) if int(stack.cls)==0 or int(stack.cls)==9 else -1 for stack in strack_pool])
        # blabel = np.array([int(stack.cls) if int(stack.cls)==0 or int(stack.cls)==9 else -1 for stack in detections])

        alabel = np.array([int(stack.cls) for stack in strack_pool])
        blabel = np.array([int(stack.cls) for stack in detections])
        amlabel = np.expand_dims(alabel, axis=1).repeat(len(detections), axis=1)
        bmlabel = np.expand_dims(blabel, axis=0).repeat(len(strack_pool), axis=0)
        dist_label = 1 - (bmlabel == amlabel)
        dists = np.where(dists > dist_label, dists, dist_label)
    return dists


class STrack(BaseTrack):
    shared_kalman = KalmanFilterXYAH()

    def __init__(self, tlwh, score, cls):
        """Wait to be activated."""
        self._tlwh = np.asarray(self.tlbr_to_tlwh(tlwh[:-1]), dtype=np.float32)
        self.kalman_filter = None
        self.mean, self.covariance = None, None
        self.is_activated = False

        self.first_find = False  ###

        self.score = score
        self.tracklet_len = 0
        self.cls = cls
        self.idx = tlwh[-1]

    def predict(self):
        """Predicts mean and covariance using Kalman filter."""
        mean_state = self.mean.copy()
        if self.state != TrackState.Tracked:
            mean_state[7] = 0
        self.mean, self.covariance = self.kalman_filter.predict(mean_state, self.covariance)

    @staticmethod
    def multi_predict(stracks):
        """Perform multi-object predictive tracking using Kalman filter for the given stracks."""
        if len(stracks) <= 0:
            return
        multi_mean = np.asarray([st.mean.copy() for st in stracks])
        multi_covariance = np.asarray([st.covariance for st in stracks])
        for i, st in enumerate(stracks):
            if st.state != TrackState.Tracked:
                multi_mean[i][7] = 0
        multi_mean, multi_covariance = STrack.shared_kalman.multi_predict(multi_mean, multi_covariance)
        for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)):
            stracks[i].mean = mean
            stracks[i].covariance = cov

    @staticmethod
    def multi_gmc(stracks, H=np.eye(2, 3)):
        """Update track positions and covariances using a homography matrix."""
        if len(stracks) > 0:
            multi_mean = np.asarray([st.mean.copy() for st in stracks])
            multi_covariance = np.asarray([st.covariance for st in stracks])

            R = H[:2, :2]
            R8x8 = np.kron(np.eye(4, dtype=float), R)
            t = H[:2, 2]

            for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)):
                mean = R8x8.dot(mean)
                mean[:2] += t
                cov = R8x8.dot(cov).dot(R8x8.transpose())

                stracks[i].mean = mean
                stracks[i].covariance = cov

    def activate(self, kalman_filter, frame_id):
        """Start a new tracklet."""
        self.kalman_filter = kalman_filter
        self.track_id = self.next_id()
        self.mean, self.covariance = self.kalman_filter.initiate(self.convert_coords(self._tlwh))

        self.tracklet_len = 0
        self.state = TrackState.Tracked
        if frame_id == 1:
            self.is_activated = True
        else:
            self.first_find = True  # Added by WQG
        self.frame_id = frame_id
        self.start_frame = frame_id

    def re_activate(self, new_track, frame_id, new_id=False):
        """Reactivates a previously lost track with a new detection."""
        self.mean, self.covariance = self.kalman_filter.update(self.mean, self.covariance,
                                                               self.convert_coords(new_track.tlwh))
        self.tracklet_len = 0
        self.state = TrackState.Tracked
        self.is_activated = True
        self.frame_id = frame_id
        if new_id:
            self.track_id = self.next_id()
        self.score = new_track.score
        self.cls = new_track.cls
        self.idx = new_track.idx

        self._tlwh = new_track._tlwh

    def update(self, new_track, frame_id):
        """
        Update a matched track.
        :type new_track: STrack
        :type frame_id: int
        :return:
        """
        self.frame_id = frame_id
        self.tracklet_len += 1

        new_tlwh = new_track.tlwh
        self.mean, self.covariance = self.kalman_filter.update(self.mean, self.covariance,
                                                               self.convert_coords(new_tlwh))
        self.state = TrackState.Tracked
        self.is_activated = True

        self.score = new_track.score
        self.cls = new_track.cls
        self.idx = new_track.idx

        self._tlwh = new_track._tlwh

    def convert_coords(self, tlwh):
        """Convert a bounding box's top-left-width-height format to its x-y-aspect-height equivalent."""
        return self.tlwh_to_xyah(tlwh)

    @property
    def tlwh(self):
        """Get current position in bounding box format `(top left x, top left y, width, height)`."""
        if self.mean is None:
            return self._tlwh.copy()
        ret = self.mean[:4].copy()
        ret[2] *= ret[3]
        ret[:2] -= ret[2:] / 2
        return ret

    @property
    def tlbr(self):
        """Convert bounding box to format `(min x, min y, max x, max y)`, i.e.,
        `(top left, bottom right)`.
        """
        ret = self.tlwh.copy()
        ret[2:] += ret[:2]
        return ret

    @staticmethod
    def tlwh_to_xyah(tlwh):
        """Convert bounding box to format `(center x, center y, aspect ratio, height)`,
        where the aspect ratio is `width / height`.
        """
        ret = np.asarray(tlwh).copy()
        ret[:2] += ret[2:] / 2
        ret[2] /= ret[3]
        return ret

    @staticmethod
    def tlbr_to_tlwh(tlbr):
        """Converts top-left bottom-right format to top-left width height format."""
        ret = np.asarray(tlbr).copy()
        ret[2:] -= ret[:2]
        return ret

    @staticmethod
    def tlwh_to_tlbr(tlwh):
        """Converts tlwh bounding box format to tlbr format."""
        ret = np.asarray(tlwh).copy()
        ret[2:] += ret[:2]
        return ret

    def __repr__(self):
        """Return a string representation of the track with start and end frames and track ID."""
        return f'OT_{self.track_id}_({self.start_frame}-{self.end_frame})'


class BYTETracker:

    def __init__(self, args, frame_rate=30):
        """Initialize a YOLOv8 object to track objects with the given arguments and frame rate."""
        self.tracked_stracks = []  # type: list[STrack]
        self.lost_stracks = []  # type: list[STrack]
        self.removed_stracks = []  # type: list[STrack]

        self.frame_id = 0
        self.args = args
        self.max_time_lost = int(frame_rate / 30.0 * args.track_buffer)
        self.kalman_filter = self.get_kalmanfilter()
        self.reset_id()

        # Added by WQG
        self.args.new_track_thresh = 0.5

    def update(self, results, img=None):
        """Updates object tracker with new detections and returns tracked object bounding boxes."""
        self.frame_id += 1
        activated_stracks = []
        refind_stracks = []
        lost_stracks = []
        removed_stracks = []

        scores = results.conf
        cls = results.cls

        # =============================================================================
        # # get xyxy and add index
        # bboxes = results.xyxy
        # bboxes = np.concatenate([bboxes, np.arange(len(bboxes)).reshape(-1, 1)], axis=-1)
        # =============================================================================
        bboxes = results.xyxyb

        remain_inds = scores > self.args.track_high_thresh
        inds_low = scores > self.args.track_low_thresh
        inds_high = scores < self.args.track_high_thresh

        inds_second = np.logical_and(inds_low, inds_high)
        dets_second = bboxes[inds_second]
        dets = bboxes[remain_inds]
        scores_keep = scores[remain_inds]
        scores_second = scores[inds_second]
        cls_keep = cls[remain_inds]
        cls_second = cls[inds_second]

        detections = self.init_track(dets, scores_keep, cls_keep, img)

        # Add newly detected tracklets to tracked_stracks
        unconfirmed = []
        tracked_stracks = []  # type: list[STrack]
        for track in self.tracked_stracks:
            if not track.is_activated:
                unconfirmed.append(track)
            else:
                tracked_stracks.append(track)

        # Step 2: First association, with high score detection boxes
        strack_pool = self.joint_stracks(tracked_stracks, self.lost_stracks)
        # Predict the current location with KF
        self.multi_predict(strack_pool)

        # ============================================================= GMC is unnecessary here, WQG
        # if hasattr(self, 'gmc') and img is not None:
        #     warp = self.gmc.apply(img, dets)
        #     STrack.multi_gmc(strack_pool, warp)
        #     STrack.multi_gmc(unconfirmed, warp)
        # =============================================================================

        dists = self.get_dists_1(strack_pool, detections)

        '''written by WQG for different classes'''
        dists = dists_update(dists, strack_pool, detections)

        matches, u_track, u_detection = matching.linear_assignment(dists, thresh=self.args.match_thresh)
        for itracked, idet in matches:
            track = strack_pool[itracked]
            det = detections[idet]
            if track.state == TrackState.Tracked:
                track.update(det, self.frame_id)
                activated_stracks.append(track)
            else:
                track.re_activate(det, self.frame_id, new_id=False)
                refind_stracks.append(track)

        # Step 3: Second association, with low score detection boxes
        # Associate the remaining tracks to the low score detections
        detections_second = self.init_track(dets_second, scores_second, cls_second, img)
        r_tracked_stracks = [strack_pool[i] for i in u_track if strack_pool[i].state == TrackState.Tracked]

        # TODO
        dists = matching.iou_distance(r_tracked_stracks, detections_second)
        '''written by WQG for different classes'''
        dists = dists_update(dists, r_tracked_stracks, detections_second)

        matches, u_track, u_detection_second = matching.linear_assignment(dists, thresh=0.5)
        for itracked, idet in matches:
            track = r_tracked_stracks[itracked]
            det = detections_second[idet]
            if track.state == TrackState.Tracked:
                track.update(det, self.frame_id)
                activated_stracks.append(track)
            else:
                track.re_activate(det, self.frame_id, new_id=False)
                refind_stracks.append(track)

        for it in u_track:
            track = r_tracked_stracks[it]
            if track.state != TrackState.Lost:
                track.mark_lost()
                lost_stracks.append(track)

        # Deal with unconfirmed tracks, usually tracks with only one beginning frame
        detections = [detections[i] for i in u_detection]
        dists = self.get_dists_1(unconfirmed, detections)
        '''written by WQG for different classes'''
        dists = dists_update(dists, unconfirmed, detections)

        matches, u_unconfirmed, u_detection = matching.linear_assignment(dists, thresh=0.7)
        for itracked, idet in matches:
            unconfirmed[itracked].update(detections[idet], self.frame_id)
            activated_stracks.append(unconfirmed[itracked])
        for it in u_unconfirmed:
            track = unconfirmed[it]
            if self.frame_id - track.end_frame > 2:  # Added by WQG
                track.mark_removed()
                removed_stracks.append(track)
        # Step 4: Init new stracks
        for inew in u_detection:
            track = detections[inew]
            if track.score < self.args.new_track_thresh:
                continue
            track.activate(self.kalman_filter, self.frame_id)
            activated_stracks.append(track)
        # Step 5: Update state
        for track in self.lost_stracks:
            if self.frame_id - track.end_frame > self.max_time_lost:
                track.mark_removed()
                removed_stracks.append(track)

        self.tracked_stracks = [t for t in self.tracked_stracks if t.state == TrackState.Tracked]
        self.tracked_stracks = self.joint_stracks(self.tracked_stracks, activated_stracks)
        self.tracked_stracks = self.joint_stracks(self.tracked_stracks, refind_stracks)
        self.lost_stracks = self.sub_stracks(self.lost_stracks, self.tracked_stracks)
        self.lost_stracks.extend(lost_stracks)
        self.lost_stracks = self.sub_stracks(self.lost_stracks, self.removed_stracks)
        self.tracked_stracks, self.lost_stracks = self.remove_duplicate_stracks(self.tracked_stracks, self.lost_stracks)
        self.removed_stracks.extend(removed_stracks)
        if len(self.removed_stracks) > 1000:
            self.removed_stracks = self.removed_stracks[-999:]  # clip removed stracks to 1000 maximum

        '''x.tlbr is kept up to date through the tlwh property.'''

        # ================ Output of the original algorithm
        # output = np.asarray([x.tlbr.tolist() + [x.track_id, x.score, x.cls, x.frame_id, x.idx]
        #                      for x in self.tracked_stracks if x.is_activated], dtype=np.float32)

        # ===== written by WQG
        output1 = [x.tlwh_to_tlbr(x._tlwh).tolist() + [x.track_id, x.score, x.cls, x.frame_id, x.idx]
                   for x in self.tracked_stracks if x.is_activated]

        output2 = [x.tlwh_to_tlbr(x._tlwh).tolist() + [x.track_id, x.score, x.cls, x.frame_id, x.idx]
                   for x in activated_stracks if x.first_find]

        output = np.asarray(output1 + output2, dtype=np.float32)

        return output

    def get_result(self):
        '''written by WQG'''
        # =============================================================================
        # activate_tracks = np.asarray([x.tlbr.tolist() + [x.track_id, x.score, x.cls, x.idx]
        #                               for x in self.tracked_stracks if x.is_activated], dtype=np.float32)
        #
        # track_features = []
        # =============================================================================
        tracks = []
        feats = []
        for t in self.tracked_stracks:
            if t.is_activated:
                track = t.tlbr.tolist() + [t.track_id, t.score, t.cls, t.idx]
                feat = t.curr_feature

                tracks.append(track)
                feats.append(feat)

        tracks = np.asarray(tracks, dtype=np.float32)

        return (tracks, feats)

    def get_kalmanfilter(self):
        """Returns a Kalman filter object for tracking bounding boxes."""
        return KalmanFilterXYAH()

    def init_track(self, dets, scores, cls, img=None):
        """Initialize object tracking with detections and scores using the STrack algorithm."""
        return [STrack(xyxy, s, c) for (xyxy, s, c) in zip(dets, scores, cls)] if len(dets) else []  # detections

    def get_dists(self, tracks, detections):
        """Calculates the distance between tracks and detections using IoU and fuses scores."""
        dists = matching.iou_distance(tracks, detections)
        # TODO: mot20
        # if not self.args.mot20:
        dists = matching.fuse_score(dists, detections)
        return dists

    def get_dists_1(self, tracks, detections):
        """Same as get_dists here; BOTSORT overrides this with IoU/ReID fusion."""
        # The original body was a bare `pass`, which returns None and breaks update();
        # fall back to the plain IoU distance so BYTETracker works standalone.
        return self.get_dists(tracks, detections)

    def multi_predict(self, tracks):
        """Returns the predicted tracks using the YOLOv8 network."""
        STrack.multi_predict(tracks)

    def reset_id(self):
        """Resets the ID counter of STrack."""
        STrack.reset_id()

    @staticmethod
    def joint_stracks(tlista, tlistb):
        """Combine two lists of stracks into a single one."""
        exists = {}
        res = []
        for t in tlista:
            exists[t.track_id] = 1
            res.append(t)
        for t in tlistb:
            tid = t.track_id
            if not exists.get(tid, 0):
                exists[tid] = 1
                res.append(t)
        return res

    @staticmethod
    def sub_stracks(tlista, tlistb):
        """DEPRECATED CODE in https://github.com/ultralytics/ultralytics/pull/1890/
        stracks = {t.track_id: t for t in tlista}
        for t in tlistb:
            tid = t.track_id
            if stracks.get(tid, 0):
                del stracks[tid]
        return list(stracks.values())
        """
        track_ids_b = {t.track_id for t in tlistb}
        return [t for t in tlista if t.track_id not in track_ids_b]

    @staticmethod
    def remove_duplicate_stracks(stracksa, stracksb):
        """Remove duplicate stracks with non-maximum IoU distance."""
        pdist = matching.iou_distance(stracksa, stracksb)
        pairs = np.where(pdist < 0.15)
        dupa, dupb = [], []
        for p, q in zip(*pairs):
            timep = stracksa[p].frame_id - stracksa[p].start_frame
            timeq = stracksb[q].frame_id - stracksb[q].start_frame
            if timep > timeq:
                dupb.append(q)
            else:
                dupa.append(p)
        resa = [t for i, t in enumerate(stracksa) if i not in dupa]
        resb = [t for i, t in enumerate(stracksb) if i not in dupb]
        return resa, resb
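The class gating performed by `dists_update` above is easy to see in isolation. A minimal numpy sketch with hypothetical class labels and a made-up cost matrix:

```python
import numpy as np

dists = np.array([[0.1, 0.4], [0.3, 0.2]])  # cost matrix: 2 tracks x 2 detections
track_cls = np.array([0, 9])  # hypothetical class ids of the tracks
det_cls = np.array([0, 0])    # hypothetical class ids of the detections

# dist_label[i, j] is 0 when classes agree and 1 when they differ
dist_label = 1 - (track_cls[:, None] == det_cls[None, :])
# raise every cross-class cost to at least 1, so linear assignment never matches them
dists = np.where(dists > dist_label, dists, dist_label)
print(dists)  # [[0.1 0.4] [1.  1. ]]
```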
tracking/trackers/cfg/botsort.yaml (new file, 18 lines)
@@ -0,0 +1,18 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Default YOLO tracker settings for BoT-SORT tracker https://github.com/NirAharon/BoT-SORT

tracker_type: botsort  # tracker type, ['botsort', 'bytetrack']
track_high_thresh: 0.5  # threshold for the first association
track_low_thresh: 0.1  # threshold for the second association
new_track_thresh: 0.6  # threshold to init a new track if the detection does not match any existing tracks
track_buffer: 30  # buffer to calculate the time when to remove tracks
match_thresh: 0.8  # threshold for matching tracks
# min_box_area: 10  # threshold for min box areas (for tracker evaluation, not used for now)
# mot20: False  # for tracker evaluation (not used for now)

# BoT-SORT settings
gmc_method: sparseOptFlow  # method of global motion compensation
# ReID model related thresholds
proximity_thresh: 0.5
appearance_thresh: 0.25
with_reid: True
tracking/trackers/cfg/bytetrack.yaml (new file, 11 lines)
@@ -0,0 +1,11 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Default YOLO tracker settings for ByteTrack tracker https://github.com/ifzhang/ByteTrack

tracker_type: bytetrack  # tracker type, ['botsort', 'bytetrack']
track_high_thresh: 0.5  # threshold for the first association
track_low_thresh: 0.1  # threshold for the second association
new_track_thresh: 0.6  # threshold to init a new track if the detection does not match any existing tracks
track_buffer: 30  # buffer to calculate the time when to remove tracks
match_thresh: 0.8  # threshold for matching tracks
# min_box_area: 10  # threshold for min box areas (for tracker evaluation, not used for now)
# mot20: False  # for tracker evaluation (not used for now)
tracking/trackers/reid/__init__.py (new file, 7 lines)
@@ -0,0 +1,7 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 19 16:15:35 2024

@author: ym
"""
BIN tracking/trackers/reid/__pycache__/__init__.cpython-39.pyc (new file; binary file not shown)
BIN tracking/trackers/reid/__pycache__/config.cpython-39.pyc (new file; binary file not shown)
BIN tracking/trackers/reid/__pycache__/reid_interface.cpython-39.pyc (new file; binary file not shown)
BIN tracking/trackers/reid/best.pth (new file; binary file not shown)
BIN tracking/trackers/reid/ckpts/resnet18_1220/best.pth (new file; binary file not shown)
tracking/trackers/reid/config.py (new file, 42 lines)
@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 19 14:01:46 2024

@author: ym
"""

import os

import torch

# import torchvision.transforms as T


class Config:
    # network settings
    backbone = 'resnet18'  # [resnet18, mobilevit_s, mobilenet_v2, mobilenetv3]
    batch_size = 8
    embedding_size = 256
    img_size = 224

    current_path = os.path.dirname(os.path.abspath(__file__))
    # joined per-component (instead of a raw Windows path) so it works on both Windows and Linux
    model_path = os.path.join(current_path, 'ckpts', 'resnet18_1220', 'best.pth')

    # model_path = "./trackers/reid/ckpts/resnet18_1220/best.pth"
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # =============================================================================
    # metric = 'arcface'  # [cosface, arcface]
    # drop_ratio = 0.5
    #
    # # training settings
    # checkpoints = "checkpoints/Mobilev3Large_1225"  # [resnet18, mobilevit_s, mobilenet_v2, mobilenetv3]
    # restore = False
    #
    # test_model = "./checkpoints/resnet18_1220/best.pth"
    #
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # pin_memory = True  # if memory is large, set it True to speed up a bit
    # num_workers = 4  # dataloader
    # =============================================================================


config = Config()
tracking/trackers/reid/model/BAM.py (new file, 83 lines)
@@ -0,0 +1,83 @@
import torch.nn as nn
import torchvision
from torch.nn import init


class Flatten(nn.Module):
    def forward(self, x):
        return x.view(x.shape[0], -1)


class ChannelAttention(nn.Module):
    # was misspelled `__int__`, so the constructor never ran; defaults added so
    # BAMblock's two-argument call works
    def __init__(self, channel, reduction=16, num_layers=3):
        super(ChannelAttention, self).__init__()
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        gate_channels = [channel]
        gate_channels += [channel // reduction] * num_layers  # was len(channel)//reduction, but channel is an int
        gate_channels += [channel]

        self.ca = nn.Sequential()
        self.ca.add_module('flatten', Flatten())
        for i in range(len(gate_channels) - 2):
            # unique names; repeated add_module('') calls would overwrite each other
            self.ca.add_module('fc%d' % i, nn.Linear(gate_channels[i], gate_channels[i + 1]))
            self.ca.add_module('bn%d' % i, nn.BatchNorm1d(gate_channels[i + 1]))
            self.ca.add_module('relu%d' % i, nn.ReLU())
        self.ca.add_module('fc_out', nn.Linear(gate_channels[-2], gate_channels[-1]))

    def forward(self, x):
        res = self.avgpool(x)
        res = self.ca(res)
        res = res.unsqueeze(-1).unsqueeze(-1).expand_as(x)
        return res


class SpatialAttention(nn.Module):
    def __init__(self, channel, reduction=16, num_lay=3, dilation=2):  # was misspelled `__int__`
        super(SpatialAttention, self).__init__()  # was super(SpatialAttention).__init__(), which skips nn.Module.__init__
        self.sa = nn.Sequential()
        # out_channels was (channel//reduction)*3, mismatching the BatchNorm below
        self.sa.add_module('conv_reduce', nn.Conv2d(kernel_size=1, in_channels=channel,
                                                    out_channels=channel // reduction))
        self.sa.add_module('bn_reduce', nn.BatchNorm2d(num_features=channel // reduction))
        self.sa.add_module('relu_reduce', nn.ReLU())
        for i in range(num_lay):
            self.sa.add_module('conv%d' % i, nn.Conv2d(kernel_size=3,
                                                       in_channels=channel // reduction,
                                                       out_channels=channel // reduction,
                                                       padding=dilation,  # padding must match dilation to keep the spatial size
                                                       dilation=dilation))
            self.sa.add_module('bn%d' % i, nn.BatchNorm2d(channel // reduction))
            self.sa.add_module('relu%d' % i, nn.ReLU())
        self.sa.add_module('conv_out', nn.Conv2d(channel // reduction, 1, kernel_size=1))

    def forward(self, x):
        res = self.sa(x)
        res = res.expand_as(x)
        return res


class BAMblock(nn.Module):
    def __init__(self, channel=512, reduction=16, dia_val=2):
        super(BAMblock, self).__init__()
        self.ca = ChannelAttention(channel, reduction)
        self.sa = SpatialAttention(channel, reduction, dilation=dia_val)  # dia_val was passed positionally into num_lay
        self.sigmoid = nn.Sigmoid()

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                init.kaiming_normal_(m.weight, mode='fan_out')  # init.kaiming_normal is deprecated
                if m.bias is not None:  # was m.bais
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    def forward(self, x):
        b, c, _, _ = x.size()
        sa_out = self.sa(x)
        ca_out = self.ca(x)
        weight = self.sigmoid(sa_out + ca_out)
        out = (1 + weight) * x
        return out


if __name__ == "__main__":
    print(512 // 14)
tracking/trackers/reid/model/CBAM.py (new file, 68 lines)
@@ -0,0 +1,68 @@
import torch
import torch.nn as nn
import torch.nn.init as init


class channelAttention(nn.Module):
    def __init__(self, channel, reduction=16):
        super(channelAttention, self).__init__()
        self.Maxpooling = nn.AdaptiveMaxPool2d(1)
        self.Avepooling = nn.AdaptiveAvgPool2d(1)
        self.ca = nn.Sequential()
        self.ca.add_module('conv1', nn.Conv2d(channel, channel // reduction, 1, bias=False))
        self.ca.add_module('Relu', nn.ReLU())
        self.ca.add_module('conv2', nn.Conv2d(channel // reduction, channel, 1, bias=False))
        self.sigmoid = nn.Sigmoid()  # was misspelled 'sigmod'

    def forward(self, x):
        M_out = self.Maxpooling(x)
        A_out = self.Avepooling(x)
        M_out = self.ca(M_out)
        A_out = self.ca(A_out)
        out = self.sigmoid(M_out + A_out)
        return out


class SpatialAttention(nn.Module):
    def __init__(self, kernel_size=7):
        super().__init__()
        self.conv = nn.Conv2d(in_channels=2, out_channels=1, kernel_size=kernel_size, padding=kernel_size // 2)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        max_result, _ = torch.max(x, dim=1, keepdim=True)
        avg_result = torch.mean(x, dim=1, keepdim=True)
        result = torch.cat([max_result, avg_result], dim=1)
        output = self.conv(result)
        output = self.sigmoid(output)
        return output


class CBAM(nn.Module):
    def __init__(self, channel=512, reduction=16, kernel_size=7):
        super().__init__()
        self.ca = channelAttention(channel, reduction)
        self.sa = SpatialAttention(kernel_size)

    def init_weights(self):
        for m in self.modules():  # weight initialization
            if isinstance(m, nn.Conv2d):
                init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    def forward(self, x):
        # b, c, _, _ = x.size()
        # residual = x
        out = x * self.ca(x)
        out = out * self.sa(out)
        return out


if __name__ == '__main__':
    input = torch.randn(50, 512, 7, 7)
    kernel_size = input.shape[2]
    cbam = CBAM(channel=512, reduction=16, kernel_size=kernel_size)
    output = cbam(input)
    print(output.shape)
tracking/trackers/reid/model/Tool.py (new file, 33 lines)
@@ -0,0 +1,33 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class GeM(nn.Module):
    def __init__(self, p=3, eps=1e-6):
        super(GeM, self).__init__()
        self.p = nn.Parameter(torch.ones(1) * p)
        self.eps = eps

    def forward(self, x):
        return self.gem(x, p=self.p, eps=self.eps, stride=2)

    def gem(self, x, p=3, eps=1e-6, stride=2):
        return F.avg_pool2d(x.clamp(min=eps).pow(p), (x.size(-2), x.size(-1)), stride=2).pow(1. / p)

    def __repr__(self):
        return self.__class__.__name__ + \
            '(' + 'p=' + '{:.4f}'.format(self.p.data.tolist()[0]) + \
            ', ' + 'eps=' + str(self.eps) + ')'


class TripletLoss(nn.Module):
    def __init__(self, margin):
        super(TripletLoss, self).__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative, size_average=True):
        distance_positive = (anchor - positive).pow(2).sum(1)
        distance_negative = (anchor - negative).pow(2).sum(1)
        # standard triplet loss is d(a, p) - d(a, n) + margin; the operands were swapped,
        # which would pull the negative closer instead of pushing it away
        losses = F.relu(distance_positive - distance_negative + self.margin)
        return losses.mean() if size_average else losses.sum()


if __name__ == '__main__':
    print('')
tracking/trackers/reid/model/__init__.py (new file, 9 lines)
@@ -0,0 +1,9 @@
from .fmobilenet import FaceMobileNet
from .resnet_face import ResIRSE
from .mobilevit import mobilevit_s
from .metric import ArcFace, CosFace
from .loss import FocalLoss
from .resbam import resnet
from .resnet_pre import resnet18, resnet34, resnet50
from .mobilenet_v2 import mobilenet_v2
from .mobilenet_v3 import MobileNetV3_Small, MobileNetV3_Large
BIN tracking/trackers/reid/model/__pycache__/CBAM.cpython-39.pyc (new file; binary file not shown)
BIN tracking/trackers/reid/model/__pycache__/Tool.cpython-39.pyc (new file; binary file not shown)
BIN tracking/trackers/reid/model/__pycache__/__init__.cpython-39.pyc (new file; binary file not shown)
BIN tracking/trackers/reid/model/__pycache__/loss.cpython-39.pyc (new file; binary file not shown)
BIN tracking/trackers/reid/model/__pycache__/metric.cpython-39.pyc (new file; binary file not shown)
BIN tracking/trackers/reid/model/__pycache__/resbam.cpython-39.pyc (new file; binary file not shown)
BIN tracking/trackers/reid/model/__pycache__/utils.cpython-39.pyc (new file; binary file not shown)
BIN (several additional binary .pyc files, names not shown in this view)
tracking/trackers/reid/model/fmobilenet.py (new file, 124 lines)
@@ -0,0 +1,124 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class Flatten(nn.Module):
    def forward(self, x):
        return x.view(x.shape[0], -1)


class ConvBn(nn.Module):

    def __init__(self, in_c, out_c, kernel=(1, 1), stride=1, padding=0, groups=1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_c, out_c, kernel, stride, padding, groups=groups, bias=False),
            nn.BatchNorm2d(out_c)
        )

    def forward(self, x):
        return self.net(x)


class ConvBnPrelu(nn.Module):

    def __init__(self, in_c, out_c, kernel=(1, 1), stride=1, padding=0, groups=1):
        super().__init__()
        self.net = nn.Sequential(
            ConvBn(in_c, out_c, kernel, stride, padding, groups),
            nn.PReLU(out_c)
        )

    def forward(self, x):
        return self.net(x)


class DepthWise(nn.Module):

    def __init__(self, in_c, out_c, kernel=(3, 3), stride=2, padding=1, groups=1):
        super().__init__()
        self.net = nn.Sequential(
            ConvBnPrelu(in_c, groups, kernel=(1, 1), stride=1, padding=0),
            ConvBnPrelu(groups, groups, kernel=kernel, stride=stride, padding=padding, groups=groups),
            ConvBn(groups, out_c, kernel=(1, 1), stride=1, padding=0),
        )

    def forward(self, x):
        return self.net(x)


class DepthWiseRes(nn.Module):
    """DepthWise with Residual"""

    def __init__(self, in_c, out_c, kernel=(3, 3), stride=2, padding=1, groups=1):
        super().__init__()
        self.net = DepthWise(in_c, out_c, kernel, stride, padding, groups)

    def forward(self, x):
        return self.net(x) + x


class MultiDepthWiseRes(nn.Module):

    def __init__(self, num_block, channels, kernel=(3, 3), stride=1, padding=1, groups=1):
        super().__init__()

        self.net = nn.Sequential(*[
            DepthWiseRes(channels, channels, kernel, stride, padding, groups)
            for _ in range(num_block)
        ])

    def forward(self, x):
        return self.net(x)


class FaceMobileNet(nn.Module):

    def __init__(self, embedding_size):
        super().__init__()
        self.conv1 = ConvBnPrelu(1, 64, kernel=(3, 3), stride=2, padding=1)
        self.conv2 = ConvBn(64, 64, kernel=(3, 3), stride=1, padding=1, groups=64)
        self.conv3 = DepthWise(64, 64, kernel=(3, 3), stride=2, padding=1, groups=128)
        self.conv4 = MultiDepthWiseRes(num_block=4, channels=64, kernel=3, stride=1, padding=1, groups=128)
        self.conv5 = DepthWise(64, 128, kernel=(3, 3), stride=2, padding=1, groups=256)
        self.conv6 = MultiDepthWiseRes(num_block=6, channels=128, kernel=(3, 3), stride=1, padding=1, groups=256)
        self.conv7 = DepthWise(128, 128, kernel=(3, 3), stride=2, padding=1, groups=512)
        self.conv8 = MultiDepthWiseRes(num_block=2, channels=128, kernel=(3, 3), stride=1, padding=1, groups=256)
        self.conv9 = ConvBnPrelu(128, 512, kernel=(1, 1))
        self.conv10 = ConvBn(512, 512, groups=512, kernel=(7, 7))
        self.flatten = Flatten()
        self.linear = nn.Linear(2048, embedding_size, bias=False)
        self.bn = nn.BatchNorm1d(embedding_size)

    def forward(self, x):
        # print('x', x.shape)
        out = self.conv1(x)
        out = self.conv2(out)
        out = self.conv3(out)
        out = self.conv4(out)
        out = self.conv5(out)
        out = self.conv6(out)
        out = self.conv7(out)
        out = self.conv8(out)
        out = self.conv9(out)
        out = self.conv10(out)
        out = self.flatten(out)
        out = self.linear(out)
        out = self.bn(out)
        return out


if __name__ == "__main__":
    import numpy as np
    from PIL import Image

    x = Image.open("../samples/009.jpg").convert('L')
    x = x.resize((128, 128))
    x = np.asarray(x, dtype=np.float32)
    x = x[None, None, ...]
    x = torch.from_numpy(x)
    net = FaceMobileNet(512)
    net.eval()
    with torch.no_grad():
        out = net(x)
    print(out.shape)
tracking/trackers/reid/model/loss.py (new file, 18 lines)
@@ -0,0 +1,18 @@
import torch
import torch.nn as nn


class FocalLoss(nn.Module):

    def __init__(self, gamma=2):
        super().__init__()
        self.gamma = gamma
        self.ce = torch.nn.CrossEntropyLoss()

    def forward(self, input, target):
        # print(f'theta {input.shape, input[0]}, target {target.shape, target}')
        logp = self.ce(input, target)
        p = torch.exp(-logp)
        loss = (1 - p) ** self.gamma * logp
        return loss.mean()
tracking/trackers/reid/model/metric.py (new file, 83 lines)
@@ -0,0 +1,83 @@
# Definition of ArcFace loss and CosFace loss

import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class ArcFace(nn.Module):

    def __init__(self, embedding_size, class_num, s=30.0, m=0.50):
        """ArcFace formula:
            cos(m + theta) = cos(m)cos(theta) - sin(m)sin(theta)
        Note that:
            0 <= m + theta <= Pi
        So if (m + theta) >= Pi, then theta >= Pi - m. In [0, Pi]
        we have:
            cos(theta) < cos(Pi - m)
        So we can use cos(Pi - m) as a threshold to check whether
        (m + theta) goes out of [0, Pi].

        Args:
            embedding_size: usually 128, 256, 512 ...
            class_num: number of identities in the training set
            s: scale, see NormFace https://arxiv.org/abs/1704.06369
            m: margin, see the SphereFace, CosFace, and ArcFace papers
        """
        super().__init__()
        self.in_features = embedding_size
        self.out_features = class_num
        self.s = s
        self.m = m
        self.weight = nn.Parameter(torch.FloatTensor(class_num, embedding_size))
        nn.init.xavier_uniform_(self.weight)

        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        # print(f"embedding {self.in_features}, class_num {self.out_features}, input {len(input)}, label {len(label)}")
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        sine = ((1.0 - cosine.pow(2)).clamp(0, 1)).sqrt()
        phi = cosine * self.cos_m - sine * self.sin_m
        phi = torch.where(cosine > self.th, phi, cosine - self.mm)  # drop to CosFace when out of range
        # update y_i by phi in cosine
        output = cosine * 1.0  # make backward work
        batch_size = len(output)
        output[range(batch_size), label] = phi[range(batch_size), label]
        return output * self.s


class CosFace(nn.Module):

    def __init__(self, in_features, out_features, s=30.0, m=0.40):
        """
        Args:
            in_features: embedding size, usually 128, 256, 512 ...
            out_features: number of identities in the training set
            s: scale, see NormFace https://arxiv.org/abs/1704.06369
            m: margin, see the SphereFace, CosFace, and ArcFace papers
        """
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, input, label):
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        phi = cosine - self.m
        output = cosine * 1.0  # make backward work
        batch_size = len(output)
        output[range(batch_size), label] = phi[range(batch_size), label]
        return output * self.s
tracking/trackers/reid/model/mobilenet_v2.py (new file, 200 lines)
@@ -0,0 +1,200 @@
from torch import nn

from .utils import load_state_dict_from_url
from ..config import config as conf

__all__ = ['MobileNetV2', 'mobilenet_v2']


model_urls = {
    'mobilenet_v2': 'https://download.pytorch.org/models/mobilenet_v2-b0353104.pth',
}


def _make_divisible(v, divisor, min_value=None):
    """
    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by 8.
    It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    :param v:
    :param divisor:
    :param min_value:
    :return:
    """
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


class ConvBNReLU(nn.Sequential):
    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1, norm_layer=None):
        padding = (kernel_size - 1) // 2
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        super(ConvBNReLU, self).__init__(
            nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
            norm_layer(out_planes),
            nn.ReLU6(inplace=True)
        )


class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, stride, expand_ratio, norm_layer=None):
        super(InvertedResidual, self).__init__()
        self.stride = stride
        assert stride in [1, 2]

        if norm_layer is None:
            norm_layer = nn.BatchNorm2d

        hidden_dim = int(round(inp * expand_ratio))
        self.use_res_connect = self.stride == 1 and inp == oup

        layers = []
        if expand_ratio != 1:
            # pw
            layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1, norm_layer=norm_layer))
        layers.extend([
            # dw
            ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim, norm_layer=norm_layer),
            # pw-linear
            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
            norm_layer(oup),
        ])
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)


class MobileNetV2(nn.Module):
    def __init__(self,
                 num_classes=conf.embedding_size,
                 width_mult=1.0,
                 inverted_residual_setting=None,
                 round_nearest=8,
                 block=None,
                 norm_layer=None):
        """
        MobileNet V2 main class

        Args:
            num_classes (int): Number of classes
            width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
            inverted_residual_setting: Network structure
            round_nearest (int): Round the number of channels in each layer to be a multiple of this number.
            Set to 1 to turn off rounding
            block: Module specifying inverted residual building block for mobilenet
            norm_layer: Module specifying the normalization layer to use

        """
        super(MobileNetV2, self).__init__()

        if block is None:
            block = InvertedResidual

        if norm_layer is None:
            norm_layer = nn.BatchNorm2d

        input_channel = 32
        last_channel = 1280

        if inverted_residual_setting is None:
            inverted_residual_setting = [
                # t, c, n, s
                [1, 16, 1, 1],
                [6, 24, 2, 2],
                [6, 32, 3, 2],
                [6, 64, 4, 2],
                [6, 96, 3, 1],
                [6, 160, 3, 2],
                [6, 320, 1, 1],
            ]

        # only check the first element, assuming user knows t,c,n,s are required
        if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
            raise ValueError("inverted_residual_setting should be non-empty "
                             "or a 4-element list, got {}".format(inverted_residual_setting))

        # building first layer
        input_channel = _make_divisible(input_channel * width_mult, round_nearest)
        self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
        features = [ConvBNReLU(3, input_channel, stride=2, norm_layer=norm_layer)]
        # building inverted residual blocks
        for t, c, n, s in inverted_residual_setting:
            output_channel = _make_divisible(c * width_mult, round_nearest)
            for i in range(n):
                stride = s if i == 0 else 1
                features.append(block(input_channel, output_channel, stride, expand_ratio=t, norm_layer=norm_layer))
                input_channel = output_channel
        # building last several layers
        features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1, norm_layer=norm_layer))
        # make it nn.Sequential
        self.features = nn.Sequential(*features)

        # building classifier
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.last_channel, num_classes),
        )

        # weight initialization
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.zeros_(m.bias)

    def _forward_impl(self, x):
        # This exists since TorchScript doesn't support inheritance, so the superclass method
|
||||
# (this one) needs to have a name other than `forward` that can be accessed in a subclass
|
||||
x = self.features(x)
|
||||
# Cannot use "squeeze" as batch-size can be 1 => must use reshape with x.shape[0]
|
||||
x = nn.functional.adaptive_avg_pool2d(x, 1).reshape(x.shape[0], -1)
|
||||
x = self.classifier(x)
|
||||
return x
|
||||
|
||||
def forward(self, x):
|
||||
return self._forward_impl(x)
|
||||
|
||||
|
||||
def mobilenet_v2(pretrained=True, progress=True, **kwargs):
|
||||
"""
|
||||
Constructs a MobileNetV2 architecture from
|
||||
`"MobileNetV2: Inverted Residuals and Linear Bottlenecks" <https://arxiv.org/abs/1801.04381>`_.
|
||||
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
progress (bool): If True, displays a progress bar of the download to stderr
|
||||
"""
|
||||
model = MobileNetV2(**kwargs)
|
||||
if pretrained:
|
||||
state_dict = load_state_dict_from_url(model_urls['mobilenet_v2'],
|
||||
progress=progress)
|
||||
src_state_dict = state_dict
|
||||
target_state_dict = model.state_dict()
|
||||
skip_keys = []
|
||||
# skip mismatch size tensors in case of pretraining
|
||||
for k in src_state_dict.keys():
|
||||
if k not in target_state_dict:
|
||||
continue
|
||||
if src_state_dict[k].size() != target_state_dict[k].size():
|
||||
skip_keys.append(k)
|
||||
for k in skip_keys:
|
||||
del src_state_dict[k]
|
||||
missing_keys, unexpected_keys = model.load_state_dict(src_state_dict, strict=False)
|
||||
#.load_state_dict(state_dict)
|
||||
return model
|
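A quick smoke test of the constructor above; pretrained=False skips the download, and the 224x224 input is an assumption (the real pipeline feeds conf.img_size crops):

```python
import torch

net = mobilenet_v2(pretrained=False)
net.eval()
with torch.no_grad():
    emb = net(torch.randn(2, 3, 224, 224))
print(emb.shape)  # torch.Size([2, conf.embedding_size])
```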
200
tracking/trackers/reid/model/mobilenet_v3.py
Normal file
@ -0,0 +1,200 @@
|
||||
'''MobileNetV3 in PyTorch.
|
||||
|
||||
See the paper "Searching for MobileNetV3"
|
||||
(https://arxiv.org/abs/1905.02244) for more details.
|
||||
'''
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch.nn import init
|
||||
from ..config import config as conf
|
||||
|
||||
|
||||
class hswish(nn.Module):
|
||||
def forward(self, x):
|
||||
out = x * F.relu6(x + 3, inplace=True) / 6
|
||||
return out
|
||||
|
||||
|
||||
class hsigmoid(nn.Module):
|
||||
def forward(self, x):
|
||||
out = F.relu6(x + 3, inplace=True) / 6
|
||||
return out
|
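||||
# sanity values: h-swish(-4) = 0, h-swish(0) = 0, h-swish(4) = 4;
|
||||
# h-sigmoid rises linearly from 0 to 1 over [-3, 3] and saturates outside that range
|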
||||
|
||||
|
||||
class SeModule(nn.Module):
|
||||
def __init__(self, in_size, reduction=4):
|
||||
super(SeModule, self).__init__()
|
||||
self.se = nn.Sequential(
|
||||
nn.AdaptiveAvgPool2d(1),
|
||||
nn.Conv2d(in_size, in_size // reduction, kernel_size=1, stride=1, padding=0, bias=False),
|
||||
nn.BatchNorm2d(in_size // reduction),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.Conv2d(in_size // reduction, in_size, kernel_size=1, stride=1, padding=0, bias=False),
|
||||
nn.BatchNorm2d(in_size),
|
||||
hsigmoid()
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
return x * self.se(x)
|
||||
|
||||
|
||||
class Block(nn.Module):
|
||||
'''expand + depthwise + pointwise'''
|
||||
def __init__(self, kernel_size, in_size, expand_size, out_size, nolinear, semodule, stride):
|
||||
super(Block, self).__init__()
|
||||
self.stride = stride
|
||||
self.se = semodule
|
||||
|
||||
self.conv1 = nn.Conv2d(in_size, expand_size, kernel_size=1, stride=1, padding=0, bias=False)
|
||||
self.bn1 = nn.BatchNorm2d(expand_size)
|
||||
self.nolinear1 = nolinear
|
||||
self.conv2 = nn.Conv2d(expand_size, expand_size, kernel_size=kernel_size, stride=stride, padding=kernel_size//2, groups=expand_size, bias=False)
|
||||
self.bn2 = nn.BatchNorm2d(expand_size)
|
||||
self.nolinear2 = nolinear
|
||||
self.conv3 = nn.Conv2d(expand_size, out_size, kernel_size=1, stride=1, padding=0, bias=False)
|
||||
self.bn3 = nn.BatchNorm2d(out_size)
|
||||
|
||||
self.shortcut = nn.Sequential()
|
||||
if stride == 1 and in_size != out_size:
|
||||
self.shortcut = nn.Sequential(
|
||||
nn.Conv2d(in_size, out_size, kernel_size=1, stride=1, padding=0, bias=False),
|
||||
nn.BatchNorm2d(out_size),
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
out = self.nolinear1(self.bn1(self.conv1(x)))
|
||||
out = self.nolinear2(self.bn2(self.conv2(out)))
|
||||
out = self.bn3(self.conv3(out))
|
||||
if self.se is not None:
|
||||
out = self.se(out)
|
||||
out = out + self.shortcut(x) if self.stride==1 else out
|
||||
return out
|
||||
|
||||
|
||||
class MobileNetV3_Large(nn.Module):
|
||||
def __init__(self, num_classes=conf.embedding_size):
|
||||
super(MobileNetV3_Large, self).__init__()
|
||||
self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=2, padding=1, bias=False)
|
||||
self.bn1 = nn.BatchNorm2d(16)
|
||||
self.hs1 = hswish()
|
||||
|
||||
self.bneck = nn.Sequential(
|
||||
Block(3, 16, 16, 16, nn.ReLU(inplace=True), None, 1),
|
||||
Block(3, 16, 64, 24, nn.ReLU(inplace=True), None, 2),
|
||||
Block(3, 24, 72, 24, nn.ReLU(inplace=True), None, 1),
|
||||
Block(5, 24, 72, 40, nn.ReLU(inplace=True), SeModule(40), 2),
|
||||
Block(5, 40, 120, 40, nn.ReLU(inplace=True), SeModule(40), 1),
|
||||
Block(5, 40, 120, 40, nn.ReLU(inplace=True), SeModule(40), 1),
|
||||
Block(3, 40, 240, 80, hswish(), None, 2),
|
||||
Block(3, 80, 200, 80, hswish(), None, 1),
|
||||
Block(3, 80, 184, 80, hswish(), None, 1),
|
||||
Block(3, 80, 184, 80, hswish(), None, 1),
|
||||
Block(3, 80, 480, 112, hswish(), SeModule(112), 1),
|
||||
Block(3, 112, 672, 112, hswish(), SeModule(112), 1),
|
||||
Block(5, 112, 672, 160, hswish(), SeModule(160), 1),
|
||||
Block(5, 160, 672, 160, hswish(), SeModule(160), 2),
|
||||
Block(5, 160, 960, 160, hswish(), SeModule(160), 1),
|
||||
)
|
||||
|
||||
|
||||
self.conv2 = nn.Conv2d(160, 960, kernel_size=1, stride=1, padding=0, bias=False)
|
||||
self.bn2 = nn.BatchNorm2d(960)
|
||||
self.hs2 = hswish()
|
||||
self.linear3 = nn.Linear(960, 1280)
|
||||
self.bn3 = nn.BatchNorm1d(1280)
|
||||
self.hs3 = hswish()
|
||||
self.linear4 = nn.Linear(1280, num_classes)
|
||||
self.init_params()
|
||||
|
||||
def init_params(self):
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
init.kaiming_normal_(m.weight, mode='fan_out')
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.BatchNorm2d):
|
||||
init.constant_(m.weight, 1)
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.Linear):
|
||||
init.normal_(m.weight, std=0.001)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
|
||||
def forward(self, x):
|
||||
out = self.hs1(self.bn1(self.conv1(x)))
|
||||
out = self.bneck(out)
|
||||
out = self.hs2(self.bn2(self.conv2(out)))
|
||||
out = F.avg_pool2d(out, conf.img_size // 32)
|
||||
out = out.view(out.size(0), -1)
|
||||
out = self.hs3(self.bn3(self.linear3(out)))
|
||||
out = self.linear4(out)
|
||||
return out
|
||||
|
||||
|
||||
|
||||
class MobileNetV3_Small(nn.Module):
|
||||
def __init__(self, num_classes=conf.embedding_size):
|
||||
super(MobileNetV3_Small, self).__init__()
|
||||
self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=2, padding=1, bias=False)
|
||||
self.bn1 = nn.BatchNorm2d(16)
|
||||
self.hs1 = hswish()
|
||||
|
||||
self.bneck = nn.Sequential(
|
||||
Block(3, 16, 16, 16, nn.ReLU(inplace=True), SeModule(16), 2),
|
||||
Block(3, 16, 72, 24, nn.ReLU(inplace=True), None, 2),
|
||||
Block(3, 24, 88, 24, nn.ReLU(inplace=True), None, 1),
|
||||
Block(5, 24, 96, 40, hswish(), SeModule(40), 2),
|
||||
Block(5, 40, 240, 40, hswish(), SeModule(40), 1),
|
||||
Block(5, 40, 240, 40, hswish(), SeModule(40), 1),
|
||||
Block(5, 40, 120, 48, hswish(), SeModule(48), 1),
|
||||
Block(5, 48, 144, 48, hswish(), SeModule(48), 1),
|
||||
Block(5, 48, 288, 96, hswish(), SeModule(96), 2),
|
||||
Block(5, 96, 576, 96, hswish(), SeModule(96), 1),
|
||||
Block(5, 96, 576, 96, hswish(), SeModule(96), 1),
|
||||
)
|
||||
|
||||
|
||||
self.conv2 = nn.Conv2d(96, 576, kernel_size=1, stride=1, padding=0, bias=False)
|
||||
self.bn2 = nn.BatchNorm2d(576)
|
||||
self.hs2 = hswish()
|
||||
self.linear3 = nn.Linear(576, 1280)
|
||||
self.bn3 = nn.BatchNorm1d(1280)
|
||||
self.hs3 = hswish()
|
||||
self.linear4 = nn.Linear(1280, num_classes)
|
||||
self.init_params()
|
||||
|
||||
def init_params(self):
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
init.kaiming_normal_(m.weight, mode='fan_out')
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.BatchNorm2d):
|
||||
init.constant_(m.weight, 1)
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.Linear):
|
||||
init.normal_(m.weight, std=0.001)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
|
||||
def forward(self, x):
|
||||
out = self.hs1(self.bn1(self.conv1(x)))
|
||||
out = self.bneck(out)
|
||||
out = self.hs2(self.bn2(self.conv2(out)))
|
||||
out = F.avg_pool2d(out, conf.img_size // 32)
|
||||
out = out.view(out.size(0), -1)
|
||||
|
||||
out = self.hs3(self.bn3(self.linear3(out)))
|
||||
out = self.linear4(out)
|
||||
return out
|
||||
|
||||
|
||||
|
||||
def test():
|
||||
net = MobileNetV3_Small()
|
||||
x = torch.randn(2,3,224,224)
|
||||
y = net(x)
|
||||
print(y.size())
|
||||
|
||||
# test()
|
265
tracking/trackers/reid/model/mobilevit.py
Normal file
@ -0,0 +1,265 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from einops import rearrange
|
||||
from ..config import config as conf
|
||||
|
||||
|
||||
def conv_1x1_bn(inp, oup):
|
||||
return nn.Sequential(
|
||||
nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
|
||||
nn.BatchNorm2d(oup),
|
||||
nn.SiLU()
|
||||
)
|
||||
|
||||
|
||||
def conv_nxn_bn(inp, oup, kernel_size=3, stride=1):
|
||||
return nn.Sequential(
|
||||
nn.Conv2d(inp, oup, kernel_size, stride, 1, bias=False),
|
||||
nn.BatchNorm2d(oup),
|
||||
nn.SiLU()
|
||||
)
|
||||
|
||||
|
||||
class PreNorm(nn.Module):
|
||||
def __init__(self, dim, fn):
|
||||
super().__init__()
|
||||
self.norm = nn.LayerNorm(dim)
|
||||
self.fn = fn
|
||||
|
||||
def forward(self, x, **kwargs):
|
||||
return self.fn(self.norm(x), **kwargs)
|
||||
|
||||
|
||||
class FeedForward(nn.Module):
|
||||
def __init__(self, dim, hidden_dim, dropout=0.):
|
||||
super().__init__()
|
||||
self.net = nn.Sequential(
|
||||
nn.Linear(dim, hidden_dim),
|
||||
nn.SiLU(),
|
||||
nn.Dropout(dropout),
|
||||
nn.Linear(hidden_dim, dim),
|
||||
nn.Dropout(dropout)
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
return self.net(x)
|
||||
|
||||
|
||||
class Attention(nn.Module):
|
||||
def __init__(self, dim, heads=8, dim_head=64, dropout=0.):
|
||||
super().__init__()
|
||||
inner_dim = dim_head * heads
|
||||
project_out = not (heads == 1 and dim_head == dim)
|
||||
|
||||
self.heads = heads
|
||||
self.scale = dim_head ** -0.5
|
||||
|
||||
self.attend = nn.Softmax(dim=-1)
|
||||
self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
|
||||
|
||||
self.to_out = nn.Sequential(
|
||||
nn.Linear(inner_dim, dim),
|
||||
nn.Dropout(dropout)
|
||||
) if project_out else nn.Identity()
|
||||
|
||||
def forward(self, x):
|
||||
qkv = self.to_qkv(x).chunk(3, dim=-1)
|
||||
q, k, v = map(lambda t: rearrange(t, 'b p n (h d) -> b p h n d', h=self.heads), qkv)
|
||||
|
||||
dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale
|
||||
attn = self.attend(dots)
|
||||
out = torch.matmul(attn, v)
|
||||
out = rearrange(out, 'b p h n d -> b p n (h d)')
|
||||
return self.to_out(out)
|
||||
|
||||
|
||||
class Transformer(nn.Module):
|
||||
def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout=0.):
|
||||
super().__init__()
|
||||
self.layers = nn.ModuleList([])
|
||||
for _ in range(depth):
|
||||
self.layers.append(nn.ModuleList([
|
||||
PreNorm(dim, Attention(dim, heads, dim_head, dropout)),
|
||||
PreNorm(dim, FeedForward(dim, mlp_dim, dropout))
|
||||
]))
|
||||
|
||||
def forward(self, x):
|
||||
for attn, ff in self.layers:
|
||||
x = attn(x) + x
|
||||
x = ff(x) + x
|
||||
return x
|
||||
|
||||
|
||||
class MV2Block(nn.Module):
|
||||
def __init__(self, inp, oup, stride=1, expansion=4):
|
||||
super().__init__()
|
||||
self.stride = stride
|
||||
assert stride in [1, 2]
|
||||
|
||||
hidden_dim = int(inp * expansion)
|
||||
self.use_res_connect = self.stride == 1 and inp == oup
|
||||
|
||||
if expansion == 1:
|
||||
self.conv = nn.Sequential(
|
||||
# dw
|
||||
nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
|
||||
nn.BatchNorm2d(hidden_dim),
|
||||
nn.SiLU(),
|
||||
# pw-linear
|
||||
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
|
||||
nn.BatchNorm2d(oup),
|
||||
)
|
||||
else:
|
||||
self.conv = nn.Sequential(
|
||||
# pw
|
||||
nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
|
||||
nn.BatchNorm2d(hidden_dim),
|
||||
nn.SiLU(),
|
||||
# dw
|
||||
nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
|
||||
nn.BatchNorm2d(hidden_dim),
|
||||
nn.SiLU(),
|
||||
# pw-linear
|
||||
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
|
||||
nn.BatchNorm2d(oup),
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
if self.use_res_connect:
|
||||
return x + self.conv(x)
|
||||
else:
|
||||
return self.conv(x)
|
||||
|
||||
|
||||
class MobileViTBlock(nn.Module):
|
||||
def __init__(self, dim, depth, channel, kernel_size, patch_size, mlp_dim, dropout=0.):
|
||||
super().__init__()
|
||||
self.ph, self.pw = patch_size
|
||||
|
||||
self.conv1 = conv_nxn_bn(channel, channel, kernel_size)
|
||||
self.conv2 = conv_1x1_bn(channel, dim)
|
||||
|
||||
self.transformer = Transformer(dim, depth, 4, 8, mlp_dim, dropout)
|
||||
|
||||
self.conv3 = conv_1x1_bn(dim, channel)
|
||||
self.conv4 = conv_nxn_bn(2 * channel, channel, kernel_size)
|
||||
|
||||
def forward(self, x):
|
||||
y = x.clone()
|
||||
|
||||
# Local representations
|
||||
x = self.conv1(x)
|
||||
x = self.conv2(x)
|
||||
|
||||
# Global representations
|
||||
_, _, h, w = x.shape
|
||||
x = rearrange(x, 'b d (h ph) (w pw) -> b (ph pw) (h w) d', ph=self.ph, pw=self.pw)
|
||||
x = self.transformer(x)
|
||||
x = rearrange(x, 'b (ph pw) (h w) d -> b d (h ph) (w pw)', h=h // self.ph, w=w // self.pw, ph=self.ph,
|
||||
pw=self.pw)
|
||||
|
||||
# Fusion
|
||||
x = self.conv3(x)
|
||||
x = torch.cat((x, y), 1)
|
||||
x = self.conv4(x)
|
||||
return x
|
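||||
# shape walk-through for ph = pw = 2: (b, d, 8, 8) unfolds to (b, 4, 16, d), so the
|
||||
# transformer mixes the 16 patch locations at each of the 4 intra-patch positions;
|
||||
# the second rearrange exactly inverts the first, folding back to (b, d, 8, 8)
|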
||||
|
||||
|
||||
class MobileViT(nn.Module):
|
||||
def __init__(self, image_size, dims, channels, num_classes, expansion=4, kernel_size=3, patch_size=(2, 2)):
|
||||
super().__init__()
|
||||
ih, iw = image_size
|
||||
ph, pw = patch_size
|
||||
assert ih % ph == 0 and iw % pw == 0
|
||||
|
||||
L = [2, 4, 3]
|
||||
|
||||
self.conv1 = conv_nxn_bn(3, channels[0], stride=2)
|
||||
|
||||
self.mv2 = nn.ModuleList([])
|
||||
self.mv2.append(MV2Block(channels[0], channels[1], 1, expansion))
|
||||
self.mv2.append(MV2Block(channels[1], channels[2], 2, expansion))
|
||||
self.mv2.append(MV2Block(channels[2], channels[3], 1, expansion))
|
||||
self.mv2.append(MV2Block(channels[2], channels[3], 1, expansion)) # Repeat
|
||||
self.mv2.append(MV2Block(channels[3], channels[4], 2, expansion))
|
||||
self.mv2.append(MV2Block(channels[5], channels[6], 2, expansion))
|
||||
self.mv2.append(MV2Block(channels[7], channels[8], 2, expansion))
|
||||
|
||||
self.mvit = nn.ModuleList([])
|
||||
self.mvit.append(MobileViTBlock(dims[0], L[0], channels[5], kernel_size, patch_size, int(dims[0] * 2)))
|
||||
self.mvit.append(MobileViTBlock(dims[1], L[1], channels[7], kernel_size, patch_size, int(dims[1] * 4)))
|
||||
self.mvit.append(MobileViTBlock(dims[2], L[2], channels[9], kernel_size, patch_size, int(dims[2] * 4)))
|
||||
|
||||
self.conv2 = conv_1x1_bn(channels[-2], channels[-1])
|
||||
|
||||
self.pool = nn.AvgPool2d(ih // 32, 1)
|
||||
self.fc = nn.Linear(channels[-1], num_classes, bias=False)
|
||||
|
||||
def forward(self, x):
|
||||
#print('x',x.shape)
|
||||
x = self.conv1(x)
|
||||
x = self.mv2[0](x)
|
||||
|
||||
x = self.mv2[1](x)
|
||||
x = self.mv2[2](x)
|
||||
x = self.mv2[3](x) # Repeat
|
||||
|
||||
x = self.mv2[4](x)
|
||||
x = self.mvit[0](x)
|
||||
|
||||
x = self.mv2[5](x)
|
||||
x = self.mvit[1](x)
|
||||
|
||||
x = self.mv2[6](x)
|
||||
x = self.mvit[2](x)
|
||||
x = self.conv2(x)
|
||||
|
||||
|
||||
#print('pool_before',x.shape)
|
||||
x = self.pool(x).view(-1, x.shape[1])
|
||||
#print('self_pool',self.pool)
|
||||
#print('pool_after',x.shape)
|
||||
x = self.fc(x)
|
||||
return x
|
||||
|
||||
|
||||
def mobilevit_xxs():
|
||||
dims = [64, 80, 96]
|
||||
channels = [16, 16, 24, 24, 48, 48, 64, 64, 80, 80, 320]
|
||||
return MobileViT((256, 256), dims, channels, num_classes=1000, expansion=2)
|
||||
|
||||
|
||||
def mobilevit_xs():
|
||||
dims = [96, 120, 144]
|
||||
channels = [16, 32, 48, 48, 64, 64, 80, 80, 96, 96, 384]
|
||||
return MobileViT((256, 256), dims, channels, num_classes=1000)
|
||||
|
||||
|
||||
def mobilevit_s():
|
||||
dims = [144, 192, 240]
|
||||
channels = [16, 32, 64, 64, 96, 96, 128, 128, 160, 160, 640]
|
||||
return MobileViT((conf.img_size, conf.img_size), dims, channels, num_classes=conf.embedding_size)
|
||||
|
||||
|
||||
def count_parameters(model):
|
||||
return sum(p.numel() for p in model.parameters() if p.requires_grad)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
img = torch.randn(5, 3, 256, 256)
|
||||
|
||||
vit = mobilevit_xxs()
|
||||
out = vit(img)
|
||||
print(out.shape)
|
||||
print(count_parameters(vit))
|
||||
|
||||
vit = mobilevit_xs()
|
||||
out = vit(img)
|
||||
print(out.shape)
|
||||
print(count_parameters(vit))
|
||||
|
||||
vit = mobilevit_s()
|
||||
out = vit(img)
|
||||
print(out.shape)
|
||||
print(count_parameters(vit))
|
134
tracking/trackers/reid/model/resbam.py
Normal file
@ -0,0 +1,134 @@
|
||||
from .CBAM import CBAM
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from .Tool import GeM as gem
|
||||
|
||||
class Bottleneck(nn.Module):
|
||||
expansion = 4
|
||||
def __init__(self, inchannel, outchannel, stride=1, downsample=None):
|
||||
super().__init__()
|
||||
self.conv1 = nn.Conv2d(in_channels=inchannel,out_channels=outchannel, kernel_size=1, stride=1, bias=False)
|
||||
self.bn1 = nn.BatchNorm2d(outchannel)
|
||||
self.conv2 = nn.Conv2d(in_channels=outchannel, out_channels=outchannel,kernel_size=3,bias=False, stride=stride,padding=1)
|
||||
self.bn2 = nn.BatchNorm2d(outchannel)
|
||||
self.conv3 =nn.Conv2d(in_channels=outchannel, out_channels=outchannel*self.expansion,stride=1,bias=False,kernel_size=1)
|
||||
self.bn3 = nn.BatchNorm2d(outchannel*self.expansion)
|
||||
self.relu = nn.ReLU(inplace=True)
|
||||
self.downsample = downsample
|
||||
|
||||
def forward(self, x):
|
||||
identity = x
|
||||
# print('>>>>>>>>',type(x))
|
||||
if self.downsample is not None:
|
||||
# print('>>>>downsample>>>>', type(self.downsample))
|
||||
identity = self.downsample(x)
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
out = self.conv2(out)
|
||||
out = self.bn2(out)
|
||||
out = self.relu(out)
|
||||
out = self.conv3(out)
|
||||
out = self.bn3(out)
|
||||
# print('>>>>out>>>identity', out.size(), identity.size())
|
||||
out = out + identity
|
||||
out = self.relu(out)
|
||||
return out
|
||||
|
||||
class resnet(nn.Module):
|
||||
def __init__(self,block=Bottleneck, block_num=[3,4,6,3], num_class=1000):
|
||||
super().__init__()
|
||||
self.in_channel = 64
|
||||
self.conv1 = nn.Conv2d(in_channels=3,
|
||||
out_channels=self.in_channel,
|
||||
stride=2,
|
||||
kernel_size=7,
|
||||
padding=3,
|
||||
bias=False)
|
||||
self.bn1 = nn.BatchNorm2d(self.in_channel)
|
||||
self.relu = nn.ReLU(inplace=True)
|
||||
self.cbam = CBAM(self.in_channel)
|
||||
self.cbam1 = CBAM(2048)
|
||||
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
|
||||
self.layer1 = self._make_layer(block, 64, block_num[0],stride=1)
|
||||
self.layer2 = self._make_layer(block, 128, block_num[1],stride=2)
|
||||
self.layer3 = self._make_layer(block, 256, block_num[2],stride=2)
|
||||
self.layer4 = self._make_layer(block, 512, block_num[3],stride=2)
|
||||
self.avgpool = nn.AdaptiveAvgPool2d((1,1))
|
||||
self.gem = gem()
|
||||
self.fc = nn.Linear(512*block.expansion, num_class)
|
||||
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
nn.init.kaiming_normal_(m.weight, mode='fan_out',
|
||||
nonlinearity='relu')
|
||||
if isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
|
||||
nn.init.constant_(m.weight, 1.0)
|
||||
nn.init.constant_(m.bias, 0.0)  # standard norm-layer init: weight 1, bias 0
|
||||
|
||||
def _make_layer(self, block, channel, block_num, stride=1):
|
||||
downsample = None
|
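||||
# a 1x1 conv shortcut is needed whenever the spatial size or channel width changes
|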
||||
if stride !=1 or self.in_channel != channel*block.expansion:
|
||||
downsample = nn.Sequential(
|
||||
nn.Conv2d(self.in_channel, channel*block.expansion,kernel_size=1,stride=stride,bias=False),
|
||||
nn.BatchNorm2d(channel*block.expansion))
|
||||
layer = []
|
||||
layer.append(block(self.in_channel, channel, stride, downsample))
|
||||
self.in_channel = channel*block.expansion
|
||||
for _ in range(1, block_num):
|
||||
layer.append(block(self.in_channel, channel))
|
||||
return nn.Sequential(*layer)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv1(x)
|
||||
x = self.bn1(x)
|
||||
x = self.relu(x)
|
||||
x = self.maxpool(x)
|
||||
x = self.cbam(x)
|
||||
|
||||
x = self.layer1(x)
|
||||
x = self.layer2(x)
|
||||
x = self.layer3(x)
|
||||
x = self.layer4(x)
|
||||
|
||||
x = self.cbam1(x)
|
||||
# x = self.avgpool(x)
|
||||
x = self.gem(x)
|
||||
x = torch.flatten(x, 1)
|
||||
x = self.fc(x)
|
||||
return x
|
||||
|
||||
class TripletNet(nn.Module):
|
||||
def __init__(self, num_class, flag=True):
|
||||
super(TripletNet, self).__init__()
|
||||
self.initnet = rescbam(num_class)
|
||||
self.flag = flag
|
||||
|
||||
def forward(self, x1, x2=None, x3=None):
|
||||
if self.flag:
|
||||
output1 = self.initnet(x1)
|
||||
output2 = self.initnet(x2)
|
||||
output3 = self.initnet(x3)
|
||||
return output1, output2, output3
|
||||
else:
|
||||
output = self.initnet(x1)
|
||||
return output
|
||||
|
||||
def rescbam(num_class):
|
||||
return resnet(block=Bottleneck, block_num=[3,4,6,3],num_class=num_class)
|
||||
|
||||
if __name__ =='__main__':
|
||||
input1 = torch.randn(4,3,640,640)
|
||||
input2 = torch.randn(4,3,640,640)
|
||||
input3 = torch.randn(4,3,640,640)
|
||||
|
||||
# rescbam test
|
||||
# Resnet50 = rescbam(512)
|
||||
# output = Resnet50.forward(input1)
|
||||
# print(Resnet50)
|
||||
|
||||
# trnet test
|
||||
trnet = TripletNet(512)
|
||||
output = trnet(input1, input2, input3)
|
||||
print(output)
|
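A sketch of feeding the three TripletNet branches into a margin loss; the 224x224 tensors, batch size, and margin are placeholders (the __main__ block above uses 640x640 crops), and it assumes the GeM module from .Tool is importable:

```python
import torch
import torch.nn as nn

net = TripletNet(num_class=512)              # flag=True -> anchor/positive/negative branches
criterion = nn.TripletMarginLoss(margin=1.0)

anchor, positive, negative = (torch.randn(2, 3, 224, 224) for _ in range(3))
a, p, n = net(anchor, positive, negative)
loss = criterion(a, p, n)
loss.backward()
```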
182
tracking/trackers/reid/model/resnet.py
Normal file
@ -0,0 +1,182 @@
|
||||
"""resnet in pytorch
|
||||
|
||||
|
||||
|
||||
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
|
||||
|
||||
Deep Residual Learning for Image Recognition
|
||||
https://arxiv.org/abs/1512.03385v1
|
||||
"""
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from ..config import config as conf
|
||||
|
||||
class BasicBlock(nn.Module):
|
||||
"""Basic Block for resnet 18 and resnet 34
|
||||
|
||||
"""
|
||||
|
||||
#BasicBlock and BottleNeck block
|
||||
#have different output size
|
||||
#we use class attribute expansion
|
||||
#to distinct
|
||||
expansion = 1
|
||||
|
||||
def __init__(self, in_channels, out_channels, stride=1):
|
||||
super().__init__()
|
||||
|
||||
#residual function
|
||||
self.residual_function = nn.Sequential(
|
||||
nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
|
||||
nn.BatchNorm2d(out_channels),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.Conv2d(out_channels, out_channels * BasicBlock.expansion, kernel_size=3, padding=1, bias=False),
|
||||
nn.BatchNorm2d(out_channels * BasicBlock.expansion)
|
||||
)
|
||||
|
||||
#shortcut
|
||||
self.shortcut = nn.Sequential()
|
||||
|
||||
#the shortcut output dimension is not the same with residual function
|
||||
#use 1*1 convolution to match the dimension
|
||||
if stride != 1 or in_channels != BasicBlock.expansion * out_channels:
|
||||
self.shortcut = nn.Sequential(
|
||||
nn.Conv2d(in_channels, out_channels * BasicBlock.expansion, kernel_size=1, stride=stride, bias=False),
|
||||
nn.BatchNorm2d(out_channels * BasicBlock.expansion)
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x))
|
||||
|
||||
class BottleNeck(nn.Module):
|
||||
"""Residual block for resnet over 50 layers
|
||||
|
||||
"""
|
||||
expansion = 4
|
||||
def __init__(self, in_channels, out_channels, stride=1):
|
||||
super().__init__()
|
||||
self.residual_function = nn.Sequential(
|
||||
nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
|
||||
nn.BatchNorm2d(out_channels),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.Conv2d(out_channels, out_channels, stride=stride, kernel_size=3, padding=1, bias=False),
|
||||
nn.BatchNorm2d(out_channels),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.Conv2d(out_channels, out_channels * BottleNeck.expansion, kernel_size=1, bias=False),
|
||||
nn.BatchNorm2d(out_channels * BottleNeck.expansion),
|
||||
)
|
||||
|
||||
self.shortcut = nn.Sequential()
|
||||
|
||||
if stride != 1 or in_channels != out_channels * BottleNeck.expansion:
|
||||
self.shortcut = nn.Sequential(
|
||||
nn.Conv2d(in_channels, out_channels * BottleNeck.expansion, stride=stride, kernel_size=1, bias=False),
|
||||
nn.BatchNorm2d(out_channels * BottleNeck.expansion)
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x))
|
||||
|
||||
class ResNet(nn.Module):
|
||||
|
||||
def __init__(self, block, num_block, num_classes=conf.embedding_size):
|
||||
super().__init__()
|
||||
|
||||
self.in_channels = 64
|
||||
|
||||
# self.conv1 = nn.Sequential(
|
||||
# nn.Conv2d(3, 64, kernel_size=3, padding=1, bias=False),
|
||||
# nn.BatchNorm2d(64),
|
||||
# nn.ReLU(inplace=True))
|
||||
|
||||
self.conv1 = nn.Sequential(
|
||||
nn.Conv2d(3, 64,stride=2,kernel_size=7,padding=3,bias=False),
|
||||
nn.BatchNorm2d(64),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
|
||||
|
||||
|
||||
#we use a different inputsize than the original paper
|
||||
#so conv2_x's stride is 1
|
||||
self.conv2_x = self._make_layer(block, 64, num_block[0], 1)
|
||||
self.conv3_x = self._make_layer(block, 128, num_block[1], 2)
|
||||
self.conv4_x = self._make_layer(block, 256, num_block[2], 2)
|
||||
self.conv5_x = self._make_layer(block, 512, num_block[3], 2)
|
||||
self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
|
||||
self.fc = nn.Linear(512 * block.expansion, num_classes)
|
||||
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
nn.init.kaiming_normal_(m.weight, mode='fan_out',
|
||||
nonlinearity='relu')
|
||||
if isinstance(m, (nn.BatchNorm2d)):
|
||||
nn.init.constant_(m.weight, 1.0)
|
||||
nn.init.constant_(m.bias, 0.0)  # standard BN init: weight 1, bias 0
|
||||
|
||||
def _make_layer(self, block, out_channels, num_blocks, stride):
|
||||
"""make resnet layers(by layer i didnt mean this 'layer' was the
|
||||
same as a neuron netowork layer, ex. conv layer), one layer may
|
||||
contain more than one residual block
|
||||
|
||||
Args:
|
||||
block: block type, basic block or bottle neck block
|
||||
out_channels: output depth channel number of this layer
|
||||
num_blocks: how many blocks per layer
|
||||
stride: the stride of the first block of this layer
|
||||
|
||||
Return:
|
||||
return a resnet layer
|
||||
"""
|
||||
|
||||
# each layer holds num_blocks blocks; only the first block's stride
|
||||
# can be 1 or 2, and all following blocks use stride 1
|
||||
strides = [stride] + [1] * (num_blocks - 1)
|
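||||
# e.g. num_blocks = 4, stride = 2 -> strides == [2, 1, 1, 1]: only the first block downsamples
|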
||||
layers = []
|
||||
for stride in strides:
|
||||
layers.append(block(self.in_channels, out_channels, stride))
|
||||
self.in_channels = out_channels * block.expansion
|
||||
|
||||
return nn.Sequential(*layers)
|
||||
|
||||
def forward(self, x):
|
||||
output = self.conv1(x)
|
||||
output = self.conv2_x(output)
|
||||
output = self.conv3_x(output)
|
||||
output = self.conv4_x(output)
|
||||
output = self.conv5_x(output)
|
||||
# print('poolBefore', output.shape)
|
||||
output = self.avg_pool(output)
|
||||
# print('poolAfter', output.shape)
|
||||
output = output.view(output.size(0), -1)
|
||||
# print('fcBefore', output.shape)
|
||||
output = self.fc(output)
|
||||
|
||||
return output
|
||||
|
||||
def resnet18():
|
||||
""" return a ResNet 18 object
|
||||
"""
|
||||
return ResNet(BasicBlock, [2, 2, 2, 2])
|
||||
|
||||
def resnet34():
|
||||
""" return a ResNet 34 object
|
||||
"""
|
||||
return ResNet(BasicBlock, [3, 4, 6, 3])
|
||||
|
||||
def resnet50():
|
||||
""" return a ResNet 50 object
|
||||
"""
|
||||
return ResNet(BottleNeck, [3, 4, 6, 3])
|
||||
|
||||
def resnet101():
|
||||
""" return a ResNet 101 object
|
||||
"""
|
||||
return ResNet(BottleNeck, [3, 4, 23, 3])
|
||||
|
||||
def resnet152():
|
||||
""" return a ResNet 152 object
|
||||
"""
|
||||
return ResNet(BottleNeck, [3, 8, 36, 3])
|
||||
|
||||
|
120
tracking/trackers/reid/model/resnet_face.py
Normal file
@ -0,0 +1,120 @@
|
||||
""" Resnet_IR_SE in ArcFace """
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class Flatten(nn.Module):
|
||||
def forward(self, x):
|
||||
return x.reshape(x.shape[0], -1)
|
||||
|
||||
|
||||
class SEConv(nn.Module):
|
||||
"""Use Convolution instead of FullyConnection in SE"""
|
||||
|
||||
def __init__(self, channels, reduction):
|
||||
super().__init__()
|
||||
self.net = nn.Sequential(
|
||||
nn.AdaptiveAvgPool2d(1),
|
||||
nn.Conv2d(channels, channels // reduction, kernel_size=1, bias=False),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.Conv2d(channels // reduction, channels, kernel_size=1, bias=False),
|
||||
nn.Sigmoid(),
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
return self.net(x) * x
|
||||
|
||||
|
||||
class SE(nn.Module):
|
||||
|
||||
def __init__(self, channels, reduction):
|
||||
super().__init__()
|
||||
self.net = nn.Sequential(
|
||||
nn.AdaptiveAvgPool2d(1),
|
||||
nn.Linear(channels, channels // reduction),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.Linear(channels // reduction, channels),
|
||||
nn.Sigmoid(),
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
return self.net(x) * x
|
||||
|
||||
|
||||
class IRSE(nn.Module):
|
||||
|
||||
def __init__(self, channels, depth, stride):
|
||||
super().__init__()
|
||||
if channels == depth:
|
||||
self.shortcut = nn.MaxPool2d(kernel_size=1, stride=stride)
|
||||
else:
|
||||
self.shortcut = nn.Sequential(
|
||||
nn.Conv2d(channels, depth, (1, 1), stride, bias=False),
|
||||
nn.BatchNorm2d(depth),
|
||||
)
|
||||
self.residual = nn.Sequential(
|
||||
nn.BatchNorm2d(channels),
|
||||
nn.Conv2d(channels, depth, (3, 3), 1, 1, bias=False),
|
||||
nn.PReLU(depth),
|
||||
nn.Conv2d(depth, depth, (3, 3), stride, 1, bias=False),
|
||||
nn.BatchNorm2d(depth),
|
||||
SEConv(depth, 16),
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
return self.shortcut(x) + self.residual(x)
|
||||
|
||||
class ResIRSE(nn.Module):
|
||||
"""Resnet50-IRSE backbone"""
|
||||
|
||||
def __init__(self, ih, embedding_size, drop_ratio):
|
||||
super().__init__()
|
||||
ih_last = ih // 16
|
||||
self.input_layer = nn.Sequential(
|
||||
nn.Conv2d(3, 64, (3, 3), 1, 1, bias=False),
|
||||
nn.BatchNorm2d(64),
|
||||
nn.PReLU(64),
|
||||
)
|
||||
self.output_layer = nn.Sequential(
|
||||
nn.BatchNorm2d(512),
|
||||
nn.Dropout(drop_ratio),
|
||||
Flatten(),
|
||||
nn.Linear(512 * ih_last * ih_last, embedding_size),
|
||||
nn.BatchNorm1d(embedding_size),
|
||||
)
|
||||
|
||||
# ["channels", "depth", "stride"],
|
||||
self.res50_arch = [
|
||||
[64, 64, 2], [64, 64, 1], [64, 64, 1],
|
||||
[64, 128, 2], [128, 128, 1], [128, 128, 1], [128, 128, 1],
|
||||
[128, 256, 2], [256, 256, 1], [256, 256, 1], [256, 256, 1], [256, 256, 1],
|
||||
[256, 256, 1], [256, 256, 1], [256, 256, 1], [256, 256, 1], [256, 256, 1],
|
||||
[256, 256, 1], [256, 256, 1], [256, 256, 1], [256, 256, 1],
|
||||
[256, 512, 2], [512, 512, 1], [512, 512, 1],
|
||||
]
|
||||
|
||||
self.body = nn.Sequential(*[ IRSE(a,b,c) for (a,b,c) in self.res50_arch ])
|
||||
|
||||
def forward(self, x):
|
||||
x = self.input_layer(x)
|
||||
x = self.body(x)
|
||||
x = self.output_layer(x)
|
||||
return x
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
|
||||
x = Image.open("../samples/009.jpg").convert('L')
|
||||
x = x.resize((128, 128))
|
||||
x = np.asarray(x, dtype=np.float32)
|
||||
x = x[None, None, ...]
|
||||
x = torch.from_numpy(x)
|
||||
net = ResIRSE(512, 0.6)
|
||||
net.eval()
|
||||
with torch.no_grad():
|
||||
out = net(x)
|
||||
print(out.shape)
|
384
tracking/trackers/reid/model/resnet_pre.py
Normal file
@ -0,0 +1,384 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
# from config import config as conf
|
||||
from ..config import config as conf
|
||||
|
||||
try:
|
||||
from torch.hub import load_state_dict_from_url
|
||||
except ImportError:
|
||||
from torch.utils.model_zoo import load_url as load_state_dict_from_url
|
||||
#from .utils import load_state_dict_from_url
|
||||
|
||||
|
||||
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
|
||||
'resnet152', 'resnext50_32x4d', 'resnext101_32x8d',
|
||||
'wide_resnet50_2', 'wide_resnet101_2']
|
||||
|
||||
|
||||
model_urls = {
|
||||
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
|
||||
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
|
||||
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
|
||||
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
|
||||
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
|
||||
'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',
|
||||
'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',
|
||||
'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth',
|
||||
'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth',
|
||||
}
|
||||
|
||||
|
||||
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
|
||||
"""3x3 convolution with padding"""
|
||||
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
|
||||
padding=dilation, groups=groups, bias=False, dilation=dilation)
|
||||
|
||||
|
||||
def conv1x1(in_planes, out_planes, stride=1):
|
||||
"""1x1 convolution"""
|
||||
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
|
||||
|
||||
|
||||
class BasicBlock(nn.Module):
|
||||
expansion = 1
|
||||
|
||||
def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
|
||||
base_width=64, dilation=1, norm_layer=None):
|
||||
super(BasicBlock, self).__init__()
|
||||
if norm_layer is None:
|
||||
norm_layer = nn.BatchNorm2d
|
||||
if groups != 1 or base_width != 64:
|
||||
raise ValueError('BasicBlock only supports groups=1 and base_width=64')
|
||||
if dilation > 1:
|
||||
raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
|
||||
# Both self.conv1 and self.downsample layers downsample the input when stride != 1
|
||||
self.conv1 = conv3x3(inplanes, planes, stride)
|
||||
self.bn1 = norm_layer(planes)
|
||||
self.relu = nn.ReLU(inplace=True)
|
||||
self.conv2 = conv3x3(planes, planes)
|
||||
self.bn2 = norm_layer(planes)
|
||||
self.downsample = downsample
|
||||
self.stride = stride
|
||||
|
||||
def forward(self, x):
|
||||
identity = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
|
||||
out = self.conv2(out)
|
||||
out = self.bn2(out)
|
||||
|
||||
if self.downsample is not None:
|
||||
identity = self.downsample(x)
|
||||
|
||||
out += identity
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class Bottleneck(nn.Module):
|
||||
# Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
|
||||
# while original implementation places the stride at the first 1x1 convolution(self.conv1)
|
||||
# according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
|
||||
# This variant is also known as ResNet V1.5 and improves accuracy according to
|
||||
# https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
|
||||
|
||||
expansion = 4
|
||||
|
||||
def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
|
||||
base_width=64, dilation=1, norm_layer=None):
|
||||
super(Bottleneck, self).__init__()
|
||||
if norm_layer is None:
|
||||
norm_layer = nn.BatchNorm2d
|
||||
width = int(planes * (base_width / 64.)) * groups
|
||||
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
|
||||
self.conv1 = conv1x1(inplanes, width)
|
||||
self.bn1 = norm_layer(width)
|
||||
self.conv2 = conv3x3(width, width, stride, groups, dilation)
|
||||
self.bn2 = norm_layer(width)
|
||||
self.conv3 = conv1x1(width, planes * self.expansion)
|
||||
self.bn3 = norm_layer(planes * self.expansion)
|
||||
self.relu = nn.ReLU(inplace=True)
|
||||
self.downsample = downsample
|
||||
self.stride = stride
|
||||
|
||||
def forward(self, x):
|
||||
identity = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
|
||||
out = self.conv2(out)
|
||||
out = self.bn2(out)
|
||||
out = self.relu(out)
|
||||
|
||||
out = self.conv3(out)
|
||||
out = self.bn3(out)
|
||||
|
||||
if self.downsample is not None:
|
||||
identity = self.downsample(x)
|
||||
|
||||
out += identity
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class ResNet(nn.Module):
|
||||
|
||||
def __init__(self, block, layers, num_classes=conf.embedding_size, zero_init_residual=False,
|
||||
groups=1, width_per_group=64, replace_stride_with_dilation=None,
|
||||
norm_layer=None):
|
||||
super(ResNet, self).__init__()
|
||||
if norm_layer is None:
|
||||
norm_layer = nn.BatchNorm2d
|
||||
self._norm_layer = norm_layer
|
||||
|
||||
self.inplanes = 64
|
||||
self.dilation = 1
|
||||
if replace_stride_with_dilation is None:
|
||||
# each element in the tuple indicates if we should replace
|
||||
# the 2x2 stride with a dilated convolution instead
|
||||
replace_stride_with_dilation = [False, False, False]
|
||||
if len(replace_stride_with_dilation) != 3:
|
||||
raise ValueError("replace_stride_with_dilation should be None "
|
||||
"or a 3-element tuple, got {}".format(replace_stride_with_dilation))
|
||||
self.groups = groups
|
||||
self.base_width = width_per_group
|
||||
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
|
||||
bias=False)
|
||||
self.bn1 = norm_layer(self.inplanes)
|
||||
self.relu = nn.ReLU(inplace=True)
|
||||
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
|
||||
self.layer1 = self._make_layer(block, 64, layers[0])
|
||||
self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
|
||||
dilate=replace_stride_with_dilation[0])
|
||||
self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
|
||||
dilate=replace_stride_with_dilation[1])
|
||||
self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
|
||||
dilate=replace_stride_with_dilation[2])
|
||||
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
|
||||
self.fc = nn.Linear(512 * block.expansion, num_classes)
|
||||
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
|
||||
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
|
||||
nn.init.constant_(m.weight, 1)
|
||||
nn.init.constant_(m.bias, 0)
|
||||
|
||||
# Zero-initialize the last BN in each residual branch,
|
||||
# so that the residual branch starts with zeros, and each residual block behaves like an identity.
|
||||
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
|
||||
if zero_init_residual:
|
||||
for m in self.modules():
|
||||
if isinstance(m, Bottleneck):
|
||||
nn.init.constant_(m.bn3.weight, 0)
|
||||
elif isinstance(m, BasicBlock):
|
||||
nn.init.constant_(m.bn2.weight, 0)
|
||||
|
||||
def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
|
||||
norm_layer = self._norm_layer
|
||||
downsample = None
|
||||
previous_dilation = self.dilation
|
||||
if dilate:
|
||||
self.dilation *= stride
|
||||
stride = 1
|
||||
if stride != 1 or self.inplanes != planes * block.expansion:
|
||||
downsample = nn.Sequential(
|
||||
conv1x1(self.inplanes, planes * block.expansion, stride),
|
||||
norm_layer(planes * block.expansion),
|
||||
)
|
||||
|
||||
layers = []
|
||||
layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
|
||||
self.base_width, previous_dilation, norm_layer))
|
||||
self.inplanes = planes * block.expansion
|
||||
for _ in range(1, blocks):
|
||||
layers.append(block(self.inplanes, planes, groups=self.groups,
|
||||
base_width=self.base_width, dilation=self.dilation,
|
||||
norm_layer=norm_layer))
|
||||
|
||||
return nn.Sequential(*layers)
|
||||
|
||||
def _forward_impl(self, x):
|
||||
# See note [TorchScript super()]
|
||||
x = self.conv1(x)
|
||||
x = self.bn1(x)
|
||||
x = self.relu(x)
|
||||
x = self.maxpool(x)
|
||||
|
||||
x = self.layer1(x)
|
||||
x = self.layer2(x)
|
||||
x = self.layer3(x)
|
||||
x = self.layer4(x)
|
||||
|
||||
#print('poolBefore', x.shape)
|
||||
x = self.avgpool(x)
|
||||
#print('poolAfter', x.shape)
|
||||
x = torch.flatten(x, 1)
|
||||
#print('fcBefore',x.shape)
|
||||
x = self.fc(x)
|
||||
# print('fcAfter',x.shape)
|
||||
|
||||
return x
|
||||
|
||||
def forward(self, x):
|
||||
return self._forward_impl(x)
|
||||
|
||||
|
||||
# def _resnet(arch, block, layers, pretrained, progress, **kwargs):
|
||||
# model = ResNet(block, layers, **kwargs)
|
||||
# if pretrained:
|
||||
# state_dict = load_state_dict_from_url(model_urls[arch],
|
||||
# progress=progress)
|
||||
# model.load_state_dict(state_dict, strict=False)
|
||||
# return model
|
||||
def _resnet(arch, block, layers, pretrained, progress, **kwargs):
|
||||
model = ResNet(block, layers, **kwargs)
|
||||
if pretrained:
|
||||
state_dict = load_state_dict_from_url(model_urls[arch],
|
||||
progress=progress)
|
||||
#print('state_dict',state_dict)
|
||||
src_state_dict = state_dict
|
||||
target_state_dict = model.state_dict()
|
||||
skip_keys = []
|
||||
# skip mismatch size tensors in case of pretraining
|
||||
for k in src_state_dict.keys():
|
||||
if k not in target_state_dict:
|
||||
continue
|
||||
if src_state_dict[k].size() != target_state_dict[k].size():
|
||||
skip_keys.append(k)
|
||||
for k in skip_keys:
|
||||
del src_state_dict[k]
|
||||
missing_keys, unexpected_keys = model.load_state_dict(src_state_dict, strict=False)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
def resnet18(pretrained=True, progress=True, **kwargs):
|
||||
r"""ResNet-18 model from
|
||||
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
|
||||
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
progress (bool): If True, displays a progress bar of the download to stderr
|
||||
"""
|
||||
return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress,
|
||||
**kwargs)
|
||||
|
||||
|
||||
def resnet34(pretrained=False, progress=True, **kwargs):
|
||||
r"""ResNet-34 model from
|
||||
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
|
||||
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
progress (bool): If True, displays a progress bar of the download to stderr
|
||||
"""
|
||||
return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress,
|
||||
**kwargs)
|
||||
|
||||
|
||||
def resnet50(pretrained=False, progress=True, **kwargs):
|
||||
r"""ResNet-50 model from
|
||||
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
|
||||
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
progress (bool): If True, displays a progress bar of the download to stderr
|
||||
"""
|
||||
return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress,
|
||||
**kwargs)
|
||||
|
||||
|
||||
def resnet101(pretrained=False, progress=True, **kwargs):
|
||||
r"""ResNet-101 model from
|
||||
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
|
||||
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
progress (bool): If True, displays a progress bar of the download to stderr
|
||||
"""
|
||||
return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress,
|
||||
**kwargs)
|
||||
|
||||
|
||||
def resnet152(pretrained=False, progress=True, **kwargs):
|
||||
r"""ResNet-152 model from
|
||||
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
|
||||
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
progress (bool): If True, displays a progress bar of the download to stderr
|
||||
"""
|
||||
return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress,
|
||||
**kwargs)
|
||||
|
||||
|
||||
def resnext50_32x4d(pretrained=False, progress=True, **kwargs):
|
||||
r"""ResNeXt-50 32x4d model from
|
||||
`"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
|
||||
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
progress (bool): If True, displays a progress bar of the download to stderr
|
||||
"""
|
||||
kwargs['groups'] = 32
|
||||
kwargs['width_per_group'] = 4
|
||||
return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3],
|
||||
pretrained, progress, **kwargs)
|
||||
|
||||
|
||||
def resnext101_32x8d(pretrained=False, progress=True, **kwargs):
|
||||
r"""ResNeXt-101 32x8d model from
|
||||
`"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
|
||||
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
progress (bool): If True, displays a progress bar of the download to stderr
|
||||
"""
|
||||
kwargs['groups'] = 32
|
||||
kwargs['width_per_group'] = 8
|
||||
return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3],
|
||||
pretrained, progress, **kwargs)
|
||||
|
||||
|
||||
def wide_resnet50_2(pretrained=False, progress=True, **kwargs):
|
||||
r"""Wide ResNet-50-2 model from
|
||||
`"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_
|
||||
|
||||
The model is the same as ResNet except for the bottleneck number of channels
|
||||
which is twice larger in every block. The number of channels in outer 1x1
|
||||
convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
|
||||
channels, and in Wide ResNet-50-2 has 2048-1024-2048.
|
||||
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
progress (bool): If True, displays a progress bar of the download to stderr
|
||||
"""
|
||||
kwargs['width_per_group'] = 64 * 2
|
||||
return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3],
|
||||
pretrained, progress, **kwargs)
|
||||
|
||||
|
||||
def wide_resnet101_2(pretrained=False, progress=True, **kwargs):
|
||||
r"""Wide ResNet-101-2 model from
|
||||
`"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_
|
||||
|
||||
The model is the same as ResNet except for the bottleneck number of channels
|
||||
which is twice larger in every block. The number of channels in outer 1x1
|
||||
convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
|
||||
channels, and in Wide ResNet-50-2 has 2048-1024-2048.
|
||||
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
progress (bool): If True, displays a progress bar of the download to stderr
|
||||
"""
|
||||
kwargs['width_per_group'] = 64 * 2
|
||||
return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3],
|
||||
pretrained, progress, **kwargs)
|
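Because _resnet drops size-mismatched tensors before load_state_dict(strict=False), ImageNet weights can seed a backbone whose fc layer has been resized to conf.embedding_size; a minimal sketch (pretrained=True would download from model_urls above):

```python
import torch

net = resnet18(pretrained=False)  # with pretrained=True, fc.weight / fc.bias are skipped
                                  # whenever conf.embedding_size != 1000
net.eval()
with torch.no_grad():
    emb = net(torch.randn(1, 3, 224, 224))
print(emb.shape)                  # torch.Size([1, conf.embedding_size])
```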
4
tracking/trackers/reid/model/utils.py
Normal file
@ -0,0 +1,4 @@
|
||||
try:
|
||||
from torch.hub import load_state_dict_from_url
|
||||
except ImportError:
|
||||
from torch.utils.model_zoo import load_url as load_state_dict_from_url
|
143
tracking/trackers/reid/reid_interface.py
Normal file
@ -0,0 +1,143 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Thu Jan 18 17:21:01 2024
|
||||
|
||||
@author: ym
|
||||
"""
|
||||
import numpy as np
|
||||
import torch
|
||||
import cv2
|
||||
import torch.nn as nn
|
||||
import torchvision.transforms as T
|
||||
from .model import mobilevit_s, resnet18, resnet34, resnet50, mobilenet_v2, MobileNetV3_Small
|
||||
from .config import config as conf
|
||||
|
||||
|
||||
class ReIDInterface:
|
||||
def __init__(self, config):
|
||||
self.device = conf.device
|
||||
if conf.backbone == 'resnet18':
|
||||
# model = ResIRSE(img_size, embedding_size, conf.drop_ratio).to(device)
|
||||
model = resnet18().to(self.device)
|
||||
elif conf.backbone == 'resnet34':
|
||||
model = resnet34().to(self.device)
|
||||
elif conf.backbone == 'resnet50':
|
||||
model = resnet50().to(self.device)
|
||||
elif conf.backbone == 'mobilevit_s':
|
||||
model = mobilevit_s().to(self.device)
|
||||
elif conf.backbone == 'mobilenetv3':
|
||||
model = MobileNetV3_Small().to(self.device)
|
||||
else:
|
||||
model = mobilenet_v2().to(self.device)
|
||||
|
||||
self.batch_size = conf.batch_size
|
||||
self.embedding_size = conf.embedding_size
|
||||
self.img_size = conf.img_size
|
||||
|
||||
self.model_path = conf.model_path
|
||||
|
||||
# the original input was a PIL image (ToTensor also accepts HWC numpy arrays)
|
||||
self.transform = T.Compose([
|
||||
T.ToTensor(),
|
||||
T.Resize((self.img_size, self.img_size)),
|
||||
T.ConvertImageDtype(torch.float32),
|
||||
T.Normalize(mean=[0.5], std=[0.5]),
|
||||
])
|
||||
|
||||
|
||||
self.model = nn.DataParallel(model).to(self.device)
|
||||
self.model.load_state_dict(torch.load(self.model_path, map_location=self.device))
|
||||
self.model.eval()
|
||||
|
||||
def inference(self, images, detections):
|
||||
if isinstance(images, np.ndarray):
|
||||
features = self.inference_image(images, detections)
|
||||
return features
|
||||
|
||||
batch_patches = []
|
||||
patches = []
|
||||
for i, img in enumerate(images):
|
||||
img = img.copy()
|
||||
patch = self.transform(img)
|
||||
if str(self.device) != "cpu":
|
||||
patch = patch.to(device=self.device).half()
|
||||
else:
|
||||
patch = patch.to(device=self.device)
|
||||
|
||||
patches.append(patch)
|
||||
if (i + 1) % self.batch_size == 0:
|
||||
patches = torch.stack(patches, dim=0)
|
||||
batch_patches.append(patches)
|
||||
patches = []
|
||||
|
||||
if len(patches):
|
||||
patches = torch.stack(patches, dim=0)
|
||||
batch_patches.append(patches)
|
||||
|
||||
features = np.zeros((0, self.embedding_size))
|
||||
for patches in batch_patches:
|
||||
pred = self.model(patches)
|
||||
pred[torch.isinf(pred)] = 1.0
|
||||
feat = pred.cpu().data.numpy()
|
||||
features = np.vstack((features, feat))
|
||||
return features
|
||||
|
||||
def inference_image(self, image, detections):
|
||||
H, W, _ = np.shape(image)
|
||||
|
||||
batch_patches = []
|
||||
patches = []
|
||||
for d in range(np.size(detections, 0)):
|
||||
tlbr = detections[d, :4].astype(np.int_)
|
||||
tlbr[0] = max(0, tlbr[0])
|
||||
tlbr[1] = max(0, tlbr[1])
|
||||
tlbr[2] = min(W - 1, tlbr[2])
|
||||
tlbr[3] = min(H - 1, tlbr[3])
|
||||
img = image[tlbr[1]:tlbr[3], tlbr[0]:tlbr[2], :]
|
||||
|
||||
img = img[:, :, ::-1].copy() # the model expects RGB inputs
|
||||
patch = self.transform(img)
|
||||
|
||||
# patch = patch.to(device=self.device).half()
|
||||
if str(self.device) != "cpu":
|
||||
patch = patch.to(device=self.device).half()
|
||||
else:
|
||||
patch = patch.to(device=self.device)
|
||||
|
||||
patches.append(patch)
|
||||
if (d + 1) % self.batch_size == 0:
|
||||
patches = torch.stack(patches, dim=0)
|
||||
batch_patches.append(patches)
|
||||
patches = []
|
||||
|
||||
if len(patches):
|
||||
patches = torch.stack(patches, dim=0)
|
||||
batch_patches.append(patches)
|
||||
|
||||
features = np.zeros((0, self.embedding_size))
|
||||
for patches in batch_patches:
|
||||
pred = self.model(patches)
|
||||
pred[torch.isinf(pred)] = 1.0
|
||||
feat = pred.cpu().data.numpy()
|
||||
features = np.vstack((features, feat))
|
||||
|
||||
return features
|
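How the interface is meant to be called from the tracker side; the frame and detection row below are synthetic, and the sketch assumes conf.model_path points at a trained checkpoint:

```python
import numpy as np

reid = ReIDInterface(conf)                    # conf is the module-level config imported above
frame = np.zeros((480, 640, 3), dtype=np.uint8)                  # BGR frame from OpenCV
dets = np.array([[10, 20, 110, 220, 0.9, 0]], dtype=np.float32)  # tlbr, score, class
feats = reid.inference(frame, dets)           # -> array of shape (1, conf.embedding_size)
```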
21
tracking/trackers/reid/test.py
Normal file
@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 19 16:10:39 2024

@author: ym
"""
import torch

from model.resnet_pre import resnet18


def main():
    """Smoke test: check that the checkpoint loads into a resnet18 backbone."""
    model_path = "best.pth"
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = resnet18().to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))


if __name__ == "__main__":
    main()
66
tracking/trackers/track.py
Normal file
@ -0,0 +1,66 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

from functools import partial

import torch

from ultralytics.utils import IterableSimpleNamespace, yaml_load
from ultralytics.utils.checks import check_yaml

from .bot_sort import BOTSORT
from .byte_tracker import BYTETracker

TRACKER_MAP = {'bytetrack': BYTETracker, 'botsort': BOTSORT}


def on_predict_start(predictor, persist=False):
    """
    Initialize trackers for object tracking during prediction.

    Args:
        predictor (object): The predictor object to initialize trackers for.
        persist (bool, optional): Whether to persist the trackers if they already exist. Defaults to False.

    Raises:
        AssertionError: If the tracker_type is not 'bytetrack' or 'botsort'.
    """
    if hasattr(predictor, 'trackers') and persist:
        return
    tracker = check_yaml(predictor.args.tracker)
    cfg = IterableSimpleNamespace(**yaml_load(tracker))
    assert cfg.tracker_type in ['bytetrack', 'botsort'], \
        f"Only 'bytetrack' and 'botsort' are supported for now, but got '{cfg.tracker_type}'"
    trackers = []
    for _ in range(predictor.dataset.bs):
        tracker = TRACKER_MAP[cfg.tracker_type](args=cfg, frame_rate=30)
        trackers.append(tracker)
    predictor.trackers = trackers


def on_predict_postprocess_end(predictor):
    """Postprocess detected boxes and update with object tracking."""
    bs = predictor.dataset.bs
    im0s = predictor.batch[1]
    for i in range(bs):
        det = predictor.results[i].boxes.cpu().numpy()
        if len(det) == 0:
            continue
        tracks = predictor.trackers[i].update(det, im0s[i])
        if len(tracks) == 0:
            continue
        idx = tracks[:, -1].astype(int)
        predictor.results[i] = predictor.results[i][idx]
        predictor.results[i].update(boxes=torch.as_tensor(tracks[:, :-1]))


def register_tracker(model, persist):
    """
    Register tracking callbacks to the model for object tracking during prediction.

    Args:
        model (object): The model object to register tracking callbacks for.
        persist (bool): Whether to persist the trackers if they already exist.
    """
    model.add_callback('on_predict_start', partial(on_predict_start, persist=persist))
    model.add_callback('on_predict_postprocess_end', on_predict_postprocess_end)
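For reference, a minimal sketch (not part of this commit) of wiring these callbacks onto a model; this mirrors what `model.track()` does internally. `yolov8n.pt` and `video.mp4` are placeholders:

```python
from ultralytics import YOLO

from tracking.trackers.track import register_tracker

model = YOLO("yolov8n.pt")
# Attach on_predict_start / on_predict_postprocess_end so every predict call
# also runs the tracker; persist=True keeps tracker state between calls.
register_tracker(model, persist=True)

for result in model.predict(source="video.mp4", stream=True):
    if result.boxes.id is not None:  # the id column is filled in by the tracker
        print(result.boxes.id.int().tolist())
```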
3
tracking/trackers/utils/__init__.py
Normal file
@ -0,0 +1,3 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
BIN
tracking/trackers/utils/__pycache__/__init__.cpython-39.pyc
Normal file
Binary file not shown.
BIN
tracking/trackers/utils/__pycache__/gmc.cpython-39.pyc
Normal file
Binary file not shown.
BIN
tracking/trackers/utils/__pycache__/kalman_filter.cpython-39.pyc
Normal file
Binary file not shown.
BIN
tracking/trackers/utils/__pycache__/matching.cpython-39.pyc
Normal file
Binary file not shown.
BIN
tracking/trackers/utils/__pycache__/results.cpython-39.pyc
Normal file
Binary file not shown.
279
tracking/trackers/utils/gmc.py
Normal file
@ -0,0 +1,279 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

import copy

import cv2
import numpy as np

from ultralytics.utils import LOGGER


class GMC:

    def __init__(self, method='sparseOptFlow', downscale=2):
        """Initialize the global motion compensator with the given method and downscale factor."""
        super().__init__()

        self.method = method
        self.downscale = max(1, int(downscale))

        if self.method == 'orb':
            self.detector = cv2.FastFeatureDetector_create(20)
            self.extractor = cv2.ORB_create()
            self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING)

        elif self.method == 'sift':
            self.detector = cv2.SIFT_create(nOctaveLayers=3, contrastThreshold=0.02, edgeThreshold=20)
            self.extractor = cv2.SIFT_create(nOctaveLayers=3, contrastThreshold=0.02, edgeThreshold=20)
            self.matcher = cv2.BFMatcher(cv2.NORM_L2)

        elif self.method == 'ecc':
            number_of_iterations = 5000
            termination_eps = 1e-6
            self.warp_mode = cv2.MOTION_EUCLIDEAN
            self.criteria = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, number_of_iterations, termination_eps)

        elif self.method == 'sparseOptFlow':
            self.feature_params = dict(maxCorners=1000,
                                       qualityLevel=0.01,
                                       minDistance=1,
                                       blockSize=3,
                                       useHarrisDetector=False,
                                       k=0.04)

        elif self.method in ['none', 'None', None]:
            self.method = None
        else:
            raise ValueError(f'Error: Unknown GMC method: {method}')

        self.prevFrame = None
        self.prevKeyPoints = None
        self.prevDescriptors = None

        self.initializedFirstFrame = False

    def apply(self, raw_frame, detections=None):
        """Apply the chosen global motion compensation method to a raw frame."""
        if self.method in ['orb', 'sift']:
            return self.applyFeatures(raw_frame, detections)
        elif self.method == 'ecc':
            return self.applyEcc(raw_frame, detections)
        elif self.method == 'sparseOptFlow':
            return self.applySparseOptFlow(raw_frame, detections)
        else:
            return np.eye(2, 3)

    def applyEcc(self, raw_frame, detections=None):
        """Estimate the inter-frame warp with the ECC algorithm."""
        height, width, _ = raw_frame.shape
        frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2GRAY)
        H = np.eye(2, 3, dtype=np.float32)

        # Downscale image (TODO: consider using pyramids)
        if self.downscale > 1.0:
            frame = cv2.GaussianBlur(frame, (3, 3), 1.5)
            frame = cv2.resize(frame, (width // self.downscale, height // self.downscale))
            width = width // self.downscale
            height = height // self.downscale

        # Handle first frame
        if not self.initializedFirstFrame:
            # Initialize data
            self.prevFrame = frame.copy()

            # Initialization done
            self.initializedFirstFrame = True

            return H

        # Run the ECC algorithm. The results are stored in warp_matrix.
        # (cc, H) = cv2.findTransformECC(self.prevFrame, frame, H, self.warp_mode, self.criteria)
        try:
            (cc, H) = cv2.findTransformECC(self.prevFrame, frame, H, self.warp_mode, self.criteria, None, 1)
        except Exception as e:
            LOGGER.warning(f'WARNING: find transform failed. Set warp as identity {e}')

        return H

    def applyFeatures(self, raw_frame, detections=None):
        """Estimate the inter-frame warp from matched ORB/SIFT features."""
        height, width, _ = raw_frame.shape
        frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2GRAY)
        H = np.eye(2, 3)

        # Downscale image (TODO: consider using pyramids)
        if self.downscale > 1.0:
            # frame = cv2.GaussianBlur(frame, (3, 3), 1.5)
            frame = cv2.resize(frame, (width // self.downscale, height // self.downscale))
            width = width // self.downscale
            height = height // self.downscale

        # Find the keypoints, masking out image borders and detection boxes
        mask = np.zeros_like(frame)
        # mask[int(0.05 * height): int(0.95 * height), int(0.05 * width): int(0.95 * width)] = 255
        mask[int(0.02 * height):int(0.98 * height), int(0.02 * width):int(0.98 * width)] = 255
        if detections is not None:
            for det in detections:
                tlbr = (det[:4] / self.downscale).astype(np.int_)
                mask[tlbr[1]:tlbr[3], tlbr[0]:tlbr[2]] = 0

        keypoints = self.detector.detect(frame, mask)

        # Compute the descriptors
        keypoints, descriptors = self.extractor.compute(frame, keypoints)

        # Handle first frame
        if not self.initializedFirstFrame:
            # Initialize data
            self.prevFrame = frame.copy()
            self.prevKeyPoints = copy.copy(keypoints)
            self.prevDescriptors = copy.copy(descriptors)

            # Initialization done
            self.initializedFirstFrame = True

            return H

        # Match descriptors
        knnMatches = self.matcher.knnMatch(self.prevDescriptors, descriptors, 2)

        # Filter matches based on smallest spatial distance
        matches = []
        spatialDistances = []

        maxSpatialDistance = 0.25 * np.array([width, height])

        # Handle empty matches case
        if len(knnMatches) == 0:
            # Store to next iteration
            self.prevFrame = frame.copy()
            self.prevKeyPoints = copy.copy(keypoints)
            self.prevDescriptors = copy.copy(descriptors)

            return H

        for m, n in knnMatches:
            if m.distance < 0.9 * n.distance:
                prevKeyPointLocation = self.prevKeyPoints[m.queryIdx].pt
                currKeyPointLocation = keypoints[m.trainIdx].pt

                spatialDistance = (prevKeyPointLocation[0] - currKeyPointLocation[0],
                                   prevKeyPointLocation[1] - currKeyPointLocation[1])

                if (np.abs(spatialDistance[0]) < maxSpatialDistance[0]) and \
                        (np.abs(spatialDistance[1]) < maxSpatialDistance[1]):
                    spatialDistances.append(spatialDistance)
                    matches.append(m)

        meanSpatialDistances = np.mean(spatialDistances, 0)
        stdSpatialDistances = np.std(spatialDistances, 0)

        inliers = (spatialDistances - meanSpatialDistances) < 2.5 * stdSpatialDistances

        goodMatches = []
        prevPoints = []
        currPoints = []
        for i in range(len(matches)):
            if inliers[i, 0] and inliers[i, 1]:
                goodMatches.append(matches[i])
                prevPoints.append(self.prevKeyPoints[matches[i].queryIdx].pt)
                currPoints.append(keypoints[matches[i].trainIdx].pt)

        prevPoints = np.array(prevPoints)
        currPoints = np.array(currPoints)

        # Draw the keypoint matches on the output image
        # if False:
        #     import matplotlib.pyplot as plt
        #     matches_img = np.hstack((self.prevFrame, frame))
        #     matches_img = cv2.cvtColor(matches_img, cv2.COLOR_GRAY2BGR)
        #     W = np.size(self.prevFrame, 1)
        #     for m in goodMatches:
        #         prev_pt = np.array(self.prevKeyPoints[m.queryIdx].pt, dtype=np.int_)
        #         curr_pt = np.array(keypoints[m.trainIdx].pt, dtype=np.int_)
        #         curr_pt[0] += W
        #         color = np.random.randint(0, 255, 3)
        #         color = (int(color[0]), int(color[1]), int(color[2]))
        #
        #         matches_img = cv2.line(matches_img, prev_pt, curr_pt, tuple(color), 1, cv2.LINE_AA)
        #         matches_img = cv2.circle(matches_img, prev_pt, 2, tuple(color), -1)
        #         matches_img = cv2.circle(matches_img, curr_pt, 2, tuple(color), -1)
        #
        #     plt.figure()
        #     plt.imshow(matches_img)
        #     plt.show()

        # Find rigid matrix (the original tautological size check compared
        # prevPoints with itself; the intended comparison is with currPoints)
        if (np.size(prevPoints, 0) > 4) and (np.size(prevPoints, 0) == np.size(currPoints, 0)):
            H, inliers = cv2.estimateAffinePartial2D(prevPoints, currPoints, cv2.RANSAC)

            # Handle downscale
            if self.downscale > 1.0:
                H[0, 2] *= self.downscale
                H[1, 2] *= self.downscale
        else:
            LOGGER.warning('WARNING: not enough matching points')

        # Store to next iteration
        self.prevFrame = frame.copy()
        self.prevKeyPoints = copy.copy(keypoints)
        self.prevDescriptors = copy.copy(descriptors)

        return H

    def applySparseOptFlow(self, raw_frame, detections=None):
        """Estimate the inter-frame warp from sparse optical flow correspondences."""
        height, width, _ = raw_frame.shape
        frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2GRAY)
        H = np.eye(2, 3)

        # Downscale image
        if self.downscale > 1.0:
            # frame = cv2.GaussianBlur(frame, (3, 3), 1.5)
            frame = cv2.resize(frame, (width // self.downscale, height // self.downscale))

        # Find the keypoints
        keypoints = cv2.goodFeaturesToTrack(frame, mask=None, **self.feature_params)

        # Handle first frame
        if not self.initializedFirstFrame:
            # Initialize data
            self.prevFrame = frame.copy()
            self.prevKeyPoints = copy.copy(keypoints)

            # Initialization done
            self.initializedFirstFrame = True

            return H

        # Find correspondences
        matchedKeypoints, status, err = cv2.calcOpticalFlowPyrLK(self.prevFrame, frame, self.prevKeyPoints, None)

        # Leave good correspondences only
        prevPoints = []
        currPoints = []

        for i in range(len(status)):
            if status[i]:
                prevPoints.append(self.prevKeyPoints[i])
                currPoints.append(matchedKeypoints[i])

        prevPoints = np.array(prevPoints)
        currPoints = np.array(currPoints)

        # Find rigid matrix (same intended size check as in applyFeatures)
        if (np.size(prevPoints, 0) > 4) and (np.size(prevPoints, 0) == np.size(currPoints, 0)):
            H, inliers = cv2.estimateAffinePartial2D(prevPoints, currPoints, cv2.RANSAC)

            # Handle downscale
            if self.downscale > 1.0:
                H[0, 2] *= self.downscale
                H[1, 2] *= self.downscale
        else:
            LOGGER.warning('WARNING: not enough matching points')

        # Store to next iteration
        self.prevFrame = frame.copy()
        self.prevKeyPoints = copy.copy(keypoints)

        return H
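A minimal sketch of driving GMC frame by frame. `video.mp4` is a placeholder; the returned 2x3 matrix is the estimated affine warp from the previous frame to the current one, which BoT-SORT uses to compensate predicted track positions for camera motion:

```python
import cv2

from tracking.trackers.utils.gmc import GMC

gmc = GMC(method='sparseOptFlow', downscale=2)

cap = cv2.VideoCapture("video.mp4")  # placeholder input
while True:
    ret, frame = cap.read()
    if not ret:
        break
    # Identity on the first frame, then the per-frame camera-motion estimate
    warp = gmc.apply(frame)
    print(warp)
cap.release()
```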
368
tracking/trackers/utils/kalman_filter.py
Normal file
@ -0,0 +1,368 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

import numpy as np
import scipy.linalg


class KalmanFilterXYAH:
    """
    For ByteTrack. A simple Kalman filter for tracking bounding boxes in image space.

    The 8-dimensional state space (x, y, a, h, vx, vy, va, vh) contains the bounding box center position (x, y),
    aspect ratio a, height h, and their respective velocities.

    Object motion follows a constant velocity model. The bounding box location (x, y, a, h) is taken as direct
    observation of the state space (linear observation model).
    """

    def __init__(self):
        """Initialize Kalman filter model matrices with motion and observation uncertainty weights."""
        ndim, dt = 4, 1.

        # Create Kalman filter model matrices
        self._motion_mat = np.eye(2 * ndim, 2 * ndim)
        for i in range(ndim):
            self._motion_mat[i, ndim + i] = dt
        self._update_mat = np.eye(ndim, 2 * ndim)

        # Motion and observation uncertainty are chosen relative to the current state estimate. These weights control
        # the amount of uncertainty in the model. This is a bit hacky.
        self._std_weight_position = 1. / 20
        self._std_weight_velocity = 1. / 160

    def initiate(self, measurement):
        """
        Create track from unassociated measurement.

        Parameters
        ----------
        measurement : ndarray
            Bounding box coordinates (x, y, a, h) with center position (x, y), aspect ratio a, and height h.

        Returns
        -------
        (ndarray, ndarray)
            Returns the mean vector (8 dimensional) and covariance matrix (8x8 dimensional) of the new track.
            Unobserved velocities are initialized to 0 mean.
        """
        mean_pos = measurement
        mean_vel = np.zeros_like(mean_pos)
        mean = np.r_[mean_pos, mean_vel]

        std = [
            2 * self._std_weight_position * measurement[3], 2 * self._std_weight_position * measurement[3], 1e-2,
            2 * self._std_weight_position * measurement[3], 10 * self._std_weight_velocity * measurement[3],
            10 * self._std_weight_velocity * measurement[3], 1e-5, 10 * self._std_weight_velocity * measurement[3]]
        covariance = np.diag(np.square(std))
        return mean, covariance

    def predict(self, mean, covariance):
        """
        Run Kalman filter prediction step.

        Parameters
        ----------
        mean : ndarray
            The 8 dimensional mean vector of the object state at the previous time step.
        covariance : ndarray
            The 8x8 dimensional covariance matrix of the object state at the previous time step.

        Returns
        -------
        (ndarray, ndarray)
            Returns the mean vector and covariance matrix of the predicted state. Unobserved velocities are
            initialized to 0 mean.
        """
        std_pos = [
            self._std_weight_position * mean[3], self._std_weight_position * mean[3], 1e-2,
            self._std_weight_position * mean[3]]
        std_vel = [
            self._std_weight_velocity * mean[3], self._std_weight_velocity * mean[3], 1e-5,
            self._std_weight_velocity * mean[3]]
        motion_cov = np.diag(np.square(np.r_[std_pos, std_vel]))

        # mean = np.dot(self._motion_mat, mean)
        mean = np.dot(mean, self._motion_mat.T)
        covariance = np.linalg.multi_dot((self._motion_mat, covariance, self._motion_mat.T)) + motion_cov

        return mean, covariance

    def project(self, mean, covariance):
        """
        Project state distribution to measurement space.

        Parameters
        ----------
        mean : ndarray
            The state's mean vector (8 dimensional array).
        covariance : ndarray
            The state's covariance matrix (8x8 dimensional).

        Returns
        -------
        (ndarray, ndarray)
            Returns the projected mean and covariance matrix of the given state estimate.
        """
        std = [
            self._std_weight_position * mean[3], self._std_weight_position * mean[3], 1e-1,
            self._std_weight_position * mean[3]]
        innovation_cov = np.diag(np.square(std))

        mean = np.dot(self._update_mat, mean)
        covariance = np.linalg.multi_dot((self._update_mat, covariance, self._update_mat.T))
        return mean, covariance + innovation_cov

    def multi_predict(self, mean, covariance):
        """
        Run Kalman filter prediction step (vectorized version).

        Parameters
        ----------
        mean : ndarray
            The Nx8 dimensional mean matrix of the object states at the previous time step.
        covariance : ndarray
            The Nx8x8 dimensional covariance matrix of the object states at the previous time step.

        Returns
        -------
        (ndarray, ndarray)
            Returns the mean vector and covariance matrix of the predicted state. Unobserved velocities are
            initialized to 0 mean.
        """
        std_pos = [
            self._std_weight_position * mean[:, 3], self._std_weight_position * mean[:, 3],
            1e-2 * np.ones_like(mean[:, 3]), self._std_weight_position * mean[:, 3]]
        std_vel = [
            self._std_weight_velocity * mean[:, 3], self._std_weight_velocity * mean[:, 3],
            1e-5 * np.ones_like(mean[:, 3]), self._std_weight_velocity * mean[:, 3]]
        sqr = np.square(np.r_[std_pos, std_vel]).T

        motion_cov = [np.diag(sqr[i]) for i in range(len(mean))]
        motion_cov = np.asarray(motion_cov)

        mean = np.dot(mean, self._motion_mat.T)
        left = np.dot(self._motion_mat, covariance).transpose((1, 0, 2))
        covariance = np.dot(left, self._motion_mat.T) + motion_cov

        return mean, covariance

    def update(self, mean, covariance, measurement):
        """
        Run Kalman filter correction step.

        Parameters
        ----------
        mean : ndarray
            The predicted state's mean vector (8 dimensional).
        covariance : ndarray
            The state's covariance matrix (8x8 dimensional).
        measurement : ndarray
            The 4 dimensional measurement vector (x, y, a, h), where (x, y) is the center position, a the aspect
            ratio, and h the height of the bounding box.

        Returns
        -------
        (ndarray, ndarray)
            Returns the measurement-corrected state distribution.
        """
        projected_mean, projected_cov = self.project(mean, covariance)

        chol_factor, lower = scipy.linalg.cho_factor(projected_cov, lower=True, check_finite=False)
        kalman_gain = scipy.linalg.cho_solve((chol_factor, lower),
                                             np.dot(covariance, self._update_mat.T).T,
                                             check_finite=False).T
        innovation = measurement - projected_mean

        new_mean = mean + np.dot(innovation, kalman_gain.T)
        new_covariance = covariance - np.linalg.multi_dot((kalman_gain, projected_cov, kalman_gain.T))
        return new_mean, new_covariance

    def gating_distance(self, mean, covariance, measurements, only_position=False, metric='maha'):
        """
        Compute gating distance between state distribution and measurements. A suitable distance threshold can be
        obtained from `chi2inv95`. If `only_position` is False, the chi-square distribution has 4 degrees of
        freedom, otherwise 2.

        Parameters
        ----------
        mean : ndarray
            Mean vector over the state distribution (8 dimensional).
        covariance : ndarray
            Covariance of the state distribution (8x8 dimensional).
        measurements : ndarray
            An Nx4 dimensional matrix of N measurements, each in format (x, y, a, h) where (x, y) is the bounding box
            center position, a the aspect ratio, and h the height.
        only_position : Optional[bool]
            If True, distance computation is done with respect to the bounding box center position only.

        Returns
        -------
        ndarray
            Returns an array of length N, where the i-th element contains the squared Mahalanobis distance between
            (mean, covariance) and `measurements[i]`.
        """
        mean, covariance = self.project(mean, covariance)
        if only_position:
            mean, covariance = mean[:2], covariance[:2, :2]
            measurements = measurements[:, :2]

        d = measurements - mean
        if metric == 'gaussian':
            return np.sum(d * d, axis=1)
        elif metric == 'maha':
            cholesky_factor = np.linalg.cholesky(covariance)
            z = scipy.linalg.solve_triangular(cholesky_factor, d.T, lower=True, check_finite=False, overwrite_b=True)
            return np.sum(z * z, axis=0)  # squared Mahalanobis distance
        else:
            raise ValueError('invalid distance metric')


class KalmanFilterXYWH(KalmanFilterXYAH):
    """
    For BoT-SORT. A simple Kalman filter for tracking bounding boxes in image space.

    The 8-dimensional state space (x, y, w, h, vx, vy, vw, vh) contains the bounding box center position (x, y),
    width w, height h, and their respective velocities.

    Object motion follows a constant velocity model. The bounding box location (x, y, w, h) is taken as direct
    observation of the state space (linear observation model).
    """

    def initiate(self, measurement):
        """
        Create track from unassociated measurement.

        Parameters
        ----------
        measurement : ndarray
            Bounding box coordinates (x, y, w, h) with center position (x, y), width w, and height h.

        Returns
        -------
        (ndarray, ndarray)
            Returns the mean vector (8 dimensional) and covariance matrix (8x8 dimensional) of the new track.
            Unobserved velocities are initialized to 0 mean.
        """
        mean_pos = measurement
        mean_vel = np.zeros_like(mean_pos)
        mean = np.r_[mean_pos, mean_vel]

        std = [
            2 * self._std_weight_position * measurement[2], 2 * self._std_weight_position * measurement[3],
            2 * self._std_weight_position * measurement[2], 2 * self._std_weight_position * measurement[3],
            10 * self._std_weight_velocity * measurement[2], 10 * self._std_weight_velocity * measurement[3],
            10 * self._std_weight_velocity * measurement[2], 10 * self._std_weight_velocity * measurement[3]]
        covariance = np.diag(np.square(std))
        return mean, covariance

    def predict(self, mean, covariance):
        """
        Run Kalman filter prediction step.

        Parameters
        ----------
        mean : ndarray
            The 8 dimensional mean vector of the object state at the previous time step.
        covariance : ndarray
            The 8x8 dimensional covariance matrix of the object state at the previous time step.

        Returns
        -------
        (ndarray, ndarray)
            Returns the mean vector and covariance matrix of the predicted state. Unobserved velocities are
            initialized to 0 mean.
        """
        std_pos = [
            self._std_weight_position * mean[2], self._std_weight_position * mean[3],
            self._std_weight_position * mean[2], self._std_weight_position * mean[3]]
        std_vel = [
            self._std_weight_velocity * mean[2], self._std_weight_velocity * mean[3],
            self._std_weight_velocity * mean[2], self._std_weight_velocity * mean[3]]
        motion_cov = np.diag(np.square(np.r_[std_pos, std_vel]))

        mean = np.dot(mean, self._motion_mat.T)
        covariance = np.linalg.multi_dot((self._motion_mat, covariance, self._motion_mat.T)) + motion_cov

        return mean, covariance

    def project(self, mean, covariance):
        """
        Project state distribution to measurement space.

        Parameters
        ----------
        mean : ndarray
            The state's mean vector (8 dimensional array).
        covariance : ndarray
            The state's covariance matrix (8x8 dimensional).

        Returns
        -------
        (ndarray, ndarray)
            Returns the projected mean and covariance matrix of the given state estimate.
        """
        std = [
            self._std_weight_position * mean[2], self._std_weight_position * mean[3],
            self._std_weight_position * mean[2], self._std_weight_position * mean[3]]
        innovation_cov = np.diag(np.square(std))

        mean = np.dot(self._update_mat, mean)
        covariance = np.linalg.multi_dot((self._update_mat, covariance, self._update_mat.T))
        return mean, covariance + innovation_cov

    def multi_predict(self, mean, covariance):
        """
        Run Kalman filter prediction step (vectorized version).

        Parameters
        ----------
        mean : ndarray
            The Nx8 dimensional mean matrix of the object states at the previous time step.
        covariance : ndarray
            The Nx8x8 dimensional covariance matrix of the object states at the previous time step.

        Returns
        -------
        (ndarray, ndarray)
            Returns the mean vector and covariance matrix of the predicted state. Unobserved velocities are
            initialized to 0 mean.
        """
        std_pos = [
            self._std_weight_position * mean[:, 2], self._std_weight_position * mean[:, 3],
            self._std_weight_position * mean[:, 2], self._std_weight_position * mean[:, 3]]
        std_vel = [
            self._std_weight_velocity * mean[:, 2], self._std_weight_velocity * mean[:, 3],
            self._std_weight_velocity * mean[:, 2], self._std_weight_velocity * mean[:, 3]]
        sqr = np.square(np.r_[std_pos, std_vel]).T

        motion_cov = [np.diag(sqr[i]) for i in range(len(mean))]
        motion_cov = np.asarray(motion_cov)

        mean = np.dot(mean, self._motion_mat.T)
        left = np.dot(self._motion_mat, covariance).transpose((1, 0, 2))
        covariance = np.dot(left, self._motion_mat.T) + motion_cov

        return mean, covariance

    def update(self, mean, covariance, measurement):
        """
        Run Kalman filter correction step.

        Parameters
        ----------
        mean : ndarray
            The predicted state's mean vector (8 dimensional).
        covariance : ndarray
            The state's covariance matrix (8x8 dimensional).
        measurement : ndarray
            The 4 dimensional measurement vector (x, y, w, h), where (x, y) is the center position, w the width,
            and h the height of the bounding box.

        Returns
        -------
        (ndarray, ndarray)
            Returns the measurement-corrected state distribution.
        """
        return super().update(mean, covariance, measurement)
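As a quick sanity check of the XYAH filter's API, a small worked example of the initiate/predict/update cycle (all values are illustrative only):

```python
import numpy as np

from tracking.trackers.utils.kalman_filter import KalmanFilterXYAH

kf = KalmanFilterXYAH()

# First measurement: center (50, 60), aspect ratio 0.5, height 80
mean, cov = kf.initiate(np.array([50., 60., 0.5, 80.]))

# Predict one step ahead, then correct with a new measurement
mean, cov = kf.predict(mean, cov)
mean, cov = kf.update(mean, cov, np.array([52., 61., 0.5, 81.]))

# Gate candidate measurements by squared Mahalanobis distance
candidates = np.array([[52., 61., 0.5, 81.], [300., 300., 0.5, 80.]])
d2 = kf.gating_distance(mean, cov, candidates)
print(d2)  # the far-away box gets a much larger distance
```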
215
tracking/trackers/utils/matching.py
Normal file
@ -0,0 +1,215 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

import math

import numpy as np
import scipy
from scipy.spatial.distance import cdist

# from ultralytics.utils.metrics import bbox_ioa

try:
    import lap  # for linear_assignment

    assert lap.__version__  # verify package is not directory
except (ImportError, AssertionError, AttributeError):
    from ultralytics.utils.checks import check_requirements

    check_requirements('lapx>=0.5.2')  # update to lap package from https://github.com/rathaROG/lapx
    import lap


def bbox_iou(box1, box2, GIoU=False, DIoU=False, CIoU=False, eps=1e-7):
    '''Adapted from utils.metrics.bbox_iou in the project root, rewritten for numpy.'''
    # Returns Intersection over Union (IoU) of box1(n,4) to box2(m,4)

    # Get the coordinates of bounding boxes
    b1_x1, b1_y1, b1_x2, b1_y2 = box1.T
    b2_x1, b2_y1, b2_x2, b2_y2 = box2.T
    w1, h1 = b1_x2 - b1_x1, (b1_y2 - b1_y1).clip(eps)
    w2, h2 = b2_x2 - b2_x1, (b2_y2 - b2_y1).clip(eps)

    # Intersection area
    inter = (np.minimum(b1_x2[:, None], b2_x2) - np.maximum(b1_x1[:, None], b2_x1)).clip(0) * \
            (np.minimum(b1_y2[:, None], b2_y2) - np.maximum(b1_y1[:, None], b2_y1)).clip(0)

    # Union area
    box1_area = w1 * h1
    box2_area = w2 * h2
    union = box1_area[:, None] + box2_area - inter + eps

    # IoU
    iou = inter / union
    if CIoU or DIoU or GIoU:
        cw = np.maximum(b1_x2[:, None], b2_x2) - np.minimum(b1_x1[:, None], b2_x1)  # convex (smallest enclosing box) width
        ch = np.maximum(b1_y2[:, None], b2_y2) - np.minimum(b1_y1[:, None], b2_y1)  # convex height
        if CIoU or DIoU:  # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1
            c2 = cw ** 2 + ch ** 2 + eps  # convex diagonal squared
            # center distance squared
            rho2 = ((b1_x1[:, None] + b1_x2[:, None] - b2_x1 - b2_x2) ** 2 +
                    (b1_y1[:, None] + b1_y2[:, None] - b2_y1 - b2_y2) ** 2) / 4
            if CIoU:  # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47
                v = (4 / math.pi ** 2) * (np.arctan(w1 / h1)[:, None] - np.arctan(w2 / h2)) ** 2
                alpha = v / (v - iou + (1 + eps))  # numpy arrays here, so no autograd guard is needed
                return iou - (rho2 / c2 + v * alpha)  # CIoU
            return iou - rho2 / c2  # DIoU
        c_area = cw * ch + eps  # convex area
        return iou - (c_area - union) / c_area  # GIoU https://arxiv.org/pdf/1902.09630.pdf
    return iou  # IoU


def bbox_ioa(box1, box2, iou=False, eps=1e-7):
    """
    Calculate the intersection over box2 area given box1 and box2. Boxes are in x1y1x2y2 format.

    Args:
        box1 (np.array): A numpy array of shape (n, 4) representing n bounding boxes.
        box2 (np.array): A numpy array of shape (m, 4) representing m bounding boxes.
        iou (bool): Calculate the standard IoU if True else return inter_area/box2_area.
        eps (float, optional): A small value to avoid division by zero. Defaults to 1e-7.

    Returns:
        (np.array): A numpy array of shape (n, m) representing the intersection over box2 area.
    """

    # Get the coordinates of bounding boxes
    b1_x1, b1_y1, b1_x2, b1_y2 = box1.T
    b2_x1, b2_y1, b2_x2, b2_y2 = box2.T

    # Intersection area
    inter_area = (np.minimum(b1_x2[:, None], b2_x2) - np.maximum(b1_x1[:, None], b2_x1)).clip(0) * \
                 (np.minimum(b1_y2[:, None], b2_y2) - np.maximum(b1_y1[:, None], b2_y1)).clip(0)

    # box2 area
    area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)
    if iou:
        box1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
        area = area + box1_area[:, None] - inter_area

    # Intersection over box2 area
    return inter_area / (area + eps)


def linear_assignment(cost_matrix: np.ndarray, thresh: float, use_lap: bool = True) -> tuple:
    """
    Perform linear assignment using scipy or lap.lapjv.

    Args:
        cost_matrix (np.ndarray): The matrix containing cost values for assignments.
        thresh (float): Threshold for considering an assignment valid.
        use_lap (bool, optional): Whether to use lap.lapjv. Defaults to True.

    Returns:
        (tuple): Tuple containing matched indices, unmatched indices from 'a', and unmatched indices from 'b'.
    """

    if cost_matrix.size == 0:
        return np.empty((0, 2), dtype=int), tuple(range(cost_matrix.shape[0])), tuple(range(cost_matrix.shape[1]))

    if use_lap:
        # https://github.com/gatagat/lap
        _, x, y = lap.lapjv(cost_matrix, extend_cost=True, cost_limit=thresh)
        matches = [[ix, mx] for ix, mx in enumerate(x) if mx >= 0]
        unmatched_a = np.where(x < 0)[0]
        unmatched_b = np.where(y < 0)[0]
    else:
        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.linear_sum_assignment.html
        x, y = scipy.optimize.linear_sum_assignment(cost_matrix)  # row x, col y
        matches = np.asarray([[x[i], y[i]] for i in range(len(x)) if cost_matrix[x[i], y[i]] <= thresh])
        if len(matches) == 0:
            unmatched_a = list(np.arange(cost_matrix.shape[0]))
            unmatched_b = list(np.arange(cost_matrix.shape[1]))
        else:
            unmatched_a = list(set(np.arange(cost_matrix.shape[0])) - set(matches[:, 0]))
            unmatched_b = list(set(np.arange(cost_matrix.shape[1])) - set(matches[:, 1]))

    return matches, unmatched_a, unmatched_b


def iou_distance(atracks: list, btracks: list) -> np.ndarray:
    """
    Compute cost based on Intersection over Union (IoU) between tracks.

    Args:
        atracks (list[STrack] | list[np.ndarray]): List of tracks 'a' or bounding boxes.
        btracks (list[STrack] | list[np.ndarray]): List of tracks 'b' or bounding boxes.

    Returns:
        (np.ndarray): Cost matrix computed based on IoU.
    """

    if (len(atracks) > 0 and isinstance(atracks[0], np.ndarray)) \
            or (len(btracks) > 0 and isinstance(btracks[0], np.ndarray)):
        atlbrs = atracks
        btlbrs = btracks
    else:
        atlbrs = [track.tlbr for track in atracks]
        btlbrs = [track.tlbr for track in btracks]

    ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=np.float32)
    if len(atlbrs) and len(btlbrs):
        box1 = np.ascontiguousarray(atlbrs, dtype=np.float32)
        box2 = np.ascontiguousarray(btlbrs, dtype=np.float32)

        ious = bbox_ioa(box1, box2, iou=True)
        # GIoU/DIoU/CIoU variants are computed for experimentation but are not
        # used in the returned cost matrix
        ious_g = bbox_iou(box1, box2, GIoU=True).clip(-1.0, 1.0)
        ious_d = bbox_iou(box1, box2, DIoU=True).clip(-1.0, 1.0)
        ious_c = bbox_iou(box1, box2, CIoU=True).clip(-1.0, 1.0)

    return 1 - ious  # cost matrix


def embedding_distance(tracks, detections, metric='cosine'):
    """
    Compute distance between tracks and detections based on embeddings.

    Args:
        tracks (list[STrack]): List of tracks.
        detections (list[BaseTrack]): List of detections.
        metric (str, optional): Metric for distance computation. Defaults to 'cosine'.

    Returns:
        (np.ndarray): Cost matrix computed based on embeddings.
    """

    cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float32)
    if cost_matrix.size == 0:
        return cost_matrix
    det_features = np.asarray([track.curr_feat for track in detections], dtype=np.float32)
    # for i, track in enumerate(tracks):
    #     cost_matrix[i, :] = np.maximum(0.0, cdist(track.smooth_feat.reshape(1, -1), det_features, metric))
    track_features = np.asarray([track.smooth_feat for track in tracks], dtype=np.float32)
    cost_matrix = np.maximum(0.0, cdist(track_features, det_features, metric))  # Normalized features
    return cost_matrix


def fuse_score(cost_matrix, detections):
    """
    Fuses cost matrix with detection scores to produce a single similarity matrix.

    Args:
        cost_matrix (np.ndarray): The matrix containing cost values for assignments.
        detections (list[BaseTrack]): List of detections with scores.

    Returns:
        (np.ndarray): Fused similarity matrix.
    """

    if cost_matrix.size == 0:
        return cost_matrix
    iou_sim = 1 - cost_matrix
    det_scores = np.array([det.score for det in detections])
    det_scores = np.expand_dims(det_scores, axis=0).repeat(cost_matrix.shape[0], axis=0)
    fuse_sim = iou_sim * det_scores
    return 1 - fuse_sim  # fuse_cost
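A toy end-to-end run of these utilities; the boxes, scores, and the `Det` stand-in class are invented for illustration:

```python
import numpy as np

from tracking.trackers.utils.matching import fuse_score, iou_distance, linear_assignment

# Two "track" boxes and two "detection" boxes in x1y1x2y2 format
tracks = [np.array([0, 0, 10, 10]), np.array([20, 20, 30, 30])]
dets = [np.array([1, 1, 11, 11]), np.array([21, 21, 31, 31])]

cost = iou_distance(tracks, dets)  # (2, 2) matrix; low cost = high overlap


class Det:
    """Toy stand-in: fuse_score only needs a .score attribute."""

    def __init__(self, score):
        self.score = score


cost = fuse_score(cost, [Det(0.9), Det(0.8)])

matches, u_a, u_b = linear_assignment(cost, thresh=0.8)
print(matches)  # [[0, 0], [1, 1]] for this toy data
```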