diff --git a/src/GeoLocalizator.py b/src/GeoLocalizator.py
new file mode 100644
index 0000000..88f0ca1
--- /dev/null
+++ b/src/GeoLocalizator.py
@@ -0,0 +1,30 @@
+from typing import List, Tuple
+
+
+class GeoLocalizator:
+    def geo_localize(self, trees: List, coordenates: Tuple) -> List:
+        """
+        Compute the GPS coordinates of every Tree object in the list.
+
+        :param List trees: The list of trees to operate on.
+        :param Tuple coordenates: The coordinates of the target image.
+        :return: The list of all trees with their coordinates calculated.
+        """
+        self._coordenates = coordenates
+
+        for tree in trees:
+            x, y = self._calculate_coordenates(tree)
+            tree.set_coordenates(x, y)
+
+        return trees
+
+    def _calculate_coordenates(self, tree: "Tree") -> Tuple:
+        """
+        Calculate the GPS coordinates of a single Tree.
+
+        :param Tree tree: The Tree to work with.
+        :return: The tuple of GPS coordinates representing the tree's center. Currently a placeholder that returns (None, None).
+        """
+        x = None
+        y = None
+        return x, y
diff --git a/src/NeuralNetwork.py b/src/NeuralNetwork.py
new file mode 100644
index 0000000..91bfb89
--- /dev/null
+++ b/src/NeuralNetwork.py
@@ -0,0 +1,68 @@
+from typing import List
+
+import numpy as np
+import numpy.typing as npt
+
+from keras_retinanet import models
+from keras_retinanet.utils.gpu import setup_gpu
+from keras_retinanet.utils.image import preprocess_image, resize_image
+
+from .Tree import Tree
+
+
+class NeuralNetwork:
+    def __init__(self, path_to_model: str, score: float = 0.5):
+        """
+        Neural network that uses RetinaNet to detect trees.
+
+        :param str path_to_model: The path where the model is stored.
+        :param float score: Confidence threshold for keeping detections. 0.5 by default.
+        """
+
+        # select GPU
+        gpu = 0
+        setup_gpu(gpu)
+
+        # Load RetinaNet model
+        self._model = models.load_model(path_to_model, backbone_name="resnet50")
+
+        # Set confidence threshold
+        self._score = score
+
+    def detect_trees(self, img: npt.ArrayLike, row: int, col: int) -> List:
+        """
+        Detect trees in a sub-image.
+
+        :param npt.ArrayLike img: The sub-image to run detection on. Must be 400x400x3.
+        :param int row: Row offset of the sub-image within the full image.
+        :param int col: Column offset of the sub-image within the full image.
+        :return: List of all detected trees.
+        """
+
+        # Preprocess image
+        image = preprocess_image(img)
+        image, scale = resize_image(image)
+
+        # Predict trees
+        boxes, scores, labels = self._model.predict_on_batch(
+            np.expand_dims(image, axis=0)
+        )
+
+        # Scale boxes back to the original image size
+        boxes /= scale
+
+        # Generate Tree objects
+        trees = []
+        for box, score, _ in zip(boxes[0], scores[0], labels[0]):
+            # Detections are sorted by descending score, so stop at the first one below the threshold
+            if score < self._score:
+                break
+
+            box = box.astype(int)
+
+            x1, y1, x2, y2 = box
+            width = x2 - x1
+            height = y2 - y1
+            trees.append(Tree(y1 + row, x1 + col, width, height))
+
+        return trees
\ No newline at end of file
diff --git a/src/Tree.py b/src/Tree.py
new file mode 100644
index 0000000..a0deb37
--- /dev/null
+++ b/src/Tree.py
@@ -0,0 +1,18 @@
+class Tree:
+    def __init__(self, row: int, col: int, width: int, height: int):
+        self._row = row
+        self._col = col
+        self._width = width
+        self._height = height
+        self._coordx = None
+        self._coordy = None
+
+    def set_coordenates(self, x: float, y: float):
+        """
+        Set the GPS coordinates of this Tree.
+
+        :param float x: x coordinate (West-East).
+        :param float y: y coordinate (North-South).
+        """
+        self._coordx = x
+        self._coordy = y
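The three new modules above are meant to compose: NeuralNetwork.detect_trees returns Tree objects positioned by the row/col offsets of the crop they were found in, and GeoLocalizator.geo_localize then attaches GPS coordinates to each of them. The sketch below is illustrative only and not part of the patch; the model path, image file, coordinate tuple and import paths are placeholder assumptions (they assume the src package is importable and that a converted RetinaNet inference model is available).

import cv2 as cv

from src.GeoLocalizator import GeoLocalizator
from src.NeuralNetwork import NeuralNetwork

# Placeholder inputs: any converted RetinaNet inference model and any aerial image.
nn = NeuralNetwork("snapshots/resnet50_trees_inference.h5", score=0.5)
image = cv.imread("orthophoto.png")

# Detect trees in the top-left 400x400 crop; row/col are the crop's offsets in the full image.
trees = nn.detect_trees(image[0:400, 0:400], row=0, col=0)

# Attach GPS coordinates to every detected Tree (the pixel-to-GPS mapping itself is still a stub).
trees = GeoLocalizator().geo_localize(trees, coordenates=(40.0, -3.7))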
+ """ + self._coordx = x + self._coordy = y diff --git a/src/TreeDetector.py b/src/TreeDetector.py index 41d4deb..f0e2f0d 100644 --- a/src/TreeDetector.py +++ b/src/TreeDetector.py @@ -1,12 +1,43 @@ +from typing import List, Tuple + import cv2 as cv +import numpy as np +import numpy.typing as npt class TreeDetector: - def __init__(self, img): - self.__img = cv.imread(img) + def __init__(self, nn): + """ + TreeDetector constructor. + + :param NeuralNetwork nn: the neural network witch we will used. + """ + self._img = None + self._coordenates = None + self._nn = nn # Neural Network + self._trees = [] + + def recognize(self, img: npt.ArrayLike, coordenates: Tuple) -> List: + """ + Recognize trees in image. - def recognize(self): - return "done" + :param npt.ArrayLike img: The target image. + :param tuple coordenates: The coordenates of the image. + :return: the list with all detected trees. + """ + self._img = img + self._slide() + return self._trees - def slide(self): - return 400 \ No newline at end of file + def _slide(self): + """ + Iterates around the image and calls NN to detect trees in sub-image. + """ + STEP = 400 + cols, rows = self._img.shape[:-1] + for col in range(0, cols - STEP, STEP): + for row in range(0, rows - STEP, STEP): + trees = self._nn.detect_trees( + self._img[row : row + STEP, col : col + STEP], row, col + ) + self._trees.append(trees) diff --git a/src/TreePainter.py b/src/TreePainter.py new file mode 100644 index 0000000..cf124ae --- /dev/null +++ b/src/TreePainter.py @@ -0,0 +1,17 @@ +from typing import List + +import cv2 as cv +import numpy.typing as npt + + +class TreePainter: + def draw(self, canvas: npt.ArrayLike, trees: List): + """ + Draw all trees in image. + + :param npt.ArrayLike canvas: The image witch will be used as canvas. + :param List trees: The list off all trees. + """ + + for tree in trees: + continue diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/keras_retinanet/__init__.py b/src/keras_retinanet/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/keras_retinanet/backend/__init__.py b/src/keras_retinanet/backend/__init__.py new file mode 100644 index 0000000..4bace69 --- /dev/null +++ b/src/keras_retinanet/backend/__init__.py @@ -0,0 +1,2 @@ +from .dynamic import * # noqa: F401,F403 +from .common import * # noqa: F401,F403 diff --git a/src/keras_retinanet/backend/cntk_backend.py b/src/keras_retinanet/backend/cntk_backend.py new file mode 100644 index 0000000..70aae54 --- /dev/null +++ b/src/keras_retinanet/backend/cntk_backend.py @@ -0,0 +1,15 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" diff --git a/src/keras_retinanet/backend/common.py b/src/keras_retinanet/backend/common.py new file mode 100644 index 0000000..8f8dcc6 --- /dev/null +++ b/src/keras_retinanet/backend/common.py @@ -0,0 +1,85 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras.backend +from .dynamic import meshgrid + + +def bbox_transform_inv(boxes, deltas, mean=None, std=None): + """ Applies deltas (usually regression results) to boxes (usually anchors). + + Before applying the deltas to the boxes, the normalization that was previously applied (in the generator) has to be removed. + The mean and std are the mean and std as applied in the generator. They are unnormalized in this function and then applied to the boxes. + + Args + boxes : np.array of shape (B, N, 4), where B is the batch size, N the number of boxes and 4 values for (x1, y1, x2, y2). + deltas: np.array of same shape as boxes. These deltas (d_x1, d_y1, d_x2, d_y2) are a factor of the width/height. + mean : The mean value used when computing deltas (defaults to [0, 0, 0, 0]). + std : The standard deviation used when computing deltas (defaults to [0.2, 0.2, 0.2, 0.2]). + + Returns + A np.array of the same shape as boxes, but with deltas applied to each box. + The mean and std are used during training to normalize the regression values (networks love normalization). + """ + if mean is None: + mean = [0, 0, 0, 0] + if std is None: + std = [0.2, 0.2, 0.2, 0.2] + + width = boxes[:, :, 2] - boxes[:, :, 0] + height = boxes[:, :, 3] - boxes[:, :, 1] + + x1 = boxes[:, :, 0] + (deltas[:, :, 0] * std[0] + mean[0]) * width + y1 = boxes[:, :, 1] + (deltas[:, :, 1] * std[1] + mean[1]) * height + x2 = boxes[:, :, 2] + (deltas[:, :, 2] * std[2] + mean[2]) * width + y2 = boxes[:, :, 3] + (deltas[:, :, 3] * std[3] + mean[3]) * height + + pred_boxes = keras.backend.stack([x1, y1, x2, y2], axis=2) + + return pred_boxes + + +def shift(shape, stride, anchors): + """ Produce shifted anchors based on shape of the map and stride size. + + Args + shape : Shape to shift the anchors over. + stride : Stride to shift the anchors with over the shape. + anchors: The anchors to apply at each location. 
+ """ + shift_x = (keras.backend.arange(0, shape[1], dtype=keras.backend.floatx()) + keras.backend.constant(0.5, dtype=keras.backend.floatx())) * stride + shift_y = (keras.backend.arange(0, shape[0], dtype=keras.backend.floatx()) + keras.backend.constant(0.5, dtype=keras.backend.floatx())) * stride + + shift_x, shift_y = meshgrid(shift_x, shift_y) + shift_x = keras.backend.reshape(shift_x, [-1]) + shift_y = keras.backend.reshape(shift_y, [-1]) + + shifts = keras.backend.stack([ + shift_x, + shift_y, + shift_x, + shift_y + ], axis=0) + + shifts = keras.backend.transpose(shifts) + number_of_anchors = keras.backend.shape(anchors)[0] + + k = keras.backend.shape(shifts)[0] # number of base points = feat_h * feat_w + + shifted_anchors = keras.backend.reshape(anchors, [1, number_of_anchors, 4]) + keras.backend.cast(keras.backend.reshape(shifts, [k, 1, 4]), keras.backend.floatx()) + shifted_anchors = keras.backend.reshape(shifted_anchors, [k * number_of_anchors, 4]) + + return shifted_anchors diff --git a/src/keras_retinanet/backend/dynamic.py b/src/keras_retinanet/backend/dynamic.py new file mode 100644 index 0000000..361b685 --- /dev/null +++ b/src/keras_retinanet/backend/dynamic.py @@ -0,0 +1,25 @@ +import os + +_BACKEND = "tensorflow" + +if "KERAS_BACKEND" in os.environ: + _backend = os.environ["KERAS_BACKEND"] + + backends = { + "cntk", + "tensorflow", + "theano" + } + + assert _backend in backends + + _BACKEND = _backend + +if _BACKEND == "cntk": + from .cntk_backend import * # noqa: F401,F403 +elif _BACKEND == "theano": + from .theano_backend import * # noqa: F401,F403 +elif _BACKEND == "tensorflow": + from .tensorflow_backend import * # noqa: F401,F403 +else: + raise ValueError("Unknown backend: " + str(_BACKEND)) diff --git a/src/keras_retinanet/backend/tensorflow_backend.py b/src/keras_retinanet/backend/tensorflow_backend.py new file mode 100644 index 0000000..a41ac80 --- /dev/null +++ b/src/keras_retinanet/backend/tensorflow_backend.py @@ -0,0 +1,110 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import tensorflow + + +def ones(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/ones . + """ + return tensorflow.ones(*args, **kwargs) + + +def transpose(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/transpose . + """ + return tensorflow.transpose(*args, **kwargs) + + +def map_fn(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/map_fn . + """ + return tensorflow.map_fn(*args, **kwargs) + + +def pad(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/pad . + """ + return tensorflow.pad(*args, **kwargs) + + +def top_k(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/nn/top_k . + """ + return tensorflow.nn.top_k(*args, **kwargs) + + +def clip_by_value(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/clip_by_value . 
+ """ + return tensorflow.clip_by_value(*args, **kwargs) + + +def resize_images(images, size, method='bilinear', align_corners=False): + """ See https://www.tensorflow.org/versions/r1.14/api_docs/python/tf/image/resize_images . + + Args + method: The method used for interpolation. One of ('bilinear', 'nearest', 'bicubic', 'area'). + """ + methods = { + 'bilinear': tensorflow.image.ResizeMethod.BILINEAR, + 'nearest' : tensorflow.image.ResizeMethod.NEAREST_NEIGHBOR, + 'bicubic' : tensorflow.image.ResizeMethod.BICUBIC, + 'area' : tensorflow.image.ResizeMethod.AREA, + } + return tensorflow.compat.v1.image.resize_images(images, size, methods[method], align_corners) + + +def non_max_suppression(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/image/non_max_suppression . + """ + return tensorflow.image.non_max_suppression(*args, **kwargs) + + +def range(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/range . + """ + return tensorflow.range(*args, **kwargs) + + +def scatter_nd(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/scatter_nd . + """ + return tensorflow.scatter_nd(*args, **kwargs) + + +def gather_nd(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/gather_nd . + """ + return tensorflow.gather_nd(*args, **kwargs) + + +def meshgrid(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/meshgrid . + """ + return tensorflow.meshgrid(*args, **kwargs) + + +def where(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/where . + """ + return tensorflow.where(*args, **kwargs) + + +def unstack(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/unstack . + """ + return tensorflow.unstack(*args, **kwargs) diff --git a/src/keras_retinanet/backend/theano_backend.py b/src/keras_retinanet/backend/theano_backend.py new file mode 100644 index 0000000..70aae54 --- /dev/null +++ b/src/keras_retinanet/backend/theano_backend.py @@ -0,0 +1,15 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" diff --git a/src/keras_retinanet/bin/__init__.py b/src/keras_retinanet/bin/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/keras_retinanet/bin/convert_model.py b/src/keras_retinanet/bin/convert_model.py new file mode 100644 index 0000000..4ae4cfd --- /dev/null +++ b/src/keras_retinanet/bin/convert_model.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python + +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +""" + +import argparse +import os +import sys + +# Allow relative imports when being executed as script. +if __name__ == "__main__" and __package__ is None: + sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) + import keras_retinanet.bin # noqa: F401 + __package__ = "keras_retinanet.bin" + +# Change these to absolute imports if you copy this script outside the keras_retinanet package. +from .. import models +from ..utils.config import read_config_file, parse_anchor_parameters +from ..utils.gpu import setup_gpu +from ..utils.keras_version import check_keras_version +from ..utils.tf_version import check_tf_version + + +def parse_args(args): + parser = argparse.ArgumentParser(description='Script for converting a training model to an inference model.') + + parser.add_argument('model_in', help='The model to convert.') + parser.add_argument('model_out', help='Path to save the converted model to.') + parser.add_argument('--backbone', help='The backbone of the model to convert.', default='resnet50') + parser.add_argument('--no-nms', help='Disables non maximum suppression.', dest='nms', action='store_false') + parser.add_argument('--no-class-specific-filter', help='Disables class specific filtering.', dest='class_specific_filter', action='store_false') + parser.add_argument('--config', help='Path to a configuration parameters .ini file.') + parser.add_argument('--nms-threshold', help='Value for non maximum suppression threshold.', type=float, default=0.5) + parser.add_argument('--score-threshold', help='Threshold for prefiltering boxes.', type=float, default=0.05) + parser.add_argument('--max-detections', help='Maximum number of detections to keep.', type=int, default=300) + parser.add_argument('--parallel-iterations', help='Number of batch items to process in parallel.', type=int, default=32) + + return parser.parse_args(args) + + +def main(args=None): + # parse arguments + if args is None: + args = sys.argv[1:] + args = parse_args(args) + + # make sure keras and tensorflow are the minimum required version + check_keras_version() + check_tf_version() + + # set modified tf session to avoid using the GPUs + setup_gpu('cpu') + + # optionally load config parameters + anchor_parameters = None + if args.config: + args.config = read_config_file(args.config) + if 'anchor_parameters' in args.config: + anchor_parameters = parse_anchor_parameters(args.config) + + # load the model + model = models.load_model(args.model_in, backbone_name=args.backbone) + + # check if this is indeed a training model + models.check_training_model(model) + + # convert the model + model = models.convert_model( + model, + nms=args.nms, + class_specific_filter=args.class_specific_filter, + anchor_params=anchor_parameters, + nms_threshold=args.nms_threshold, + score_threshold=args.score_threshold, + max_detections=args.max_detections, + parallel_iterations=args.parallel_iterations + ) + + # save model + model.save(args.model_out) + + +if __name__ == '__main__': + main() diff --git a/src/keras_retinanet/bin/debug.py b/src/keras_retinanet/bin/debug.py new file mode 100644 index 0000000..32d431c --- /dev/null +++ b/src/keras_retinanet/bin/debug.py @@ -0,0 +1,322 @@ +#!/usr/bin/env python + +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import argparse +import os +import sys +import cv2 + +# Set keycodes for changing images +# 81, 83 are left and right arrows on linux in Ascii code (probably not needed) +# 65361, 65363 are left and right arrows in linux +# 2424832, 2555904 are left and right arrows on Windows +# 110, 109 are 'n' and 'm' on mac, windows, linux +# (unfortunately arrow keys not picked up on mac) +leftkeys = (81, 110, 65361, 2424832) +rightkeys = (83, 109, 65363, 2555904) + +# Allow relative imports when being executed as script. +if __name__ == "__main__" and __package__ is None: + sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) + import keras_retinanet.bin # noqa: F401 + __package__ = "keras_retinanet.bin" + +# Change these to absolute imports if you copy this script outside the keras_retinanet package. +from ..preprocessing.pascal_voc import PascalVocGenerator +from ..preprocessing.csv_generator import CSVGenerator +from ..preprocessing.kitti import KittiGenerator +from ..preprocessing.open_images import OpenImagesGenerator +from ..utils.anchors import anchors_for_shape, compute_gt_annotations +from ..utils.config import read_config_file, parse_anchor_parameters +from ..utils.image import random_visual_effect_generator +from ..utils.keras_version import check_keras_version +from ..utils.tf_version import check_tf_version +from ..utils.transform import random_transform_generator +from ..utils.visualization import draw_annotations, draw_boxes, draw_caption + + +def create_generator(args): + """ Create the data generators. + + Args: + args: parseargs arguments object. 
+ """ + # create random transform generator for augmenting training data + transform_generator = random_transform_generator( + min_rotation=-0.1, + max_rotation=0.1, + min_translation=(-0.1, -0.1), + max_translation=(0.1, 0.1), + min_shear=-0.1, + max_shear=0.1, + min_scaling=(0.9, 0.9), + max_scaling=(1.1, 1.1), + flip_x_chance=0.5, + flip_y_chance=0.5, + ) + + visual_effect_generator = random_visual_effect_generator( + contrast_range=(0.9, 1.1), + brightness_range=(-.1, .1), + hue_range=(-0.05, 0.05), + saturation_range=(0.95, 1.05) + ) + + if args.dataset_type == 'coco': + # import here to prevent unnecessary dependency on cocoapi + from ..preprocessing.coco import CocoGenerator + + generator = CocoGenerator( + args.coco_path, + args.coco_set, + transform_generator=transform_generator, + visual_effect_generator=visual_effect_generator, + image_min_side=args.image_min_side, + image_max_side=args.image_max_side, + config=args.config + ) + elif args.dataset_type == 'pascal': + generator = PascalVocGenerator( + args.pascal_path, + args.pascal_set, + image_extension=args.image_extension, + transform_generator=transform_generator, + visual_effect_generator=visual_effect_generator, + image_min_side=args.image_min_side, + image_max_side=args.image_max_side, + config=args.config + ) + elif args.dataset_type == 'csv': + generator = CSVGenerator( + args.annotations, + args.classes, + transform_generator=transform_generator, + visual_effect_generator=visual_effect_generator, + image_min_side=args.image_min_side, + image_max_side=args.image_max_side, + config=args.config + ) + elif args.dataset_type == 'oid': + generator = OpenImagesGenerator( + args.main_dir, + subset=args.subset, + version=args.version, + labels_filter=args.labels_filter, + parent_label=args.parent_label, + annotation_cache_dir=args.annotation_cache_dir, + transform_generator=transform_generator, + visual_effect_generator=visual_effect_generator, + image_min_side=args.image_min_side, + image_max_side=args.image_max_side, + config=args.config + ) + elif args.dataset_type == 'kitti': + generator = KittiGenerator( + args.kitti_path, + subset=args.subset, + transform_generator=transform_generator, + visual_effect_generator=visual_effect_generator, + image_min_side=args.image_min_side, + image_max_side=args.image_max_side, + config=args.config + ) + else: + raise ValueError('Invalid data type received: {}'.format(args.dataset_type)) + + return generator + + +def parse_args(args): + """ Parse the arguments. + """ + parser = argparse.ArgumentParser(description='Debug script for a RetinaNet network.') + subparsers = parser.add_subparsers(help='Arguments for specific dataset types.', dest='dataset_type') + subparsers.required = True + + coco_parser = subparsers.add_parser('coco') + coco_parser.add_argument('coco_path', help='Path to dataset directory (ie. /tmp/COCO).') + coco_parser.add_argument('--coco-set', help='Name of the set to show (defaults to val2017).', default='val2017') + + pascal_parser = subparsers.add_parser('pascal') + pascal_parser.add_argument('pascal_path', help='Path to dataset directory (ie. /tmp/VOCdevkit).') + pascal_parser.add_argument('--pascal-set', help='Name of the set to show (defaults to test).', default='test') + pascal_parser.add_argument('--image-extension', help='Declares the dataset images\' extension.', default='.jpg') + + kitti_parser = subparsers.add_parser('kitti') + kitti_parser.add_argument('kitti_path', help='Path to dataset directory (ie. 
/tmp/kitti).') + kitti_parser.add_argument('subset', help='Argument for loading a subset from train/val.') + + def csv_list(string): + return string.split(',') + + oid_parser = subparsers.add_parser('oid') + oid_parser.add_argument('main_dir', help='Path to dataset directory.') + oid_parser.add_argument('subset', help='Argument for loading a subset from train/validation/test.') + oid_parser.add_argument('--version', help='The current dataset version is v4.', default='v4') + oid_parser.add_argument('--labels-filter', help='A list of labels to filter.', type=csv_list, default=None) + oid_parser.add_argument('--annotation-cache-dir', help='Path to store annotation cache.', default='.') + oid_parser.add_argument('--parent-label', help='Use the hierarchy children of this label.', default=None) + + csv_parser = subparsers.add_parser('csv') + csv_parser.add_argument('annotations', help='Path to CSV file containing annotations for evaluation.') + csv_parser.add_argument('classes', help='Path to a CSV file containing class label mapping.') + + parser.add_argument('--no-resize', help='Disable image resizing.', dest='resize', action='store_false') + parser.add_argument('--anchors', help='Show positive anchors on the image.', action='store_true') + parser.add_argument('--display-name', help='Display image name on the bottom left corner.', action='store_true') + parser.add_argument('--annotations', help='Show annotations on the image. Green annotations have anchors, red annotations don\'t and therefore don\'t contribute to training.', action='store_true') + parser.add_argument('--random-transform', help='Randomly transform image and annotations.', action='store_true') + parser.add_argument('--image-min-side', help='Rescale the image so the smallest side is min_side.', type=int, default=800) + parser.add_argument('--image-max-side', help='Rescale the image if the largest side is larger than max_side.', type=int, default=1333) + parser.add_argument('--config', help='Path to a configuration parameters .ini file.') + parser.add_argument('--no-gui', help='Do not open a GUI window. Save images to an output directory instead.', action='store_true') + parser.add_argument('--output-dir', help='The output directory to save images to if --no-gui is specified.', default='.') + parser.add_argument('--flatten-output', help='Flatten the folder structure of saved output images into a single folder.', action='store_true') + + return parser.parse_args(args) + + +def run(generator, args, anchor_params): + """ Main loop. + + Args + generator: The generator to debug. + args: parseargs args object. 
+ """ + # display images, one at a time + i = 0 + while True: + # load the data + image = generator.load_image(i) + annotations = generator.load_annotations(i) + if len(annotations['labels']) > 0 : + # apply random transformations + if args.random_transform: + image, annotations = generator.random_transform_group_entry(image, annotations) + image, annotations = generator.random_visual_effect_group_entry(image, annotations) + + # resize the image and annotations + if args.resize: + image, image_scale = generator.resize_image(image) + annotations['bboxes'] *= image_scale + + anchors = anchors_for_shape(image.shape, anchor_params=anchor_params) + positive_indices, _, max_indices = compute_gt_annotations(anchors, annotations['bboxes']) + + # draw anchors on the image + if args.anchors: + draw_boxes(image, anchors[positive_indices], (255, 255, 0), thickness=1) + + # draw annotations on the image + if args.annotations: + # draw annotations in red + draw_annotations(image, annotations, color=(0, 0, 255), label_to_name=generator.label_to_name) + + # draw regressed anchors in green to override most red annotations + # result is that annotations without anchors are red, with anchors are green + draw_boxes(image, annotations['bboxes'][max_indices[positive_indices], :], (0, 255, 0)) + + # display name on the image + if args.display_name: + draw_caption(image, [0, image.shape[0]], os.path.basename(generator.image_path(i))) + + # write to file and advance if no-gui selected + if args.no_gui: + output_path = make_output_path(args.output_dir, generator.image_path(i), flatten=args.flatten_output) + os.makedirs(os.path.dirname(output_path), exist_ok=True) + cv2.imwrite(output_path, image) + i += 1 + if i == generator.size(): # have written all images + break + else: + continue + + # if we are using the GUI, then show an image + cv2.imshow('Image', image) + key = cv2.waitKeyEx() + + # press right for next image and left for previous (linux or windows, doesn't work for macOS) + # if you run macOS, press "n" or "m" (will also work on linux and windows) + + if key in rightkeys: + i = (i + 1) % generator.size() + if key in leftkeys: + i -= 1 + if i < 0: + i = generator.size() - 1 + + # press q or Esc to quit + if (key == ord('q')) or (key == 27): + return False + + return True + + +def make_output_path(output_dir, image_path, flatten = False): + """ Compute the output path for a debug image. """ + + # If the output hierarchy is flattened to a single folder, throw away all leading folders. + if flatten: + path = os.path.basename(image_path) + + # Otherwise, make sure absolute paths are taken relative to the filesystem root. + else: + # Make sure to drop drive letters on Windows, otherwise relpath wil fail. + _, path = os.path.splitdrive(image_path) + if os.path.isabs(path): + path = os.path.relpath(path, '/') + + # In all cases, append "_debug" to the filename, before the extension. + base, extension = os.path.splitext(path) + path = base + "_debug" + extension + + # Finally, join the whole thing to the output directory. 
+ return os.path.join(output_dir, path) + + +def main(args=None): + # parse arguments + if args is None: + args = sys.argv[1:] + args = parse_args(args) + + # make sure keras and tensorflow are the minimum required version + check_keras_version() + check_tf_version() + + # create the generator + generator = create_generator(args) + + # optionally load config parameters + if args.config: + args.config = read_config_file(args.config) + + # optionally load anchor parameters + anchor_params = None + if args.config and 'anchor_parameters' in args.config: + anchor_params = parse_anchor_parameters(args.config) + + # create the display window if necessary + if not args.no_gui: + cv2.namedWindow('Image', cv2.WINDOW_NORMAL) + + run(generator, args, anchor_params=anchor_params) + + +if __name__ == '__main__': + main() diff --git a/src/keras_retinanet/bin/evaluate.py b/src/keras_retinanet/bin/evaluate.py new file mode 100644 index 0000000..90c095c --- /dev/null +++ b/src/keras_retinanet/bin/evaluate.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python + +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import argparse +import os +import sys + +# Allow relative imports when being executed as script. +if __name__ == "__main__" and __package__ is None: + sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) + import keras_retinanet.bin # noqa: F401 + __package__ = "keras_retinanet.bin" + +# Change these to absolute imports if you copy this script outside the keras_retinanet package. +from .. import models +from ..preprocessing.csv_generator import CSVGenerator +from ..preprocessing.pascal_voc import PascalVocGenerator +from ..utils.anchors import make_shapes_callback +from ..utils.config import read_config_file, parse_anchor_parameters +from ..utils.eval import evaluate +from ..utils.gpu import setup_gpu +from ..utils.keras_version import check_keras_version +from ..utils.tf_version import check_tf_version + + +def create_generator(args, preprocess_image): + """ Create generators for evaluation. 
+ """ + common_args = { + 'preprocess_image': preprocess_image, + } + + if args.dataset_type == 'coco': + # import here to prevent unnecessary dependency on cocoapi + from ..preprocessing.coco import CocoGenerator + + validation_generator = CocoGenerator( + args.coco_path, + 'val2017', + image_min_side=args.image_min_side, + image_max_side=args.image_max_side, + config=args.config, + shuffle_groups=False, + **common_args + ) + elif args.dataset_type == 'pascal': + validation_generator = PascalVocGenerator( + args.pascal_path, + 'test', + image_extension=args.image_extension, + image_min_side=args.image_min_side, + image_max_side=args.image_max_side, + config=args.config, + shuffle_groups=False, + **common_args + ) + elif args.dataset_type == 'csv': + validation_generator = CSVGenerator( + args.annotations, + args.classes, + image_min_side=args.image_min_side, + image_max_side=args.image_max_side, + config=args.config, + shuffle_groups=False, + **common_args + ) + else: + raise ValueError('Invalid data type received: {}'.format(args.dataset_type)) + + return validation_generator + + +def parse_args(args): + """ Parse the arguments. + """ + parser = argparse.ArgumentParser(description='Evaluation script for a RetinaNet network.') + subparsers = parser.add_subparsers(help='Arguments for specific dataset types.', dest='dataset_type') + subparsers.required = True + + coco_parser = subparsers.add_parser('coco') + coco_parser.add_argument('coco_path', help='Path to dataset directory (ie. /tmp/COCO).') + + pascal_parser = subparsers.add_parser('pascal') + pascal_parser.add_argument('pascal_path', help='Path to dataset directory (ie. /tmp/VOCdevkit).') + pascal_parser.add_argument('--image-extension', help='Declares the dataset images\' extension.', default='.jpg') + + csv_parser = subparsers.add_parser('csv') + csv_parser.add_argument('annotations', help='Path to CSV file containing annotations for evaluation.') + csv_parser.add_argument('classes', help='Path to a CSV file containing class label mapping.') + + parser.add_argument('model', help='Path to RetinaNet model.') + parser.add_argument('--convert-model', help='Convert the model to an inference model (ie. 
the input is a training model).', action='store_true') + parser.add_argument('--backbone', help='The backbone of the model.', default='resnet50') + parser.add_argument('--gpu', help='Id of the GPU to use (as reported by nvidia-smi).', type=int) + parser.add_argument('--score-threshold', help='Threshold on score to filter detections with (defaults to 0.05).', default=0.05, type=float) + parser.add_argument('--iou-threshold', help='IoU Threshold to count for a positive detection (defaults to 0.5).', default=0.5, type=float) + parser.add_argument('--max-detections', help='Max Detections per image (defaults to 100).', default=100, type=int) + parser.add_argument('--save-path', help='Path for saving images with detections (doesn\'t work for COCO).') + parser.add_argument('--image-min-side', help='Rescale the image so the smallest side is min_side.', type=int, default=800) + parser.add_argument('--image-max-side', help='Rescale the image if the largest side is larger than max_side.', type=int, default=1333) + parser.add_argument('--config', help='Path to a configuration parameters .ini file (only used with --convert-model).') + + return parser.parse_args(args) + + +def main(args=None): + # parse arguments + if args is None: + args = sys.argv[1:] + args = parse_args(args) + + # make sure keras and tensorflow are the minimum required version + check_keras_version() + check_tf_version() + + # optionally choose specific GPU + if args.gpu: + setup_gpu(args.gpu) + + # make save path if it doesn't exist + if args.save_path is not None and not os.path.exists(args.save_path): + os.makedirs(args.save_path) + + # optionally load config parameters + if args.config: + args.config = read_config_file(args.config) + + # create the generator + backbone = models.backbone(args.backbone) + generator = create_generator(args, backbone.preprocess_image) + + # optionally load anchor parameters + anchor_params = None + if args.config and 'anchor_parameters' in args.config: + anchor_params = parse_anchor_parameters(args.config) + + # load the model + print('Loading model, this may take a second...') + model = models.load_model(args.model, backbone_name=args.backbone) + generator.compute_shapes = make_shapes_callback(model) + + # optionally convert the model + if args.convert_model: + model = models.convert_model(model, anchor_params=anchor_params) + + # print model summary + # print(model.summary()) + + # start evaluation + if args.dataset_type == 'coco': + from ..utils.coco_eval import evaluate_coco + evaluate_coco(generator, model, args.score_threshold) + else: + average_precisions, inference_time = evaluate( + generator, + model, + iou_threshold=args.iou_threshold, + score_threshold=args.score_threshold, + max_detections=args.max_detections, + save_path=args.save_path + ) + + # print evaluation + total_instances = [] + precisions = [] + for label, (average_precision, num_annotations) in average_precisions.items(): + print('{:.0f} instances of class'.format(num_annotations), + generator.label_to_name(label), 'with average precision: {:.4f}'.format(average_precision)) + total_instances.append(num_annotations) + precisions.append(average_precision) + + if sum(total_instances) == 0: + print('No test instances found.') + return + + print('Inference time for {:.0f} images: {:.4f}'.format(generator.size(), inference_time)) + + print('mAP using the weighted average of precisions among classes: {:.4f}'.format(sum([a * b for a, b in zip(total_instances, precisions)]) / sum(total_instances))) + print('mAP: 
{:.4f}'.format(sum(precisions) / sum(x > 0 for x in total_instances))) + + +if __name__ == '__main__': + main() diff --git a/src/keras_retinanet/bin/train.py b/src/keras_retinanet/bin/train.py new file mode 100644 index 0000000..128f01d --- /dev/null +++ b/src/keras_retinanet/bin/train.py @@ -0,0 +1,539 @@ +#!/usr/bin/env python + +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import argparse +import os +import sys +import warnings + +import keras +import keras.preprocessing.image +import tensorflow as tf + +# Allow relative imports when being executed as script. +if __name__ == "__main__" and __package__ is None: + sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) + import keras_retinanet.bin # noqa: F401 + __package__ = "keras_retinanet.bin" + +# Change these to absolute imports if you copy this script outside the keras_retinanet package. +from .. import layers # noqa: F401 +from .. import losses +from .. import models +from ..callbacks import RedirectModel +from ..callbacks.eval import Evaluate +from ..models.retinanet import retinanet_bbox +from ..preprocessing.csv_generator import CSVGenerator +from ..preprocessing.kitti import KittiGenerator +from ..preprocessing.open_images import OpenImagesGenerator +from ..preprocessing.pascal_voc import PascalVocGenerator +from ..utils.anchors import make_shapes_callback +from ..utils.config import read_config_file, parse_anchor_parameters +from ..utils.gpu import setup_gpu +from ..utils.image import random_visual_effect_generator +from ..utils.keras_version import check_keras_version +from ..utils.model import freeze as freeze_model +from ..utils.tf_version import check_tf_version +from ..utils.transform import random_transform_generator + + +def makedirs(path): + # Intended behavior: try to create the directory, + # pass if the directory exists already, fails otherwise. + # Meant for Python 2.7/3.n compatibility. + try: + os.makedirs(path) + except OSError: + if not os.path.isdir(path): + raise + + +def model_with_weights(model, weights, skip_mismatch): + """ Load weights for model. + + Args + model : The model to load weights for. + weights : The weights to load. + skip_mismatch : If True, skips layers whose shape of weights doesn't match with the model. + """ + if weights is not None: + model.load_weights(weights, by_name=True, skip_mismatch=skip_mismatch) + return model + + +def create_models(backbone_retinanet, num_classes, weights, multi_gpu=0, + freeze_backbone=False, lr=1e-5, config=None): + """ Creates three models (model, training_model, prediction_model). + + Args + backbone_retinanet : A function to call to create a retinanet model with a given backbone. + num_classes : The number of classes to train. + weights : The weights to load into the model. + multi_gpu : The number of GPUs to use for training. + freeze_backbone : If True, disables learning for the backbone. + config : Config parameters, None indicates the default configuration. + + Returns + model : The base model. 
This is also the model that is saved in snapshots. + training_model : The training model. If multi_gpu=0, this is identical to model. + prediction_model : The model wrapped with utility functions to perform object detection (applies regression values and performs NMS). + """ + + modifier = freeze_model if freeze_backbone else None + + # load anchor parameters, or pass None (so that defaults will be used) + anchor_params = None + num_anchors = None + if config and 'anchor_parameters' in config: + anchor_params = parse_anchor_parameters(config) + num_anchors = anchor_params.num_anchors() + + # Keras recommends initialising a multi-gpu model on the CPU to ease weight sharing, and to prevent OOM errors. + # optionally wrap in a parallel model + if multi_gpu > 1: + from keras.utils import multi_gpu_model + with tf.device('/cpu:0'): + model = model_with_weights(backbone_retinanet(num_classes, num_anchors=num_anchors, modifier=modifier), weights=weights, skip_mismatch=True) + training_model = multi_gpu_model(model, gpus=multi_gpu) + else: + model = model_with_weights(backbone_retinanet(num_classes, num_anchors=num_anchors, modifier=modifier), weights=weights, skip_mismatch=True) + training_model = model + + # make prediction model + prediction_model = retinanet_bbox(model=model, anchor_params=anchor_params) + + # compile model + training_model.compile( + loss={ + 'regression' : losses.smooth_l1(), + 'classification': losses.focal() + }, + optimizer=keras.optimizers.adam(lr=lr, clipnorm=0.001) + ) + + return model, training_model, prediction_model + + +def create_callbacks(model, training_model, prediction_model, validation_generator, args): + """ Creates the callbacks to use during training. + + Args + model: The base model. + training_model: The model that is used for training. + prediction_model: The model that should be used for validation. + validation_generator: The generator for creating validation data. + args: parseargs args object. + + Returns: + A list of callbacks used for training. + """ + callbacks = [] + + tensorboard_callback = None + + if args.tensorboard_dir: + makedirs(args.tensorboard_dir) + tensorboard_callback = keras.callbacks.TensorBoard( + log_dir = args.tensorboard_dir, + histogram_freq = 0, + batch_size = args.batch_size, + write_graph = True, + write_grads = False, + write_images = False, + embeddings_freq = 0, + embeddings_layer_names = None, + embeddings_metadata = None + ) + + if args.evaluation and validation_generator: + if args.dataset_type == 'coco': + from ..callbacks.coco import CocoEval + + # use prediction model for evaluation + evaluation = CocoEval(validation_generator, tensorboard=tensorboard_callback) + else: + evaluation = Evaluate(validation_generator, tensorboard=tensorboard_callback, weighted_average=args.weighted_average) + evaluation = RedirectModel(evaluation, prediction_model) + callbacks.append(evaluation) + + # save the model + if args.snapshots: + # ensure directory created first; otherwise h5py will error after epoch. 
+ makedirs(args.snapshot_path) + checkpoint = keras.callbacks.ModelCheckpoint( + os.path.join( + args.snapshot_path, + '{backbone}_{dataset_type}_{{epoch:02d}}.h5'.format(backbone=args.backbone, dataset_type=args.dataset_type) + ), + verbose=1, + # save_best_only=True, + # monitor="mAP", + # mode='max' + ) + checkpoint = RedirectModel(checkpoint, model) + callbacks.append(checkpoint) + + callbacks.append(keras.callbacks.ReduceLROnPlateau( + monitor = 'loss', + factor = args.reduce_lr_factor, + patience = args.reduce_lr_patience, + verbose = 1, + mode = 'auto', + min_delta = 0.0001, + cooldown = 0, + min_lr = 0 + )) + + callbacks.append(keras.callbacks.EarlyStopping( + monitor = 'mAP', + patience = 15, + mode = 'max', + min_delta = 0.01 + )) + + if args.tensorboard_dir: + callbacks.append(tensorboard_callback) + + return callbacks + + +def create_generators(args, preprocess_image): + """ Create generators for training and validation. + + Args + args : parseargs object containing configuration for generators. + preprocess_image : Function that preprocesses an image for the network. + """ + common_args = { + 'batch_size' : args.batch_size, + 'config' : args.config, + 'image_min_side' : args.image_min_side, + 'image_max_side' : args.image_max_side, + 'no_resize' : args.no_resize, + 'preprocess_image' : preprocess_image, + } + + # create random transform generator for augmenting training data + if args.random_transform: + transform_generator = random_transform_generator( + min_rotation=-0.1, + max_rotation=0.1, + min_translation=(-0.1, -0.1), + max_translation=(0.1, 0.1), + min_shear=-0.1, + max_shear=0.1, + min_scaling=(0.9, 0.9), + max_scaling=(1.1, 1.1), + flip_x_chance=0.5, + flip_y_chance=0.5, + ) + visual_effect_generator = random_visual_effect_generator( + contrast_range=(0.9, 1.1), + brightness_range=(-.1, .1), + hue_range=(-0.05, 0.05), + saturation_range=(0.95, 1.05) + ) + else: + transform_generator = random_transform_generator(flip_x_chance=0.5) + visual_effect_generator = None + + if args.dataset_type == 'coco': + # import here to prevent unnecessary dependency on cocoapi + from ..preprocessing.coco import CocoGenerator + + train_generator = CocoGenerator( + args.coco_path, + 'train2017', + transform_generator=transform_generator, + visual_effect_generator=visual_effect_generator, + **common_args + ) + + validation_generator = CocoGenerator( + args.coco_path, + 'val2017', + shuffle_groups=False, + **common_args + ) + elif args.dataset_type == 'pascal': + train_generator = PascalVocGenerator( + args.pascal_path, + 'train', + image_extension=args.image_extension, + transform_generator=transform_generator, + visual_effect_generator=visual_effect_generator, + **common_args + ) + + validation_generator = PascalVocGenerator( + args.pascal_path, + 'val', + image_extension=args.image_extension, + shuffle_groups=False, + **common_args + ) + elif args.dataset_type == 'csv': + train_generator = CSVGenerator( + args.annotations, + args.classes, + transform_generator=transform_generator, + visual_effect_generator=visual_effect_generator, + **common_args + ) + + if args.val_annotations: + validation_generator = CSVGenerator( + args.val_annotations, + args.classes, + shuffle_groups=False, + **common_args + ) + else: + validation_generator = None + elif args.dataset_type == 'oid': + train_generator = OpenImagesGenerator( + args.main_dir, + subset='train', + version=args.version, + labels_filter=args.labels_filter, + annotation_cache_dir=args.annotation_cache_dir, + parent_label=args.parent_label, 
+ transform_generator=transform_generator, + visual_effect_generator=visual_effect_generator, + **common_args + ) + + validation_generator = OpenImagesGenerator( + args.main_dir, + subset='validation', + version=args.version, + labels_filter=args.labels_filter, + annotation_cache_dir=args.annotation_cache_dir, + parent_label=args.parent_label, + shuffle_groups=False, + **common_args + ) + elif args.dataset_type == 'kitti': + train_generator = KittiGenerator( + args.kitti_path, + subset='train', + transform_generator=transform_generator, + visual_effect_generator=visual_effect_generator, + **common_args + ) + + validation_generator = KittiGenerator( + args.kitti_path, + subset='val', + shuffle_groups=False, + **common_args + ) + else: + raise ValueError('Invalid data type received: {}'.format(args.dataset_type)) + + return train_generator, validation_generator + + +def check_args(parsed_args): + """ Function to check for inherent contradictions within parsed arguments. + For example, batch_size < num_gpus + Intended to raise errors prior to backend initialisation. + + Args + parsed_args: parser.parse_args() + + Returns + parsed_args + """ + + if parsed_args.multi_gpu > 1 and parsed_args.batch_size < parsed_args.multi_gpu: + raise ValueError( + "Batch size ({}) must be equal to or higher than the number of GPUs ({})".format(parsed_args.batch_size, + parsed_args.multi_gpu)) + + if parsed_args.multi_gpu > 1 and parsed_args.snapshot: + raise ValueError( + "Multi GPU training ({}) and resuming from snapshots ({}) is not supported.".format(parsed_args.multi_gpu, + parsed_args.snapshot)) + + if parsed_args.multi_gpu > 1 and not parsed_args.multi_gpu_force: + raise ValueError("Multi-GPU support is experimental, use at own risk! Run with --multi-gpu-force if you wish to continue.") + + if 'resnet' not in parsed_args.backbone: + warnings.warn('Using experimental backbone {}. Only resnet50 has been properly tested.'.format(parsed_args.backbone)) + + return parsed_args + + +def parse_args(args): + """ Parse the arguments. + """ + parser = argparse.ArgumentParser(description='Simple training script for training a RetinaNet network.') + subparsers = parser.add_subparsers(help='Arguments for specific dataset types.', dest='dataset_type') + subparsers.required = True + + coco_parser = subparsers.add_parser('coco') + coco_parser.add_argument('coco_path', help='Path to dataset directory (ie. /tmp/COCO).') + + pascal_parser = subparsers.add_parser('pascal') + pascal_parser.add_argument('pascal_path', help='Path to dataset directory (ie. /tmp/VOCdevkit).') + pascal_parser.add_argument('--image-extension', help='Declares the dataset images\' extension.', default='.jpg') + + kitti_parser = subparsers.add_parser('kitti') + kitti_parser.add_argument('kitti_path', help='Path to dataset directory (ie. 
/tmp/kitti).') + + def csv_list(string): + return string.split(',') + + oid_parser = subparsers.add_parser('oid') + oid_parser.add_argument('main_dir', help='Path to dataset directory.') + oid_parser.add_argument('--version', help='The current dataset version is v4.', default='v4') + oid_parser.add_argument('--labels-filter', help='A list of labels to filter.', type=csv_list, default=None) + oid_parser.add_argument('--annotation-cache-dir', help='Path to store annotation cache.', default='.') + oid_parser.add_argument('--parent-label', help='Use the hierarchy children of this label.', default=None) + + csv_parser = subparsers.add_parser('csv') + csv_parser.add_argument('annotations', help='Path to CSV file containing annotations for training.') + csv_parser.add_argument('classes', help='Path to a CSV file containing class label mapping.') + csv_parser.add_argument('--val-annotations', help='Path to CSV file containing annotations for validation (optional).') + + group = parser.add_mutually_exclusive_group() + group.add_argument('--snapshot', help='Resume training from a snapshot.') + group.add_argument('--imagenet-weights', help='Initialize the model with pretrained imagenet weights. This is the default behaviour.', action='store_const', const=True, default=True) + group.add_argument('--weights', help='Initialize the model with weights from a file.') + group.add_argument('--no-weights', help='Don\'t initialize the model with any weights.', dest='imagenet_weights', action='store_const', const=False) + parser.add_argument('--backbone', help='Backbone model used by retinanet.', default='resnet50', type=str) + parser.add_argument('--batch-size', help='Size of the batches.', default=1, type=int) + parser.add_argument('--gpu', help='Id of the GPU to use (as reported by nvidia-smi).', type=int) + parser.add_argument('--multi-gpu', help='Number of GPUs to use for parallel processing.', type=int, default=0) + parser.add_argument('--multi-gpu-force', help='Extra flag needed to enable (experimental) multi-gpu support.', action='store_true') + parser.add_argument('--initial-epoch', help='Epoch from which to begin the train, useful if resuming from snapshot.', type=int, default=0) + parser.add_argument('--epochs', help='Number of epochs to train.', type=int, default=50) + parser.add_argument('--steps', help='Number of steps per epoch.', type=int, default=10000) + parser.add_argument('--lr', help='Learning rate.', type=float, default=1e-5) + parser.add_argument('--snapshot-path', help='Path to store snapshots of models during training (defaults to \'./snapshots\')', default='./snapshots') + parser.add_argument('--tensorboard-dir', help='Log directory for Tensorboard output', default='') # default='./logs') => https://github.com/tensorflow/tensorflow/pull/34870 + parser.add_argument('--no-snapshots', help='Disable saving snapshots.', dest='snapshots', action='store_false') + parser.add_argument('--no-evaluation', help='Disable per epoch evaluation.', dest='evaluation', action='store_false') + parser.add_argument('--freeze-backbone', help='Freeze training of backbone layers.', action='store_true') + parser.add_argument('--random-transform', help='Randomly transform image and annotations.', action='store_true') + parser.add_argument('--image-min-side', help='Rescale the image so the smallest side is min_side.', type=int, default=800) + parser.add_argument('--image-max-side', help='Rescale the image if the largest side is larger than max_side.', type=int, default=1333) + parser.add_argument('--no-resize', 
help='Don''t rescale the image.', action='store_true') + parser.add_argument('--config', help='Path to a configuration parameters .ini file.') + parser.add_argument('--weighted-average', help='Compute the mAP using the weighted average of precisions among classes.', action='store_true') + parser.add_argument('--compute-val-loss', help='Compute validation loss during training', dest='compute_val_loss', action='store_true') + parser.add_argument('--reduce-lr-patience', help='Reduce learning rate after validation loss decreases over reduce_lr_patience epochs', type=int, default=2) + parser.add_argument('--reduce-lr-factor', help='When learning rate is reduced due to reduce_lr_patience, multiply by reduce_lr_factor', type=float, default=0.1) + + # Fit generator arguments + parser.add_argument('--multiprocessing', help='Use multiprocessing in fit_generator.', action='store_true') + parser.add_argument('--workers', help='Number of generator workers.', type=int, default=1) + parser.add_argument('--max-queue-size', help='Queue length for multiprocessing workers in fit_generator.', type=int, default=10) + + return check_args(parser.parse_args(args)) + + +def main(args=None): + # parse arguments + if args is None: + args = sys.argv[1:] + args = parse_args(args) + + # create object that stores backbone information + backbone = models.backbone(args.backbone) + + # make sure keras and tensorflow are the minimum required version + check_keras_version() + check_tf_version() + + # optionally choose specific GPU + if args.gpu is not None: + setup_gpu(args.gpu) + + # optionally load config parameters + if args.config: + args.config = read_config_file(args.config) + + # create the generators + train_generator, validation_generator = create_generators(args, backbone.preprocess_image) + + # create the model + if args.snapshot is not None: + print('Loading model, this may take a second...') + model = models.load_model(args.snapshot, backbone_name=args.backbone) + training_model = model + anchor_params = None + if args.config and 'anchor_parameters' in args.config: + anchor_params = parse_anchor_parameters(args.config) + prediction_model = retinanet_bbox(model=model, anchor_params=anchor_params) + else: + weights = args.weights + # default to imagenet if nothing else is specified + if weights is None and args.imagenet_weights: + weights = backbone.download_imagenet() + + print('Creating model, this may take a second...') + model, training_model, prediction_model = create_models( + backbone_retinanet=backbone.retinanet, + num_classes=train_generator.num_classes(), + weights=weights, + multi_gpu=args.multi_gpu, + freeze_backbone=args.freeze_backbone, + lr=args.lr, + config=args.config + ) + + # print model summary + print(model.summary()) + + # this lets the generator compute backbone layer shapes using the actual backbone model + if 'vgg' in args.backbone or 'densenet' in args.backbone: + train_generator.compute_shapes = make_shapes_callback(model) + if validation_generator: + validation_generator.compute_shapes = train_generator.compute_shapes + + # create the callbacks + callbacks = create_callbacks( + model, + training_model, + prediction_model, + validation_generator, + args, + ) + + if not args.compute_val_loss: + validation_generator = None + + # start training + return training_model.fit_generator( + generator=train_generator, + steps_per_epoch=args.steps, + epochs=args.epochs, + verbose=1, + callbacks=callbacks, + workers=args.workers, + use_multiprocessing=args.multiprocessing, + 
max_queue_size=args.max_queue_size, + validation_data=validation_generator, + initial_epoch=args.initial_epoch + ) + + +if __name__ == '__main__': + main() diff --git a/src/keras_retinanet/callbacks/__init__.py b/src/keras_retinanet/callbacks/__init__.py new file mode 100644 index 0000000..7316c99 --- /dev/null +++ b/src/keras_retinanet/callbacks/__init__.py @@ -0,0 +1 @@ +from .common import * # noqa: F401,F403 diff --git a/src/keras_retinanet/callbacks/coco.py b/src/keras_retinanet/callbacks/coco.py new file mode 100644 index 0000000..7f9cc70 --- /dev/null +++ b/src/keras_retinanet/callbacks/coco.py @@ -0,0 +1,67 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from ..utils.coco_eval import evaluate_coco + + +class CocoEval(keras.callbacks.Callback): + """ Performs COCO evaluation on each epoch. + """ + def __init__(self, generator, tensorboard=None, threshold=0.05): + """ CocoEval callback intializer. + + Args + generator : The generator used for creating validation data. + tensorboard : If given, the results will be written to tensorboard. + threshold : The score threshold to use. + """ + self.generator = generator + self.threshold = threshold + self.tensorboard = tensorboard + + super(CocoEval, self).__init__() + + def on_epoch_end(self, epoch, logs=None): + logs = logs or {} + + coco_tag = ['AP @[ IoU=0.50:0.95 | area= all | maxDets=100 ]', + 'AP @[ IoU=0.50 | area= all | maxDets=100 ]', + 'AP @[ IoU=0.75 | area= all | maxDets=100 ]', + 'AP @[ IoU=0.50:0.95 | area= small | maxDets=100 ]', + 'AP @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]', + 'AP @[ IoU=0.50:0.95 | area= large | maxDets=100 ]', + 'AR @[ IoU=0.50:0.95 | area= all | maxDets= 1 ]', + 'AR @[ IoU=0.50:0.95 | area= all | maxDets= 10 ]', + 'AR @[ IoU=0.50:0.95 | area= all | maxDets=100 ]', + 'AR @[ IoU=0.50:0.95 | area= small | maxDets=100 ]', + 'AR @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]', + 'AR @[ IoU=0.50:0.95 | area= large | maxDets=100 ]'] + coco_eval_stats = evaluate_coco(self.generator, self.model, self.threshold) + + if coco_eval_stats is not None: + for index, result in enumerate(coco_eval_stats): + logs[coco_tag[index]] = result + + if self.tensorboard: + import tensorflow as tf + if tf.version.VERSION < '2.0.0' and self.tensorboard.writer: + summary = tf.Summary() + for index, result in enumerate(coco_eval_stats): + summary_value = summary.value.add() + summary_value.simple_value = result + summary_value.tag = '{}. {}'.format(index + 1, coco_tag[index]) + self.tensorboard.writer.add_summary(summary, epoch) diff --git a/src/keras_retinanet/callbacks/common.py b/src/keras_retinanet/callbacks/common.py new file mode 100644 index 0000000..67c00e1 --- /dev/null +++ b/src/keras_retinanet/callbacks/common.py @@ -0,0 +1,46 @@ +import keras.callbacks + + +class RedirectModel(keras.callbacks.Callback): + """Callback which wraps another callback, but executed on a different model. 
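+ + Typically used for multi-gpu training: checkpoint and evaluation callbacks should act on the original (template) model rather than the parallel wrapper, as in the example below.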
+ + ```python + model = keras.models.load_model('model.h5') + model_checkpoint = ModelCheckpoint(filepath='snapshot.h5') + parallel_model = multi_gpu_model(model, gpus=2) + parallel_model.fit(X_train, Y_train, callbacks=[RedirectModel(model_checkpoint, model)]) + ``` + + Args + callback : callback to wrap. + model : model to use when executing callbacks. + """ + + def __init__(self, + callback, + model): + super(RedirectModel, self).__init__() + + self.callback = callback + self.redirect_model = model + + def on_epoch_begin(self, epoch, logs=None): + self.callback.on_epoch_begin(epoch, logs=logs) + + def on_epoch_end(self, epoch, logs=None): + self.callback.on_epoch_end(epoch, logs=logs) + + def on_batch_begin(self, batch, logs=None): + self.callback.on_batch_begin(batch, logs=logs) + + def on_batch_end(self, batch, logs=None): + self.callback.on_batch_end(batch, logs=logs) + + def on_train_begin(self, logs=None): + # overwrite the model with our custom model + self.callback.set_model(self.redirect_model) + + self.callback.on_train_begin(logs=logs) + + def on_train_end(self, logs=None): + self.callback.on_train_end(logs=logs) diff --git a/src/keras_retinanet/callbacks/eval.py b/src/keras_retinanet/callbacks/eval.py new file mode 100644 index 0000000..abdc8bb --- /dev/null +++ b/src/keras_retinanet/callbacks/eval.py @@ -0,0 +1,98 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from ..utils.eval import evaluate + + +class Evaluate(keras.callbacks.Callback): + """ Evaluation callback for arbitrary datasets. + """ + + def __init__( + self, + generator, + iou_threshold=0.5, + score_threshold=0.05, + max_detections=100, + save_path=None, + tensorboard=None, + weighted_average=False, + verbose=1 + ): + """ Evaluate a given dataset using a given model at the end of every epoch during training. + + # Arguments + generator : The generator that represents the dataset to evaluate. + iou_threshold : The threshold used to consider when a detection is positive or negative. + score_threshold : The score confidence threshold to use for detections. + max_detections : The maximum number of detections to use per image. + save_path : The path to save images with visualized detections to. + tensorboard : Instance of keras.callbacks.TensorBoard used to log the mAP value. + weighted_average : Compute the mAP using the weighted average of precisions among classes. + verbose : Set the verbosity level, by default this is set to 1. 
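+ + Example (illustrative; variable names are placeholders): evaluation = Evaluate(validation_generator, tensorboard=tensorboard_callback, weighted_average=True)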
+ """ + self.generator = generator + self.iou_threshold = iou_threshold + self.score_threshold = score_threshold + self.max_detections = max_detections + self.save_path = save_path + self.tensorboard = tensorboard + self.weighted_average = weighted_average + self.verbose = verbose + + super(Evaluate, self).__init__() + + def on_epoch_end(self, epoch, logs=None): + logs = logs or {} + + # run evaluation + average_precisions, _ = evaluate( + self.generator, + self.model, + iou_threshold=self.iou_threshold, + score_threshold=self.score_threshold, + max_detections=self.max_detections, + save_path=self.save_path + ) + + # compute per class average precision + total_instances = [] + precisions = [] + for label, (average_precision, num_annotations) in average_precisions.items(): + if self.verbose == 1: + print('{:.0f} instances of class'.format(num_annotations), + self.generator.label_to_name(label), 'with average precision: {:.4f}'.format(average_precision)) + total_instances.append(num_annotations) + precisions.append(average_precision) + if self.weighted_average: + self.mean_ap = sum([a * b for a, b in zip(total_instances, precisions)]) / sum(total_instances) + else: + self.mean_ap = sum(precisions) / sum(x > 0 for x in total_instances) + + if self.tensorboard: + import tensorflow as tf + if tf.version.VERSION < '2.0.0' and self.tensorboard.writer: + summary = tf.Summary() + summary_value = summary.value.add() + summary_value.simple_value = self.mean_ap + summary_value.tag = "mAP" + self.tensorboard.writer.add_summary(summary, epoch) + + logs['mAP'] = self.mean_ap + + if self.verbose == 1: + print('mAP: {:.4f}'.format(self.mean_ap)) diff --git a/src/keras_retinanet/initializers.py b/src/keras_retinanet/initializers.py new file mode 100644 index 0000000..f41faf8 --- /dev/null +++ b/src/keras_retinanet/initializers.py @@ -0,0 +1,39 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras + +import numpy as np +import math + + +class PriorProbability(keras.initializers.Initializer): + """ Apply a prior probability to the weights. 
+ """ + + def __init__(self, probability=0.01): + self.probability = probability + + def get_config(self): + return { + 'probability': self.probability + } + + def __call__(self, shape, dtype=None): + # set bias to -log((1 - p)/p) for foreground + result = np.ones(shape, dtype=dtype) * -math.log((1 - self.probability) / self.probability) + + return result diff --git a/src/keras_retinanet/layers/__init__.py b/src/keras_retinanet/layers/__init__.py new file mode 100644 index 0000000..5a8c7d3 --- /dev/null +++ b/src/keras_retinanet/layers/__init__.py @@ -0,0 +1,2 @@ +from ._misc import RegressBoxes, UpsampleLike, Anchors, ClipBoxes # noqa: F401 +from .filter_detections import FilterDetections # noqa: F401 diff --git a/src/keras_retinanet/layers/_misc.py b/src/keras_retinanet/layers/_misc.py new file mode 100644 index 0000000..6fc19b0 --- /dev/null +++ b/src/keras_retinanet/layers/_misc.py @@ -0,0 +1,185 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from .. import backend +from ..utils import anchors as utils_anchors + +import numpy as np + + +class Anchors(keras.layers.Layer): + """ Keras layer for generating achors for a given shape. + """ + + def __init__(self, size, stride, ratios=None, scales=None, *args, **kwargs): + """ Initializer for an Anchors layer. + + Args + size: The base size of the anchors to generate. + stride: The stride of the anchors to generate. + ratios: The ratios of the anchors to generate (defaults to AnchorParameters.default.ratios). + scales: The scales of the anchors to generate (defaults to AnchorParameters.default.scales). 
+ """ + self.size = size + self.stride = stride + self.ratios = ratios + self.scales = scales + + if ratios is None: + self.ratios = utils_anchors.AnchorParameters.default.ratios + elif isinstance(ratios, list): + self.ratios = np.array(ratios) + if scales is None: + self.scales = utils_anchors.AnchorParameters.default.scales + elif isinstance(scales, list): + self.scales = np.array(scales) + + self.num_anchors = len(self.ratios) * len(self.scales) + self.anchors = keras.backend.variable(utils_anchors.generate_anchors( + base_size=self.size, + ratios=self.ratios, + scales=self.scales, + )) + + super(Anchors, self).__init__(*args, **kwargs) + + def call(self, inputs, **kwargs): + features = inputs + features_shape = keras.backend.shape(features) + + # generate proposals from bbox deltas and shifted anchors + if keras.backend.image_data_format() == 'channels_first': + anchors = backend.shift(features_shape[2:4], self.stride, self.anchors) + else: + anchors = backend.shift(features_shape[1:3], self.stride, self.anchors) + anchors = keras.backend.tile(keras.backend.expand_dims(anchors, axis=0), (features_shape[0], 1, 1)) + + return anchors + + def compute_output_shape(self, input_shape): + if None not in input_shape[1:]: + if keras.backend.image_data_format() == 'channels_first': + total = np.prod(input_shape[2:4]) * self.num_anchors + else: + total = np.prod(input_shape[1:3]) * self.num_anchors + + return (input_shape[0], total, 4) + else: + return (input_shape[0], None, 4) + + def get_config(self): + config = super(Anchors, self).get_config() + config.update({ + 'size' : self.size, + 'stride' : self.stride, + 'ratios' : self.ratios.tolist(), + 'scales' : self.scales.tolist(), + }) + + return config + + +class UpsampleLike(keras.layers.Layer): + """ Keras layer for upsampling a Tensor to be the same shape as another Tensor. + """ + + def call(self, inputs, **kwargs): + source, target = inputs + target_shape = keras.backend.shape(target) + if keras.backend.image_data_format() == 'channels_first': + source = backend.transpose(source, (0, 2, 3, 1)) + output = backend.resize_images(source, (target_shape[2], target_shape[3]), method='nearest') + output = backend.transpose(output, (0, 3, 1, 2)) + return output + else: + return backend.resize_images(source, (target_shape[1], target_shape[2]), method='nearest') + + def compute_output_shape(self, input_shape): + if keras.backend.image_data_format() == 'channels_first': + return (input_shape[0][0], input_shape[0][1]) + input_shape[1][2:4] + else: + return (input_shape[0][0],) + input_shape[1][1:3] + (input_shape[0][-1],) + + +class RegressBoxes(keras.layers.Layer): + """ Keras layer for applying regression values to boxes. + """ + + def __init__(self, mean=None, std=None, *args, **kwargs): + """ Initializer for the RegressBoxes layer. + + Args + mean: The mean value of the regression values which was used for normalization. + std: The standard value of the regression values which was used for normalization. + """ + if mean is None: + mean = np.array([0, 0, 0, 0]) + if std is None: + std = np.array([0.2, 0.2, 0.2, 0.2]) + + if isinstance(mean, (list, tuple)): + mean = np.array(mean) + elif not isinstance(mean, np.ndarray): + raise ValueError('Expected mean to be a np.ndarray, list or tuple. Received: {}'.format(type(mean))) + + if isinstance(std, (list, tuple)): + std = np.array(std) + elif not isinstance(std, np.ndarray): + raise ValueError('Expected std to be a np.ndarray, list or tuple. 
Received: {}'.format(type(std))) + + self.mean = mean + self.std = std + super(RegressBoxes, self).__init__(*args, **kwargs) + + def call(self, inputs, **kwargs): + anchors, regression = inputs + return backend.bbox_transform_inv(anchors, regression, mean=self.mean, std=self.std) + + def compute_output_shape(self, input_shape): + return input_shape[0] + + def get_config(self): + config = super(RegressBoxes, self).get_config() + config.update({ + 'mean': self.mean.tolist(), + 'std' : self.std.tolist(), + }) + + return config + + +class ClipBoxes(keras.layers.Layer): + """ Keras layer to clip box values to lie inside a given shape. + """ + def call(self, inputs, **kwargs): + image, boxes = inputs + shape = keras.backend.cast(keras.backend.shape(image), keras.backend.floatx()) + if keras.backend.image_data_format() == 'channels_first': + _, _, height, width = backend.unstack(shape, axis=0) + else: + _, height, width, _ = backend.unstack(shape, axis=0) + + x1, y1, x2, y2 = backend.unstack(boxes, axis=-1) + x1 = backend.clip_by_value(x1, 0, width - 1) + y1 = backend.clip_by_value(y1, 0, height - 1) + x2 = backend.clip_by_value(x2, 0, width - 1) + y2 = backend.clip_by_value(y2, 0, height - 1) + + return keras.backend.stack([x1, y1, x2, y2], axis=2) + + def compute_output_shape(self, input_shape): + return input_shape[1] diff --git a/src/keras_retinanet/layers/filter_detections.py b/src/keras_retinanet/layers/filter_detections.py new file mode 100644 index 0000000..f73e918 --- /dev/null +++ b/src/keras_retinanet/layers/filter_detections.py @@ -0,0 +1,223 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from .. import backend + + +def filter_detections( + boxes, + classification, + other = [], + class_specific_filter = True, + nms = True, + score_threshold = 0.05, + max_detections = 300, + nms_threshold = 0.5 +): + """ Filter detections using the boxes and classification values. + + Args + boxes : Tensor of shape (num_boxes, 4) containing the boxes in (x1, y1, x2, y2) format. + classification : Tensor of shape (num_boxes, num_classes) containing the classification scores. + other : List of tensors of shape (num_boxes, ...) to filter along with the boxes and classification scores. + class_specific_filter : Whether to perform filtering per class, or take the best scoring class and filter those. + nms : Flag to enable/disable non maximum suppression. + score_threshold : Threshold used to prefilter the boxes with. + max_detections : Maximum number of detections to keep. + nms_threshold : Threshold for the IoU value to determine when a box should be suppressed. + + Returns + A list of [boxes, scores, labels, other[0], other[1], ...]. + boxes is shaped (max_detections, 4) and contains the (x1, y1, x2, y2) of the non-suppressed boxes. + scores is shaped (max_detections,) and contains the scores of the predicted class. + labels is shaped (max_detections,) and contains the predicted label. + other[i] is shaped (max_detections, ...) 
and contains the filtered other[i] data. + In case there are less than max_detections detections, the tensors are padded with -1's. + """ + def _filter_detections(scores, labels): + # threshold based on score + indices = backend.where(keras.backend.greater(scores, score_threshold)) + + if nms: + filtered_boxes = backend.gather_nd(boxes, indices) + filtered_scores = keras.backend.gather(scores, indices)[:, 0] + + # perform NMS + nms_indices = backend.non_max_suppression(filtered_boxes, filtered_scores, max_output_size=max_detections, iou_threshold=nms_threshold) + + # filter indices based on NMS + indices = keras.backend.gather(indices, nms_indices) + + # add indices to list of all indices + labels = backend.gather_nd(labels, indices) + indices = keras.backend.stack([indices[:, 0], labels], axis=1) + + return indices + + if class_specific_filter: + all_indices = [] + # perform per class filtering + for c in range(int(classification.shape[1])): + scores = classification[:, c] + labels = c * backend.ones((keras.backend.shape(scores)[0],), dtype='int64') + all_indices.append(_filter_detections(scores, labels)) + + # concatenate indices to single tensor + indices = keras.backend.concatenate(all_indices, axis=0) + else: + scores = keras.backend.max(classification, axis = 1) + labels = keras.backend.argmax(classification, axis = 1) + indices = _filter_detections(scores, labels) + + # select top k + scores = backend.gather_nd(classification, indices) + labels = indices[:, 1] + scores, top_indices = backend.top_k(scores, k=keras.backend.minimum(max_detections, keras.backend.shape(scores)[0])) + + # filter input using the final set of indices + indices = keras.backend.gather(indices[:, 0], top_indices) + boxes = keras.backend.gather(boxes, indices) + labels = keras.backend.gather(labels, top_indices) + other_ = [keras.backend.gather(o, indices) for o in other] + + # zero pad the outputs + pad_size = keras.backend.maximum(0, max_detections - keras.backend.shape(scores)[0]) + boxes = backend.pad(boxes, [[0, pad_size], [0, 0]], constant_values=-1) + scores = backend.pad(scores, [[0, pad_size]], constant_values=-1) + labels = backend.pad(labels, [[0, pad_size]], constant_values=-1) + labels = keras.backend.cast(labels, 'int32') + other_ = [backend.pad(o, [[0, pad_size]] + [[0, 0] for _ in range(1, len(o.shape))], constant_values=-1) for o in other_] + + # set shapes, since we know what they are + boxes.set_shape([max_detections, 4]) + scores.set_shape([max_detections]) + labels.set_shape([max_detections]) + for o, s in zip(other_, [list(keras.backend.int_shape(o)) for o in other]): + o.set_shape([max_detections] + s[1:]) + + return [boxes, scores, labels] + other_ + + +class FilterDetections(keras.layers.Layer): + """ Keras layer for filtering detections using score threshold and NMS. + """ + + def __init__( + self, + nms = True, + class_specific_filter = True, + nms_threshold = 0.5, + score_threshold = 0.05, + max_detections = 300, + parallel_iterations = 32, + **kwargs + ): + """ Filters detections using score threshold, NMS and selecting the top-k detections. + + Args + nms : Flag to enable/disable NMS. + class_specific_filter : Whether to perform filtering per class, or take the best scoring class and filter those. + nms_threshold : Threshold for the IoU value to determine when a box should be suppressed. + score_threshold : Threshold used to prefilter the boxes with. + max_detections : Maximum number of detections to keep. + parallel_iterations : Number of batch items to process in parallel. 
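+ + With the defaults above, each image keeps at most 300 detections: boxes scoring below 0.05 are discarded, overlapping boxes with IoU > 0.5 are suppressed (per class when class_specific_filter is True), and the outputs are padded with -1's up to max_detections.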
+ """ + self.nms = nms + self.class_specific_filter = class_specific_filter + self.nms_threshold = nms_threshold + self.score_threshold = score_threshold + self.max_detections = max_detections + self.parallel_iterations = parallel_iterations + super(FilterDetections, self).__init__(**kwargs) + + def call(self, inputs, **kwargs): + """ Constructs the NMS graph. + + Args + inputs : List of [boxes, classification, other[0], other[1], ...] tensors. + """ + boxes = inputs[0] + classification = inputs[1] + other = inputs[2:] + + # wrap nms with our parameters + def _filter_detections(args): + boxes = args[0] + classification = args[1] + other = args[2] + + return filter_detections( + boxes, + classification, + other, + nms = self.nms, + class_specific_filter = self.class_specific_filter, + score_threshold = self.score_threshold, + max_detections = self.max_detections, + nms_threshold = self.nms_threshold, + ) + + # call filter_detections on each batch + outputs = backend.map_fn( + _filter_detections, + elems=[boxes, classification, other], + dtype=[keras.backend.floatx(), keras.backend.floatx(), 'int32'] + [o.dtype for o in other], + parallel_iterations=self.parallel_iterations + ) + + return outputs + + def compute_output_shape(self, input_shape): + """ Computes the output shapes given the input shapes. + + Args + input_shape : List of input shapes [boxes, classification, other[0], other[1], ...]. + + Returns + List of tuples representing the output shapes: + [filtered_boxes.shape, filtered_scores.shape, filtered_labels.shape, filtered_other[0].shape, filtered_other[1].shape, ...] + """ + return [ + (input_shape[0][0], self.max_detections, 4), + (input_shape[1][0], self.max_detections), + (input_shape[1][0], self.max_detections), + ] + [ + tuple([input_shape[i][0], self.max_detections] + list(input_shape[i][2:])) for i in range(2, len(input_shape)) + ] + + def compute_mask(self, inputs, mask=None): + """ This is required in Keras when there is more than 1 output. + """ + return (len(inputs) + 1) * [None] + + def get_config(self): + """ Gets the configuration of this layer. + + Returns + Dictionary containing the parameters of this layer. + """ + config = super(FilterDetections, self).get_config() + config.update({ + 'nms' : self.nms, + 'class_specific_filter' : self.class_specific_filter, + 'nms_threshold' : self.nms_threshold, + 'score_threshold' : self.score_threshold, + 'max_detections' : self.max_detections, + 'parallel_iterations' : self.parallel_iterations, + }) + + return config diff --git a/src/keras_retinanet/losses.py b/src/keras_retinanet/losses.py new file mode 100644 index 0000000..382a975 --- /dev/null +++ b/src/keras_retinanet/losses.py @@ -0,0 +1,118 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from . import backend + + +def focal(alpha=0.25, gamma=2.0, cutoff=0.5): + """ Create a functor for computing the focal loss. + + Args + alpha: Scale the focal weight with alpha. 
+ gamma: Take the power of the focal weight with gamma. + cutoff: Positive prediction cutoff for soft targets + + Returns + A functor that computes the focal loss using the alpha and gamma. + """ + def _focal(y_true, y_pred): + """ Compute the focal loss given the target tensor and the predicted tensor. + + As defined in https://arxiv.org/abs/1708.02002 + + Args + y_true: Tensor of target data from the generator with shape (B, N, num_classes). + y_pred: Tensor of predicted data from the network with shape (B, N, num_classes). + + Returns + The focal loss of y_pred w.r.t. y_true. + """ + labels = y_true[:, :, :-1] + anchor_state = y_true[:, :, -1] # -1 for ignore, 0 for background, 1 for object + classification = y_pred + + # filter out "ignore" anchors + indices = backend.where(keras.backend.not_equal(anchor_state, -1)) + labels = backend.gather_nd(labels, indices) + classification = backend.gather_nd(classification, indices) + + # compute the focal loss + alpha_factor = keras.backend.ones_like(labels) * alpha + alpha_factor = backend.where(keras.backend.greater(labels, cutoff), alpha_factor, 1 - alpha_factor) + focal_weight = backend.where(keras.backend.greater(labels, cutoff), 1 - classification, classification) + focal_weight = alpha_factor * focal_weight ** gamma + + cls_loss = focal_weight * keras.backend.binary_crossentropy(labels, classification) + + # compute the normalizer: the number of positive anchors + normalizer = backend.where(keras.backend.equal(anchor_state, 1)) + normalizer = keras.backend.cast(keras.backend.shape(normalizer)[0], keras.backend.floatx()) + normalizer = keras.backend.maximum(keras.backend.cast_to_floatx(1.0), normalizer) + + return keras.backend.sum(cls_loss) / normalizer + + return _focal + + +def smooth_l1(sigma=3.0): + """ Create a smooth L1 loss functor. + + Args + sigma: This argument defines the point where the loss changes from L2 to L1. + + Returns + A functor for computing the smooth L1 loss given target data and predicted data. + """ + sigma_squared = sigma ** 2 + + def _smooth_l1(y_true, y_pred): + """ Compute the smooth L1 loss of y_pred w.r.t. y_true. + + Args + y_true: Tensor from the generator of shape (B, N, 5). The last value for each box is the state of the anchor (ignore, negative, positive). + y_pred: Tensor from the network of shape (B, N, 4). + + Returns + The smooth L1 loss of y_pred w.r.t. y_true. 
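+ + For illustration, with the default sigma=3 (sigma_squared=9) the loss is quadratic for |x| < 1/9 and linear above: |x| = 0.05 gives 0.5 * 9 * 0.05**2 = 0.01125, while |x| = 0.5 gives 0.5 - 0.5 / 9 ~= 0.444.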
+ """ + # separate target and state + regression = y_pred + regression_target = y_true[:, :, :-1] + anchor_state = y_true[:, :, -1] + + # filter out "ignore" anchors + indices = backend.where(keras.backend.equal(anchor_state, 1)) + regression = backend.gather_nd(regression, indices) + regression_target = backend.gather_nd(regression_target, indices) + + # compute smooth L1 loss + # f(x) = 0.5 * (sigma * x)^2 if |x| < 1 / sigma / sigma + # |x| - 0.5 / sigma / sigma otherwise + regression_diff = regression - regression_target + regression_diff = keras.backend.abs(regression_diff) + regression_loss = backend.where( + keras.backend.less(regression_diff, 1.0 / sigma_squared), + 0.5 * sigma_squared * keras.backend.pow(regression_diff, 2), + regression_diff - 0.5 / sigma_squared + ) + + # compute the normalizer: the number of positive anchors + normalizer = keras.backend.maximum(1, keras.backend.shape(indices)[0]) + normalizer = keras.backend.cast(normalizer, dtype=keras.backend.floatx()) + return keras.backend.sum(regression_loss) / normalizer + + return _smooth_l1 diff --git a/src/keras_retinanet/models/__init__.py b/src/keras_retinanet/models/__init__.py new file mode 100644 index 0000000..3b05ca8 --- /dev/null +++ b/src/keras_retinanet/models/__init__.py @@ -0,0 +1,125 @@ +from __future__ import print_function +import sys + + +class Backbone(object): + """ This class stores additional information on backbones. + """ + def __init__(self, backbone): + # a dictionary mapping custom layer names to the correct classes + from .. import layers + from .. import losses + from .. import initializers + self.custom_objects = { + 'UpsampleLike' : layers.UpsampleLike, + 'PriorProbability' : initializers.PriorProbability, + 'RegressBoxes' : layers.RegressBoxes, + 'FilterDetections' : layers.FilterDetections, + 'Anchors' : layers.Anchors, + 'ClipBoxes' : layers.ClipBoxes, + '_smooth_l1' : losses.smooth_l1(), + '_focal' : losses.focal(), + } + + self.backbone = backbone + self.validate() + + def retinanet(self, *args, **kwargs): + """ Returns a retinanet model using the correct backbone. + """ + raise NotImplementedError('retinanet method not implemented.') + + def download_imagenet(self): + """ Downloads ImageNet weights and returns path to weights file. + """ + raise NotImplementedError('download_imagenet method not implemented.') + + def validate(self): + """ Checks whether the backbone string is correct. + """ + raise NotImplementedError('validate method not implemented.') + + def preprocess_image(self, inputs): + """ Takes as input an image and prepares it for being passed through the network. + Having this function in Backbone allows other backbones to define a specific preprocessing step. + """ + raise NotImplementedError('preprocess_image method not implemented.') + + +def backbone(backbone_name): + """ Returns a backbone object for the given backbone. 
+ """ + if 'densenet' in backbone_name: + from .densenet import DenseNetBackbone as b + elif 'seresnext' in backbone_name or 'seresnet' in backbone_name or 'senet' in backbone_name: + from .senet import SeBackbone as b + elif 'resnet' in backbone_name: + from .resnet import ResNetBackbone as b + elif 'mobilenet' in backbone_name: + from .mobilenet import MobileNetBackbone as b + elif 'vgg' in backbone_name: + from .vgg import VGGBackbone as b + elif 'EfficientNet' in backbone_name: + from .effnet import EfficientNetBackbone as b + else: + raise NotImplementedError('Backbone class for \'{}\' not implemented.'.format(backbone)) + + return b(backbone_name) + + +def load_model(filepath, backbone_name='resnet50'): + """ Loads a retinanet model using the correct custom objects. + + Args + filepath: one of the following: + - string, path to the saved model, or + - h5py.File object from which to load the model + backbone_name : Backbone with which the model was trained. + + Returns + A keras.models.Model object. + + Raises + ImportError: if h5py is not available. + ValueError: In case of an invalid savefile. + """ + import keras.models + return keras.models.load_model(filepath, custom_objects=backbone(backbone_name).custom_objects) + + +def convert_model(model, nms=True, class_specific_filter=True, anchor_params=None, **kwargs): + """ Converts a training model to an inference model. + + Args + model : A retinanet training model. + nms : Boolean, whether to add NMS filtering to the converted model. + class_specific_filter : Whether to use class specific filtering or filter for the best scoring class only. + anchor_params : Anchor parameters object. If omitted, default values are used. + **kwargs : Inference and minimal retinanet model settings. + + Returns + A keras.models.Model object. + + Raises + ImportError: if h5py is not available. + ValueError: In case of an invalid savefile. + """ + from .retinanet import retinanet_bbox + return retinanet_bbox(model=model, nms=nms, class_specific_filter=class_specific_filter, anchor_params=anchor_params, **kwargs) + + +def assert_training_model(model): + """ Assert that the model is a training model. + """ + assert(all(output in model.output_names for output in ['regression', 'classification'])), \ + "Input is not a training model (no 'regression' and 'classification' outputs were found, outputs are: {}).".format(model.output_names) + + +def check_training_model(model): + """ Check that model is a training model and exit otherwise. + """ + try: + assert_training_model(model) + except AssertionError as e: + print(e, file=sys.stderr) + sys.exit(1) diff --git a/src/keras_retinanet/models/densenet.py b/src/keras_retinanet/models/densenet.py new file mode 100644 index 0000000..c3aafd0 --- /dev/null +++ b/src/keras_retinanet/models/densenet.py @@ -0,0 +1,105 @@ +""" +Copyright 2018 vidosits (https://github.com/vidosits/) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from keras.applications import densenet +from keras.utils import get_file + +from . 
import retinanet +from . import Backbone +from ..utils.image import preprocess_image + + +allowed_backbones = { + 'densenet121': ([6, 12, 24, 16], densenet.DenseNet121), + 'densenet169': ([6, 12, 32, 32], densenet.DenseNet169), + 'densenet201': ([6, 12, 48, 32], densenet.DenseNet201), +} + + +class DenseNetBackbone(Backbone): + """ Describes backbone information and provides utility functions. + """ + + def retinanet(self, *args, **kwargs): + """ Returns a retinanet model using the correct backbone. + """ + return densenet_retinanet(*args, backbone=self.backbone, **kwargs) + + def download_imagenet(self): + """ Download pre-trained weights for the specified backbone name. + This name is in the format {backbone}_weights_tf_dim_ordering_tf_kernels_notop + where backbone is the densenet + number of layers (e.g. densenet121). + For more info check the explanation from the keras densenet script itself: + https://github.com/keras-team/keras/blob/master/keras/applications/densenet.py + """ + origin = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/' + file_name = '{}_weights_tf_dim_ordering_tf_kernels_notop.h5' + + # load weights + if keras.backend.image_data_format() == 'channels_first': + raise ValueError('Weights for "channels_first" format are not available.') + + weights_url = origin + file_name.format(self.backbone) + return get_file(file_name.format(self.backbone), weights_url, cache_subdir='models') + + def validate(self): + """ Checks whether the backbone string is correct. + """ + backbone = self.backbone.split('_')[0] + + if backbone not in allowed_backbones: + raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(backbone, allowed_backbones.keys())) + + def preprocess_image(self, inputs): + """ Takes as input an image and prepares it for being passed through the network. + """ + return preprocess_image(inputs, mode='tf') + + +def densenet_retinanet(num_classes, backbone='densenet121', inputs=None, modifier=None, **kwargs): + """ Constructs a retinanet model using a densenet backbone. + + Args + num_classes: Number of classes to predict. + backbone: Which backbone to use (one of ('densenet121', 'densenet169', 'densenet201')). + inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)). + modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example). + + Returns + RetinaNet model with a DenseNet backbone. 
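+ + Note: the outputs of the last three dense blocks (layer_outputs[1:] below) are used as the C3, C4 and C5 feature stages that feed the feature pyramid.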
+ """ + # choose default input + if inputs is None: + inputs = keras.layers.Input((None, None, 3)) + + blocks, creator = allowed_backbones[backbone] + model = creator(input_tensor=inputs, include_top=False, pooling=None, weights=None) + + # get last conv layer from the end of each dense block + layer_outputs = [model.get_layer(name='conv{}_block{}_concat'.format(idx + 2, block_num)).output for idx, block_num in enumerate(blocks)] + + # create the densenet backbone + model = keras.models.Model(inputs=inputs, outputs=layer_outputs[1:], name=model.name) + + # invoke modifier if given + if modifier: + model = modifier(model) + + # create the full model + model = retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=model.outputs, **kwargs) + + return model diff --git a/src/keras_retinanet/models/effnet.py b/src/keras_retinanet/models/effnet.py new file mode 100644 index 0000000..12591ad --- /dev/null +++ b/src/keras_retinanet/models/effnet.py @@ -0,0 +1,153 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from keras.utils import get_file + +from . import retinanet +from . import Backbone +import efficientnet.keras as efn + + +class EfficientNetBackbone(Backbone): + """ Describes backbone information and provides utility functions. + """ + + def __init__(self, backbone): + super(EfficientNetBackbone, self).__init__(backbone) + self.preprocess_image_func = None + + def retinanet(self, *args, **kwargs): + """ Returns a retinanet model using the correct backbone. + """ + return effnet_retinanet(*args, backbone=self.backbone, **kwargs) + + def download_imagenet(self): + """ Downloads ImageNet weights and returns path to weights file. + """ + from efficientnet.weights import IMAGENET_WEIGHTS_PATH + from efficientnet.weights import IMAGENET_WEIGHTS_HASHES + + model_name = 'efficientnet-b' + self.backbone[-1] + file_name = model_name + '_weights_tf_dim_ordering_tf_kernels_autoaugment_notop.h5' + file_hash = IMAGENET_WEIGHTS_HASHES[model_name][1] + weights_path = get_file(file_name, IMAGENET_WEIGHTS_PATH + file_name, cache_subdir='models', file_hash=file_hash) + return weights_path + + def validate(self): + """ Checks whether the backbone string is correct. + """ + allowed_backbones = ['EfficientNetB0', 'EfficientNetB1', 'EfficientNetB2', 'EfficientNetB3', 'EfficientNetB4', + 'EfficientNetB5', 'EfficientNetB6', 'EfficientNetB7'] + backbone = self.backbone.split('_')[0] + + if backbone not in allowed_backbones: + raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(backbone, allowed_backbones)) + + def preprocess_image(self, inputs): + """ Takes as input an image and prepares it for being passed through the network. + """ + return efn.preprocess_input(inputs) + + +def effnet_retinanet(num_classes, backbone='EfficientNetB0', inputs=None, modifier=None, **kwargs): + """ Constructs a retinanet model using a resnet backbone. + + Args + num_classes: Number of classes to predict. 
+ backbone: Which backbone to use (one of ('resnet50', 'resnet101', 'resnet152')). + inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)). + modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example). + + Returns + RetinaNet model with a ResNet backbone. + """ + # choose default input + if inputs is None: + if keras.backend.image_data_format() == 'channels_first': + inputs = keras.layers.Input(shape=(3, None, None)) + else: + # inputs = keras.layers.Input(shape=(224, 224, 3)) + inputs = keras.layers.Input(shape=(None, None, 3)) + + # get last conv layer from the end of each block [28x28, 14x14, 7x7] + if backbone == 'EfficientNetB0': + model = efn.EfficientNetB0(input_tensor=inputs, include_top=False, weights=None) + elif backbone == 'EfficientNetB1': + model = efn.EfficientNetB1(input_tensor=inputs, include_top=False, weights=None) + elif backbone == 'EfficientNetB2': + model = efn.EfficientNetB2(input_tensor=inputs, include_top=False, weights=None) + elif backbone == 'EfficientNetB3': + model = efn.EfficientNetB3(input_tensor=inputs, include_top=False, weights=None) + elif backbone == 'EfficientNetB4': + model = efn.EfficientNetB4(input_tensor=inputs, include_top=False, weights=None) + elif backbone == 'EfficientNetB5': + model = efn.EfficientNetB5(input_tensor=inputs, include_top=False, weights=None) + elif backbone == 'EfficientNetB6': + model = efn.EfficientNetB6(input_tensor=inputs, include_top=False, weights=None) + elif backbone == 'EfficientNetB7': + model = efn.EfficientNetB7(input_tensor=inputs, include_top=False, weights=None) + else: + raise ValueError('Backbone (\'{}\') is invalid.'.format(backbone)) + + layer_outputs = ['block4a_expand_activation', 'block6a_expand_activation', 'top_activation'] + + layer_outputs = [ + model.get_layer(name=layer_outputs[0]).output, # 28x28 + model.get_layer(name=layer_outputs[1]).output, # 14x14 + model.get_layer(name=layer_outputs[2]).output, # 7x7 + ] + # create the densenet backbone + model = keras.models.Model(inputs=inputs, outputs=layer_outputs, name=model.name) + + # invoke modifier if given + if modifier: + model = modifier(model) + + # create the full model + return retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=model.outputs, **kwargs) + + +def EfficientNetB0_retinanet(num_classes, inputs=None, **kwargs): + return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB0', inputs=inputs, **kwargs) + + +def EfficientNetB1_retinanet(num_classes, inputs=None, **kwargs): + return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB1', inputs=inputs, **kwargs) + + +def EfficientNetB2_retinanet(num_classes, inputs=None, **kwargs): + return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB2', inputs=inputs, **kwargs) + + +def EfficientNetB3_retinanet(num_classes, inputs=None, **kwargs): + return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB3', inputs=inputs, **kwargs) + + +def EfficientNetB4_retinanet(num_classes, inputs=None, **kwargs): + return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB4', inputs=inputs, **kwargs) + + +def EfficientNetB5_retinanet(num_classes, inputs=None, **kwargs): + return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB5', inputs=inputs, **kwargs) + + +def EfficientNetB6_retinanet(num_classes, inputs=None, **kwargs): + return effnet_retinanet(num_classes=num_classes, 
backbone='EfficientNetB6', inputs=inputs, **kwargs) + + +def EfficientNetB7_retinanet(num_classes, inputs=None, **kwargs): + return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB7', inputs=inputs, **kwargs) diff --git a/src/keras_retinanet/models/mobilenet.py b/src/keras_retinanet/models/mobilenet.py new file mode 100644 index 0000000..4a3850b --- /dev/null +++ b/src/keras_retinanet/models/mobilenet.py @@ -0,0 +1,109 @@ +""" +Copyright 2017-2018 lvaleriu (https://github.com/lvaleriu/) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from keras.applications import mobilenet +from keras.utils import get_file +from ..utils.image import preprocess_image + +from . import retinanet +from . import Backbone + + +class MobileNetBackbone(Backbone): + """ Describes backbone information and provides utility functions. + """ + + allowed_backbones = ['mobilenet128', 'mobilenet160', 'mobilenet192', 'mobilenet224'] + + def retinanet(self, *args, **kwargs): + """ Returns a retinanet model using the correct backbone. + """ + return mobilenet_retinanet(*args, backbone=self.backbone, **kwargs) + + def download_imagenet(self): + """ Download pre-trained weights for the specified backbone name. + This name is in the format mobilenet{rows}_{alpha} where rows is the + imagenet shape dimension and 'alpha' controls the width of the network. + For more info check the explanation from the keras mobilenet script itself. + """ + + alpha = float(self.backbone.split('_')[1]) + rows = int(self.backbone.split('_')[0].replace('mobilenet', '')) + + # load weights + if keras.backend.image_data_format() == 'channels_first': + raise ValueError('Weights for "channels_last" format ' + 'are not available.') + if alpha == 1.0: + alpha_text = '1_0' + elif alpha == 0.75: + alpha_text = '7_5' + elif alpha == 0.50: + alpha_text = '5_0' + else: + alpha_text = '2_5' + + model_name = 'mobilenet_{}_{}_tf_no_top.h5'.format(alpha_text, rows) + weights_url = mobilenet.mobilenet.BASE_WEIGHT_PATH + model_name + weights_path = get_file(model_name, weights_url, cache_subdir='models') + + return weights_path + + def validate(self): + """ Checks whether the backbone string is correct. + """ + backbone = self.backbone.split('_')[0] + + if backbone not in MobileNetBackbone.allowed_backbones: + raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(backbone, MobileNetBackbone.allowed_backbones)) + + def preprocess_image(self, inputs): + """ Takes as input an image and prepares it for being passed through the network. + """ + return preprocess_image(inputs, mode='tf') + + +def mobilenet_retinanet(num_classes, backbone='mobilenet224_1.0', inputs=None, modifier=None, **kwargs): + """ Constructs a retinanet model using a mobilenet backbone. + + Args + num_classes: Number of classes to predict. + backbone: Which backbone to use (one of ('mobilenet128', 'mobilenet160', 'mobilenet192', 'mobilenet224')). + inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)). 
+ modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example). + + Returns + RetinaNet model with a MobileNet backbone. + """ + alpha = float(backbone.split('_')[1]) + + # choose default input + if inputs is None: + inputs = keras.layers.Input((None, None, 3)) + + backbone = mobilenet.MobileNet(input_tensor=inputs, alpha=alpha, include_top=False, pooling=None, weights=None) + + # create the full model + layer_names = ['conv_pw_5_relu', 'conv_pw_11_relu', 'conv_pw_13_relu'] + layer_outputs = [backbone.get_layer(name).output for name in layer_names] + backbone = keras.models.Model(inputs=inputs, outputs=layer_outputs, name=backbone.name) + + # invoke modifier if given + if modifier: + backbone = modifier(backbone) + + return retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=backbone.outputs, **kwargs) diff --git a/src/keras_retinanet/models/resnet.py b/src/keras_retinanet/models/resnet.py new file mode 100644 index 0000000..3ed555d --- /dev/null +++ b/src/keras_retinanet/models/resnet.py @@ -0,0 +1,124 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from keras.utils import get_file +import keras_resnet +import keras_resnet.models + +from . import retinanet +from . import Backbone +from ..utils.image import preprocess_image + + +class ResNetBackbone(Backbone): + """ Describes backbone information and provides utility functions. + """ + + def __init__(self, backbone): + super(ResNetBackbone, self).__init__(backbone) + self.custom_objects.update(keras_resnet.custom_objects) + + def retinanet(self, *args, **kwargs): + """ Returns a retinanet model using the correct backbone. + """ + return resnet_retinanet(*args, backbone=self.backbone, **kwargs) + + def download_imagenet(self): + """ Downloads ImageNet weights and returns path to weights file. + """ + resnet_filename = 'ResNet-{}-model.keras.h5' + resnet_resource = 'https://github.com/fizyr/keras-models/releases/download/v0.0.1/{}'.format(resnet_filename) + depth = int(self.backbone.replace('resnet', '')) + + filename = resnet_filename.format(depth) + resource = resnet_resource.format(depth) + if depth == 50: + checksum = '3e9f4e4f77bbe2c9bec13b53ee1c2319' + elif depth == 101: + checksum = '05dc86924389e5b401a9ea0348a3213c' + elif depth == 152: + checksum = '6ee11ef2b135592f8031058820bb9e71' + + return get_file( + filename, + resource, + cache_subdir='models', + md5_hash=checksum + ) + + def validate(self): + """ Checks whether the backbone string is correct. + """ + allowed_backbones = ['resnet50', 'resnet101', 'resnet152'] + backbone = self.backbone.split('_')[0] + + if backbone not in allowed_backbones: + raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(backbone, allowed_backbones)) + + def preprocess_image(self, inputs): + """ Takes as input an image and prepares it for being passed through the network. 
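+ + For ResNet backbones this applies 'caffe'-style preprocessing: channels are reordered to BGR and the ImageNet channel means are subtracted, without scaling to [0, 1].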
+ """ + return preprocess_image(inputs, mode='caffe') + + +def resnet_retinanet(num_classes, backbone='resnet50', inputs=None, modifier=None, **kwargs): + """ Constructs a retinanet model using a resnet backbone. + + Args + num_classes: Number of classes to predict. + backbone: Which backbone to use (one of ('resnet50', 'resnet101', 'resnet152')). + inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)). + modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example). + + Returns + RetinaNet model with a ResNet backbone. + """ + # choose default input + if inputs is None: + if keras.backend.image_data_format() == 'channels_first': + inputs = keras.layers.Input(shape=(3, None, None)) + else: + inputs = keras.layers.Input(shape=(None, None, 3)) + + # create the resnet backbone + if backbone == 'resnet50': + resnet = keras_resnet.models.ResNet50(inputs, include_top=False, freeze_bn=True) + elif backbone == 'resnet101': + resnet = keras_resnet.models.ResNet101(inputs, include_top=False, freeze_bn=True) + elif backbone == 'resnet152': + resnet = keras_resnet.models.ResNet152(inputs, include_top=False, freeze_bn=True) + else: + raise ValueError('Backbone (\'{}\') is invalid.'.format(backbone)) + + # invoke modifier if given + if modifier: + resnet = modifier(resnet) + + # create the full model + return retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=resnet.outputs[1:], **kwargs) + + +def resnet50_retinanet(num_classes, inputs=None, **kwargs): + return resnet_retinanet(num_classes=num_classes, backbone='resnet50', inputs=inputs, **kwargs) + + +def resnet101_retinanet(num_classes, inputs=None, **kwargs): + return resnet_retinanet(num_classes=num_classes, backbone='resnet101', inputs=inputs, **kwargs) + + +def resnet152_retinanet(num_classes, inputs=None, **kwargs): + return resnet_retinanet(num_classes=num_classes, backbone='resnet152', inputs=inputs, **kwargs) diff --git a/src/keras_retinanet/models/retinanet.py b/src/keras_retinanet/models/retinanet.py new file mode 100644 index 0000000..b0065bb --- /dev/null +++ b/src/keras_retinanet/models/retinanet.py @@ -0,0 +1,364 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from .. import initializers +from .. import layers +from ..utils.anchors import AnchorParameters +from . import assert_training_model + + +def default_classification_model( + num_classes, + num_anchors, + pyramid_feature_size=256, + prior_probability=0.01, + classification_feature_size=256, + name='classification_submodel' +): + """ Creates the default classification submodel. + + Args + num_classes : Number of classes to predict a score for at each feature level. + num_anchors : Number of anchors to predict classification scores for at each feature level. + pyramid_feature_size : The number of filters to expect from the feature pyramid levels. 
+ classification_feature_size : The number of filters to use in the layers in the classification submodel. + name : The name of the submodel. + + Returns + A keras.models.Model that predicts classes for each anchor. + """ + options = { + 'kernel_size' : 3, + 'strides' : 1, + 'padding' : 'same', + } + + if keras.backend.image_data_format() == 'channels_first': + inputs = keras.layers.Input(shape=(pyramid_feature_size, None, None)) + else: + inputs = keras.layers.Input(shape=(None, None, pyramid_feature_size)) + outputs = inputs + for i in range(4): + outputs = keras.layers.Conv2D( + filters=classification_feature_size, + activation='relu', + name='pyramid_classification_{}'.format(i), + kernel_initializer=keras.initializers.normal(mean=0.0, stddev=0.01, seed=None), + bias_initializer='zeros', + **options + )(outputs) + + outputs = keras.layers.Conv2D( + filters=num_classes * num_anchors, + kernel_initializer=keras.initializers.normal(mean=0.0, stddev=0.01, seed=None), + bias_initializer=initializers.PriorProbability(probability=prior_probability), + name='pyramid_classification', + **options + )(outputs) + + # reshape output and apply sigmoid + if keras.backend.image_data_format() == 'channels_first': + outputs = keras.layers.Permute((2, 3, 1), name='pyramid_classification_permute')(outputs) + outputs = keras.layers.Reshape((-1, num_classes), name='pyramid_classification_reshape')(outputs) + outputs = keras.layers.Activation('sigmoid', name='pyramid_classification_sigmoid')(outputs) + + return keras.models.Model(inputs=inputs, outputs=outputs, name=name) + + +def default_regression_model(num_values, num_anchors, pyramid_feature_size=256, regression_feature_size=256, name='regression_submodel'): + """ Creates the default regression submodel. + + Args + num_values : Number of values to regress. + num_anchors : Number of anchors to regress for each feature level. + pyramid_feature_size : The number of filters to expect from the feature pyramid levels. + regression_feature_size : The number of filters to use in the layers in the regression submodel. + name : The name of the submodel. + + Returns + A keras.models.Model that predicts regression values for each anchor. + """ + # All new conv layers except the final one in the + # RetinaNet (classification) subnets are initialized + # with bias b = 0 and a Gaussian weight fill with stddev = 0.01. + options = { + 'kernel_size' : 3, + 'strides' : 1, + 'padding' : 'same', + 'kernel_initializer' : keras.initializers.normal(mean=0.0, stddev=0.01, seed=None), + 'bias_initializer' : 'zeros' + } + + if keras.backend.image_data_format() == 'channels_first': + inputs = keras.layers.Input(shape=(pyramid_feature_size, None, None)) + else: + inputs = keras.layers.Input(shape=(None, None, pyramid_feature_size)) + outputs = inputs + for i in range(4): + outputs = keras.layers.Conv2D( + filters=regression_feature_size, + activation='relu', + name='pyramid_regression_{}'.format(i), + **options + )(outputs) + + outputs = keras.layers.Conv2D(num_anchors * num_values, name='pyramid_regression', **options)(outputs) + if keras.backend.image_data_format() == 'channels_first': + outputs = keras.layers.Permute((2, 3, 1), name='pyramid_regression_permute')(outputs) + outputs = keras.layers.Reshape((-1, num_values), name='pyramid_regression_reshape')(outputs) + + return keras.models.Model(inputs=inputs, outputs=outputs, name=name) + + +def __create_pyramid_features(C3, C4, C5, feature_size=256): + """ Creates the FPN layers on top of the backbone features. 
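+ C5 is reduced with a 1x1 convolution and upsampled into C4, the result is merged and upsampled into C3 (the top-down pathway with lateral connections), and P6/P7 are added via stride-2 3x3 convolutions on C5 and on relu(P6), following the FPN/RetinaNet papers.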
+ + Args + C3 : Feature stage C3 from the backbone. + C4 : Feature stage C4 from the backbone. + C5 : Feature stage C5 from the backbone. + feature_size : The feature size to use for the resulting feature levels. + + Returns + A list of feature levels [P3, P4, P5, P6, P7]. + """ + # upsample C5 to get P5 from the FPN paper + P5 = keras.layers.Conv2D(feature_size, kernel_size=1, strides=1, padding='same', name='C5_reduced')(C5) + P5_upsampled = layers.UpsampleLike(name='P5_upsampled')([P5, C4]) + P5 = keras.layers.Conv2D(feature_size, kernel_size=3, strides=1, padding='same', name='P5')(P5) + + # add P5 elementwise to C4 + P4 = keras.layers.Conv2D(feature_size, kernel_size=1, strides=1, padding='same', name='C4_reduced')(C4) + P4 = keras.layers.Add(name='P4_merged')([P5_upsampled, P4]) + P4_upsampled = layers.UpsampleLike(name='P4_upsampled')([P4, C3]) + P4 = keras.layers.Conv2D(feature_size, kernel_size=3, strides=1, padding='same', name='P4')(P4) + + # add P4 elementwise to C3 + P3 = keras.layers.Conv2D(feature_size, kernel_size=1, strides=1, padding='same', name='C3_reduced')(C3) + P3 = keras.layers.Add(name='P3_merged')([P4_upsampled, P3]) + P3 = keras.layers.Conv2D(feature_size, kernel_size=3, strides=1, padding='same', name='P3')(P3) + + # "P6 is obtained via a 3x3 stride-2 conv on C5" + P6 = keras.layers.Conv2D(feature_size, kernel_size=3, strides=2, padding='same', name='P6')(C5) + + # "P7 is computed by applying ReLU followed by a 3x3 stride-2 conv on P6" + P7 = keras.layers.Activation('relu', name='C6_relu')(P6) + P7 = keras.layers.Conv2D(feature_size, kernel_size=3, strides=2, padding='same', name='P7')(P7) + + return [P3, P4, P5, P6, P7] + + +def default_submodels(num_classes, num_anchors): + """ Create a list of default submodels used for object detection. + + The default submodels contains a regression submodel and a classification submodel. + + Args + num_classes : Number of classes to use. + num_anchors : Number of base anchors. + + Returns + A list of tuple, where the first element is the name of the submodel and the second element is the submodel itself. + """ + return [ + ('regression', default_regression_model(4, num_anchors)), + ('classification', default_classification_model(num_classes, num_anchors)) + ] + + +def __build_model_pyramid(name, model, features): + """ Applies a single submodel to each FPN level. + + Args + name : Name of the submodel. + model : The submodel to evaluate. + features : The FPN features. + + Returns + A tensor containing the response from the submodel on the FPN features. + """ + return keras.layers.Concatenate(axis=1, name=name)([model(f) for f in features]) + + +def __build_pyramid(models, features): + """ Applies all submodels to each FPN level. + + Args + models : List of submodels to run on each pyramid level (by default only regression, classifcation). + features : The FPN features. + + Returns + A list of tensors, one for each submodel. + """ + return [__build_model_pyramid(n, m, features) for n, m in models] + + +def __build_anchors(anchor_parameters, features): + """ Builds anchors for the shape of the features from FPN. + + Args + anchor_parameters : Parameteres that determine how anchors are generated. + features : The FPN features. + + Returns + A tensor containing the anchors for the FPN features. 
+ + The shape is: + ``` + (batch_size, num_anchors, 4) + ``` + """ + anchors = [ + layers.Anchors( + size=anchor_parameters.sizes[i], + stride=anchor_parameters.strides[i], + ratios=anchor_parameters.ratios, + scales=anchor_parameters.scales, + name='anchors_{}'.format(i) + )(f) for i, f in enumerate(features) + ] + + return keras.layers.Concatenate(axis=1, name='anchors')(anchors) + + +def retinanet( + inputs, + backbone_layers, + num_classes, + num_anchors = None, + create_pyramid_features = __create_pyramid_features, + submodels = None, + name = 'retinanet' +): + """ Construct a RetinaNet model on top of a backbone. + + This model is the minimum model necessary for training (with the unfortunate exception of anchors as output). + + Args + inputs : keras.layers.Input (or list of) for the input to the model. + num_classes : Number of classes to classify. + num_anchors : Number of base anchors. + create_pyramid_features : Functor for creating pyramid features given the features C3, C4, C5 from the backbone. + submodels : Submodels to run on each feature map (default is regression and classification submodels). + name : Name of the model. + + Returns + A keras.models.Model which takes an image as input and outputs generated anchors and the result from each submodel on every pyramid level. + + The order of the outputs is as defined in submodels: + ``` + [ + regression, classification, other[0], other[1], ... + ] + ``` + """ + + if num_anchors is None: + num_anchors = AnchorParameters.default.num_anchors() + + if submodels is None: + submodels = default_submodels(num_classes, num_anchors) + + C3, C4, C5 = backbone_layers + + # compute pyramid features as per https://arxiv.org/abs/1708.02002 + features = create_pyramid_features(C3, C4, C5) + + # for all pyramid levels, run available submodels + pyramids = __build_pyramid(submodels, features) + + return keras.models.Model(inputs=inputs, outputs=pyramids, name=name) + + +def retinanet_bbox( + model = None, + nms = True, + class_specific_filter = True, + name = 'retinanet-bbox', + anchor_params = None, + nms_threshold = 0.5, + score_threshold = 0.05, + max_detections = 300, + parallel_iterations = 32, + **kwargs +): + """ Construct a RetinaNet model on top of a backbone and adds convenience functions to output boxes directly. + + This model uses the minimum retinanet model and appends a few layers to compute boxes within the graph. + These layers include applying the regression values to the anchors and performing NMS. + + Args + model : RetinaNet model to append bbox layers to. If None, it will create a RetinaNet model using **kwargs. + nms : Whether to use non-maximum suppression for the filtering step. + class_specific_filter : Whether to use class specific filtering or filter for the best scoring class only. + name : Name of the model. + anchor_params : Struct containing anchor parameters. If None, default values are used. + nms_threshold : Threshold for the IoU value to determine when a box should be suppressed. + score_threshold : Threshold used to prefilter the boxes with. + max_detections : Maximum number of detections to keep. + parallel_iterations : Number of batch items to process in parallel. + **kwargs : Additional kwargs to pass to the minimal retinanet model. + + Returns + A keras.models.Model which takes an image as input and outputs the detections on the image. + + The order is defined as follows: + ``` + [ + boxes, scores, labels, other[0], other[1], ... 
+ ] + ``` + """ + + # if no anchor parameters are passed, use default values + if anchor_params is None: + anchor_params = AnchorParameters.default + + # create RetinaNet model + if model is None: + model = retinanet(num_anchors=anchor_params.num_anchors(), **kwargs) + else: + assert_training_model(model) + + # compute the anchors + features = [model.get_layer(p_name).output for p_name in ['P3', 'P4', 'P5', 'P6', 'P7']] + anchors = __build_anchors(anchor_params, features) + + # we expect the anchors, regression and classification values as first output + regression = model.outputs[0] + classification = model.outputs[1] + + # "other" can be any additional output from custom submodels, by default this will be [] + other = model.outputs[2:] + + # apply predicted regression to anchors + boxes = layers.RegressBoxes(name='boxes')([anchors, regression]) + boxes = layers.ClipBoxes(name='clipped_boxes')([model.inputs[0], boxes]) + + # filter detections (apply NMS / score threshold / select top-k) + detections = layers.FilterDetections( + nms = nms, + class_specific_filter = class_specific_filter, + name = 'filtered_detections', + nms_threshold = nms_threshold, + score_threshold = score_threshold, + max_detections = max_detections, + parallel_iterations = parallel_iterations + )([boxes, classification] + other) + + # construct the model + return keras.models.Model(inputs=model.inputs, outputs=detections, name=name) diff --git a/src/keras_retinanet/models/senet.py b/src/keras_retinanet/models/senet.py new file mode 100644 index 0000000..deb1eac --- /dev/null +++ b/src/keras_retinanet/models/senet.py @@ -0,0 +1,155 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from keras.utils import get_file + +from . import retinanet +from . import Backbone +from classification_models.keras import Classifiers + + +class SeBackbone(Backbone): + """ Describes backbone information and provides utility functions. + """ + + def __init__(self, backbone): + super(SeBackbone, self).__init__(backbone) + _, self.preprocess_image_func = Classifiers.get(self.backbone) + + def retinanet(self, *args, **kwargs): + """ Returns a retinanet model using the correct backbone. + """ + return senet_retinanet(*args, backbone=self.backbone, **kwargs) + + def download_imagenet(self): + """ Downloads ImageNet weights and returns path to weights file. + """ + from classification_models.weights import WEIGHTS_COLLECTION + + weights_path = None + for el in WEIGHTS_COLLECTION: + if el['model'] == self.backbone and not el['include_top']: + weights_path = get_file(el['name'], el['url'], cache_subdir='models', file_hash=el['md5']) + + if weights_path is None: + raise ValueError('Unable to find imagenet weights for backbone {}!'.format(self.backbone)) + + return weights_path + + def validate(self): + """ Checks whether the backbone string is correct. 
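+ Raises a ValueError if the first part of the backbone string (before any underscore)
+ is not one of the supported SE backbones listed below.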
+ """ + allowed_backbones = ['seresnet18', 'seresnet34', 'seresnet50', 'seresnet101', 'seresnet152', + 'seresnext50', 'seresnext101', 'senet154'] + backbone = self.backbone.split('_')[0] + + if backbone not in allowed_backbones: + raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(backbone, allowed_backbones)) + + def preprocess_image(self, inputs): + """ Takes as input an image and prepares it for being passed through the network. + """ + return self.preprocess_image_func(inputs) + + +def senet_retinanet(num_classes, backbone='seresnext50', inputs=None, modifier=None, **kwargs): + """ Constructs a retinanet model using a resnet backbone. + + Args + num_classes: Number of classes to predict. + backbone: Which backbone to use (one of ('resnet50', 'resnet101', 'resnet152')). + inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)). + modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example). + + Returns + RetinaNet model with a ResNet backbone. + """ + # choose default input + if inputs is None: + if keras.backend.image_data_format() == 'channels_first': + inputs = keras.layers.Input(shape=(3, None, None)) + else: + # inputs = keras.layers.Input(shape=(224, 224, 3)) + inputs = keras.layers.Input(shape=(None, None, 3)) + + classifier, _ = Classifiers.get(backbone) + model = classifier(input_tensor=inputs, include_top=False, weights=None) + + # get last conv layer from the end of each block [28x28, 14x14, 7x7] + if backbone == 'seresnet18' or backbone == 'seresnet34': + layer_outputs = ['stage3_unit1_relu1', 'stage4_unit1_relu1', 'relu1'] + elif backbone == 'seresnet50': + layer_outputs = ['activation_36', 'activation_66', 'activation_81'] + elif backbone == 'seresnet101': + layer_outputs = ['activation_36', 'activation_151', 'activation_166'] + elif backbone == 'seresnet152': + layer_outputs = ['activation_56', 'activation_236', 'activation_251'] + elif backbone == 'seresnext50': + layer_outputs = ['activation_37', 'activation_67', 'activation_81'] + elif backbone == 'seresnext101': + layer_outputs = ['activation_37', 'activation_152', 'activation_166'] + elif backbone == 'senet154': + layer_outputs = ['activation_59', 'activation_239', 'activation_253'] + else: + raise ValueError('Backbone (\'{}\') is invalid.'.format(backbone)) + + layer_outputs = [ + model.get_layer(name=layer_outputs[0]).output, # 28x28 + model.get_layer(name=layer_outputs[1]).output, # 14x14 + model.get_layer(name=layer_outputs[2]).output, # 7x7 + ] + # create the densenet backbone + model = keras.models.Model(inputs=inputs, outputs=layer_outputs, name=model.name) + + # invoke modifier if given + if modifier: + model = modifier(model) + + # create the full model + return retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=model.outputs, **kwargs) + + +def seresnet18_retinanet(num_classes, inputs=None, **kwargs): + return senet_retinanet(num_classes=num_classes, backbone='seresnet18', inputs=inputs, **kwargs) + + +def seresnet34_retinanet(num_classes, inputs=None, **kwargs): + return senet_retinanet(num_classes=num_classes, backbone='seresnet34', inputs=inputs, **kwargs) + + +def seresnet50_retinanet(num_classes, inputs=None, **kwargs): + return senet_retinanet(num_classes=num_classes, backbone='seresnet50', inputs=inputs, **kwargs) + + +def seresnet101_retinanet(num_classes, inputs=None, **kwargs): + return senet_retinanet(num_classes=num_classes, 
backbone='seresnet101', inputs=inputs, **kwargs) + + +def seresnet152_retinanet(num_classes, inputs=None, **kwargs): + return senet_retinanet(num_classes=num_classes, backbone='seresnet152', inputs=inputs, **kwargs) + + +def seresnext50_retinanet(num_classes, inputs=None, **kwargs): + return senet_retinanet(num_classes=num_classes, backbone='seresnext50', inputs=inputs, **kwargs) + + +def seresnext101_retinanet(num_classes, inputs=None, **kwargs): + return senet_retinanet(num_classes=num_classes, backbone='seresnext101', inputs=inputs, **kwargs) + + +def senet154_retinanet(num_classes, inputs=None, **kwargs): + return senet_retinanet(num_classes=num_classes, backbone='senet154', inputs=inputs, **kwargs) diff --git a/src/keras_retinanet/models/vgg.py b/src/keras_retinanet/models/vgg.py new file mode 100644 index 0000000..fad7e4b --- /dev/null +++ b/src/keras_retinanet/models/vgg.py @@ -0,0 +1,99 @@ +""" +Copyright 2017-2018 cgratie (https://github.com/cgratie/) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + + +import keras +from keras.utils import get_file + +from . import retinanet +from . import Backbone +from ..utils.image import preprocess_image + + +class VGGBackbone(Backbone): + """ Describes backbone information and provides utility functions. + """ + + def retinanet(self, *args, **kwargs): + """ Returns a retinanet model using the correct backbone. + """ + return vgg_retinanet(*args, backbone=self.backbone, **kwargs) + + def download_imagenet(self): + """ Downloads ImageNet weights and returns path to weights file. + Weights can be downloaded at https://github.com/fizyr/keras-models/releases . + """ + if self.backbone == 'vgg16': + resource = keras.applications.vgg16.vgg16.WEIGHTS_PATH_NO_TOP + checksum = '6d6bbae143d832006294945121d1f1fc' + elif self.backbone == 'vgg19': + resource = keras.applications.vgg19.vgg19.WEIGHTS_PATH_NO_TOP + checksum = '253f8cb515780f3b799900260a226db6' + else: + raise ValueError("Backbone '{}' not recognized.".format(self.backbone)) + + return get_file( + '{}_weights_tf_dim_ordering_tf_kernels_notop.h5'.format(self.backbone), + resource, + cache_subdir='models', + file_hash=checksum + ) + + def validate(self): + """ Checks whether the backbone string is correct. + """ + allowed_backbones = ['vgg16', 'vgg19'] + + if self.backbone not in allowed_backbones: + raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(self.backbone, allowed_backbones)) + + def preprocess_image(self, inputs): + """ Takes as input an image and prepares it for being passed through the network. + """ + return preprocess_image(inputs, mode='caffe') + + +def vgg_retinanet(num_classes, backbone='vgg16', inputs=None, modifier=None, **kwargs): + """ Constructs a retinanet model using a vgg backbone. + + Args + num_classes: Number of classes to predict. + backbone: Which backbone to use (one of ('vgg16', 'vgg19')). + inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)). 
+ modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example). + + Returns + RetinaNet model with a VGG backbone. + """ + # choose default input + if inputs is None: + inputs = keras.layers.Input(shape=(None, None, 3)) + + # create the vgg backbone + if backbone == 'vgg16': + vgg = keras.applications.VGG16(input_tensor=inputs, include_top=False, weights=None) + elif backbone == 'vgg19': + vgg = keras.applications.VGG19(input_tensor=inputs, include_top=False, weights=None) + else: + raise ValueError("Backbone '{}' not recognized.".format(backbone)) + + if modifier: + vgg = modifier(vgg) + + # create the full model + layer_names = ["block3_pool", "block4_pool", "block5_pool"] + layer_outputs = [vgg.get_layer(name).output for name in layer_names] + return retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=layer_outputs, **kwargs) diff --git a/src/keras_retinanet/preprocessing/__init__.py b/src/keras_retinanet/preprocessing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/keras_retinanet/preprocessing/coco.py b/src/keras_retinanet/preprocessing/coco.py new file mode 100644 index 0000000..b684b80 --- /dev/null +++ b/src/keras_retinanet/preprocessing/coco.py @@ -0,0 +1,159 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from ..preprocessing.generator import Generator +from ..utils.image import read_image_bgr + +import os +import numpy as np + +from pycocotools.coco import COCO + + +class CocoGenerator(Generator): + """ Generate data from the COCO dataset. + + See https://github.com/cocodataset/cocoapi/tree/master/PythonAPI for more information. + """ + + def __init__(self, data_dir, set_name, **kwargs): + """ Initialize a COCO data generator. + + Args + data_dir: Path to where the COCO dataset is stored. + set_name: Name of the set to parse. + """ + self.data_dir = data_dir + self.set_name = set_name + self.coco = COCO(os.path.join(data_dir, 'annotations', 'instances_' + set_name + '.json')) + self.image_ids = self.coco.getImgIds() + + self.load_classes() + + super(CocoGenerator, self).__init__(**kwargs) + + def load_classes(self): + """ Loads the class to label mapping (and inverse) for COCO. + """ + # load class names (name -> label) + categories = self.coco.loadCats(self.coco.getCatIds()) + categories.sort(key=lambda x: x['id']) + + self.classes = {} + self.coco_labels = {} + self.coco_labels_inverse = {} + for c in categories: + self.coco_labels[len(self.classes)] = c['id'] + self.coco_labels_inverse[c['id']] = len(self.classes) + self.classes[c['name']] = len(self.classes) + + # also load the reverse (label -> name) + self.labels = {} + for key, value in self.classes.items(): + self.labels[value] = key + + def size(self): + """ Size of the COCO dataset. + """ + return len(self.image_ids) + + def num_classes(self): + """ Number of classes in the dataset. For COCO this is 80. 
+ """ + return len(self.classes) + + def has_label(self, label): + """ Return True if label is a known label. + """ + return label in self.labels + + def has_name(self, name): + """ Returns True if name is a known class. + """ + return name in self.classes + + def name_to_label(self, name): + """ Map name to label. + """ + return self.classes[name] + + def label_to_name(self, label): + """ Map label to name. + """ + return self.labels[label] + + def coco_label_to_label(self, coco_label): + """ Map COCO label to the label as used in the network. + COCO has some gaps in the order of labels. The highest label is 90, but there are 80 classes. + """ + return self.coco_labels_inverse[coco_label] + + def coco_label_to_name(self, coco_label): + """ Map COCO label to name. + """ + return self.label_to_name(self.coco_label_to_label(coco_label)) + + def label_to_coco_label(self, label): + """ Map label as used by the network to labels as used by COCO. + """ + return self.coco_labels[label] + + def image_path(self, image_index): + """ Returns the image path for image_index. + """ + image_info = self.coco.loadImgs(self.image_ids[image_index])[0] + path = os.path.join(self.data_dir, 'images', self.set_name, image_info['file_name']) + return path + + def image_aspect_ratio(self, image_index): + """ Compute the aspect ratio for an image with image_index. + """ + image = self.coco.loadImgs(self.image_ids[image_index])[0] + return float(image['width']) / float(image['height']) + + def load_image(self, image_index): + """ Load an image at the image_index. + """ + path = self.image_path(image_index) + return read_image_bgr(path) + + def load_annotations(self, image_index): + """ Load annotations for an image_index. + """ + # get ground truth annotations + annotations_ids = self.coco.getAnnIds(imgIds=self.image_ids[image_index], iscrowd=False) + annotations = {'labels': np.empty((0,)), 'bboxes': np.empty((0, 4))} + + # some images appear to miss annotations (like image with id 257034) + if len(annotations_ids) == 0: + return annotations + + # parse annotations + coco_annotations = self.coco.loadAnns(annotations_ids) + for idx, a in enumerate(coco_annotations): + # some annotations have basically no width / height, skip them + if a['bbox'][2] < 1 or a['bbox'][3] < 1: + continue + + annotations['labels'] = np.concatenate([annotations['labels'], [self.coco_label_to_label(a['category_id'])]], axis=0) + annotations['bboxes'] = np.concatenate([annotations['bboxes'], [[ + a['bbox'][0], + a['bbox'][1], + a['bbox'][0] + a['bbox'][2], + a['bbox'][1] + a['bbox'][3], + ]]], axis=0) + + return annotations diff --git a/src/keras_retinanet/preprocessing/csv_generator.py b/src/keras_retinanet/preprocessing/csv_generator.py new file mode 100644 index 0000000..c756224 --- /dev/null +++ b/src/keras_retinanet/preprocessing/csv_generator.py @@ -0,0 +1,225 @@ +""" +Copyright 2017-2018 yhenon (https://github.com/yhenon/) +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from .generator import Generator +from ..utils.image import read_image_bgr + +import numpy as np +from PIL import Image +from six import raise_from + +import csv +import sys +import os.path +from collections import OrderedDict + + +def _parse(value, function, fmt): + """ + Parse a string into a value, and format a nice ValueError if it fails. + + Returns `function(value)`. + Any `ValueError` raised is catched and a new `ValueError` is raised + with message `fmt.format(e)`, where `e` is the caught `ValueError`. + """ + try: + return function(value) + except ValueError as e: + raise_from(ValueError(fmt.format(e)), None) + + +def _read_classes(csv_reader): + """ Parse the classes file given by csv_reader. + """ + result = OrderedDict() + for line, row in enumerate(csv_reader): + line += 1 + + try: + class_name, class_id = row + except ValueError: + raise_from(ValueError('line {}: format should be \'class_name,class_id\''.format(line)), None) + class_id = _parse(class_id, int, 'line {}: malformed class ID: {{}}'.format(line)) + + if class_name in result: + raise ValueError('line {}: duplicate class name: \'{}\''.format(line, class_name)) + result[class_name] = class_id + return result + + +def _read_annotations(csv_reader, classes): + """ Read annotations from the csv_reader. + """ + result = OrderedDict() + for line, row in enumerate(csv_reader): + line += 1 + + try: + img_file, x1, y1, x2, y2, class_name = row[:6] + except ValueError: + raise_from(ValueError('line {}: format should be \'img_file,x1,y1,x2,y2,class_name\' or \'img_file,,,,,\''.format(line)), None) + + if img_file not in result: + result[img_file] = [] + + # If a row contains only an image path, it's an image without annotations. + if (x1, y1, x2, y2, class_name) == ('', '', '', '', ''): + continue + + x1 = _parse(x1, int, 'line {}: malformed x1: {{}}'.format(line)) + y1 = _parse(y1, int, 'line {}: malformed y1: {{}}'.format(line)) + x2 = _parse(x2, int, 'line {}: malformed x2: {{}}'.format(line)) + y2 = _parse(y2, int, 'line {}: malformed y2: {{}}'.format(line)) + + # Check that the bounding box is valid. + if x2 <= x1: + raise ValueError('line {}: x2 ({}) must be higher than x1 ({})'.format(line, x2, x1)) + if y2 <= y1: + raise ValueError('line {}: y2 ({}) must be higher than y1 ({})'.format(line, y2, y1)) + + # check if the current class name is correctly present + if class_name not in classes: + raise ValueError('line {}: unknown class name: \'{}\' (classes: {})'.format(line, class_name, classes)) + + result[img_file].append({'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2, 'class': class_name}) + return result + + +def _open_for_csv(path): + """ Open a file with flags suitable for csv.reader. + + This is different for python2 it means with mode 'rb', + for python3 this means 'r' with "universal newlines". + """ + if sys.version_info[0] < 3: + return open(path, 'rb') + else: + return open(path, 'r', newline='') + + +class CSVGenerator(Generator): + """ Generate data for a custom CSV dataset. + + See https://github.com/fizyr/keras-retinanet#csv-datasets for more information. + """ + + def __init__( + self, + csv_data_file, + csv_class_file, + base_dir=None, + **kwargs + ): + """ Initialize a CSV data generator. + + Args + csv_data_file: Path to the CSV annotations file. + csv_class_file: Path to the CSV classes file. + base_dir: Directory w.r.t. where the files are to be searched (defaults to the directory containing the csv_data_file). 
+ """ + self.image_names = [] + self.image_data = {} + self.base_dir = base_dir + + # Take base_dir from annotations file if not explicitly specified. + if self.base_dir is None: + self.base_dir = os.path.dirname(csv_data_file) + + # parse the provided class file + try: + with _open_for_csv(csv_class_file) as file: + self.classes = _read_classes(csv.reader(file, delimiter=',')) + except ValueError as e: + raise_from(ValueError('invalid CSV class file: {}: {}'.format(csv_class_file, e)), None) + + self.labels = {} + for key, value in self.classes.items(): + self.labels[value] = key + + # csv with img_path, x1, y1, x2, y2, class_name + try: + with _open_for_csv(csv_data_file) as file: + self.image_data = _read_annotations(csv.reader(file, delimiter=','), self.classes) + except ValueError as e: + raise_from(ValueError('invalid CSV annotations file: {}: {}'.format(csv_data_file, e)), None) + self.image_names = list(self.image_data.keys()) + + super(CSVGenerator, self).__init__(**kwargs) + + def size(self): + """ Size of the dataset. + """ + return len(self.image_names) + + def num_classes(self): + """ Number of classes in the dataset. + """ + return max(self.classes.values()) + 1 + + def has_label(self, label): + """ Return True if label is a known label. + """ + return label in self.labels + + def has_name(self, name): + """ Returns True if name is a known class. + """ + return name in self.classes + + def name_to_label(self, name): + """ Map name to label. + """ + return self.classes[name] + + def label_to_name(self, label): + """ Map label to name. + """ + return self.labels[label] + + def image_path(self, image_index): + """ Returns the image path for image_index. + """ + return os.path.join(self.base_dir, self.image_names[image_index]) + + def image_aspect_ratio(self, image_index): + """ Compute the aspect ratio for an image with image_index. + """ + # PIL is fast for metadata + image = Image.open(self.image_path(image_index)) + return float(image.width) / float(image.height) + + def load_image(self, image_index): + """ Load an image at the image_index. + """ + return read_image_bgr(self.image_path(image_index)) + + def load_annotations(self, image_index): + """ Load annotations for an image_index. + """ + path = self.image_names[image_index] + annotations = {'labels': np.empty((0,)), 'bboxes': np.empty((0, 4))} + + for idx, annot in enumerate(self.image_data[path]): + annotations['labels'] = np.concatenate((annotations['labels'], [self.name_to_label(annot['class'])])) + annotations['bboxes'] = np.concatenate((annotations['bboxes'], [[ + float(annot['x1']), + float(annot['y1']), + float(annot['x2']), + float(annot['y2']), + ]])) + + return annotations diff --git a/src/keras_retinanet/preprocessing/generator.py b/src/keras_retinanet/preprocessing/generator.py new file mode 100644 index 0000000..ae1c43b --- /dev/null +++ b/src/keras_retinanet/preprocessing/generator.py @@ -0,0 +1,377 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import numpy as np +import random +import warnings + +import keras + +from ..utils.anchors import ( + anchor_targets_bbox, + anchors_for_shape, + guess_shapes +) +from ..utils.config import parse_anchor_parameters +from ..utils.image import ( + TransformParameters, + adjust_transform_for_image, + apply_transform, + preprocess_image, + resize_image, +) +from ..utils.transform import transform_aabb + + +class Generator(keras.utils.Sequence): + """ Abstract generator class. + """ + + def __init__( + self, + transform_generator = None, + visual_effect_generator=None, + batch_size=1, + group_method='ratio', # one of 'none', 'random', 'ratio' + shuffle_groups=True, + image_min_side=800, + image_max_side=1333, + no_resize=False, + transform_parameters=None, + compute_anchor_targets=anchor_targets_bbox, + compute_shapes=guess_shapes, + preprocess_image=preprocess_image, + config=None + ): + """ Initialize Generator object. + + Args + transform_generator : A generator used to randomly transform images and annotations. + batch_size : The size of the batches to generate. + group_method : Determines how images are grouped together (defaults to 'ratio', one of ('none', 'random', 'ratio')). + shuffle_groups : If True, shuffles the groups each epoch. + image_min_side : After resizing the minimum side of an image is equal to image_min_side. + image_max_side : If after resizing the maximum side is larger than image_max_side, scales down further so that the max side is equal to image_max_side. + no_resize : If True, no image/annotation resizing is performed. + transform_parameters : The transform parameters used for data augmentation. + compute_anchor_targets : Function handler for computing the targets of anchors for an image and its annotations. + compute_shapes : Function handler for computing the shapes of the pyramid for a given input. + preprocess_image : Function handler for preprocessing an image (scaling / normalizing) for passing through a network. + """ + self.transform_generator = transform_generator + self.visual_effect_generator = visual_effect_generator + self.batch_size = int(batch_size) + self.group_method = group_method + self.shuffle_groups = shuffle_groups + self.image_min_side = image_min_side + self.image_max_side = image_max_side + self.no_resize = no_resize + self.transform_parameters = transform_parameters or TransformParameters() + self.compute_anchor_targets = compute_anchor_targets + self.compute_shapes = compute_shapes + self.preprocess_image = preprocess_image + self.config = config + + # Define groups + self.group_images() + + # Shuffle when initializing + if self.shuffle_groups: + self.on_epoch_end() + + def on_epoch_end(self): + if self.shuffle_groups: + random.shuffle(self.groups) + + def size(self): + """ Size of the dataset. + """ + raise NotImplementedError('size method not implemented') + + def num_classes(self): + """ Number of classes in the dataset. + """ + raise NotImplementedError('num_classes method not implemented') + + def has_label(self, label): + """ Returns True if label is a known label. + """ + raise NotImplementedError('has_label method not implemented') + + def has_name(self, name): + """ Returns True if name is a known class. + """ + raise NotImplementedError('has_name method not implemented') + + def name_to_label(self, name): + """ Map name to label. + """ + raise NotImplementedError('name_to_label method not implemented') + + def label_to_name(self, label): + """ Map label to name. 
+ """ + raise NotImplementedError('label_to_name method not implemented') + + def image_aspect_ratio(self, image_index): + """ Compute the aspect ratio for an image with image_index. + """ + raise NotImplementedError('image_aspect_ratio method not implemented') + + def image_path(self, image_index): + """ Get the path to an image. + """ + raise NotImplementedError('image_path method not implemented') + + def load_image(self, image_index): + """ Load an image at the image_index. + """ + raise NotImplementedError('load_image method not implemented') + + def load_annotations(self, image_index): + """ Load annotations for an image_index. + """ + raise NotImplementedError('load_annotations method not implemented') + + def load_annotations_group(self, group): + """ Load annotations for all images in group. + """ + annotations_group = [self.load_annotations(image_index) for image_index in group] + for annotations in annotations_group: + assert(isinstance(annotations, dict)), '\'load_annotations\' should return a list of dictionaries, received: {}'.format(type(annotations)) + assert('labels' in annotations), '\'load_annotations\' should return a list of dictionaries that contain \'labels\' and \'bboxes\'.' + assert('bboxes' in annotations), '\'load_annotations\' should return a list of dictionaries that contain \'labels\' and \'bboxes\'.' + + return annotations_group + + def filter_annotations(self, image_group, annotations_group, group): + """ Filter annotations by removing those that are outside of the image bounds or whose width/height < 0. + """ + # test all annotations + for index, (image, annotations) in enumerate(zip(image_group, annotations_group)): + # test x2 < x1 | y2 < y1 | x1 < 0 | y1 < 0 | x2 <= 0 | y2 <= 0 | x2 >= image.shape[1] | y2 >= image.shape[0] + invalid_indices = np.where( + (annotations['bboxes'][:, 2] <= annotations['bboxes'][:, 0]) | + (annotations['bboxes'][:, 3] <= annotations['bboxes'][:, 1]) | + (annotations['bboxes'][:, 0] < 0) | + (annotations['bboxes'][:, 1] < 0) | + (annotations['bboxes'][:, 2] > image.shape[1]) | + (annotations['bboxes'][:, 3] > image.shape[0]) + )[0] + + # delete invalid indices + if len(invalid_indices): + warnings.warn('Image {} with id {} (shape {}) contains the following invalid boxes: {}.'.format( + self.image_path(group[index]), + group[index], + image.shape, + annotations['bboxes'][invalid_indices, :] + )) + for k in annotations_group[index].keys(): + annotations_group[index][k] = np.delete(annotations[k], invalid_indices, axis=0) + return image_group, annotations_group + + def load_image_group(self, group): + """ Load images for all images in a group. + """ + return [self.load_image(image_index) for image_index in group] + + def random_visual_effect_group_entry(self, image, annotations): + """ Randomly transforms image and annotation. + """ + visual_effect = next(self.visual_effect_generator) + # apply visual effect + image = visual_effect(image) + return image, annotations + + def random_visual_effect_group(self, image_group, annotations_group): + """ Randomly apply visual effect on each image. 
+ """ + assert(len(image_group) == len(annotations_group)) + + if self.visual_effect_generator is None: + # do nothing + return image_group, annotations_group + + for index in range(len(image_group)): + # apply effect on a single group entry + image_group[index], annotations_group[index] = self.random_visual_effect_group_entry( + image_group[index], annotations_group[index] + ) + + return image_group, annotations_group + + def random_transform_group_entry(self, image, annotations, transform=None): + """ Randomly transforms image and annotation. + """ + # randomly transform both image and annotations + if transform is not None or self.transform_generator: + if transform is None: + transform = adjust_transform_for_image(next(self.transform_generator), image, self.transform_parameters.relative_translation) + + # apply transformation to image + image = apply_transform(transform, image, self.transform_parameters) + + # Transform the bounding boxes in the annotations. + annotations['bboxes'] = annotations['bboxes'].copy() + for index in range(annotations['bboxes'].shape[0]): + annotations['bboxes'][index, :] = transform_aabb(transform, annotations['bboxes'][index, :]) + + return image, annotations + + def random_transform_group(self, image_group, annotations_group): + """ Randomly transforms each image and its annotations. + """ + + assert(len(image_group) == len(annotations_group)) + + for index in range(len(image_group)): + # transform a single group entry + image_group[index], annotations_group[index] = self.random_transform_group_entry(image_group[index], annotations_group[index]) + + return image_group, annotations_group + + def resize_image(self, image): + """ Resize an image using image_min_side and image_max_side. + """ + if self.no_resize: + return image, 1 + else: + return resize_image(image, min_side=self.image_min_side, max_side=self.image_max_side) + + def preprocess_group_entry(self, image, annotations): + """ Preprocess image and its annotations. + """ + # preprocess the image + image = self.preprocess_image(image) + + # resize image + image, image_scale = self.resize_image(image) + + # apply resizing to annotations too + annotations['bboxes'] *= image_scale + + # convert to the wanted keras floatx + image = keras.backend.cast_to_floatx(image) + + return image, annotations + + def preprocess_group(self, image_group, annotations_group): + """ Preprocess each image and its annotations in its group. + """ + assert(len(image_group) == len(annotations_group)) + + for index in range(len(image_group)): + # preprocess a single group entry + image_group[index], annotations_group[index] = self.preprocess_group_entry(image_group[index], annotations_group[index]) + + return image_group, annotations_group + + def group_images(self): + """ Order the images according to self.order and makes groups of self.batch_size. + """ + # determine the order of the images + order = list(range(self.size())) + if self.group_method == 'random': + random.shuffle(order) + elif self.group_method == 'ratio': + order.sort(key=lambda x: self.image_aspect_ratio(x)) + + # divide into groups, one group = one batch + self.groups = [[order[x % len(order)] for x in range(i, i + self.batch_size)] for i in range(0, len(order), self.batch_size)] + + def compute_inputs(self, image_group): + """ Compute inputs for the network using an image_group. 
+ """ + # get the max image shape + max_shape = tuple(max(image.shape[x] for image in image_group) for x in range(3)) + + # construct an image batch object + image_batch = np.zeros((self.batch_size,) + max_shape, dtype=keras.backend.floatx()) + + # copy all images to the upper left part of the image batch object + for image_index, image in enumerate(image_group): + image_batch[image_index, :image.shape[0], :image.shape[1], :image.shape[2]] = image + + if keras.backend.image_data_format() == 'channels_first': + image_batch = image_batch.transpose((0, 3, 1, 2)) + + return image_batch + + def generate_anchors(self, image_shape): + anchor_params = None + if self.config and 'anchor_parameters' in self.config: + anchor_params = parse_anchor_parameters(self.config) + return anchors_for_shape(image_shape, anchor_params=anchor_params, shapes_callback=self.compute_shapes) + + def compute_targets(self, image_group, annotations_group): + """ Compute target outputs for the network using images and their annotations. + """ + # get the max image shape + max_shape = tuple(max(image.shape[x] for image in image_group) for x in range(3)) + anchors = self.generate_anchors(max_shape) + + batches = self.compute_anchor_targets( + anchors, + image_group, + annotations_group, + self.num_classes() + ) + + return list(batches) + + def compute_input_output(self, group): + """ Compute inputs and target outputs for the network. + """ + # load images and annotations + image_group = self.load_image_group(group) + annotations_group = self.load_annotations_group(group) + + # check validity of annotations + image_group, annotations_group = self.filter_annotations(image_group, annotations_group, group) + + # randomly apply visual effect + image_group, annotations_group = self.random_visual_effect_group(image_group, annotations_group) + + # randomly transform data + image_group, annotations_group = self.random_transform_group(image_group, annotations_group) + + # perform preprocessing steps + image_group, annotations_group = self.preprocess_group(image_group, annotations_group) + + # compute network inputs + inputs = self.compute_inputs(image_group) + + # compute network targets + targets = self.compute_targets(image_group, annotations_group) + + return inputs, targets + + def __len__(self): + """ + Number of batches for generator. + """ + + return len(self.groups) + + def __getitem__(self, index): + """ + Keras sequence method for generating batches. + """ + group = self.groups[index] + inputs, targets = self.compute_input_output(group) + + return inputs, targets diff --git a/src/keras_retinanet/preprocessing/kitti.py b/src/keras_retinanet/preprocessing/kitti.py new file mode 100644 index 0000000..5922558 --- /dev/null +++ b/src/keras_retinanet/preprocessing/kitti.py @@ -0,0 +1,168 @@ +""" +Copyright 2017-2018 lvaleriu (https://github.com/lvaleriu/) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import csv +import os.path + +import numpy as np +from PIL import Image + +from .generator import Generator +from ..utils.image import read_image_bgr + +kitti_classes = { + 'Car': 0, + 'Van': 1, + 'Truck': 2, + 'Pedestrian': 3, + 'Person_sitting': 4, + 'Cyclist': 5, + 'Tram': 6, + 'Misc': 7, + 'DontCare': 7 +} + + +class KittiGenerator(Generator): + """ Generate data for a KITTI dataset. + + See http://www.cvlibs.net/datasets/kitti/ for more information. + """ + + def __init__( + self, + base_dir, + subset='train', + **kwargs + ): + """ Initialize a KITTI data generator. + + Args + base_dir: Directory w.r.t. where the files are to be searched (defaults to the directory containing the csv_data_file). + subset: The subset to generate data for (defaults to 'train'). + """ + self.base_dir = base_dir + + label_dir = os.path.join(self.base_dir, subset, 'labels') + image_dir = os.path.join(self.base_dir, subset, 'images') + + """ + 1 type Describes the type of object: 'Car', 'Van', 'Truck', + 'Pedestrian', 'Person_sitting', 'Cyclist', 'Tram', + 'Misc' or 'DontCare' + 1 truncated Float from 0 (non-truncated) to 1 (truncated), where + truncated refers to the object leaving image boundaries + 1 occluded Integer (0,1,2,3) indicating occlusion state: + 0 = fully visible, 1 = partly occluded + 2 = largely occluded, 3 = unknown + 1 alpha Observation angle of object, ranging [-pi..pi] + 4 bbox 2D bounding box of object in the image (0-based index): + contains left, top, right, bottom pixel coordinates + 3 dimensions 3D object dimensions: height, width, length (in meters) + 3 location 3D object location x,y,z in camera coordinates (in meters) + 1 rotation_y Rotation ry around Y-axis in camera coordinates [-pi..pi] + """ + + self.labels = {} + self.classes = kitti_classes + for name, label in self.classes.items(): + self.labels[label] = name + + self.image_data = dict() + self.images = [] + for i, fn in enumerate(os.listdir(label_dir)): + label_fp = os.path.join(label_dir, fn) + image_fp = os.path.join(image_dir, fn.replace('.txt', '.png')) + + self.images.append(image_fp) + + fieldnames = ['type', 'truncated', 'occluded', 'alpha', 'left', 'top', 'right', 'bottom', 'dh', 'dw', 'dl', + 'lx', 'ly', 'lz', 'ry'] + with open(label_fp, 'r') as csv_file: + reader = csv.DictReader(csv_file, delimiter=' ', fieldnames=fieldnames) + boxes = [] + for line, row in enumerate(reader): + label = row['type'] + cls_id = kitti_classes[label] + + annotation = {'cls_id': cls_id, 'x1': row['left'], 'x2': row['right'], 'y2': row['bottom'], 'y1': row['top']} + boxes.append(annotation) + + self.image_data[i] = boxes + + super(KittiGenerator, self).__init__(**kwargs) + + def size(self): + """ Size of the dataset. + """ + return len(self.images) + + def num_classes(self): + """ Number of classes in the dataset. + """ + return max(self.classes.values()) + 1 + + def has_label(self, label): + """ Return True if label is a known label. + """ + return label in self.labels + + def has_name(self, name): + """ Returns True if name is a known class. + """ + return name in self.classes + + def name_to_label(self, name): + """ Map name to label. + """ + raise NotImplementedError() + + def label_to_name(self, label): + """ Map label to name. + """ + return self.labels[label] + + def image_aspect_ratio(self, image_index): + """ Compute the aspect ratio for an image with image_index. 
+ """ + # PIL is fast for metadata + image = Image.open(self.images[image_index]) + return float(image.width) / float(image.height) + + def image_path(self, image_index): + """ Get the path to an image. + """ + return self.images[image_index] + + def load_image(self, image_index): + """ Load an image at the image_index. + """ + return read_image_bgr(self.image_path(image_index)) + + def load_annotations(self, image_index): + """ Load annotations for an image_index. + """ + image_data = self.image_data[image_index] + annotations = {'labels': np.empty((len(image_data),)), 'bboxes': np.empty((len(image_data), 4))} + + for idx, ann in enumerate(image_data): + annotations['bboxes'][idx, 0] = float(ann['x1']) + annotations['bboxes'][idx, 1] = float(ann['y1']) + annotations['bboxes'][idx, 2] = float(ann['x2']) + annotations['bboxes'][idx, 3] = float(ann['y2']) + annotations['labels'][idx] = int(ann['cls_id']) + + return annotations diff --git a/src/keras_retinanet/preprocessing/open_images.py b/src/keras_retinanet/preprocessing/open_images.py new file mode 100644 index 0000000..a5ac737 --- /dev/null +++ b/src/keras_retinanet/preprocessing/open_images.py @@ -0,0 +1,375 @@ +""" +Copyright 2017-2018 lvaleriu (https://github.com/lvaleriu/) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import csv +import json +import os +import warnings + +import numpy as np +from PIL import Image + +from .generator import Generator +from ..utils.image import read_image_bgr + + +def load_hierarchy(metadata_dir, version='v4'): + hierarchy = None + if version == 'challenge2018': + hierarchy = 'bbox_labels_500_hierarchy.json' + elif version == 'v4': + hierarchy = 'bbox_labels_600_hierarchy.json' + elif version == 'v3': + hierarchy = 'bbox_labels_600_hierarchy.json' + + hierarchy_json = os.path.join(metadata_dir, hierarchy) + with open(hierarchy_json) as f: + hierarchy_data = json.loads(f.read()) + + return hierarchy_data + + +def load_hierarchy_children(hierarchy): + res = [hierarchy['LabelName']] + + if 'Subcategory' in hierarchy: + for subcategory in hierarchy['Subcategory']: + children = load_hierarchy_children(subcategory) + + for c in children: + res.append(c) + + return res + + +def find_hierarchy_parent(hierarchy, parent_cls): + if hierarchy['LabelName'] == parent_cls: + return hierarchy + elif 'Subcategory' in hierarchy: + for child in hierarchy['Subcategory']: + res = find_hierarchy_parent(child, parent_cls) + if res is not None: + return res + + return None + + +def get_labels(metadata_dir, version='v4'): + if version == 'v4' or version == 'challenge2018': + csv_file = 'class-descriptions-boxable.csv' if version == 'v4' else 'challenge-2018-class-descriptions-500.csv' + + boxable_classes_descriptions = os.path.join(metadata_dir, csv_file) + id_to_labels = {} + cls_index = {} + + i = 0 + with open(boxable_classes_descriptions) as f: + for row in csv.reader(f): + # make sure the csv row is not empty (usually the last one) + if len(row): + label = row[0] + description = row[1].replace("\"", "").replace("'", "").replace('`', '') + + id_to_labels[i] = description + cls_index[label] = i + + i += 1 + else: + trainable_classes_path = os.path.join(metadata_dir, 'classes-bbox-trainable.txt') + description_path = os.path.join(metadata_dir, 'class-descriptions.csv') + + description_table = {} + with open(description_path) as f: + for row in csv.reader(f): + # make sure the csv row is not empty (usually the last one) + if len(row): + description_table[row[0]] = row[1].replace("\"", "").replace("'", "").replace('`', '') + + with open(trainable_classes_path, 'rb') as f: + trainable_classes = f.read().split('\n') + + id_to_labels = dict([(i, description_table[c]) for i, c in enumerate(trainable_classes)]) + cls_index = dict([(c, i) for i, c in enumerate(trainable_classes)]) + + return id_to_labels, cls_index + + +def generate_images_annotations_json(main_dir, metadata_dir, subset, cls_index, version='v4'): + validation_image_ids = {} + + if version == 'v4': + annotations_path = os.path.join(metadata_dir, subset, '{}-annotations-bbox.csv'.format(subset)) + elif version == 'challenge2018': + validation_image_ids_path = os.path.join(metadata_dir, 'challenge-2018-image-ids-valset-od.csv') + + with open(validation_image_ids_path, 'r') as csv_file: + reader = csv.DictReader(csv_file, fieldnames=['ImageID']) + next(reader) + for line, row in enumerate(reader): + image_id = row['ImageID'] + validation_image_ids[image_id] = True + + annotations_path = os.path.join(metadata_dir, 'challenge-2018-train-annotations-bbox.csv') + else: + annotations_path = os.path.join(metadata_dir, subset, 'annotations-human-bbox.csv') + + fieldnames = ['ImageID', 'Source', 'LabelName', 'Confidence', + 'XMin', 'XMax', 'YMin', 'YMax', + 'IsOccluded', 'IsTruncated', 'IsGroupOf', 'IsDepiction', 'IsInside'] + + 
id_annotations = dict() + with open(annotations_path, 'r') as csv_file: + reader = csv.DictReader(csv_file, fieldnames=fieldnames) + next(reader) + + images_sizes = {} + for line, row in enumerate(reader): + frame = row['ImageID'] + + if version == 'challenge2018': + if subset == 'train': + if frame in validation_image_ids: + continue + elif subset == 'validation': + if frame not in validation_image_ids: + continue + else: + raise NotImplementedError('This generator handles only the train and validation subsets') + + class_name = row['LabelName'] + + if class_name not in cls_index: + continue + + cls_id = cls_index[class_name] + + if version == 'challenge2018': + # We recommend participants to use the provided subset of the training set as a validation set. + # This is preferable over using the V4 val/test sets, as the training set is more densely annotated. + img_path = os.path.join(main_dir, 'images', 'train', frame + '.jpg') + else: + img_path = os.path.join(main_dir, 'images', subset, frame + '.jpg') + + if frame in images_sizes: + width, height = images_sizes[frame] + else: + try: + with Image.open(img_path) as img: + width, height = img.width, img.height + images_sizes[frame] = (width, height) + except Exception as ex: + if version == 'challenge2018': + raise ex + continue + + x1 = float(row['XMin']) + x2 = float(row['XMax']) + y1 = float(row['YMin']) + y2 = float(row['YMax']) + + x1_int = int(round(x1 * width)) + x2_int = int(round(x2 * width)) + y1_int = int(round(y1 * height)) + y2_int = int(round(y2 * height)) + + # Check that the bounding box is valid. + if x2 <= x1: + raise ValueError('line {}: x2 ({}) must be higher than x1 ({})'.format(line, x2, x1)) + if y2 <= y1: + raise ValueError('line {}: y2 ({}) must be higher than y1 ({})'.format(line, y2, y1)) + + if y2_int == y1_int: + warnings.warn('filtering line {}: rounding y2 ({}) and y1 ({}) makes them equal'.format(line, y2, y1)) + continue + + if x2_int == x1_int: + warnings.warn('filtering line {}: rounding x2 ({}) and x1 ({}) makes them equal'.format(line, x2, x1)) + continue + + img_id = row['ImageID'] + annotation = {'cls_id': cls_id, 'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2} + + if img_id in id_annotations: + annotations = id_annotations[img_id] + annotations['boxes'].append(annotation) + else: + id_annotations[img_id] = {'w': width, 'h': height, 'boxes': [annotation]} + return id_annotations + + +class OpenImagesGenerator(Generator): + def __init__( + self, main_dir, subset, version='v4', + labels_filter=None, annotation_cache_dir='.', + parent_label=None, + **kwargs + ): + if version == 'challenge2018': + metadata = 'challenge2018' + elif version == 'v4': + metadata = '2018_04' + elif version == 'v3': + metadata = '2017_11' + else: + raise NotImplementedError('There is currently no implementation for versions older than v3') + + if version == 'challenge2018': + self.base_dir = os.path.join(main_dir, 'images', 'train') + else: + self.base_dir = os.path.join(main_dir, 'images', subset) + + metadata_dir = os.path.join(main_dir, metadata) + annotation_cache_json = os.path.join(annotation_cache_dir, subset + '.json') + + self.hierarchy = load_hierarchy(metadata_dir, version=version) + id_to_labels, cls_index = get_labels(metadata_dir, version=version) + + if os.path.exists(annotation_cache_json): + with open(annotation_cache_json, 'r') as f: + self.annotations = json.loads(f.read()) + else: + self.annotations = generate_images_annotations_json(main_dir, metadata_dir, subset, cls_index, version=version) + 
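+ # Cache the freshly parsed annotations as JSON so later runs can load them directly
+ # instead of re-parsing the large annotation CSV (see the os.path.exists check above).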
json.dump(self.annotations, open(annotation_cache_json, "w")) + + if labels_filter is not None or parent_label is not None: + self.id_to_labels, self.annotations = self.__filter_data(id_to_labels, cls_index, labels_filter, parent_label) + else: + self.id_to_labels = id_to_labels + + self.id_to_image_id = dict([(i, k) for i, k in enumerate(self.annotations)]) + + super(OpenImagesGenerator, self).__init__(**kwargs) + + def __filter_data(self, id_to_labels, cls_index, labels_filter=None, parent_label=None): + """ + If you want to work with a subset of the labels just set a list with trainable labels + :param labels_filter: Ex: labels_filter = ['Helmet', 'Hat', 'Analog television'] + :param parent_label: If parent_label is set this will bring you the parent label + but also its children in the semantic hierarchy as defined in OID, ex: Animal + hierarchical tree + :return: + """ + + children_id_to_labels = {} + + if parent_label is None: + # there is/are no other sublabel(s) other than the labels itself + + for label in labels_filter: + for i, lb in id_to_labels.items(): + if lb == label: + children_id_to_labels[i] = label + break + else: + parent_cls = None + for i, lb in id_to_labels.items(): + if lb == parent_label: + parent_id = i + for c, index in cls_index.items(): + if index == parent_id: + parent_cls = c + break + + if parent_cls is None: + raise Exception('Couldnt find label {}'.format(parent_label)) + + parent_tree = find_hierarchy_parent(self.hierarchy, parent_cls) + + if parent_tree is None: + raise Exception('Couldnt find parent {} in the semantic hierarchical tree'.format(parent_label)) + + children = load_hierarchy_children(parent_tree) + + for cls in children: + index = cls_index[cls] + label = id_to_labels[index] + children_id_to_labels[index] = label + + id_map = dict([(ind, i) for i, ind in enumerate(children_id_to_labels.keys())]) + + filtered_annotations = {} + for k in self.annotations: + img_ann = self.annotations[k] + + filtered_boxes = [] + for ann in img_ann['boxes']: + cls_id = ann['cls_id'] + if cls_id in children_id_to_labels: + ann['cls_id'] = id_map[cls_id] + filtered_boxes.append(ann) + + if len(filtered_boxes) > 0: + filtered_annotations[k] = {'w': img_ann['w'], 'h': img_ann['h'], 'boxes': filtered_boxes} + + children_id_to_labels = dict([(id_map[i], l) for (i, l) in children_id_to_labels.items()]) + + return children_id_to_labels, filtered_annotations + + def size(self): + return len(self.annotations) + + def num_classes(self): + return len(self.id_to_labels) + + def has_label(self, label): + """ Return True if label is a known label. + """ + return label in self.id_to_labels + + def has_name(self, name): + """ Returns True if name is a known class. 
+ """ + raise NotImplementedError() + + def name_to_label(self, name): + raise NotImplementedError() + + def label_to_name(self, label): + return self.id_to_labels[label] + + def image_aspect_ratio(self, image_index): + img_annotations = self.annotations[self.id_to_image_id[image_index]] + height, width = img_annotations['h'], img_annotations['w'] + return float(width) / float(height) + + def image_path(self, image_index): + path = os.path.join(self.base_dir, self.id_to_image_id[image_index] + '.jpg') + return path + + def load_image(self, image_index): + return read_image_bgr(self.image_path(image_index)) + + def load_annotations(self, image_index): + image_annotations = self.annotations[self.id_to_image_id[image_index]] + + labels = image_annotations['boxes'] + height, width = image_annotations['h'], image_annotations['w'] + + annotations = {'labels': np.empty((len(labels),)), 'bboxes': np.empty((len(labels), 4))} + for idx, ann in enumerate(labels): + cls_id = ann['cls_id'] + x1 = ann['x1'] * width + x2 = ann['x2'] * width + y1 = ann['y1'] * height + y2 = ann['y2'] * height + + annotations['bboxes'][idx, 0] = x1 + annotations['bboxes'][idx, 1] = y1 + annotations['bboxes'][idx, 2] = x2 + annotations['bboxes'][idx, 3] = y2 + annotations['labels'][idx] = cls_id + + return annotations diff --git a/src/keras_retinanet/preprocessing/pascal_voc.py b/src/keras_retinanet/preprocessing/pascal_voc.py new file mode 100644 index 0000000..564fb37 --- /dev/null +++ b/src/keras_retinanet/preprocessing/pascal_voc.py @@ -0,0 +1,203 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from ..preprocessing.generator import Generator +from ..utils.image import read_image_bgr + +import os +import numpy as np +from six import raise_from +from PIL import Image + +try: + import xml.etree.cElementTree as ET +except ImportError: + import xml.etree.ElementTree as ET + +voc_classes = { + 'aeroplane' : 0, + 'bicycle' : 1, + 'bird' : 2, + 'boat' : 3, + 'bottle' : 4, + 'bus' : 5, + 'car' : 6, + 'cat' : 7, + 'chair' : 8, + 'cow' : 9, + 'diningtable' : 10, + 'dog' : 11, + 'horse' : 12, + 'motorbike' : 13, + 'person' : 14, + 'pottedplant' : 15, + 'sheep' : 16, + 'sofa' : 17, + 'train' : 18, + 'tvmonitor' : 19 +} + + +def _findNode(parent, name, debug_name=None, parse=None): + if debug_name is None: + debug_name = name + + result = parent.find(name) + if result is None: + raise ValueError('missing element \'{}\''.format(debug_name)) + if parse is not None: + try: + return parse(result.text) + except ValueError as e: + raise_from(ValueError('illegal value for \'{}\': {}'.format(debug_name, e)), None) + return result + + +class PascalVocGenerator(Generator): + """ Generate data for a Pascal VOC dataset. + + See http://host.robots.ox.ac.uk/pascal/VOC/ for more information. + """ + + def __init__( + self, + data_dir, + set_name, + classes=voc_classes, + image_extension='.jpg', + skip_truncated=False, + skip_difficult=False, + **kwargs + ): + """ Initialize a Pascal VOC data generator. 
+ + Args + base_dir: Directory w.r.t. where the files are to be searched (defaults to the directory containing the csv_data_file). + csv_class_file: Path to the CSV classes file. + """ + self.data_dir = data_dir + self.set_name = set_name + self.classes = classes + self.image_names = [l.strip().split(None, 1)[0] for l in open(os.path.join(data_dir, 'ImageSets', 'Main', set_name + '.txt')).readlines()] + self.image_extension = image_extension + self.skip_truncated = skip_truncated + self.skip_difficult = skip_difficult + + self.labels = {} + for key, value in self.classes.items(): + self.labels[value] = key + + super(PascalVocGenerator, self).__init__(**kwargs) + + def size(self): + """ Size of the dataset. + """ + return len(self.image_names) + + def num_classes(self): + """ Number of classes in the dataset. + """ + return len(self.classes) + + def has_label(self, label): + """ Return True if label is a known label. + """ + return label in self.labels + + def has_name(self, name): + """ Returns True if name is a known class. + """ + return name in self.classes + + def name_to_label(self, name): + """ Map name to label. + """ + return self.classes[name] + + def label_to_name(self, label): + """ Map label to name. + """ + return self.labels[label] + + def image_aspect_ratio(self, image_index): + """ Compute the aspect ratio for an image with image_index. + """ + path = os.path.join(self.data_dir, 'JPEGImages', self.image_names[image_index] + self.image_extension) + image = Image.open(path) + return float(image.width) / float(image.height) + + def image_path(self, image_index): + """ Get the path to an image. + """ + return os.path.join(self.data_dir, 'JPEGImages', self.image_names[image_index] + self.image_extension) + + def load_image(self, image_index): + """ Load an image at the image_index. + """ + return read_image_bgr(self.image_path(image_index)) + + def __parse_annotation(self, element): + """ Parse an annotation given an XML element. + """ + truncated = _findNode(element, 'truncated', parse=int) + difficult = _findNode(element, 'difficult', parse=int) + + class_name = _findNode(element, 'name').text + if class_name not in self.classes: + raise ValueError('class name \'{}\' not found in classes: {}'.format(class_name, list(self.classes.keys()))) + + box = np.zeros((4,)) + label = self.name_to_label(class_name) + + bndbox = _findNode(element, 'bndbox') + box[0] = _findNode(bndbox, 'xmin', 'bndbox.xmin', parse=float) - 1 + box[1] = _findNode(bndbox, 'ymin', 'bndbox.ymin', parse=float) - 1 + box[2] = _findNode(bndbox, 'xmax', 'bndbox.xmax', parse=float) - 1 + box[3] = _findNode(bndbox, 'ymax', 'bndbox.ymax', parse=float) - 1 + + return truncated, difficult, box, label + + def __parse_annotations(self, xml_root): + """ Parse all annotations under the xml_root. + """ + annotations = {'labels': np.empty((len(xml_root.findall('object')),)), 'bboxes': np.empty((len(xml_root.findall('object')), 4))} + for i, element in enumerate(xml_root.iter('object')): + try: + truncated, difficult, box, label = self.__parse_annotation(element) + except ValueError as e: + raise_from(ValueError('could not parse object #{}: {}'.format(i, e)), None) + + if truncated and self.skip_truncated: + continue + if difficult and self.skip_difficult: + continue + + annotations['bboxes'][i, :] = box + annotations['labels'][i] = label + + return annotations + + def load_annotations(self, image_index): + """ Load annotations for an image_index. 
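+        Returns a dict with 'labels' and 'bboxes' numpy arrays parsed from the VOC XML annotation file.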
+ """ + filename = self.image_names[image_index] + '.xml' + try: + tree = ET.parse(os.path.join(self.data_dir, 'Annotations', filename)) + return self.__parse_annotations(tree.getroot()) + except ET.ParseError as e: + raise_from(ValueError('invalid annotations file: {}: {}'.format(filename, e)), None) + except ValueError as e: + raise_from(ValueError('invalid annotations file: {}: {}'.format(filename, e)), None) diff --git a/src/keras_retinanet/utils/__init__.py b/src/keras_retinanet/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/keras_retinanet/utils/anchors.py b/src/keras_retinanet/utils/anchors.py new file mode 100644 index 0000000..493d73d --- /dev/null +++ b/src/keras_retinanet/utils/anchors.py @@ -0,0 +1,346 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np +import keras + +from ..utils.compute_overlap import compute_overlap + + +class AnchorParameters: + """ The parameteres that define how anchors are generated. + + Args + sizes : List of sizes to use. Each size corresponds to one feature level. + strides : List of strides to use. Each stride correspond to one feature level. + ratios : List of ratios to use per location in a feature map. + scales : List of scales to use per location in a feature map. + """ + def __init__(self, sizes, strides, ratios, scales): + self.sizes = sizes + self.strides = strides + self.ratios = ratios + self.scales = scales + + def num_anchors(self): + return len(self.ratios) * len(self.scales) + + +""" +The default anchor parameters. +""" +AnchorParameters.default = AnchorParameters( + sizes = [32, 64, 128, 256, 512], + strides = [8, 16, 32, 64, 128], + ratios = np.array([0.5, 1, 2], keras.backend.floatx()), + scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)], keras.backend.floatx()), +) + + +def anchor_targets_bbox( + anchors, + image_group, + annotations_group, + num_classes, + negative_overlap=0.4, + positive_overlap=0.5 +): + """ Generate anchor targets for bbox detection. + + Args + anchors: np.array of annotations of shape (N, 4) for (x1, y1, x2, y2). + image_group: List of BGR images. + annotations_group: List of annotation dictionaries with each annotation containing 'labels' and 'bboxes' of an image. + num_classes: Number of classes to predict. + mask_shape: If the image is padded with zeros, mask_shape can be used to mark the relevant part of the image. + negative_overlap: IoU overlap for negative anchors (all anchors with overlap < negative_overlap are negative). + positive_overlap: IoU overlap or positive anchors (all anchors with overlap > positive_overlap are positive). + + Returns + labels_batch: batch that contains labels & anchor states (np.array of shape (batch_size, N, num_classes + 1), + where N is the number of anchors for an image and the last column defines the anchor state (-1 for ignore, 0 for bg, 1 for fg). 
+ regression_batch: batch that contains bounding-box regression targets for an image & anchor states (np.array of shape (batch_size, N, 4 + 1), + where N is the number of anchors for an image, the first 4 columns define regression targets for (x1, y1, x2, y2) and the + last column defines anchor states (-1 for ignore, 0 for bg, 1 for fg). + """ + + assert(len(image_group) == len(annotations_group)), "The length of the images and annotations need to be equal." + assert(len(annotations_group) > 0), "No data received to compute anchor targets for." + for annotations in annotations_group: + assert('bboxes' in annotations), "Annotations should contain bboxes." + assert('labels' in annotations), "Annotations should contain labels." + + batch_size = len(image_group) + + regression_batch = np.zeros((batch_size, anchors.shape[0], 4 + 1), dtype=keras.backend.floatx()) + labels_batch = np.zeros((batch_size, anchors.shape[0], num_classes + 1), dtype=keras.backend.floatx()) + + # compute labels and regression targets + for index, (image, annotations) in enumerate(zip(image_group, annotations_group)): + if annotations['bboxes'].shape[0]: + # obtain indices of gt annotations with the greatest overlap + positive_indices, ignore_indices, argmax_overlaps_inds = compute_gt_annotations(anchors, annotations['bboxes'], negative_overlap, positive_overlap) + + labels_batch[index, ignore_indices, -1] = -1 + labels_batch[index, positive_indices, -1] = 1 + + regression_batch[index, ignore_indices, -1] = -1 + regression_batch[index, positive_indices, -1] = 1 + + # compute target class labels + labels_batch[index, positive_indices, annotations['labels'][argmax_overlaps_inds[positive_indices]].astype(int)] = 1 + + regression_batch[index, :, :-1] = bbox_transform(anchors, annotations['bboxes'][argmax_overlaps_inds, :]) + + # ignore annotations outside of image + if image.shape: + anchors_centers = np.vstack([(anchors[:, 0] + anchors[:, 2]) / 2, (anchors[:, 1] + anchors[:, 3]) / 2]).T + indices = np.logical_or(anchors_centers[:, 0] >= image.shape[1], anchors_centers[:, 1] >= image.shape[0]) + + labels_batch[index, indices, -1] = -1 + regression_batch[index, indices, -1] = -1 + + return regression_batch, labels_batch + + +def compute_gt_annotations( + anchors, + annotations, + negative_overlap=0.4, + positive_overlap=0.5 +): + """ Obtain indices of gt annotations with the greatest overlap. + + Args + anchors: np.array of annotations of shape (N, 4) for (x1, y1, x2, y2). + annotations: np.array of shape (N, 5) for (x1, y1, x2, y2, label). + negative_overlap: IoU overlap for negative anchors (all anchors with overlap < negative_overlap are negative). + positive_overlap: IoU overlap or positive anchors (all anchors with overlap > positive_overlap are positive). + + Returns + positive_indices: indices of positive anchors + ignore_indices: indices of ignored anchors + argmax_overlaps_inds: ordered overlaps indices + """ + + overlaps = compute_overlap(anchors.astype(np.float64), annotations.astype(np.float64)) + argmax_overlaps_inds = np.argmax(overlaps, axis=1) + max_overlaps = overlaps[np.arange(overlaps.shape[0]), argmax_overlaps_inds] + + # assign "dont care" labels + positive_indices = max_overlaps >= positive_overlap + ignore_indices = (max_overlaps > negative_overlap) & ~positive_indices + + return positive_indices, ignore_indices, argmax_overlaps_inds + + +def layer_shapes(image_shape, model): + """Compute layer shapes given input image shape and the model. + + Args + image_shape: The shape of the image. 
+ model: The model to use for computing how the image shape is transformed in the pyramid. + + Returns + A dictionary mapping layer names to image shapes. + """ + shape = { + model.layers[0].name: (None,) + image_shape, + } + + for layer in model.layers[1:]: + nodes = layer._inbound_nodes + for node in nodes: + inputs = [shape[lr.name] for lr in node.inbound_layers] + if not inputs: + continue + shape[layer.name] = layer.compute_output_shape(inputs[0] if len(inputs) == 1 else inputs) + + return shape + + +def make_shapes_callback(model): + """ Make a function for getting the shape of the pyramid levels. + """ + def get_shapes(image_shape, pyramid_levels): + shape = layer_shapes(image_shape, model) + image_shapes = [shape["P{}".format(level)][1:3] for level in pyramid_levels] + return image_shapes + + return get_shapes + + +def guess_shapes(image_shape, pyramid_levels): + """Guess shapes based on pyramid levels. + + Args + image_shape: The shape of the image. + pyramid_levels: A list of what pyramid levels are used. + + Returns + A list of image shapes at each pyramid level. + """ + image_shape = np.array(image_shape[:2]) + image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in pyramid_levels] + return image_shapes + + +def anchors_for_shape( + image_shape, + pyramid_levels=None, + anchor_params=None, + shapes_callback=None, +): + """ Generators anchors for a given shape. + + Args + image_shape: The shape of the image. + pyramid_levels: List of ints representing which pyramids to use (defaults to [3, 4, 5, 6, 7]). + anchor_params: Struct containing anchor parameters. If None, default values are used. + shapes_callback: Function to call for getting the shape of the image at different pyramid levels. + + Returns + np.array of shape (N, 4) containing the (x1, y1, x2, y2) coordinates for the anchors. + """ + + if pyramid_levels is None: + pyramid_levels = [3, 4, 5, 6, 7] + + if anchor_params is None: + anchor_params = AnchorParameters.default + + if shapes_callback is None: + shapes_callback = guess_shapes + image_shapes = shapes_callback(image_shape, pyramid_levels) + + # compute anchors over all pyramid levels + all_anchors = np.zeros((0, 4)) + for idx, p in enumerate(pyramid_levels): + anchors = generate_anchors( + base_size=anchor_params.sizes[idx], + ratios=anchor_params.ratios, + scales=anchor_params.scales + ) + shifted_anchors = shift(image_shapes[idx], anchor_params.strides[idx], anchors) + all_anchors = np.append(all_anchors, shifted_anchors, axis=0) + + return all_anchors + + +def shift(shape, stride, anchors): + """ Produce shifted anchors based on shape of the map and stride size. + + Args + shape : Shape to shift the anchors over. + stride : Stride to shift the anchors with over the shape. + anchors: The anchors to apply at each location. 
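+    Returns
+        np.array of shape (K * A, 4) containing one copy of every anchor centred on every grid
+        position, where K is the number of grid cells and A the number of anchors per location.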
+ """ + + # create a grid starting from half stride from the top left corner + shift_x = (np.arange(0, shape[1]) + 0.5) * stride + shift_y = (np.arange(0, shape[0]) + 0.5) * stride + + shift_x, shift_y = np.meshgrid(shift_x, shift_y) + + shifts = np.vstack(( + shift_x.ravel(), shift_y.ravel(), + shift_x.ravel(), shift_y.ravel() + )).transpose() + + # add A anchors (1, A, 4) to + # cell K shifts (K, 1, 4) to get + # shift anchors (K, A, 4) + # reshape to (K*A, 4) shifted anchors + A = anchors.shape[0] + K = shifts.shape[0] + all_anchors = (anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2))) + all_anchors = all_anchors.reshape((K * A, 4)) + + return all_anchors + + +def generate_anchors(base_size=16, ratios=None, scales=None): + """ + Generate anchor (reference) windows by enumerating aspect ratios X + scales w.r.t. a reference window. + """ + + if ratios is None: + ratios = AnchorParameters.default.ratios + + if scales is None: + scales = AnchorParameters.default.scales + + num_anchors = len(ratios) * len(scales) + + # initialize output anchors + anchors = np.zeros((num_anchors, 4)) + + # scale base_size + anchors[:, 2:] = base_size * np.tile(scales, (2, len(ratios))).T + + # compute areas of anchors + areas = anchors[:, 2] * anchors[:, 3] + + # correct for ratios + anchors[:, 2] = np.sqrt(areas / np.repeat(ratios, len(scales))) + anchors[:, 3] = anchors[:, 2] * np.repeat(ratios, len(scales)) + + # transform from (x_ctr, y_ctr, w, h) -> (x1, y1, x2, y2) + anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T + anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T + + return anchors + + +def bbox_transform(anchors, gt_boxes, mean=None, std=None): + """Compute bounding-box regression targets for an image.""" + + # The Mean and std are calculated from COCO dataset. + # Bounding box normalization was firstly introduced in the Fast R-CNN paper. + # See https://github.com/fizyr/keras-retinanet/issues/1273#issuecomment-585828825 for more details + if mean is None: + mean = np.array([0, 0, 0, 0]) + if std is None: + std = np.array([0.2, 0.2, 0.2, 0.2]) + + if isinstance(mean, (list, tuple)): + mean = np.array(mean) + elif not isinstance(mean, np.ndarray): + raise ValueError('Expected mean to be a np.ndarray, list or tuple. Received: {}'.format(type(mean))) + + if isinstance(std, (list, tuple)): + std = np.array(std) + elif not isinstance(std, np.ndarray): + raise ValueError('Expected std to be a np.ndarray, list or tuple. Received: {}'.format(type(std))) + + anchor_widths = anchors[:, 2] - anchors[:, 0] + anchor_heights = anchors[:, 3] - anchors[:, 1] + + # According to the information provided by a keras-retinanet author, they got marginally better results using + # the following way of bounding box parametrization. 
+ # See https://github.com/fizyr/keras-retinanet/issues/1273#issuecomment-585828825 for more details + targets_dx1 = (gt_boxes[:, 0] - anchors[:, 0]) / anchor_widths + targets_dy1 = (gt_boxes[:, 1] - anchors[:, 1]) / anchor_heights + targets_dx2 = (gt_boxes[:, 2] - anchors[:, 2]) / anchor_widths + targets_dy2 = (gt_boxes[:, 3] - anchors[:, 3]) / anchor_heights + + targets = np.stack((targets_dx1, targets_dy1, targets_dx2, targets_dy2)) + targets = targets.T + + targets = (targets - mean) / std + + return targets diff --git a/src/keras_retinanet/utils/coco_eval.py b/src/keras_retinanet/utils/coco_eval.py new file mode 100644 index 0000000..e8d39c5 --- /dev/null +++ b/src/keras_retinanet/utils/coco_eval.py @@ -0,0 +1,93 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from pycocotools.cocoeval import COCOeval + +import keras +import numpy as np +import json + +import progressbar +assert(callable(progressbar.progressbar)), "Using wrong progressbar module, install 'progressbar2' instead." + + +def evaluate_coco(generator, model, threshold=0.05): + """ Use the pycocotools to evaluate a COCO model on a dataset. + + Args + generator : The generator for generating the evaluation data. + model : The model to evaluate. + threshold : The score threshold to use. 
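+    Returns
+        The statistics array produced by COCOeval (coco_eval.stats), or None if no detections were made.
+        As a side effect, '<set_name>_bbox_results.json' and '<set_name>_processed_image_ids.json' are
+        written to the working directory.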
+ """ + # start collecting results + results = [] + image_ids = [] + for index in progressbar.progressbar(range(generator.size()), prefix='COCO evaluation: '): + image = generator.load_image(index) + image = generator.preprocess_image(image) + image, scale = generator.resize_image(image) + + if keras.backend.image_data_format() == 'channels_first': + image = image.transpose((2, 0, 1)) + + # run network + boxes, scores, labels = model.predict_on_batch(np.expand_dims(image, axis=0)) + + # correct boxes for image scale + boxes /= scale + + # change to (x, y, w, h) (MS COCO standard) + boxes[:, :, 2] -= boxes[:, :, 0] + boxes[:, :, 3] -= boxes[:, :, 1] + + # compute predicted labels and scores + for box, score, label in zip(boxes[0], scores[0], labels[0]): + # scores are sorted, so we can break + if score < threshold: + break + + # append detection for each positively labeled class + image_result = { + 'image_id' : generator.image_ids[index], + 'category_id' : generator.label_to_coco_label(label), + 'score' : float(score), + 'bbox' : box.tolist(), + } + + # append detection to results + results.append(image_result) + + # append image to list of processed images + image_ids.append(generator.image_ids[index]) + + if not len(results): + return + + # write output + json.dump(results, open('{}_bbox_results.json'.format(generator.set_name), 'w'), indent=4) + json.dump(image_ids, open('{}_processed_image_ids.json'.format(generator.set_name), 'w'), indent=4) + + # load results in COCO evaluation tool + coco_true = generator.coco + coco_pred = coco_true.loadRes('{}_bbox_results.json'.format(generator.set_name)) + + # run COCO evaluation + coco_eval = COCOeval(coco_true, coco_pred, 'bbox') + coco_eval.params.imgIds = image_ids + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + return coco_eval.stats diff --git a/src/keras_retinanet/utils/colors.py b/src/keras_retinanet/utils/colors.py new file mode 100644 index 0000000..7f1b685 --- /dev/null +++ b/src/keras_retinanet/utils/colors.py @@ -0,0 +1,112 @@ +import warnings + + +def label_color(label): + """ Return a color from a set of predefined colors. Contains 80 colors in total. + + Args + label: The label to get the color for. + + Returns + A list of three values representing a RGB color. + + If no color is defined for a certain label, the color green is returned and a warning is printed. 
+ """ + if label < len(colors): + return colors[label] + else: + warnings.warn('Label {} has no color, returning default.'.format(label)) + return (0, 255, 0) + + +""" +Generated using: + +``` +colors = [list((matplotlib.colors.hsv_to_rgb([x, 1.0, 1.0]) * 255).astype(int)) for x in np.arange(0, 1, 1.0 / 80)] +shuffle(colors) +pprint(colors) +``` +""" +colors = [ + [31 , 0 , 255] , + [0 , 159 , 255] , + [255 , 95 , 0] , + [255 , 19 , 0] , + [255 , 0 , 0] , + [255 , 38 , 0] , + [0 , 255 , 25] , + [255 , 0 , 133] , + [255 , 172 , 0] , + [108 , 0 , 255] , + [0 , 82 , 255] , + [0 , 255 , 6] , + [255 , 0 , 152] , + [223 , 0 , 255] , + [12 , 0 , 255] , + [0 , 255 , 178] , + [108 , 255 , 0] , + [184 , 0 , 255] , + [255 , 0 , 76] , + [146 , 255 , 0] , + [51 , 0 , 255] , + [0 , 197 , 255] , + [255 , 248 , 0] , + [255 , 0 , 19] , + [255 , 0 , 38] , + [89 , 255 , 0] , + [127 , 255 , 0] , + [255 , 153 , 0] , + [0 , 255 , 255] , + [0 , 255 , 216] , + [0 , 255 , 121] , + [255 , 0 , 248] , + [70 , 0 , 255] , + [0 , 255 , 159] , + [0 , 216 , 255] , + [0 , 6 , 255] , + [0 , 63 , 255] , + [31 , 255 , 0] , + [255 , 57 , 0] , + [255 , 0 , 210] , + [0 , 255 , 102] , + [242 , 255 , 0] , + [255 , 191 , 0] , + [0 , 255 , 63] , + [255 , 0 , 95] , + [146 , 0 , 255] , + [184 , 255 , 0] , + [255 , 114 , 0] , + [0 , 255 , 235] , + [255 , 229 , 0] , + [0 , 178 , 255] , + [255 , 0 , 114] , + [255 , 0 , 57] , + [0 , 140 , 255] , + [0 , 121 , 255] , + [12 , 255 , 0] , + [255 , 210 , 0] , + [0 , 255 , 44] , + [165 , 255 , 0] , + [0 , 25 , 255] , + [0 , 255 , 140] , + [0 , 101 , 255] , + [0 , 255 , 82] , + [223 , 255 , 0] , + [242 , 0 , 255] , + [89 , 0 , 255] , + [165 , 0 , 255] , + [70 , 255 , 0] , + [255 , 0 , 172] , + [255 , 76 , 0] , + [203 , 255 , 0] , + [204 , 0 , 255] , + [255 , 0 , 229] , + [255 , 133 , 0] , + [127 , 0 , 255] , + [0 , 235 , 255] , + [0 , 255 , 197] , + [255 , 0 , 191] , + [0 , 44 , 255] , + [50 , 255 , 0] +] diff --git a/src/keras_retinanet/utils/compute_overlap.pyx b/src/keras_retinanet/utils/compute_overlap.pyx new file mode 100644 index 0000000..e8b7930 --- /dev/null +++ b/src/keras_retinanet/utils/compute_overlap.pyx @@ -0,0 +1,53 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Sergey Karayev +# -------------------------------------------------------- + +cimport cython +import numpy as np +cimport numpy as np + + +def compute_overlap( + np.ndarray[double, ndim=2] boxes, + np.ndarray[double, ndim=2] query_boxes +): + """ + Args + a: (N, 4) ndarray of float + b: (K, 4) ndarray of float + + Returns + overlaps: (N, K) ndarray of overlap between boxes and query_boxes + """ + cdef unsigned int N = boxes.shape[0] + cdef unsigned int K = query_boxes.shape[0] + cdef np.ndarray[double, ndim=2] overlaps = np.zeros((N, K), dtype=np.float64) + cdef double iw, ih, box_area + cdef double ua + cdef unsigned int k, n + for k in range(K): + box_area = ( + (query_boxes[k, 2] - query_boxes[k, 0]) * + (query_boxes[k, 3] - query_boxes[k, 1]) + ) + for n in range(N): + iw = ( + min(boxes[n, 2], query_boxes[k, 2]) - + max(boxes[n, 0], query_boxes[k, 0]) + ) + if iw > 0: + ih = ( + min(boxes[n, 3], query_boxes[k, 3]) - + max(boxes[n, 1], query_boxes[k, 1]) + ) + if ih > 0: + ua = np.float64( + (boxes[n, 2] - boxes[n, 0]) * + (boxes[n, 3] - boxes[n, 1]) + + box_area - iw * ih + ) + overlaps[n, k] = iw * ih / ua + return overlaps diff --git a/src/keras_retinanet/utils/config.py 
b/src/keras_retinanet/utils/config.py new file mode 100644 index 0000000..137f47c --- /dev/null +++ b/src/keras_retinanet/utils/config.py @@ -0,0 +1,47 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import configparser +import numpy as np +import keras +from ..utils.anchors import AnchorParameters + + +def read_config_file(config_path): + config = configparser.ConfigParser() + + with open(config_path, 'r') as file: + config.read_file(file) + + assert 'anchor_parameters' in config, \ + "Malformed config file. Verify that it contains the anchor_parameters section." + + config_keys = set(config['anchor_parameters']) + default_keys = set(AnchorParameters.default.__dict__.keys()) + + assert config_keys <= default_keys, \ + "Malformed config file. These keys are not valid: {}".format(config_keys - default_keys) + + return config + + +def parse_anchor_parameters(config): + ratios = np.array(list(map(float, config['anchor_parameters']['ratios'].split(' '))), keras.backend.floatx()) + scales = np.array(list(map(float, config['anchor_parameters']['scales'].split(' '))), keras.backend.floatx()) + sizes = list(map(int, config['anchor_parameters']['sizes'].split(' '))) + strides = list(map(int, config['anchor_parameters']['strides'].split(' '))) + + return AnchorParameters(sizes, strides, ratios, scales) diff --git a/src/keras_retinanet/utils/eval.py b/src/keras_retinanet/utils/eval.py new file mode 100644 index 0000000..da411b0 --- /dev/null +++ b/src/keras_retinanet/utils/eval.py @@ -0,0 +1,244 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from .anchors import compute_overlap +from .visualization import draw_detections, draw_annotations + +import keras +import numpy as np +import os +import time + +import cv2 +import progressbar +assert(callable(progressbar.progressbar)), "Using wrong progressbar module, install 'progressbar2' instead." + + +def _compute_ap(recall, precision): + """ Compute the average precision, given the recall and precision curves. + + Code originally from https://github.com/rbgirshick/py-faster-rcnn. + + # Arguments + recall: The recall curve (list). + precision: The precision curve (list). + # Returns + The average precision as computed in py-faster-rcnn. 
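+    For example, recall = [0.5, 1.0] with precision = [1.0, 0.5] gives
+    AP = 0.5 * 1.0 + 0.5 * 0.5 = 0.75 under this interpolation.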
+ """ + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.], recall, [1.])) + mpre = np.concatenate(([0.], precision, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def _get_detections(generator, model, score_threshold=0.05, max_detections=100, save_path=None): + """ Get the detections from the model using the generator. + + The result is a list of lists such that the size is: + all_detections[num_images][num_classes] = detections[num_detections, 4 + num_classes] + + # Arguments + generator : The generator used to run images through the model. + model : The model to run on the images. + score_threshold : The score confidence threshold to use. + max_detections : The maximum number of detections to use per image. + save_path : The path to save the images with visualized detections to. + # Returns + A list of lists containing the detections for each image in the generator. + """ + all_detections = [[None for i in range(generator.num_classes()) if generator.has_label(i)] for j in range(generator.size())] + all_inferences = [None for i in range(generator.size())] + + for i in progressbar.progressbar(range(generator.size()), prefix='Running network: '): + raw_image = generator.load_image(i) + image = generator.preprocess_image(raw_image.copy()) + image, scale = generator.resize_image(image) + + if keras.backend.image_data_format() == 'channels_first': + image = image.transpose((2, 0, 1)) + + # run network + start = time.time() + boxes, scores, labels = model.predict_on_batch(np.expand_dims(image, axis=0))[:3] + inference_time = time.time() - start + + # correct boxes for image scale + boxes /= scale + + # select indices which have a score above the threshold + indices = np.where(scores[0, :] > score_threshold)[0] + + # select those scores + scores = scores[0][indices] + + # find the order with which to sort the scores + scores_sort = np.argsort(-scores)[:max_detections] + + # select detections + image_boxes = boxes[0, indices[scores_sort], :] + image_scores = scores[scores_sort] + image_labels = labels[0, indices[scores_sort]] + image_detections = np.concatenate([image_boxes, np.expand_dims(image_scores, axis=1), np.expand_dims(image_labels, axis=1)], axis=1) + + if save_path is not None: + draw_annotations(raw_image, generator.load_annotations(i), label_to_name=generator.label_to_name) + draw_detections(raw_image, image_boxes, image_scores, image_labels, label_to_name=generator.label_to_name, score_threshold=score_threshold) + + cv2.imwrite(os.path.join(save_path, '{}.png'.format(i)), raw_image) + + # copy detections to all_detections + for label in range(generator.num_classes()): + if not generator.has_label(label): + continue + + all_detections[i][label] = image_detections[image_detections[:, -1] == label, :-1] + + all_inferences[i] = inference_time + + return all_detections, all_inferences + + +def _get_annotations(generator): + """ Get the ground truth annotations from the generator. + + The result is a list of lists such that the size is: + all_detections[num_images][num_classes] = annotations[num_detections, 5] + + # Arguments + generator : The generator used to retrieve ground truth annotations. 
+ # Returns + A list of lists containing the annotations for each image in the generator. + """ + all_annotations = [[None for i in range(generator.num_classes())] for j in range(generator.size())] + + for i in progressbar.progressbar(range(generator.size()), prefix='Parsing annotations: '): + # load the annotations + annotations = generator.load_annotations(i) + + # copy detections to all_annotations + for label in range(generator.num_classes()): + if not generator.has_label(label): + continue + + all_annotations[i][label] = annotations['bboxes'][annotations['labels'] == label, :].copy() + + return all_annotations + + +def evaluate( + generator, + model, + iou_threshold=0.5, + score_threshold=0.05, + max_detections=100, + save_path=None +): + """ Evaluate a given dataset using a given model. + + # Arguments + generator : The generator that represents the dataset to evaluate. + model : The model to evaluate. + iou_threshold : The threshold used to consider when a detection is positive or negative. + score_threshold : The score confidence threshold to use for detections. + max_detections : The maximum number of detections to use per image. + save_path : The path to save images with visualized detections to. + # Returns + A dict mapping class names to mAP scores. + """ + # gather all detections and annotations + all_detections, all_inferences = _get_detections(generator, model, score_threshold=score_threshold, max_detections=max_detections, save_path=save_path) + all_annotations = _get_annotations(generator) + average_precisions = {} + + # all_detections = pickle.load(open('all_detections.pkl', 'rb')) + # all_annotations = pickle.load(open('all_annotations.pkl', 'rb')) + # pickle.dump(all_detections, open('all_detections.pkl', 'wb')) + # pickle.dump(all_annotations, open('all_annotations.pkl', 'wb')) + + # process detections and annotations + for label in range(generator.num_classes()): + if not generator.has_label(label): + continue + + false_positives = np.zeros((0,)) + true_positives = np.zeros((0,)) + scores = np.zeros((0,)) + num_annotations = 0.0 + + for i in range(generator.size()): + detections = all_detections[i][label] + annotations = all_annotations[i][label] + num_annotations += annotations.shape[0] + detected_annotations = [] + + for d in detections: + scores = np.append(scores, d[4]) + + if annotations.shape[0] == 0: + false_positives = np.append(false_positives, 1) + true_positives = np.append(true_positives, 0) + continue + + overlaps = compute_overlap(np.expand_dims(d, axis=0), annotations) + assigned_annotation = np.argmax(overlaps, axis=1) + max_overlap = overlaps[0, assigned_annotation] + + if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations: + false_positives = np.append(false_positives, 0) + true_positives = np.append(true_positives, 1) + detected_annotations.append(assigned_annotation) + else: + false_positives = np.append(false_positives, 1) + true_positives = np.append(true_positives, 0) + + # no annotations -> AP for this class is 0 (is this correct?) 
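+        # Such classes are stored as (AP, num_annotations) = (0, 0), which lets callers detect and
+        # skip them when averaging mAP.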
+ if num_annotations == 0: + average_precisions[label] = 0, 0 + continue + + # sort by score + indices = np.argsort(-scores) + false_positives = false_positives[indices] + true_positives = true_positives[indices] + + # compute false positives and true positives + false_positives = np.cumsum(false_positives) + true_positives = np.cumsum(true_positives) + + # compute recall and precision + recall = true_positives / num_annotations + precision = true_positives / np.maximum(true_positives + false_positives, np.finfo(np.float64).eps) + + # compute average precision + average_precision = _compute_ap(recall, precision) + average_precisions[label] = average_precision, num_annotations + + # inference time + inference_time = np.sum(all_inferences) / generator.size() + + return average_precisions, inference_time diff --git a/src/keras_retinanet/utils/gpu.py b/src/keras_retinanet/utils/gpu.py new file mode 100644 index 0000000..968c2b2 --- /dev/null +++ b/src/keras_retinanet/utils/gpu.py @@ -0,0 +1,53 @@ +""" +Copyright 2017-2019 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import tensorflow as tf + +from .tf_version import tf_version_ok + + +def setup_gpu(gpu_id): + if tf_version_ok((2, 0, 0)): + if gpu_id == 'cpu' or gpu_id == -1: + tf.config.experimental.set_visible_devices([], 'GPU') + return + + gpus = tf.config.experimental.list_physical_devices('GPU') + if gpus: + # Restrict TensorFlow to only use the first GPU. + try: + # Currently, memory growth needs to be the same across GPUs. + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + + # Use only the selcted gpu. + tf.config.experimental.set_visible_devices(gpus[int(gpu_id)], 'GPU') + except RuntimeError as e: + # Visible devices must be set before GPUs have been initialized. + print(e) + + logical_gpus = tf.config.experimental.list_logical_devices('GPU') + print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs") + else: + import os + if gpu_id == 'cpu' or gpu_id == -1: + os.environ['CUDA_VISIBLE_DEVICES'] = "" + return + + os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id) + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + tf.keras.backend.set_session(tf.Session(config=config)) diff --git a/src/keras_retinanet/utils/image.py b/src/keras_retinanet/utils/image.py new file mode 100644 index 0000000..b3116cd --- /dev/null +++ b/src/keras_retinanet/utils/image.py @@ -0,0 +1,356 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from __future__ import division +import numpy as np +import cv2 +from PIL import Image + +from .transform import change_transform_origin + + +def read_image_bgr(path): + """ Read an image in BGR format. + + Args + path: Path to the image. + """ + # We deliberately don't use cv2.imread here, since it gives no feedback on errors while reading the image. + image = np.ascontiguousarray(Image.open(path).convert('RGB')) + return image[:, :, ::-1] + + +def preprocess_image(x, mode='caffe'): + """ Preprocess an image by subtracting the ImageNet mean. + + Args + x: np.array of shape (None, None, 3) or (3, None, None). + mode: One of "caffe" or "tf". + - caffe: will zero-center each color channel with + respect to the ImageNet dataset, without scaling. + - tf: will scale pixels between -1 and 1, sample-wise. + + Returns + The input with the ImageNet mean subtracted. + """ + # mostly identical to "https://github.com/keras-team/keras-applications/blob/master/keras_applications/imagenet_utils.py" + # except for converting RGB -> BGR since we assume BGR already + + # covert always to float32 to keep compatibility with opencv + x = x.astype(np.float32) + + if mode == 'tf': + x /= 127.5 + x -= 1. + elif mode == 'caffe': + x -= [103.939, 116.779, 123.68] + + return x + + +def adjust_transform_for_image(transform, image, relative_translation): + """ Adjust a transformation for a specific image. + + The translation of the matrix will be scaled with the size of the image. + The linear part of the transformation will adjusted so that the origin of the transformation will be at the center of the image. + """ + height, width, channels = image.shape + + result = transform + + # Scale the translation with the image size if specified. + if relative_translation: + result[0:2, 2] *= [width, height] + + # Move the origin of transformation. + result = change_transform_origin(transform, (0.5 * width, 0.5 * height)) + + return result + + +class TransformParameters: + """ Struct holding parameters determining how to apply a transformation to an image. + + Args + fill_mode: One of: 'constant', 'nearest', 'reflect', 'wrap' + interpolation: One of: 'nearest', 'linear', 'cubic', 'area', 'lanczos4' + cval: Fill value to use with fill_mode='constant' + relative_translation: If true (the default), interpret translation as a factor of the image size. + If false, interpret it as absolute pixels. + """ + def __init__( + self, + fill_mode = 'nearest', + interpolation = 'linear', + cval = 0, + relative_translation = True, + ): + self.fill_mode = fill_mode + self.cval = cval + self.interpolation = interpolation + self.relative_translation = relative_translation + + def cvBorderMode(self): + if self.fill_mode == 'constant': + return cv2.BORDER_CONSTANT + if self.fill_mode == 'nearest': + return cv2.BORDER_REPLICATE + if self.fill_mode == 'reflect': + return cv2.BORDER_REFLECT_101 + if self.fill_mode == 'wrap': + return cv2.BORDER_WRAP + + def cvInterpolation(self): + if self.interpolation == 'nearest': + return cv2.INTER_NEAREST + if self.interpolation == 'linear': + return cv2.INTER_LINEAR + if self.interpolation == 'cubic': + return cv2.INTER_CUBIC + if self.interpolation == 'area': + return cv2.INTER_AREA + if self.interpolation == 'lanczos4': + return cv2.INTER_LANCZOS4 + + +def apply_transform(matrix, image, params): + """ + Apply a transformation to an image. + + The origin of transformation is at the top left corner of the image. 
+ + The matrix is interpreted such that a point (x, y) on the original image is moved to transform * (x, y) in the generated image. + Mathematically speaking, that means that the matrix is a transformation from the transformed image space to the original image space. + + Args + matrix: A homogeneous 3 by 3 matrix holding representing the transformation to apply. + image: The image to transform. + params: The transform parameters (see TransformParameters) + """ + output = cv2.warpAffine( + image, + matrix[:2, :], + dsize = (image.shape[1], image.shape[0]), + flags = params.cvInterpolation(), + borderMode = params.cvBorderMode(), + borderValue = params.cval, + ) + return output + + +def compute_resize_scale(image_shape, min_side=800, max_side=1333): + """ Compute an image scale such that the image size is constrained to min_side and max_side. + + Args + min_side: The image's min side will be equal to min_side after resizing. + max_side: If after resizing the image's max side is above max_side, resize until the max side is equal to max_side. + + Returns + A resizing scale. + """ + (rows, cols, _) = image_shape + + smallest_side = min(rows, cols) + + # rescale the image so the smallest side is min_side + scale = min_side / smallest_side + + # check if the largest side is now greater than max_side, which can happen + # when images have a large aspect ratio + largest_side = max(rows, cols) + if largest_side * scale > max_side: + scale = max_side / largest_side + + return scale + + +def resize_image(img, min_side=800, max_side=1333): + """ Resize an image such that the size is constrained to min_side and max_side. + + Args + min_side: The image's min side will be equal to min_side after resizing. + max_side: If after resizing the image's max side is above max_side, resize until the max side is equal to max_side. + + Returns + A resized image. + """ + # compute scale to resize the image + scale = compute_resize_scale(img.shape, min_side=min_side, max_side=max_side) + + # resize the image with the computed scale + img = cv2.resize(img, None, fx=scale, fy=scale) + + return img, scale + + +def _uniform(val_range): + """ Uniformly sample from the given range. + + Args + val_range: A pair of lower and upper bound. + """ + return np.random.uniform(val_range[0], val_range[1]) + + +def _check_range(val_range, min_val=None, max_val=None): + """ Check whether the range is a valid range. + + Args + val_range: A pair of lower and upper bound. + min_val: Minimal value for the lower bound. + max_val: Maximal value for the upper bound. + """ + if val_range[0] > val_range[1]: + raise ValueError('interval lower bound > upper bound') + if min_val is not None and val_range[0] < min_val: + raise ValueError('invalid interval lower bound') + if max_val is not None and val_range[1] > max_val: + raise ValueError('invalid interval upper bound') + + +def _clip(image): + """ + Clip and convert an image to np.uint8. + + Args + image: Image to clip. + """ + return np.clip(image, 0, 255).astype(np.uint8) + + +class VisualEffect: + """ Struct holding parameters and applying image color transformation. + + Args + contrast_factor: A factor for adjusting contrast. Should be between 0 and 3. + brightness_delta: Brightness offset between -1 and 1 added to the pixel values. + hue_delta: Hue offset between -1 and 1 added to the hue channel. + saturation_factor: A factor multiplying the saturation values of each pixel. 
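+        Example (illustrative values)
+            effect = VisualEffect(contrast_factor=1.1, brightness_delta=0.05, hue_delta=0, saturation_factor=1.0)
+            adjusted = effect(bgr_image)  # parameters left at 0 or None skip the corresponding adjustment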
+ """ + + def __init__( + self, + contrast_factor, + brightness_delta, + hue_delta, + saturation_factor, + ): + self.contrast_factor = contrast_factor + self.brightness_delta = brightness_delta + self.hue_delta = hue_delta + self.saturation_factor = saturation_factor + + def __call__(self, image): + """ Apply a visual effect on the image. + + Args + image: Image to adjust + """ + + if self.contrast_factor: + image = adjust_contrast(image, self.contrast_factor) + if self.brightness_delta: + image = adjust_brightness(image, self.brightness_delta) + + if self.hue_delta or self.saturation_factor: + + image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) + + if self.hue_delta: + image = adjust_hue(image, self.hue_delta) + if self.saturation_factor: + image = adjust_saturation(image, self.saturation_factor) + + image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) + + return image + + +def random_visual_effect_generator( + contrast_range=(0.9, 1.1), + brightness_range=(-.1, .1), + hue_range=(-0.05, 0.05), + saturation_range=(0.95, 1.05) +): + """ Generate visual effect parameters uniformly sampled from the given intervals. + + Args + contrast_factor: A factor interval for adjusting contrast. Should be between 0 and 3. + brightness_delta: An interval between -1 and 1 for the amount added to the pixels. + hue_delta: An interval between -1 and 1 for the amount added to the hue channel. + The values are rotated if they exceed 180. + saturation_factor: An interval for the factor multiplying the saturation values of each + pixel. + """ + _check_range(contrast_range, 0) + _check_range(brightness_range, -1, 1) + _check_range(hue_range, -1, 1) + _check_range(saturation_range, 0) + + def _generate(): + while True: + yield VisualEffect( + contrast_factor=_uniform(contrast_range), + brightness_delta=_uniform(brightness_range), + hue_delta=_uniform(hue_range), + saturation_factor=_uniform(saturation_range), + ) + + return _generate() + + +def adjust_contrast(image, factor): + """ Adjust contrast of an image. + + Args + image: Image to adjust. + factor: A factor for adjusting contrast. + """ + mean = image.mean(axis=0).mean(axis=0) + return _clip((image - mean) * factor + mean) + + +def adjust_brightness(image, delta): + """ Adjust brightness of an image + + Args + image: Image to adjust. + delta: Brightness offset between -1 and 1 added to the pixel values. + """ + return _clip(image + delta * 255) + + +def adjust_hue(image, delta): + """ Adjust hue of an image. + + Args + image: Image to adjust. + delta: An interval between -1 and 1 for the amount added to the hue channel. + The values are rotated if they exceed 180. + """ + image[..., 0] = np.mod(image[..., 0] + delta * 180, 180) + return image + + +def adjust_saturation(image, factor): + """ Adjust saturation of an image. + + Args + image: Image to adjust. + factor: An interval for the factor multiplying the saturation values of each pixel. + """ + image[..., 1] = np.clip(image[..., 1] * factor, 0 , 255) + return image diff --git a/src/keras_retinanet/utils/keras_version.py b/src/keras_retinanet/utils/keras_version.py new file mode 100644 index 0000000..626f265 --- /dev/null +++ b/src/keras_retinanet/utils/keras_version.py @@ -0,0 +1,55 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from __future__ import print_function + +import keras +import sys + +minimum_keras_version = 2, 3, 0 + + +def keras_version(): + """ Get the Keras version. + + Returns + tuple of (major, minor, patch). + """ + return tuple(map(int, keras.__version__.split('.'))) + + +def keras_version_ok(): + """ Check if the current Keras version is higher than the minimum version. + """ + return keras_version() >= minimum_keras_version + + +def assert_keras_version(): + """ Assert that the Keras version is up to date. + """ + detected = keras.__version__ + required = '.'.join(map(str, minimum_keras_version)) + assert(keras_version() >= minimum_keras_version), 'You are using keras version {}. The minimum required version is {}.'.format(detected, required) + + +def check_keras_version(): + """ Check that the Keras version is up to date. If it isn't, print an error message and exit the script. + """ + try: + assert_keras_version() + except AssertionError as e: + print(e, file=sys.stderr) + sys.exit(1) diff --git a/src/keras_retinanet/utils/model.py b/src/keras_retinanet/utils/model.py new file mode 100644 index 0000000..702262c --- /dev/null +++ b/src/keras_retinanet/utils/model.py @@ -0,0 +1,28 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + + +def freeze(model): + """ Set all layers in a model to non-trainable. + + The weights for these layers will not be updated during training. + + This function modifies the given model in-place, + but it also returns the modified model to allow easy chaining with other functions. + """ + for layer in model.layers: + layer.trainable = False + return model diff --git a/src/keras_retinanet/utils/tf_version.py b/src/keras_retinanet/utils/tf_version.py new file mode 100644 index 0000000..e6eb31a --- /dev/null +++ b/src/keras_retinanet/utils/tf_version.py @@ -0,0 +1,58 @@ +""" +Copyright 2017-2019 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from __future__ import print_function + +import tensorflow as tf +import sys + +MINIMUM_TF_VERSION = 1, 14, 0 +BLACKLISTED_TF_VERSIONS = [ + (2, 0, 0), # Has a number of memory leaks and issues with eager execution. + (2, 0, 1), # Has a number of memory leaks and issues with eager execution. +] + + +def tf_version(): + """ Get the Tensorflow version. + Returns + tuple of (major, minor, patch). + """ + return tuple(map(int, tf.version.VERSION.split('-')[0].split('.'))) + + +def tf_version_ok(minimum_tf_version=MINIMUM_TF_VERSION, blacklisted=BLACKLISTED_TF_VERSIONS): + """ Check if the current Tensorflow version is higher than the minimum version. + """ + return tf_version() >= minimum_tf_version and tf_version() not in blacklisted + + +def assert_tf_version(minimum_tf_version=MINIMUM_TF_VERSION, blacklisted=BLACKLISTED_TF_VERSIONS): + """ Assert that the Tensorflow version is up to date. + """ + detected = tf.version.VERSION + required = '.'.join(map(str, minimum_tf_version)) + assert(tf_version_ok(minimum_tf_version, blacklisted)), 'You are using tensorflow version {}. The minimum required version is {} (blacklisted: {}).'.format(detected, required, blacklisted) + + +def check_tf_version(): + """ Check that the Tensorflow version is up to date. If it isn't, print an error message and exit the script. + """ + try: + assert_tf_version() + except AssertionError as e: + print(e, file=sys.stderr) + sys.exit(1) diff --git a/src/keras_retinanet/utils/transform.py b/src/keras_retinanet/utils/transform.py new file mode 100644 index 0000000..4c6afe6 --- /dev/null +++ b/src/keras_retinanet/utils/transform.py @@ -0,0 +1,289 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np + +DEFAULT_PRNG = np.random + + +def colvec(*args): + """ Create a numpy array representing a column vector. """ + return np.array([args]).T + + +def transform_aabb(transform, aabb): + """ Apply a transformation to an axis aligned bounding box. + + The result is a new AABB in the same coordinate system as the original AABB. + The new AABB contains all corner points of the original AABB after applying the given transformation. + + Args + transform: The transformation to apply. + x1: The minimum x value of the AABB. + y1: The minimum y value of the AABB. + x2: The maximum x value of the AABB. + y2: The maximum y value of the AABB. + Returns + The new AABB as tuple (x1, y1, x2, y2) + """ + x1, y1, x2, y2 = aabb + # Transform all 4 corners of the AABB. + points = transform.dot([ + [x1, x2, x1, x2], + [y1, y2, y2, y1], + [1, 1, 1, 1 ], + ]) + + # Extract the min and max corners again. + min_corner = points.min(axis=1) + max_corner = points.max(axis=1) + + return [min_corner[0], min_corner[1], max_corner[0], max_corner[1]] + + +def _random_vector(min, max, prng=DEFAULT_PRNG): + """ Construct a random vector between min and max. 
+    Args
+        min: the minimum value for each component
+        max: the maximum value for each component
+    """
+    min = np.array(min)
+    max = np.array(max)
+    assert min.shape == max.shape
+    assert len(min.shape) == 1
+    return prng.uniform(min, max)
+
+
+def rotation(angle):
+    """ Construct a homogeneous 2D rotation matrix.
+    Args
+        angle: the angle in radians
+    Returns
+        the rotation matrix as 3 by 3 numpy array
+    """
+    return np.array([
+        [np.cos(angle), -np.sin(angle), 0],
+        [np.sin(angle), np.cos(angle), 0],
+        [0, 0, 1]
+    ])
+
+
+def random_rotation(min, max, prng=DEFAULT_PRNG):
+    """ Construct a random rotation between min and max.
+    Args
+        min: a scalar for the minimum absolute angle in radians
+        max: a scalar for the maximum absolute angle in radians
+        prng: the pseudo-random number generator to use.
+    Returns
+        a homogeneous 3 by 3 rotation matrix
+    """
+    return rotation(prng.uniform(min, max))
+
+
+def translation(translation):
+    """ Construct a homogeneous 2D translation matrix.
+    Args
+        translation: the translation 2D vector
+    Returns
+        the translation matrix as 3 by 3 numpy array
+    """
+    return np.array([
+        [1, 0, translation[0]],
+        [0, 1, translation[1]],
+        [0, 0, 1]
+    ])
+
+
+def random_translation(min, max, prng=DEFAULT_PRNG):
+    """ Construct a random 2D translation between min and max.
+    Args
+        min: a 2D vector with the minimum translation for each dimension
+        max: a 2D vector with the maximum translation for each dimension
+        prng: the pseudo-random number generator to use.
+    Returns
+        a homogeneous 3 by 3 translation matrix
+    """
+    return translation(_random_vector(min, max, prng))
+
+
+def shear(angle):
+    """ Construct a homogeneous 2D shear matrix.
+    Args
+        angle: the shear angle in radians
+    Returns
+        the shear matrix as 3 by 3 numpy array
+    """
+    return np.array([
+        [1, -np.sin(angle), 0],
+        [0, np.cos(angle), 0],
+        [0, 0, 1]
+    ])
+
+
+def random_shear(min, max, prng=DEFAULT_PRNG):
+    """ Construct a random 2D shear matrix with shear angle between min and max.
+    Args
+        min: the minimum shear angle in radians.
+        max: the maximum shear angle in radians.
+        prng: the pseudo-random number generator to use.
+    Returns
+        a homogeneous 3 by 3 shear matrix
+    """
+    return shear(prng.uniform(min, max))
+
+
+def scaling(factor):
+    """ Construct a homogeneous 2D scaling matrix.
+    Args
+        factor: a 2D vector for X and Y scaling
+    Returns
+        the scaling matrix as 3 by 3 numpy array
+    """
+    return np.array([
+        [factor[0], 0, 0],
+        [0, factor[1], 0],
+        [0, 0, 1]
+    ])
+
+
+def random_scaling(min, max, prng=DEFAULT_PRNG):
+    """ Construct a random 2D scale matrix between min and max.
+    Args
+        min: a 2D vector containing the minimum scaling factor for X and Y.
+        max: a 2D vector containing the maximum scaling factor for X and Y.
+        prng: the pseudo-random number generator to use.
+    Returns
+        a homogeneous 3 by 3 scaling matrix
+    """
+    return scaling(_random_vector(min, max, prng))
+
+
+def random_flip(flip_x_chance, flip_y_chance, prng=DEFAULT_PRNG):
+    """ Construct a transformation randomly containing X/Y flips (or not).
+    Args
+        flip_x_chance: The chance that the result will contain a flip along the X axis.
+        flip_y_chance: The chance that the result will contain a flip along the Y axis.
+        prng: The pseudo-random number generator to use.
+    Returns
+        a homogeneous 3 by 3 transformation matrix
+    """
+    flip_x = prng.uniform(0, 1) < flip_x_chance
+    flip_y = prng.uniform(0, 1) < flip_y_chance
+    # 1 - 2 * bool gives 1 for False and -1 for True.
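+    # e.g. flip_x=True, flip_y=False yields scaling((-1, 1)): a mirror along the X axis, identity along Y.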
+    return scaling((1 - 2 * flip_x, 1 - 2 * flip_y))
+
+
+def change_transform_origin(transform, center):
+    """ Create a new transform representing the same transformation,
+        only with the origin of the linear part changed.
+    Args
+        transform: the transformation matrix
+        center: the new origin of the transformation
+    Returns
+        translate(center) * transform * translate(-center)
+    """
+    center = np.array(center)
+    return np.linalg.multi_dot([translation(center), transform, translation(-center)])
+
+
+def random_transform(
+    min_rotation=0,
+    max_rotation=0,
+    min_translation=(0, 0),
+    max_translation=(0, 0),
+    min_shear=0,
+    max_shear=0,
+    min_scaling=(1, 1),
+    max_scaling=(1, 1),
+    flip_x_chance=0,
+    flip_y_chance=0,
+    prng=DEFAULT_PRNG
+):
+    """ Create a random transformation.
+
+    The transformation consists of the following operations in this order (from left to right):
+      * rotation
+      * translation
+      * shear
+      * scaling
+      * flip x (if applied)
+      * flip y (if applied)
+
+    Note that by default, the data generators in `keras_retinanet.preprocessing.generators` interpret the translation
+    as a factor of the image size. So an X translation of 0.1 would translate the image by 10% of its width.
+    Set `relative_translation` to `False` in the `TransformParameters` of a data generator to have it interpret
+    the translation directly as pixel distances instead.
+
+    Args
+        min_rotation: The minimum rotation in radians for the transform as scalar.
+        max_rotation: The maximum rotation in radians for the transform as scalar.
+        min_translation: The minimum translation for the transform as 2D column vector.
+        max_translation: The maximum translation for the transform as 2D column vector.
+        min_shear: The minimum shear angle for the transform in radians.
+        max_shear: The maximum shear angle for the transform in radians.
+        min_scaling: The minimum scaling for the transform as 2D column vector.
+        max_scaling: The maximum scaling for the transform as 2D column vector.
+        flip_x_chance: The chance (0 to 1) that a transform will contain a flip along X direction.
+        flip_y_chance: The chance (0 to 1) that a transform will contain a flip along Y direction.
+        prng: The pseudo-random number generator to use.
+    """
+    return np.linalg.multi_dot([
+        random_rotation(min_rotation, max_rotation, prng),
+        random_translation(min_translation, max_translation, prng),
+        random_shear(min_shear, max_shear, prng),
+        random_scaling(min_scaling, max_scaling, prng),
+        random_flip(flip_x_chance, flip_y_chance, prng)
+    ])
+
+
+def random_transform_generator(prng=None, **kwargs):
+    """ Create a random transform generator.
+
+    Uses a dedicated, newly created, properly seeded PRNG by default instead of the global DEFAULT_PRNG.
+
+    The transformation consists of the following operations in this order (from left to right):
+      * rotation
+      * translation
+      * shear
+      * scaling
+      * flip x (if applied)
+      * flip y (if applied)
+
+    Note that by default, the data generators in `keras_retinanet.preprocessing.generators` interpret the translation
+    as a factor of the image size. So an X translation of 0.1 would translate the image by 10% of its width.
+    Set `relative_translation` to `False` in the `TransformParameters` of a data generator to have it interpret
+    the translation directly as pixel distances instead.
+
+    Args
+        min_rotation: The minimum rotation in radians for the transform as scalar.
+        max_rotation: The maximum rotation in radians for the transform as scalar.
+        min_translation: The minimum translation for the transform as 2D column vector.
+ max_translation: The maximum translation for the transform as 2D column vector. + min_shear: The minimum shear angle for the transform in radians. + max_shear: The maximum shear angle for the transform in radians. + min_scaling: The minimum scaling for the transform as 2D column vector. + max_scaling: The maximum scaling for the transform as 2D column vector. + flip_x_chance: The chance (0 to 1) that a transform will contain a flip along X direction. + flip_y_chance: The chance (0 to 1) that a transform will contain a flip along Y direction. + prng: The pseudo-random number generator to use. + """ + + if prng is None: + # RandomState automatically seeds using the best available method. + prng = np.random.RandomState() + + while True: + yield random_transform(prng=prng, **kwargs) diff --git a/src/keras_retinanet/utils/visualization.py b/src/keras_retinanet/utils/visualization.py new file mode 100644 index 0000000..c551043 --- /dev/null +++ b/src/keras_retinanet/utils/visualization.py @@ -0,0 +1,106 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import cv2 +import numpy as np + +from .colors import label_color + + +def draw_box(image, box, color, thickness=2): + """ Draws a box on an image with a given color. + + # Arguments + image : The image to draw on. + box : A list of 4 elements (x1, y1, x2, y2). + color : The color of the box. + thickness : The thickness of the lines to draw a box with. + """ + b = np.array(box).astype(int) + cv2.rectangle(image, (b[0], b[1]), (b[2], b[3]), color, thickness, cv2.LINE_AA) + + +def draw_caption(image, box, caption): + """ Draws a caption above the box in an image. + + # Arguments + image : The image to draw on. + box : A list of 4 elements (x1, y1, x2, y2). + caption : String containing the text to draw. + """ + b = np.array(box).astype(int) + # cv2.putText(image, caption, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN, 1, (0, 0, 0), 2) + # cv2.putText(image, caption, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1) + + +def draw_boxes(image, boxes, color, thickness=2): + """ Draws boxes on an image with a given color. + + # Arguments + image : The image to draw on. + boxes : A [N, 4] matrix (x1, y1, x2, y2). + color : The color of the boxes. + thickness : The thickness of the lines to draw boxes with. + """ + for b in boxes: + draw_box(image, b, color, thickness=thickness) + + +def draw_detections(image, boxes, scores, labels, color=None, label_to_name=None, score_threshold=0.5): + """ Draws detections in an image. + + # Arguments + image : The image to draw on. + boxes : A [N, 4] matrix (x1, y1, x2, y2). + scores : A list of N classification scores. + labels : A list of N labels. + color : The color of the boxes. By default the color from keras_retinanet.utils.colors.label_color will be used. + label_to_name : (optional) Functor for mapping a label to a name. + score_threshold : Threshold used for determining what detections to draw. 
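+
+    # Example
+        # hypothetical arrays: boxes/scores/labels as returned by model.predict_on_batch
+        draw_detections(image, boxes[0], scores[0], labels[0], score_threshold=0.5)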
+ """ + selection = np.where(scores > score_threshold)[0] + + for i in selection: + c = color if color is not None else label_color(labels[i]) + draw_box(image, boxes[i, :], color=c) + + # draw labels + caption = (label_to_name(labels[i]) if label_to_name else labels[i]) + ': {0:.2f}'.format(scores[i]) + draw_caption(image, boxes[i, :], caption) + + +def draw_annotations(image, annotations, color=(0, 255, 0), label_to_name=None): + """ Draws annotations in an image. + + # Arguments + image : The image to draw on. + annotations : A [N, 5] matrix (x1, y1, x2, y2, label) or dictionary containing bboxes (shaped [N, 4]) and labels (shaped [N]). + color : The color of the boxes. By default the color from keras_retinanet.utils.colors.label_color will be used. + label_to_name : (optional) Functor for mapping a label to a name. + """ + if isinstance(annotations, np.ndarray): + annotations = {'bboxes': annotations[:, :4], 'labels': annotations[:, 4]} + + assert('bboxes' in annotations) + assert('labels' in annotations) + assert(annotations['bboxes'].shape[0] == annotations['labels'].shape[0]) + + for i in range(annotations['bboxes'].shape[0]): + label = annotations['labels'][i] + c = color if color is not None else label_color(label) + caption = '{}'.format(label_to_name(label) if label_to_name else label) + draw_caption(image, annotations['bboxes'][i], caption) + draw_box(image, annotations['bboxes'][i], color=c)
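
Below is a minimal usage sketch (not part of the diff) showing how the vendored utilities introduced above could fit together. It assumes the package resolves on the import path as `keras_retinanet` (as it does for `src/NeuralNetwork.py`), that the Keras version check lives in `keras_retinanet/utils/keras_version.py` as in the upstream layout, and that `field.png` and the box coordinates are placeholder inputs.

```python
import cv2

from keras_retinanet.utils.keras_version import check_keras_version
from keras_retinanet.utils.tf_version import check_tf_version
from keras_retinanet.utils.transform import (
    change_transform_origin,
    random_transform_generator,
    transform_aabb,
)
from keras_retinanet.utils.visualization import draw_box

# Fail fast if Keras/TensorFlow are older than the required minimums (or blacklisted).
check_keras_version()
check_tf_version()

image = cv2.imread("field.png")   # placeholder test image
box = (50, 80, 200, 260)          # placeholder (x1, y1, x2, y2)
cx = (box[0] + box[2]) / 2
cy = (box[1] + box[3]) / 2

# Yields random 3x3 homogeneous transforms: small rotations plus optional X flips.
generator = random_transform_generator(
    min_rotation=-0.1,
    max_rotation=0.1,
    flip_x_chance=0.5,
)

for _ in range(3):
    # Re-centre each transform on the box so it rotates/flips about its own centre.
    transform = change_transform_origin(next(generator), (cx, cy))
    draw_box(image, transform_aabb(transform, box), color=(0, 255, 0))

cv2.imwrite("field_boxes.png", image)
```

Rotations are applied about the box centre via `change_transform_origin`, so the transformed AABB stays near the original box instead of swinging around the image origin.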