diff --git a/src/GeoLocalizator.py b/src/GeoLocalizator.py
new file mode 100644
index 0000000..88f0ca1
--- /dev/null
+++ b/src/GeoLocalizator.py
@@ -0,0 +1,30 @@
+from typing import List, Tuple
+
+
+class GeoLocalizator:
+    def geo_localize(self, trees: List, coordenates: Tuple) -> List:
+        """
+        Compute the GPS coordinates of every Tree object in the list.
+
+        :param List trees: The list of trees to operate on.
+        :param Tuple coordenates: The coordinates of the target image.
+        :return: The list of all trees with their coordinates calculated.
+        """
+        self._coordenates = coordenates
+
+        for tree in trees:
+            x, y = self._calculate_coordenates(tree)
+            tree.set_coordenates(x, y)
+
+        return trees
+
+    def _calculate_coordenates(self, tree: "Tree") -> Tuple:
+        """
+        Calculate the GPS coordinates of a single Tree.
+
+        :param Tree tree: The Tree to work with.
+        :return: The tuple of GPS coordinates representing the tree's center. Currently a placeholder that returns (None, None).
+        """
+        x = None
+        y = None
+        return x, y
diff --git a/src/NeuralNetwork.py b/src/NeuralNetwork.py
new file mode 100644
index 0000000..91bfb89
--- /dev/null
+++ b/src/NeuralNetwork.py
@@ -0,0 +1,68 @@
+from typing import List
+
+import numpy as np
+import numpy.typing as npt
+
+from keras_retinanet import models
+from keras_retinanet.utils.gpu import setup_gpu
+from keras_retinanet.utils.image import preprocess_image, resize_image
+
+from .Tree import Tree
+
+
+class NeuralNetwork:
+    def __init__(self, path_to_model: str, score: float = 0.5):
+        """
+        Neural network that uses RetinaNet to detect trees.
+
+        :param str path_to_model: The path where the model is stored.
+        :param float score: Confidence threshold for keeping detections. 0.5 by default.
+        """
+
+        # select GPU
+        gpu = 0
+        setup_gpu(gpu)
+
+        # Load RetinaNet model
+        self._model = models.load_model(path_to_model, backbone_name="resnet50")
+
+        # Set confidence threshold
+        self._score = score
+
+    def detect_trees(self, img: npt.ArrayLike, row: int, col: int) -> List:
+        """
+        Detect trees in a sub-image.
+
+        :param npt.ArrayLike img: The sub-image to run detection on. Must be 400x400x3.
+        :param int row: Row offset of the sub-image within the full image.
+        :param int col: Column offset of the sub-image within the full image.
+        :return: List of all detected trees.
+        """
+
+        # Preprocess image
+        image = preprocess_image(img)
+        image, scale = resize_image(image)
+
+        # Predict trees
+        boxes, scores, labels = self._model.predict_on_batch(
+            np.expand_dims(image, axis=0)
+        )
+
+        # Scale boxes back to the original image size
+        boxes /= scale
+
+        # Generate Tree objects
+        trees = []
+        for box, score, _ in zip(boxes[0], scores[0], labels[0]):
+            # Detections are sorted by descending score, so stop at the first one below the threshold
+            if score < self._score:
+                break
+
+            box = box.astype(int)
+
+            x1, y1, x2, y2 = box
+            width = x2 - x1
+            height = y2 - y1
+            trees.append(Tree(y1 + row, x1 + col, width, height))
+
+        return trees
\ No newline at end of file
diff --git a/src/Tree.py b/src/Tree.py
new file mode 100644
index 0000000..a0deb37
--- /dev/null
+++ b/src/Tree.py
@@ -0,0 +1,18 @@
+class Tree:
+    def __init__(self, row: int, col: int, width: int, height: int):
+        self._row = row
+        self._col = col
+        self._width = width
+        self._height = height
+        self._coordx = None
+        self._coordy = None
+
+    def set_coordenates(self, x: float, y: float):
+        """
+        Set the GPS coordinates of this Tree.
+
+        :param float x: x coordinate (West-East).
+        :param float y: y coordinate (North-South).
+        """
+        self._coordx = x
+        self._coordy = y
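The three new modules above are meant to compose: NeuralNetwork.detect_trees returns Tree objects positioned by the row/col offsets of the crop they were found in, and GeoLocalizator.geo_localize then attaches GPS coordinates to each of them. The sketch below is illustrative only and not part of the patch; the model path, image file, coordinate tuple and import paths are placeholder assumptions (they assume the src package is importable and that a converted RetinaNet inference model is available).

import cv2 as cv

from src.GeoLocalizator import GeoLocalizator
from src.NeuralNetwork import NeuralNetwork

# Placeholder inputs: any converted RetinaNet inference model and any aerial image.
nn = NeuralNetwork("snapshots/resnet50_trees_inference.h5", score=0.5)
image = cv.imread("orthophoto.png")

# Detect trees in the top-left 400x400 crop; row/col are the crop's offsets in the full image.
trees = nn.detect_trees(image[0:400, 0:400], row=0, col=0)

# Attach GPS coordinates to every detected Tree (the pixel-to-GPS mapping itself is still a stub).
trees = GeoLocalizator().geo_localize(trees, coordenates=(40.0, -3.7))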
+ """ + self._coordx = x + self._coordy = y diff --git a/src/TreeDetector.py b/src/TreeDetector.py index 41d4deb..f0e2f0d 100644 --- a/src/TreeDetector.py +++ b/src/TreeDetector.py @@ -1,12 +1,43 @@ +from typing import List, Tuple + import cv2 as cv +import numpy as np +import numpy.typing as npt class TreeDetector: - def __init__(self, img): - self.__img = cv.imread(img) + def __init__(self, nn): + """ + TreeDetector constructor. + + :param NeuralNetwork nn: the neural network witch we will used. + """ + self._img = None + self._coordenates = None + self._nn = nn # Neural Network + self._trees = [] + + def recognize(self, img: npt.ArrayLike, coordenates: Tuple) -> List: + """ + Recognize trees in image. - def recognize(self): - return "done" + :param npt.ArrayLike img: The target image. + :param tuple coordenates: The coordenates of the image. + :return: the list with all detected trees. + """ + self._img = img + self._slide() + return self._trees - def slide(self): - return 400 \ No newline at end of file + def _slide(self): + """ + Iterates around the image and calls NN to detect trees in sub-image. + """ + STEP = 400 + cols, rows = self._img.shape[:-1] + for col in range(0, cols - STEP, STEP): + for row in range(0, rows - STEP, STEP): + trees = self._nn.detect_trees( + self._img[row : row + STEP, col : col + STEP], row, col + ) + self._trees.append(trees) diff --git a/src/TreePainter.py b/src/TreePainter.py new file mode 100644 index 0000000..cf124ae --- /dev/null +++ b/src/TreePainter.py @@ -0,0 +1,17 @@ +from typing import List + +import cv2 as cv +import numpy.typing as npt + + +class TreePainter: + def draw(self, canvas: npt.ArrayLike, trees: List): + """ + Draw all trees in image. + + :param npt.ArrayLike canvas: The image witch will be used as canvas. + :param List trees: The list off all trees. + """ + + for tree in trees: + continue diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/keras_retinanet/__init__.py b/src/keras_retinanet/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/keras_retinanet/backend/__init__.py b/src/keras_retinanet/backend/__init__.py new file mode 100644 index 0000000..4bace69 --- /dev/null +++ b/src/keras_retinanet/backend/__init__.py @@ -0,0 +1,2 @@ +from .dynamic import * # noqa: F401,F403 +from .common import * # noqa: F401,F403 diff --git a/src/keras_retinanet/backend/cntk_backend.py b/src/keras_retinanet/backend/cntk_backend.py new file mode 100644 index 0000000..70aae54 --- /dev/null +++ b/src/keras_retinanet/backend/cntk_backend.py @@ -0,0 +1,15 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" diff --git a/src/keras_retinanet/backend/common.py b/src/keras_retinanet/backend/common.py new file mode 100644 index 0000000..8f8dcc6 --- /dev/null +++ b/src/keras_retinanet/backend/common.py @@ -0,0 +1,85 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras.backend +from .dynamic import meshgrid + + +def bbox_transform_inv(boxes, deltas, mean=None, std=None): + """ Applies deltas (usually regression results) to boxes (usually anchors). + + Before applying the deltas to the boxes, the normalization that was previously applied (in the generator) has to be removed. + The mean and std are the mean and std as applied in the generator. They are unnormalized in this function and then applied to the boxes. + + Args + boxes : np.array of shape (B, N, 4), where B is the batch size, N the number of boxes and 4 values for (x1, y1, x2, y2). + deltas: np.array of same shape as boxes. These deltas (d_x1, d_y1, d_x2, d_y2) are a factor of the width/height. + mean : The mean value used when computing deltas (defaults to [0, 0, 0, 0]). + std : The standard deviation used when computing deltas (defaults to [0.2, 0.2, 0.2, 0.2]). + + Returns + A np.array of the same shape as boxes, but with deltas applied to each box. + The mean and std are used during training to normalize the regression values (networks love normalization). + """ + if mean is None: + mean = [0, 0, 0, 0] + if std is None: + std = [0.2, 0.2, 0.2, 0.2] + + width = boxes[:, :, 2] - boxes[:, :, 0] + height = boxes[:, :, 3] - boxes[:, :, 1] + + x1 = boxes[:, :, 0] + (deltas[:, :, 0] * std[0] + mean[0]) * width + y1 = boxes[:, :, 1] + (deltas[:, :, 1] * std[1] + mean[1]) * height + x2 = boxes[:, :, 2] + (deltas[:, :, 2] * std[2] + mean[2]) * width + y2 = boxes[:, :, 3] + (deltas[:, :, 3] * std[3] + mean[3]) * height + + pred_boxes = keras.backend.stack([x1, y1, x2, y2], axis=2) + + return pred_boxes + + +def shift(shape, stride, anchors): + """ Produce shifted anchors based on shape of the map and stride size. + + Args + shape : Shape to shift the anchors over. + stride : Stride to shift the anchors with over the shape. + anchors: The anchors to apply at each location. 
+ """ + shift_x = (keras.backend.arange(0, shape[1], dtype=keras.backend.floatx()) + keras.backend.constant(0.5, dtype=keras.backend.floatx())) * stride + shift_y = (keras.backend.arange(0, shape[0], dtype=keras.backend.floatx()) + keras.backend.constant(0.5, dtype=keras.backend.floatx())) * stride + + shift_x, shift_y = meshgrid(shift_x, shift_y) + shift_x = keras.backend.reshape(shift_x, [-1]) + shift_y = keras.backend.reshape(shift_y, [-1]) + + shifts = keras.backend.stack([ + shift_x, + shift_y, + shift_x, + shift_y + ], axis=0) + + shifts = keras.backend.transpose(shifts) + number_of_anchors = keras.backend.shape(anchors)[0] + + k = keras.backend.shape(shifts)[0] # number of base points = feat_h * feat_w + + shifted_anchors = keras.backend.reshape(anchors, [1, number_of_anchors, 4]) + keras.backend.cast(keras.backend.reshape(shifts, [k, 1, 4]), keras.backend.floatx()) + shifted_anchors = keras.backend.reshape(shifted_anchors, [k * number_of_anchors, 4]) + + return shifted_anchors diff --git a/src/keras_retinanet/backend/dynamic.py b/src/keras_retinanet/backend/dynamic.py new file mode 100644 index 0000000..361b685 --- /dev/null +++ b/src/keras_retinanet/backend/dynamic.py @@ -0,0 +1,25 @@ +import os + +_BACKEND = "tensorflow" + +if "KERAS_BACKEND" in os.environ: + _backend = os.environ["KERAS_BACKEND"] + + backends = { + "cntk", + "tensorflow", + "theano" + } + + assert _backend in backends + + _BACKEND = _backend + +if _BACKEND == "cntk": + from .cntk_backend import * # noqa: F401,F403 +elif _BACKEND == "theano": + from .theano_backend import * # noqa: F401,F403 +elif _BACKEND == "tensorflow": + from .tensorflow_backend import * # noqa: F401,F403 +else: + raise ValueError("Unknown backend: " + str(_BACKEND)) diff --git a/src/keras_retinanet/backend/tensorflow_backend.py b/src/keras_retinanet/backend/tensorflow_backend.py new file mode 100644 index 0000000..a41ac80 --- /dev/null +++ b/src/keras_retinanet/backend/tensorflow_backend.py @@ -0,0 +1,110 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import tensorflow + + +def ones(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/ones . + """ + return tensorflow.ones(*args, **kwargs) + + +def transpose(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/transpose . + """ + return tensorflow.transpose(*args, **kwargs) + + +def map_fn(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/map_fn . + """ + return tensorflow.map_fn(*args, **kwargs) + + +def pad(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/pad . + """ + return tensorflow.pad(*args, **kwargs) + + +def top_k(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/nn/top_k . + """ + return tensorflow.nn.top_k(*args, **kwargs) + + +def clip_by_value(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/clip_by_value . 
+ """ + return tensorflow.clip_by_value(*args, **kwargs) + + +def resize_images(images, size, method='bilinear', align_corners=False): + """ See https://www.tensorflow.org/versions/r1.14/api_docs/python/tf/image/resize_images . + + Args + method: The method used for interpolation. One of ('bilinear', 'nearest', 'bicubic', 'area'). + """ + methods = { + 'bilinear': tensorflow.image.ResizeMethod.BILINEAR, + 'nearest' : tensorflow.image.ResizeMethod.NEAREST_NEIGHBOR, + 'bicubic' : tensorflow.image.ResizeMethod.BICUBIC, + 'area' : tensorflow.image.ResizeMethod.AREA, + } + return tensorflow.compat.v1.image.resize_images(images, size, methods[method], align_corners) + + +def non_max_suppression(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/image/non_max_suppression . + """ + return tensorflow.image.non_max_suppression(*args, **kwargs) + + +def range(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/range . + """ + return tensorflow.range(*args, **kwargs) + + +def scatter_nd(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/scatter_nd . + """ + return tensorflow.scatter_nd(*args, **kwargs) + + +def gather_nd(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/gather_nd . + """ + return tensorflow.gather_nd(*args, **kwargs) + + +def meshgrid(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/meshgrid . + """ + return tensorflow.meshgrid(*args, **kwargs) + + +def where(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/where . + """ + return tensorflow.where(*args, **kwargs) + + +def unstack(*args, **kwargs): + """ See https://www.tensorflow.org/api_docs/python/tf/unstack . + """ + return tensorflow.unstack(*args, **kwargs) diff --git a/src/keras_retinanet/backend/theano_backend.py b/src/keras_retinanet/backend/theano_backend.py new file mode 100644 index 0000000..70aae54 --- /dev/null +++ b/src/keras_retinanet/backend/theano_backend.py @@ -0,0 +1,15 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" diff --git a/src/keras_retinanet/bin/__init__.py b/src/keras_retinanet/bin/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/keras_retinanet/bin/convert_model.py b/src/keras_retinanet/bin/convert_model.py new file mode 100644 index 0000000..4ae4cfd --- /dev/null +++ b/src/keras_retinanet/bin/convert_model.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python + +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +""" + +import argparse +import os +import sys + +# Allow relative imports when being executed as script. +if __name__ == "__main__" and __package__ is None: + sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) + import keras_retinanet.bin # noqa: F401 + __package__ = "keras_retinanet.bin" + +# Change these to absolute imports if you copy this script outside the keras_retinanet package. +from .. import models +from ..utils.config import read_config_file, parse_anchor_parameters +from ..utils.gpu import setup_gpu +from ..utils.keras_version import check_keras_version +from ..utils.tf_version import check_tf_version + + +def parse_args(args): + parser = argparse.ArgumentParser(description='Script for converting a training model to an inference model.') + + parser.add_argument('model_in', help='The model to convert.') + parser.add_argument('model_out', help='Path to save the converted model to.') + parser.add_argument('--backbone', help='The backbone of the model to convert.', default='resnet50') + parser.add_argument('--no-nms', help='Disables non maximum suppression.', dest='nms', action='store_false') + parser.add_argument('--no-class-specific-filter', help='Disables class specific filtering.', dest='class_specific_filter', action='store_false') + parser.add_argument('--config', help='Path to a configuration parameters .ini file.') + parser.add_argument('--nms-threshold', help='Value for non maximum suppression threshold.', type=float, default=0.5) + parser.add_argument('--score-threshold', help='Threshold for prefiltering boxes.', type=float, default=0.05) + parser.add_argument('--max-detections', help='Maximum number of detections to keep.', type=int, default=300) + parser.add_argument('--parallel-iterations', help='Number of batch items to process in parallel.', type=int, default=32) + + return parser.parse_args(args) + + +def main(args=None): + # parse arguments + if args is None: + args = sys.argv[1:] + args = parse_args(args) + + # make sure keras and tensorflow are the minimum required version + check_keras_version() + check_tf_version() + + # set modified tf session to avoid using the GPUs + setup_gpu('cpu') + + # optionally load config parameters + anchor_parameters = None + if args.config: + args.config = read_config_file(args.config) + if 'anchor_parameters' in args.config: + anchor_parameters = parse_anchor_parameters(args.config) + + # load the model + model = models.load_model(args.model_in, backbone_name=args.backbone) + + # check if this is indeed a training model + models.check_training_model(model) + + # convert the model + model = models.convert_model( + model, + nms=args.nms, + class_specific_filter=args.class_specific_filter, + anchor_params=anchor_parameters, + nms_threshold=args.nms_threshold, + score_threshold=args.score_threshold, + max_detections=args.max_detections, + parallel_iterations=args.parallel_iterations + ) + + # save model + model.save(args.model_out) + + +if __name__ == '__main__': + main() diff --git a/src/keras_retinanet/bin/debug.py b/src/keras_retinanet/bin/debug.py new file mode 100644 index 0000000..32d431c --- /dev/null +++ b/src/keras_retinanet/bin/debug.py @@ -0,0 +1,322 @@ +#!/usr/bin/env python + +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import argparse +import os +import sys +import cv2 + +# Set keycodes for changing images +# 81, 83 are left and right arrows on linux in Ascii code (probably not needed) +# 65361, 65363 are left and right arrows in linux +# 2424832, 2555904 are left and right arrows on Windows +# 110, 109 are 'n' and 'm' on mac, windows, linux +# (unfortunately arrow keys not picked up on mac) +leftkeys = (81, 110, 65361, 2424832) +rightkeys = (83, 109, 65363, 2555904) + +# Allow relative imports when being executed as script. +if __name__ == "__main__" and __package__ is None: + sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) + import keras_retinanet.bin # noqa: F401 + __package__ = "keras_retinanet.bin" + +# Change these to absolute imports if you copy this script outside the keras_retinanet package. +from ..preprocessing.pascal_voc import PascalVocGenerator +from ..preprocessing.csv_generator import CSVGenerator +from ..preprocessing.kitti import KittiGenerator +from ..preprocessing.open_images import OpenImagesGenerator +from ..utils.anchors import anchors_for_shape, compute_gt_annotations +from ..utils.config import read_config_file, parse_anchor_parameters +from ..utils.image import random_visual_effect_generator +from ..utils.keras_version import check_keras_version +from ..utils.tf_version import check_tf_version +from ..utils.transform import random_transform_generator +from ..utils.visualization import draw_annotations, draw_boxes, draw_caption + + +def create_generator(args): + """ Create the data generators. + + Args: + args: parseargs arguments object. 
+ """ + # create random transform generator for augmenting training data + transform_generator = random_transform_generator( + min_rotation=-0.1, + max_rotation=0.1, + min_translation=(-0.1, -0.1), + max_translation=(0.1, 0.1), + min_shear=-0.1, + max_shear=0.1, + min_scaling=(0.9, 0.9), + max_scaling=(1.1, 1.1), + flip_x_chance=0.5, + flip_y_chance=0.5, + ) + + visual_effect_generator = random_visual_effect_generator( + contrast_range=(0.9, 1.1), + brightness_range=(-.1, .1), + hue_range=(-0.05, 0.05), + saturation_range=(0.95, 1.05) + ) + + if args.dataset_type == 'coco': + # import here to prevent unnecessary dependency on cocoapi + from ..preprocessing.coco import CocoGenerator + + generator = CocoGenerator( + args.coco_path, + args.coco_set, + transform_generator=transform_generator, + visual_effect_generator=visual_effect_generator, + image_min_side=args.image_min_side, + image_max_side=args.image_max_side, + config=args.config + ) + elif args.dataset_type == 'pascal': + generator = PascalVocGenerator( + args.pascal_path, + args.pascal_set, + image_extension=args.image_extension, + transform_generator=transform_generator, + visual_effect_generator=visual_effect_generator, + image_min_side=args.image_min_side, + image_max_side=args.image_max_side, + config=args.config + ) + elif args.dataset_type == 'csv': + generator = CSVGenerator( + args.annotations, + args.classes, + transform_generator=transform_generator, + visual_effect_generator=visual_effect_generator, + image_min_side=args.image_min_side, + image_max_side=args.image_max_side, + config=args.config + ) + elif args.dataset_type == 'oid': + generator = OpenImagesGenerator( + args.main_dir, + subset=args.subset, + version=args.version, + labels_filter=args.labels_filter, + parent_label=args.parent_label, + annotation_cache_dir=args.annotation_cache_dir, + transform_generator=transform_generator, + visual_effect_generator=visual_effect_generator, + image_min_side=args.image_min_side, + image_max_side=args.image_max_side, + config=args.config + ) + elif args.dataset_type == 'kitti': + generator = KittiGenerator( + args.kitti_path, + subset=args.subset, + transform_generator=transform_generator, + visual_effect_generator=visual_effect_generator, + image_min_side=args.image_min_side, + image_max_side=args.image_max_side, + config=args.config + ) + else: + raise ValueError('Invalid data type received: {}'.format(args.dataset_type)) + + return generator + + +def parse_args(args): + """ Parse the arguments. + """ + parser = argparse.ArgumentParser(description='Debug script for a RetinaNet network.') + subparsers = parser.add_subparsers(help='Arguments for specific dataset types.', dest='dataset_type') + subparsers.required = True + + coco_parser = subparsers.add_parser('coco') + coco_parser.add_argument('coco_path', help='Path to dataset directory (ie. /tmp/COCO).') + coco_parser.add_argument('--coco-set', help='Name of the set to show (defaults to val2017).', default='val2017') + + pascal_parser = subparsers.add_parser('pascal') + pascal_parser.add_argument('pascal_path', help='Path to dataset directory (ie. /tmp/VOCdevkit).') + pascal_parser.add_argument('--pascal-set', help='Name of the set to show (defaults to test).', default='test') + pascal_parser.add_argument('--image-extension', help='Declares the dataset images\' extension.', default='.jpg') + + kitti_parser = subparsers.add_parser('kitti') + kitti_parser.add_argument('kitti_path', help='Path to dataset directory (ie. 
/tmp/kitti).') + kitti_parser.add_argument('subset', help='Argument for loading a subset from train/val.') + + def csv_list(string): + return string.split(',') + + oid_parser = subparsers.add_parser('oid') + oid_parser.add_argument('main_dir', help='Path to dataset directory.') + oid_parser.add_argument('subset', help='Argument for loading a subset from train/validation/test.') + oid_parser.add_argument('--version', help='The current dataset version is v4.', default='v4') + oid_parser.add_argument('--labels-filter', help='A list of labels to filter.', type=csv_list, default=None) + oid_parser.add_argument('--annotation-cache-dir', help='Path to store annotation cache.', default='.') + oid_parser.add_argument('--parent-label', help='Use the hierarchy children of this label.', default=None) + + csv_parser = subparsers.add_parser('csv') + csv_parser.add_argument('annotations', help='Path to CSV file containing annotations for evaluation.') + csv_parser.add_argument('classes', help='Path to a CSV file containing class label mapping.') + + parser.add_argument('--no-resize', help='Disable image resizing.', dest='resize', action='store_false') + parser.add_argument('--anchors', help='Show positive anchors on the image.', action='store_true') + parser.add_argument('--display-name', help='Display image name on the bottom left corner.', action='store_true') + parser.add_argument('--annotations', help='Show annotations on the image. Green annotations have anchors, red annotations don\'t and therefore don\'t contribute to training.', action='store_true') + parser.add_argument('--random-transform', help='Randomly transform image and annotations.', action='store_true') + parser.add_argument('--image-min-side', help='Rescale the image so the smallest side is min_side.', type=int, default=800) + parser.add_argument('--image-max-side', help='Rescale the image if the largest side is larger than max_side.', type=int, default=1333) + parser.add_argument('--config', help='Path to a configuration parameters .ini file.') + parser.add_argument('--no-gui', help='Do not open a GUI window. Save images to an output directory instead.', action='store_true') + parser.add_argument('--output-dir', help='The output directory to save images to if --no-gui is specified.', default='.') + parser.add_argument('--flatten-output', help='Flatten the folder structure of saved output images into a single folder.', action='store_true') + + return parser.parse_args(args) + + +def run(generator, args, anchor_params): + """ Main loop. + + Args + generator: The generator to debug. + args: parseargs args object. 
+ """ + # display images, one at a time + i = 0 + while True: + # load the data + image = generator.load_image(i) + annotations = generator.load_annotations(i) + if len(annotations['labels']) > 0 : + # apply random transformations + if args.random_transform: + image, annotations = generator.random_transform_group_entry(image, annotations) + image, annotations = generator.random_visual_effect_group_entry(image, annotations) + + # resize the image and annotations + if args.resize: + image, image_scale = generator.resize_image(image) + annotations['bboxes'] *= image_scale + + anchors = anchors_for_shape(image.shape, anchor_params=anchor_params) + positive_indices, _, max_indices = compute_gt_annotations(anchors, annotations['bboxes']) + + # draw anchors on the image + if args.anchors: + draw_boxes(image, anchors[positive_indices], (255, 255, 0), thickness=1) + + # draw annotations on the image + if args.annotations: + # draw annotations in red + draw_annotations(image, annotations, color=(0, 0, 255), label_to_name=generator.label_to_name) + + # draw regressed anchors in green to override most red annotations + # result is that annotations without anchors are red, with anchors are green + draw_boxes(image, annotations['bboxes'][max_indices[positive_indices], :], (0, 255, 0)) + + # display name on the image + if args.display_name: + draw_caption(image, [0, image.shape[0]], os.path.basename(generator.image_path(i))) + + # write to file and advance if no-gui selected + if args.no_gui: + output_path = make_output_path(args.output_dir, generator.image_path(i), flatten=args.flatten_output) + os.makedirs(os.path.dirname(output_path), exist_ok=True) + cv2.imwrite(output_path, image) + i += 1 + if i == generator.size(): # have written all images + break + else: + continue + + # if we are using the GUI, then show an image + cv2.imshow('Image', image) + key = cv2.waitKeyEx() + + # press right for next image and left for previous (linux or windows, doesn't work for macOS) + # if you run macOS, press "n" or "m" (will also work on linux and windows) + + if key in rightkeys: + i = (i + 1) % generator.size() + if key in leftkeys: + i -= 1 + if i < 0: + i = generator.size() - 1 + + # press q or Esc to quit + if (key == ord('q')) or (key == 27): + return False + + return True + + +def make_output_path(output_dir, image_path, flatten = False): + """ Compute the output path for a debug image. """ + + # If the output hierarchy is flattened to a single folder, throw away all leading folders. + if flatten: + path = os.path.basename(image_path) + + # Otherwise, make sure absolute paths are taken relative to the filesystem root. + else: + # Make sure to drop drive letters on Windows, otherwise relpath wil fail. + _, path = os.path.splitdrive(image_path) + if os.path.isabs(path): + path = os.path.relpath(path, '/') + + # In all cases, append "_debug" to the filename, before the extension. + base, extension = os.path.splitext(path) + path = base + "_debug" + extension + + # Finally, join the whole thing to the output directory. 
+ return os.path.join(output_dir, path) + + +def main(args=None): + # parse arguments + if args is None: + args = sys.argv[1:] + args = parse_args(args) + + # make sure keras and tensorflow are the minimum required version + check_keras_version() + check_tf_version() + + # create the generator + generator = create_generator(args) + + # optionally load config parameters + if args.config: + args.config = read_config_file(args.config) + + # optionally load anchor parameters + anchor_params = None + if args.config and 'anchor_parameters' in args.config: + anchor_params = parse_anchor_parameters(args.config) + + # create the display window if necessary + if not args.no_gui: + cv2.namedWindow('Image', cv2.WINDOW_NORMAL) + + run(generator, args, anchor_params=anchor_params) + + +if __name__ == '__main__': + main() diff --git a/src/keras_retinanet/bin/evaluate.py b/src/keras_retinanet/bin/evaluate.py new file mode 100644 index 0000000..90c095c --- /dev/null +++ b/src/keras_retinanet/bin/evaluate.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python + +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import argparse +import os +import sys + +# Allow relative imports when being executed as script. +if __name__ == "__main__" and __package__ is None: + sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) + import keras_retinanet.bin # noqa: F401 + __package__ = "keras_retinanet.bin" + +# Change these to absolute imports if you copy this script outside the keras_retinanet package. +from .. import models +from ..preprocessing.csv_generator import CSVGenerator +from ..preprocessing.pascal_voc import PascalVocGenerator +from ..utils.anchors import make_shapes_callback +from ..utils.config import read_config_file, parse_anchor_parameters +from ..utils.eval import evaluate +from ..utils.gpu import setup_gpu +from ..utils.keras_version import check_keras_version +from ..utils.tf_version import check_tf_version + + +def create_generator(args, preprocess_image): + """ Create generators for evaluation. 
+ """ + common_args = { + 'preprocess_image': preprocess_image, + } + + if args.dataset_type == 'coco': + # import here to prevent unnecessary dependency on cocoapi + from ..preprocessing.coco import CocoGenerator + + validation_generator = CocoGenerator( + args.coco_path, + 'val2017', + image_min_side=args.image_min_side, + image_max_side=args.image_max_side, + config=args.config, + shuffle_groups=False, + **common_args + ) + elif args.dataset_type == 'pascal': + validation_generator = PascalVocGenerator( + args.pascal_path, + 'test', + image_extension=args.image_extension, + image_min_side=args.image_min_side, + image_max_side=args.image_max_side, + config=args.config, + shuffle_groups=False, + **common_args + ) + elif args.dataset_type == 'csv': + validation_generator = CSVGenerator( + args.annotations, + args.classes, + image_min_side=args.image_min_side, + image_max_side=args.image_max_side, + config=args.config, + shuffle_groups=False, + **common_args + ) + else: + raise ValueError('Invalid data type received: {}'.format(args.dataset_type)) + + return validation_generator + + +def parse_args(args): + """ Parse the arguments. + """ + parser = argparse.ArgumentParser(description='Evaluation script for a RetinaNet network.') + subparsers = parser.add_subparsers(help='Arguments for specific dataset types.', dest='dataset_type') + subparsers.required = True + + coco_parser = subparsers.add_parser('coco') + coco_parser.add_argument('coco_path', help='Path to dataset directory (ie. /tmp/COCO).') + + pascal_parser = subparsers.add_parser('pascal') + pascal_parser.add_argument('pascal_path', help='Path to dataset directory (ie. /tmp/VOCdevkit).') + pascal_parser.add_argument('--image-extension', help='Declares the dataset images\' extension.', default='.jpg') + + csv_parser = subparsers.add_parser('csv') + csv_parser.add_argument('annotations', help='Path to CSV file containing annotations for evaluation.') + csv_parser.add_argument('classes', help='Path to a CSV file containing class label mapping.') + + parser.add_argument('model', help='Path to RetinaNet model.') + parser.add_argument('--convert-model', help='Convert the model to an inference model (ie. 
the input is a training model).', action='store_true') + parser.add_argument('--backbone', help='The backbone of the model.', default='resnet50') + parser.add_argument('--gpu', help='Id of the GPU to use (as reported by nvidia-smi).', type=int) + parser.add_argument('--score-threshold', help='Threshold on score to filter detections with (defaults to 0.05).', default=0.05, type=float) + parser.add_argument('--iou-threshold', help='IoU Threshold to count for a positive detection (defaults to 0.5).', default=0.5, type=float) + parser.add_argument('--max-detections', help='Max Detections per image (defaults to 100).', default=100, type=int) + parser.add_argument('--save-path', help='Path for saving images with detections (doesn\'t work for COCO).') + parser.add_argument('--image-min-side', help='Rescale the image so the smallest side is min_side.', type=int, default=800) + parser.add_argument('--image-max-side', help='Rescale the image if the largest side is larger than max_side.', type=int, default=1333) + parser.add_argument('--config', help='Path to a configuration parameters .ini file (only used with --convert-model).') + + return parser.parse_args(args) + + +def main(args=None): + # parse arguments + if args is None: + args = sys.argv[1:] + args = parse_args(args) + + # make sure keras and tensorflow are the minimum required version + check_keras_version() + check_tf_version() + + # optionally choose specific GPU + if args.gpu: + setup_gpu(args.gpu) + + # make save path if it doesn't exist + if args.save_path is not None and not os.path.exists(args.save_path): + os.makedirs(args.save_path) + + # optionally load config parameters + if args.config: + args.config = read_config_file(args.config) + + # create the generator + backbone = models.backbone(args.backbone) + generator = create_generator(args, backbone.preprocess_image) + + # optionally load anchor parameters + anchor_params = None + if args.config and 'anchor_parameters' in args.config: + anchor_params = parse_anchor_parameters(args.config) + + # load the model + print('Loading model, this may take a second...') + model = models.load_model(args.model, backbone_name=args.backbone) + generator.compute_shapes = make_shapes_callback(model) + + # optionally convert the model + if args.convert_model: + model = models.convert_model(model, anchor_params=anchor_params) + + # print model summary + # print(model.summary()) + + # start evaluation + if args.dataset_type == 'coco': + from ..utils.coco_eval import evaluate_coco + evaluate_coco(generator, model, args.score_threshold) + else: + average_precisions, inference_time = evaluate( + generator, + model, + iou_threshold=args.iou_threshold, + score_threshold=args.score_threshold, + max_detections=args.max_detections, + save_path=args.save_path + ) + + # print evaluation + total_instances = [] + precisions = [] + for label, (average_precision, num_annotations) in average_precisions.items(): + print('{:.0f} instances of class'.format(num_annotations), + generator.label_to_name(label), 'with average precision: {:.4f}'.format(average_precision)) + total_instances.append(num_annotations) + precisions.append(average_precision) + + if sum(total_instances) == 0: + print('No test instances found.') + return + + print('Inference time for {:.0f} images: {:.4f}'.format(generator.size(), inference_time)) + + print('mAP using the weighted average of precisions among classes: {:.4f}'.format(sum([a * b for a, b in zip(total_instances, precisions)]) / sum(total_instances))) + print('mAP: 
{:.4f}'.format(sum(precisions) / sum(x > 0 for x in total_instances))) + + +if __name__ == '__main__': + main() diff --git a/src/keras_retinanet/bin/train.py b/src/keras_retinanet/bin/train.py new file mode 100644 index 0000000..128f01d --- /dev/null +++ b/src/keras_retinanet/bin/train.py @@ -0,0 +1,539 @@ +#!/usr/bin/env python + +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import argparse +import os +import sys +import warnings + +import keras +import keras.preprocessing.image +import tensorflow as tf + +# Allow relative imports when being executed as script. +if __name__ == "__main__" and __package__ is None: + sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) + import keras_retinanet.bin # noqa: F401 + __package__ = "keras_retinanet.bin" + +# Change these to absolute imports if you copy this script outside the keras_retinanet package. +from .. import layers # noqa: F401 +from .. import losses +from .. import models +from ..callbacks import RedirectModel +from ..callbacks.eval import Evaluate +from ..models.retinanet import retinanet_bbox +from ..preprocessing.csv_generator import CSVGenerator +from ..preprocessing.kitti import KittiGenerator +from ..preprocessing.open_images import OpenImagesGenerator +from ..preprocessing.pascal_voc import PascalVocGenerator +from ..utils.anchors import make_shapes_callback +from ..utils.config import read_config_file, parse_anchor_parameters +from ..utils.gpu import setup_gpu +from ..utils.image import random_visual_effect_generator +from ..utils.keras_version import check_keras_version +from ..utils.model import freeze as freeze_model +from ..utils.tf_version import check_tf_version +from ..utils.transform import random_transform_generator + + +def makedirs(path): + # Intended behavior: try to create the directory, + # pass if the directory exists already, fails otherwise. + # Meant for Python 2.7/3.n compatibility. + try: + os.makedirs(path) + except OSError: + if not os.path.isdir(path): + raise + + +def model_with_weights(model, weights, skip_mismatch): + """ Load weights for model. + + Args + model : The model to load weights for. + weights : The weights to load. + skip_mismatch : If True, skips layers whose shape of weights doesn't match with the model. + """ + if weights is not None: + model.load_weights(weights, by_name=True, skip_mismatch=skip_mismatch) + return model + + +def create_models(backbone_retinanet, num_classes, weights, multi_gpu=0, + freeze_backbone=False, lr=1e-5, config=None): + """ Creates three models (model, training_model, prediction_model). + + Args + backbone_retinanet : A function to call to create a retinanet model with a given backbone. + num_classes : The number of classes to train. + weights : The weights to load into the model. + multi_gpu : The number of GPUs to use for training. + freeze_backbone : If True, disables learning for the backbone. + config : Config parameters, None indicates the default configuration. + + Returns + model : The base model. 
This is also the model that is saved in snapshots. + training_model : The training model. If multi_gpu=0, this is identical to model. + prediction_model : The model wrapped with utility functions to perform object detection (applies regression values and performs NMS). + """ + + modifier = freeze_model if freeze_backbone else None + + # load anchor parameters, or pass None (so that defaults will be used) + anchor_params = None + num_anchors = None + if config and 'anchor_parameters' in config: + anchor_params = parse_anchor_parameters(config) + num_anchors = anchor_params.num_anchors() + + # Keras recommends initialising a multi-gpu model on the CPU to ease weight sharing, and to prevent OOM errors. + # optionally wrap in a parallel model + if multi_gpu > 1: + from keras.utils import multi_gpu_model + with tf.device('/cpu:0'): + model = model_with_weights(backbone_retinanet(num_classes, num_anchors=num_anchors, modifier=modifier), weights=weights, skip_mismatch=True) + training_model = multi_gpu_model(model, gpus=multi_gpu) + else: + model = model_with_weights(backbone_retinanet(num_classes, num_anchors=num_anchors, modifier=modifier), weights=weights, skip_mismatch=True) + training_model = model + + # make prediction model + prediction_model = retinanet_bbox(model=model, anchor_params=anchor_params) + + # compile model + training_model.compile( + loss={ + 'regression' : losses.smooth_l1(), + 'classification': losses.focal() + }, + optimizer=keras.optimizers.adam(lr=lr, clipnorm=0.001) + ) + + return model, training_model, prediction_model + + +def create_callbacks(model, training_model, prediction_model, validation_generator, args): + """ Creates the callbacks to use during training. + + Args + model: The base model. + training_model: The model that is used for training. + prediction_model: The model that should be used for validation. + validation_generator: The generator for creating validation data. + args: parseargs args object. + + Returns: + A list of callbacks used for training. + """ + callbacks = [] + + tensorboard_callback = None + + if args.tensorboard_dir: + makedirs(args.tensorboard_dir) + tensorboard_callback = keras.callbacks.TensorBoard( + log_dir = args.tensorboard_dir, + histogram_freq = 0, + batch_size = args.batch_size, + write_graph = True, + write_grads = False, + write_images = False, + embeddings_freq = 0, + embeddings_layer_names = None, + embeddings_metadata = None + ) + + if args.evaluation and validation_generator: + if args.dataset_type == 'coco': + from ..callbacks.coco import CocoEval + + # use prediction model for evaluation + evaluation = CocoEval(validation_generator, tensorboard=tensorboard_callback) + else: + evaluation = Evaluate(validation_generator, tensorboard=tensorboard_callback, weighted_average=args.weighted_average) + evaluation = RedirectModel(evaluation, prediction_model) + callbacks.append(evaluation) + + # save the model + if args.snapshots: + # ensure directory created first; otherwise h5py will error after epoch. 
+ makedirs(args.snapshot_path) + checkpoint = keras.callbacks.ModelCheckpoint( + os.path.join( + args.snapshot_path, + '{backbone}_{dataset_type}_{{epoch:02d}}.h5'.format(backbone=args.backbone, dataset_type=args.dataset_type) + ), + verbose=1, + # save_best_only=True, + # monitor="mAP", + # mode='max' + ) + checkpoint = RedirectModel(checkpoint, model) + callbacks.append(checkpoint) + + callbacks.append(keras.callbacks.ReduceLROnPlateau( + monitor = 'loss', + factor = args.reduce_lr_factor, + patience = args.reduce_lr_patience, + verbose = 1, + mode = 'auto', + min_delta = 0.0001, + cooldown = 0, + min_lr = 0 + )) + + callbacks.append(keras.callbacks.EarlyStopping( + monitor = 'mAP', + patience = 15, + mode = 'max', + min_delta = 0.01 + )) + + if args.tensorboard_dir: + callbacks.append(tensorboard_callback) + + return callbacks + + +def create_generators(args, preprocess_image): + """ Create generators for training and validation. + + Args + args : parseargs object containing configuration for generators. + preprocess_image : Function that preprocesses an image for the network. + """ + common_args = { + 'batch_size' : args.batch_size, + 'config' : args.config, + 'image_min_side' : args.image_min_side, + 'image_max_side' : args.image_max_side, + 'no_resize' : args.no_resize, + 'preprocess_image' : preprocess_image, + } + + # create random transform generator for augmenting training data + if args.random_transform: + transform_generator = random_transform_generator( + min_rotation=-0.1, + max_rotation=0.1, + min_translation=(-0.1, -0.1), + max_translation=(0.1, 0.1), + min_shear=-0.1, + max_shear=0.1, + min_scaling=(0.9, 0.9), + max_scaling=(1.1, 1.1), + flip_x_chance=0.5, + flip_y_chance=0.5, + ) + visual_effect_generator = random_visual_effect_generator( + contrast_range=(0.9, 1.1), + brightness_range=(-.1, .1), + hue_range=(-0.05, 0.05), + saturation_range=(0.95, 1.05) + ) + else: + transform_generator = random_transform_generator(flip_x_chance=0.5) + visual_effect_generator = None + + if args.dataset_type == 'coco': + # import here to prevent unnecessary dependency on cocoapi + from ..preprocessing.coco import CocoGenerator + + train_generator = CocoGenerator( + args.coco_path, + 'train2017', + transform_generator=transform_generator, + visual_effect_generator=visual_effect_generator, + **common_args + ) + + validation_generator = CocoGenerator( + args.coco_path, + 'val2017', + shuffle_groups=False, + **common_args + ) + elif args.dataset_type == 'pascal': + train_generator = PascalVocGenerator( + args.pascal_path, + 'train', + image_extension=args.image_extension, + transform_generator=transform_generator, + visual_effect_generator=visual_effect_generator, + **common_args + ) + + validation_generator = PascalVocGenerator( + args.pascal_path, + 'val', + image_extension=args.image_extension, + shuffle_groups=False, + **common_args + ) + elif args.dataset_type == 'csv': + train_generator = CSVGenerator( + args.annotations, + args.classes, + transform_generator=transform_generator, + visual_effect_generator=visual_effect_generator, + **common_args + ) + + if args.val_annotations: + validation_generator = CSVGenerator( + args.val_annotations, + args.classes, + shuffle_groups=False, + **common_args + ) + else: + validation_generator = None + elif args.dataset_type == 'oid': + train_generator = OpenImagesGenerator( + args.main_dir, + subset='train', + version=args.version, + labels_filter=args.labels_filter, + annotation_cache_dir=args.annotation_cache_dir, + parent_label=args.parent_label, 
+ transform_generator=transform_generator, + visual_effect_generator=visual_effect_generator, + **common_args + ) + + validation_generator = OpenImagesGenerator( + args.main_dir, + subset='validation', + version=args.version, + labels_filter=args.labels_filter, + annotation_cache_dir=args.annotation_cache_dir, + parent_label=args.parent_label, + shuffle_groups=False, + **common_args + ) + elif args.dataset_type == 'kitti': + train_generator = KittiGenerator( + args.kitti_path, + subset='train', + transform_generator=transform_generator, + visual_effect_generator=visual_effect_generator, + **common_args + ) + + validation_generator = KittiGenerator( + args.kitti_path, + subset='val', + shuffle_groups=False, + **common_args + ) + else: + raise ValueError('Invalid data type received: {}'.format(args.dataset_type)) + + return train_generator, validation_generator + + +def check_args(parsed_args): + """ Function to check for inherent contradictions within parsed arguments. + For example, batch_size < num_gpus + Intended to raise errors prior to backend initialisation. + + Args + parsed_args: parser.parse_args() + + Returns + parsed_args + """ + + if parsed_args.multi_gpu > 1 and parsed_args.batch_size < parsed_args.multi_gpu: + raise ValueError( + "Batch size ({}) must be equal to or higher than the number of GPUs ({})".format(parsed_args.batch_size, + parsed_args.multi_gpu)) + + if parsed_args.multi_gpu > 1 and parsed_args.snapshot: + raise ValueError( + "Multi GPU training ({}) and resuming from snapshots ({}) is not supported.".format(parsed_args.multi_gpu, + parsed_args.snapshot)) + + if parsed_args.multi_gpu > 1 and not parsed_args.multi_gpu_force: + raise ValueError("Multi-GPU support is experimental, use at own risk! Run with --multi-gpu-force if you wish to continue.") + + if 'resnet' not in parsed_args.backbone: + warnings.warn('Using experimental backbone {}. Only resnet50 has been properly tested.'.format(parsed_args.backbone)) + + return parsed_args + + +def parse_args(args): + """ Parse the arguments. + """ + parser = argparse.ArgumentParser(description='Simple training script for training a RetinaNet network.') + subparsers = parser.add_subparsers(help='Arguments for specific dataset types.', dest='dataset_type') + subparsers.required = True + + coco_parser = subparsers.add_parser('coco') + coco_parser.add_argument('coco_path', help='Path to dataset directory (ie. /tmp/COCO).') + + pascal_parser = subparsers.add_parser('pascal') + pascal_parser.add_argument('pascal_path', help='Path to dataset directory (ie. /tmp/VOCdevkit).') + pascal_parser.add_argument('--image-extension', help='Declares the dataset images\' extension.', default='.jpg') + + kitti_parser = subparsers.add_parser('kitti') + kitti_parser.add_argument('kitti_path', help='Path to dataset directory (ie. 
/tmp/kitti).') + + def csv_list(string): + return string.split(',') + + oid_parser = subparsers.add_parser('oid') + oid_parser.add_argument('main_dir', help='Path to dataset directory.') + oid_parser.add_argument('--version', help='The current dataset version is v4.', default='v4') + oid_parser.add_argument('--labels-filter', help='A list of labels to filter.', type=csv_list, default=None) + oid_parser.add_argument('--annotation-cache-dir', help='Path to store annotation cache.', default='.') + oid_parser.add_argument('--parent-label', help='Use the hierarchy children of this label.', default=None) + + csv_parser = subparsers.add_parser('csv') + csv_parser.add_argument('annotations', help='Path to CSV file containing annotations for training.') + csv_parser.add_argument('classes', help='Path to a CSV file containing class label mapping.') + csv_parser.add_argument('--val-annotations', help='Path to CSV file containing annotations for validation (optional).') + + group = parser.add_mutually_exclusive_group() + group.add_argument('--snapshot', help='Resume training from a snapshot.') + group.add_argument('--imagenet-weights', help='Initialize the model with pretrained imagenet weights. This is the default behaviour.', action='store_const', const=True, default=True) + group.add_argument('--weights', help='Initialize the model with weights from a file.') + group.add_argument('--no-weights', help='Don\'t initialize the model with any weights.', dest='imagenet_weights', action='store_const', const=False) + parser.add_argument('--backbone', help='Backbone model used by retinanet.', default='resnet50', type=str) + parser.add_argument('--batch-size', help='Size of the batches.', default=1, type=int) + parser.add_argument('--gpu', help='Id of the GPU to use (as reported by nvidia-smi).', type=int) + parser.add_argument('--multi-gpu', help='Number of GPUs to use for parallel processing.', type=int, default=0) + parser.add_argument('--multi-gpu-force', help='Extra flag needed to enable (experimental) multi-gpu support.', action='store_true') + parser.add_argument('--initial-epoch', help='Epoch from which to begin the train, useful if resuming from snapshot.', type=int, default=0) + parser.add_argument('--epochs', help='Number of epochs to train.', type=int, default=50) + parser.add_argument('--steps', help='Number of steps per epoch.', type=int, default=10000) + parser.add_argument('--lr', help='Learning rate.', type=float, default=1e-5) + parser.add_argument('--snapshot-path', help='Path to store snapshots of models during training (defaults to \'./snapshots\')', default='./snapshots') + parser.add_argument('--tensorboard-dir', help='Log directory for Tensorboard output', default='') # default='./logs') => https://github.com/tensorflow/tensorflow/pull/34870 + parser.add_argument('--no-snapshots', help='Disable saving snapshots.', dest='snapshots', action='store_false') + parser.add_argument('--no-evaluation', help='Disable per epoch evaluation.', dest='evaluation', action='store_false') + parser.add_argument('--freeze-backbone', help='Freeze training of backbone layers.', action='store_true') + parser.add_argument('--random-transform', help='Randomly transform image and annotations.', action='store_true') + parser.add_argument('--image-min-side', help='Rescale the image so the smallest side is min_side.', type=int, default=800) + parser.add_argument('--image-max-side', help='Rescale the image if the largest side is larger than max_side.', type=int, default=1333) + parser.add_argument('--no-resize', 
help='Don''t rescale the image.', action='store_true') + parser.add_argument('--config', help='Path to a configuration parameters .ini file.') + parser.add_argument('--weighted-average', help='Compute the mAP using the weighted average of precisions among classes.', action='store_true') + parser.add_argument('--compute-val-loss', help='Compute validation loss during training', dest='compute_val_loss', action='store_true') + parser.add_argument('--reduce-lr-patience', help='Reduce learning rate after validation loss decreases over reduce_lr_patience epochs', type=int, default=2) + parser.add_argument('--reduce-lr-factor', help='When learning rate is reduced due to reduce_lr_patience, multiply by reduce_lr_factor', type=float, default=0.1) + + # Fit generator arguments + parser.add_argument('--multiprocessing', help='Use multiprocessing in fit_generator.', action='store_true') + parser.add_argument('--workers', help='Number of generator workers.', type=int, default=1) + parser.add_argument('--max-queue-size', help='Queue length for multiprocessing workers in fit_generator.', type=int, default=10) + + return check_args(parser.parse_args(args)) + + +def main(args=None): + # parse arguments + if args is None: + args = sys.argv[1:] + args = parse_args(args) + + # create object that stores backbone information + backbone = models.backbone(args.backbone) + + # make sure keras and tensorflow are the minimum required version + check_keras_version() + check_tf_version() + + # optionally choose specific GPU + if args.gpu is not None: + setup_gpu(args.gpu) + + # optionally load config parameters + if args.config: + args.config = read_config_file(args.config) + + # create the generators + train_generator, validation_generator = create_generators(args, backbone.preprocess_image) + + # create the model + if args.snapshot is not None: + print('Loading model, this may take a second...') + model = models.load_model(args.snapshot, backbone_name=args.backbone) + training_model = model + anchor_params = None + if args.config and 'anchor_parameters' in args.config: + anchor_params = parse_anchor_parameters(args.config) + prediction_model = retinanet_bbox(model=model, anchor_params=anchor_params) + else: + weights = args.weights + # default to imagenet if nothing else is specified + if weights is None and args.imagenet_weights: + weights = backbone.download_imagenet() + + print('Creating model, this may take a second...') + model, training_model, prediction_model = create_models( + backbone_retinanet=backbone.retinanet, + num_classes=train_generator.num_classes(), + weights=weights, + multi_gpu=args.multi_gpu, + freeze_backbone=args.freeze_backbone, + lr=args.lr, + config=args.config + ) + + # print model summary + print(model.summary()) + + # this lets the generator compute backbone layer shapes using the actual backbone model + if 'vgg' in args.backbone or 'densenet' in args.backbone: + train_generator.compute_shapes = make_shapes_callback(model) + if validation_generator: + validation_generator.compute_shapes = train_generator.compute_shapes + + # create the callbacks + callbacks = create_callbacks( + model, + training_model, + prediction_model, + validation_generator, + args, + ) + + if not args.compute_val_loss: + validation_generator = None + + # start training + return training_model.fit_generator( + generator=train_generator, + steps_per_epoch=args.steps, + epochs=args.epochs, + verbose=1, + callbacks=callbacks, + workers=args.workers, + use_multiprocessing=args.multiprocessing, + 
max_queue_size=args.max_queue_size, + validation_data=validation_generator, + initial_epoch=args.initial_epoch + ) + + +if __name__ == '__main__': + main() diff --git a/src/keras_retinanet/callbacks/__init__.py b/src/keras_retinanet/callbacks/__init__.py new file mode 100644 index 0000000..7316c99 --- /dev/null +++ b/src/keras_retinanet/callbacks/__init__.py @@ -0,0 +1 @@ +from .common import * # noqa: F401,F403 diff --git a/src/keras_retinanet/callbacks/coco.py b/src/keras_retinanet/callbacks/coco.py new file mode 100644 index 0000000..7f9cc70 --- /dev/null +++ b/src/keras_retinanet/callbacks/coco.py @@ -0,0 +1,67 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from ..utils.coco_eval import evaluate_coco + + +class CocoEval(keras.callbacks.Callback): + """ Performs COCO evaluation on each epoch. + """ + def __init__(self, generator, tensorboard=None, threshold=0.05): + """ CocoEval callback intializer. + + Args + generator : The generator used for creating validation data. + tensorboard : If given, the results will be written to tensorboard. + threshold : The score threshold to use. + """ + self.generator = generator + self.threshold = threshold + self.tensorboard = tensorboard + + super(CocoEval, self).__init__() + + def on_epoch_end(self, epoch, logs=None): + logs = logs or {} + + coco_tag = ['AP @[ IoU=0.50:0.95 | area= all | maxDets=100 ]', + 'AP @[ IoU=0.50 | area= all | maxDets=100 ]', + 'AP @[ IoU=0.75 | area= all | maxDets=100 ]', + 'AP @[ IoU=0.50:0.95 | area= small | maxDets=100 ]', + 'AP @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]', + 'AP @[ IoU=0.50:0.95 | area= large | maxDets=100 ]', + 'AR @[ IoU=0.50:0.95 | area= all | maxDets= 1 ]', + 'AR @[ IoU=0.50:0.95 | area= all | maxDets= 10 ]', + 'AR @[ IoU=0.50:0.95 | area= all | maxDets=100 ]', + 'AR @[ IoU=0.50:0.95 | area= small | maxDets=100 ]', + 'AR @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]', + 'AR @[ IoU=0.50:0.95 | area= large | maxDets=100 ]'] + coco_eval_stats = evaluate_coco(self.generator, self.model, self.threshold) + + if coco_eval_stats is not None: + for index, result in enumerate(coco_eval_stats): + logs[coco_tag[index]] = result + + if self.tensorboard: + import tensorflow as tf + if tf.version.VERSION < '2.0.0' and self.tensorboard.writer: + summary = tf.Summary() + for index, result in enumerate(coco_eval_stats): + summary_value = summary.value.add() + summary_value.simple_value = result + summary_value.tag = '{}. {}'.format(index + 1, coco_tag[index]) + self.tensorboard.writer.add_summary(summary, epoch) diff --git a/src/keras_retinanet/callbacks/common.py b/src/keras_retinanet/callbacks/common.py new file mode 100644 index 0000000..67c00e1 --- /dev/null +++ b/src/keras_retinanet/callbacks/common.py @@ -0,0 +1,46 @@ +import keras.callbacks + + +class RedirectModel(keras.callbacks.Callback): + """Callback which wraps another callback, but executed on a different model. 
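+ + Typically used for multi-gpu training: checkpoint and evaluation callbacks should act on the original (template) model rather than the parallel wrapper, as in the example below.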
+ + ```python + model = keras.models.load_model('model.h5') + model_checkpoint = ModelCheckpoint(filepath='snapshot.h5') + parallel_model = multi_gpu_model(model, gpus=2) + parallel_model.fit(X_train, Y_train, callbacks=[RedirectModel(model_checkpoint, model)]) + ``` + + Args + callback : callback to wrap. + model : model to use when executing callbacks. + """ + + def __init__(self, + callback, + model): + super(RedirectModel, self).__init__() + + self.callback = callback + self.redirect_model = model + + def on_epoch_begin(self, epoch, logs=None): + self.callback.on_epoch_begin(epoch, logs=logs) + + def on_epoch_end(self, epoch, logs=None): + self.callback.on_epoch_end(epoch, logs=logs) + + def on_batch_begin(self, batch, logs=None): + self.callback.on_batch_begin(batch, logs=logs) + + def on_batch_end(self, batch, logs=None): + self.callback.on_batch_end(batch, logs=logs) + + def on_train_begin(self, logs=None): + # overwrite the model with our custom model + self.callback.set_model(self.redirect_model) + + self.callback.on_train_begin(logs=logs) + + def on_train_end(self, logs=None): + self.callback.on_train_end(logs=logs) diff --git a/src/keras_retinanet/callbacks/eval.py b/src/keras_retinanet/callbacks/eval.py new file mode 100644 index 0000000..abdc8bb --- /dev/null +++ b/src/keras_retinanet/callbacks/eval.py @@ -0,0 +1,98 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from ..utils.eval import evaluate + + +class Evaluate(keras.callbacks.Callback): + """ Evaluation callback for arbitrary datasets. + """ + + def __init__( + self, + generator, + iou_threshold=0.5, + score_threshold=0.05, + max_detections=100, + save_path=None, + tensorboard=None, + weighted_average=False, + verbose=1 + ): + """ Evaluate a given dataset using a given model at the end of every epoch during training. + + # Arguments + generator : The generator that represents the dataset to evaluate. + iou_threshold : The threshold used to consider when a detection is positive or negative. + score_threshold : The score confidence threshold to use for detections. + max_detections : The maximum number of detections to use per image. + save_path : The path to save images with visualized detections to. + tensorboard : Instance of keras.callbacks.TensorBoard used to log the mAP value. + weighted_average : Compute the mAP using the weighted average of precisions among classes. + verbose : Set the verbosity level, by default this is set to 1. 
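+ + Example (illustrative; variable names are placeholders): evaluation = Evaluate(validation_generator, tensorboard=tensorboard_callback, weighted_average=True)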
+ """ + self.generator = generator + self.iou_threshold = iou_threshold + self.score_threshold = score_threshold + self.max_detections = max_detections + self.save_path = save_path + self.tensorboard = tensorboard + self.weighted_average = weighted_average + self.verbose = verbose + + super(Evaluate, self).__init__() + + def on_epoch_end(self, epoch, logs=None): + logs = logs or {} + + # run evaluation + average_precisions, _ = evaluate( + self.generator, + self.model, + iou_threshold=self.iou_threshold, + score_threshold=self.score_threshold, + max_detections=self.max_detections, + save_path=self.save_path + ) + + # compute per class average precision + total_instances = [] + precisions = [] + for label, (average_precision, num_annotations) in average_precisions.items(): + if self.verbose == 1: + print('{:.0f} instances of class'.format(num_annotations), + self.generator.label_to_name(label), 'with average precision: {:.4f}'.format(average_precision)) + total_instances.append(num_annotations) + precisions.append(average_precision) + if self.weighted_average: + self.mean_ap = sum([a * b for a, b in zip(total_instances, precisions)]) / sum(total_instances) + else: + self.mean_ap = sum(precisions) / sum(x > 0 for x in total_instances) + + if self.tensorboard: + import tensorflow as tf + if tf.version.VERSION < '2.0.0' and self.tensorboard.writer: + summary = tf.Summary() + summary_value = summary.value.add() + summary_value.simple_value = self.mean_ap + summary_value.tag = "mAP" + self.tensorboard.writer.add_summary(summary, epoch) + + logs['mAP'] = self.mean_ap + + if self.verbose == 1: + print('mAP: {:.4f}'.format(self.mean_ap)) diff --git a/src/keras_retinanet/initializers.py b/src/keras_retinanet/initializers.py new file mode 100644 index 0000000..f41faf8 --- /dev/null +++ b/src/keras_retinanet/initializers.py @@ -0,0 +1,39 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras + +import numpy as np +import math + + +class PriorProbability(keras.initializers.Initializer): + """ Apply a prior probability to the weights. 
+ """ + + def __init__(self, probability=0.01): + self.probability = probability + + def get_config(self): + return { + 'probability': self.probability + } + + def __call__(self, shape, dtype=None): + # set bias to -log((1 - p)/p) for foreground + result = np.ones(shape, dtype=dtype) * -math.log((1 - self.probability) / self.probability) + + return result diff --git a/src/keras_retinanet/layers/__init__.py b/src/keras_retinanet/layers/__init__.py new file mode 100644 index 0000000..5a8c7d3 --- /dev/null +++ b/src/keras_retinanet/layers/__init__.py @@ -0,0 +1,2 @@ +from ._misc import RegressBoxes, UpsampleLike, Anchors, ClipBoxes # noqa: F401 +from .filter_detections import FilterDetections # noqa: F401 diff --git a/src/keras_retinanet/layers/_misc.py b/src/keras_retinanet/layers/_misc.py new file mode 100644 index 0000000..6fc19b0 --- /dev/null +++ b/src/keras_retinanet/layers/_misc.py @@ -0,0 +1,185 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from .. import backend +from ..utils import anchors as utils_anchors + +import numpy as np + + +class Anchors(keras.layers.Layer): + """ Keras layer for generating achors for a given shape. + """ + + def __init__(self, size, stride, ratios=None, scales=None, *args, **kwargs): + """ Initializer for an Anchors layer. + + Args + size: The base size of the anchors to generate. + stride: The stride of the anchors to generate. + ratios: The ratios of the anchors to generate (defaults to AnchorParameters.default.ratios). + scales: The scales of the anchors to generate (defaults to AnchorParameters.default.scales). 
+ """ + self.size = size + self.stride = stride + self.ratios = ratios + self.scales = scales + + if ratios is None: + self.ratios = utils_anchors.AnchorParameters.default.ratios + elif isinstance(ratios, list): + self.ratios = np.array(ratios) + if scales is None: + self.scales = utils_anchors.AnchorParameters.default.scales + elif isinstance(scales, list): + self.scales = np.array(scales) + + self.num_anchors = len(self.ratios) * len(self.scales) + self.anchors = keras.backend.variable(utils_anchors.generate_anchors( + base_size=self.size, + ratios=self.ratios, + scales=self.scales, + )) + + super(Anchors, self).__init__(*args, **kwargs) + + def call(self, inputs, **kwargs): + features = inputs + features_shape = keras.backend.shape(features) + + # generate proposals from bbox deltas and shifted anchors + if keras.backend.image_data_format() == 'channels_first': + anchors = backend.shift(features_shape[2:4], self.stride, self.anchors) + else: + anchors = backend.shift(features_shape[1:3], self.stride, self.anchors) + anchors = keras.backend.tile(keras.backend.expand_dims(anchors, axis=0), (features_shape[0], 1, 1)) + + return anchors + + def compute_output_shape(self, input_shape): + if None not in input_shape[1:]: + if keras.backend.image_data_format() == 'channels_first': + total = np.prod(input_shape[2:4]) * self.num_anchors + else: + total = np.prod(input_shape[1:3]) * self.num_anchors + + return (input_shape[0], total, 4) + else: + return (input_shape[0], None, 4) + + def get_config(self): + config = super(Anchors, self).get_config() + config.update({ + 'size' : self.size, + 'stride' : self.stride, + 'ratios' : self.ratios.tolist(), + 'scales' : self.scales.tolist(), + }) + + return config + + +class UpsampleLike(keras.layers.Layer): + """ Keras layer for upsampling a Tensor to be the same shape as another Tensor. + """ + + def call(self, inputs, **kwargs): + source, target = inputs + target_shape = keras.backend.shape(target) + if keras.backend.image_data_format() == 'channels_first': + source = backend.transpose(source, (0, 2, 3, 1)) + output = backend.resize_images(source, (target_shape[2], target_shape[3]), method='nearest') + output = backend.transpose(output, (0, 3, 1, 2)) + return output + else: + return backend.resize_images(source, (target_shape[1], target_shape[2]), method='nearest') + + def compute_output_shape(self, input_shape): + if keras.backend.image_data_format() == 'channels_first': + return (input_shape[0][0], input_shape[0][1]) + input_shape[1][2:4] + else: + return (input_shape[0][0],) + input_shape[1][1:3] + (input_shape[0][-1],) + + +class RegressBoxes(keras.layers.Layer): + """ Keras layer for applying regression values to boxes. + """ + + def __init__(self, mean=None, std=None, *args, **kwargs): + """ Initializer for the RegressBoxes layer. + + Args + mean: The mean value of the regression values which was used for normalization. + std: The standard value of the regression values which was used for normalization. + """ + if mean is None: + mean = np.array([0, 0, 0, 0]) + if std is None: + std = np.array([0.2, 0.2, 0.2, 0.2]) + + if isinstance(mean, (list, tuple)): + mean = np.array(mean) + elif not isinstance(mean, np.ndarray): + raise ValueError('Expected mean to be a np.ndarray, list or tuple. Received: {}'.format(type(mean))) + + if isinstance(std, (list, tuple)): + std = np.array(std) + elif not isinstance(std, np.ndarray): + raise ValueError('Expected std to be a np.ndarray, list or tuple. 
Received: {}'.format(type(std))) + + self.mean = mean + self.std = std + super(RegressBoxes, self).__init__(*args, **kwargs) + + def call(self, inputs, **kwargs): + anchors, regression = inputs + return backend.bbox_transform_inv(anchors, regression, mean=self.mean, std=self.std) + + def compute_output_shape(self, input_shape): + return input_shape[0] + + def get_config(self): + config = super(RegressBoxes, self).get_config() + config.update({ + 'mean': self.mean.tolist(), + 'std' : self.std.tolist(), + }) + + return config + + +class ClipBoxes(keras.layers.Layer): + """ Keras layer to clip box values to lie inside a given shape. + """ + def call(self, inputs, **kwargs): + image, boxes = inputs + shape = keras.backend.cast(keras.backend.shape(image), keras.backend.floatx()) + if keras.backend.image_data_format() == 'channels_first': + _, _, height, width = backend.unstack(shape, axis=0) + else: + _, height, width, _ = backend.unstack(shape, axis=0) + + x1, y1, x2, y2 = backend.unstack(boxes, axis=-1) + x1 = backend.clip_by_value(x1, 0, width - 1) + y1 = backend.clip_by_value(y1, 0, height - 1) + x2 = backend.clip_by_value(x2, 0, width - 1) + y2 = backend.clip_by_value(y2, 0, height - 1) + + return keras.backend.stack([x1, y1, x2, y2], axis=2) + + def compute_output_shape(self, input_shape): + return input_shape[1] diff --git a/src/keras_retinanet/layers/filter_detections.py b/src/keras_retinanet/layers/filter_detections.py new file mode 100644 index 0000000..f73e918 --- /dev/null +++ b/src/keras_retinanet/layers/filter_detections.py @@ -0,0 +1,223 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from .. import backend + + +def filter_detections( + boxes, + classification, + other = [], + class_specific_filter = True, + nms = True, + score_threshold = 0.05, + max_detections = 300, + nms_threshold = 0.5 +): + """ Filter detections using the boxes and classification values. + + Args + boxes : Tensor of shape (num_boxes, 4) containing the boxes in (x1, y1, x2, y2) format. + classification : Tensor of shape (num_boxes, num_classes) containing the classification scores. + other : List of tensors of shape (num_boxes, ...) to filter along with the boxes and classification scores. + class_specific_filter : Whether to perform filtering per class, or take the best scoring class and filter those. + nms : Flag to enable/disable non maximum suppression. + score_threshold : Threshold used to prefilter the boxes with. + max_detections : Maximum number of detections to keep. + nms_threshold : Threshold for the IoU value to determine when a box should be suppressed. + + Returns + A list of [boxes, scores, labels, other[0], other[1], ...]. + boxes is shaped (max_detections, 4) and contains the (x1, y1, x2, y2) of the non-suppressed boxes. + scores is shaped (max_detections,) and contains the scores of the predicted class. + labels is shaped (max_detections,) and contains the predicted label. + other[i] is shaped (max_detections, ...) 
and contains the filtered other[i] data. + In case there are less than max_detections detections, the tensors are padded with -1's. + """ + def _filter_detections(scores, labels): + # threshold based on score + indices = backend.where(keras.backend.greater(scores, score_threshold)) + + if nms: + filtered_boxes = backend.gather_nd(boxes, indices) + filtered_scores = keras.backend.gather(scores, indices)[:, 0] + + # perform NMS + nms_indices = backend.non_max_suppression(filtered_boxes, filtered_scores, max_output_size=max_detections, iou_threshold=nms_threshold) + + # filter indices based on NMS + indices = keras.backend.gather(indices, nms_indices) + + # add indices to list of all indices + labels = backend.gather_nd(labels, indices) + indices = keras.backend.stack([indices[:, 0], labels], axis=1) + + return indices + + if class_specific_filter: + all_indices = [] + # perform per class filtering + for c in range(int(classification.shape[1])): + scores = classification[:, c] + labels = c * backend.ones((keras.backend.shape(scores)[0],), dtype='int64') + all_indices.append(_filter_detections(scores, labels)) + + # concatenate indices to single tensor + indices = keras.backend.concatenate(all_indices, axis=0) + else: + scores = keras.backend.max(classification, axis = 1) + labels = keras.backend.argmax(classification, axis = 1) + indices = _filter_detections(scores, labels) + + # select top k + scores = backend.gather_nd(classification, indices) + labels = indices[:, 1] + scores, top_indices = backend.top_k(scores, k=keras.backend.minimum(max_detections, keras.backend.shape(scores)[0])) + + # filter input using the final set of indices + indices = keras.backend.gather(indices[:, 0], top_indices) + boxes = keras.backend.gather(boxes, indices) + labels = keras.backend.gather(labels, top_indices) + other_ = [keras.backend.gather(o, indices) for o in other] + + # zero pad the outputs + pad_size = keras.backend.maximum(0, max_detections - keras.backend.shape(scores)[0]) + boxes = backend.pad(boxes, [[0, pad_size], [0, 0]], constant_values=-1) + scores = backend.pad(scores, [[0, pad_size]], constant_values=-1) + labels = backend.pad(labels, [[0, pad_size]], constant_values=-1) + labels = keras.backend.cast(labels, 'int32') + other_ = [backend.pad(o, [[0, pad_size]] + [[0, 0] for _ in range(1, len(o.shape))], constant_values=-1) for o in other_] + + # set shapes, since we know what they are + boxes.set_shape([max_detections, 4]) + scores.set_shape([max_detections]) + labels.set_shape([max_detections]) + for o, s in zip(other_, [list(keras.backend.int_shape(o)) for o in other]): + o.set_shape([max_detections] + s[1:]) + + return [boxes, scores, labels] + other_ + + +class FilterDetections(keras.layers.Layer): + """ Keras layer for filtering detections using score threshold and NMS. + """ + + def __init__( + self, + nms = True, + class_specific_filter = True, + nms_threshold = 0.5, + score_threshold = 0.05, + max_detections = 300, + parallel_iterations = 32, + **kwargs + ): + """ Filters detections using score threshold, NMS and selecting the top-k detections. + + Args + nms : Flag to enable/disable NMS. + class_specific_filter : Whether to perform filtering per class, or take the best scoring class and filter those. + nms_threshold : Threshold for the IoU value to determine when a box should be suppressed. + score_threshold : Threshold used to prefilter the boxes with. + max_detections : Maximum number of detections to keep. + parallel_iterations : Number of batch items to process in parallel. 
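+ + With the defaults above, each image keeps at most 300 detections: boxes scoring below 0.05 are discarded, overlapping boxes with IoU > 0.5 are suppressed (per class when class_specific_filter is True), and the outputs are padded with -1's up to max_detections.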
+ """ + self.nms = nms + self.class_specific_filter = class_specific_filter + self.nms_threshold = nms_threshold + self.score_threshold = score_threshold + self.max_detections = max_detections + self.parallel_iterations = parallel_iterations + super(FilterDetections, self).__init__(**kwargs) + + def call(self, inputs, **kwargs): + """ Constructs the NMS graph. + + Args + inputs : List of [boxes, classification, other[0], other[1], ...] tensors. + """ + boxes = inputs[0] + classification = inputs[1] + other = inputs[2:] + + # wrap nms with our parameters + def _filter_detections(args): + boxes = args[0] + classification = args[1] + other = args[2] + + return filter_detections( + boxes, + classification, + other, + nms = self.nms, + class_specific_filter = self.class_specific_filter, + score_threshold = self.score_threshold, + max_detections = self.max_detections, + nms_threshold = self.nms_threshold, + ) + + # call filter_detections on each batch + outputs = backend.map_fn( + _filter_detections, + elems=[boxes, classification, other], + dtype=[keras.backend.floatx(), keras.backend.floatx(), 'int32'] + [o.dtype for o in other], + parallel_iterations=self.parallel_iterations + ) + + return outputs + + def compute_output_shape(self, input_shape): + """ Computes the output shapes given the input shapes. + + Args + input_shape : List of input shapes [boxes, classification, other[0], other[1], ...]. + + Returns + List of tuples representing the output shapes: + [filtered_boxes.shape, filtered_scores.shape, filtered_labels.shape, filtered_other[0].shape, filtered_other[1].shape, ...] + """ + return [ + (input_shape[0][0], self.max_detections, 4), + (input_shape[1][0], self.max_detections), + (input_shape[1][0], self.max_detections), + ] + [ + tuple([input_shape[i][0], self.max_detections] + list(input_shape[i][2:])) for i in range(2, len(input_shape)) + ] + + def compute_mask(self, inputs, mask=None): + """ This is required in Keras when there is more than 1 output. + """ + return (len(inputs) + 1) * [None] + + def get_config(self): + """ Gets the configuration of this layer. + + Returns + Dictionary containing the parameters of this layer. + """ + config = super(FilterDetections, self).get_config() + config.update({ + 'nms' : self.nms, + 'class_specific_filter' : self.class_specific_filter, + 'nms_threshold' : self.nms_threshold, + 'score_threshold' : self.score_threshold, + 'max_detections' : self.max_detections, + 'parallel_iterations' : self.parallel_iterations, + }) + + return config diff --git a/src/keras_retinanet/losses.py b/src/keras_retinanet/losses.py new file mode 100644 index 0000000..382a975 --- /dev/null +++ b/src/keras_retinanet/losses.py @@ -0,0 +1,118 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from . import backend + + +def focal(alpha=0.25, gamma=2.0, cutoff=0.5): + """ Create a functor for computing the focal loss. + + Args + alpha: Scale the focal weight with alpha. 
+ gamma: Take the power of the focal weight with gamma. + cutoff: Positive prediction cutoff for soft targets + + Returns + A functor that computes the focal loss using the alpha and gamma. + """ + def _focal(y_true, y_pred): + """ Compute the focal loss given the target tensor and the predicted tensor. + + As defined in https://arxiv.org/abs/1708.02002 + + Args + y_true: Tensor of target data from the generator with shape (B, N, num_classes). + y_pred: Tensor of predicted data from the network with shape (B, N, num_classes). + + Returns + The focal loss of y_pred w.r.t. y_true. + """ + labels = y_true[:, :, :-1] + anchor_state = y_true[:, :, -1] # -1 for ignore, 0 for background, 1 for object + classification = y_pred + + # filter out "ignore" anchors + indices = backend.where(keras.backend.not_equal(anchor_state, -1)) + labels = backend.gather_nd(labels, indices) + classification = backend.gather_nd(classification, indices) + + # compute the focal loss + alpha_factor = keras.backend.ones_like(labels) * alpha + alpha_factor = backend.where(keras.backend.greater(labels, cutoff), alpha_factor, 1 - alpha_factor) + focal_weight = backend.where(keras.backend.greater(labels, cutoff), 1 - classification, classification) + focal_weight = alpha_factor * focal_weight ** gamma + + cls_loss = focal_weight * keras.backend.binary_crossentropy(labels, classification) + + # compute the normalizer: the number of positive anchors + normalizer = backend.where(keras.backend.equal(anchor_state, 1)) + normalizer = keras.backend.cast(keras.backend.shape(normalizer)[0], keras.backend.floatx()) + normalizer = keras.backend.maximum(keras.backend.cast_to_floatx(1.0), normalizer) + + return keras.backend.sum(cls_loss) / normalizer + + return _focal + + +def smooth_l1(sigma=3.0): + """ Create a smooth L1 loss functor. + + Args + sigma: This argument defines the point where the loss changes from L2 to L1. + + Returns + A functor for computing the smooth L1 loss given target data and predicted data. + """ + sigma_squared = sigma ** 2 + + def _smooth_l1(y_true, y_pred): + """ Compute the smooth L1 loss of y_pred w.r.t. y_true. + + Args + y_true: Tensor from the generator of shape (B, N, 5). The last value for each box is the state of the anchor (ignore, negative, positive). + y_pred: Tensor from the network of shape (B, N, 4). + + Returns + The smooth L1 loss of y_pred w.r.t. y_true. 
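+ + For illustration, with the default sigma=3 (sigma_squared=9) the loss is quadratic for |x| < 1/9 and linear above: |x| = 0.05 gives 0.5 * 9 * 0.05**2 = 0.01125, while |x| = 0.5 gives 0.5 - 0.5 / 9 ~= 0.444.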
+ """ + # separate target and state + regression = y_pred + regression_target = y_true[:, :, :-1] + anchor_state = y_true[:, :, -1] + + # filter out "ignore" anchors + indices = backend.where(keras.backend.equal(anchor_state, 1)) + regression = backend.gather_nd(regression, indices) + regression_target = backend.gather_nd(regression_target, indices) + + # compute smooth L1 loss + # f(x) = 0.5 * (sigma * x)^2 if |x| < 1 / sigma / sigma + # |x| - 0.5 / sigma / sigma otherwise + regression_diff = regression - regression_target + regression_diff = keras.backend.abs(regression_diff) + regression_loss = backend.where( + keras.backend.less(regression_diff, 1.0 / sigma_squared), + 0.5 * sigma_squared * keras.backend.pow(regression_diff, 2), + regression_diff - 0.5 / sigma_squared + ) + + # compute the normalizer: the number of positive anchors + normalizer = keras.backend.maximum(1, keras.backend.shape(indices)[0]) + normalizer = keras.backend.cast(normalizer, dtype=keras.backend.floatx()) + return keras.backend.sum(regression_loss) / normalizer + + return _smooth_l1 diff --git a/src/keras_retinanet/models/__init__.py b/src/keras_retinanet/models/__init__.py new file mode 100644 index 0000000..3b05ca8 --- /dev/null +++ b/src/keras_retinanet/models/__init__.py @@ -0,0 +1,125 @@ +from __future__ import print_function +import sys + + +class Backbone(object): + """ This class stores additional information on backbones. + """ + def __init__(self, backbone): + # a dictionary mapping custom layer names to the correct classes + from .. import layers + from .. import losses + from .. import initializers + self.custom_objects = { + 'UpsampleLike' : layers.UpsampleLike, + 'PriorProbability' : initializers.PriorProbability, + 'RegressBoxes' : layers.RegressBoxes, + 'FilterDetections' : layers.FilterDetections, + 'Anchors' : layers.Anchors, + 'ClipBoxes' : layers.ClipBoxes, + '_smooth_l1' : losses.smooth_l1(), + '_focal' : losses.focal(), + } + + self.backbone = backbone + self.validate() + + def retinanet(self, *args, **kwargs): + """ Returns a retinanet model using the correct backbone. + """ + raise NotImplementedError('retinanet method not implemented.') + + def download_imagenet(self): + """ Downloads ImageNet weights and returns path to weights file. + """ + raise NotImplementedError('download_imagenet method not implemented.') + + def validate(self): + """ Checks whether the backbone string is correct. + """ + raise NotImplementedError('validate method not implemented.') + + def preprocess_image(self, inputs): + """ Takes as input an image and prepares it for being passed through the network. + Having this function in Backbone allows other backbones to define a specific preprocessing step. + """ + raise NotImplementedError('preprocess_image method not implemented.') + + +def backbone(backbone_name): + """ Returns a backbone object for the given backbone. 
+ """ + if 'densenet' in backbone_name: + from .densenet import DenseNetBackbone as b + elif 'seresnext' in backbone_name or 'seresnet' in backbone_name or 'senet' in backbone_name: + from .senet import SeBackbone as b + elif 'resnet' in backbone_name: + from .resnet import ResNetBackbone as b + elif 'mobilenet' in backbone_name: + from .mobilenet import MobileNetBackbone as b + elif 'vgg' in backbone_name: + from .vgg import VGGBackbone as b + elif 'EfficientNet' in backbone_name: + from .effnet import EfficientNetBackbone as b + else: + raise NotImplementedError('Backbone class for \'{}\' not implemented.'.format(backbone)) + + return b(backbone_name) + + +def load_model(filepath, backbone_name='resnet50'): + """ Loads a retinanet model using the correct custom objects. + + Args + filepath: one of the following: + - string, path to the saved model, or + - h5py.File object from which to load the model + backbone_name : Backbone with which the model was trained. + + Returns + A keras.models.Model object. + + Raises + ImportError: if h5py is not available. + ValueError: In case of an invalid savefile. + """ + import keras.models + return keras.models.load_model(filepath, custom_objects=backbone(backbone_name).custom_objects) + + +def convert_model(model, nms=True, class_specific_filter=True, anchor_params=None, **kwargs): + """ Converts a training model to an inference model. + + Args + model : A retinanet training model. + nms : Boolean, whether to add NMS filtering to the converted model. + class_specific_filter : Whether to use class specific filtering or filter for the best scoring class only. + anchor_params : Anchor parameters object. If omitted, default values are used. + **kwargs : Inference and minimal retinanet model settings. + + Returns + A keras.models.Model object. + + Raises + ImportError: if h5py is not available. + ValueError: In case of an invalid savefile. + """ + from .retinanet import retinanet_bbox + return retinanet_bbox(model=model, nms=nms, class_specific_filter=class_specific_filter, anchor_params=anchor_params, **kwargs) + + +def assert_training_model(model): + """ Assert that the model is a training model. + """ + assert(all(output in model.output_names for output in ['regression', 'classification'])), \ + "Input is not a training model (no 'regression' and 'classification' outputs were found, outputs are: {}).".format(model.output_names) + + +def check_training_model(model): + """ Check that model is a training model and exit otherwise. + """ + try: + assert_training_model(model) + except AssertionError as e: + print(e, file=sys.stderr) + sys.exit(1) diff --git a/src/keras_retinanet/models/densenet.py b/src/keras_retinanet/models/densenet.py new file mode 100644 index 0000000..c3aafd0 --- /dev/null +++ b/src/keras_retinanet/models/densenet.py @@ -0,0 +1,105 @@ +""" +Copyright 2018 vidosits (https://github.com/vidosits/) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from keras.applications import densenet +from keras.utils import get_file + +from . 
import retinanet +from . import Backbone +from ..utils.image import preprocess_image + + +allowed_backbones = { + 'densenet121': ([6, 12, 24, 16], densenet.DenseNet121), + 'densenet169': ([6, 12, 32, 32], densenet.DenseNet169), + 'densenet201': ([6, 12, 48, 32], densenet.DenseNet201), +} + + +class DenseNetBackbone(Backbone): + """ Describes backbone information and provides utility functions. + """ + + def retinanet(self, *args, **kwargs): + """ Returns a retinanet model using the correct backbone. + """ + return densenet_retinanet(*args, backbone=self.backbone, **kwargs) + + def download_imagenet(self): + """ Download pre-trained weights for the specified backbone name. + This name is in the format {backbone}_weights_tf_dim_ordering_tf_kernels_notop + where backbone is the densenet + number of layers (e.g. densenet121). + For more info check the explanation from the keras densenet script itself: + https://github.com/keras-team/keras/blob/master/keras/applications/densenet.py + """ + origin = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/' + file_name = '{}_weights_tf_dim_ordering_tf_kernels_notop.h5' + + # load weights + if keras.backend.image_data_format() == 'channels_first': + raise ValueError('Weights for "channels_first" format are not available.') + + weights_url = origin + file_name.format(self.backbone) + return get_file(file_name.format(self.backbone), weights_url, cache_subdir='models') + + def validate(self): + """ Checks whether the backbone string is correct. + """ + backbone = self.backbone.split('_')[0] + + if backbone not in allowed_backbones: + raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(backbone, allowed_backbones.keys())) + + def preprocess_image(self, inputs): + """ Takes as input an image and prepares it for being passed through the network. + """ + return preprocess_image(inputs, mode='tf') + + +def densenet_retinanet(num_classes, backbone='densenet121', inputs=None, modifier=None, **kwargs): + """ Constructs a retinanet model using a densenet backbone. + + Args + num_classes: Number of classes to predict. + backbone: Which backbone to use (one of ('densenet121', 'densenet169', 'densenet201')). + inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)). + modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example). + + Returns + RetinaNet model with a DenseNet backbone. 
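+ + Note: the outputs of the last three dense blocks (layer_outputs[1:] below) are used as the C3, C4 and C5 feature stages that feed the feature pyramid.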
+ """ + # choose default input + if inputs is None: + inputs = keras.layers.Input((None, None, 3)) + + blocks, creator = allowed_backbones[backbone] + model = creator(input_tensor=inputs, include_top=False, pooling=None, weights=None) + + # get last conv layer from the end of each dense block + layer_outputs = [model.get_layer(name='conv{}_block{}_concat'.format(idx + 2, block_num)).output for idx, block_num in enumerate(blocks)] + + # create the densenet backbone + model = keras.models.Model(inputs=inputs, outputs=layer_outputs[1:], name=model.name) + + # invoke modifier if given + if modifier: + model = modifier(model) + + # create the full model + model = retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=model.outputs, **kwargs) + + return model diff --git a/src/keras_retinanet/models/effnet.py b/src/keras_retinanet/models/effnet.py new file mode 100644 index 0000000..12591ad --- /dev/null +++ b/src/keras_retinanet/models/effnet.py @@ -0,0 +1,153 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from keras.utils import get_file + +from . import retinanet +from . import Backbone +import efficientnet.keras as efn + + +class EfficientNetBackbone(Backbone): + """ Describes backbone information and provides utility functions. + """ + + def __init__(self, backbone): + super(EfficientNetBackbone, self).__init__(backbone) + self.preprocess_image_func = None + + def retinanet(self, *args, **kwargs): + """ Returns a retinanet model using the correct backbone. + """ + return effnet_retinanet(*args, backbone=self.backbone, **kwargs) + + def download_imagenet(self): + """ Downloads ImageNet weights and returns path to weights file. + """ + from efficientnet.weights import IMAGENET_WEIGHTS_PATH + from efficientnet.weights import IMAGENET_WEIGHTS_HASHES + + model_name = 'efficientnet-b' + self.backbone[-1] + file_name = model_name + '_weights_tf_dim_ordering_tf_kernels_autoaugment_notop.h5' + file_hash = IMAGENET_WEIGHTS_HASHES[model_name][1] + weights_path = get_file(file_name, IMAGENET_WEIGHTS_PATH + file_name, cache_subdir='models', file_hash=file_hash) + return weights_path + + def validate(self): + """ Checks whether the backbone string is correct. + """ + allowed_backbones = ['EfficientNetB0', 'EfficientNetB1', 'EfficientNetB2', 'EfficientNetB3', 'EfficientNetB4', + 'EfficientNetB5', 'EfficientNetB6', 'EfficientNetB7'] + backbone = self.backbone.split('_')[0] + + if backbone not in allowed_backbones: + raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(backbone, allowed_backbones)) + + def preprocess_image(self, inputs): + """ Takes as input an image and prepares it for being passed through the network. + """ + return efn.preprocess_input(inputs) + + +def effnet_retinanet(num_classes, backbone='EfficientNetB0', inputs=None, modifier=None, **kwargs): + """ Constructs a retinanet model using a resnet backbone. + + Args + num_classes: Number of classes to predict. 
+ backbone: Which backbone to use (one of ('resnet50', 'resnet101', 'resnet152')). + inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)). + modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example). + + Returns + RetinaNet model with a ResNet backbone. + """ + # choose default input + if inputs is None: + if keras.backend.image_data_format() == 'channels_first': + inputs = keras.layers.Input(shape=(3, None, None)) + else: + # inputs = keras.layers.Input(shape=(224, 224, 3)) + inputs = keras.layers.Input(shape=(None, None, 3)) + + # get last conv layer from the end of each block [28x28, 14x14, 7x7] + if backbone == 'EfficientNetB0': + model = efn.EfficientNetB0(input_tensor=inputs, include_top=False, weights=None) + elif backbone == 'EfficientNetB1': + model = efn.EfficientNetB1(input_tensor=inputs, include_top=False, weights=None) + elif backbone == 'EfficientNetB2': + model = efn.EfficientNetB2(input_tensor=inputs, include_top=False, weights=None) + elif backbone == 'EfficientNetB3': + model = efn.EfficientNetB3(input_tensor=inputs, include_top=False, weights=None) + elif backbone == 'EfficientNetB4': + model = efn.EfficientNetB4(input_tensor=inputs, include_top=False, weights=None) + elif backbone == 'EfficientNetB5': + model = efn.EfficientNetB5(input_tensor=inputs, include_top=False, weights=None) + elif backbone == 'EfficientNetB6': + model = efn.EfficientNetB6(input_tensor=inputs, include_top=False, weights=None) + elif backbone == 'EfficientNetB7': + model = efn.EfficientNetB7(input_tensor=inputs, include_top=False, weights=None) + else: + raise ValueError('Backbone (\'{}\') is invalid.'.format(backbone)) + + layer_outputs = ['block4a_expand_activation', 'block6a_expand_activation', 'top_activation'] + + layer_outputs = [ + model.get_layer(name=layer_outputs[0]).output, # 28x28 + model.get_layer(name=layer_outputs[1]).output, # 14x14 + model.get_layer(name=layer_outputs[2]).output, # 7x7 + ] + # create the densenet backbone + model = keras.models.Model(inputs=inputs, outputs=layer_outputs, name=model.name) + + # invoke modifier if given + if modifier: + model = modifier(model) + + # create the full model + return retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=model.outputs, **kwargs) + + +def EfficientNetB0_retinanet(num_classes, inputs=None, **kwargs): + return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB0', inputs=inputs, **kwargs) + + +def EfficientNetB1_retinanet(num_classes, inputs=None, **kwargs): + return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB1', inputs=inputs, **kwargs) + + +def EfficientNetB2_retinanet(num_classes, inputs=None, **kwargs): + return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB2', inputs=inputs, **kwargs) + + +def EfficientNetB3_retinanet(num_classes, inputs=None, **kwargs): + return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB3', inputs=inputs, **kwargs) + + +def EfficientNetB4_retinanet(num_classes, inputs=None, **kwargs): + return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB4', inputs=inputs, **kwargs) + + +def EfficientNetB5_retinanet(num_classes, inputs=None, **kwargs): + return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB5', inputs=inputs, **kwargs) + + +def EfficientNetB6_retinanet(num_classes, inputs=None, **kwargs): + return effnet_retinanet(num_classes=num_classes, 
backbone='EfficientNetB6', inputs=inputs, **kwargs) + + +def EfficientNetB7_retinanet(num_classes, inputs=None, **kwargs): + return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB7', inputs=inputs, **kwargs) diff --git a/src/keras_retinanet/models/mobilenet.py b/src/keras_retinanet/models/mobilenet.py new file mode 100644 index 0000000..4a3850b --- /dev/null +++ b/src/keras_retinanet/models/mobilenet.py @@ -0,0 +1,109 @@ +""" +Copyright 2017-2018 lvaleriu (https://github.com/lvaleriu/) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from keras.applications import mobilenet +from keras.utils import get_file +from ..utils.image import preprocess_image + +from . import retinanet +from . import Backbone + + +class MobileNetBackbone(Backbone): + """ Describes backbone information and provides utility functions. + """ + + allowed_backbones = ['mobilenet128', 'mobilenet160', 'mobilenet192', 'mobilenet224'] + + def retinanet(self, *args, **kwargs): + """ Returns a retinanet model using the correct backbone. + """ + return mobilenet_retinanet(*args, backbone=self.backbone, **kwargs) + + def download_imagenet(self): + """ Download pre-trained weights for the specified backbone name. + This name is in the format mobilenet{rows}_{alpha} where rows is the + imagenet shape dimension and 'alpha' controls the width of the network. + For more info check the explanation from the keras mobilenet script itself. + """ + + alpha = float(self.backbone.split('_')[1]) + rows = int(self.backbone.split('_')[0].replace('mobilenet', '')) + + # load weights + if keras.backend.image_data_format() == 'channels_first': + raise ValueError('Weights for "channels_last" format ' + 'are not available.') + if alpha == 1.0: + alpha_text = '1_0' + elif alpha == 0.75: + alpha_text = '7_5' + elif alpha == 0.50: + alpha_text = '5_0' + else: + alpha_text = '2_5' + + model_name = 'mobilenet_{}_{}_tf_no_top.h5'.format(alpha_text, rows) + weights_url = mobilenet.mobilenet.BASE_WEIGHT_PATH + model_name + weights_path = get_file(model_name, weights_url, cache_subdir='models') + + return weights_path + + def validate(self): + """ Checks whether the backbone string is correct. + """ + backbone = self.backbone.split('_')[0] + + if backbone not in MobileNetBackbone.allowed_backbones: + raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(backbone, MobileNetBackbone.allowed_backbones)) + + def preprocess_image(self, inputs): + """ Takes as input an image and prepares it for being passed through the network. + """ + return preprocess_image(inputs, mode='tf') + + +def mobilenet_retinanet(num_classes, backbone='mobilenet224_1.0', inputs=None, modifier=None, **kwargs): + """ Constructs a retinanet model using a mobilenet backbone. + + Args + num_classes: Number of classes to predict. + backbone: Which backbone to use (one of ('mobilenet128', 'mobilenet160', 'mobilenet192', 'mobilenet224')). + inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)). 
+ modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example). + + Returns + RetinaNet model with a MobileNet backbone. + """ + alpha = float(backbone.split('_')[1]) + + # choose default input + if inputs is None: + inputs = keras.layers.Input((None, None, 3)) + + backbone = mobilenet.MobileNet(input_tensor=inputs, alpha=alpha, include_top=False, pooling=None, weights=None) + + # create the full model + layer_names = ['conv_pw_5_relu', 'conv_pw_11_relu', 'conv_pw_13_relu'] + layer_outputs = [backbone.get_layer(name).output for name in layer_names] + backbone = keras.models.Model(inputs=inputs, outputs=layer_outputs, name=backbone.name) + + # invoke modifier if given + if modifier: + backbone = modifier(backbone) + + return retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=backbone.outputs, **kwargs) diff --git a/src/keras_retinanet/models/resnet.py b/src/keras_retinanet/models/resnet.py new file mode 100644 index 0000000..3ed555d --- /dev/null +++ b/src/keras_retinanet/models/resnet.py @@ -0,0 +1,124 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from keras.utils import get_file +import keras_resnet +import keras_resnet.models + +from . import retinanet +from . import Backbone +from ..utils.image import preprocess_image + + +class ResNetBackbone(Backbone): + """ Describes backbone information and provides utility functions. + """ + + def __init__(self, backbone): + super(ResNetBackbone, self).__init__(backbone) + self.custom_objects.update(keras_resnet.custom_objects) + + def retinanet(self, *args, **kwargs): + """ Returns a retinanet model using the correct backbone. + """ + return resnet_retinanet(*args, backbone=self.backbone, **kwargs) + + def download_imagenet(self): + """ Downloads ImageNet weights and returns path to weights file. + """ + resnet_filename = 'ResNet-{}-model.keras.h5' + resnet_resource = 'https://github.com/fizyr/keras-models/releases/download/v0.0.1/{}'.format(resnet_filename) + depth = int(self.backbone.replace('resnet', '')) + + filename = resnet_filename.format(depth) + resource = resnet_resource.format(depth) + if depth == 50: + checksum = '3e9f4e4f77bbe2c9bec13b53ee1c2319' + elif depth == 101: + checksum = '05dc86924389e5b401a9ea0348a3213c' + elif depth == 152: + checksum = '6ee11ef2b135592f8031058820bb9e71' + + return get_file( + filename, + resource, + cache_subdir='models', + md5_hash=checksum + ) + + def validate(self): + """ Checks whether the backbone string is correct. + """ + allowed_backbones = ['resnet50', 'resnet101', 'resnet152'] + backbone = self.backbone.split('_')[0] + + if backbone not in allowed_backbones: + raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(backbone, allowed_backbones)) + + def preprocess_image(self, inputs): + """ Takes as input an image and prepares it for being passed through the network. 
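+ + For ResNet backbones this applies 'caffe'-style preprocessing: channels are reordered to BGR and the ImageNet channel means are subtracted, without scaling to [0, 1].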
+ """ + return preprocess_image(inputs, mode='caffe') + + +def resnet_retinanet(num_classes, backbone='resnet50', inputs=None, modifier=None, **kwargs): + """ Constructs a retinanet model using a resnet backbone. + + Args + num_classes: Number of classes to predict. + backbone: Which backbone to use (one of ('resnet50', 'resnet101', 'resnet152')). + inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)). + modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example). + + Returns + RetinaNet model with a ResNet backbone. + """ + # choose default input + if inputs is None: + if keras.backend.image_data_format() == 'channels_first': + inputs = keras.layers.Input(shape=(3, None, None)) + else: + inputs = keras.layers.Input(shape=(None, None, 3)) + + # create the resnet backbone + if backbone == 'resnet50': + resnet = keras_resnet.models.ResNet50(inputs, include_top=False, freeze_bn=True) + elif backbone == 'resnet101': + resnet = keras_resnet.models.ResNet101(inputs, include_top=False, freeze_bn=True) + elif backbone == 'resnet152': + resnet = keras_resnet.models.ResNet152(inputs, include_top=False, freeze_bn=True) + else: + raise ValueError('Backbone (\'{}\') is invalid.'.format(backbone)) + + # invoke modifier if given + if modifier: + resnet = modifier(resnet) + + # create the full model + return retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=resnet.outputs[1:], **kwargs) + + +def resnet50_retinanet(num_classes, inputs=None, **kwargs): + return resnet_retinanet(num_classes=num_classes, backbone='resnet50', inputs=inputs, **kwargs) + + +def resnet101_retinanet(num_classes, inputs=None, **kwargs): + return resnet_retinanet(num_classes=num_classes, backbone='resnet101', inputs=inputs, **kwargs) + + +def resnet152_retinanet(num_classes, inputs=None, **kwargs): + return resnet_retinanet(num_classes=num_classes, backbone='resnet152', inputs=inputs, **kwargs) diff --git a/src/keras_retinanet/models/retinanet.py b/src/keras_retinanet/models/retinanet.py new file mode 100644 index 0000000..b0065bb --- /dev/null +++ b/src/keras_retinanet/models/retinanet.py @@ -0,0 +1,364 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from .. import initializers +from .. import layers +from ..utils.anchors import AnchorParameters +from . import assert_training_model + + +def default_classification_model( + num_classes, + num_anchors, + pyramid_feature_size=256, + prior_probability=0.01, + classification_feature_size=256, + name='classification_submodel' +): + """ Creates the default classification submodel. + + Args + num_classes : Number of classes to predict a score for at each feature level. + num_anchors : Number of anchors to predict classification scores for at each feature level. + pyramid_feature_size : The number of filters to expect from the feature pyramid levels. 
+ classification_feature_size : The number of filters to use in the layers in the classification submodel. + name : The name of the submodel. + + Returns + A keras.models.Model that predicts classes for each anchor. + """ + options = { + 'kernel_size' : 3, + 'strides' : 1, + 'padding' : 'same', + } + + if keras.backend.image_data_format() == 'channels_first': + inputs = keras.layers.Input(shape=(pyramid_feature_size, None, None)) + else: + inputs = keras.layers.Input(shape=(None, None, pyramid_feature_size)) + outputs = inputs + for i in range(4): + outputs = keras.layers.Conv2D( + filters=classification_feature_size, + activation='relu', + name='pyramid_classification_{}'.format(i), + kernel_initializer=keras.initializers.normal(mean=0.0, stddev=0.01, seed=None), + bias_initializer='zeros', + **options + )(outputs) + + outputs = keras.layers.Conv2D( + filters=num_classes * num_anchors, + kernel_initializer=keras.initializers.normal(mean=0.0, stddev=0.01, seed=None), + bias_initializer=initializers.PriorProbability(probability=prior_probability), + name='pyramid_classification', + **options + )(outputs) + + # reshape output and apply sigmoid + if keras.backend.image_data_format() == 'channels_first': + outputs = keras.layers.Permute((2, 3, 1), name='pyramid_classification_permute')(outputs) + outputs = keras.layers.Reshape((-1, num_classes), name='pyramid_classification_reshape')(outputs) + outputs = keras.layers.Activation('sigmoid', name='pyramid_classification_sigmoid')(outputs) + + return keras.models.Model(inputs=inputs, outputs=outputs, name=name) + + +def default_regression_model(num_values, num_anchors, pyramid_feature_size=256, regression_feature_size=256, name='regression_submodel'): + """ Creates the default regression submodel. + + Args + num_values : Number of values to regress. + num_anchors : Number of anchors to regress for each feature level. + pyramid_feature_size : The number of filters to expect from the feature pyramid levels. + regression_feature_size : The number of filters to use in the layers in the regression submodel. + name : The name of the submodel. + + Returns + A keras.models.Model that predicts regression values for each anchor. + """ + # All new conv layers except the final one in the + # RetinaNet (classification) subnets are initialized + # with bias b = 0 and a Gaussian weight fill with stddev = 0.01. + options = { + 'kernel_size' : 3, + 'strides' : 1, + 'padding' : 'same', + 'kernel_initializer' : keras.initializers.normal(mean=0.0, stddev=0.01, seed=None), + 'bias_initializer' : 'zeros' + } + + if keras.backend.image_data_format() == 'channels_first': + inputs = keras.layers.Input(shape=(pyramid_feature_size, None, None)) + else: + inputs = keras.layers.Input(shape=(None, None, pyramid_feature_size)) + outputs = inputs + for i in range(4): + outputs = keras.layers.Conv2D( + filters=regression_feature_size, + activation='relu', + name='pyramid_regression_{}'.format(i), + **options + )(outputs) + + outputs = keras.layers.Conv2D(num_anchors * num_values, name='pyramid_regression', **options)(outputs) + if keras.backend.image_data_format() == 'channels_first': + outputs = keras.layers.Permute((2, 3, 1), name='pyramid_regression_permute')(outputs) + outputs = keras.layers.Reshape((-1, num_values), name='pyramid_regression_reshape')(outputs) + + return keras.models.Model(inputs=inputs, outputs=outputs, name=name) + + +def __create_pyramid_features(C3, C4, C5, feature_size=256): + """ Creates the FPN layers on top of the backbone features. 
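+ C5 is reduced with a 1x1 convolution and upsampled into C4, the result is merged and upsampled into C3 (the top-down pathway with lateral connections), and P6/P7 are added via stride-2 3x3 convolutions on C5 and on relu(P6), following the FPN/RetinaNet papers.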
+ + Args + C3 : Feature stage C3 from the backbone. + C4 : Feature stage C4 from the backbone. + C5 : Feature stage C5 from the backbone. + feature_size : The feature size to use for the resulting feature levels. + + Returns + A list of feature levels [P3, P4, P5, P6, P7]. + """ + # upsample C5 to get P5 from the FPN paper + P5 = keras.layers.Conv2D(feature_size, kernel_size=1, strides=1, padding='same', name='C5_reduced')(C5) + P5_upsampled = layers.UpsampleLike(name='P5_upsampled')([P5, C4]) + P5 = keras.layers.Conv2D(feature_size, kernel_size=3, strides=1, padding='same', name='P5')(P5) + + # add P5 elementwise to C4 + P4 = keras.layers.Conv2D(feature_size, kernel_size=1, strides=1, padding='same', name='C4_reduced')(C4) + P4 = keras.layers.Add(name='P4_merged')([P5_upsampled, P4]) + P4_upsampled = layers.UpsampleLike(name='P4_upsampled')([P4, C3]) + P4 = keras.layers.Conv2D(feature_size, kernel_size=3, strides=1, padding='same', name='P4')(P4) + + # add P4 elementwise to C3 + P3 = keras.layers.Conv2D(feature_size, kernel_size=1, strides=1, padding='same', name='C3_reduced')(C3) + P3 = keras.layers.Add(name='P3_merged')([P4_upsampled, P3]) + P3 = keras.layers.Conv2D(feature_size, kernel_size=3, strides=1, padding='same', name='P3')(P3) + + # "P6 is obtained via a 3x3 stride-2 conv on C5" + P6 = keras.layers.Conv2D(feature_size, kernel_size=3, strides=2, padding='same', name='P6')(C5) + + # "P7 is computed by applying ReLU followed by a 3x3 stride-2 conv on P6" + P7 = keras.layers.Activation('relu', name='C6_relu')(P6) + P7 = keras.layers.Conv2D(feature_size, kernel_size=3, strides=2, padding='same', name='P7')(P7) + + return [P3, P4, P5, P6, P7] + + +def default_submodels(num_classes, num_anchors): + """ Create a list of default submodels used for object detection. + + The default submodels contains a regression submodel and a classification submodel. + + Args + num_classes : Number of classes to use. + num_anchors : Number of base anchors. + + Returns + A list of tuple, where the first element is the name of the submodel and the second element is the submodel itself. + """ + return [ + ('regression', default_regression_model(4, num_anchors)), + ('classification', default_classification_model(num_classes, num_anchors)) + ] + + +def __build_model_pyramid(name, model, features): + """ Applies a single submodel to each FPN level. + + Args + name : Name of the submodel. + model : The submodel to evaluate. + features : The FPN features. + + Returns + A tensor containing the response from the submodel on the FPN features. + """ + return keras.layers.Concatenate(axis=1, name=name)([model(f) for f in features]) + + +def __build_pyramid(models, features): + """ Applies all submodels to each FPN level. + + Args + models : List of submodels to run on each pyramid level (by default only regression, classifcation). + features : The FPN features. + + Returns + A list of tensors, one for each submodel. + """ + return [__build_model_pyramid(n, m, features) for n, m in models] + + +def __build_anchors(anchor_parameters, features): + """ Builds anchors for the shape of the features from FPN. + + Args + anchor_parameters : Parameteres that determine how anchors are generated. + features : The FPN features. + + Returns + A tensor containing the anchors for the FPN features. 
+ + The shape is: + ``` + (batch_size, num_anchors, 4) + ``` + """ + anchors = [ + layers.Anchors( + size=anchor_parameters.sizes[i], + stride=anchor_parameters.strides[i], + ratios=anchor_parameters.ratios, + scales=anchor_parameters.scales, + name='anchors_{}'.format(i) + )(f) for i, f in enumerate(features) + ] + + return keras.layers.Concatenate(axis=1, name='anchors')(anchors) + + +def retinanet( + inputs, + backbone_layers, + num_classes, + num_anchors = None, + create_pyramid_features = __create_pyramid_features, + submodels = None, + name = 'retinanet' +): + """ Construct a RetinaNet model on top of a backbone. + + This model is the minimum model necessary for training (with the unfortunate exception of anchors as output). + + Args + inputs : keras.layers.Input (or list of) for the input to the model. + num_classes : Number of classes to classify. + num_anchors : Number of base anchors. + create_pyramid_features : Functor for creating pyramid features given the features C3, C4, C5 from the backbone. + submodels : Submodels to run on each feature map (default is regression and classification submodels). + name : Name of the model. + + Returns + A keras.models.Model which takes an image as input and outputs generated anchors and the result from each submodel on every pyramid level. + + The order of the outputs is as defined in submodels: + ``` + [ + regression, classification, other[0], other[1], ... + ] + ``` + """ + + if num_anchors is None: + num_anchors = AnchorParameters.default.num_anchors() + + if submodels is None: + submodels = default_submodels(num_classes, num_anchors) + + C3, C4, C5 = backbone_layers + + # compute pyramid features as per https://arxiv.org/abs/1708.02002 + features = create_pyramid_features(C3, C4, C5) + + # for all pyramid levels, run available submodels + pyramids = __build_pyramid(submodels, features) + + return keras.models.Model(inputs=inputs, outputs=pyramids, name=name) + + +def retinanet_bbox( + model = None, + nms = True, + class_specific_filter = True, + name = 'retinanet-bbox', + anchor_params = None, + nms_threshold = 0.5, + score_threshold = 0.05, + max_detections = 300, + parallel_iterations = 32, + **kwargs +): + """ Construct a RetinaNet model on top of a backbone and adds convenience functions to output boxes directly. + + This model uses the minimum retinanet model and appends a few layers to compute boxes within the graph. + These layers include applying the regression values to the anchors and performing NMS. + + Args + model : RetinaNet model to append bbox layers to. If None, it will create a RetinaNet model using **kwargs. + nms : Whether to use non-maximum suppression for the filtering step. + class_specific_filter : Whether to use class specific filtering or filter for the best scoring class only. + name : Name of the model. + anchor_params : Struct containing anchor parameters. If None, default values are used. + nms_threshold : Threshold for the IoU value to determine when a box should be suppressed. + score_threshold : Threshold used to prefilter the boxes with. + max_detections : Maximum number of detections to keep. + parallel_iterations : Number of batch items to process in parallel. + **kwargs : Additional kwargs to pass to the minimal retinanet model. + + Returns + A keras.models.Model which takes an image as input and outputs the detections on the image. + + The order is defined as follows: + ``` + [ + boxes, scores, labels, other[0], other[1], ... 
+ ] + ``` + """ + + # if no anchor parameters are passed, use default values + if anchor_params is None: + anchor_params = AnchorParameters.default + + # create RetinaNet model + if model is None: + model = retinanet(num_anchors=anchor_params.num_anchors(), **kwargs) + else: + assert_training_model(model) + + # compute the anchors + features = [model.get_layer(p_name).output for p_name in ['P3', 'P4', 'P5', 'P6', 'P7']] + anchors = __build_anchors(anchor_params, features) + + # we expect the anchors, regression and classification values as first output + regression = model.outputs[0] + classification = model.outputs[1] + + # "other" can be any additional output from custom submodels, by default this will be [] + other = model.outputs[2:] + + # apply predicted regression to anchors + boxes = layers.RegressBoxes(name='boxes')([anchors, regression]) + boxes = layers.ClipBoxes(name='clipped_boxes')([model.inputs[0], boxes]) + + # filter detections (apply NMS / score threshold / select top-k) + detections = layers.FilterDetections( + nms = nms, + class_specific_filter = class_specific_filter, + name = 'filtered_detections', + nms_threshold = nms_threshold, + score_threshold = score_threshold, + max_detections = max_detections, + parallel_iterations = parallel_iterations + )([boxes, classification] + other) + + # construct the model + return keras.models.Model(inputs=model.inputs, outputs=detections, name=name) diff --git a/src/keras_retinanet/models/senet.py b/src/keras_retinanet/models/senet.py new file mode 100644 index 0000000..deb1eac --- /dev/null +++ b/src/keras_retinanet/models/senet.py @@ -0,0 +1,155 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import keras +from keras.utils import get_file + +from . import retinanet +from . import Backbone +from classification_models.keras import Classifiers + + +class SeBackbone(Backbone): + """ Describes backbone information and provides utility functions. + """ + + def __init__(self, backbone): + super(SeBackbone, self).__init__(backbone) + _, self.preprocess_image_func = Classifiers.get(self.backbone) + + def retinanet(self, *args, **kwargs): + """ Returns a retinanet model using the correct backbone. + """ + return senet_retinanet(*args, backbone=self.backbone, **kwargs) + + def download_imagenet(self): + """ Downloads ImageNet weights and returns path to weights file. + """ + from classification_models.weights import WEIGHTS_COLLECTION + + weights_path = None + for el in WEIGHTS_COLLECTION: + if el['model'] == self.backbone and not el['include_top']: + weights_path = get_file(el['name'], el['url'], cache_subdir='models', file_hash=el['md5']) + + if weights_path is None: + raise ValueError('Unable to find imagenet weights for backbone {}!'.format(self.backbone)) + + return weights_path + + def validate(self): + """ Checks whether the backbone string is correct. 
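+ Raises a ValueError if the first part of the backbone string (before any underscore)
+ is not one of the supported SE backbones listed below.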
+ """ + allowed_backbones = ['seresnet18', 'seresnet34', 'seresnet50', 'seresnet101', 'seresnet152', + 'seresnext50', 'seresnext101', 'senet154'] + backbone = self.backbone.split('_')[0] + + if backbone not in allowed_backbones: + raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(backbone, allowed_backbones)) + + def preprocess_image(self, inputs): + """ Takes as input an image and prepares it for being passed through the network. + """ + return self.preprocess_image_func(inputs) + + +def senet_retinanet(num_classes, backbone='seresnext50', inputs=None, modifier=None, **kwargs): + """ Constructs a retinanet model using a resnet backbone. + + Args + num_classes: Number of classes to predict. + backbone: Which backbone to use (one of ('resnet50', 'resnet101', 'resnet152')). + inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)). + modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example). + + Returns + RetinaNet model with a ResNet backbone. + """ + # choose default input + if inputs is None: + if keras.backend.image_data_format() == 'channels_first': + inputs = keras.layers.Input(shape=(3, None, None)) + else: + # inputs = keras.layers.Input(shape=(224, 224, 3)) + inputs = keras.layers.Input(shape=(None, None, 3)) + + classifier, _ = Classifiers.get(backbone) + model = classifier(input_tensor=inputs, include_top=False, weights=None) + + # get last conv layer from the end of each block [28x28, 14x14, 7x7] + if backbone == 'seresnet18' or backbone == 'seresnet34': + layer_outputs = ['stage3_unit1_relu1', 'stage4_unit1_relu1', 'relu1'] + elif backbone == 'seresnet50': + layer_outputs = ['activation_36', 'activation_66', 'activation_81'] + elif backbone == 'seresnet101': + layer_outputs = ['activation_36', 'activation_151', 'activation_166'] + elif backbone == 'seresnet152': + layer_outputs = ['activation_56', 'activation_236', 'activation_251'] + elif backbone == 'seresnext50': + layer_outputs = ['activation_37', 'activation_67', 'activation_81'] + elif backbone == 'seresnext101': + layer_outputs = ['activation_37', 'activation_152', 'activation_166'] + elif backbone == 'senet154': + layer_outputs = ['activation_59', 'activation_239', 'activation_253'] + else: + raise ValueError('Backbone (\'{}\') is invalid.'.format(backbone)) + + layer_outputs = [ + model.get_layer(name=layer_outputs[0]).output, # 28x28 + model.get_layer(name=layer_outputs[1]).output, # 14x14 + model.get_layer(name=layer_outputs[2]).output, # 7x7 + ] + # create the densenet backbone + model = keras.models.Model(inputs=inputs, outputs=layer_outputs, name=model.name) + + # invoke modifier if given + if modifier: + model = modifier(model) + + # create the full model + return retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=model.outputs, **kwargs) + + +def seresnet18_retinanet(num_classes, inputs=None, **kwargs): + return senet_retinanet(num_classes=num_classes, backbone='seresnet18', inputs=inputs, **kwargs) + + +def seresnet34_retinanet(num_classes, inputs=None, **kwargs): + return senet_retinanet(num_classes=num_classes, backbone='seresnet34', inputs=inputs, **kwargs) + + +def seresnet50_retinanet(num_classes, inputs=None, **kwargs): + return senet_retinanet(num_classes=num_classes, backbone='seresnet50', inputs=inputs, **kwargs) + + +def seresnet101_retinanet(num_classes, inputs=None, **kwargs): + return senet_retinanet(num_classes=num_classes, 
backbone='seresnet101', inputs=inputs, **kwargs) + + +def seresnet152_retinanet(num_classes, inputs=None, **kwargs): + return senet_retinanet(num_classes=num_classes, backbone='seresnet152', inputs=inputs, **kwargs) + + +def seresnext50_retinanet(num_classes, inputs=None, **kwargs): + return senet_retinanet(num_classes=num_classes, backbone='seresnext50', inputs=inputs, **kwargs) + + +def seresnext101_retinanet(num_classes, inputs=None, **kwargs): + return senet_retinanet(num_classes=num_classes, backbone='seresnext101', inputs=inputs, **kwargs) + + +def senet154_retinanet(num_classes, inputs=None, **kwargs): + return senet_retinanet(num_classes=num_classes, backbone='senet154', inputs=inputs, **kwargs) diff --git a/src/keras_retinanet/models/vgg.py b/src/keras_retinanet/models/vgg.py new file mode 100644 index 0000000..fad7e4b --- /dev/null +++ b/src/keras_retinanet/models/vgg.py @@ -0,0 +1,99 @@ +""" +Copyright 2017-2018 cgratie (https://github.com/cgratie/) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + + +import keras +from keras.utils import get_file + +from . import retinanet +from . import Backbone +from ..utils.image import preprocess_image + + +class VGGBackbone(Backbone): + """ Describes backbone information and provides utility functions. + """ + + def retinanet(self, *args, **kwargs): + """ Returns a retinanet model using the correct backbone. + """ + return vgg_retinanet(*args, backbone=self.backbone, **kwargs) + + def download_imagenet(self): + """ Downloads ImageNet weights and returns path to weights file. + Weights can be downloaded at https://github.com/fizyr/keras-models/releases . + """ + if self.backbone == 'vgg16': + resource = keras.applications.vgg16.vgg16.WEIGHTS_PATH_NO_TOP + checksum = '6d6bbae143d832006294945121d1f1fc' + elif self.backbone == 'vgg19': + resource = keras.applications.vgg19.vgg19.WEIGHTS_PATH_NO_TOP + checksum = '253f8cb515780f3b799900260a226db6' + else: + raise ValueError("Backbone '{}' not recognized.".format(self.backbone)) + + return get_file( + '{}_weights_tf_dim_ordering_tf_kernels_notop.h5'.format(self.backbone), + resource, + cache_subdir='models', + file_hash=checksum + ) + + def validate(self): + """ Checks whether the backbone string is correct. + """ + allowed_backbones = ['vgg16', 'vgg19'] + + if self.backbone not in allowed_backbones: + raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(self.backbone, allowed_backbones)) + + def preprocess_image(self, inputs): + """ Takes as input an image and prepares it for being passed through the network. + """ + return preprocess_image(inputs, mode='caffe') + + +def vgg_retinanet(num_classes, backbone='vgg16', inputs=None, modifier=None, **kwargs): + """ Constructs a retinanet model using a vgg backbone. + + Args + num_classes: Number of classes to predict. + backbone: Which backbone to use (one of ('vgg16', 'vgg19')). + inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)). 
+ modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example). + + Returns + RetinaNet model with a VGG backbone. + """ + # choose default input + if inputs is None: + inputs = keras.layers.Input(shape=(None, None, 3)) + + # create the vgg backbone + if backbone == 'vgg16': + vgg = keras.applications.VGG16(input_tensor=inputs, include_top=False, weights=None) + elif backbone == 'vgg19': + vgg = keras.applications.VGG19(input_tensor=inputs, include_top=False, weights=None) + else: + raise ValueError("Backbone '{}' not recognized.".format(backbone)) + + if modifier: + vgg = modifier(vgg) + + # create the full model + layer_names = ["block3_pool", "block4_pool", "block5_pool"] + layer_outputs = [vgg.get_layer(name).output for name in layer_names] + return retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=layer_outputs, **kwargs) diff --git a/src/keras_retinanet/preprocessing/__init__.py b/src/keras_retinanet/preprocessing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/keras_retinanet/preprocessing/coco.py b/src/keras_retinanet/preprocessing/coco.py new file mode 100644 index 0000000..b684b80 --- /dev/null +++ b/src/keras_retinanet/preprocessing/coco.py @@ -0,0 +1,159 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from ..preprocessing.generator import Generator +from ..utils.image import read_image_bgr + +import os +import numpy as np + +from pycocotools.coco import COCO + + +class CocoGenerator(Generator): + """ Generate data from the COCO dataset. + + See https://github.com/cocodataset/cocoapi/tree/master/PythonAPI for more information. + """ + + def __init__(self, data_dir, set_name, **kwargs): + """ Initialize a COCO data generator. + + Args + data_dir: Path to where the COCO dataset is stored. + set_name: Name of the set to parse. + """ + self.data_dir = data_dir + self.set_name = set_name + self.coco = COCO(os.path.join(data_dir, 'annotations', 'instances_' + set_name + '.json')) + self.image_ids = self.coco.getImgIds() + + self.load_classes() + + super(CocoGenerator, self).__init__(**kwargs) + + def load_classes(self): + """ Loads the class to label mapping (and inverse) for COCO. + """ + # load class names (name -> label) + categories = self.coco.loadCats(self.coco.getCatIds()) + categories.sort(key=lambda x: x['id']) + + self.classes = {} + self.coco_labels = {} + self.coco_labels_inverse = {} + for c in categories: + self.coco_labels[len(self.classes)] = c['id'] + self.coco_labels_inverse[c['id']] = len(self.classes) + self.classes[c['name']] = len(self.classes) + + # also load the reverse (label -> name) + self.labels = {} + for key, value in self.classes.items(): + self.labels[value] = key + + def size(self): + """ Size of the COCO dataset. + """ + return len(self.image_ids) + + def num_classes(self): + """ Number of classes in the dataset. For COCO this is 80. 
+ """ + return len(self.classes) + + def has_label(self, label): + """ Return True if label is a known label. + """ + return label in self.labels + + def has_name(self, name): + """ Returns True if name is a known class. + """ + return name in self.classes + + def name_to_label(self, name): + """ Map name to label. + """ + return self.classes[name] + + def label_to_name(self, label): + """ Map label to name. + """ + return self.labels[label] + + def coco_label_to_label(self, coco_label): + """ Map COCO label to the label as used in the network. + COCO has some gaps in the order of labels. The highest label is 90, but there are 80 classes. + """ + return self.coco_labels_inverse[coco_label] + + def coco_label_to_name(self, coco_label): + """ Map COCO label to name. + """ + return self.label_to_name(self.coco_label_to_label(coco_label)) + + def label_to_coco_label(self, label): + """ Map label as used by the network to labels as used by COCO. + """ + return self.coco_labels[label] + + def image_path(self, image_index): + """ Returns the image path for image_index. + """ + image_info = self.coco.loadImgs(self.image_ids[image_index])[0] + path = os.path.join(self.data_dir, 'images', self.set_name, image_info['file_name']) + return path + + def image_aspect_ratio(self, image_index): + """ Compute the aspect ratio for an image with image_index. + """ + image = self.coco.loadImgs(self.image_ids[image_index])[0] + return float(image['width']) / float(image['height']) + + def load_image(self, image_index): + """ Load an image at the image_index. + """ + path = self.image_path(image_index) + return read_image_bgr(path) + + def load_annotations(self, image_index): + """ Load annotations for an image_index. + """ + # get ground truth annotations + annotations_ids = self.coco.getAnnIds(imgIds=self.image_ids[image_index], iscrowd=False) + annotations = {'labels': np.empty((0,)), 'bboxes': np.empty((0, 4))} + + # some images appear to miss annotations (like image with id 257034) + if len(annotations_ids) == 0: + return annotations + + # parse annotations + coco_annotations = self.coco.loadAnns(annotations_ids) + for idx, a in enumerate(coco_annotations): + # some annotations have basically no width / height, skip them + if a['bbox'][2] < 1 or a['bbox'][3] < 1: + continue + + annotations['labels'] = np.concatenate([annotations['labels'], [self.coco_label_to_label(a['category_id'])]], axis=0) + annotations['bboxes'] = np.concatenate([annotations['bboxes'], [[ + a['bbox'][0], + a['bbox'][1], + a['bbox'][0] + a['bbox'][2], + a['bbox'][1] + a['bbox'][3], + ]]], axis=0) + + return annotations diff --git a/src/keras_retinanet/preprocessing/csv_generator.py b/src/keras_retinanet/preprocessing/csv_generator.py new file mode 100644 index 0000000..c756224 --- /dev/null +++ b/src/keras_retinanet/preprocessing/csv_generator.py @@ -0,0 +1,225 @@ +""" +Copyright 2017-2018 yhenon (https://github.com/yhenon/) +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from .generator import Generator +from ..utils.image import read_image_bgr + +import numpy as np +from PIL import Image +from six import raise_from + +import csv +import sys +import os.path +from collections import OrderedDict + + +def _parse(value, function, fmt): + """ + Parse a string into a value, and format a nice ValueError if it fails. + + Returns `function(value)`. + Any `ValueError` raised is catched and a new `ValueError` is raised + with message `fmt.format(e)`, where `e` is the caught `ValueError`. + """ + try: + return function(value) + except ValueError as e: + raise_from(ValueError(fmt.format(e)), None) + + +def _read_classes(csv_reader): + """ Parse the classes file given by csv_reader. + """ + result = OrderedDict() + for line, row in enumerate(csv_reader): + line += 1 + + try: + class_name, class_id = row + except ValueError: + raise_from(ValueError('line {}: format should be \'class_name,class_id\''.format(line)), None) + class_id = _parse(class_id, int, 'line {}: malformed class ID: {{}}'.format(line)) + + if class_name in result: + raise ValueError('line {}: duplicate class name: \'{}\''.format(line, class_name)) + result[class_name] = class_id + return result + + +def _read_annotations(csv_reader, classes): + """ Read annotations from the csv_reader. + """ + result = OrderedDict() + for line, row in enumerate(csv_reader): + line += 1 + + try: + img_file, x1, y1, x2, y2, class_name = row[:6] + except ValueError: + raise_from(ValueError('line {}: format should be \'img_file,x1,y1,x2,y2,class_name\' or \'img_file,,,,,\''.format(line)), None) + + if img_file not in result: + result[img_file] = [] + + # If a row contains only an image path, it's an image without annotations. + if (x1, y1, x2, y2, class_name) == ('', '', '', '', ''): + continue + + x1 = _parse(x1, int, 'line {}: malformed x1: {{}}'.format(line)) + y1 = _parse(y1, int, 'line {}: malformed y1: {{}}'.format(line)) + x2 = _parse(x2, int, 'line {}: malformed x2: {{}}'.format(line)) + y2 = _parse(y2, int, 'line {}: malformed y2: {{}}'.format(line)) + + # Check that the bounding box is valid. + if x2 <= x1: + raise ValueError('line {}: x2 ({}) must be higher than x1 ({})'.format(line, x2, x1)) + if y2 <= y1: + raise ValueError('line {}: y2 ({}) must be higher than y1 ({})'.format(line, y2, y1)) + + # check if the current class name is correctly present + if class_name not in classes: + raise ValueError('line {}: unknown class name: \'{}\' (classes: {})'.format(line, class_name, classes)) + + result[img_file].append({'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2, 'class': class_name}) + return result + + +def _open_for_csv(path): + """ Open a file with flags suitable for csv.reader. + + This is different for python2 it means with mode 'rb', + for python3 this means 'r' with "universal newlines". + """ + if sys.version_info[0] < 3: + return open(path, 'rb') + else: + return open(path, 'r', newline='') + + +class CSVGenerator(Generator): + """ Generate data for a custom CSV dataset. + + See https://github.com/fizyr/keras-retinanet#csv-datasets for more information. + """ + + def __init__( + self, + csv_data_file, + csv_class_file, + base_dir=None, + **kwargs + ): + """ Initialize a CSV data generator. + + Args + csv_data_file: Path to the CSV annotations file. + csv_class_file: Path to the CSV classes file. + base_dir: Directory w.r.t. where the files are to be searched (defaults to the directory containing the csv_data_file). 
+ """ + self.image_names = [] + self.image_data = {} + self.base_dir = base_dir + + # Take base_dir from annotations file if not explicitly specified. + if self.base_dir is None: + self.base_dir = os.path.dirname(csv_data_file) + + # parse the provided class file + try: + with _open_for_csv(csv_class_file) as file: + self.classes = _read_classes(csv.reader(file, delimiter=',')) + except ValueError as e: + raise_from(ValueError('invalid CSV class file: {}: {}'.format(csv_class_file, e)), None) + + self.labels = {} + for key, value in self.classes.items(): + self.labels[value] = key + + # csv with img_path, x1, y1, x2, y2, class_name + try: + with _open_for_csv(csv_data_file) as file: + self.image_data = _read_annotations(csv.reader(file, delimiter=','), self.classes) + except ValueError as e: + raise_from(ValueError('invalid CSV annotations file: {}: {}'.format(csv_data_file, e)), None) + self.image_names = list(self.image_data.keys()) + + super(CSVGenerator, self).__init__(**kwargs) + + def size(self): + """ Size of the dataset. + """ + return len(self.image_names) + + def num_classes(self): + """ Number of classes in the dataset. + """ + return max(self.classes.values()) + 1 + + def has_label(self, label): + """ Return True if label is a known label. + """ + return label in self.labels + + def has_name(self, name): + """ Returns True if name is a known class. + """ + return name in self.classes + + def name_to_label(self, name): + """ Map name to label. + """ + return self.classes[name] + + def label_to_name(self, label): + """ Map label to name. + """ + return self.labels[label] + + def image_path(self, image_index): + """ Returns the image path for image_index. + """ + return os.path.join(self.base_dir, self.image_names[image_index]) + + def image_aspect_ratio(self, image_index): + """ Compute the aspect ratio for an image with image_index. + """ + # PIL is fast for metadata + image = Image.open(self.image_path(image_index)) + return float(image.width) / float(image.height) + + def load_image(self, image_index): + """ Load an image at the image_index. + """ + return read_image_bgr(self.image_path(image_index)) + + def load_annotations(self, image_index): + """ Load annotations for an image_index. + """ + path = self.image_names[image_index] + annotations = {'labels': np.empty((0,)), 'bboxes': np.empty((0, 4))} + + for idx, annot in enumerate(self.image_data[path]): + annotations['labels'] = np.concatenate((annotations['labels'], [self.name_to_label(annot['class'])])) + annotations['bboxes'] = np.concatenate((annotations['bboxes'], [[ + float(annot['x1']), + float(annot['y1']), + float(annot['x2']), + float(annot['y2']), + ]])) + + return annotations diff --git a/src/keras_retinanet/preprocessing/generator.py b/src/keras_retinanet/preprocessing/generator.py new file mode 100644 index 0000000..ae1c43b --- /dev/null +++ b/src/keras_retinanet/preprocessing/generator.py @@ -0,0 +1,377 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import numpy as np +import random +import warnings + +import keras + +from ..utils.anchors import ( + anchor_targets_bbox, + anchors_for_shape, + guess_shapes +) +from ..utils.config import parse_anchor_parameters +from ..utils.image import ( + TransformParameters, + adjust_transform_for_image, + apply_transform, + preprocess_image, + resize_image, +) +from ..utils.transform import transform_aabb + + +class Generator(keras.utils.Sequence): + """ Abstract generator class. + """ + + def __init__( + self, + transform_generator = None, + visual_effect_generator=None, + batch_size=1, + group_method='ratio', # one of 'none', 'random', 'ratio' + shuffle_groups=True, + image_min_side=800, + image_max_side=1333, + no_resize=False, + transform_parameters=None, + compute_anchor_targets=anchor_targets_bbox, + compute_shapes=guess_shapes, + preprocess_image=preprocess_image, + config=None + ): + """ Initialize Generator object. + + Args + transform_generator : A generator used to randomly transform images and annotations. + batch_size : The size of the batches to generate. + group_method : Determines how images are grouped together (defaults to 'ratio', one of ('none', 'random', 'ratio')). + shuffle_groups : If True, shuffles the groups each epoch. + image_min_side : After resizing the minimum side of an image is equal to image_min_side. + image_max_side : If after resizing the maximum side is larger than image_max_side, scales down further so that the max side is equal to image_max_side. + no_resize : If True, no image/annotation resizing is performed. + transform_parameters : The transform parameters used for data augmentation. + compute_anchor_targets : Function handler for computing the targets of anchors for an image and its annotations. + compute_shapes : Function handler for computing the shapes of the pyramid for a given input. + preprocess_image : Function handler for preprocessing an image (scaling / normalizing) for passing through a network. + """ + self.transform_generator = transform_generator + self.visual_effect_generator = visual_effect_generator + self.batch_size = int(batch_size) + self.group_method = group_method + self.shuffle_groups = shuffle_groups + self.image_min_side = image_min_side + self.image_max_side = image_max_side + self.no_resize = no_resize + self.transform_parameters = transform_parameters or TransformParameters() + self.compute_anchor_targets = compute_anchor_targets + self.compute_shapes = compute_shapes + self.preprocess_image = preprocess_image + self.config = config + + # Define groups + self.group_images() + + # Shuffle when initializing + if self.shuffle_groups: + self.on_epoch_end() + + def on_epoch_end(self): + if self.shuffle_groups: + random.shuffle(self.groups) + + def size(self): + """ Size of the dataset. + """ + raise NotImplementedError('size method not implemented') + + def num_classes(self): + """ Number of classes in the dataset. + """ + raise NotImplementedError('num_classes method not implemented') + + def has_label(self, label): + """ Returns True if label is a known label. + """ + raise NotImplementedError('has_label method not implemented') + + def has_name(self, name): + """ Returns True if name is a known class. + """ + raise NotImplementedError('has_name method not implemented') + + def name_to_label(self, name): + """ Map name to label. + """ + raise NotImplementedError('name_to_label method not implemented') + + def label_to_name(self, label): + """ Map label to name. 
+ """ + raise NotImplementedError('label_to_name method not implemented') + + def image_aspect_ratio(self, image_index): + """ Compute the aspect ratio for an image with image_index. + """ + raise NotImplementedError('image_aspect_ratio method not implemented') + + def image_path(self, image_index): + """ Get the path to an image. + """ + raise NotImplementedError('image_path method not implemented') + + def load_image(self, image_index): + """ Load an image at the image_index. + """ + raise NotImplementedError('load_image method not implemented') + + def load_annotations(self, image_index): + """ Load annotations for an image_index. + """ + raise NotImplementedError('load_annotations method not implemented') + + def load_annotations_group(self, group): + """ Load annotations for all images in group. + """ + annotations_group = [self.load_annotations(image_index) for image_index in group] + for annotations in annotations_group: + assert(isinstance(annotations, dict)), '\'load_annotations\' should return a list of dictionaries, received: {}'.format(type(annotations)) + assert('labels' in annotations), '\'load_annotations\' should return a list of dictionaries that contain \'labels\' and \'bboxes\'.' + assert('bboxes' in annotations), '\'load_annotations\' should return a list of dictionaries that contain \'labels\' and \'bboxes\'.' + + return annotations_group + + def filter_annotations(self, image_group, annotations_group, group): + """ Filter annotations by removing those that are outside of the image bounds or whose width/height < 0. + """ + # test all annotations + for index, (image, annotations) in enumerate(zip(image_group, annotations_group)): + # test x2 < x1 | y2 < y1 | x1 < 0 | y1 < 0 | x2 <= 0 | y2 <= 0 | x2 >= image.shape[1] | y2 >= image.shape[0] + invalid_indices = np.where( + (annotations['bboxes'][:, 2] <= annotations['bboxes'][:, 0]) | + (annotations['bboxes'][:, 3] <= annotations['bboxes'][:, 1]) | + (annotations['bboxes'][:, 0] < 0) | + (annotations['bboxes'][:, 1] < 0) | + (annotations['bboxes'][:, 2] > image.shape[1]) | + (annotations['bboxes'][:, 3] > image.shape[0]) + )[0] + + # delete invalid indices + if len(invalid_indices): + warnings.warn('Image {} with id {} (shape {}) contains the following invalid boxes: {}.'.format( + self.image_path(group[index]), + group[index], + image.shape, + annotations['bboxes'][invalid_indices, :] + )) + for k in annotations_group[index].keys(): + annotations_group[index][k] = np.delete(annotations[k], invalid_indices, axis=0) + return image_group, annotations_group + + def load_image_group(self, group): + """ Load images for all images in a group. + """ + return [self.load_image(image_index) for image_index in group] + + def random_visual_effect_group_entry(self, image, annotations): + """ Randomly transforms image and annotation. + """ + visual_effect = next(self.visual_effect_generator) + # apply visual effect + image = visual_effect(image) + return image, annotations + + def random_visual_effect_group(self, image_group, annotations_group): + """ Randomly apply visual effect on each image. 
+ """ + assert(len(image_group) == len(annotations_group)) + + if self.visual_effect_generator is None: + # do nothing + return image_group, annotations_group + + for index in range(len(image_group)): + # apply effect on a single group entry + image_group[index], annotations_group[index] = self.random_visual_effect_group_entry( + image_group[index], annotations_group[index] + ) + + return image_group, annotations_group + + def random_transform_group_entry(self, image, annotations, transform=None): + """ Randomly transforms image and annotation. + """ + # randomly transform both image and annotations + if transform is not None or self.transform_generator: + if transform is None: + transform = adjust_transform_for_image(next(self.transform_generator), image, self.transform_parameters.relative_translation) + + # apply transformation to image + image = apply_transform(transform, image, self.transform_parameters) + + # Transform the bounding boxes in the annotations. + annotations['bboxes'] = annotations['bboxes'].copy() + for index in range(annotations['bboxes'].shape[0]): + annotations['bboxes'][index, :] = transform_aabb(transform, annotations['bboxes'][index, :]) + + return image, annotations + + def random_transform_group(self, image_group, annotations_group): + """ Randomly transforms each image and its annotations. + """ + + assert(len(image_group) == len(annotations_group)) + + for index in range(len(image_group)): + # transform a single group entry + image_group[index], annotations_group[index] = self.random_transform_group_entry(image_group[index], annotations_group[index]) + + return image_group, annotations_group + + def resize_image(self, image): + """ Resize an image using image_min_side and image_max_side. + """ + if self.no_resize: + return image, 1 + else: + return resize_image(image, min_side=self.image_min_side, max_side=self.image_max_side) + + def preprocess_group_entry(self, image, annotations): + """ Preprocess image and its annotations. + """ + # preprocess the image + image = self.preprocess_image(image) + + # resize image + image, image_scale = self.resize_image(image) + + # apply resizing to annotations too + annotations['bboxes'] *= image_scale + + # convert to the wanted keras floatx + image = keras.backend.cast_to_floatx(image) + + return image, annotations + + def preprocess_group(self, image_group, annotations_group): + """ Preprocess each image and its annotations in its group. + """ + assert(len(image_group) == len(annotations_group)) + + for index in range(len(image_group)): + # preprocess a single group entry + image_group[index], annotations_group[index] = self.preprocess_group_entry(image_group[index], annotations_group[index]) + + return image_group, annotations_group + + def group_images(self): + """ Order the images according to self.order and makes groups of self.batch_size. + """ + # determine the order of the images + order = list(range(self.size())) + if self.group_method == 'random': + random.shuffle(order) + elif self.group_method == 'ratio': + order.sort(key=lambda x: self.image_aspect_ratio(x)) + + # divide into groups, one group = one batch + self.groups = [[order[x % len(order)] for x in range(i, i + self.batch_size)] for i in range(0, len(order), self.batch_size)] + + def compute_inputs(self, image_group): + """ Compute inputs for the network using an image_group. 
+ """ + # get the max image shape + max_shape = tuple(max(image.shape[x] for image in image_group) for x in range(3)) + + # construct an image batch object + image_batch = np.zeros((self.batch_size,) + max_shape, dtype=keras.backend.floatx()) + + # copy all images to the upper left part of the image batch object + for image_index, image in enumerate(image_group): + image_batch[image_index, :image.shape[0], :image.shape[1], :image.shape[2]] = image + + if keras.backend.image_data_format() == 'channels_first': + image_batch = image_batch.transpose((0, 3, 1, 2)) + + return image_batch + + def generate_anchors(self, image_shape): + anchor_params = None + if self.config and 'anchor_parameters' in self.config: + anchor_params = parse_anchor_parameters(self.config) + return anchors_for_shape(image_shape, anchor_params=anchor_params, shapes_callback=self.compute_shapes) + + def compute_targets(self, image_group, annotations_group): + """ Compute target outputs for the network using images and their annotations. + """ + # get the max image shape + max_shape = tuple(max(image.shape[x] for image in image_group) for x in range(3)) + anchors = self.generate_anchors(max_shape) + + batches = self.compute_anchor_targets( + anchors, + image_group, + annotations_group, + self.num_classes() + ) + + return list(batches) + + def compute_input_output(self, group): + """ Compute inputs and target outputs for the network. + """ + # load images and annotations + image_group = self.load_image_group(group) + annotations_group = self.load_annotations_group(group) + + # check validity of annotations + image_group, annotations_group = self.filter_annotations(image_group, annotations_group, group) + + # randomly apply visual effect + image_group, annotations_group = self.random_visual_effect_group(image_group, annotations_group) + + # randomly transform data + image_group, annotations_group = self.random_transform_group(image_group, annotations_group) + + # perform preprocessing steps + image_group, annotations_group = self.preprocess_group(image_group, annotations_group) + + # compute network inputs + inputs = self.compute_inputs(image_group) + + # compute network targets + targets = self.compute_targets(image_group, annotations_group) + + return inputs, targets + + def __len__(self): + """ + Number of batches for generator. + """ + + return len(self.groups) + + def __getitem__(self, index): + """ + Keras sequence method for generating batches. + """ + group = self.groups[index] + inputs, targets = self.compute_input_output(group) + + return inputs, targets diff --git a/src/keras_retinanet/preprocessing/kitti.py b/src/keras_retinanet/preprocessing/kitti.py new file mode 100644 index 0000000..5922558 --- /dev/null +++ b/src/keras_retinanet/preprocessing/kitti.py @@ -0,0 +1,168 @@ +""" +Copyright 2017-2018 lvaleriu (https://github.com/lvaleriu/) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import csv +import os.path + +import numpy as np +from PIL import Image + +from .generator import Generator +from ..utils.image import read_image_bgr + +kitti_classes = { + 'Car': 0, + 'Van': 1, + 'Truck': 2, + 'Pedestrian': 3, + 'Person_sitting': 4, + 'Cyclist': 5, + 'Tram': 6, + 'Misc': 7, + 'DontCare': 7 +} + + +class KittiGenerator(Generator): + """ Generate data for a KITTI dataset. + + See http://www.cvlibs.net/datasets/kitti/ for more information. + """ + + def __init__( + self, + base_dir, + subset='train', + **kwargs + ): + """ Initialize a KITTI data generator. + + Args + base_dir: Directory w.r.t. where the files are to be searched (defaults to the directory containing the csv_data_file). + subset: The subset to generate data for (defaults to 'train'). + """ + self.base_dir = base_dir + + label_dir = os.path.join(self.base_dir, subset, 'labels') + image_dir = os.path.join(self.base_dir, subset, 'images') + + """ + 1 type Describes the type of object: 'Car', 'Van', 'Truck', + 'Pedestrian', 'Person_sitting', 'Cyclist', 'Tram', + 'Misc' or 'DontCare' + 1 truncated Float from 0 (non-truncated) to 1 (truncated), where + truncated refers to the object leaving image boundaries + 1 occluded Integer (0,1,2,3) indicating occlusion state: + 0 = fully visible, 1 = partly occluded + 2 = largely occluded, 3 = unknown + 1 alpha Observation angle of object, ranging [-pi..pi] + 4 bbox 2D bounding box of object in the image (0-based index): + contains left, top, right, bottom pixel coordinates + 3 dimensions 3D object dimensions: height, width, length (in meters) + 3 location 3D object location x,y,z in camera coordinates (in meters) + 1 rotation_y Rotation ry around Y-axis in camera coordinates [-pi..pi] + """ + + self.labels = {} + self.classes = kitti_classes + for name, label in self.classes.items(): + self.labels[label] = name + + self.image_data = dict() + self.images = [] + for i, fn in enumerate(os.listdir(label_dir)): + label_fp = os.path.join(label_dir, fn) + image_fp = os.path.join(image_dir, fn.replace('.txt', '.png')) + + self.images.append(image_fp) + + fieldnames = ['type', 'truncated', 'occluded', 'alpha', 'left', 'top', 'right', 'bottom', 'dh', 'dw', 'dl', + 'lx', 'ly', 'lz', 'ry'] + with open(label_fp, 'r') as csv_file: + reader = csv.DictReader(csv_file, delimiter=' ', fieldnames=fieldnames) + boxes = [] + for line, row in enumerate(reader): + label = row['type'] + cls_id = kitti_classes[label] + + annotation = {'cls_id': cls_id, 'x1': row['left'], 'x2': row['right'], 'y2': row['bottom'], 'y1': row['top']} + boxes.append(annotation) + + self.image_data[i] = boxes + + super(KittiGenerator, self).__init__(**kwargs) + + def size(self): + """ Size of the dataset. + """ + return len(self.images) + + def num_classes(self): + """ Number of classes in the dataset. + """ + return max(self.classes.values()) + 1 + + def has_label(self, label): + """ Return True if label is a known label. + """ + return label in self.labels + + def has_name(self, name): + """ Returns True if name is a known class. + """ + return name in self.classes + + def name_to_label(self, name): + """ Map name to label. + """ + raise NotImplementedError() + + def label_to_name(self, label): + """ Map label to name. + """ + return self.labels[label] + + def image_aspect_ratio(self, image_index): + """ Compute the aspect ratio for an image with image_index. 
+ """ + # PIL is fast for metadata + image = Image.open(self.images[image_index]) + return float(image.width) / float(image.height) + + def image_path(self, image_index): + """ Get the path to an image. + """ + return self.images[image_index] + + def load_image(self, image_index): + """ Load an image at the image_index. + """ + return read_image_bgr(self.image_path(image_index)) + + def load_annotations(self, image_index): + """ Load annotations for an image_index. + """ + image_data = self.image_data[image_index] + annotations = {'labels': np.empty((len(image_data),)), 'bboxes': np.empty((len(image_data), 4))} + + for idx, ann in enumerate(image_data): + annotations['bboxes'][idx, 0] = float(ann['x1']) + annotations['bboxes'][idx, 1] = float(ann['y1']) + annotations['bboxes'][idx, 2] = float(ann['x2']) + annotations['bboxes'][idx, 3] = float(ann['y2']) + annotations['labels'][idx] = int(ann['cls_id']) + + return annotations diff --git a/src/keras_retinanet/preprocessing/open_images.py b/src/keras_retinanet/preprocessing/open_images.py new file mode 100644 index 0000000..a5ac737 --- /dev/null +++ b/src/keras_retinanet/preprocessing/open_images.py @@ -0,0 +1,375 @@ +""" +Copyright 2017-2018 lvaleriu (https://github.com/lvaleriu/) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import csv +import json +import os +import warnings + +import numpy as np +from PIL import Image + +from .generator import Generator +from ..utils.image import read_image_bgr + + +def load_hierarchy(metadata_dir, version='v4'): + hierarchy = None + if version == 'challenge2018': + hierarchy = 'bbox_labels_500_hierarchy.json' + elif version == 'v4': + hierarchy = 'bbox_labels_600_hierarchy.json' + elif version == 'v3': + hierarchy = 'bbox_labels_600_hierarchy.json' + + hierarchy_json = os.path.join(metadata_dir, hierarchy) + with open(hierarchy_json) as f: + hierarchy_data = json.loads(f.read()) + + return hierarchy_data + + +def load_hierarchy_children(hierarchy): + res = [hierarchy['LabelName']] + + if 'Subcategory' in hierarchy: + for subcategory in hierarchy['Subcategory']: + children = load_hierarchy_children(subcategory) + + for c in children: + res.append(c) + + return res + + +def find_hierarchy_parent(hierarchy, parent_cls): + if hierarchy['LabelName'] == parent_cls: + return hierarchy + elif 'Subcategory' in hierarchy: + for child in hierarchy['Subcategory']: + res = find_hierarchy_parent(child, parent_cls) + if res is not None: + return res + + return None + + +def get_labels(metadata_dir, version='v4'): + if version == 'v4' or version == 'challenge2018': + csv_file = 'class-descriptions-boxable.csv' if version == 'v4' else 'challenge-2018-class-descriptions-500.csv' + + boxable_classes_descriptions = os.path.join(metadata_dir, csv_file) + id_to_labels = {} + cls_index = {} + + i = 0 + with open(boxable_classes_descriptions) as f: + for row in csv.reader(f): + # make sure the csv row is not empty (usually the last one) + if len(row): + label = row[0] + description = row[1].replace("\"", "").replace("'", "").replace('`', '') + + id_to_labels[i] = description + cls_index[label] = i + + i += 1 + else: + trainable_classes_path = os.path.join(metadata_dir, 'classes-bbox-trainable.txt') + description_path = os.path.join(metadata_dir, 'class-descriptions.csv') + + description_table = {} + with open(description_path) as f: + for row in csv.reader(f): + # make sure the csv row is not empty (usually the last one) + if len(row): + description_table[row[0]] = row[1].replace("\"", "").replace("'", "").replace('`', '') + + with open(trainable_classes_path, 'rb') as f: + trainable_classes = f.read().split('\n') + + id_to_labels = dict([(i, description_table[c]) for i, c in enumerate(trainable_classes)]) + cls_index = dict([(c, i) for i, c in enumerate(trainable_classes)]) + + return id_to_labels, cls_index + + +def generate_images_annotations_json(main_dir, metadata_dir, subset, cls_index, version='v4'): + validation_image_ids = {} + + if version == 'v4': + annotations_path = os.path.join(metadata_dir, subset, '{}-annotations-bbox.csv'.format(subset)) + elif version == 'challenge2018': + validation_image_ids_path = os.path.join(metadata_dir, 'challenge-2018-image-ids-valset-od.csv') + + with open(validation_image_ids_path, 'r') as csv_file: + reader = csv.DictReader(csv_file, fieldnames=['ImageID']) + next(reader) + for line, row in enumerate(reader): + image_id = row['ImageID'] + validation_image_ids[image_id] = True + + annotations_path = os.path.join(metadata_dir, 'challenge-2018-train-annotations-bbox.csv') + else: + annotations_path = os.path.join(metadata_dir, subset, 'annotations-human-bbox.csv') + + fieldnames = ['ImageID', 'Source', 'LabelName', 'Confidence', + 'XMin', 'XMax', 'YMin', 'YMax', + 'IsOccluded', 'IsTruncated', 'IsGroupOf', 'IsDepiction', 'IsInside'] + + 
id_annotations = dict() + with open(annotations_path, 'r') as csv_file: + reader = csv.DictReader(csv_file, fieldnames=fieldnames) + next(reader) + + images_sizes = {} + for line, row in enumerate(reader): + frame = row['ImageID'] + + if version == 'challenge2018': + if subset == 'train': + if frame in validation_image_ids: + continue + elif subset == 'validation': + if frame not in validation_image_ids: + continue + else: + raise NotImplementedError('This generator handles only the train and validation subsets') + + class_name = row['LabelName'] + + if class_name not in cls_index: + continue + + cls_id = cls_index[class_name] + + if version == 'challenge2018': + # We recommend participants to use the provided subset of the training set as a validation set. + # This is preferable over using the V4 val/test sets, as the training set is more densely annotated. + img_path = os.path.join(main_dir, 'images', 'train', frame + '.jpg') + else: + img_path = os.path.join(main_dir, 'images', subset, frame + '.jpg') + + if frame in images_sizes: + width, height = images_sizes[frame] + else: + try: + with Image.open(img_path) as img: + width, height = img.width, img.height + images_sizes[frame] = (width, height) + except Exception as ex: + if version == 'challenge2018': + raise ex + continue + + x1 = float(row['XMin']) + x2 = float(row['XMax']) + y1 = float(row['YMin']) + y2 = float(row['YMax']) + + x1_int = int(round(x1 * width)) + x2_int = int(round(x2 * width)) + y1_int = int(round(y1 * height)) + y2_int = int(round(y2 * height)) + + # Check that the bounding box is valid. + if x2 <= x1: + raise ValueError('line {}: x2 ({}) must be higher than x1 ({})'.format(line, x2, x1)) + if y2 <= y1: + raise ValueError('line {}: y2 ({}) must be higher than y1 ({})'.format(line, y2, y1)) + + if y2_int == y1_int: + warnings.warn('filtering line {}: rounding y2 ({}) and y1 ({}) makes them equal'.format(line, y2, y1)) + continue + + if x2_int == x1_int: + warnings.warn('filtering line {}: rounding x2 ({}) and x1 ({}) makes them equal'.format(line, x2, x1)) + continue + + img_id = row['ImageID'] + annotation = {'cls_id': cls_id, 'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2} + + if img_id in id_annotations: + annotations = id_annotations[img_id] + annotations['boxes'].append(annotation) + else: + id_annotations[img_id] = {'w': width, 'h': height, 'boxes': [annotation]} + return id_annotations + + +class OpenImagesGenerator(Generator): + def __init__( + self, main_dir, subset, version='v4', + labels_filter=None, annotation_cache_dir='.', + parent_label=None, + **kwargs + ): + if version == 'challenge2018': + metadata = 'challenge2018' + elif version == 'v4': + metadata = '2018_04' + elif version == 'v3': + metadata = '2017_11' + else: + raise NotImplementedError('There is currently no implementation for versions older than v3') + + if version == 'challenge2018': + self.base_dir = os.path.join(main_dir, 'images', 'train') + else: + self.base_dir = os.path.join(main_dir, 'images', subset) + + metadata_dir = os.path.join(main_dir, metadata) + annotation_cache_json = os.path.join(annotation_cache_dir, subset + '.json') + + self.hierarchy = load_hierarchy(metadata_dir, version=version) + id_to_labels, cls_index = get_labels(metadata_dir, version=version) + + if os.path.exists(annotation_cache_json): + with open(annotation_cache_json, 'r') as f: + self.annotations = json.loads(f.read()) + else: + self.annotations = generate_images_annotations_json(main_dir, metadata_dir, subset, cls_index, version=version) + 
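+ # Cache the freshly parsed annotations as JSON so later runs can load them directly
+ # instead of re-parsing the large annotation CSV (see the os.path.exists check above).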
json.dump(self.annotations, open(annotation_cache_json, "w")) + + if labels_filter is not None or parent_label is not None: + self.id_to_labels, self.annotations = self.__filter_data(id_to_labels, cls_index, labels_filter, parent_label) + else: + self.id_to_labels = id_to_labels + + self.id_to_image_id = dict([(i, k) for i, k in enumerate(self.annotations)]) + + super(OpenImagesGenerator, self).__init__(**kwargs) + + def __filter_data(self, id_to_labels, cls_index, labels_filter=None, parent_label=None): + """ + If you want to work with a subset of the labels just set a list with trainable labels + :param labels_filter: Ex: labels_filter = ['Helmet', 'Hat', 'Analog television'] + :param parent_label: If parent_label is set this will bring you the parent label + but also its children in the semantic hierarchy as defined in OID, ex: Animal + hierarchical tree + :return: + """ + + children_id_to_labels = {} + + if parent_label is None: + # there is/are no other sublabel(s) other than the labels itself + + for label in labels_filter: + for i, lb in id_to_labels.items(): + if lb == label: + children_id_to_labels[i] = label + break + else: + parent_cls = None + for i, lb in id_to_labels.items(): + if lb == parent_label: + parent_id = i + for c, index in cls_index.items(): + if index == parent_id: + parent_cls = c + break + + if parent_cls is None: + raise Exception('Couldnt find label {}'.format(parent_label)) + + parent_tree = find_hierarchy_parent(self.hierarchy, parent_cls) + + if parent_tree is None: + raise Exception('Couldnt find parent {} in the semantic hierarchical tree'.format(parent_label)) + + children = load_hierarchy_children(parent_tree) + + for cls in children: + index = cls_index[cls] + label = id_to_labels[index] + children_id_to_labels[index] = label + + id_map = dict([(ind, i) for i, ind in enumerate(children_id_to_labels.keys())]) + + filtered_annotations = {} + for k in self.annotations: + img_ann = self.annotations[k] + + filtered_boxes = [] + for ann in img_ann['boxes']: + cls_id = ann['cls_id'] + if cls_id in children_id_to_labels: + ann['cls_id'] = id_map[cls_id] + filtered_boxes.append(ann) + + if len(filtered_boxes) > 0: + filtered_annotations[k] = {'w': img_ann['w'], 'h': img_ann['h'], 'boxes': filtered_boxes} + + children_id_to_labels = dict([(id_map[i], l) for (i, l) in children_id_to_labels.items()]) + + return children_id_to_labels, filtered_annotations + + def size(self): + return len(self.annotations) + + def num_classes(self): + return len(self.id_to_labels) + + def has_label(self, label): + """ Return True if label is a known label. + """ + return label in self.id_to_labels + + def has_name(self, name): + """ Returns True if name is a known class. 
+ """ + raise NotImplementedError() + + def name_to_label(self, name): + raise NotImplementedError() + + def label_to_name(self, label): + return self.id_to_labels[label] + + def image_aspect_ratio(self, image_index): + img_annotations = self.annotations[self.id_to_image_id[image_index]] + height, width = img_annotations['h'], img_annotations['w'] + return float(width) / float(height) + + def image_path(self, image_index): + path = os.path.join(self.base_dir, self.id_to_image_id[image_index] + '.jpg') + return path + + def load_image(self, image_index): + return read_image_bgr(self.image_path(image_index)) + + def load_annotations(self, image_index): + image_annotations = self.annotations[self.id_to_image_id[image_index]] + + labels = image_annotations['boxes'] + height, width = image_annotations['h'], image_annotations['w'] + + annotations = {'labels': np.empty((len(labels),)), 'bboxes': np.empty((len(labels), 4))} + for idx, ann in enumerate(labels): + cls_id = ann['cls_id'] + x1 = ann['x1'] * width + x2 = ann['x2'] * width + y1 = ann['y1'] * height + y2 = ann['y2'] * height + + annotations['bboxes'][idx, 0] = x1 + annotations['bboxes'][idx, 1] = y1 + annotations['bboxes'][idx, 2] = x2 + annotations['bboxes'][idx, 3] = y2 + annotations['labels'][idx] = cls_id + + return annotations diff --git a/src/keras_retinanet/preprocessing/pascal_voc.py b/src/keras_retinanet/preprocessing/pascal_voc.py new file mode 100644 index 0000000..564fb37 --- /dev/null +++ b/src/keras_retinanet/preprocessing/pascal_voc.py @@ -0,0 +1,203 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from ..preprocessing.generator import Generator +from ..utils.image import read_image_bgr + +import os +import numpy as np +from six import raise_from +from PIL import Image + +try: + import xml.etree.cElementTree as ET +except ImportError: + import xml.etree.ElementTree as ET + +voc_classes = { + 'aeroplane' : 0, + 'bicycle' : 1, + 'bird' : 2, + 'boat' : 3, + 'bottle' : 4, + 'bus' : 5, + 'car' : 6, + 'cat' : 7, + 'chair' : 8, + 'cow' : 9, + 'diningtable' : 10, + 'dog' : 11, + 'horse' : 12, + 'motorbike' : 13, + 'person' : 14, + 'pottedplant' : 15, + 'sheep' : 16, + 'sofa' : 17, + 'train' : 18, + 'tvmonitor' : 19 +} + + +def _findNode(parent, name, debug_name=None, parse=None): + if debug_name is None: + debug_name = name + + result = parent.find(name) + if result is None: + raise ValueError('missing element \'{}\''.format(debug_name)) + if parse is not None: + try: + return parse(result.text) + except ValueError as e: + raise_from(ValueError('illegal value for \'{}\': {}'.format(debug_name, e)), None) + return result + + +class PascalVocGenerator(Generator): + """ Generate data for a Pascal VOC dataset. + + See http://host.robots.ox.ac.uk/pascal/VOC/ for more information. + """ + + def __init__( + self, + data_dir, + set_name, + classes=voc_classes, + image_extension='.jpg', + skip_truncated=False, + skip_difficult=False, + **kwargs + ): + """ Initialize a Pascal VOC data generator. 
+ + Args + base_dir: Directory w.r.t. where the files are to be searched (defaults to the directory containing the csv_data_file). + csv_class_file: Path to the CSV classes file. + """ + self.data_dir = data_dir + self.set_name = set_name + self.classes = classes + self.image_names = [l.strip().split(None, 1)[0] for l in open(os.path.join(data_dir, 'ImageSets', 'Main', set_name + '.txt')).readlines()] + self.image_extension = image_extension + self.skip_truncated = skip_truncated + self.skip_difficult = skip_difficult + + self.labels = {} + for key, value in self.classes.items(): + self.labels[value] = key + + super(PascalVocGenerator, self).__init__(**kwargs) + + def size(self): + """ Size of the dataset. + """ + return len(self.image_names) + + def num_classes(self): + """ Number of classes in the dataset. + """ + return len(self.classes) + + def has_label(self, label): + """ Return True if label is a known label. + """ + return label in self.labels + + def has_name(self, name): + """ Returns True if name is a known class. + """ + return name in self.classes + + def name_to_label(self, name): + """ Map name to label. + """ + return self.classes[name] + + def label_to_name(self, label): + """ Map label to name. + """ + return self.labels[label] + + def image_aspect_ratio(self, image_index): + """ Compute the aspect ratio for an image with image_index. + """ + path = os.path.join(self.data_dir, 'JPEGImages', self.image_names[image_index] + self.image_extension) + image = Image.open(path) + return float(image.width) / float(image.height) + + def image_path(self, image_index): + """ Get the path to an image. + """ + return os.path.join(self.data_dir, 'JPEGImages', self.image_names[image_index] + self.image_extension) + + def load_image(self, image_index): + """ Load an image at the image_index. + """ + return read_image_bgr(self.image_path(image_index)) + + def __parse_annotation(self, element): + """ Parse an annotation given an XML element. + """ + truncated = _findNode(element, 'truncated', parse=int) + difficult = _findNode(element, 'difficult', parse=int) + + class_name = _findNode(element, 'name').text + if class_name not in self.classes: + raise ValueError('class name \'{}\' not found in classes: {}'.format(class_name, list(self.classes.keys()))) + + box = np.zeros((4,)) + label = self.name_to_label(class_name) + + bndbox = _findNode(element, 'bndbox') + box[0] = _findNode(bndbox, 'xmin', 'bndbox.xmin', parse=float) - 1 + box[1] = _findNode(bndbox, 'ymin', 'bndbox.ymin', parse=float) - 1 + box[2] = _findNode(bndbox, 'xmax', 'bndbox.xmax', parse=float) - 1 + box[3] = _findNode(bndbox, 'ymax', 'bndbox.ymax', parse=float) - 1 + + return truncated, difficult, box, label + + def __parse_annotations(self, xml_root): + """ Parse all annotations under the xml_root. + """ + annotations = {'labels': np.empty((len(xml_root.findall('object')),)), 'bboxes': np.empty((len(xml_root.findall('object')), 4))} + for i, element in enumerate(xml_root.iter('object')): + try: + truncated, difficult, box, label = self.__parse_annotation(element) + except ValueError as e: + raise_from(ValueError('could not parse object #{}: {}'.format(i, e)), None) + + if truncated and self.skip_truncated: + continue + if difficult and self.skip_difficult: + continue + + annotations['bboxes'][i, :] = box + annotations['labels'][i] = label + + return annotations + + def load_annotations(self, image_index): + """ Load annotations for an image_index. 
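+        Returns a dict with 'labels' and 'bboxes' numpy arrays parsed from the VOC XML annotation file.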
+ """ + filename = self.image_names[image_index] + '.xml' + try: + tree = ET.parse(os.path.join(self.data_dir, 'Annotations', filename)) + return self.__parse_annotations(tree.getroot()) + except ET.ParseError as e: + raise_from(ValueError('invalid annotations file: {}: {}'.format(filename, e)), None) + except ValueError as e: + raise_from(ValueError('invalid annotations file: {}: {}'.format(filename, e)), None) diff --git a/src/keras_retinanet/utils/__init__.py b/src/keras_retinanet/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/keras_retinanet/utils/anchors.py b/src/keras_retinanet/utils/anchors.py new file mode 100644 index 0000000..493d73d --- /dev/null +++ b/src/keras_retinanet/utils/anchors.py @@ -0,0 +1,346 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np +import keras + +from ..utils.compute_overlap import compute_overlap + + +class AnchorParameters: + """ The parameteres that define how anchors are generated. + + Args + sizes : List of sizes to use. Each size corresponds to one feature level. + strides : List of strides to use. Each stride correspond to one feature level. + ratios : List of ratios to use per location in a feature map. + scales : List of scales to use per location in a feature map. + """ + def __init__(self, sizes, strides, ratios, scales): + self.sizes = sizes + self.strides = strides + self.ratios = ratios + self.scales = scales + + def num_anchors(self): + return len(self.ratios) * len(self.scales) + + +""" +The default anchor parameters. +""" +AnchorParameters.default = AnchorParameters( + sizes = [32, 64, 128, 256, 512], + strides = [8, 16, 32, 64, 128], + ratios = np.array([0.5, 1, 2], keras.backend.floatx()), + scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)], keras.backend.floatx()), +) + + +def anchor_targets_bbox( + anchors, + image_group, + annotations_group, + num_classes, + negative_overlap=0.4, + positive_overlap=0.5 +): + """ Generate anchor targets for bbox detection. + + Args + anchors: np.array of annotations of shape (N, 4) for (x1, y1, x2, y2). + image_group: List of BGR images. + annotations_group: List of annotation dictionaries with each annotation containing 'labels' and 'bboxes' of an image. + num_classes: Number of classes to predict. + mask_shape: If the image is padded with zeros, mask_shape can be used to mark the relevant part of the image. + negative_overlap: IoU overlap for negative anchors (all anchors with overlap < negative_overlap are negative). + positive_overlap: IoU overlap or positive anchors (all anchors with overlap > positive_overlap are positive). + + Returns + labels_batch: batch that contains labels & anchor states (np.array of shape (batch_size, N, num_classes + 1), + where N is the number of anchors for an image and the last column defines the anchor state (-1 for ignore, 0 for bg, 1 for fg). 
+ regression_batch: batch that contains bounding-box regression targets for an image & anchor states (np.array of shape (batch_size, N, 4 + 1), + where N is the number of anchors for an image, the first 4 columns define regression targets for (x1, y1, x2, y2) and the + last column defines anchor states (-1 for ignore, 0 for bg, 1 for fg). + """ + + assert(len(image_group) == len(annotations_group)), "The length of the images and annotations need to be equal." + assert(len(annotations_group) > 0), "No data received to compute anchor targets for." + for annotations in annotations_group: + assert('bboxes' in annotations), "Annotations should contain bboxes." + assert('labels' in annotations), "Annotations should contain labels." + + batch_size = len(image_group) + + regression_batch = np.zeros((batch_size, anchors.shape[0], 4 + 1), dtype=keras.backend.floatx()) + labels_batch = np.zeros((batch_size, anchors.shape[0], num_classes + 1), dtype=keras.backend.floatx()) + + # compute labels and regression targets + for index, (image, annotations) in enumerate(zip(image_group, annotations_group)): + if annotations['bboxes'].shape[0]: + # obtain indices of gt annotations with the greatest overlap + positive_indices, ignore_indices, argmax_overlaps_inds = compute_gt_annotations(anchors, annotations['bboxes'], negative_overlap, positive_overlap) + + labels_batch[index, ignore_indices, -1] = -1 + labels_batch[index, positive_indices, -1] = 1 + + regression_batch[index, ignore_indices, -1] = -1 + regression_batch[index, positive_indices, -1] = 1 + + # compute target class labels + labels_batch[index, positive_indices, annotations['labels'][argmax_overlaps_inds[positive_indices]].astype(int)] = 1 + + regression_batch[index, :, :-1] = bbox_transform(anchors, annotations['bboxes'][argmax_overlaps_inds, :]) + + # ignore annotations outside of image + if image.shape: + anchors_centers = np.vstack([(anchors[:, 0] + anchors[:, 2]) / 2, (anchors[:, 1] + anchors[:, 3]) / 2]).T + indices = np.logical_or(anchors_centers[:, 0] >= image.shape[1], anchors_centers[:, 1] >= image.shape[0]) + + labels_batch[index, indices, -1] = -1 + regression_batch[index, indices, -1] = -1 + + return regression_batch, labels_batch + + +def compute_gt_annotations( + anchors, + annotations, + negative_overlap=0.4, + positive_overlap=0.5 +): + """ Obtain indices of gt annotations with the greatest overlap. + + Args + anchors: np.array of annotations of shape (N, 4) for (x1, y1, x2, y2). + annotations: np.array of shape (N, 5) for (x1, y1, x2, y2, label). + negative_overlap: IoU overlap for negative anchors (all anchors with overlap < negative_overlap are negative). + positive_overlap: IoU overlap or positive anchors (all anchors with overlap > positive_overlap are positive). + + Returns + positive_indices: indices of positive anchors + ignore_indices: indices of ignored anchors + argmax_overlaps_inds: ordered overlaps indices + """ + + overlaps = compute_overlap(anchors.astype(np.float64), annotations.astype(np.float64)) + argmax_overlaps_inds = np.argmax(overlaps, axis=1) + max_overlaps = overlaps[np.arange(overlaps.shape[0]), argmax_overlaps_inds] + + # assign "dont care" labels + positive_indices = max_overlaps >= positive_overlap + ignore_indices = (max_overlaps > negative_overlap) & ~positive_indices + + return positive_indices, ignore_indices, argmax_overlaps_inds + + +def layer_shapes(image_shape, model): + """Compute layer shapes given input image shape and the model. + + Args + image_shape: The shape of the image. 
+ model: The model to use for computing how the image shape is transformed in the pyramid. + + Returns + A dictionary mapping layer names to image shapes. + """ + shape = { + model.layers[0].name: (None,) + image_shape, + } + + for layer in model.layers[1:]: + nodes = layer._inbound_nodes + for node in nodes: + inputs = [shape[lr.name] for lr in node.inbound_layers] + if not inputs: + continue + shape[layer.name] = layer.compute_output_shape(inputs[0] if len(inputs) == 1 else inputs) + + return shape + + +def make_shapes_callback(model): + """ Make a function for getting the shape of the pyramid levels. + """ + def get_shapes(image_shape, pyramid_levels): + shape = layer_shapes(image_shape, model) + image_shapes = [shape["P{}".format(level)][1:3] for level in pyramid_levels] + return image_shapes + + return get_shapes + + +def guess_shapes(image_shape, pyramid_levels): + """Guess shapes based on pyramid levels. + + Args + image_shape: The shape of the image. + pyramid_levels: A list of what pyramid levels are used. + + Returns + A list of image shapes at each pyramid level. + """ + image_shape = np.array(image_shape[:2]) + image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in pyramid_levels] + return image_shapes + + +def anchors_for_shape( + image_shape, + pyramid_levels=None, + anchor_params=None, + shapes_callback=None, +): + """ Generators anchors for a given shape. + + Args + image_shape: The shape of the image. + pyramid_levels: List of ints representing which pyramids to use (defaults to [3, 4, 5, 6, 7]). + anchor_params: Struct containing anchor parameters. If None, default values are used. + shapes_callback: Function to call for getting the shape of the image at different pyramid levels. + + Returns + np.array of shape (N, 4) containing the (x1, y1, x2, y2) coordinates for the anchors. + """ + + if pyramid_levels is None: + pyramid_levels = [3, 4, 5, 6, 7] + + if anchor_params is None: + anchor_params = AnchorParameters.default + + if shapes_callback is None: + shapes_callback = guess_shapes + image_shapes = shapes_callback(image_shape, pyramid_levels) + + # compute anchors over all pyramid levels + all_anchors = np.zeros((0, 4)) + for idx, p in enumerate(pyramid_levels): + anchors = generate_anchors( + base_size=anchor_params.sizes[idx], + ratios=anchor_params.ratios, + scales=anchor_params.scales + ) + shifted_anchors = shift(image_shapes[idx], anchor_params.strides[idx], anchors) + all_anchors = np.append(all_anchors, shifted_anchors, axis=0) + + return all_anchors + + +def shift(shape, stride, anchors): + """ Produce shifted anchors based on shape of the map and stride size. + + Args + shape : Shape to shift the anchors over. + stride : Stride to shift the anchors with over the shape. + anchors: The anchors to apply at each location. 
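+    Returns
+        np.array of shape (K * A, 4) containing one copy of every anchor centred on every grid
+        position, where K is the number of grid cells and A the number of anchors per location.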
+ """ + + # create a grid starting from half stride from the top left corner + shift_x = (np.arange(0, shape[1]) + 0.5) * stride + shift_y = (np.arange(0, shape[0]) + 0.5) * stride + + shift_x, shift_y = np.meshgrid(shift_x, shift_y) + + shifts = np.vstack(( + shift_x.ravel(), shift_y.ravel(), + shift_x.ravel(), shift_y.ravel() + )).transpose() + + # add A anchors (1, A, 4) to + # cell K shifts (K, 1, 4) to get + # shift anchors (K, A, 4) + # reshape to (K*A, 4) shifted anchors + A = anchors.shape[0] + K = shifts.shape[0] + all_anchors = (anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2))) + all_anchors = all_anchors.reshape((K * A, 4)) + + return all_anchors + + +def generate_anchors(base_size=16, ratios=None, scales=None): + """ + Generate anchor (reference) windows by enumerating aspect ratios X + scales w.r.t. a reference window. + """ + + if ratios is None: + ratios = AnchorParameters.default.ratios + + if scales is None: + scales = AnchorParameters.default.scales + + num_anchors = len(ratios) * len(scales) + + # initialize output anchors + anchors = np.zeros((num_anchors, 4)) + + # scale base_size + anchors[:, 2:] = base_size * np.tile(scales, (2, len(ratios))).T + + # compute areas of anchors + areas = anchors[:, 2] * anchors[:, 3] + + # correct for ratios + anchors[:, 2] = np.sqrt(areas / np.repeat(ratios, len(scales))) + anchors[:, 3] = anchors[:, 2] * np.repeat(ratios, len(scales)) + + # transform from (x_ctr, y_ctr, w, h) -> (x1, y1, x2, y2) + anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T + anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T + + return anchors + + +def bbox_transform(anchors, gt_boxes, mean=None, std=None): + """Compute bounding-box regression targets for an image.""" + + # The Mean and std are calculated from COCO dataset. + # Bounding box normalization was firstly introduced in the Fast R-CNN paper. + # See https://github.com/fizyr/keras-retinanet/issues/1273#issuecomment-585828825 for more details + if mean is None: + mean = np.array([0, 0, 0, 0]) + if std is None: + std = np.array([0.2, 0.2, 0.2, 0.2]) + + if isinstance(mean, (list, tuple)): + mean = np.array(mean) + elif not isinstance(mean, np.ndarray): + raise ValueError('Expected mean to be a np.ndarray, list or tuple. Received: {}'.format(type(mean))) + + if isinstance(std, (list, tuple)): + std = np.array(std) + elif not isinstance(std, np.ndarray): + raise ValueError('Expected std to be a np.ndarray, list or tuple. Received: {}'.format(type(std))) + + anchor_widths = anchors[:, 2] - anchors[:, 0] + anchor_heights = anchors[:, 3] - anchors[:, 1] + + # According to the information provided by a keras-retinanet author, they got marginally better results using + # the following way of bounding box parametrization. 
+ # See https://github.com/fizyr/keras-retinanet/issues/1273#issuecomment-585828825 for more details + targets_dx1 = (gt_boxes[:, 0] - anchors[:, 0]) / anchor_widths + targets_dy1 = (gt_boxes[:, 1] - anchors[:, 1]) / anchor_heights + targets_dx2 = (gt_boxes[:, 2] - anchors[:, 2]) / anchor_widths + targets_dy2 = (gt_boxes[:, 3] - anchors[:, 3]) / anchor_heights + + targets = np.stack((targets_dx1, targets_dy1, targets_dx2, targets_dy2)) + targets = targets.T + + targets = (targets - mean) / std + + return targets diff --git a/src/keras_retinanet/utils/coco_eval.py b/src/keras_retinanet/utils/coco_eval.py new file mode 100644 index 0000000..e8d39c5 --- /dev/null +++ b/src/keras_retinanet/utils/coco_eval.py @@ -0,0 +1,93 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from pycocotools.cocoeval import COCOeval + +import keras +import numpy as np +import json + +import progressbar +assert(callable(progressbar.progressbar)), "Using wrong progressbar module, install 'progressbar2' instead." + + +def evaluate_coco(generator, model, threshold=0.05): + """ Use the pycocotools to evaluate a COCO model on a dataset. + + Args + generator : The generator for generating the evaluation data. + model : The model to evaluate. + threshold : The score threshold to use. 
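+    Returns
+        The statistics array produced by COCOeval (coco_eval.stats), or None if no detections were made.
+        As a side effect, '<set_name>_bbox_results.json' and '<set_name>_processed_image_ids.json' are
+        written to the working directory.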
+ """ + # start collecting results + results = [] + image_ids = [] + for index in progressbar.progressbar(range(generator.size()), prefix='COCO evaluation: '): + image = generator.load_image(index) + image = generator.preprocess_image(image) + image, scale = generator.resize_image(image) + + if keras.backend.image_data_format() == 'channels_first': + image = image.transpose((2, 0, 1)) + + # run network + boxes, scores, labels = model.predict_on_batch(np.expand_dims(image, axis=0)) + + # correct boxes for image scale + boxes /= scale + + # change to (x, y, w, h) (MS COCO standard) + boxes[:, :, 2] -= boxes[:, :, 0] + boxes[:, :, 3] -= boxes[:, :, 1] + + # compute predicted labels and scores + for box, score, label in zip(boxes[0], scores[0], labels[0]): + # scores are sorted, so we can break + if score < threshold: + break + + # append detection for each positively labeled class + image_result = { + 'image_id' : generator.image_ids[index], + 'category_id' : generator.label_to_coco_label(label), + 'score' : float(score), + 'bbox' : box.tolist(), + } + + # append detection to results + results.append(image_result) + + # append image to list of processed images + image_ids.append(generator.image_ids[index]) + + if not len(results): + return + + # write output + json.dump(results, open('{}_bbox_results.json'.format(generator.set_name), 'w'), indent=4) + json.dump(image_ids, open('{}_processed_image_ids.json'.format(generator.set_name), 'w'), indent=4) + + # load results in COCO evaluation tool + coco_true = generator.coco + coco_pred = coco_true.loadRes('{}_bbox_results.json'.format(generator.set_name)) + + # run COCO evaluation + coco_eval = COCOeval(coco_true, coco_pred, 'bbox') + coco_eval.params.imgIds = image_ids + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + return coco_eval.stats diff --git a/src/keras_retinanet/utils/colors.py b/src/keras_retinanet/utils/colors.py new file mode 100644 index 0000000..7f1b685 --- /dev/null +++ b/src/keras_retinanet/utils/colors.py @@ -0,0 +1,112 @@ +import warnings + + +def label_color(label): + """ Return a color from a set of predefined colors. Contains 80 colors in total. + + Args + label: The label to get the color for. + + Returns + A list of three values representing a RGB color. + + If no color is defined for a certain label, the color green is returned and a warning is printed. 
+ """ + if label < len(colors): + return colors[label] + else: + warnings.warn('Label {} has no color, returning default.'.format(label)) + return (0, 255, 0) + + +""" +Generated using: + +``` +colors = [list((matplotlib.colors.hsv_to_rgb([x, 1.0, 1.0]) * 255).astype(int)) for x in np.arange(0, 1, 1.0 / 80)] +shuffle(colors) +pprint(colors) +``` +""" +colors = [ + [31 , 0 , 255] , + [0 , 159 , 255] , + [255 , 95 , 0] , + [255 , 19 , 0] , + [255 , 0 , 0] , + [255 , 38 , 0] , + [0 , 255 , 25] , + [255 , 0 , 133] , + [255 , 172 , 0] , + [108 , 0 , 255] , + [0 , 82 , 255] , + [0 , 255 , 6] , + [255 , 0 , 152] , + [223 , 0 , 255] , + [12 , 0 , 255] , + [0 , 255 , 178] , + [108 , 255 , 0] , + [184 , 0 , 255] , + [255 , 0 , 76] , + [146 , 255 , 0] , + [51 , 0 , 255] , + [0 , 197 , 255] , + [255 , 248 , 0] , + [255 , 0 , 19] , + [255 , 0 , 38] , + [89 , 255 , 0] , + [127 , 255 , 0] , + [255 , 153 , 0] , + [0 , 255 , 255] , + [0 , 255 , 216] , + [0 , 255 , 121] , + [255 , 0 , 248] , + [70 , 0 , 255] , + [0 , 255 , 159] , + [0 , 216 , 255] , + [0 , 6 , 255] , + [0 , 63 , 255] , + [31 , 255 , 0] , + [255 , 57 , 0] , + [255 , 0 , 210] , + [0 , 255 , 102] , + [242 , 255 , 0] , + [255 , 191 , 0] , + [0 , 255 , 63] , + [255 , 0 , 95] , + [146 , 0 , 255] , + [184 , 255 , 0] , + [255 , 114 , 0] , + [0 , 255 , 235] , + [255 , 229 , 0] , + [0 , 178 , 255] , + [255 , 0 , 114] , + [255 , 0 , 57] , + [0 , 140 , 255] , + [0 , 121 , 255] , + [12 , 255 , 0] , + [255 , 210 , 0] , + [0 , 255 , 44] , + [165 , 255 , 0] , + [0 , 25 , 255] , + [0 , 255 , 140] , + [0 , 101 , 255] , + [0 , 255 , 82] , + [223 , 255 , 0] , + [242 , 0 , 255] , + [89 , 0 , 255] , + [165 , 0 , 255] , + [70 , 255 , 0] , + [255 , 0 , 172] , + [255 , 76 , 0] , + [203 , 255 , 0] , + [204 , 0 , 255] , + [255 , 0 , 229] , + [255 , 133 , 0] , + [127 , 0 , 255] , + [0 , 235 , 255] , + [0 , 255 , 197] , + [255 , 0 , 191] , + [0 , 44 , 255] , + [50 , 255 , 0] +] diff --git a/src/keras_retinanet/utils/compute_overlap.pyx b/src/keras_retinanet/utils/compute_overlap.pyx new file mode 100644 index 0000000..e8b7930 --- /dev/null +++ b/src/keras_retinanet/utils/compute_overlap.pyx @@ -0,0 +1,53 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Sergey Karayev +# -------------------------------------------------------- + +cimport cython +import numpy as np +cimport numpy as np + + +def compute_overlap( + np.ndarray[double, ndim=2] boxes, + np.ndarray[double, ndim=2] query_boxes +): + """ + Args + a: (N, 4) ndarray of float + b: (K, 4) ndarray of float + + Returns + overlaps: (N, K) ndarray of overlap between boxes and query_boxes + """ + cdef unsigned int N = boxes.shape[0] + cdef unsigned int K = query_boxes.shape[0] + cdef np.ndarray[double, ndim=2] overlaps = np.zeros((N, K), dtype=np.float64) + cdef double iw, ih, box_area + cdef double ua + cdef unsigned int k, n + for k in range(K): + box_area = ( + (query_boxes[k, 2] - query_boxes[k, 0]) * + (query_boxes[k, 3] - query_boxes[k, 1]) + ) + for n in range(N): + iw = ( + min(boxes[n, 2], query_boxes[k, 2]) - + max(boxes[n, 0], query_boxes[k, 0]) + ) + if iw > 0: + ih = ( + min(boxes[n, 3], query_boxes[k, 3]) - + max(boxes[n, 1], query_boxes[k, 1]) + ) + if ih > 0: + ua = np.float64( + (boxes[n, 2] - boxes[n, 0]) * + (boxes[n, 3] - boxes[n, 1]) + + box_area - iw * ih + ) + overlaps[n, k] = iw * ih / ua + return overlaps diff --git a/src/keras_retinanet/utils/config.py 
b/src/keras_retinanet/utils/config.py new file mode 100644 index 0000000..137f47c --- /dev/null +++ b/src/keras_retinanet/utils/config.py @@ -0,0 +1,47 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import configparser +import numpy as np +import keras +from ..utils.anchors import AnchorParameters + + +def read_config_file(config_path): + config = configparser.ConfigParser() + + with open(config_path, 'r') as file: + config.read_file(file) + + assert 'anchor_parameters' in config, \ + "Malformed config file. Verify that it contains the anchor_parameters section." + + config_keys = set(config['anchor_parameters']) + default_keys = set(AnchorParameters.default.__dict__.keys()) + + assert config_keys <= default_keys, \ + "Malformed config file. These keys are not valid: {}".format(config_keys - default_keys) + + return config + + +def parse_anchor_parameters(config): + ratios = np.array(list(map(float, config['anchor_parameters']['ratios'].split(' '))), keras.backend.floatx()) + scales = np.array(list(map(float, config['anchor_parameters']['scales'].split(' '))), keras.backend.floatx()) + sizes = list(map(int, config['anchor_parameters']['sizes'].split(' '))) + strides = list(map(int, config['anchor_parameters']['strides'].split(' '))) + + return AnchorParameters(sizes, strides, ratios, scales) diff --git a/src/keras_retinanet/utils/eval.py b/src/keras_retinanet/utils/eval.py new file mode 100644 index 0000000..da411b0 --- /dev/null +++ b/src/keras_retinanet/utils/eval.py @@ -0,0 +1,244 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from .anchors import compute_overlap +from .visualization import draw_detections, draw_annotations + +import keras +import numpy as np +import os +import time + +import cv2 +import progressbar +assert(callable(progressbar.progressbar)), "Using wrong progressbar module, install 'progressbar2' instead." + + +def _compute_ap(recall, precision): + """ Compute the average precision, given the recall and precision curves. + + Code originally from https://github.com/rbgirshick/py-faster-rcnn. + + # Arguments + recall: The recall curve (list). + precision: The precision curve (list). + # Returns + The average precision as computed in py-faster-rcnn. 
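+    For example, recall = [0.5, 1.0] with precision = [1.0, 0.5] gives
+    AP = 0.5 * 1.0 + 0.5 * 0.5 = 0.75 under this interpolation.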
+ """ + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.], recall, [1.])) + mpre = np.concatenate(([0.], precision, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def _get_detections(generator, model, score_threshold=0.05, max_detections=100, save_path=None): + """ Get the detections from the model using the generator. + + The result is a list of lists such that the size is: + all_detections[num_images][num_classes] = detections[num_detections, 4 + num_classes] + + # Arguments + generator : The generator used to run images through the model. + model : The model to run on the images. + score_threshold : The score confidence threshold to use. + max_detections : The maximum number of detections to use per image. + save_path : The path to save the images with visualized detections to. + # Returns + A list of lists containing the detections for each image in the generator. + """ + all_detections = [[None for i in range(generator.num_classes()) if generator.has_label(i)] for j in range(generator.size())] + all_inferences = [None for i in range(generator.size())] + + for i in progressbar.progressbar(range(generator.size()), prefix='Running network: '): + raw_image = generator.load_image(i) + image = generator.preprocess_image(raw_image.copy()) + image, scale = generator.resize_image(image) + + if keras.backend.image_data_format() == 'channels_first': + image = image.transpose((2, 0, 1)) + + # run network + start = time.time() + boxes, scores, labels = model.predict_on_batch(np.expand_dims(image, axis=0))[:3] + inference_time = time.time() - start + + # correct boxes for image scale + boxes /= scale + + # select indices which have a score above the threshold + indices = np.where(scores[0, :] > score_threshold)[0] + + # select those scores + scores = scores[0][indices] + + # find the order with which to sort the scores + scores_sort = np.argsort(-scores)[:max_detections] + + # select detections + image_boxes = boxes[0, indices[scores_sort], :] + image_scores = scores[scores_sort] + image_labels = labels[0, indices[scores_sort]] + image_detections = np.concatenate([image_boxes, np.expand_dims(image_scores, axis=1), np.expand_dims(image_labels, axis=1)], axis=1) + + if save_path is not None: + draw_annotations(raw_image, generator.load_annotations(i), label_to_name=generator.label_to_name) + draw_detections(raw_image, image_boxes, image_scores, image_labels, label_to_name=generator.label_to_name, score_threshold=score_threshold) + + cv2.imwrite(os.path.join(save_path, '{}.png'.format(i)), raw_image) + + # copy detections to all_detections + for label in range(generator.num_classes()): + if not generator.has_label(label): + continue + + all_detections[i][label] = image_detections[image_detections[:, -1] == label, :-1] + + all_inferences[i] = inference_time + + return all_detections, all_inferences + + +def _get_annotations(generator): + """ Get the ground truth annotations from the generator. + + The result is a list of lists such that the size is: + all_detections[num_images][num_classes] = annotations[num_detections, 5] + + # Arguments + generator : The generator used to retrieve ground truth annotations. 
+ # Returns + A list of lists containing the annotations for each image in the generator. + """ + all_annotations = [[None for i in range(generator.num_classes())] for j in range(generator.size())] + + for i in progressbar.progressbar(range(generator.size()), prefix='Parsing annotations: '): + # load the annotations + annotations = generator.load_annotations(i) + + # copy detections to all_annotations + for label in range(generator.num_classes()): + if not generator.has_label(label): + continue + + all_annotations[i][label] = annotations['bboxes'][annotations['labels'] == label, :].copy() + + return all_annotations + + +def evaluate( + generator, + model, + iou_threshold=0.5, + score_threshold=0.05, + max_detections=100, + save_path=None +): + """ Evaluate a given dataset using a given model. + + # Arguments + generator : The generator that represents the dataset to evaluate. + model : The model to evaluate. + iou_threshold : The threshold used to consider when a detection is positive or negative. + score_threshold : The score confidence threshold to use for detections. + max_detections : The maximum number of detections to use per image. + save_path : The path to save images with visualized detections to. + # Returns + A dict mapping class names to mAP scores. + """ + # gather all detections and annotations + all_detections, all_inferences = _get_detections(generator, model, score_threshold=score_threshold, max_detections=max_detections, save_path=save_path) + all_annotations = _get_annotations(generator) + average_precisions = {} + + # all_detections = pickle.load(open('all_detections.pkl', 'rb')) + # all_annotations = pickle.load(open('all_annotations.pkl', 'rb')) + # pickle.dump(all_detections, open('all_detections.pkl', 'wb')) + # pickle.dump(all_annotations, open('all_annotations.pkl', 'wb')) + + # process detections and annotations + for label in range(generator.num_classes()): + if not generator.has_label(label): + continue + + false_positives = np.zeros((0,)) + true_positives = np.zeros((0,)) + scores = np.zeros((0,)) + num_annotations = 0.0 + + for i in range(generator.size()): + detections = all_detections[i][label] + annotations = all_annotations[i][label] + num_annotations += annotations.shape[0] + detected_annotations = [] + + for d in detections: + scores = np.append(scores, d[4]) + + if annotations.shape[0] == 0: + false_positives = np.append(false_positives, 1) + true_positives = np.append(true_positives, 0) + continue + + overlaps = compute_overlap(np.expand_dims(d, axis=0), annotations) + assigned_annotation = np.argmax(overlaps, axis=1) + max_overlap = overlaps[0, assigned_annotation] + + if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations: + false_positives = np.append(false_positives, 0) + true_positives = np.append(true_positives, 1) + detected_annotations.append(assigned_annotation) + else: + false_positives = np.append(false_positives, 1) + true_positives = np.append(true_positives, 0) + + # no annotations -> AP for this class is 0 (is this correct?) 
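+        # Such classes are stored as (AP, num_annotations) = (0, 0), which lets callers detect and
+        # skip them when averaging mAP.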
+ if num_annotations == 0: + average_precisions[label] = 0, 0 + continue + + # sort by score + indices = np.argsort(-scores) + false_positives = false_positives[indices] + true_positives = true_positives[indices] + + # compute false positives and true positives + false_positives = np.cumsum(false_positives) + true_positives = np.cumsum(true_positives) + + # compute recall and precision + recall = true_positives / num_annotations + precision = true_positives / np.maximum(true_positives + false_positives, np.finfo(np.float64).eps) + + # compute average precision + average_precision = _compute_ap(recall, precision) + average_precisions[label] = average_precision, num_annotations + + # inference time + inference_time = np.sum(all_inferences) / generator.size() + + return average_precisions, inference_time diff --git a/src/keras_retinanet/utils/gpu.py b/src/keras_retinanet/utils/gpu.py new file mode 100644 index 0000000..968c2b2 --- /dev/null +++ b/src/keras_retinanet/utils/gpu.py @@ -0,0 +1,53 @@ +""" +Copyright 2017-2019 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import tensorflow as tf + +from .tf_version import tf_version_ok + + +def setup_gpu(gpu_id): + if tf_version_ok((2, 0, 0)): + if gpu_id == 'cpu' or gpu_id == -1: + tf.config.experimental.set_visible_devices([], 'GPU') + return + + gpus = tf.config.experimental.list_physical_devices('GPU') + if gpus: + # Restrict TensorFlow to only use the first GPU. + try: + # Currently, memory growth needs to be the same across GPUs. + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + + # Use only the selcted gpu. + tf.config.experimental.set_visible_devices(gpus[int(gpu_id)], 'GPU') + except RuntimeError as e: + # Visible devices must be set before GPUs have been initialized. + print(e) + + logical_gpus = tf.config.experimental.list_logical_devices('GPU') + print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs") + else: + import os + if gpu_id == 'cpu' or gpu_id == -1: + os.environ['CUDA_VISIBLE_DEVICES'] = "" + return + + os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id) + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + tf.keras.backend.set_session(tf.Session(config=config)) diff --git a/src/keras_retinanet/utils/image.py b/src/keras_retinanet/utils/image.py new file mode 100644 index 0000000..b3116cd --- /dev/null +++ b/src/keras_retinanet/utils/image.py @@ -0,0 +1,356 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from __future__ import division +import numpy as np +import cv2 +from PIL import Image + +from .transform import change_transform_origin + + +def read_image_bgr(path): + """ Read an image in BGR format. + + Args + path: Path to the image. + """ + # We deliberately don't use cv2.imread here, since it gives no feedback on errors while reading the image. + image = np.ascontiguousarray(Image.open(path).convert('RGB')) + return image[:, :, ::-1] + + +def preprocess_image(x, mode='caffe'): + """ Preprocess an image by subtracting the ImageNet mean. + + Args + x: np.array of shape (None, None, 3) or (3, None, None). + mode: One of "caffe" or "tf". + - caffe: will zero-center each color channel with + respect to the ImageNet dataset, without scaling. + - tf: will scale pixels between -1 and 1, sample-wise. + + Returns + The input with the ImageNet mean subtracted. + """ + # mostly identical to "https://github.com/keras-team/keras-applications/blob/master/keras_applications/imagenet_utils.py" + # except for converting RGB -> BGR since we assume BGR already + + # covert always to float32 to keep compatibility with opencv + x = x.astype(np.float32) + + if mode == 'tf': + x /= 127.5 + x -= 1. + elif mode == 'caffe': + x -= [103.939, 116.779, 123.68] + + return x + + +def adjust_transform_for_image(transform, image, relative_translation): + """ Adjust a transformation for a specific image. + + The translation of the matrix will be scaled with the size of the image. + The linear part of the transformation will adjusted so that the origin of the transformation will be at the center of the image. + """ + height, width, channels = image.shape + + result = transform + + # Scale the translation with the image size if specified. + if relative_translation: + result[0:2, 2] *= [width, height] + + # Move the origin of transformation. + result = change_transform_origin(transform, (0.5 * width, 0.5 * height)) + + return result + + +class TransformParameters: + """ Struct holding parameters determining how to apply a transformation to an image. + + Args + fill_mode: One of: 'constant', 'nearest', 'reflect', 'wrap' + interpolation: One of: 'nearest', 'linear', 'cubic', 'area', 'lanczos4' + cval: Fill value to use with fill_mode='constant' + relative_translation: If true (the default), interpret translation as a factor of the image size. + If false, interpret it as absolute pixels. + """ + def __init__( + self, + fill_mode = 'nearest', + interpolation = 'linear', + cval = 0, + relative_translation = True, + ): + self.fill_mode = fill_mode + self.cval = cval + self.interpolation = interpolation + self.relative_translation = relative_translation + + def cvBorderMode(self): + if self.fill_mode == 'constant': + return cv2.BORDER_CONSTANT + if self.fill_mode == 'nearest': + return cv2.BORDER_REPLICATE + if self.fill_mode == 'reflect': + return cv2.BORDER_REFLECT_101 + if self.fill_mode == 'wrap': + return cv2.BORDER_WRAP + + def cvInterpolation(self): + if self.interpolation == 'nearest': + return cv2.INTER_NEAREST + if self.interpolation == 'linear': + return cv2.INTER_LINEAR + if self.interpolation == 'cubic': + return cv2.INTER_CUBIC + if self.interpolation == 'area': + return cv2.INTER_AREA + if self.interpolation == 'lanczos4': + return cv2.INTER_LANCZOS4 + + +def apply_transform(matrix, image, params): + """ + Apply a transformation to an image. + + The origin of transformation is at the top left corner of the image. 
+ + The matrix is interpreted such that a point (x, y) on the original image is moved to transform * (x, y) in the generated image. + Mathematically speaking, that means that the matrix is a transformation from the transformed image space to the original image space. + + Args + matrix: A homogeneous 3 by 3 matrix holding representing the transformation to apply. + image: The image to transform. + params: The transform parameters (see TransformParameters) + """ + output = cv2.warpAffine( + image, + matrix[:2, :], + dsize = (image.shape[1], image.shape[0]), + flags = params.cvInterpolation(), + borderMode = params.cvBorderMode(), + borderValue = params.cval, + ) + return output + + +def compute_resize_scale(image_shape, min_side=800, max_side=1333): + """ Compute an image scale such that the image size is constrained to min_side and max_side. + + Args + min_side: The image's min side will be equal to min_side after resizing. + max_side: If after resizing the image's max side is above max_side, resize until the max side is equal to max_side. + + Returns + A resizing scale. + """ + (rows, cols, _) = image_shape + + smallest_side = min(rows, cols) + + # rescale the image so the smallest side is min_side + scale = min_side / smallest_side + + # check if the largest side is now greater than max_side, which can happen + # when images have a large aspect ratio + largest_side = max(rows, cols) + if largest_side * scale > max_side: + scale = max_side / largest_side + + return scale + + +def resize_image(img, min_side=800, max_side=1333): + """ Resize an image such that the size is constrained to min_side and max_side. + + Args + min_side: The image's min side will be equal to min_side after resizing. + max_side: If after resizing the image's max side is above max_side, resize until the max side is equal to max_side. + + Returns + A resized image. + """ + # compute scale to resize the image + scale = compute_resize_scale(img.shape, min_side=min_side, max_side=max_side) + + # resize the image with the computed scale + img = cv2.resize(img, None, fx=scale, fy=scale) + + return img, scale + + +def _uniform(val_range): + """ Uniformly sample from the given range. + + Args + val_range: A pair of lower and upper bound. + """ + return np.random.uniform(val_range[0], val_range[1]) + + +def _check_range(val_range, min_val=None, max_val=None): + """ Check whether the range is a valid range. + + Args + val_range: A pair of lower and upper bound. + min_val: Minimal value for the lower bound. + max_val: Maximal value for the upper bound. + """ + if val_range[0] > val_range[1]: + raise ValueError('interval lower bound > upper bound') + if min_val is not None and val_range[0] < min_val: + raise ValueError('invalid interval lower bound') + if max_val is not None and val_range[1] > max_val: + raise ValueError('invalid interval upper bound') + + +def _clip(image): + """ + Clip and convert an image to np.uint8. + + Args + image: Image to clip. + """ + return np.clip(image, 0, 255).astype(np.uint8) + + +class VisualEffect: + """ Struct holding parameters and applying image color transformation. + + Args + contrast_factor: A factor for adjusting contrast. Should be between 0 and 3. + brightness_delta: Brightness offset between -1 and 1 added to the pixel values. + hue_delta: Hue offset between -1 and 1 added to the hue channel. + saturation_factor: A factor multiplying the saturation values of each pixel. 
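+        Example (illustrative values)
+            effect = VisualEffect(contrast_factor=1.1, brightness_delta=0.05, hue_delta=0, saturation_factor=1.0)
+            adjusted = effect(bgr_image)  # parameters left at 0 or None skip the corresponding adjustment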
+ """ + + def __init__( + self, + contrast_factor, + brightness_delta, + hue_delta, + saturation_factor, + ): + self.contrast_factor = contrast_factor + self.brightness_delta = brightness_delta + self.hue_delta = hue_delta + self.saturation_factor = saturation_factor + + def __call__(self, image): + """ Apply a visual effect on the image. + + Args + image: Image to adjust + """ + + if self.contrast_factor: + image = adjust_contrast(image, self.contrast_factor) + if self.brightness_delta: + image = adjust_brightness(image, self.brightness_delta) + + if self.hue_delta or self.saturation_factor: + + image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) + + if self.hue_delta: + image = adjust_hue(image, self.hue_delta) + if self.saturation_factor: + image = adjust_saturation(image, self.saturation_factor) + + image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) + + return image + + +def random_visual_effect_generator( + contrast_range=(0.9, 1.1), + brightness_range=(-.1, .1), + hue_range=(-0.05, 0.05), + saturation_range=(0.95, 1.05) +): + """ Generate visual effect parameters uniformly sampled from the given intervals. + + Args + contrast_factor: A factor interval for adjusting contrast. Should be between 0 and 3. + brightness_delta: An interval between -1 and 1 for the amount added to the pixels. + hue_delta: An interval between -1 and 1 for the amount added to the hue channel. + The values are rotated if they exceed 180. + saturation_factor: An interval for the factor multiplying the saturation values of each + pixel. + """ + _check_range(contrast_range, 0) + _check_range(brightness_range, -1, 1) + _check_range(hue_range, -1, 1) + _check_range(saturation_range, 0) + + def _generate(): + while True: + yield VisualEffect( + contrast_factor=_uniform(contrast_range), + brightness_delta=_uniform(brightness_range), + hue_delta=_uniform(hue_range), + saturation_factor=_uniform(saturation_range), + ) + + return _generate() + + +def adjust_contrast(image, factor): + """ Adjust contrast of an image. + + Args + image: Image to adjust. + factor: A factor for adjusting contrast. + """ + mean = image.mean(axis=0).mean(axis=0) + return _clip((image - mean) * factor + mean) + + +def adjust_brightness(image, delta): + """ Adjust brightness of an image + + Args + image: Image to adjust. + delta: Brightness offset between -1 and 1 added to the pixel values. + """ + return _clip(image + delta * 255) + + +def adjust_hue(image, delta): + """ Adjust hue of an image. + + Args + image: Image to adjust. + delta: An interval between -1 and 1 for the amount added to the hue channel. + The values are rotated if they exceed 180. + """ + image[..., 0] = np.mod(image[..., 0] + delta * 180, 180) + return image + + +def adjust_saturation(image, factor): + """ Adjust saturation of an image. + + Args + image: Image to adjust. + factor: An interval for the factor multiplying the saturation values of each pixel. + """ + image[..., 1] = np.clip(image[..., 1] * factor, 0 , 255) + return image diff --git a/src/keras_retinanet/utils/keras_version.py b/src/keras_retinanet/utils/keras_version.py new file mode 100644 index 0000000..626f265 --- /dev/null +++ b/src/keras_retinanet/utils/keras_version.py @@ -0,0 +1,55 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from __future__ import print_function + +import keras +import sys + +minimum_keras_version = 2, 3, 0 + + +def keras_version(): + """ Get the Keras version. + + Returns + tuple of (major, minor, patch). + """ + return tuple(map(int, keras.__version__.split('.'))) + + +def keras_version_ok(): + """ Check if the current Keras version is higher than the minimum version. + """ + return keras_version() >= minimum_keras_version + + +def assert_keras_version(): + """ Assert that the Keras version is up to date. + """ + detected = keras.__version__ + required = '.'.join(map(str, minimum_keras_version)) + assert(keras_version() >= minimum_keras_version), 'You are using keras version {}. The minimum required version is {}.'.format(detected, required) + + +def check_keras_version(): + """ Check that the Keras version is up to date. If it isn't, print an error message and exit the script. + """ + try: + assert_keras_version() + except AssertionError as e: + print(e, file=sys.stderr) + sys.exit(1) diff --git a/src/keras_retinanet/utils/model.py b/src/keras_retinanet/utils/model.py new file mode 100644 index 0000000..702262c --- /dev/null +++ b/src/keras_retinanet/utils/model.py @@ -0,0 +1,28 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + + +def freeze(model): + """ Set all layers in a model to non-trainable. + + The weights for these layers will not be updated during training. + + This function modifies the given model in-place, + but it also returns the modified model to allow easy chaining with other functions. + """ + for layer in model.layers: + layer.trainable = False + return model diff --git a/src/keras_retinanet/utils/tf_version.py b/src/keras_retinanet/utils/tf_version.py new file mode 100644 index 0000000..e6eb31a --- /dev/null +++ b/src/keras_retinanet/utils/tf_version.py @@ -0,0 +1,58 @@ +""" +Copyright 2017-2019 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from __future__ import print_function + +import tensorflow as tf +import sys + +MINIMUM_TF_VERSION = 1, 14, 0 +BLACKLISTED_TF_VERSIONS = [ + (2, 0, 0), # Has a number of memory leaks and issues with eager execution. + (2, 0, 1), # Has a number of memory leaks and issues with eager execution. +] + + +def tf_version(): + """ Get the Tensorflow version. + Returns + tuple of (major, minor, patch). + """ + return tuple(map(int, tf.version.VERSION.split('-')[0].split('.'))) + + +def tf_version_ok(minimum_tf_version=MINIMUM_TF_VERSION, blacklisted=BLACKLISTED_TF_VERSIONS): + """ Check if the current Tensorflow version is higher than the minimum version. + """ + return tf_version() >= minimum_tf_version and tf_version() not in blacklisted + + +def assert_tf_version(minimum_tf_version=MINIMUM_TF_VERSION, blacklisted=BLACKLISTED_TF_VERSIONS): + """ Assert that the Tensorflow version is up to date. + """ + detected = tf.version.VERSION + required = '.'.join(map(str, minimum_tf_version)) + assert(tf_version_ok(minimum_tf_version, blacklisted)), 'You are using tensorflow version {}. The minimum required version is {} (blacklisted: {}).'.format(detected, required, blacklisted) + + +def check_tf_version(): + """ Check that the Tensorflow version is up to date. If it isn't, print an error message and exit the script. + """ + try: + assert_tf_version() + except AssertionError as e: + print(e, file=sys.stderr) + sys.exit(1) diff --git a/src/keras_retinanet/utils/transform.py b/src/keras_retinanet/utils/transform.py new file mode 100644 index 0000000..4c6afe6 --- /dev/null +++ b/src/keras_retinanet/utils/transform.py @@ -0,0 +1,289 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np + +DEFAULT_PRNG = np.random + + +def colvec(*args): + """ Create a numpy array representing a column vector. """ + return np.array([args]).T + + +def transform_aabb(transform, aabb): + """ Apply a transformation to an axis aligned bounding box. + + The result is a new AABB in the same coordinate system as the original AABB. + The new AABB contains all corner points of the original AABB after applying the given transformation. + + Args + transform: The transformation to apply. + x1: The minimum x value of the AABB. + y1: The minimum y value of the AABB. + x2: The maximum x value of the AABB. + y2: The maximum y value of the AABB. + Returns + The new AABB as tuple (x1, y1, x2, y2) + """ + x1, y1, x2, y2 = aabb + # Transform all 4 corners of the AABB. + points = transform.dot([ + [x1, x2, x1, x2], + [y1, y2, y2, y1], + [1, 1, 1, 1 ], + ]) + + # Extract the min and max corners again. + min_corner = points.min(axis=1) + max_corner = points.max(axis=1) + + return [min_corner[0], min_corner[1], max_corner[0], max_corner[1]] + + +def _random_vector(min, max, prng=DEFAULT_PRNG): + """ Construct a random vector between min and max. 
+    Args
+        min: the minimum value for each component
+        max: the maximum value for each component
+    """
+    min = np.array(min)
+    max = np.array(max)
+    assert min.shape == max.shape
+    assert len(min.shape) == 1
+    return prng.uniform(min, max)
+
+
+def rotation(angle):
+    """ Construct a homogeneous 2D rotation matrix.
+    Args
+        angle: the angle in radians
+    Returns
+        the rotation matrix as 3 by 3 numpy array
+    """
+    return np.array([
+        [np.cos(angle), -np.sin(angle), 0],
+        [np.sin(angle), np.cos(angle), 0],
+        [0, 0, 1]
+    ])
+
+
+def random_rotation(min, max, prng=DEFAULT_PRNG):
+    """ Construct a random rotation between min and max.
+    Args
+        min: a scalar for the minimum absolute angle in radians
+        max: a scalar for the maximum absolute angle in radians
+        prng: the pseudo-random number generator to use.
+    Returns
+        a homogeneous 3 by 3 rotation matrix
+    """
+    return rotation(prng.uniform(min, max))
+
+
+def translation(translation):
+    """ Construct a homogeneous 2D translation matrix.
+    Args
+        translation: the translation 2D vector
+    Returns
+        the translation matrix as 3 by 3 numpy array
+    """
+    return np.array([
+        [1, 0, translation[0]],
+        [0, 1, translation[1]],
+        [0, 0, 1]
+    ])
+
+
+def random_translation(min, max, prng=DEFAULT_PRNG):
+    """ Construct a random 2D translation between min and max.
+    Args
+        min: a 2D vector with the minimum translation for each dimension
+        max: a 2D vector with the maximum translation for each dimension
+        prng: the pseudo-random number generator to use.
+    Returns
+        a homogeneous 3 by 3 translation matrix
+    """
+    return translation(_random_vector(min, max, prng))
+
+
+def shear(angle):
+    """ Construct a homogeneous 2D shear matrix.
+    Args
+        angle: the shear angle in radians
+    Returns
+        the shear matrix as 3 by 3 numpy array
+    """
+    return np.array([
+        [1, -np.sin(angle), 0],
+        [0, np.cos(angle), 0],
+        [0, 0, 1]
+    ])
+
+
+def random_shear(min, max, prng=DEFAULT_PRNG):
+    """ Construct a random 2D shear matrix with shear angle between min and max.
+    Args
+        min: the minimum shear angle in radians.
+        max: the maximum shear angle in radians.
+        prng: the pseudo-random number generator to use.
+    Returns
+        a homogeneous 3 by 3 shear matrix
+    """
+    return shear(prng.uniform(min, max))
+
+
+def scaling(factor):
+    """ Construct a homogeneous 2D scaling matrix.
+    Args
+        factor: a 2D vector for X and Y scaling
+    Returns
+        the scaling matrix as 3 by 3 numpy array
+    """
+    return np.array([
+        [factor[0], 0, 0],
+        [0, factor[1], 0],
+        [0, 0, 1]
+    ])
+
+
+def random_scaling(min, max, prng=DEFAULT_PRNG):
+    """ Construct a random 2D scale matrix between min and max.
+    Args
+        min: a 2D vector containing the minimum scaling factor for X and Y.
+        max: a 2D vector containing the maximum scaling factor for X and Y.
+        prng: the pseudo-random number generator to use.
+    Returns
+        a homogeneous 3 by 3 scaling matrix
+    """
+    return scaling(_random_vector(min, max, prng))
+
+
+def random_flip(flip_x_chance, flip_y_chance, prng=DEFAULT_PRNG):
+    """ Construct a transformation randomly containing X/Y flips (or not).
+    Args
+        flip_x_chance: The chance that the result will contain a flip along the X axis.
+        flip_y_chance: The chance that the result will contain a flip along the Y axis.
+        prng: The pseudo-random number generator to use.
+    Returns
+        a homogeneous 3 by 3 transformation matrix
+    """
+    flip_x = prng.uniform(0, 1) < flip_x_chance
+    flip_y = prng.uniform(0, 1) < flip_y_chance
+    # 1 - 2 * bool gives 1 for False and -1 for True.
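+    # e.g. flip_x=True, flip_y=False yields scaling((-1, 1)): a mirror along the X axis, identity along Y.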
+    return scaling((1 - 2 * flip_x, 1 - 2 * flip_y))
+
+
+def change_transform_origin(transform, center):
+    """ Create a new transform representing the same transformation,
+        only with the origin of the linear part changed.
+    Args
+        transform: the transformation matrix
+        center: the new origin of the transformation
+    Returns
+        translate(center) * transform * translate(-center)
+    """
+    center = np.array(center)
+    return np.linalg.multi_dot([translation(center), transform, translation(-center)])
+
+
+def random_transform(
+    min_rotation=0,
+    max_rotation=0,
+    min_translation=(0, 0),
+    max_translation=(0, 0),
+    min_shear=0,
+    max_shear=0,
+    min_scaling=(1, 1),
+    max_scaling=(1, 1),
+    flip_x_chance=0,
+    flip_y_chance=0,
+    prng=DEFAULT_PRNG
+):
+    """ Create a random transformation.
+
+    The transformation consists of the following operations in this order (from left to right):
+      * rotation
+      * translation
+      * shear
+      * scaling
+      * flip x (if applied)
+      * flip y (if applied)
+
+    Note that by default, the data generators in `keras_retinanet.preprocessing.generators` interpret the translation
+    as a factor of the image size. So an X translation of 0.1 would translate the image by 10% of its width.
+    Set `relative_translation` to `False` in the `TransformParameters` of a data generator to have it interpret
+    the translation directly as pixel distances instead.
+
+    Args
+        min_rotation: The minimum rotation in radians for the transform as scalar.
+        max_rotation: The maximum rotation in radians for the transform as scalar.
+        min_translation: The minimum translation for the transform as 2D column vector.
+        max_translation: The maximum translation for the transform as 2D column vector.
+        min_shear: The minimum shear angle for the transform in radians.
+        max_shear: The maximum shear angle for the transform in radians.
+        min_scaling: The minimum scaling for the transform as 2D column vector.
+        max_scaling: The maximum scaling for the transform as 2D column vector.
+        flip_x_chance: The chance (0 to 1) that a transform will contain a flip along X direction.
+        flip_y_chance: The chance (0 to 1) that a transform will contain a flip along Y direction.
+        prng: The pseudo-random number generator to use.
+    """
+    return np.linalg.multi_dot([
+        random_rotation(min_rotation, max_rotation, prng),
+        random_translation(min_translation, max_translation, prng),
+        random_shear(min_shear, max_shear, prng),
+        random_scaling(min_scaling, max_scaling, prng),
+        random_flip(flip_x_chance, flip_y_chance, prng)
+    ])
+
+
+def random_transform_generator(prng=None, **kwargs):
+    """ Create a random transform generator.
+
+    Uses a dedicated, newly created, properly seeded PRNG by default instead of the global DEFAULT_PRNG.
+
+    The transformation consists of the following operations in this order (from left to right):
+      * rotation
+      * translation
+      * shear
+      * scaling
+      * flip x (if applied)
+      * flip y (if applied)
+
+    Note that by default, the data generators in `keras_retinanet.preprocessing.generators` interpret the translation
+    as a factor of the image size. So an X translation of 0.1 would translate the image by 10% of its width.
+    Set `relative_translation` to `False` in the `TransformParameters` of a data generator to have it interpret
+    the translation directly as pixel distances instead.
+
+    Args
+        min_rotation: The minimum rotation in radians for the transform as scalar.
+        max_rotation: The maximum rotation in radians for the transform as scalar.
+        min_translation: The minimum translation for the transform as 2D column vector.
+ max_translation: The maximum translation for the transform as 2D column vector. + min_shear: The minimum shear angle for the transform in radians. + max_shear: The maximum shear angle for the transform in radians. + min_scaling: The minimum scaling for the transform as 2D column vector. + max_scaling: The maximum scaling for the transform as 2D column vector. + flip_x_chance: The chance (0 to 1) that a transform will contain a flip along X direction. + flip_y_chance: The chance (0 to 1) that a transform will contain a flip along Y direction. + prng: The pseudo-random number generator to use. + """ + + if prng is None: + # RandomState automatically seeds using the best available method. + prng = np.random.RandomState() + + while True: + yield random_transform(prng=prng, **kwargs) diff --git a/src/keras_retinanet/utils/visualization.py b/src/keras_retinanet/utils/visualization.py new file mode 100644 index 0000000..c551043 --- /dev/null +++ b/src/keras_retinanet/utils/visualization.py @@ -0,0 +1,106 @@ +""" +Copyright 2017-2018 Fizyr (https://fizyr.com) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import cv2 +import numpy as np + +from .colors import label_color + + +def draw_box(image, box, color, thickness=2): + """ Draws a box on an image with a given color. + + # Arguments + image : The image to draw on. + box : A list of 4 elements (x1, y1, x2, y2). + color : The color of the box. + thickness : The thickness of the lines to draw a box with. + """ + b = np.array(box).astype(int) + cv2.rectangle(image, (b[0], b[1]), (b[2], b[3]), color, thickness, cv2.LINE_AA) + + +def draw_caption(image, box, caption): + """ Draws a caption above the box in an image. + + # Arguments + image : The image to draw on. + box : A list of 4 elements (x1, y1, x2, y2). + caption : String containing the text to draw. + """ + b = np.array(box).astype(int) + # cv2.putText(image, caption, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN, 1, (0, 0, 0), 2) + # cv2.putText(image, caption, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1) + + +def draw_boxes(image, boxes, color, thickness=2): + """ Draws boxes on an image with a given color. + + # Arguments + image : The image to draw on. + boxes : A [N, 4] matrix (x1, y1, x2, y2). + color : The color of the boxes. + thickness : The thickness of the lines to draw boxes with. + """ + for b in boxes: + draw_box(image, b, color, thickness=thickness) + + +def draw_detections(image, boxes, scores, labels, color=None, label_to_name=None, score_threshold=0.5): + """ Draws detections in an image. + + # Arguments + image : The image to draw on. + boxes : A [N, 4] matrix (x1, y1, x2, y2). + scores : A list of N classification scores. + labels : A list of N labels. + color : The color of the boxes. By default the color from keras_retinanet.utils.colors.label_color will be used. + label_to_name : (optional) Functor for mapping a label to a name. + score_threshold : Threshold used for determining what detections to draw. 
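+
+    # Example
+        # hypothetical arrays: boxes/scores/labels as returned by model.predict_on_batch
+        draw_detections(image, boxes[0], scores[0], labels[0], score_threshold=0.5)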
+ """ + selection = np.where(scores > score_threshold)[0] + + for i in selection: + c = color if color is not None else label_color(labels[i]) + draw_box(image, boxes[i, :], color=c) + + # draw labels + caption = (label_to_name(labels[i]) if label_to_name else labels[i]) + ': {0:.2f}'.format(scores[i]) + draw_caption(image, boxes[i, :], caption) + + +def draw_annotations(image, annotations, color=(0, 255, 0), label_to_name=None): + """ Draws annotations in an image. + + # Arguments + image : The image to draw on. + annotations : A [N, 5] matrix (x1, y1, x2, y2, label) or dictionary containing bboxes (shaped [N, 4]) and labels (shaped [N]). + color : The color of the boxes. By default the color from keras_retinanet.utils.colors.label_color will be used. + label_to_name : (optional) Functor for mapping a label to a name. + """ + if isinstance(annotations, np.ndarray): + annotations = {'bboxes': annotations[:, :4], 'labels': annotations[:, 4]} + + assert('bboxes' in annotations) + assert('labels' in annotations) + assert(annotations['bboxes'].shape[0] == annotations['labels'].shape[0]) + + for i in range(annotations['bboxes'].shape[0]): + label = annotations['labels'][i] + c = color if color is not None else label_color(label) + caption = '{}'.format(label_to_name(label) if label_to_name else label) + draw_caption(image, annotations['bboxes'][i], caption) + draw_box(image, annotations['bboxes'][i], color=c)
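
Below is a minimal usage sketch (not part of the diff) showing how the vendored utilities introduced above could fit together. It assumes the package resolves on the import path as `keras_retinanet` (as it does for `src/NeuralNetwork.py`), that the Keras version check lives in `keras_retinanet/utils/keras_version.py` as in the upstream layout, and that `field.png` and the box coordinates are placeholder inputs.

```python
import cv2

from keras_retinanet.utils.keras_version import check_keras_version
from keras_retinanet.utils.tf_version import check_tf_version
from keras_retinanet.utils.transform import (
    change_transform_origin,
    random_transform_generator,
    transform_aabb,
)
from keras_retinanet.utils.visualization import draw_box

# Fail fast if Keras/TensorFlow are older than the required minimums (or blacklisted).
check_keras_version()
check_tf_version()

image = cv2.imread("field.png")   # placeholder test image
box = (50, 80, 200, 260)          # placeholder (x1, y1, x2, y2)
cx = (box[0] + box[2]) / 2
cy = (box[1] + box[3]) / 2

# Yields random 3x3 homogeneous transforms: small rotations plus optional X flips.
generator = random_transform_generator(
    min_rotation=-0.1,
    max_rotation=0.1,
    flip_x_chance=0.5,
)

for _ in range(3):
    # Re-centre each transform on the box so it rotates/flips about its own centre.
    transform = change_transform_origin(next(generator), (cx, cy))
    draw_box(image, transform_aabb(transform, box), color=(0, 255, 0))

cv2.imwrite("field_boxes.png", image)
```

Rotations are applied about the box centre via `change_transform_origin`, so the transformed AABB stays near the original box instead of swinging around the image origin.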