From c5fe58afe4abc5dae3f46511bc355f361f8358a2 Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Tue, 7 Jun 2022 12:04:26 +0200
Subject: [PATCH 01/54] handle-split-op-when-num-splits-1

---
 .../converters/mil/frontend/torch/ops.py      | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 7c71f7d9e..9c21dd466 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -3273,6 +3273,18 @@ def _slice(context, node):
     context.add(res)
 
 
+def _num_splits_and_sizes(split_sizes):
+    if split_sizes.sym_val is not None:
+        return len(split_sizes.sym_val), split_sizes.sym_val
+
+    if any_symbolic(split_sizes.shape):
+        raise ValueError("Unable to determine number of splits")
+
+    num_splits = len(split_sizes.shape)
+    sizes = [get_new_symbol() for _ in range(num_splits)]
+    return num_splits, sizes
+
+
 @register_torch_op(torch_alias=["split_with_sizes"])
 def split(context, node):
     inputs = _get_inputs(context, node, expected=3)
@@ -3300,6 +3312,14 @@ def split(context, node):
         else:
             partial_size = mb.mul(x=tmp, y=remainder)
             split_sizes = mb.concat(values=[whole_sizes, partial_size], axis=0)
+    
+
+    num_splits, sizes = _num_splits_and_sizes(split_sizes=split_sizes)
+    if num_splits == 1:
+        out = mb.identity(x=x, name=node.name)
+        context.add(out, node.name)
+        return
+
     res = mb.split(x=x, split_sizes=split_sizes, axis=dim, name=node.name)
     context.add(res, torch_name=node.name)
 

From 5542de83d42ca11a9774c4f25c1441f348781edc Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Tue, 7 Jun 2022 12:08:20 +0200
Subject: [PATCH 02/54] handle when unpacked tuple contains only single value

---
 coremltools/converters/mil/frontend/torch/ops.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 9c21dd466..48b9d8ae3 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -2473,6 +2473,10 @@ def upsample_nearest2d(context, node):
 def tupleunpack(context, node):
     inputs = _get_inputs(context, node, expected=1)
     values = inputs[0]
+
+    if len(node.outputs) == 1:
+        values = [values]
+
     # Node input could have been turned into constant array in @tupleconstruct
     if not isinstance(values, tuple) and not isinstance(values, list):
         values = values.val

From fdd159060a64c1f9816efc68b3e3cb5eb67648dc Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Tue, 7 Jun 2022 12:12:25 +0200
Subject: [PATCH 03/54] roi_align implementation

---
 .../converters/mil/frontend/torch/ops.py      | 55 +++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 48b9d8ae3..8bfdc50cb 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -4381,3 +4381,58 @@ def scatter_add(context, node):
     updates = inputs[3]
     result = mb.scatter_along_axis(data=data, indices=indices, updates=updates, axis=axis, mode="add", name=node.name)
     context.add(result)
+
+@register_torch_op()
+def roi_align(context, node):
+    """
+    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L2239
+    """
+    inputs = _get_inputs(context, node)
+
+    x = context[node.inputs[0]]
+    input_shape = x.shape  # (B, h_in, w_in, C)
+    if len(input_shape) != 4:
+        raise ValueError(
+            '"CropResize" op: expected input rank 4, got {}'.format(x.rank)
+        )
+    Hin, Win = input_shape[1:3]
+
+    const_box_info = True
+    if context[node.inputs[1]].val is None or context[node.inputs[2]].val is None:
+        const_box_info = False
+
+    extrapolation_value = context[node.inputs[2]].val
+
+    # CoreML index information along with boxes
+    if const_box_info:
+        boxes = context[node.inputs[1]].val
+        # CoreML expects boxes/ROI in
+        # [N, 1, 5, 1, 1] format
+        boxes = boxes.reshape(boxes.shape[0], 1, boxes.shape[1], 1, 1)
+    else:
+        boxes = inputs[1]
+        boxes = mb.reshape(x=boxes, shape=[boxes.shape[0], 1, boxes.shape[1], 1, 1])
+    # Get Height and Width of crop
+    h_out = inputs[3]
+    w_out = inputs[4]
+
+    # Torch input format: [B, C, h_in, w_in]
+    # CoreML input format: [B, C, h_in, w_in]
+
+    # Crop Resize
+    x = mb.crop_resize(
+        x=x,
+        roi=boxes,
+        target_height=h_out.val,
+        target_width=w_out.val,
+        normalized_coordinates=True,
+        spatial_scale=extrapolation_value,
+        box_coordinate_mode="CORNERS_HEIGHT_FIRST",
+        sampling_mode='OFFSET_CORNERS',
+    )
+    
+    # CoreML output format: [N, 1, C, h_out, w_out]
+    # Torch output format: [N, C, h_out, w_out]
+    x = mb.squeeze(x=x, axes=[1])
+
+    context.add(x, torch_name=node.outputs[0])
\ No newline at end of file

From 7b4cbd95721b6fb65b076c29fa39e0c877b6a27f Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Tue, 7 Jun 2022 12:13:42 +0200
Subject: [PATCH 04/54] add torch op numel

---
 coremltools/converters/mil/frontend/torch/ops.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 8bfdc50cb..21d4e4358 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -4435,4 +4435,9 @@ def roi_align(context, node):
     # Torch output format: [N, C, h_out, w_out]
     x = mb.squeeze(x=x, axes=[1])
 
-    context.add(x, torch_name=node.outputs[0])
\ No newline at end of file
+    context.add(x, torch_name=node.outputs[0])
+
+@register_torch_op()
+def numel(context, node):
+    inputs = _get_inputs(context, node, expected=1)
+    context.add(mb.reduce_prod(x=inputs[0], name=node.name), torch_name=node.outputs[0])

From b002207f062a91ee21db06d5bde1f6e07fee4e1a Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Tue, 7 Jun 2022 12:16:07 +0200
Subject: [PATCH 05/54] add torch op nms

---
 .../converters/mil/frontend/torch/ops.py      | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 21d4e4358..966ce7e3f 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -4441,3 +4441,35 @@ def roi_align(context, node):
 def numel(context, node):
     inputs = _get_inputs(context, node, expected=1)
     context.add(mb.reduce_prod(x=inputs[0], name=node.name), torch_name=node.outputs[0])
+
+@register_torch_op()
+def nms(context, node):
+    inputs = _get_inputs(context, node)
+    boxes = inputs[0]
+
+    num_boxes = boxes.shape[1]
+    max_boxes = num_boxes  # we set the max_boxes just to be # input boxes
+
+    scores = inputs[1]
+    iou_threshold = inputs[2]
+    boxes = mb.expand_dims(x=boxes, axes=[0])
+    scores = mb.expand_dims(x=scores, axes=[0, -1])
+
+    # Follow tensorflow op example: TensorFlow's default value for score_threshold, Core ML does not
+    # have float('-inf') support, converted to minimum float32 instead
+    score_threshold = -3.4e38
+
+    _, _, x, _ = mb.non_maximum_suppression(
+        boxes=boxes,
+        scores=scores,
+        iou_threshold=iou_threshold,
+        score_threshold=score_threshold,
+        max_boxes=max_boxes
+    )
+
+    if not is_symbolic(num_boxes):
+        x = mb.squeeze(x=x, axes=[0])
+        x = mb.slice_by_index(x=x, begin=[0], end=[max_boxes], name=node.name)
+    else:
+        x = mb.squeeze(x=x, axes=[0], name=node.name)
+    context.add(x, torch_name=node.name)
\ No newline at end of file

From 63894276c2e61cf81c231f28833626aa40190717 Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Tue, 7 Jun 2022 12:17:38 +0200
Subject: [PATCH 06/54] add torch op repeat_interleave

---
 .../converters/mil/frontend/torch/ops.py      | 51 ++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 966ce7e3f..17f6b7c28 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -4472,4 +4472,53 @@ def nms(context, node):
         x = mb.slice_by_index(x=x, begin=[0], end=[max_boxes], name=node.name)
     else:
         x = mb.squeeze(x=x, axes=[0], name=node.name)
-    context.add(x, torch_name=node.name)
\ No newline at end of file
+    context.add(x, torch_name=node.name)
+
+@register_torch_op
+def repeat_interleave(context, node):
+    inputs = _get_inputs(context, node)
+
+    x = inputs[0]
+    reps = inputs[1]
+    dim = inputs[2] if inputs[2] else 0
+
+    perm = [] + [axis for axis in range(x.rank) if axis not in []]
+
+    x = mb.transpose(x=x, perm=perm)  # torch.transpose(x, 0, 1)
+    x = mb.tile(x=x, reps=reps.val[0], name=node.name)  # torch.repeat(x, size)
+    x = mb.reshape(x=x, shape=(-1, x.shape[0]))  # x.view(-1, 2)
+    x = mb.transpose(x=x, perm=(-1, 0))  # torch.transpose(x, 0, 1)
+    dims = list(x.shape)
+
+    # Implementation of flatten
+    total = 1
+    start_val = dim
+    end_val = -1
+    start = len(dims) + start_val if start_val < 0 else start_val
+    end = len(dims) + end_val if end_val < 0 else end_val
+
+    if start > len(dims) or end > len(dims) or start < 0 or end < 0:
+        raise ValueError(
+            "Invalid start and end. (start, end) == ({}, {})".format(start, end_val)
+        )
+    if start > end:
+        raise ValueError(
+            "Start must be before end. (start, end) == ({}, {})".format(start, end_val)
+        )
+    x_shape = mb.shape(x=x)
+
+    shape1 = mb.slice_by_index(x=x_shape, begin=[0], end=[start])
+    shape2 = mb.slice_by_index(x=x_shape, begin=[end + 1], end=[len(dims)])
+
+    flatten_dim = -1
+    if not any_symbolic(x.shape):
+        flatten_dim = 1
+        for dim in dims[start: end + 1]:
+            flatten_dim *= dim
+
+    shape = mb.concat(values=(shape1, [flatten_dim], shape2), axis=0)
+    shape = mb.cast(x=shape, dtype="int32")
+    reshape = mb.reshape(x=x, shape=shape, name=node.name)
+
+    context.add(reshape, node.name)
+

From cecae9c4479713b565303b19f27ee7d642c7b684 Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Tue, 7 Jun 2022 12:25:09 +0200
Subject: [PATCH 07/54] add torch op narrow

---
 coremltools/converters/mil/frontend/torch/ops.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 17f6b7c28..9e3c1326f 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -4152,6 +4152,11 @@ def _make_tensor(list_of_tensor, name, rank):
         context.add(mb.identity(x=val, name=node.name))
         return
 
+    if inputs[2] is None:
+        res = mb.const(val=[val.val], name=node.name)
+        context.add(res, torch_name=node.name)
+        return
+
     # Case 2: Create a tensor filled with a single value
     val = val.val # element val to fill
     msg_prefix = 'torch::tensor {} '.format(node.name)
@@ -4522,3 +4527,13 @@ def repeat_interleave(context, node):
 
     context.add(reshape, node.name)
 
+@register_torch_op(override=True)
+def narrow(context, node):
+    data, dim, start, length = _get_inputs(context, node, expected=4)
+    data_shape = mb.shape(x=data).val
+    begin = [0]*len(data_shape)
+    end = [x for x in data_shape]
+    begin[dim.val] = start.val
+    end[dim.val] = start.val+length.val
+    out = mb.slice_by_index(x=data, begin=begin, end=end)
+    context.add(out, torch_name=node.name)

From 15185d87d59f6f29eb5c4a2b0471032a1b36ba3f Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Tue, 7 Jun 2022 12:25:27 +0200
Subject: [PATCH 08/54] add torch op logicaland

---
 coremltools/converters/mil/frontend/torch/ops.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 9e3c1326f..f324dbe95 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -4537,3 +4537,11 @@ def narrow(context, node):
     end[dim.val] = start.val+length.val
     out = mb.slice_by_index(x=data, begin=begin, end=end)
     context.add(out, torch_name=node.name)
+
+@register_torch_op(torch_alias=["__and_", '__and__'])
+def logicaland(context, node):
+    inputs = _get_inputs(context, node, expected=2)
+    x, y = inputs
+    x = mb.cast(x=x, dtype="bool")
+    y = mb.cast(x=y, dtype="bool")
+    context.add(mb.logical_and(x=x, y=y, name=node.name))

From e1b7d0f0d19ea83f22c31508e56f2b2eeb93b407 Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Tue, 7 Jun 2022 12:28:23 +0200
Subject: [PATCH 09/54] handle broadcasting indices for torch index op

---
 coremltools/converters/mil/frontend/torch/ops.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index f324dbe95..daacadfa7 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -2976,8 +2976,11 @@ def index(context, node):
     # For multiple index axes case, we now assume that all the index have equal shape
     for index in valid_indices:
         if not is_compatible_symbolic_vector(index.shape, valid_indices[0].shape):
-            raise NotImplementedError("Broadcasable tensor index not supported.")
-
+            broadcast_inputs = _broadcast_tensors([valid_indices[0], index])
+            index = broadcast_inputs[1]
+            valid_indices[0] = broadcast_inputs[0]
+            valid_indices.append(index)
+    
     # First stack the index together
     indices_rank = valid_indices[0].rank
     indices = mb.stack(values=valid_indices, axis=indices_rank)

From 70f1954f73eeb3a5e6005c15023fd35c00a9fc11 Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Tue, 7 Jun 2022 12:29:53 +0200
Subject: [PATCH 10/54] patch torch clamp op to handle int dtype

---
 coremltools/converters/mil/frontend/torch/ops.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index daacadfa7..87ea37794 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -3826,8 +3826,20 @@ def ceil(context, node):
 @register_torch_op
 def clamp(context, node):
     inputs = _get_inputs(context, node, expected=3)
-    min_val = inputs[1] if inputs[1] else _np.finfo(_np.float32).min
-    max_val = inputs[2] if inputs[2] else _np.finfo(_np.float32).max
+    if not inputs[1]:
+        min_val = _np.finfo(_np.float32).min
+    else:
+        min_val = inputs[1]
+        if types.builtin_to_string(min_val.dtype).startswith('int'):
+            min_val = mb.cast(x=min_val, dtype='fp32')
+
+    if not inputs[2]:
+        max_val = _np.finfo(_np.float32).max
+    else:
+        max_val = inputs[2]
+        if types.builtin_to_string(max_val.dtype).startswith('int'):
+            max_val = mb.cast(x=max_val, dtype='fp32')
+
     context.add(mb.clip(x=inputs[0], alpha=min_val, beta=max_val, name=node.name))
 
 @register_torch_op

From 9d2d0923dd6d178114470096f39aedb2e5c0c8f0 Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Tue, 7 Jun 2022 13:06:34 +0200
Subject: [PATCH 11/54] return copy of input tensor if no dtype is given

---
 coremltools/converters/mil/frontend/torch/ops.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 87ea37794..ef32fc76b 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -3384,6 +3384,13 @@ def to(context, node):
             "Received invalid arguments for PyTorch conversion of op {}".format(node)
         )
 
+    # We have to handle the case where the dtype is not set, this should be inferred from the Tensor dtype
+    # see, https://pytorch.org/docs/stable/generated/torch.Tensor.to.html?highlight=#torch.Tensor.to
+    if dtype is None:
+        out = mb.identity(x=_input, name=node.name)
+        context.add(out, node.name)
+        return = 0 # TODO: infer from Tensor (spoiler in this case we care about its f32 => 6)
+
     torch_dtype = NUM_TO_TORCH_DTYPE[dtype]
     if isinstance(_input, Var) and _input.val is not None:
         _input = _input.val

From b91363018ca6616bff7b556801935a7aa826d311 Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Wed, 8 Jun 2022 11:39:19 +0200
Subject: [PATCH 12/54] remove accidental typo

---
 coremltools/converters/mil/frontend/torch/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index ef32fc76b..5ec94aef8 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -3389,7 +3389,7 @@ def to(context, node):
     if dtype is None:
         out = mb.identity(x=_input, name=node.name)
         context.add(out, node.name)
-        return = 0 # TODO: infer from Tensor (spoiler in this case we care about its f32 => 6)
+        return
 
     torch_dtype = NUM_TO_TORCH_DTYPE[dtype]
     if isinstance(_input, Var) and _input.val is not None:

From b0074cc0d8f37b0c1ee38d334a79fe0c3f7500fc Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Wed, 8 Jun 2022 12:02:56 +0200
Subject: [PATCH 13/54] remove logicaland op and alias new logical_and op

---
 .../converters/mil/frontend/torch/ops.py       | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 1394a10fc..d790cf362 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -4108,7 +4108,7 @@ def is_floating_point(context, node):
     is_float = types.is_float(inputs[0].dtype)
     context.add(mb.const(val=is_float, name=node.name))
 
-@register_torch_op()
+@register_torch_op(torch_alias=["__and_", '__and__'])
 def logical_and(context, node):
     inputs = _get_inputs(context, node, expected=2)
     x, y = inputs
@@ -4544,7 +4544,7 @@ def scatter_add(context, node):
     inputs = _get_inputs(context, node)
     _scatter(context, inputs, 'add', node.name)
     
-@register_torch_op()
+@register_torch_op
 def roi_align(context, node):
     """
     https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L2239
@@ -4599,12 +4599,12 @@ def roi_align(context, node):
 
     context.add(x, torch_name=node.outputs[0])
 
-@register_torch_op()
+@register_torch_op
 def numel(context, node):
     inputs = _get_inputs(context, node, expected=1)
     context.add(mb.reduce_prod(x=inputs[0], name=node.name), torch_name=node.outputs[0])
 
-@register_torch_op()
+@register_torch_op
 def nms(context, node):
     inputs = _get_inputs(context, node)
     boxes = inputs[0]
@@ -4684,7 +4684,7 @@ def repeat_interleave(context, node):
 
     context.add(reshape, node.name)
 
-@register_torch_op(override=True)
+@register_torch_op
 def narrow(context, node):
     data, dim, start, length = _get_inputs(context, node, expected=4)
     data_shape = mb.shape(x=data).val
@@ -4694,11 +4694,3 @@ def narrow(context, node):
     end[dim.val] = start.val+length.val
     out = mb.slice_by_index(x=data, begin=begin, end=end)
     context.add(out, torch_name=node.name)
-
-@register_torch_op(torch_alias=["__and_", '__and__'])
-def logicaland(context, node):
-    inputs = _get_inputs(context, node, expected=2)
-    x, y = inputs
-    x = mb.cast(x=x, dtype="bool")
-    y = mb.cast(x=y, dtype="bool")
-    context.add(mb.logical_and(x=x, y=y, name=node.name))

From a9fb7ed21cb572580f47739813ee9e56cf6654d6 Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Wed, 8 Jun 2022 12:30:36 +0200
Subject: [PATCH 14/54] consistent use of double quotes

---
 coremltools/converters/mil/frontend/torch/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index d790cf362..355afe622 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -4108,7 +4108,7 @@ def is_floating_point(context, node):
     is_float = types.is_float(inputs[0].dtype)
     context.add(mb.const(val=is_float, name=node.name))
 
-@register_torch_op(torch_alias=["__and_", '__and__'])
+@register_torch_op(torch_alias=["__and_", "__and__"])
 def logical_and(context, node):
     inputs = _get_inputs(context, node, expected=2)
     x, y = inputs

From 29217d51626a3c3cc8630dbc3fb6dc515394dccc Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Wed, 8 Jun 2022 12:41:32 +0200
Subject: [PATCH 15/54] remove link to crop and resize layer in NN

---
 coremltools/converters/mil/frontend/torch/ops.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 355afe622..c162b01a2 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -4546,9 +4546,6 @@ def scatter_add(context, node):
     
 @register_torch_op
 def roi_align(context, node):
-    """
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L2239
-    """
     inputs = _get_inputs(context, node)
 
     x = context[node.inputs[0]]

From b268f9b6aa69963dd9d5795e540b05a025c37685 Mon Sep 17 00:00:00 2001
From: Toby Roseman <troseman@apple.com>
Date: Tue, 7 Jun 2022 10:17:03 -0700
Subject: [PATCH 16/54] 6.0b1 Release (#1508)

---
 .gitlab-ci.yml                                |   11 -
 .pylintrc                                     |    3 -
 BUILDING.md                                   |    4 +-
 README.md                                     |    1 -
 coremlpython/CoreMLPython.h                   |    2 +-
 coremlpython/CoreMLPython.mm                  |    5 +-
 coremlpython/CoreMLPythonUtils.mm             |   13 +-
 coremltools/__init__.py                       |   20 +-
 coremltools/_deps/__init__.py                 |   84 +-
 coremltools/converters/__init__.py            |    3 +-
 coremltools/converters/_converters_entry.py   |  372 +-
 coremltools/converters/keras/__init__.py      |   19 -
 .../converters/keras/_keras2_converter.py     |  637 ---
 .../converters/keras/_keras_converter.py      |  841 ----
 coremltools/converters/keras/_layers.py       | 1099 ------
 coremltools/converters/keras/_layers2.py      | 1585 --------
 coremltools/converters/keras/_topology.py     |  706 ----
 coremltools/converters/keras/_topology2.py    |  837 ----
 coremltools/converters/keras/_utils.py        |   34 -
 coremltools/converters/mil/__init__.py        |    1 +
 .../mil/_deployment_compatibility.py          |   11 +-
 .../converters/mil/backend/backend_helper.py  |   33 +-
 .../converters/mil/backend/mil/helper.py      |   27 +-
 .../converters/mil/backend/mil/load.py        |  146 +-
 .../passes/adjust_io_to_supported_types.py    |   89 +-
 .../mil/passes/homogenize_input_dtypes.py     |   18 +-
 .../passes/insert_image_preprocessing_op.py   |    4 +-
 .../mil/backend/mil/passes/mil_passes.py      |    6 +-
 .../mil/backend/mil/passes/test_passes.py     |   72 +
 coremltools/converters/mil/backend/nn/load.py |   34 +-
 .../converters/mil/backend/nn/op_mapping.py   |  145 +-
 .../nn/passes/alert_return_type_cast.py       |    8 +-
 .../passes/handle_return_inputs_as_outputs.py |    6 +-
 .../nn/passes/handle_return_unused_inputs.py  |    6 +-
 .../backend/nn/passes/test_mlmodel_passes.py  |   41 +-
 coremltools/converters/mil/conftest.py        |   13 +-
 coremltools/converters/mil/converter.py       |    1 -
 coremltools/converters/mil/frontend/_utils.py |   16 +
 .../frontend/milproto}/__init__.py            |    4 +-
 .../mil/frontend/milproto/helper.py           |   70 +
 .../converters/mil/frontend/milproto/load.py  |  422 ++
 .../mil/frontend/milproto/test_load.py        |  164 +
 .../frontend/tensorflow/basic_graph_ops.py    |    2 +-
 .../mil/frontend/tensorflow/convert_utils.py  |    2 +-
 .../mil/frontend/tensorflow/converter.py      |  102 +-
 .../mil/frontend/tensorflow/dot_visitor.py    |    2 +-
 .../mil/frontend/tensorflow/load.py           |   18 +-
 .../converters/mil/frontend/tensorflow/ops.py |  414 +-
 .../mil/frontend/tensorflow/parse.py          |    8 +-
 .../tensorflow/ssa_passes/test_passes.py      |    2 +-
 .../tensorflow/test/test_composite_ops.py     |    2 +-
 .../tensorflow/test/test_conversion_api.py    |  517 +++
 .../tensorflow/test/test_custom_ops.py        |    1 -
 .../frontend/tensorflow/test/test_graphs.py   |    2 +-
 .../mil/frontend/tensorflow/test/test_load.py |   65 +-
 .../mil/frontend/tensorflow/test/test_ops.py  |  662 ++--
 .../frontend/tensorflow/test/test_parse.py    |    2 +-
 .../tensorflow/test/test_parsed_tf_node.py    |    2 +-
 .../frontend/tensorflow/test/testing_utils.py |  103 +-
 .../tensorflow/tf_graph_pass/__init__.py      |    2 -
 .../tensorflow/tf_graph_pass/cond_to_where.py |    8 +-
 .../tf_graph_pass/functionalize_loops.py      |    5 +-
 .../tf_graph_pass/insert_get_tuple.py         |    7 +-
 .../tensorflow/tf_graph_pass/visitors.py      |   12 +-
 .../mil/frontend/tensorflow/tfssa.py          |    6 +-
 .../mil/frontend/tensorflow2/load.py          |    4 +-
 .../ssa_passes/remove_vacuous_cond.py         |    5 +-
 .../tensorflow2/test/test_conversion_api.py   |   92 +
 .../frontend/tensorflow2/test/test_v2_load.py |    4 +-
 .../frontend/tensorflow2/test/test_v2_ops.py  |   40 +-
 .../tensorflow2/test/test_v2_ops_tf_keras.py  |   40 +-
 .../tensorflow2/test/testing_utils.py         |  123 +-
 .../mil/frontend/torch/converter.py           |   34 +-
 .../mil/frontend/torch/dialect_ops.py         |   28 +-
 .../mil/frontend/torch/internal_graph.py      |    2 +-
 .../converters/mil/frontend/torch/load.py     |   15 +-
 .../converters/mil/frontend/torch/ops.py      |  210 +-
 .../torch_upsample_to_core_upsample.py        |   86 +-
 .../torch/test/test_conversion_api.py         |  573 +++
 .../torch/test/test_internal_graph.py         |    6 +-
 .../mil/frontend/torch/test/test_torch_ops.py |  441 ++-
 .../mil/frontend/torch/test/testing_utils.py  |   30 +-
 .../mil/frontend/torch/torchir_passes.py      |    3 +-
 coremltools/converters/mil/input_types.py     |   99 +-
 coremltools/converters/mil/mil/__init__.py    |    2 +-
 coremltools/converters/mil/mil/block.py       |   20 +-
 coremltools/converters/mil/mil/builder.py     |    4 +-
 coremltools/converters/mil/mil/input_type.py  |   88 +-
 coremltools/converters/mil/mil/operation.py   |   21 +-
 .../converters/mil/mil/ops/defs/__init__.py   |    8 +
 .../converters/mil/mil/ops/defs/activation.py |   55 +-
 .../converters/mil/mil/ops/defs/classify.py   |    7 +-
 .../mil/mil/ops/defs/constexpr_ops.py         |  377 ++
 .../mil/mil/ops/defs/control_flow.py          |   68 +-
 .../converters/mil/mil/ops/defs/conv.py       |    6 +-
 .../mil/mil/ops/defs/elementwise_binary.py    |    6 +-
 .../mil/mil/ops/defs/elementwise_unary.py     |   76 +-
 .../mil/mil/ops/defs/image_resizing.py        |   19 +-
 .../converters/mil/mil/ops/defs/linear.py     |    9 +-
 .../mil/mil/ops/defs/normalization.py         |   46 +-
 .../converters/mil/mil/ops/defs/pool.py       |    8 +-
 .../converters/mil/mil/ops/defs/random.py     |   10 +-
 .../converters/mil/mil/ops/defs/recurrent.py  |    6 +-
 .../converters/mil/mil/ops/defs/reduction.py  |  100 +-
 .../mil/mil/ops/defs/scatter_gather.py        |   32 +-
 .../mil/mil/ops/defs/tensor_operation.py      |   40 +-
 .../mil/mil/ops/defs/tensor_transformation.py |  180 +-
 .../mil/mil/ops/tests/test_activation.py      |   63 +-
 .../mil/mil/ops/tests/test_const.py           |    7 +-
 .../mil/mil/ops/tests/test_constexpr_ops.py   |  203 +
 .../mil/mil/ops/tests/test_control_flow.py    |    3 +-
 .../converters/mil/mil/ops/tests/test_conv.py |    1 -
 .../mil/ops/tests/test_elementwise_binary.py  |    1 -
 .../mil/ops/tests/test_elementwise_unary.py   |    8 +-
 .../mil/mil/ops/tests/test_image_resizing.py  |    2 +-
 .../mil/mil/ops/tests/test_linear.py          |    3 +
 .../mil/mil/ops/tests/test_normalization.py   |   71 +-
 .../mil/mil/ops/tests/test_random.py          |    3 +-
 .../mil/mil/ops/tests/test_recurrent.py       |    1 +
 .../mil/mil/ops/tests/test_scatter_gather.py  |   43 +-
 .../mil/ops/tests/test_tensor_operation.py    |   16 +-
 .../ops/tests/test_tensor_transformation.py   |   77 +-
 .../mil/mil/ops/tests/testing_utils.py        |   35 +-
 .../converters/mil/mil/passes/__init__.py     |    4 +-
 .../mil/passes/apply_common_pass_pipeline.py  |    3 +
 .../mil/mil/passes/cast_optimization.py       |   53 +-
 .../mil/mil/passes/compression_passes.py      |  474 +++
 .../mil/mil/passes/const_elimination.py       |    2 +-
 .../mil/mil/passes/conv_batchnorm_fusion.py   |    9 +-
 .../mil/mil/passes/conv_bias_fusion.py        |   16 +-
 .../mil/mil/passes/conv_scale_fusion.py       |    4 +-
 .../passes/elementwise_batchnorm_fusion.py    |    9 +-
 .../passes/gelu_tanh_approximation_fusion.py  |    2 +-
 .../converters/mil/mil/passes/graph_pass.py   |   35 +-
 .../converters/mil/mil/passes/helper.py       |    2 +-
 .../mil/mil/passes/linear_bias_fusion.py      |   10 +-
 .../mil/passes/matmul_weight_bias_fusion.py   |    9 +-
 .../mil/passes/merge_consecutive_paddings.py  |    3 +-
 .../mil/mil/passes/name_sanitization_utils.py |    2 +-
 .../mil/mil/passes/pad_conv_connect.py        |   11 +-
 .../converters/mil/mil/passes/prelu_fusion.py |  198 +
 .../mil/mil/passes/quantization_passes.py     |    1 +
 .../mil/mil/passes/reduce_transposes.py       |   24 +-
 .../mil/mil/passes/test_cast_optimization.py  |   44 +-
 .../passes/test_concat_to_pixel_shuffle.py    |   25 +-
 .../mil/mil/passes/test_conv_scale_fusion.py  |    3 -
 .../mil/passes/test_elementwise_fusions.py    |    9 +-
 .../mil/passes/test_fp16_compute_precision.py |   13 +-
 .../mil/mil/passes/test_noop_elimination.py   |   15 +-
 .../converters/mil/mil/passes/test_passes.py  |  221 +-
 .../mil/passes/test_reduce_transposes_pass.py |   26 +-
 .../mil/passes/test_replace_stack_reshape.py  |   29 +-
 .../mil/passes/test_use_reflection_padding.py |   10 +-
 .../mil/mil/passes/update_output_dtypes.py    |   53 +
 .../mil/mil/passes/use_reflection_padding.py  |    4 +-
 coremltools/converters/mil/mil/program.py     |    9 +
 .../converters/mil/mil/tests/test_block.py    |    9 +-
 .../converters/mil/mil/types/get_type_info.py |    3 -
 .../mil/mil/types/global_methods.py           |    2 -
 .../converters/mil/mil/types/type_bool.py     |    4 +-
 .../converters/mil/mil/types/type_dict.py     |    2 -
 .../converters/mil/mil/types/type_double.py   |   21 +-
 .../mil/mil/types/type_globals_pseudo_type.py |    2 -
 .../converters/mil/mil/types/type_int.py      |    6 +-
 .../converters/mil/mil/types/type_list.py     |    8 +-
 .../converters/mil/mil/types/type_mapping.py  |   29 +-
 .../converters/mil/mil/types/type_spec.py     |    4 -
 .../converters/mil/mil/types/type_str.py      |    2 -
 .../converters/mil/mil/types/type_tensor.py   |    4 +-
 .../converters/mil/mil/types/type_tuple.py    |    8 +-
 .../converters/mil/mil/types/type_unknown.py  |    2 -
 .../converters/mil/mil/types/type_void.py     |    2 -
 coremltools/converters/mil/mil/var.py         |    6 +-
 .../mil/mil/visitors/dot_visitor.py           |    2 +-
 .../mil/test_flexible_shape_inputs.py         |    1 +
 coremltools/converters/mil/testing_utils.py   |  220 +-
 coremltools/converters/onnx/_backend.py       |  180 -
 coremltools/converters/onnx/_backend_rep.py   |  120 -
 coremltools/converters/onnx/_converter.py     |  941 -----
 coremltools/converters/onnx/_error_utils.py   |  102 -
 coremltools/converters/onnx/_graph.py         |  313 --
 coremltools/converters/onnx/_graph_viz.py     |  126 -
 coremltools/converters/onnx/_operators.py     | 2666 -------------
 coremltools/converters/onnx/_operators_nd.py  | 2773 -------------
 .../converters/onnx/_tests/_test_utils.py     |  270 --
 .../converters/onnx/_tests/test_convert.py    |  115 -
 .../onnx/_tests/test_custom_layers.py         |  221 --
 .../converters/onnx/_tests/test_graph.py      |   81 -
 .../onnx/_tests/test_mlmodel_passes.py        |   31 -
 .../converters/onnx/_tests/test_operators.py  |  419 --
 .../onnx/_tests/test_pytorch_model.py         | 1007 -----
 .../onnx/_tests/test_transformers.py          |  321 --
 coremltools/converters/onnx/_transformers.py  |  940 -----
 coremltools/models/__init__.py                |    1 +
 coremltools/models/datatypes.py               |    2 +-
 .../onnx => models/ml_program}/__init__.py    |    5 +-
 .../models/ml_program/compression_utils.py    |  580 +++
 coremltools/models/model.py                   |   44 +-
 .../models/nearest_neighbors/builder.py       |    4 +-
 coremltools/models/neural_network/builder.py  |    2 +-
 .../neural_network/flexible_shape_utils.py    |    4 +-
 .../neural_network/quantization_utils.py      |   60 +-
 .../neural_network/update_optimizer_utils.py  |    4 +-
 coremltools/models/pipeline.py                |   22 +-
 coremltools/models/tree_ensemble.py           |   15 +-
 coremltools/proto/FeatureTypes_pb2.py         |   46 +-
 coremltools/test/api/test_api_examples.py     |   72 +-
 coremltools/test/api/test_api_visibilities.py |   40 +-
 coremltools/test/blob/test_weights.py         |   26 +-
 coremltools/test/ml_program/__init__.py       |    4 +
 .../test/ml_program/test_compression.py       |  410 ++
 coremltools/test/modelpackage/test_mlmodel.py |    7 +-
 .../test/modelpackage/test_modelpackage.py    |    4 +-
 coremltools/test/neural_network/test_keras.py | 1137 ------
 .../test/neural_network/test_keras2.py        | 1594 --------
 .../neural_network/test_keras2_numeric.py     | 3458 -----------------
 .../test/neural_network/test_keras_nonseq.py  |  132 -
 .../test/neural_network/test_keras_numeric.py | 3137 ---------------
 coremltools/test/neural_network/test_model.py |   30 +-
 .../test_multiple_images_preprocessing.py     |  191 -
 .../neural_network/test_neural_networks.py    |  155 -
 .../test/neural_network/test_nn_builder.py    |   34 +-
 .../neural_network/test_numpy_nn_layers.py    |   33 +-
 .../test/neural_network/test_quantization.py  |  498 +--
 .../test_recurrent_stress_tests.py            | 1951 ----------
 .../test_simple_nn_inference.py               |    6 +-
 .../test_simple_recurrent_single_layer.py     |  486 ---
 .../test/neural_network/test_tf_numeric.py    |   66 +-
 .../test/pipeline/test_model_updatable.py     |   78 -
 .../test/sklearn_tests/test_io_types.py       |  101 +-
 coremltools/version.py                        |    2 +-
 .../src/google/protobuf/repeated_field.h      |    4 -
 docs/documentation.md                         |    4 -
 docs/source/coremltools.converters.keras.rst  |    9 -
 docs/source/coremltools.converters.onnx.rst   |    5 -
 docs/source/coremltools.converters.rst        |    2 -
 docs/source/coremltools.models.rst            |    7 +-
 milstoragepython/MilStorage.cpp               |    8 +
 milstoragepython/MilStorage.hpp               |    2 +
 milstoragepython/MilStoragePython.cpp         |    2 +
 mlmodel/build/format/FeatureTypes.pb.cc       |    4 +
 mlmodel/build/format/FeatureTypes.pb.h        |    8 +-
 mlmodel/build/format/FeatureTypes_enums.h     |    2 +
 mlmodel/format/FeatureTypes.proto             |    8 +-
 mlmodel/format/Model.proto                    |    3 +
 mlmodel/src/DataType.cpp                      |    7 +-
 mlmodel/src/Globals.hpp                       |    7 +-
 mlmodel/src/LayerShapeConstraints.cpp         |    3 +-
 mlmodel/src/TreeEnsembleCommon.cpp            |   10 +-
 mlmodel/src/Utils.cpp                         |   71 +
 mlmodel/src/Utils.hpp                         |    4 +
 .../BayesianProbitRegressionValidator.cpp     |    2 +
 .../src/Validation/InterfaceValidators.cpp    |   22 +-
 .../NeuralNetwork/NeuralNetworkShapes.cpp     |    4 +-
 .../SoundAnalysisPreprocessingValidator.cpp   |    2 +
 mlmodel/tests/InterfaceTests.cpp              |    2 +
 reqs/build.pip                                |    6 +-
 reqs/docs.pip                                 |    6 +-
 reqs/test.pip                                 |   15 +-
 reqs/test_tf2.pip                             |   14 +-
 scripts/test.sh                               |    4 +-
 setup.py                                      |    3 +-
 262 files changed, 8670 insertions(+), 31435 deletions(-)
 delete mode 100644 coremltools/converters/keras/__init__.py
 delete mode 100644 coremltools/converters/keras/_keras2_converter.py
 delete mode 100644 coremltools/converters/keras/_keras_converter.py
 delete mode 100644 coremltools/converters/keras/_layers.py
 delete mode 100644 coremltools/converters/keras/_layers2.py
 delete mode 100644 coremltools/converters/keras/_topology.py
 delete mode 100644 coremltools/converters/keras/_topology2.py
 delete mode 100644 coremltools/converters/keras/_utils.py
 rename coremltools/converters/{onnx/_tests => mil/frontend/milproto}/__init__.py (67%)
 create mode 100644 coremltools/converters/mil/frontend/milproto/helper.py
 create mode 100644 coremltools/converters/mil/frontend/milproto/load.py
 create mode 100644 coremltools/converters/mil/frontend/milproto/test_load.py
 create mode 100644 coremltools/converters/mil/frontend/tensorflow/test/test_conversion_api.py
 create mode 100644 coremltools/converters/mil/frontend/tensorflow2/test/test_conversion_api.py
 create mode 100644 coremltools/converters/mil/frontend/torch/test/test_conversion_api.py
 create mode 100644 coremltools/converters/mil/mil/ops/defs/constexpr_ops.py
 create mode 100644 coremltools/converters/mil/mil/ops/tests/test_constexpr_ops.py
 create mode 100644 coremltools/converters/mil/mil/passes/compression_passes.py
 create mode 100644 coremltools/converters/mil/mil/passes/prelu_fusion.py
 create mode 100644 coremltools/converters/mil/mil/passes/update_output_dtypes.py
 delete mode 100644 coremltools/converters/onnx/_backend.py
 delete mode 100644 coremltools/converters/onnx/_backend_rep.py
 delete mode 100644 coremltools/converters/onnx/_converter.py
 delete mode 100644 coremltools/converters/onnx/_error_utils.py
 delete mode 100644 coremltools/converters/onnx/_graph.py
 delete mode 100644 coremltools/converters/onnx/_graph_viz.py
 delete mode 100644 coremltools/converters/onnx/_operators.py
 delete mode 100644 coremltools/converters/onnx/_operators_nd.py
 delete mode 100644 coremltools/converters/onnx/_tests/_test_utils.py
 delete mode 100644 coremltools/converters/onnx/_tests/test_convert.py
 delete mode 100644 coremltools/converters/onnx/_tests/test_custom_layers.py
 delete mode 100644 coremltools/converters/onnx/_tests/test_graph.py
 delete mode 100644 coremltools/converters/onnx/_tests/test_mlmodel_passes.py
 delete mode 100644 coremltools/converters/onnx/_tests/test_operators.py
 delete mode 100644 coremltools/converters/onnx/_tests/test_pytorch_model.py
 delete mode 100644 coremltools/converters/onnx/_tests/test_transformers.py
 delete mode 100644 coremltools/converters/onnx/_transformers.py
 rename coremltools/{converters/onnx => models/ml_program}/__init__.py (56%)
 create mode 100644 coremltools/models/ml_program/compression_utils.py
 create mode 100644 coremltools/test/ml_program/__init__.py
 create mode 100644 coremltools/test/ml_program/test_compression.py
 delete mode 100644 coremltools/test/neural_network/test_keras.py
 delete mode 100644 coremltools/test/neural_network/test_keras2.py
 delete mode 100644 coremltools/test/neural_network/test_keras2_numeric.py
 delete mode 100644 coremltools/test/neural_network/test_keras_nonseq.py
 delete mode 100644 coremltools/test/neural_network/test_keras_numeric.py
 delete mode 100644 coremltools/test/neural_network/test_multiple_images_preprocessing.py
 delete mode 100644 coremltools/test/neural_network/test_recurrent_stress_tests.py
 delete mode 100644 coremltools/test/neural_network/test_simple_recurrent_single_layer.py
 delete mode 100644 docs/source/coremltools.converters.keras.rst
 delete mode 100644 docs/source/coremltools.converters.onnx.rst

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9ac4a0559..23ee50dbf 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -38,12 +38,6 @@ check_python_flake8:
     paths:
       - build/dist/
 
-build_wheel_linux_py35:
-  <<: *build_linux
-  image: registry.gitlab.com/zach_nation/coremltools/build-image-ubuntu-18.04:1.0.0
-  variables:
-    PYTHON: "3.5"
-
 build_wheel_linux_py36:
   <<: *build_linux
   image: registry.gitlab.com/zach_nation/coremltools/build-image-ubuntu-18.04:1.0.0
@@ -85,11 +79,6 @@ build_wheel_linux_py39:
       paths:
         - build/dist/
 
-build_wheel_macos_py35:
-  <<: *build_macos
-  variables:
-    PYTHON: "3.5"
-
 build_wheel_macos_py36:
   <<: *build_macos
   variables:
diff --git a/.pylintrc b/.pylintrc
index e56bd652e..ac57e06c7 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -316,8 +316,6 @@ ignore-on-opaque-inference=yes
 ignored-classes=
   optparse.Values,
   sympy.core.mul.Mul,
-  test.model_zoo.onnx.test_latham_lstm.TestLathamLSTM,
-  test.model_zoo.onnx.test_transformer.TestMT,
   thread._local,
   _thread._local
 
@@ -328,7 +326,6 @@ ignored-classes=
 ignored-modules=
   LazyLoader,
   matplotlib.cm,
-  onnx.onnx_*_ml_pb2,
   tensorflow,
   tensorflow.core.framework.*_pb2,
   tensorflow.tools.api.generator.api.contrib,
diff --git a/BUILDING.md b/BUILDING.md
index 6d342b16a..f4bd213f2 100644
--- a/BUILDING.md
+++ b/BUILDING.md
@@ -19,7 +19,7 @@ Follow these steps:
 1. Fork and clone the GitHub [coremltools repository](https://github.com/apple/coremltools).
 
 2. Run the [build.sh](scripts/build.sh) script to build `coremltools`. 
-	* By default this script uses Python 3.7, but you can include `--python=3.5` (or `3.6`, `3.8`, and so on) as a argument to change the Python version. 
+	* By default this script uses Python 3.7, but you can include `--python=3.6` (or `3.7`, `3.8`, `3.9`) as an argument to change the Python version.
 	* The script creates a new `build` folder with the coremltools distribution, and a `dist` folder with Python wheel files.
 	
 3. Run the [test.sh](scripts/test.sh) script to test the build.
@@ -45,7 +45,7 @@ The following build targets help you configure the development environment. If y
 * `test_slow` | Run all non-fast tests.
 * `wheel` | Build wheels in release mode.
 
-The script uses Python 3.7, but you can include `--python=3.5` (or `3.6`, `3.8`, and so on) as a argument to change the Python version.
+The script uses Python 3.7, but you can include `--python=3.6` (or `3.7`, `3.8`, `3.9`) as an argument to change the Python version.
 
 ## Resources
 
diff --git a/README.md b/README.md
index ff7d4f212..2b925c00e 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,6 @@ Use *coremltools* to convert machine learning models from third-party libraries
 * [TensorFlow 1.x](https://www.tensorflow.org/versions/r1.15/api_docs/python/tf)
 * [TensorFlow 2.x](https://www.tensorflow.org/api_docs)
 * [PyTorch](https://pytorch.org/)
-* [TensorFlow's Keras APIs](https://keras.io/)
 * Non-neural network frameworks:
 	* [scikit-learn](https://scikit-learn.org/stable/)
 	* [XGBoost](https://xgboost.readthedocs.io/en/latest/)
diff --git a/coremlpython/CoreMLPython.h b/coremlpython/CoreMLPython.h
index 92c72a7cc..88ae59b9c 100644
--- a/coremlpython/CoreMLPython.h
+++ b/coremlpython/CoreMLPython.h
@@ -26,7 +26,7 @@ namespace CoreML {
             Model& operator=(const Model&) = delete;
             ~Model();
             explicit Model(const std::string& urlStr, const std::string& computeUnits);
-            py::dict predict(const py::dict& input, bool useCPUOnly);
+            py::dict predict(const py::dict& input);
             static py::bytes autoSetSpecificationVersion(const py::bytes& modelBytes);
             static int32_t maximumSupportedSpecificationVersion();
             std::string toString() const;
diff --git a/coremlpython/CoreMLPython.mm b/coremlpython/CoreMLPython.mm
index ac4d01320..28a523106 100644
--- a/coremlpython/CoreMLPython.mm
+++ b/coremlpython/CoreMLPython.mm
@@ -78,15 +78,12 @@
     }
 }
 
-py::dict Model::predict(const py::dict& input, bool useCPUOnly) {
+py::dict Model::predict(const py::dict& input) {
     @autoreleasepool {
         NSError *error = nil;
         MLDictionaryFeatureProvider *inFeatures = Utils::dictToFeatures(input, &error);
         Utils::handleError(error);
-        MLPredictionOptions *options = [[MLPredictionOptions alloc] init];
-        options.usesCPUOnly = useCPUOnly;
         id<MLFeatureProvider> outFeatures = [m_model predictionFromFeatures:static_cast<MLDictionaryFeatureProvider * _Nonnull>(inFeatures)
-                                                                    options:options
                                                                       error:&error];
         Utils::handleError(error);
         return Utils::featuresToDict(outFeatures);
diff --git a/coremlpython/CoreMLPythonUtils.mm b/coremlpython/CoreMLPythonUtils.mm
index 6ef212faa..72f8c3441 100644
--- a/coremlpython/CoreMLPythonUtils.mm
+++ b/coremlpython/CoreMLPythonUtils.mm
@@ -433,6 +433,7 @@ static size_t sizeOfArrayElement(MLMultiArrayDataType type) {
         case MLMultiArrayDataTypeInt32:
             return sizeof(int32_t);
         case MLMultiArrayDataTypeFloat32:
+        case MLMultiArrayDataTypeFloat16:
             return sizeof(float);
         case MLMultiArrayDataTypeDouble:
             return sizeof(double);
@@ -449,7 +450,7 @@ static size_t sizeOfArrayElement(MLMultiArrayDataType type) {
     MLMultiArrayDataType type = value.dataType;
     std::vector<size_t> shape = Utils::convertNSArrayToCpp(value.shape);
     std::vector<size_t> strides = Utils::convertNSArrayToCpp(value.strides);
-    
+
     // convert strides to numpy (bytes) instead of mlkit (elements)
     for (size_t& stride : strides) {
         stride *= sizeOfArrayElement(type);
@@ -460,6 +461,16 @@ static size_t sizeOfArrayElement(MLMultiArrayDataType type) {
             return py::array(shape, strides, static_cast<const int32_t*>(value.dataPointer));
         case MLMultiArrayDataTypeFloat32:
             return py::array(shape, strides, static_cast<const float*>(value.dataPointer));
+        case MLMultiArrayDataTypeFloat16:
+        {
+            // create a float32 array, cast float16 values and copy into it
+            // TODO: rdar://92239209 : return np.float16 instead of np.float32 when multiarray type is Float16
+            std::vector<float> value_fp32(value.count, 0.0);
+            for (size_t i=0; i<value.count; i++) {
+                value_fp32[i] = [value[i] floatValue];
+            }
+            return py::array(shape, strides, value_fp32.data());
+        }
         case MLMultiArrayDataTypeDouble:
             return py::array(shape, strides, static_cast<const double*>(value.dataPointer));
         default:
diff --git a/coremltools/__init__.py b/coremltools/__init__.py
index 59bc64ad9..c5b872218 100644
--- a/coremltools/__init__.py
+++ b/coremltools/__init__.py
@@ -14,8 +14,8 @@
 Core MLTools in a python package for creating, examining, and testing models in the .mlmodel
 format. In particular, it can be used to:
 
-* Convert existing models to .mlmodel format from popular machine learning tools including:
-     Keras, scikit-learn, libsvm, and XGBoost.
+* Convert existing models to .mlpackage or .mlmodel formats from popular machine learning tools including:
+     PyTorch, TensorFlow, scikit-learn, XGBoost and libsvm.
 * Express models in .mlmodel format through a simple API.
 * Make predictions with an .mlmodel (on select platforms for testing purposes).
 
@@ -60,6 +60,9 @@
 # New versions for iOS 15.0
 _SPECIFICATION_VERSION_IOS_15 = 6
 
+# New versions for iOS 16.0
+_SPECIFICATION_VERSION_IOS_16 = 7
+
 class ComputeUnit(_Enum):
     '''
     The set of processing-unit configurations the model can use to make predictions.
@@ -68,16 +71,29 @@ class ComputeUnit(_Enum):
     CPU_AND_GPU = 2 # Allows the model to use both the CPU and GPU, but not the neural engine
     CPU_ONLY = 3 # Limit the model to only use the CPU
 
+# A dictionary that maps the CoreML model specification version to the MLProgram/MIL opset string
+_OPSET = {
+    _SPECIFICATION_VERSION_IOS_15: "CoreML5",
+    _SPECIFICATION_VERSION_IOS_16: "CoreML6",
+}
+
+# Default specification version for each backend
+_LOWEST_ALLOWED_SPECIFICATION_VERSION_FOR_NEURALNETWORK = _SPECIFICATION_VERSION_IOS_13
+_LOWEST_ALLOWED_SPECIFICATION_VERSION_FOR_MILPROGRAM = _SPECIFICATION_VERSION_IOS_15
+
+
 # expose sub packages as directories
 from . import converters
 from . import proto
 from . import models
 from .models import utils
+from .models.ml_program import compression_utils
 
 # expose unified converter in coremltools package level
 from .converters import convert
 from .converters import (
     ClassifierConfig,
+    ColorLayout as colorlayout,
     TensorType,
     ImageType,
     RangeDim,
diff --git a/coremltools/_deps/__init__.py b/coremltools/_deps/__init__.py
index 19796155d..1b3c9042e 100644
--- a/coremltools/_deps/__init__.py
+++ b/coremltools/_deps/__init__.py
@@ -100,7 +100,7 @@ def __get_sklearn_version(version):
 _TF_1_MIN_VERSION = "1.12.0"
 _TF_1_MAX_VERSION = "1.15.0"
 _TF_2_MIN_VERSION = "2.1.0"
-_TF_2_MAX_VERSION = "2.6.2"
+_TF_2_MAX_VERSION = "2.8.0"
 
 try:
     import tensorflow
@@ -143,82 +143,9 @@ def __get_sklearn_version(version):
 MSG_TF1_NOT_FOUND = "TensorFlow 1.x not found."
 MSG_TF2_NOT_FOUND = "TensorFlow 2.x not found."
 
-# ---------------------------------------------------------------------------------------
-_HAS_KERAS_TF = True
-_HAS_KERAS2_TF = True
-_KERAS_MIN_VERSION = "1.2.2"
-_KERAS_MAX_VERSION = "2.6.0"
-MSG_KERAS1_NOT_FOUND = "Keras 1 not found."
-MSG_KERAS2_NOT_FOUND = "Keras 2 not found."
-
-try:
-    # Prevent keras from printing things that are not errors to standard error.
-    import sys
-
-    import io
-
-    temp = io.StringIO()
-    stderr = sys.stderr
-    try:
-        sys.stderr = temp
-        import keras
-    except:
-        # Print out any actual error message and re-raise.
-        sys.stderr = stderr
-        sys.stderr.write(temp.getvalue())
-        raise
-    finally:
-        sys.stderr = stderr
-    import tensorflow
-
-    k_ver = _get_version(keras.__version__)
-
-    # keras 1 version too old
-    if k_ver < _StrictVersion(_KERAS_MIN_VERSION):
-        _HAS_KERAS_TF = False
-        _HAS_KERAS2_TF = False
-        _logging.warning(
-            (
-                "Keras version %s is not supported. Minimum required version: %s ."
-                "Keras conversion will be disabled."
-            )
-            % (keras.__version__, _KERAS_MIN_VERSION)
-        )
-    # keras version too new
-    if k_ver > _StrictVersion(_KERAS_MAX_VERSION):
-        _HAS_KERAS_TF = False
-        _logging.warning(
-            (
-                "Keras version %s has not been tested with coremltools. You may run into unexpected errors. "
-                "Keras %s is the most recent version that has been tested."
-            )
-            % (keras.__version__, _KERAS_MAX_VERSION)
-        )
-    # Using Keras 2 rather than 1
-    if k_ver >= _StrictVersion("2.0.0"):
-        _HAS_KERAS_TF = False
-        _HAS_KERAS2_TF = True
-    # Using Keras 1 rather than 2
-    else:
-        _HAS_KERAS_TF = True
-        _HAS_KERAS2_TF = False
-    if keras.backend.backend() != "tensorflow":
-        _HAS_KERAS_TF = False
-        _HAS_KERAS2_TF = False
-        _logging.warning(
-            (
-                "Unsupported Keras backend (only TensorFlow is currently supported). "
-                "Keras conversion will be disabled."
-            )
-        )
-
-except:
-    _HAS_KERAS_TF = False
-    _HAS_KERAS2_TF = False
-
 # ---------------------------------------------------------------------------------------
 _HAS_TORCH = True
-_TORCH_MAX_VERSION = "1.10.2"
+_TORCH_MAX_VERSION = "1.11.0"
 try:
     import torch
     _warn_if_above_max_supported_version("Torch", torch.__version__, _TORCH_MAX_VERSION)
@@ -228,13 +155,6 @@ def __get_sklearn_version(version):
 
 
 # ---------------------------------------------------------------------------------------
-_HAS_ONNX = True
-try:
-    import onnx
-except:
-    _HAS_ONNX = False
-MSG_ONNX_NOT_FOUND = "ONNX not found."
-
 try:
     import scipy
 except:
diff --git a/coremltools/converters/__init__.py b/coremltools/converters/__init__.py
index 6e4a44247..bca49bbbc 100644
--- a/coremltools/converters/__init__.py
+++ b/coremltools/converters/__init__.py
@@ -7,11 +7,10 @@
 from . import libsvm
 from . import sklearn
 from . import xgboost
-from . import keras
-from . import onnx
 from ._converters_entry import convert
 from .mil import (
     ClassifierConfig,
+    ColorLayout,
     TensorType,
     ImageType,
     RangeDim,
diff --git a/coremltools/converters/_converters_entry.py b/coremltools/converters/_converters_entry.py
index 26ed93251..7277bc0bb 100644
--- a/coremltools/converters/_converters_entry.py
+++ b/coremltools/converters/_converters_entry.py
@@ -10,16 +10,23 @@
 
 from coremltools import (
     ComputeUnit as _ComputeUnit,
-    __version__ as _ct_version
+    __version__ as _ct_version,
+    _LOWEST_ALLOWED_SPECIFICATION_VERSION_FOR_NEURALNETWORK,
+    _LOWEST_ALLOWED_SPECIFICATION_VERSION_FOR_MILPROGRAM,
 )
 from coremltools.converters.mil.mil.passes.quantization_passes import (
     AbstractQuantizationPass,
     ComputePrecision as precision,
     FP16ComputePrecision
 )
-from coremltools.converters.mil.input_types import InputType, ClassifierConfig
+from coremltools.converters.mil.input_types import (
+    ClassifierConfig,
+    ImageType,
+    InputType,
+    TensorType,
+)
 from coremltools.converters.mil.converter import mil_convert
-from coremltools.converters.mil.mil import Program
+from coremltools.converters.mil.mil import Program, types
 from coremltools._deps import _HAS_TORCH, _HAS_TF_1, _HAS_TF_2
 from coremltools.converters._profile_utils import _profile
 
@@ -55,7 +62,6 @@ def convert(
     compute_precision=None,
     skip_model_load=False,
     compute_units=_ComputeUnit.ALL,
-    useCPUOnly=False,
     package_dir=None,
     debug=False,
 ):
@@ -101,39 +107,79 @@ def convert(
 
     inputs : list of ``TensorType`` or ``ImageType``
 
-        * TensorFlow 1 and 2
+        * If "dtype" is specified in ``TensorType`` / ``ImageType``,
+          it will be applied to the input of the converted model.
+
+          e.g.: The following code snippet will produce a CoreML model with Float16 typed inputs.
+          >>> import coremltools as ct
+          >>> mlmodel = ct.convert(keras_model,
+          >>>                      inputs=[ct.TensorType(dtype=np.float16)],
+          >>>                      minimum_deployment_target=ct.target.macOS13)
+
+          e.g.: The following code snippet will produce a CoreML model with Grayscale 16 bit input image type
+          >>> import coremltools as ct
+          >>> # H : image height, W: image width
+          >>> mlmodel = ct.convert(torch_model,
+          >>>                      inputs=[ct.ImageType(shape=(1, 1, H, W),
+          >>>                              color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)],
+          >>>                      minimum_deployment_target=ct.target.macOS13)
+
+        * TensorFlow 1 and 2 (including tf.keras)
 
             - The ``inputs`` parameter is optional. If not provided, the inputs
               are placeholder nodes in the model (if the model is frozen graph)
               or function inputs (if the model is a ``tf.function``).
-            - The inputs must correspond to all or some of the placeholder nodes
-              in the TF model.
-            - ``TensorType`` and ``ImageType`` in ``inputs`` must have the ``name``
-              specified. ``shape`` is optional.
             - If ``inputs`` is provided, it must be a flat list.
+            - The ``inputs`` must correspond to all or some of the placeholder nodes
+              in the TF model.
+            - If ``name`` is specified in ``TensorType`` and ``ImageType``, it
+              must correspond to a placeholder op in the TF graph. The input names
+              in the converted CoreML model can later be modified using ``ct.utils.rename_feature`` API.
+            - If ``dtype`` is not specified, it defaults to the dtype of the inputs in the TF model.
 
         * PyTorch
 
             - The ``inputs`` parameter is required.
+            - Number of elements in the ``inputs`` must match the number of inputs of the pytorch model.
             - ``inputs`` may be a nested list or tuple.
-            - ``TensorType`` and ``ImageType`` in ``inputs`` must have the ``name``
-              and ``shape`` specified.
-
-    outputs : list[str] (optional)
-
-        * TensorFlow 1 and 2
-
-            - The ``outputs`` parameter is optional.
-
-            - If specified, ``outputs`` is a list of string representing node
-              names.
-
-            - If ``outputs`` is not specified, the converter infers outputs to
-              all be terminal identity nodes.
+            - ``TensorType`` and ``ImageType`` must have the ``shape`` specified.
+            - If ``name`` argument is specified in ``TensorType`` / ``ImageType``, the converted
+                CoreML model will have inputs with the same name.
+            - If ``dtype`` is missing, it defaults to float32
+
+    outputs : list of ``TensorType`` or ``ImageType`` (optional)
+
+        * If "dtype" is specified in ``TensorType`` / ``ImageType``,
+          it will be applied to the output of the converted model.
+
+          e.g.: to produce float 16 typed inputs and outputs:
+          >>> import coremltools as ct
+          >>> mlmodel = ct.convert(keras_model,
+          >>>                      inputs=[ct.TensorType(dtype=np.float16)],
+          >>>                      outputs=[ct.TensorType(dtype=np.float16)],
+          >>>                      minimum_deployment_target=ct.target.macOS13)
+
+          e.g.: to produce Image inputs and outputs:
+          >>> import coremltools as ct
+          >>> # H: image height, W: image width
+          >>> mlmodel = ct.convert(torch_model,
+          >>>                      inputs=[ct.ImageType(shape=(1, 3, H, W), color_layout=ct.colorlayout.RGB)],
+          >>>                      outputs=[ct.ImageType(color_layout=ct.colorlayout.RGB)],
+          >>>                      minimum_deployment_target=ct.target.macOS13)
+
+        * TensorFlow 1 and 2 (including tf.keras)
+
+            - If ``outputs`` is not specified, the converter infers outputs from the
+              sink nodes in the graph.
+            - If specified, the ``name`` in ``TensorType`` / ``ImageType`` must correspond
+              to a node in the TF graph. In this case, the model will be converted up to
+              that node.
 
         * PyTorch
 
-            - ``outputs`` must not be specified.
+            - If specified, the length of the list must match the number of outputs returned by the
+              torch model
+            - If ``name`` is specified it is applied to the output names of the converted coreml model.
 
     classifier_config : ClassifierConfig class (optional)
         The configuration if the MLModel is intended to be a classifier.
@@ -201,34 +247,63 @@ def convert(
 
     compute_precision : coremltools.precision enumeration or ct.transform.FP16ComputePrecision() (optional)
 
+        Use this argument to control the storage precision of the tensors in the mlprogram.
+
         Must be one of the following.
-        
-        - ``coremltools.precision.FLOAT16``
-            - The following transform is applied:
+
+        - ``coremltools.precision.FLOAT16`` enum
+            - In this case the following transform is applied to produce a float16 typed program,
+                i.e. a program where all the intermediate float tensors have type float16
+                (for ops that support that type).
+
               ::
                  coremltools.transform.FP16ComputePrecision(op_selector=
                                                          lambda op:True)
 
-              The above transform injects ``cast`` ops to convert the
-              float32 dtypes of intermediate tensors to float16.
-        - ``coremltools.precision.FLOAT32``
+              The above transform iterates over all the ops. For each op,
+              it looks at its inputs and outputs, and if they are of type float32, ``cast``
+              ops are injected to convert those tensors (aka vars) to float16 type.
+
+        - ``coremltools.precision.FLOAT32`` enum
             - No transform is applied. The original float32 tensor dtype in
-              the source model is preserved.
+              the source model is preserved. Opt into this option if the default converted model
+              is displaying numerical precision issues.
+
         - ``coremltools.transform.FP16ComputePrecision(op_selector=...)``
-            - Use the above to control which tensors are cast to float16.
+            - Use this option to control which tensors are cast to float16.
+              Before casting the inputs/outputs of any op from float32 to float16,
+              the op_selector function is invoked on the op object. This function
+              must return a boolean value. By default it is set to return True for every op,
+              however, this can be customized.
             - For example:
               ::
                  coremltools.transform.FP16ComputePrecision(op_selector=
                                          lambda op: op.op_type != "linear")
 
               The above casts all the float32 tensors to be float16, except
-              the input/output tensors to any ``linear`` op.
+              the input/output tensors to any ``linear`` op. See more examples
+              on this below.
+
         - ``None``
+            - This is the default.
             - When ``convert_to="mlprogram"``, compute_precision parameter
               defaults to ``coremltools.precision.FLOAT16``.
             - When ``convert_to="neuralnetwork"``, compute_precision parameter
               needs to be ``None`` and has no meaning.
 
+            e.g.: Customize the float16 precision transform to prevent casting any of the "real_div"
+                  ops in the program to float16 precision:
+
+            >>> def skip_real_div_ops(op):
+            >>>     if op.op_type == "real_div":
+            >>>         return False
+            >>>     return True
+            >>>
+            >>> model = ct.convert(source_model,
+            >>>                    compute_precision=ct.transform.FP16ComputePrecision(op_selector=skip_real_div_ops),
+            >>>                    minimum_deployment_target=ct.target.iOS15
+            >>>                    )
+
     skip_model_load : bool
         Set to True to prevent coremltools from calling into the Core ML framework
         to compile and load the model, post-conversion. In that case, the returned
@@ -253,11 +328,6 @@ def convert(
             - ``coremltools.ComputeUnit.CPU_AND_GPU``: Use both the CPU and GPU, but not the
               neural engine.
 
-    useCPUOnly: bool
-        Deprecated, to be removed in coremltools 6.0. Please use `compute_units` instead.
-        - if True, identical to setting compute_units to `coremltools.ComputeUnit.CPU_ONLY``
-        - if False, identical to setting compute_units to `coremltools.ComputeUnit.ALL``
-
     package_dir : str
         Post conversion, the model is saved at a temporary location and
         loaded to form the MLModel object ready for prediction.
@@ -322,16 +392,15 @@ def convert(
     more advanced options.
     """
     _check_deployment_target(minimum_deployment_target)
-    exact_source = _determine_source(model, source, outputs)
+    outputs_as_strings, outputs_as_tensor_or_image_types = _validate_outputs_argument(outputs)
+    exact_source = _determine_source(model, source,
+                                     outputs_as_strings,
+                                     outputs_as_tensor_or_image_types,
+                                     outputs)
     exact_target = _determine_target(convert_to, minimum_deployment_target)
-    _validate_inputs(model, exact_source, inputs, outputs, classifier_config, compute_precision,
-                     exact_target)
-
-    if useCPUOnly:
-        warnings.warn('The "useCPUOnly" parameter is deprecated and will be removed in 6.0. '
-                      'Use the compute_units parameter: "compute_units=coremotools.ComputeUnits.CPU_ONLY".')
-        compute_units = _ComputeUnit.CPU_ONLY
-
+    _validate_conversion_arguments(model, exact_source, inputs, outputs_as_tensor_or_image_types,
+                                   classifier_config, compute_precision,
+                                   exact_target, minimum_deployment_target)
 
     if compute_precision is None:
         transforms = [FP16ComputePrecision(op_selector=lambda op: True)] if convert_to != "neuralnetwork" else list()
@@ -349,18 +418,24 @@ def convert(
         if ext != _MLPACKAGE_EXTENSION:
             raise Exception("If package_dir is provided, it must have extension {} (not {})".format(_MLPACKAGE_EXTENSION, ext))
 
+    specification_version = minimum_deployment_target.value if minimum_deployment_target is not None else None
+    
+    if specification_version is None:
+        specification_version = _set_default_specification_version(exact_target)
+
     mlmodel = mil_convert(
         model,
         convert_from=exact_source,
         convert_to=exact_target,
         inputs=inputs,
-        outputs=outputs,
+        outputs=outputs_as_tensor_or_image_types, # None or list[ct.ImageType/ct.TensorType]
         classifier_config=classifier_config,
         transforms=tuple(transforms),
         skip_model_load=skip_model_load,
         compute_units=compute_units,
         package_dir=package_dir,
         debug=debug,
+        specification_version=specification_version,
     )
 
     if exact_target == 'milinternal':
@@ -379,6 +454,16 @@ def convert(
 
     return mlmodel
 
+def _set_default_specification_version(target):
+    if target == "neuralnetwork":
+        return _LOWEST_ALLOWED_SPECIFICATION_VERSION_FOR_NEURALNETWORK
+    elif target == "mlprogram":
+        return _LOWEST_ALLOWED_SPECIFICATION_VERSION_FOR_MILPROGRAM
+    elif target == "milinternal":
+        return None
+    else:
+        raise NotImplementedError("Backend converter {} not implemented".format(target))
+
 
 def _check_deployment_target(minimum_deployment_target):
     if minimum_deployment_target is not None and \
@@ -390,9 +475,79 @@ def _check_deployment_target(minimum_deployment_target):
         )
         raise TypeError(msg.format(minimum_deployment_target))
 
-def _validate_inputs(model, exact_source, inputs, outputs, classifier_config, compute_precision, convert_to):
+def _validate_outputs_argument(outputs):
+    """
+    - validate properties that the "outputs" argument must satisfy, for instance, it should either be a list
+      of ct.ImageType/ct.TensorType or a list of strings, etc.
+    - return : tuple
+        - (outputs_as_strings, outputs_as_tensor_or_image_types)
+        - outputs_as_strings: list[str]
+        - outputs_as_tensor_or_image_types : list[ct.ImageType] or list[ct.TensorType]
+    """
+    if outputs is None:
+        return None, None
+    else:
+        if not isinstance(outputs, list):
+            msg = '"outputs" must be of type list'
+            raise ValueError(msg)
+        if len(outputs) == 0:
+            return None, None
+        if not all([isinstance(t, TensorType) or isinstance(t, ImageType) or isinstance(t, str) for t in outputs]):
+            msg = '"outputs" must be a list of type ct.TensorType or ct.ImageType or strings'
+            raise ValueError(msg)
+
+        msg_inconsistent_types = 'all elements of "outputs" must either be of type str ' \
+                                 'or of types ct.ImageType/ct.TensorType'
+        if isinstance(outputs[0], str):
+            # if one of the elements is a string, all elements must be strings
+            if not all([isinstance(t, str) for t in outputs]):
+                raise ValueError(msg_inconsistent_types)
+            return outputs, [TensorType(name=name) for name in outputs]
+
+        if isinstance(outputs[0], InputType):
+            if not all([isinstance(t, TensorType) or isinstance(t, ImageType) for t in outputs]):
+                raise ValueError(msg_inconsistent_types)
+            if any([t.shape is not None for t in outputs]):
+                msg = "The 'shape' argument must not be specified for the outputs, since it is " \
+                      "automatically inferred from the input shapes and the ops in the model"
+                raise ValueError(msg)
+            for out_ in outputs:
+                if isinstance(out_, TensorType):
+                    if out_.default_value is not None:
+                        raise ValueError("The 'default_value' argument must not be specified for the outputs")
+                if isinstance(out_, ImageType):
+                    if out_.scale != 1.0:
+                        raise ValueError("'scale' must be 1.0 for a output of ImageType")
+                    if not (out_.bias is None or out_.bias == 0.0 or out_.bias == [0.0, 0.0, 0.0]):
+                        raise ValueError("'bias' must be None or 0 for an output of ImageType")
+                    if out_.channel_first is not None:
+                        raise ValueError("'channel_first' must be None for an output of ImageType")
+            output_names = [t.name for t in outputs]
+            # verify that either all of the entries in output_names are "None" or none of them is "None"
+            msg_consistent_names = 'Either none or all the outputs must have the "name" argument specified'
+            if output_names[0] is None and not all([name is None for name in output_names]):
+                raise ValueError(msg_consistent_names)
+            if output_names[0] is not None and not all([name is not None for name in output_names]):
+                raise ValueError(msg_consistent_names)
+            if output_names[0] is not None:
+                if len(set(output_names)) != len(output_names):
+                    raise ValueError("Duplicate names provided in 'outputs'")
+            if output_names[0] is None:
+                return None, outputs
+            else:
+                return output_names, outputs
+
+def _validate_conversion_arguments(model,
+                                   exact_source,
+                                   inputs,
+                                   outputs,
+                                   classifier_config,
+                                   compute_precision,
+                                   convert_to,
+                                   minimum_deployment_target,
+                                   ):
     """
-    Validate and process model, inputs, outputs, classifier_config based on
+    Validate and process model, inputs, classifier_config based on
     `exact_source` (which cannot be `auto`)
     """
     def raise_if_duplicated(input_list):
@@ -406,11 +561,47 @@ def raise_if_duplicated(input_list):
         if len(dups) > 0:
             raise ValueError("Duplicated inputs: {}".format(dups))
 
+    def _flatten_list(_inputs):
+        ret = []
+        for _input in _inputs:
+            if isinstance(_input, (list, tuple)):
+                ret.extend(_flatten_list(_input))
+            elif isinstance(_input, InputType):
+                ret.append(_input)
+            else:
+                raise ValueError(
+                    "Unknown type {} for flattening into InputType.".format(
+                        type(_input)
+                    )
+                )
+        return ret
+
+    flat_inputs = None
     if inputs is not None:
         if not isinstance(inputs, list):
             msg = '"inputs" must be of type list'
             raise ValueError(msg)
 
+        # get flattened inputs
+        flat_inputs = _flatten_list(inputs)
+        for t in flat_inputs:
+            if not isinstance(t, InputType):
+                msg = 'inputs must be a list of type ct.TensorType or ct.ImageType'
+                raise ValueError(msg)
+            if t.dtype == types.fp16:
+                if not (minimum_deployment_target is not None and \
+                    minimum_deployment_target >= AvailableTarget.iOS16):
+                    msg = "float16 dtype for inputs is only supported for deployment target >= iOS16/macOS13/watchOS9/tvOS16"
+                    raise TypeError(msg)
+
+    if outputs is not None:
+        for t in outputs:
+            if t.dtype == types.fp16:
+                if not (minimum_deployment_target is not None and \
+                    minimum_deployment_target >= AvailableTarget.iOS16):
+                    msg = "float16 dtype for outputs is only supported for deployment target >= iOS16/macOS13/watchOS9/tvOS16"
+                    raise TypeError(msg)
+
     if classifier_config is not None:
         if not isinstance(classifier_config, ClassifierConfig):
             msg = '"classifier_config" must be of type ClassifierConfig'
@@ -448,22 +639,6 @@ def raise_if_duplicated(input_list):
             msg = 'Expected argument for pytorch "inputs" not provided'
             raise ValueError(msg)
 
-        def _flatten_list(_inputs):
-            ret = []
-            for _input in _inputs:
-                if isinstance(_input, (list, tuple)):
-                    ret.extend(_flatten_list(_input))
-                elif isinstance(_input, InputType):
-                    ret.append(_input)
-                else:
-                    raise ValueError(
-                        "Unknown type {} for flattening into InputType.".format(
-                            type(_input)
-                        )
-                    )
-            return ret
-
-        flat_inputs = _flatten_list(inputs)
         raise_if_duplicated(flat_inputs)
         if inputs is not None and not all(
             [isinstance(_input, InputType) for _input in flat_inputs]
@@ -471,8 +646,6 @@ def _flatten_list(_inputs):
             raise ValueError(
                 "Input should be a list/tuple (or nested lists/tuples) of TensorType or ImageType"
             )
-        if outputs is not None:
-            raise ValueError("outputs must not be specified for PyTorch")
 
     elif exact_source == "milinternal":
         if not isinstance(model, Program):
@@ -480,7 +653,10 @@ def _flatten_list(_inputs):
             raise ValueError(msg)
 
 
-def _determine_source(model, source, outputs):
+def _determine_source(model, source,
+                      output_names,
+                      outputs_as_tensor_or_image_types,
+                      output_argument_as_specified_by_user):
     """
     Infer source (which can be auto) to the precise framework.
     """
@@ -503,26 +679,36 @@ def _determine_source(model, source, outputs):
     # Determine `auto` source
     if source == "auto" and _HAS_TF_1:
         try:
-            loader = TF1Loader(model, outputs=outputs)
-            loader._graph_def_from_model(outputs=outputs)
+            loader = TF1Loader(model, outputs=outputs_as_tensor_or_image_types)
+            loader._graph_def_from_model(output_names=output_names)
             return "tensorflow"
         except:
             pass
 
     if source == "auto" and _HAS_TF_2:
         try:
-            loader = TF2Loader(model, outputs=outputs)
-            loader._graph_def_from_model(outputs=outputs)
+            loader = TF2Loader(model, outputs=outputs_as_tensor_or_image_types)
+            loader._graph_def_from_model(output_names=output_names)
             return "tensorflow2"
         except:
             pass
 
     if source == "auto" and _HAS_TORCH:
+        is_torch_load_successful = False
         try:
             pytorch_load(model)
-            return "pytorch"
+            is_torch_load_successful = True
         except:
             pass
+        if is_torch_load_successful:
+            # validate that the outputs passed by the user are of type ImageType/TensorType
+            if output_argument_as_specified_by_user is not None and \
+                not all([isinstance(t, TensorType) or isinstance(t, ImageType) \
+                        for t in output_argument_as_specified_by_user]):
+                msg = '"outputs" must be a list of type ct.TensorType or ct.ImageType for pytorch conversion'
+                raise ValueError(msg)
+            return "pytorch"
+
 
     if source == "auto" and isinstance(model, Program):
         return "milinternal"
@@ -544,12 +730,12 @@ def _determine_target(convert_to, minimum_deployment_target):
     """
     if minimum_deployment_target is not None:
         if convert_to == "mlprogram" and \
-            minimum_deployment_target.value < AvailableTarget.iOS15.value:
+            minimum_deployment_target < AvailableTarget.iOS15:
                 msg = "When 'convert_to' is {}, the minimum deployment target must be at least iOS15/macOS12/watchOS8/tvOS15"
                 raise ValueError(msg.format(convert_to))
 
         if convert_to == "neuralnetwork" and \
-            minimum_deployment_target.value >= AvailableTarget.iOS15.value:
+            minimum_deployment_target >= AvailableTarget.iOS15:
             msg = "If minimum deployment target is iOS15/macOS12/watchOS8/tvOS15 or higher, then " \
                   "'convert_to' cannot be {}. It must be 'mlprogram'"
             raise ValueError(msg.format(convert_to))
@@ -559,22 +745,41 @@ def _determine_target(convert_to, minimum_deployment_target):
     else:
         if minimum_deployment_target is None:
             return "neuralnetwork"
-        elif minimum_deployment_target.value <= AvailableTarget.iOS14.value:
+        elif minimum_deployment_target <= AvailableTarget.iOS14:
             return "neuralnetwork"
         else:
             return "mlprogram"
 
+
+def _get_metadata_from_mlmodel(mlmodel):
+    # Copy from source mlmodel if metadata info exists
+    src_pkg_version = mlmodel.user_defined_metadata[_METADATA_SOURCE]
+    coremltools_version = mlmodel.user_defined_metadata[_METADATA_VERSION]
+
+    src_pkg_version_list = src_pkg_version.split("==")
+    if len(src_pkg_version_list) == 0:
+        src_pkg, pkg_ver = None, None
+    elif len(src_pkg_version_list) == 1:
+        src_pkg, pkg_ver = src_pkg_version_list[0], ""
+    elif len(src_pkg_version_list) == 2:
+        src_pkg, pkg_ver = src_pkg_version_list
+    else:
+        raise AssertionError("Unable to parse src_pkg_version")
+
+    build_info = {'coremltools-version': _ct_version if not coremltools_version else coremltools_version}
+    if src_pkg is not None and pkg_ver is not None:
+        build_info['coremltools-component-' + src_pkg] = str(pkg_ver)
+
+    return build_info
+
+
 def _record_build_metadata(mlmodel, exact_source):
     # recording metadata: coremltools version, source framework and version
-    src_pkg, pkg_ver = None, None
     if exact_source in {"tensorflow", "tensorflow2"} and (_HAS_TF_1 or _HAS_TF_2):
-        src_pkg, pkg_ver = "tensorflow", tf.__version__
         src_pkg_version = "tensorflow=={0}".format(tf.__version__)
     elif exact_source == "pytorch" and _HAS_TORCH:
-        src_pkg, pkg_ver = "pytorch", torch.__version__
         src_pkg_version = "torch=={0}".format(torch.__version__)
     elif exact_source == 'milinternal':
-        src_pkg, pkg_ver = "milinternal", ""
         src_pkg_version = "milinternal"
     else:
         raise ValueError('Unsupported source {}'.format(exact_source))
@@ -582,9 +787,8 @@ def _record_build_metadata(mlmodel, exact_source):
     mlmodel.user_defined_metadata[_METADATA_SOURCE] = src_pkg_version
     mlmodel.user_defined_metadata[_METADATA_VERSION] = _ct_version
 
-    build_info = {'coremltools-version': _ct_version}
-    if src_pkg is not None and pkg_ver is not None:
-        build_info['coremltools-component-' + src_pkg] = str(pkg_ver)
+    build_info = _get_metadata_from_mlmodel(mlmodel)
+
     mlmodel._set_build_info_mil_attributes(build_info)
 
     return mlmodel
diff --git a/coremltools/converters/keras/__init__.py b/coremltools/converters/keras/__init__.py
deleted file mode 100644
index 2ae650f55..000000000
--- a/coremltools/converters/keras/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2017, Apple Inc. All rights reserved.
-#
-# Use of this source code is governed by a BSD-3-clause license that can be
-# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-
-from ..._deps import _HAS_KERAS_TF
-from ..._deps import _HAS_KERAS2_TF
-
-if _HAS_KERAS_TF or _HAS_KERAS2_TF:
-    import keras as _keras
-    import logging as _logging
-    from ._keras_converter import convert
-
-    if _keras.backend.backend() != "tensorflow":
-        _HAS_KERAS_TF = False
-        _HAS_KERAS2_TF = False
-        _logging.warning(
-            "Currently, only Keras models with TensorFlow backend can be converted to Core ML."
-        )
diff --git a/coremltools/converters/keras/_keras2_converter.py b/coremltools/converters/keras/_keras2_converter.py
deleted file mode 100644
index 90cff43d4..000000000
--- a/coremltools/converters/keras/_keras2_converter.py
+++ /dev/null
@@ -1,637 +0,0 @@
-# Copyright (c) 2017-2019, Apple Inc. All rights reserved.
-#
-# Use of this source code is governed by a BSD-3-clause license that can be
-# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-import logging
-
-from ...models.neural_network import NeuralNetworkBuilder as _NeuralNetworkBuilder
-from ...models.neural_network.update_optimizer_utils import AdamParams
-from ...models.neural_network.update_optimizer_utils import SgdParams
-from ...proto import FeatureTypes_pb2 as _FeatureTypes_pb2
-from collections import OrderedDict as _OrderedDict
-from ...proto import Model_pb2 as _Model_pb2
-from ...models import datatypes
-from ...models import MLModel as _MLModel
-from ...models.utils import save_spec as _save_spec
-
-from ..._deps import _HAS_KERAS2_TF
-
-if _HAS_KERAS2_TF:
-    import keras as _keras
-    from . import _layers2
-    from . import _topology2
-
-    _KERAS_LAYER_REGISTRY = {
-        _keras.layers.core.Dense: _layers2.convert_dense,
-        _keras.layers.core.Activation: _layers2.convert_activation,
-        _keras.layers.advanced_activations.LeakyReLU: _layers2.convert_activation,
-        _keras.layers.advanced_activations.PReLU: _layers2.convert_activation,
-        _keras.layers.advanced_activations.ELU: _layers2.convert_activation,
-        _keras.layers.advanced_activations.ThresholdedReLU: _layers2.convert_activation,
-        _keras.layers.advanced_activations.Softmax: _layers2.convert_activation,
-        _keras.layers.convolutional.Conv2D: _layers2.convert_convolution,
-        _keras.layers.convolutional.Conv2DTranspose: _layers2.convert_convolution,
-        _keras.layers.convolutional.SeparableConv2D: _layers2.convert_separable_convolution,
-        _keras.layers.pooling.AveragePooling2D: _layers2.convert_pooling,
-        _keras.layers.pooling.MaxPooling2D: _layers2.convert_pooling,
-        _keras.layers.pooling.GlobalAveragePooling2D: _layers2.convert_pooling,
-        _keras.layers.pooling.GlobalMaxPooling2D: _layers2.convert_pooling,
-        _keras.layers.convolutional.ZeroPadding2D: _layers2.convert_padding,
-        _keras.layers.convolutional.Cropping2D: _layers2.convert_cropping,
-        _keras.layers.convolutional.UpSampling2D: _layers2.convert_upsample,
-        _keras.layers.convolutional.Conv1D: _layers2.convert_convolution1d,
-        _keras.layers.pooling.AveragePooling1D: _layers2.convert_pooling,
-        _keras.layers.pooling.MaxPooling1D: _layers2.convert_pooling,
-        _keras.layers.pooling.GlobalAveragePooling1D: _layers2.convert_pooling,
-        _keras.layers.pooling.GlobalMaxPooling1D: _layers2.convert_pooling,
-        _keras.layers.convolutional.ZeroPadding1D: _layers2.convert_padding,
-        _keras.layers.convolutional.Cropping1D: _layers2.convert_cropping,
-        _keras.layers.convolutional.UpSampling1D: _layers2.convert_upsample,
-        _keras.layers.recurrent.LSTM: _layers2.convert_lstm,
-        _keras.layers.recurrent.SimpleRNN: _layers2.convert_simple_rnn,
-        _keras.layers.recurrent.GRU: _layers2.convert_gru,
-        _keras.layers.wrappers.Bidirectional: _layers2.convert_bidirectional,
-        _keras.layers.normalization.BatchNormalization: _layers2.convert_batchnorm,
-        _keras.layers.Add: _layers2.convert_merge,
-        _keras.layers.Multiply: _layers2.convert_merge,
-        _keras.layers.Average: _layers2.convert_merge,
-        _keras.layers.Maximum: _layers2.convert_merge,
-        _keras.layers.Concatenate: _layers2.convert_merge,
-        _keras.layers.Dot: _layers2.convert_merge,
-        _keras.layers.core.Flatten: _layers2.convert_flatten,
-        _keras.layers.core.Permute: _layers2.convert_permute,
-        _keras.layers.core.Reshape: _layers2.convert_reshape,
-        _keras.layers.embeddings.Embedding: _layers2.convert_embedding,
-        _keras.layers.core.RepeatVector: _layers2.convert_repeat_vector,
-        _keras.layers.core.Dropout: _layers2.default_skip,
-        _keras.layers.core.SpatialDropout2D: _layers2.default_skip,
-        _keras.layers.core.SpatialDropout1D: _layers2.default_skip,
-        _keras.layers.wrappers.TimeDistributed: _layers2.default_skip,
-    }
-    from distutils.version import StrictVersion as _StrictVersion
-
-    ## 2.2 Version check
-    if _keras.__version__ >= _StrictVersion("2.2.0"):
-        _KERAS_LAYER_REGISTRY[
-            _keras.layers.DepthwiseConv2D
-        ] = _layers2.convert_convolution
-        _KERAS_LAYER_REGISTRY[
-            _keras.engine.input_layer.InputLayer
-        ] = _layers2.default_skip
-        if _keras.__version__ >= _StrictVersion("2.2.1"):
-            _KERAS_LAYER_REGISTRY[
-                _keras.layers.advanced_activations.ReLU
-            ] = _layers2.convert_advanced_relu
-    else:
-        _KERAS_LAYER_REGISTRY[
-            _keras.applications.mobilenet.DepthwiseConv2D
-        ] = _layers2.convert_convolution
-        _KERAS_LAYER_REGISTRY[_keras.engine.topology.InputLayer] = _layers2.default_skip
-    # end if _HAS_KERAS2_TF
-
-
-def _is_merge_layer(layer):
-    if _HAS_KERAS2_TF:
-        for lt in _topology2._KERAS_MERGE_LAYERS:
-            if isinstance(layer, lt):
-                return True
-    return False
-
-
-def _is_activation_layer(layer):
-    return (
-        isinstance(layer, _keras.layers.core.Activation)
-        or isinstance(layer, _keras.layers.advanced_activations.LeakyReLU)
-        or isinstance(layer, _keras.layers.advanced_activations.PReLU)
-        or isinstance(layer, _keras.layers.advanced_activations.ELU)
-        or isinstance(layer, _keras.layers.advanced_activations.ThresholdedReLU)
-        or isinstance(layer, _keras.layers.advanced_activations.Softmax)
-    )
-
-
-def _check_unsupported_layers(model, add_custom_layers=False):
-    # When add_custom_layers = True, we just convert all layers not present in
-    # registry as custom layer placeholders
-    if add_custom_layers:
-        return
-    for i, layer in enumerate(model.layers):
-        if isinstance(layer, _keras.models.Sequential) or isinstance(
-            layer, _keras.models.Model
-        ):
-            _check_unsupported_layers(layer)
-        else:
-            if type(layer) not in _KERAS_LAYER_REGISTRY:
-                raise ValueError("Keras layer '%s' not supported. " % str(type(layer)))
-            if isinstance(layer, _keras.layers.wrappers.TimeDistributed):
-                if type(layer.layer) not in _KERAS_LAYER_REGISTRY:
-                    raise ValueError(
-                        "Keras layer '%s' not supported. " % str(type(layer.layer))
-                    )
-            if isinstance(layer, _keras.layers.wrappers.Bidirectional):
-                if not isinstance(layer.layer, _keras.layers.recurrent.LSTM):
-                    raise ValueError(
-                        "Keras bi-directional wrapper conversion supports "
-                        "only LSTM layer at this time. "
-                    )
-
-
-def _get_layer_converter_fn(layer, add_custom_layers=False):
-    """Get the right converter function for Keras
-    """
-    layer_type = type(layer)
-    if layer_type in _KERAS_LAYER_REGISTRY:
-        convert_func = _KERAS_LAYER_REGISTRY[layer_type]
-        if convert_func is _layers2.convert_activation:
-            act_name = _layers2._get_activation_name_from_keras_layer(layer)
-            if act_name == "CUSTOM":
-                return None
-        return convert_func
-    elif add_custom_layers:
-        return None
-    else:
-        raise TypeError("Keras layer of type %s is not supported." % type(layer))
-
-
-def _load_keras_model(model_network_path, model_weight_path, custom_objects=None):
-    """Load a keras model from disk
-
-    Parameters
-    ----------
-    model_network_path: str
-        Path where the model network path is (json file)
-
-    model_weight_path: str
-        Path where the model network weights are (hd5 file)
-
-    custom_objects:
-        A dictionary of layers or other custom classes
-        or functions used by the model
-
-    Returns
-    -------
-    model: A keras model
-    """
-    from keras.models import model_from_json
-    import json
-
-    # Load the model network
-    json_file = open(model_network_path, "r")
-    loaded_model_json = json_file.read()
-    json_file.close()
-
-    if not custom_objects:
-        custom_objects = {}
-
-    # Load the model weights
-    loaded_model = model_from_json(loaded_model_json, custom_objects=custom_objects)
-    loaded_model.load_weights(model_weight_path)
-
-    return loaded_model
-
-
-def _convert_training_info(model, builder, output_features):
-    """
-    Convert the training information from the given Keras 'model' into the Core
-    ML in 'builder'.
-
-    :param model: keras.model.Sequential
-        The source Keras model.
-    :param builder: NeutralNetworkBuilder
-        The target model that will gain the loss and optimizer.
-    :param output_features: list of tuples, (str, datatype)
-        The set of tensor names that are output from the layers in the Keras
-        model.
-    """
-    # Keras does not have a number of epochs compiled into the model, so we
-    # invent one here for ease of use.  1 makes the most sense, as the user
-    # can just invoke training repeatedly if they'd like to do more.
-    builder.set_epochs(1)
-    import keras
-
-    try:
-        if (
-            model.loss == keras.losses.categorical_crossentropy
-            or model.loss == "categorical_crossentropy"
-        ):
-            builder.set_categorical_cross_entropy_loss(
-                name="loss_layer", input=output_features[0][0]
-            )
-        elif (
-            model.loss == keras.losses.mean_squared_error
-            or model.loss == "mean_squared_error"
-        ):
-            builder.set_mean_squared_error_loss(
-                name="loss_layer", input_feature=output_features[0]
-            )
-        else:
-            print(
-                "Models loss: "
-                + str(model.loss)
-                + ", vs Keras loss: "
-                + str(keras.losses.mean_squared_error)
-            )
-            logging.warning(
-                "Loss " + str(model.loss) + " is not yet "
-                "supported by Core ML. The loss layer will "
-                "not be carried over. To train this model, "
-                "you will need to manually add a supported "
-                "loss layer."
-            )
-    except AttributeError:
-        logging.warning(
-            "Core ML conversion was asked to respect trainable "
-            "parameters from the Keras model, but the input "
-            "model does not include a loss layer."
-        )
-    try:
-        opt = model.optimizer
-    except AttributeError:
-        logging.warning(
-            "Core ML conversion was asked to respect trainable "
-            "parameters from the Keras model, but could not read "
-            "the optimizer from Keras."
-        )
-        return
-
-    if model.optimizer:
-        # a dict of the parameters we need.
-        cfg = model.optimizer.get_config()
-        if "decay" in cfg and cfg["decay"] != 0.0:
-            logging.warning(
-                "Keras optimizer has 'decay' set, which is "
-                "not supported in Core ML. This parameter "
-                "of the optimizer will be ignored. Clients "
-                "can change the learning rate from within an "
-                "MLUpdateTask callback to achieve the same "
-                "effect."
-            )
-        if isinstance(model.optimizer, keras.optimizers.SGD):
-            params = SgdParams(lr=cfg["lr"], momentum=cfg["momentum"])
-            if "nesterov" in cfg and cfg["nesterov"] == True:
-                logging.warning(
-                    "Keras SGD optimizer has 'nesterov' set, "
-                    "but this is not supported by Core ML. "
-                    "The parameter will be ignored."
-                )
-            # Keras does not require a user to specify batch size up front,
-            # as Core ML does.  We need to choose something, let's be a bit
-            # wide to minimize the chance of user "surprise" when running.
-            params.set_batch(16, [1, 16, 32])
-            builder.set_sgd_optimizer(params)
-        elif isinstance(model.optimizer, keras.optimizers.Adam):
-            params = AdamParams(
-                lr=cfg["lr"],
-                beta1=cfg["beta_1"],
-                beta2=cfg["beta_2"],
-                eps=cfg["epsilon"],
-            )
-            if "amsgrad" in cfg and cfg["amsgrad"] == True:
-                logging.warning(
-                    "Keras Adam optimizer has 'amsgrad' set, "
-                    "but this is not supported by Core ML. "
-                    "The parameter will be ignored."
-                )
-            # Keras does not require a user to specify batch size up front,
-            # as Core ML does.  We need to choose something, let's be a bit
-            # wide to minimize the chance of user "surprise" when running.
-            params.set_batch(16, [1, 16, 32])
-            builder.set_adam_optimizer(params)
-        else:
-            logging.warning(
-                "Optimizer " + str(model.optimizer) + " is "
-                "not yet supported by Core ML. The optimizer "
-                "will not be carried over. To train this "
-                "model, you will need to manually add a "
-                "supported optimizer."
-            )
-    else:
-        logging.warning(
-            "Core ML conversion was asked to respect "
-            "trainable parameters from the Keras model, but "
-            "the input model does not include an optimizer."
-        )
-
-
-def _convert(
-    model,
-    input_names=None,
-    output_names=None,
-    image_input_names=None,
-    input_name_shape_dict={},
-    is_bgr=False,
-    red_bias=0.0,
-    green_bias=0.0,
-    blue_bias=0.0,
-    gray_bias=0.0,
-    image_scale=1.0,
-    class_labels=None,
-    predicted_feature_name=None,
-    predicted_probabilities_output="",
-    add_custom_layers=False,
-    custom_conversion_functions=None,
-    custom_objects=None,
-    input_shapes=None,
-    output_shapes=None,
-    respect_trainable=False,
-    use_float_arraytype=False,
-):
-    # Check Keras format
-    if _keras.backend.image_data_format() == "channels_first":
-        print(
-            "Keras image data format 'channels_first' detected. Currently "
-            "only 'channels_last' is supported. "
-            "Changing to 'channels_last', but your model may not be converted "
-            "converted properly."
-        )
-        _keras.backend.set_image_data_format("channels_last")
-
-    # Check custom conversion functions / custom objects
-    add_custom_layers = custom_conversion_functions is not None
-
-    if isinstance(model, str):
-        model = _keras.models.load_model(model, custom_objects=custom_objects)
-    elif isinstance(model, tuple):
-        model = _load_keras_model(model[0], model[1])
-
-    # Check valid versions
-    _check_unsupported_layers(model, add_custom_layers)
-
-    # Build network graph to represent Keras model
-    graph = _topology2.NetGraph(model)
-    graph.build()
-
-    # The graph should be finalized before executing this
-    graph.generate_blob_names()
-    graph.add_recurrent_optionals()
-
-    inputs = graph.get_input_layers()
-    outputs = graph.get_output_layers()
-
-    # check input / output names validity
-    if input_names is not None:
-        if isinstance(input_names, str):
-            input_names = [input_names]
-    else:
-        input_names = ["input" + str(i + 1) for i in range(len(inputs))]
-
-    if output_names is not None:
-        if isinstance(output_names, str):
-            output_names = [output_names]
-    else:
-        output_names = ["output" + str(i + 1) for i in range(len(outputs))]
-
-    if image_input_names is not None and isinstance(image_input_names, str):
-        image_input_names = [image_input_names]
-
-    graph.reset_model_input_names(input_names)
-    graph.reset_model_output_names(output_names)
-
-    # Keras -> Core ML input dimension dictionary
-    # (None, None) -> [1, 1, 1, 1, 1]
-    # (None, D) -> [D] or [D, 1, 1, 1, 1]
-    # (None, Seq, D) -> [Seq, 1, D, 1, 1]
-    # (None, H, W, C) -> [C, H, W]
-    # (D) -> [D]
-    # (Seq, D) -> [Seq, 1, D, 1, 1]
-    # (Batch, Sequence, D) -> [D]
-    # (Batch, Seq, H, W, C) -> (C,H,W)
-
-    # Retrieve input shapes from model
-    if len(model._inbound_nodes) > 1 and input_shapes is not None:
-        input_dims = [filter(None, x) for x in input_shapes]
-        unfiltered_shapes = input_shapes
-    else:
-        if type(model.input_shape) is list:
-            input_dims = [filter(None, x) for x in model.input_shape]
-            unfiltered_shapes = model.input_shape
-        else:
-            input_dims = [filter(None, model.input_shape)]
-            unfiltered_shapes = [model.input_shape]
-
-    for idx, dim in enumerate(input_dims):
-        if input_names[idx] in input_name_shape_dict:
-            unfiltered_shape = input_name_shape_dict[input_names[idx]]
-            dim = list(filter(None, unfiltered_shape))
-        else:
-            unfiltered_shape = unfiltered_shapes[idx]
-            dim = list(input_dims[idx])
-
-        if len(unfiltered_shape) == 1:
-            if len(dim) == 1:
-                input_dims[idx] = dim  # dim is just a number
-            else:
-                errMsg = "Invalid input shape for '{}'.\n".format(input_names[idx])
-                errMsg += "Please provide a finite channel value (D) using input_name_shape_dict arg "
-                errMsg += "with key = '{}' and value = [D]".format(input_names[idx])
-                raise ValueError(errMsg)
-
-        elif len(unfiltered_shape) == 2:
-            if len(dim) == 2:  # [Seq, D]
-                input_dims[idx] = (dim[1],)
-            elif len(dim) == 1:
-                s = graph.get_successors(inputs[idx])[0]
-                if isinstance(
-                    graph.get_keras_layer(s), _keras.layers.embeddings.Embedding
-                ):
-                    # Embedding layer's special input (None, D) where D is
-                    # actually sequence length
-                    input_dims[idx] = (1,)
-                else:
-                    input_dims[idx] = dim  # dim is just a number
-            else:  # Used to be [None, None] before filtering; indicating unknown
-                # sequence length
-                input_dims[idx] = tuple([1])
-
-        elif len(unfiltered_shape) == 3:
-            if len(dim) == 3:  # keras provided fixed batch and sequence length,
-                # so the input was (batch, sequence, channel)
-                input_dims[idx] = (dim[2],)
-            elif len(dim) == 2:  # [None, Seq, D]
-                input_dims[idx] = (dim[1],)
-            elif len(dim) == 1:
-                input_dims[idx] = dim  # dim is just a number
-            else:
-                errMsg = "Invalid input shape for '{}'.\n".format(input_names[idx])
-                errMsg += "Please provide a finite channel value (D) using "
-                errMsg += (
-                    "input_name_shape_dict arg with key = '{}' and "
-                    "value = [None, None, D]".format(input_names[idx])
-                )
-                raise ValueError(errMsg)
-
-        elif len(unfiltered_shape) == 4:
-            if len(dim) == 3:  # keras uses the reverse notation from CoreML
-                input_dims[idx] = (dim[2], dim[0], dim[1])
-            else:
-                errMsg = "Invalid input shape for '{}'.\n".format(input_names[idx])
-                errMsg += (
-                    "Please provide a finite height (H), width (W) & "
-                    "channel value (C) "
-                )
-                errMsg += (
-                    "using input_name_shape_dict arg with key = '{}' "
-                    "and value = [None, H, W, C]\n".format(input_names[idx])
-                )
-                errMsg += (
-                    "Converted .mlmodel can be modified to have flexible "
-                    "input shape using coremltools.models.neural_network.flexible_shape_utils"
-                )
-                raise ValueError(errMsg)
-
-        elif len(unfiltered_shape) == 5:
-            if len(dim) == 4:  # keras uses the reverse notation from CoreML
-                input_dims[idx] = (dim[-1], dim[-3], dim[-2])
-            else:
-                errMsg = "Invalid input shape for '{}', shape:{}.\n".format(
-                    input_names[idx], str(unfiltered_shape)
-                )
-                raise ValueError(errMsg)
-        else:
-            raise ValueError(
-                "Input '%s' has input shape of length %d" % (input_names[idx], len(dim))
-            )
-
-    # Retrieve output shapes from model
-    if len(model._outbound_nodes) > 1 and output_shapes is not None:
-        output_dims = [filter(None, x) for x in output_shapes]
-    else:
-        if type(model.output_shape) is list:
-            output_dims = [filter(None, x) for x in model.output_shape]
-        else:
-            output_dims = [filter(None, model.output_shape[1:])]
-
-    for idx, dim in enumerate(output_dims):
-        dim = list(dim)
-        if len(dim) == 1:
-            output_dims[idx] = dim
-        elif len(dim) == 2:  # [Seq, D]
-            output_dims[idx] = (dim[1],)
-        elif len(dim) == 3:
-            output_dims[idx] = (dim[2], dim[0], dim[1])
-
-    input_types = [datatypes.Array(*dim) for dim in input_dims]
-    output_types = [datatypes.Array(*dim) for dim in output_dims]
-
-    # Some of the feature handling is sensitive about string vs. unicode
-    input_names = map(str, input_names)
-    output_names = map(str, output_names)
-    is_classifier = class_labels is not None
-    if is_classifier:
-        mode = "classifier"
-    else:
-        mode = None
-
-    # assuming these match
-    input_features = list(zip(input_names, input_types))
-    output_features = list(zip(output_names, output_types))
-
-    builder = _NeuralNetworkBuilder(
-        input_features,
-        output_features,
-        mode=mode,
-        use_float_arraytype=use_float_arraytype,
-    )
-
-    for iter, layer in enumerate(graph.layer_list):
-        keras_layer = graph.keras_layer_map[layer]
-        print("%d : %s, %s" % (iter, layer, keras_layer))
-        if isinstance(keras_layer, _keras.layers.wrappers.TimeDistributed):
-            keras_layer = keras_layer.layer
-        converter_func = _get_layer_converter_fn(keras_layer, add_custom_layers)
-        input_names, output_names = graph.get_layer_blobs(layer)
-        # this may be none if we're using custom layers
-        if converter_func:
-            converter_func(
-                builder,
-                layer,
-                input_names,
-                output_names,
-                keras_layer,
-                respect_trainable,
-            )
-        else:
-            if _is_activation_layer(keras_layer):
-                layer_name = keras_layer.activation.__name__
-            else:
-                layer_name = type(keras_layer).__name__
-            if layer_name in custom_conversion_functions:
-                custom_spec = custom_conversion_functions[layer_name](keras_layer)
-            else:
-                custom_spec = None
-
-            builder.add_custom(layer, input_names, output_names, custom_spec)
-
-    # Since we aren't mangling anything the user gave us, we only need to update
-    # the model interface here
-    builder.add_optionals(graph.optional_inputs, graph.optional_outputs)
-
-    # Add classifier classes (if applicable)
-    if is_classifier:
-        classes_in = class_labels
-        if isinstance(classes_in, str):
-            import os
-
-            if not os.path.isfile(classes_in):
-                raise ValueError(
-                    "Path to class labels (%s) does not exist." % classes_in
-                )
-            with open(classes_in, "r") as f:
-                classes = f.read()
-            classes = classes.splitlines()
-        elif type(classes_in) is list:  # list[int or str]
-            classes = classes_in
-        else:
-            raise ValueError(
-                "Class labels must be a list of integers / strings, or a file path"
-            )
-
-        if predicted_feature_name is not None:
-            builder.set_class_labels(
-                classes,
-                predicted_feature_name=predicted_feature_name,
-                prediction_blob=predicted_probabilities_output,
-            )
-        else:
-            builder.set_class_labels(classes)
-
-    # Set pre-processing parameters
-    builder.set_pre_processing_parameters(
-        image_input_names=image_input_names,
-        is_bgr=is_bgr,
-        red_bias=red_bias,
-        green_bias=green_bias,
-        blue_bias=blue_bias,
-        gray_bias=gray_bias,
-        image_scale=image_scale,
-    )
-
-    # add in the loss and optimizer, if the network has it and that is
-    # appropriate given the flag.
-    if respect_trainable:
-        _convert_training_info(model, builder, output_features)
-
-    # Return the protobuf spec
-    spec = builder.spec
-
-    # If the model has multi-arrays of type double, recommend to the user the utility function
-    # coremltools.models.utils.convert_double_to_float_multiarray_type(spec)
-    has_double_multiarray = False
-    for feature in list(spec.description.input) + list(spec.description.output):
-        if feature.type.HasField("multiArrayType"):
-            if (
-                feature.type.multiArrayType.dataType
-                == _Model_pb2.ArrayFeatureType.DOUBLE
-            ):
-                has_double_multiarray = True
-                break
-
-    if has_double_multiarray:
-        print(
-            "\n\nRecommendation: This model has at least one multiarray input/output of type double.\n"
-            "For large sized arrays, multiarrays of type float32 are more efficient.\n"
-            "In future, float input/output multiarrays will be produced by default by the converter.\n"
-            "Please use, either the flag 'use_float_arraytype' during the call to convert or\n"
-            "the utility 'coremltools.utils.convert_double_to_float_multiarray_type(spec)', post-conversion.\n\n"
-        )
-
-    return spec
diff --git a/coremltools/converters/keras/_keras_converter.py b/coremltools/converters/keras/_keras_converter.py
deleted file mode 100644
index 9ea7809ca..000000000
--- a/coremltools/converters/keras/_keras_converter.py
+++ /dev/null
@@ -1,841 +0,0 @@
-# Copyright (c) 2021, Apple Inc. All rights reserved.
-#
-# Use of this source code is governed by a BSD-3-clause license that can be
-# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-
-from ...models.neural_network import NeuralNetworkBuilder as _NeuralNetworkBuilder
-from ...proto import FeatureTypes_pb2 as _FeatureTypes_pb2
-from ...models import datatypes, _METADATA_VERSION, _METADATA_SOURCE
-from ...models import MLModel as _MLModel
-from ...models import (
-    _MLMODEL_FULL_PRECISION,
-    _MLMODEL_HALF_PRECISION,
-    _VALID_MLMODEL_PRECISION_TYPES,
-)
-from ...models._deprecation import deprecated as _deprecated
-from ...models.utils import _convert_neural_network_spec_weights_to_fp16
-
-from ..._deps import _HAS_KERAS_TF
-from ..._deps import _HAS_KERAS2_TF
-from coremltools import __version__ as ct_version
-
-if _HAS_KERAS_TF:
-    import keras as _keras
-    from . import _layers
-    from . import _topology
-
-    _KERAS_LAYER_REGISTRY = {
-        _keras.layers.core.Dense: _layers.convert_dense,
-        _keras.layers.core.Activation: _layers.convert_activation,
-        _keras.layers.advanced_activations.LeakyReLU: _layers.convert_activation,
-        _keras.layers.advanced_activations.PReLU: _layers.convert_activation,
-        _keras.layers.advanced_activations.ELU: _layers.convert_activation,
-        _keras.layers.advanced_activations.ParametricSoftplus: _layers.convert_activation,
-        _keras.layers.advanced_activations.ThresholdedReLU: _layers.convert_activation,
-        _keras.activations.softmax: _layers.convert_activation,
-        _keras.layers.convolutional.Convolution2D: _layers.convert_convolution,
-        _keras.layers.convolutional.Deconvolution2D: _layers.convert_convolution,
-        _keras.layers.convolutional.AtrousConvolution2D: _layers.convert_convolution,
-        _keras.layers.convolutional.AveragePooling2D: _layers.convert_pooling,
-        _keras.layers.convolutional.MaxPooling2D: _layers.convert_pooling,
-        _keras.layers.pooling.GlobalAveragePooling2D: _layers.convert_pooling,
-        _keras.layers.pooling.GlobalMaxPooling2D: _layers.convert_pooling,
-        _keras.layers.convolutional.ZeroPadding2D: _layers.convert_padding,
-        _keras.layers.convolutional.Cropping2D: _layers.convert_cropping,
-        _keras.layers.convolutional.UpSampling2D: _layers.convert_upsample,
-        _keras.layers.convolutional.Convolution1D: _layers.convert_convolution1d,
-        _keras.layers.convolutional.AtrousConvolution1D: _layers.convert_convolution1d,
-        _keras.layers.convolutional.AveragePooling1D: _layers.convert_pooling,
-        _keras.layers.convolutional.MaxPooling1D: _layers.convert_pooling,
-        _keras.layers.pooling.GlobalAveragePooling1D: _layers.convert_pooling,
-        _keras.layers.pooling.GlobalMaxPooling1D: _layers.convert_pooling,
-        _keras.layers.convolutional.ZeroPadding1D: _layers.convert_padding,
-        _keras.layers.convolutional.Cropping1D: _layers.convert_cropping,
-        _keras.layers.convolutional.UpSampling1D: _layers.convert_upsample,
-        _keras.layers.recurrent.LSTM: _layers.convert_lstm,
-        _keras.layers.recurrent.SimpleRNN: _layers.convert_simple_rnn,
-        _keras.layers.recurrent.GRU: _layers.convert_gru,
-        _keras.layers.wrappers.Bidirectional: _layers.convert_bidirectional,
-        _keras.layers.normalization.BatchNormalization: _layers.convert_batchnorm,
-        _keras.engine.topology.Merge: _layers.convert_merge,
-        _keras.layers.core.Flatten: _layers.convert_flatten,
-        _keras.layers.core.Permute: _layers.convert_permute,
-        _keras.layers.core.Reshape: _layers.convert_reshape,
-        _keras.layers.embeddings.Embedding: _layers.convert_embedding,
-        _keras.layers.core.RepeatVector: _layers.convert_repeat_vector,
-        ## All the layers that can be skipped (merged with conv)
-        _keras.engine.topology.InputLayer: _layers.default_skip,
-        _keras.layers.core.Dropout: _layers.default_skip,
-        _keras.layers.wrappers.TimeDistributed: _layers.default_skip,
-    }
-
-    _KERAS_SKIP_LAYERS = [
-        _keras.layers.core.Dropout,
-    ]
-
-
-def _check_unsupported_layers(model):
-    for i, layer in enumerate(model.layers):
-        if isinstance(layer, _keras.models.Sequential) or isinstance(
-            layer, _keras.models.Model
-        ):
-            _check_unsupported_layers(layer)
-        else:
-            if type(layer) not in _KERAS_LAYER_REGISTRY:
-                raise ValueError("Keras layer '%s' not supported. " % str(type(layer)))
-            if isinstance(layer, _keras.engine.topology.Merge):
-                if layer.layers is None:
-                    continue
-                for merge_layer in layer.layers:
-                    if isinstance(merge_layer, _keras.models.Sequential) or isinstance(
-                        merge_layer, _keras.models.Model
-                    ):
-                        _check_unsupported_layers(merge_layer)
-            if isinstance(layer, _keras.layers.wrappers.TimeDistributed):
-                if type(layer.layer) not in _KERAS_LAYER_REGISTRY:
-                    raise ValueError(
-                        "Keras layer '%s' not supported. " % str(type(layer.layer))
-                    )
-            if isinstance(layer, _keras.layers.wrappers.Bidirectional):
-                if not isinstance(layer.layer, _keras.layers.recurrent.LSTM):
-                    raise ValueError(
-                        "Keras bi-directional wrapper conversion supports only "
-                        "LSTM layer at this time. "
-                    )
-
-
-def _get_layer_converter_fn(layer):
-    """Get the right converter function for Keras
-    """
-    layer_type = type(layer)
-    if layer_type in _KERAS_LAYER_REGISTRY:
-        return _KERAS_LAYER_REGISTRY[layer_type]
-    else:
-        raise TypeError("Keras layer of type %s is not supported." % type(layer))
-
-
-def _load_keras_model(model_network_path, model_weight_path, custom_objects=None):
-    """Load a keras model from disk
-
-    Parameters
-    ----------
-    model_network_path: str
-        Path where the model network path is (json file)
-
-    model_weight_path: str
-        Path where the model network weights are (hd5 file)
-
-    custom_objects:
-        A dictionary of layers or other custom classes
-        or functions used by the model
-
-    Returns
-    -------
-    model: A keras model
-    """
-    from keras.models import model_from_json
-    import json
-
-    # Load the model network
-    json_file = open(model_network_path, "r")
-    json_string = json_file.read()
-    json_file.close()
-    loaded_model_json = json.loads(json_string)
-
-    if not custom_objects:
-        custom_objects = {}
-
-    # Load the model weights
-    loaded_model = model_from_json(loaded_model_json, custom_objects=custom_objects)
-    loaded_model.load_weights(model_weight_path)
-
-    return loaded_model
-
-
-def _convert(
-    model,
-    input_names=None,
-    output_names=None,
-    image_input_names=None,
-    is_bgr=False,
-    red_bias=0.0,
-    green_bias=0.0,
-    blue_bias=0.0,
-    gray_bias=0.0,
-    image_scale=1.0,
-    class_labels=None,
-    predicted_feature_name=None,
-    predicted_probabilities_output="",
-    custom_objects=None,
-    respect_trainable=False,
-):
-    if not (_HAS_KERAS_TF):
-        raise RuntimeError(
-            "keras not found or unsupported version or backend "
-            "found. keras conversion API is disabled."
-        )
-    if isinstance(model, str):
-        model = _keras.models.load_model(model, custom_objects=custom_objects)
-    elif isinstance(model, tuple):
-        model = _load_keras_model(model[0], model[1], custom_objects=custom_objects)
-
-    # Check valid versions
-    _check_unsupported_layers(model)
-
-    # Build network graph to represent Keras model
-    graph = _topology.NetGraph(model)
-    graph.build()
-    graph.remove_skip_layers(_KERAS_SKIP_LAYERS)
-    graph.insert_1d_permute_layers()
-    graph.insert_permute_for_spatial_bn()
-    graph.defuse_activation()
-    graph.remove_internal_input_layers()
-    graph.make_output_layers()
-
-    # The graph should be finalized before executing this
-    graph.generate_blob_names()
-    graph.add_recurrent_optionals()
-
-    inputs = graph.get_input_layers()
-    outputs = graph.get_output_layers()
-
-    # check input / output names validity
-    if input_names is not None:
-        if isinstance(input_names, str):
-            input_names = [input_names]
-    else:
-        input_names = ["input" + str(i + 1) for i in range(len(inputs))]
-    if output_names is not None:
-        if isinstance(output_names, str):
-            output_names = [output_names]
-    else:
-        output_names = ["output" + str(i + 1) for i in range(len(outputs))]
-
-    if image_input_names is not None and isinstance(image_input_names, str):
-        image_input_names = [image_input_names]
-
-    graph.reset_model_input_names(input_names)
-    graph.reset_model_output_names(output_names)
-
-    # Keras -> Core ML input dimension dictionary
-    # (None, None) -> [1, 1, 1, 1, 1]
-    # (None, D) -> [D] or [D, 1, 1, 1, 1]
-    # (None, Seq, D) -> [Seq, 1, D, 1, 1]
-    # (None, H, W, C) -> [C, H, W]
-    # (D) -> [D]
-    # (Seq, D) -> [Seq, 1, 1, D, 1]
-    # (Batch, Sequence, D) -> [D]
-
-    # Retrieve input shapes from model
-    if type(model.input_shape) is list:
-        input_dims = [list(filter(None, x)) for x in model.input_shape]
-        unfiltered_shapes = model.input_shape
-    else:
-        input_dims = [list(filter(None, model.input_shape))]
-        unfiltered_shapes = [model.input_shape]
-
-    for idx, dim in enumerate(input_dims):
-        unfiltered_shape = unfiltered_shapes[idx]
-        if len(dim) == 0:
-            # Used to be [None, None] before filtering; indicating unknown
-            # sequence length
-            input_dims[idx] = tuple([1])
-        elif len(dim) == 1:
-            s = graph.get_successors(inputs[idx])[0]
-            if isinstance(graph.get_keras_layer(s), _keras.layers.embeddings.Embedding):
-                # Embedding layer's special input (None, D) where D is actually
-                # sequence length
-                input_dims[idx] = (1,)
-            else:
-                input_dims[idx] = dim  # dim is just a number
-        elif len(dim) == 2:  # [Seq, D]
-            input_dims[idx] = (dim[1],)
-        elif len(dim) == 3:  # H,W,C
-            if len(unfiltered_shape) > 3:
-                # keras uses the reverse notation from us
-                input_dims[idx] = (dim[2], dim[0], dim[1])
-            else:  # keras provided fixed batch and sequence length, so the input
-                # was (batch, sequence, channel)
-                input_dims[idx] = (dim[2],)
-        else:
-            raise ValueError(
-                "Input" + input_names[idx] + "has input shape of length" + str(len(dim))
-            )
-
-    # Retrieve output shapes from model
-    if type(model.output_shape) is list:
-        output_dims = [list(filter(None, x)) for x in model.output_shape]
-    else:
-        output_dims = [list(filter(None, model.output_shape[1:]))]
-
-    for idx, dim in enumerate(output_dims):
-        if len(dim) == 1:
-            output_dims[idx] = dim
-        elif len(dim) == 2:  # [Seq, D]
-            output_dims[idx] = (dim[1],)
-        elif len(dim) == 3:
-            output_dims[idx] = (dim[2], dim[1], dim[0])
-
-    input_types = [datatypes.Array(*dim) for dim in input_dims]
-    output_types = [datatypes.Array(*dim) for dim in output_dims]
-
-    # Some of the feature handling is sensitive about string vs. unicode
-    input_names = map(str, input_names)
-    output_names = map(str, output_names)
-    is_classifier = class_labels is not None
-    if is_classifier:
-        mode = "classifier"
-    else:
-        mode = None
-
-    # assuming these match
-    input_features = list(zip(input_names, input_types))
-    output_features = list(zip(output_names, output_types))
-
-    builder = _NeuralNetworkBuilder(input_features, output_features, mode=mode)
-
-    for iter, layer in enumerate(graph.layer_list):
-        keras_layer = graph.keras_layer_map[layer]
-        print("%d : %s, %s" % (iter, layer, keras_layer))
-        if isinstance(keras_layer, _keras.layers.wrappers.TimeDistributed):
-            keras_layer = keras_layer.layer
-        converter_func = _get_layer_converter_fn(keras_layer)
-        input_names, output_names = graph.get_layer_blobs(layer)
-        converter_func(builder, layer, input_names, output_names, keras_layer)
-
-    # Set the right inputs and outputs on the model description (interface)
-    builder.set_input(input_names, input_dims)
-    builder.set_output(output_names, output_dims)
-
-    # Since we aren't mangling anything the user gave us, we only need to update
-    # the model interface here
-    builder.add_optionals(graph.optional_inputs, graph.optional_outputs)
-
-    # Add classifier classes (if applicable)
-    if is_classifier:
-        classes_in = class_labels
-        if isinstance(classes_in, str):
-            import os
-
-            if not os.path.isfile(classes_in):
-                raise ValueError(
-                    "Path to class labels (%s) does not exist." % classes_in
-                )
-            with open(classes_in, "r") as f:
-                classes = f.read()
-            classes = classes.splitlines()
-        elif type(classes_in) is list:  # list[int or str]
-            classes = classes_in
-        else:
-            raise ValueError(
-                "Class labels must be a list of integers / strings, or a file path"
-            )
-
-        if predicted_feature_name is not None:
-            builder.set_class_labels(
-                classes,
-                predicted_feature_name=predicted_feature_name,
-                prediction_blob=predicted_probabilities_output,
-            )
-        else:
-            builder.set_class_labels(classes)
-
-    # Set pre-processing paramsters
-    builder.set_pre_processing_parameters(
-        image_input_names=image_input_names,
-        is_bgr=is_bgr,
-        red_bias=red_bias,
-        green_bias=green_bias,
-        blue_bias=blue_bias,
-        gray_bias=gray_bias,
-        image_scale=image_scale,
-    )
-
-    # Return the protobuf spec
-    spec = builder.spec
-    return spec
-
-
-def _convert_to_spec(
-    model,
-    input_names=None,
-    output_names=None,
-    image_input_names=None,
-    input_name_shape_dict={},
-    is_bgr=False,
-    red_bias=0.0,
-    green_bias=0.0,
-    blue_bias=0.0,
-    gray_bias=0.0,
-    image_scale=1.0,
-    class_labels=None,
-    predicted_feature_name=None,
-    model_precision=_MLMODEL_FULL_PRECISION,
-    predicted_probabilities_output="",
-    add_custom_layers=False,
-    custom_conversion_functions=None,
-    custom_objects=None,
-    input_shapes=None,
-    output_shapes=None,
-    respect_trainable=False,
-    use_float_arraytype=False,
-):
-    """
-    Convert a Keras model to Core ML protobuf specification (.mlmodel).
-
-    Parameters
-    ----------
-    model: Keras model object | str | (str, str)
-        A trained Keras neural network model which can be one of the following:
-
-        - a Keras model object
-        - a string with the path to a Keras model file (h5)
-        - a tuple of strings, where the first is the path to a Keras model
-
-          architecture (.json file), the second is the path to its weights
-          stored in h5 file.
-
-    input_names: [str] | str
-        Optional name(s) that can be given to the inputs of the Keras model.
-        These names will be used in the interface of the Core ML models to refer
-        to the inputs of the Keras model. If not provided, the Keras inputs
-        are named to [input1, input2, ..., inputN] in the Core ML model.  When
-        multiple inputs are present, the input feature names are in the same
-        order as the Keras inputs.
-
-    output_names: [str] | str
-        Optional name(s) that can be given to the outputs of the Keras model.
-        These names will be used in the interface of the Core ML models to refer
-        to the outputs of the Keras model. If not provided, the Keras outputs
-        are named to [output1, output2, ..., outputN] in the Core ML model.
-        When multiple outputs are present, output feature names are in the same
-        order as the Keras inputs.
-
-    image_input_names: [str] | str
-        Input names to the Keras model (a subset of the input_names
-        parameter) that can be treated as images by Core ML. All other inputs
-        are treated as MultiArrays (N-D Arrays).
-
-    input_name_shape_dict: {str: [int]}
-        Optional Dictionary of input tensor names and their corresponding shapes expressed
-        as a list of ints
-
-    is_bgr: bool | dict()
-        Flag indicating the channel order the model internally uses to represent
-        color images. Set to True if the internal channel order is BGR,
-        otherwise it will be assumed RGB. This flag is applicable only if
-        image_input_names is specified. To specify a different value for each
-        image input, provide a dictionary with input names as keys.
-        Note that this flag is about the models internal channel order.
-        An input image can be passed to the model in any color pixel layout
-        containing red, green and blue values (e.g. 32BGRA or 32ARGB). This flag
-        determines how those pixel values get mapped to the internal multiarray
-        representation.
-
-    red_bias: float | dict()
-        Bias value to be added to the red channel of the input image.
-        Defaults to 0.0
-        Applicable only if image_input_names is specified.
-        To specify different values for each image input provide a dictionary with input names as keys.
-
-    blue_bias: float | dict()
-        Bias value to be added to the blue channel of the input image.
-        Defaults to 0.0
-        Applicable only if image_input_names is specified.
-        To specify different values for each image input provide a dictionary with input names as keys.
-
-    green_bias: float | dict()
-        Bias value to be added to the green channel of the input image.
-        Defaults to 0.0
-        Applicable only if image_input_names is specified.
-        To specify different values for each image input provide a dictionary with input names as keys.
-
-    gray_bias: float | dict()
-        Bias value to be added to the input image (in grayscale). Defaults
-        to 0.0
-        Applicable only if image_input_names is specified.
-        To specify different values for each image input provide a dictionary with input names as keys.
-
-    image_scale: float | dict()
-        Value by which input images will be scaled before bias is added and
-        Core ML model makes a prediction. Defaults to 1.0.
-        Applicable only if image_input_names is specified.
-        To specify different values for each image input provide a dictionary with input names as keys.
-
-    class_labels: list[int or str] | str
-        Class labels (applies to classifiers only) that map the index of the
-        output of a neural network to labels in a classifier.
-
-        If the provided class_labels is a string, it is assumed to be a
-        filepath where classes are parsed as a list of newline separated
-        strings.
-
-    predicted_feature_name: str
-        Name of the output feature for the class labels exposed in the Core ML
-        model (applies to classifiers only). Defaults to 'classLabel'
-
-    model_precision: str
-        Precision at which model will be saved. Currently full precision (float) and half precision
-        (float16) models are supported. Defaults to '_MLMODEL_FULL_PRECISION' (full precision).
-
-    predicted_probabilities_output: str
-        Name of the neural network output to be interpreted as the predicted
-        probabilities of the resulting classes. Typically the output of a
-        softmax function. Defaults to the first output blob.
-
-    add_custom_layers: bool
-        If True, then unknown Keras layer types will be added to the model as
-        'custom' layers, which must then be filled in as postprocessing.
-
-    custom_conversion_functions: {'str': (Layer -> CustomLayerParams)}
-        A dictionary with keys corresponding to names of custom layers and values
-        as functions taking a Keras custom layer and returning a parameter dictionary
-        and list of weights.
-
-    custom_objects: {'str': (function)}
-        Dictionary that includes a key, value pair of {'<function name>': <function>}
-        for custom objects such as custom loss in the Keras model.
-        Provide a string of the name of the custom function as a key.
-        Provide a function as a value.
-
-    respect_trainable: bool
-        If True, then Keras layers that are marked 'trainable' will
-        automatically be marked updatable in the Core ML model.
-
-    use_float_arraytype: bool
-        If true, the datatype of input/output multiarrays is set to Float32 instead
-        of double.
-
-    Returns
-    -------
-    model: MLModel
-        Model in Core ML format.
-
-    Examples
-    --------
-    .. sourcecode:: python
-
-        # Make a Keras model
-        >>> model = Sequential()
-        >>> model.add(Dense(num_channels, input_dim = input_dim))
-
-        # Convert it with default input and output names
-        >>> import coremltools
-        >>> coreml_model = coremltools.converters.keras.convert(model)
-
-        # Saving the Core ML model to a file.
-        >>> coreml_model.save('my_model.mlmodel')
-
-    Converting a model with a single image input.
-
-    .. sourcecode:: python
-
-        >>> coreml_model = coremltools.converters.keras.convert(model, input_names =
-        ... 'image', image_input_names = 'image')
-
-    Core ML also lets you add class labels to models to expose them as
-    classifiers.
-
-    .. sourcecode:: python
-
-        >>> coreml_model = coremltools.converters.keras.convert(model, input_names = 'image',
-        ... image_input_names = 'image', class_labels = ['cat', 'dog', 'rat'])
-
-    Class labels for classifiers can also come from a file on disk.
-
-    .. sourcecode:: python
-
-        >>> coreml_model = coremltools.converters.keras.convert(model, input_names =
-        ... 'image', image_input_names = 'image', class_labels = 'labels.txt')
-
-    Provide customized input and output names to the Keras inputs and outputs
-    while exposing them to Core ML.
-
-    .. sourcecode:: python
-
-        >>> coreml_model = coremltools.converters.keras.convert(model, input_names =
-        ...   ['my_input_1', 'my_input_2'], output_names = ['my_output'])
-
-    """
-    if model_precision not in _VALID_MLMODEL_PRECISION_TYPES:
-        raise RuntimeError("Model precision {} is not valid".format(model_precision))
-
-    if _HAS_KERAS_TF:
-        spec = _convert(
-            model=model,
-            input_names=input_names,
-            output_names=output_names,
-            image_input_names=image_input_names,
-            is_bgr=is_bgr,
-            red_bias=red_bias,
-            green_bias=green_bias,
-            blue_bias=blue_bias,
-            gray_bias=gray_bias,
-            image_scale=image_scale,
-            class_labels=class_labels,
-            predicted_feature_name=predicted_feature_name,
-            predicted_probabilities_output=predicted_probabilities_output,
-            custom_objects=custom_objects,
-            respect_trainable=respect_trainable,
-        )
-    elif _HAS_KERAS2_TF:
-        from . import _keras2_converter
-
-        spec = _keras2_converter._convert(
-            model=model,
-            input_names=input_names,
-            output_names=output_names,
-            image_input_names=image_input_names,
-            input_name_shape_dict=input_name_shape_dict,
-            is_bgr=is_bgr,
-            red_bias=red_bias,
-            green_bias=green_bias,
-            blue_bias=blue_bias,
-            gray_bias=gray_bias,
-            image_scale=image_scale,
-            class_labels=class_labels,
-            predicted_feature_name=predicted_feature_name,
-            predicted_probabilities_output=predicted_probabilities_output,
-            add_custom_layers=add_custom_layers,
-            custom_conversion_functions=custom_conversion_functions,
-            custom_objects=custom_objects,
-            input_shapes=input_shapes,
-            output_shapes=output_shapes,
-            respect_trainable=respect_trainable,
-            use_float_arraytype=use_float_arraytype,
-        )
-    else:
-        raise RuntimeError(
-            "Keras not found or unsupported version or backend found. keras conversion API is disabled."
-        )
-
-    if model_precision == _MLMODEL_HALF_PRECISION and model is not None:
-        spec = _convert_neural_network_spec_weights_to_fp16(spec)
-
-    return spec
-
-
-@_deprecated()
-def convert(
-    model,
-    input_names=None,
-    output_names=None,
-    image_input_names=None,
-    input_name_shape_dict={},
-    is_bgr=False,
-    red_bias=0.0,
-    green_bias=0.0,
-    blue_bias=0.0,
-    gray_bias=0.0,
-    image_scale=1.0,
-    class_labels=None,
-    predicted_feature_name=None,
-    model_precision=_MLMODEL_FULL_PRECISION,
-    predicted_probabilities_output="",
-    add_custom_layers=False,
-    custom_conversion_functions=None,
-    input_shapes=None,
-    output_shapes=None,
-    respect_trainable=False,
-    use_float_arraytype=False,
-):
-    """
-    WARNING: This function is deprecated. It will be removed in the 6.0.
-
-    Convert a Keras model to Core ML protobuf specification (.mlmodel).
-
-    Parameters
-    ----------
-    model: Keras model object | str | (str, str)
-
-        A trained Keras neural network model which can be one of the following:
-
-        - a Keras model object
-        - a string with the path to a Keras model file (h5)
-        - a tuple of strings, where the first is the path to a Keras model
-    architecture (.json file), the second is the path to its weights stored in h5 file.
-
-    input_names: [str] | str
-        Optional name(s) that can be given to the inputs of the Keras model.
-        These names will be used in the interface of the Core ML models to refer
-        to the inputs of the Keras model. If not provided, the Keras inputs
-        are named to [input1, input2, ..., inputN] in the Core ML model.  When
-        multiple inputs are present, the input feature names are in the same
-        order as the Keras inputs.
-
-    output_names: [str] | str
-        Optional name(s) that can be given to the outputs of the Keras model.
-        These names will be used in the interface of the Core ML models to refer
-        to the outputs of the Keras model. If not provided, the Keras outputs
-        are named to [output1, output2, ..., outputN] in the Core ML model.
-        When multiple outputs are present, output feature names are in the same
-        order as the Keras inputs.
-
-    image_input_names: [str] | str
-        Input names to the Keras model (a subset of the input_names
-        parameter) that can be treated as images by Core ML. All other inputs
-        are treated as MultiArrays (N-D Arrays).
-
-    is_bgr: bool | dict()
-        Flag indicating the channel order the model internally uses to represent
-        color images. Set to True if the internal channel order is BGR,
-        otherwise it will be assumed RGB. This flag is applicable only if
-        image_input_names is specified. To specify a different value for each
-        image input, provide a dictionary with input names as keys.
-        Note that this flag is about the models internal channel order.
-        An input image can be passed to the model in any color pixel layout
-        containing red, green and blue values (e.g. 32BGRA or 32ARGB). This flag
-        determines how those pixel values get mapped to the internal multiarray
-        representation.
-
-    red_bias: float | dict()
-        Bias value to be added to the red channel of the input image.
-        Defaults to 0.0
-        Applicable only if image_input_names is specified.
-        To specify different values for each image input provide a dictionary with input names as keys.
-
-    blue_bias: float | dict()
-        Bias value to be added to the blue channel of the input image.
-        Defaults to 0.0
-        Applicable only if image_input_names is specified.
-        To specify different values for each image input provide a dictionary with input names as keys.
-
-    green_bias: float | dict()
-        Bias value to be added to the green channel of the input image.
-        Defaults to 0.0
-        Applicable only if image_input_names is specified.
-        To specify different values for each image input provide a dictionary with input names as keys.
-
-    gray_bias: float | dict()
-        Bias value to be added to the input image (in grayscale). Defaults
-        to 0.0
-        Applicable only if image_input_names is specified.
-        To specify different values for each image input provide a dictionary with input names as keys.
-
-    image_scale: float | dict()
-        Value by which input images will be scaled before bias is added and
-        Core ML model makes a prediction. Defaults to 1.0.
-        Applicable only if image_input_names is specified.
-        To specify different values for each image input provide a dictionary with input names as keys.
-
-    class_labels: list[int or str] | str
-        Class labels (applies to classifiers only) that map the index of the
-        output of a neural network to labels in a classifier.
-
-        If the provided class_labels is a string, it is assumed to be a
-        filepath where classes are parsed as a list of newline separated
-        strings.
-
-    predicted_feature_name: str
-        Name of the output feature for the class labels exposed in the Core ML
-        model (applies to classifiers only). Defaults to 'classLabel'
-
-    model_precision: str
-        Precision at which model will be saved. Currently full precision (float) and half precision
-        (float16) models are supported. Defaults to '_MLMODEL_FULL_PRECISION' (full precision).
-
-    predicted_probabilities_output: str
-        Name of the neural network output to be interpreted as the predicted
-        probabilities of the resulting classes. Typically the output of a
-        softmax function. Defaults to the first output blob.
-
-    add_custom_layers: bool
-        If yes, then unknown Keras layer types will be added to the model as
-        'custom' layers, which must then be filled in as postprocessing.
-
-    custom_conversion_functions: {str:(Layer -> (dict, [weights])) }
-        A dictionary with keys corresponding to names of custom layers and values
-        as functions taking a Keras custom layer and returning a parameter dictionary
-        and list of weights.
-
-    respect_trainable: bool
-        If yes, then Keras layers marked 'trainable' will automatically be
-        marked updatable in the Core ML model.
-
-    use_float_arraytype: bool
-        If true, the datatype of input/output multiarrays is set to Float32 instead
-        of double.
-
-    Returns
-    -------
-    model: MLModel
-    Model in Core ML format.
-
-    Examples
-    --------
-    .. sourcecode:: python
-
-        # Make a Keras model
-        >>> model = Sequential()
-        >>> model.add(Dense(num_channels, input_dim = input_dim))
-
-        # Convert it with default input and output names
-        >>> import coremltools
-        >>> coreml_model = coremltools.converters.keras.convert(model)
-
-        # Saving the Core ML model to a file.
-        >>> coreml_model.save('my_model.mlmodel')
-
-    Converting a model with a single image input.
-
-    .. sourcecode:: python
-
-        >>> coreml_model = coremltools.converters.keras.convert(model, input_names =
-        ... 'image', image_input_names = 'image')
-
-    Core ML also lets you add class labels to models to expose them as
-    classifiers.
-
-    .. sourcecode:: python
-
-        >>> coreml_model = coremltools.converters.keras.convert(model, input_names = 'image',
-        ... image_input_names = 'image', class_labels = ['cat', 'dog', 'rat'])
-
-    Class labels for classifiers can also come from a file on disk.
-
-    .. sourcecode:: python
-
-        >>> coreml_model = coremltools.converters.keras.convert(model, input_names =
-        ... 'image', image_input_names = 'image', class_labels = 'labels.txt')
-
-    Provide customized input and output names to the Keras inputs and outputs
-    while exposing them to Core ML.
-
-    .. sourcecode:: python
-
-        >>> coreml_model = coremltools.converters.keras.convert(model, input_names =
-        ...   ['my_input_1', 'my_input_2'], output_names = ['my_output'])
-
-    """
-    spec = _convert_to_spec(
-        model,
-        input_names=input_names,
-        output_names=output_names,
-        image_input_names=image_input_names,
-        input_name_shape_dict=input_name_shape_dict,
-        is_bgr=is_bgr,
-        red_bias=red_bias,
-        green_bias=green_bias,
-        blue_bias=blue_bias,
-        gray_bias=gray_bias,
-        image_scale=image_scale,
-        class_labels=class_labels,
-        predicted_feature_name=predicted_feature_name,
-        model_precision=model_precision,
-        predicted_probabilities_output=predicted_probabilities_output,
-        add_custom_layers=add_custom_layers,
-        custom_conversion_functions=custom_conversion_functions,
-        input_shapes=input_shapes,
-        output_shapes=output_shapes,
-        respect_trainable=respect_trainable,
-        use_float_arraytype=use_float_arraytype,
-    )
-
-    model = _MLModel(spec)
-
-    from keras import __version__ as keras_version
-
-    model.user_defined_metadata[_METADATA_VERSION] = ct_version
-    model.user_defined_metadata[_METADATA_SOURCE] = "keras=={0}".format(keras_version)
-
-    return model
diff --git a/coremltools/converters/keras/_layers.py b/coremltools/converters/keras/_layers.py
deleted file mode 100644
index 139d2d148..000000000
--- a/coremltools/converters/keras/_layers.py
+++ /dev/null
@@ -1,1099 +0,0 @@
-# Copyright (c) 2017, Apple Inc. All rights reserved.
-#
-# Use of this source code is governed by a BSD-3-clause license that can be
-# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-
-from . import _utils
-import keras
-import numpy as np
-
-
-def _get_recurrent_activation_name_from_keras(activation):
-    if activation == keras.activations.sigmoid:
-        activation_str = "SIGMOID"
-    elif activation == keras.activations.hard_sigmoid:
-        activation_str = "SIGMOID_HARD"
-    elif activation == keras.activations.tanh:
-        activation_str = "TANH"
-    elif activation == keras.activations.relu:
-        activation_str = "RELU"
-    elif activation == keras.activations.linear:
-        activation_str = "LINEAR"
-    else:
-        raise NotImplementedError(
-            "activation %s not supported for Recurrent layer." % activation
-        )
-
-    return activation_str
-
-
-def _get_activation_name_from_keras_layer(keras_layer):
-    if isinstance(keras_layer, keras.layers.advanced_activations.LeakyReLU):
-        non_linearity = "LEAKYRELU"
-    elif isinstance(keras_layer, keras.layers.advanced_activations.PReLU):
-        non_linearity = "PRELU"
-    elif isinstance(keras_layer, keras.layers.advanced_activations.ELU):
-        non_linearity = "ELU"
-    elif isinstance(keras_layer, keras.layers.advanced_activations.ParametricSoftplus):
-        non_linearity = "PARAMETRICSOFTPLUS"
-    elif isinstance(keras_layer, keras.layers.advanced_activations.ThresholdedReLU):
-        non_linearity = "THRESHOLDEDRELU"
-    else:
-        act_name = keras_layer.activation.__name__
-
-        if act_name == "softmax":
-            non_linearity = "SOFTMAX"
-        elif act_name == "sigmoid":
-            non_linearity = "SIGMOID"
-        elif act_name == "tanh":
-            non_linearity = "TANH"
-        elif act_name == "relu":
-            non_linearity = "RELU"
-        elif act_name == "softplus":
-            non_linearity = "SOFTPLUS"
-        elif act_name == "softsign":
-            non_linearity = "SOFTSIGN"
-        elif act_name == "hard_sigmoid":
-            non_linearity = "SIGMOID_HARD"
-        elif act_name == "linear":
-            non_linearity = "LINEAR"
-        else:
-            _utils.raise_error_unsupported_categorical_option(
-                "activation", act_name, "Dense", keras_layer.name  ##
-            )
-
-    return non_linearity
-
-
-def _get_elementwise_name_from_keras_layer(keras_layer):
-    """
-    Get the keras layer name from the activation name.
-    """
-    mode = keras_layer.mode
-    if mode == "sum":
-        return "ADD"
-    elif mode == "mul":
-        return "MULTIPLY"
-    elif mode == "concat":
-        if len(keras_layer.input_shape[0]) == 3 and (
-            keras_layer.concat_axis == 1 or keras_layer.concat_axis == -2
-        ):
-            return "SEQUENCE_CONCAT"
-        elif len(keras_layer.input_shape[0]) == 4 and (
-            keras_layer.concat_axis == 3 or keras_layer.concat_axis == -1
-        ):
-            return "CONCAT"
-        elif len(keras_layer.input_shape[0]) == 2 and (
-            keras_layer.concat_axis == 1 or keras_layer.concat_axis == -1
-        ):
-            return "CONCAT"
-        else:
-            option = "input_shape = %s concat_axis = %s" % (
-                str(keras_layer.input_shape[0]),
-                str(keras_layer.concat_axis),
-            )
-            _utils.raise_error_unsupported_option(option, mode, keras_layer.name)
-    elif mode == "cos":
-        if len(keras_layer.input_shape[0]) == 2:
-            return "COS"
-        else:
-            option = "input_shape = %s" % (str(keras_layer.input_shape[0]))
-            _utils.raise_error_unsupported_option(option, mode, keras_layer.name)
-    elif mode == "dot":
-        if len(keras_layer.input_shape[0]) == 2:
-            return "DOT"
-        else:
-            option = "input_shape = %s" % (str(keras_layer.input_shape[0]))
-            _utils.raise_error_unsupported_option(option, mode, keras_layer.name)
-    elif mode == "max":
-        return "MAX"
-    elif mode == "ave":
-        return "AVE"
-    else:
-        _utils.raise_error_unsupported_categorical_option(
-            "mode", mode, "Merge", keras_layer.name
-        )
-
-
-def _same_elements_per_channel(x):
-    """
-    Test if a 3D (H,W,C) matrix x has the same element in each (H,W) matrix for each channel
-    """
-    eps = 1e-5
-    dims = x.shape
-    for c in range(dims[-1]):
-        xc = x[:, :, c].flatten()
-        if not np.all(np.absolute(xc - xc[0]) < eps):
-            return False
-    return True
-
-
-def convert_dense(builder, layer, input_names, output_names, keras_layer):
-    """Convert a dense layer from keras to coreml.
-
-    Parameters
-    keras_layer: layer
-    ----------
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    """
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    has_bias = keras_layer.bias
-    # Get the weights from keras
-    W = keras_layer.get_weights()[0].T
-    Wb = keras_layer.get_weights()[1].T if has_bias else None
-
-    builder.add_inner_product(
-        name=layer,
-        W=W,
-        b=Wb,
-        input_channels=keras_layer.input_dim,
-        output_channels=keras_layer.output_dim,
-        has_bias=has_bias,
-        input_name=input_name,
-        output_name=output_name,
-    )
-
-
-def convert_activation(builder, layer, input_names, output_names, keras_layer):
-    """Convert an activation layer from keras to coreml.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    """
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-    non_linearity = _get_activation_name_from_keras_layer(keras_layer)
-
-    # Add a non-linearity layer
-    if non_linearity == "SOFTMAX":
-        builder.add_softmax(name=layer, input_name=input_name, output_name=output_name)
-        return
-
-    params = None
-    if non_linearity == "LEAKYRELU":
-        params = [keras_layer.alpha]
-
-    elif non_linearity == "PRELU":
-        # In Keras 1.2  PReLU layer's weights are stored as a
-        # backend tensor, not a numpy array as it claims in documentation.
-        shared_axes = list(keras_layer.shared_axes)
-        if not (shared_axes == [1, 2, 3] or shared_axes == [1, 2]):
-            _utils.raise_error_unsupported_scenario(
-                "Shared axis not being [1,2,3] " "or [1,2]", "parametric_relu", layer
-            )
-        params = keras.backend.eval(keras_layer.weights[0])
-    elif non_linearity == "ELU":
-        params = keras_layer.alpha
-
-    elif non_linearity == "PARAMETRICSOFTPLUS":
-        # In Keras 1.2  Parametric Softplus layer's weights are stored as a
-        # backend tensor, not a numpy array as it claims in documentation.
-        alphas = keras.backend.eval(keras_layer.weights[0])
-        betas = keras.backend.eval(keras_layer.weights[1])
-
-        if len(alphas.shape) == 3:  # (H,W,C)
-            if not (
-                _same_elements_per_channel(alphas) and _same_elements_per_channel(betas)
-            ):
-                _utils.raise_error_unsupported_scenario(
-                    "Different parameter values", "parametric_softplus", layer
-                )
-            alphas = alphas[0, 0, :]
-            betas = betas[0, 0, :]
-        params = [alphas, betas]
-
-    elif non_linearity == "THRESHOLDEDRELU":
-        params = keras_layer.theta
-    else:
-        pass  # do nothing to parameters
-    builder.add_activation(
-        name=layer,
-        non_linearity=non_linearity,
-        input_name=input_name,
-        output_name=output_name,
-        params=params,
-    )
-
-
-def convert_merge(builder, layer, input_names, output_names, keras_layer):
-    """Convert concat layer from keras to coreml.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    """
-    # Get input and output names
-    output_name = output_names[0]
-
-    mode = _get_elementwise_name_from_keras_layer(keras_layer)
-    builder.add_elementwise(
-        name=layer, input_names=input_names, output_name=output_name, mode=mode
-    )
-
-
-def convert_pooling(builder, layer, input_names, output_names, keras_layer):
-    """Convert pooling layer from keras to coreml.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    """
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    # Pooling layer type
-    if (
-        isinstance(keras_layer, keras.layers.convolutional.MaxPooling2D)
-        or isinstance(keras_layer, keras.layers.convolutional.MaxPooling1D)
-        or isinstance(keras_layer, keras.layers.pooling.GlobalMaxPooling2D)
-        or isinstance(keras_layer, keras.layers.pooling.GlobalMaxPooling1D)
-    ):
-        layer_type_str = "MAX"
-    elif (
-        isinstance(keras_layer, keras.layers.convolutional.AveragePooling2D)
-        or isinstance(keras_layer, keras.layers.convolutional.AveragePooling1D)
-        or isinstance(keras_layer, keras.layers.pooling.GlobalAveragePooling2D)
-        or isinstance(keras_layer, keras.layers.pooling.GlobalAveragePooling1D)
-    ):
-        layer_type_str = "AVERAGE"
-    else:
-        raise TypeError("Pooling type %s not supported" % keras_layer)
-
-    # if it's global, set the global flag
-    if isinstance(keras_layer, keras.layers.pooling.GlobalMaxPooling2D) or isinstance(
-        keras_layer, keras.layers.pooling.GlobalAveragePooling2D
-    ):
-        # 2D global pooling
-        global_pooling = True
-        height, width = (0, 0)
-        stride_height, stride_width = (0, 0)
-        padding_type = "VALID"
-    elif isinstance(keras_layer, keras.layers.pooling.GlobalMaxPooling1D) or isinstance(
-        keras_layer, keras.layers.pooling.GlobalAveragePooling1D
-    ):
-        # 1D global pooling: 1D global pooling seems problematic,
-        # use this work-around
-        global_pooling = False
-        _, width, channels = keras_layer.input_shape
-        height = 1
-        stride_height, stride_width = height, width
-        padding_type = "VALID"
-    else:
-        global_pooling = False
-        # Set pool sizes and strides
-        # 1D cases:
-        if (
-            isinstance(keras_layer, keras.layers.convolutional.MaxPooling1D)
-            or isinstance(keras_layer, keras.layers.pooling.GlobalMaxPooling1D)
-            or isinstance(keras_layer, keras.layers.convolutional.AveragePooling1D)
-            or isinstance(keras_layer, keras.layers.pooling.GlobalAveragePooling1D)
-        ):
-            height, width = 1, keras_layer.pool_length
-            if keras_layer.stride is not None:
-                stride_height, stride_width = 1, keras_layer.stride
-            else:
-                stride_height, stride_width = 1, keras_layer.pool_length
-        # 2D cases:
-        else:
-            height, width = keras_layer.pool_size
-            if keras_layer.strides is None:
-                stride_height, stride_width = height, width
-            else:
-                stride_height, stride_width = keras_layer.strides
-
-        # Padding
-        border_mode = keras_layer.border_mode
-        if keras_layer.border_mode == "valid":
-            padding_type = "VALID"
-        elif keras_layer.border_mode == "same":
-            padding_type = "SAME"
-        else:
-            raise TypeError("Border mode %s not supported" % border_mode)
-
-    builder.add_pooling(
-        name=layer,
-        height=height,
-        width=width,
-        stride_height=stride_height,
-        stride_width=stride_width,
-        layer_type=layer_type_str,
-        padding_type=padding_type,
-        input_name=input_name,
-        output_name=output_name,
-        exclude_pad_area=True,
-        is_global=global_pooling,
-    )
-
-
-def convert_padding(builder, layer, input_names, output_names, keras_layer):
-    """Convert padding layer from keras to coreml.
-    Keras only supports zero padding at this time.
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    """
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    if isinstance(keras_layer, keras.layers.convolutional.ZeroPadding1D):
-        left, right = keras_layer.padding
-        top, bottom = (0, 0)
-    else:  # 2D
-        top, left = keras_layer.padding
-        bottom, right = keras_layer.padding
-
-    # Now add the layer
-    builder.add_padding(
-        name=layer,
-        left=left,
-        right=right,
-        top=top,
-        bottom=bottom,
-        value=0,
-        input_name=input_name,
-        output_name=output_name,
-    )
-
-
-def convert_cropping(builder, layer, input_names, output_names, keras_layer):
-    """Convert padding layer from keras to coreml.
-    Keras only supports zero padding at this time.
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    """
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    if isinstance(keras_layer, keras.layers.convolutional.Cropping1D):
-        left, right = keras_layer.cropping
-        top, bottom = (0, 0)
-    else:  # 2D
-        left, right = keras_layer.cropping[0]
-        top, bottom = keras_layer.cropping[1]
-
-    # Now add the layer
-    builder.add_crop(
-        name=layer,
-        left=left,
-        right=right,
-        top=top,
-        bottom=bottom,
-        offset=[0, 0],
-        input_names=[input_name],
-        output_name=output_name,
-    )
-
-
-def convert_reshape(builder, layer, input_names, output_names, keras_layer):
-    input_name, output_name = (input_names[0], output_names[0])
-
-    input_shape = keras_layer.input_shape
-    target_shape = keras_layer.target_shape
-
-    def get_coreml_target_shape(target_shape):
-        if len(target_shape) == 1:  # (D,)
-            coreml_shape = (1, target_shape[0], 1, 1)
-        elif len(target_shape) == 2:  # (S,D)
-            coreml_shape = target_shape + (1, 1)
-        elif len(target_shape) == 3:  # (H,W,C)
-            coreml_shape = (1, target_shape[2], target_shape[0], target_shape[1])
-        else:
-            coreml_shape = None
-        return coreml_shape
-
-    def get_mode(input_shape, target_shape):
-        in_shape = input_shape[1:]
-        if len(in_shape) == 3 or len(target_shape) == 3:
-            return 1
-        else:
-            return 0
-
-    new_shape = get_coreml_target_shape(target_shape)
-    if new_shape is not None:
-        mode = get_mode(input_shape, target_shape)
-        builder.add_reshape(
-            name=layer,
-            input_name=input_name,
-            output_name=output_name,
-            target_shape=new_shape,
-            mode=mode,
-        )
-    else:
-        _utils.raise_error_unsupported_categorical_option(
-            "input_shape", str(input_shape), "reshape", layer
-        )
-
-
-def convert_upsample(builder, layer, input_names, output_names, keras_layer):
-    """Convert convolution layer from keras to coreml.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    """
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    if isinstance(keras_layer, keras.layers.convolutional.UpSampling1D):
-        fh, fw = 1, keras_layer.length
-    else:  # 2D
-        fh, fw = keras_layer.size
-
-    builder.add_upsample(
-        name=layer,
-        scaling_factor_h=fh,
-        scaling_factor_w=fw,
-        input_name=input_name,
-        output_name=output_name,
-    )
-
-
-def convert_convolution(builder, layer, input_names, output_names, keras_layer):
-    """Convert convolution layer from keras to coreml.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    """
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    has_bias = keras_layer.bias
-    is_deconv = isinstance(keras_layer, keras.layers.convolutional.Deconvolution2D)
-
-    # Get the weights from keras.
-    # Keras stores convolution weights as list of numpy arrays
-    weightList = keras_layer.get_weights()
-    output_shape = list(filter(None, keras_layer.output_shape))[:-1]
-
-    # Parameter
-    height, width, channels, n_filters = weightList[0].shape
-    stride_height, stride_width = keras_layer.subsample
-
-    # Weights and bias terms
-    W = weightList[0]
-    b = weightList[1] if has_bias else None
-
-    # dilation factors
-    dilation_factors = [1, 1]
-    if isinstance(keras_layer, keras.layers.convolutional.AtrousConvolution2D):
-        dilation_factors = list(keras_layer.atrous_rate)
-
-    builder.add_convolution(
-        name=layer,
-        kernel_channels=channels,
-        output_channels=n_filters,
-        height=height,
-        width=width,
-        stride_height=stride_height,
-        stride_width=stride_width,
-        border_mode=keras_layer.border_mode,
-        groups=1,
-        W=W,
-        b=b,
-        has_bias=has_bias,
-        is_deconv=is_deconv,
-        output_shape=output_shape,
-        input_name=input_name,
-        output_name=output_name,
-        dilation_factors=dilation_factors,
-    )
-
-
-def convert_convolution1d(builder, layer, input_names, output_names, keras_layer):
-    """Convert convolution layer from keras to coreml.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    """
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    has_bias = keras_layer.bias
-
-    # Get the weights from keras.
-    # Keras stores convolution weights as list of numpy arrays
-    weightList = keras_layer.get_weights()
-    output_shape = list(filter(None, keras_layer.output_shape))[:-1]
-
-    # Parameter
-    # weightList[0].shape = [kernel_length, input_length(time_step), input_dim, num_kernels]
-    filter_length, input_length, input_dim, n_filters = weightList[0].shape
-    stride_width = keras_layer.subsample[0]
-
-    # Weights and bias terms
-    W = weightList[0]
-    b = weightList[1] if has_bias else None
-
-    dilation_factors = [1, 1]
-    if isinstance(keras_layer, keras.layers.convolutional.AtrousConvolution1D):
-        dilation_factors[-1] = keras_layer.atrous_rate
-
-    builder.add_convolution(
-        name=layer,
-        kernel_channels=input_dim,
-        output_channels=n_filters,
-        height=1,
-        width=filter_length,
-        stride_height=1,
-        stride_width=stride_width,
-        border_mode=keras_layer.border_mode,
-        groups=1,
-        W=W,
-        b=b,
-        has_bias=has_bias,
-        is_deconv=False,
-        output_shape=output_shape,
-        input_name=input_name,
-        output_name=output_name,
-        dilation_factors=dilation_factors,
-    )
-
-
-def convert_lstm(builder, layer, input_names, output_names, keras_layer):
-    """Convert an LSTM layer from keras to coreml.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    """
-
-    hidden_size = keras_layer.output_dim
-    input_size = keras_layer.input_shape[-1]
-    if keras_layer.consume_less not in ["cpu", "gpu"]:
-        raise ValueError(
-            "Cannot convert Keras layer with consume_less = %s"
-            % keras_layer.consume_less
-        )
-
-    output_all = keras_layer.return_sequences
-    reverse_input = keras_layer.go_backwards
-
-    # Keras: I C F O; W_x, W_h, b
-    # CoreML: I F O G; W_h and W_x are separated
-    W_h, W_x, b = ([], [], [])
-    if keras_layer.consume_less == "cpu":
-        W_h.append(keras_layer.get_weights()[1].T)
-        W_h.append(keras_layer.get_weights()[7].T)
-        W_h.append(keras_layer.get_weights()[10].T)
-        W_h.append(keras_layer.get_weights()[4].T)
-
-        W_x.append(keras_layer.get_weights()[0].T)
-        W_x.append(keras_layer.get_weights()[6].T)
-        W_x.append(keras_layer.get_weights()[9].T)
-        W_x.append(keras_layer.get_weights()[3].T)
-
-        b.append(keras_layer.get_weights()[2])
-        b.append(keras_layer.get_weights()[8])
-        b.append(keras_layer.get_weights()[11])
-        b.append(keras_layer.get_weights()[5])
-    else:
-        keras_W_h = keras_layer.get_weights()[1].T
-        W_h.append(keras_W_h[0 * hidden_size :][:hidden_size])
-        W_h.append(keras_W_h[1 * hidden_size :][:hidden_size])
-        W_h.append(keras_W_h[3 * hidden_size :][:hidden_size])
-        W_h.append(keras_W_h[2 * hidden_size :][:hidden_size])
-
-        keras_W_x = keras_layer.get_weights()[0].T
-        W_x.append(keras_W_x[0 * hidden_size :][:hidden_size])
-        W_x.append(keras_W_x[1 * hidden_size :][:hidden_size])
-        W_x.append(keras_W_x[3 * hidden_size :][:hidden_size])
-        W_x.append(keras_W_x[2 * hidden_size :][:hidden_size])
-
-        keras_b = keras_layer.get_weights()[2]
-        b.append(keras_b[0 * hidden_size :][:hidden_size])
-        b.append(keras_b[1 * hidden_size :][:hidden_size])
-        b.append(keras_b[3 * hidden_size :][:hidden_size])
-        b.append(keras_b[2 * hidden_size :][:hidden_size])
-
-    # Set activation type
-    inner_activation_str = _get_recurrent_activation_name_from_keras(
-        keras_layer.inner_activation
-    )
-    activation_str = _get_recurrent_activation_name_from_keras(keras_layer.activation)
-
-    # Add to the network
-    builder.add_unilstm(
-        name=layer,
-        W_h=W_h,
-        W_x=W_x,
-        b=b,
-        hidden_size=hidden_size,
-        input_size=input_size,
-        input_names=input_names,
-        output_names=output_names,
-        inner_activation=inner_activation_str,
-        cell_state_update_activation=activation_str,
-        output_activation=activation_str,
-        output_all=output_all,
-        reverse_input=reverse_input,
-    )
-
-
-def convert_simple_rnn(builder, layer, input_names, output_names, keras_layer):
-    """Convert an SimpleRNN layer from keras to coreml.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    """
-    # Get input and output names
-    hidden_size = keras_layer.output_dim
-    input_size = keras_layer.input_shape[-1]
-
-    output_all = keras_layer.return_sequences
-    reverse_input = keras_layer.go_backwards
-
-    if keras_layer.consume_less not in ["cpu", "gpu"]:
-        raise ValueError(
-            "Cannot convert Keras layer with consume_less = %s"
-            % keras_layer.consume_less
-        )
-
-    W_h = np.zeros((hidden_size, hidden_size))
-    W_x = np.zeros((hidden_size, input_size))
-    b = np.zeros((hidden_size,))
-
-    if keras_layer.consume_less == "cpu":
-        W_h = keras_layer.get_weights()[1].T
-        W_x = keras_layer.get_weights()[0].T
-        b = keras_layer.get_weights()[2]
-    else:
-        W_h = keras_layer.get_weights()[1].T
-        W_x = keras_layer.get_weights()[0].T
-        b = keras_layer.get_weights()[2]
-
-    # Set actication type
-    activation_str = _get_recurrent_activation_name_from_keras(keras_layer.activation)
-
-    # Add to the network
-    builder.add_simple_rnn(
-        name=layer,
-        W_h=W_h,
-        W_x=W_x,
-        b=b,
-        hidden_size=hidden_size,
-        input_size=input_size,
-        activation=activation_str,
-        input_names=input_names,
-        output_names=output_names,
-        output_all=output_all,
-        reverse_input=reverse_input,
-    )
-
-
-def convert_gru(builder, layer, input_names, output_names, keras_layer):
-    """Convert a GRU layer from keras to coreml.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    """
-
-    hidden_size = keras_layer.output_dim
-    input_size = keras_layer.input_shape[-1]
-
-    output_all = keras_layer.return_sequences
-    reverse_input = keras_layer.go_backwards
-
-    if keras_layer.consume_less not in ["cpu", "gpu"]:
-        raise ValueError(
-            "Cannot convert Keras layer with consume_less = %s"
-            % keras_layer.consume_less
-        )
-
-    # Keras: Z R O
-    # CoreML: Z R O
-    W_h, W_x, b = ([], [], [])
-    if keras_layer.consume_less == "cpu":
-        W_x.append(keras_layer.get_weights()[0].T)
-        W_x.append(keras_layer.get_weights()[3].T)
-        W_x.append(keras_layer.get_weights()[6].T)
-
-        W_h.append(keras_layer.get_weights()[1].T)
-        W_h.append(keras_layer.get_weights()[4].T)
-        W_h.append(keras_layer.get_weights()[7].T)
-
-        b.append(keras_layer.get_weights()[2])
-        b.append(keras_layer.get_weights()[5])
-        b.append(keras_layer.get_weights()[8])
-    else:
-        print("consume less not implemented")
-
-    # Set actication type
-    inner_activation_str = _get_recurrent_activation_name_from_keras(
-        keras_layer.inner_activation
-    )
-    activation_str = _get_recurrent_activation_name_from_keras(keras_layer.activation)
-
-    # Add to the network
-    builder.add_gru(
-        name=layer,
-        W_h=W_h,
-        W_x=W_x,
-        b=b,
-        input_size=input_size,
-        hidden_size=hidden_size,
-        input_names=input_names,
-        output_names=output_names,
-        activation=activation_str,
-        inner_activation=inner_activation_str,
-        output_all=output_all,
-        reverse_input=reverse_input,
-    )
-
-
-def convert_bidirectional(builder, layer, input_names, output_names, keras_layer):
-    """Convert a bidirectional layer from keras to coreml.
-        Currently assumes the units are LSTMs.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    """
-
-    input_size = keras_layer.input_shape[-1]
-
-    lstm_layer = keras_layer.forward_layer
-    if type(lstm_layer) != keras.layers.recurrent.LSTM:
-        raise TypeError("Bidirectional layers only supported with LSTM")
-
-    if lstm_layer.go_backwards:
-        raise TypeError(" 'go_backwards' mode not supported with Bidirectional layers")
-
-    output_all = keras_layer.return_sequences
-
-    hidden_size = lstm_layer.output_dim
-    # output_size = lstm_layer.output_dim * 2
-
-    if lstm_layer.consume_less not in ["cpu", "gpu"]:
-        raise ValueError(
-            "Cannot convert Keras layer with consume_less = %s"
-            % keras_layer.consume_less
-        )
-
-    # Keras: I C F O; W_x, W_h, b
-    # CoreML: I F O G; W_h and W_x are separated
-
-    # Keras has all forward weights, followed by backward in the same order
-    W_h, W_x, b = ([], [], [])
-    if lstm_layer.consume_less == "cpu":
-        W_h.append(keras_layer.get_weights()[1].T)
-        W_h.append(keras_layer.get_weights()[7].T)
-        W_h.append(keras_layer.get_weights()[10].T)
-        W_h.append(keras_layer.get_weights()[4].T)
-
-        W_x.append(keras_layer.get_weights()[0].T)
-        W_x.append(keras_layer.get_weights()[6].T)
-        W_x.append(keras_layer.get_weights()[9].T)
-        W_x.append(keras_layer.get_weights()[3].T)
-
-        b.append(keras_layer.get_weights()[2])
-        b.append(keras_layer.get_weights()[8])
-        b.append(keras_layer.get_weights()[11])
-        b.append(keras_layer.get_weights()[5])
-    else:
-        keras_W_h = keras_layer.get_weights()[1].T
-        W_h.append(keras_W_h[0 * hidden_size :][:hidden_size])
-        W_h.append(keras_W_h[1 * hidden_size :][:hidden_size])
-        W_h.append(keras_W_h[3 * hidden_size :][:hidden_size])
-        W_h.append(keras_W_h[2 * hidden_size :][:hidden_size])
-
-        keras_W_x = keras_layer.get_weights()[0].T
-        W_x.append(keras_W_x[0 * hidden_size :][:hidden_size])
-        W_x.append(keras_W_x[1 * hidden_size :][:hidden_size])
-        W_x.append(keras_W_x[3 * hidden_size :][:hidden_size])
-        W_x.append(keras_W_x[2 * hidden_size :][:hidden_size])
-
-        keras_b = keras_layer.get_weights()[2]
-        b.append(keras_b[0 * hidden_size :][:hidden_size])
-        b.append(keras_b[1 * hidden_size :][:hidden_size])
-        b.append(keras_b[3 * hidden_size :][:hidden_size])
-        b.append(keras_b[2 * hidden_size :][:hidden_size])
-
-    W_h_back, W_x_back, b_back = ([], [], [])
-    if keras_layer.backward_layer.consume_less == "cpu":
-        back_weights = keras_layer.backward_layer.get_weights()
-        W_h_back.append(back_weights[1].T)
-        W_h_back.append(back_weights[7].T)
-        W_h_back.append(back_weights[10].T)
-        W_h_back.append(back_weights[4].T)
-
-        W_x_back.append(back_weights[0].T)
-        W_x_back.append(back_weights[6].T)
-        W_x_back.append(back_weights[9].T)
-        W_x_back.append(back_weights[3].T)
-
-        b_back.append(back_weights[2])
-        b_back.append(back_weights[8])
-        b_back.append(back_weights[11])
-        b_back.append(back_weights[5])
-    else:
-        keras_W_h = keras_layer.backward_layer.get_weights()[1].T
-        W_h_back.append(keras_W_h[0 * hidden_size :][:hidden_size])
-        W_h_back.append(keras_W_h[1 * hidden_size :][:hidden_size])
-        W_h_back.append(keras_W_h[3 * hidden_size :][:hidden_size])
-        W_h_back.append(keras_W_h[2 * hidden_size :][:hidden_size])
-
-        keras_W_x = keras_layer.backward_layer.get_weights()[0].T
-        W_x_back.append(keras_W_x[0 * hidden_size :][:hidden_size])
-        W_x_back.append(keras_W_x[1 * hidden_size :][:hidden_size])
-        W_x_back.append(keras_W_x[3 * hidden_size :][:hidden_size])
-        W_x_back.append(keras_W_x[2 * hidden_size :][:hidden_size])
-
-        keras_b = keras_layer.backward_layer.get_weights()[2]
-        b_back.append(keras_b[0 * hidden_size :][:hidden_size])
-        b_back.append(keras_b[1 * hidden_size :][:hidden_size])
-        b_back.append(keras_b[3 * hidden_size :][:hidden_size])
-        b_back.append(keras_b[2 * hidden_size :][:hidden_size])
-
-    # Set activation type
-    inner_activation_str = _get_recurrent_activation_name_from_keras(
-        lstm_layer.inner_activation
-    )
-    activation_str = _get_recurrent_activation_name_from_keras(lstm_layer.activation)
-
-    # Add to the network
-    builder.add_bidirlstm(
-        name=layer,
-        W_h=W_h,
-        W_x=W_x,
-        b=b,
-        W_h_back=W_h_back,
-        W_x_back=W_x_back,
-        b_back=b_back,
-        hidden_size=hidden_size,
-        input_size=input_size,
-        input_names=input_names,
-        output_names=output_names,
-        inner_activation=inner_activation_str,
-        cell_state_update_activation=activation_str,
-        output_activation=activation_str,
-        output_all=output_all,
-    )
-
-
-def convert_batchnorm(builder, layer, input_names, output_names, keras_layer):
-    """
-    Parameters
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    """
-
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    # Currently CoreML supports only per-channel batch-norm
-    if keras_layer.mode != 0:
-        raise NotImplementedError("Currently supports only per-feature normalization")
-
-    axis = keras_layer.axis
-    nb_channels = keras_layer.input_shape[axis]
-
-    # Set parameters
-    # Parameter arrangement in Keras: gamma, beta, mean, variance
-    gamma = keras_layer.get_weights()[0]
-    beta = keras_layer.get_weights()[1]
-    mean = keras_layer.get_weights()[2]
-    std = keras_layer.get_weights()[3]
-    # compute adjusted parameters
-    variance = std * std
-    f = 1.0 / np.sqrt(std + keras_layer.epsilon)
-    gamma1 = gamma * f
-    beta1 = beta - gamma * mean * f
-    mean[:] = 0.0  # mean
-    variance[:] = 1.0 - 0.00001  # stddev
-
-    builder.add_batchnorm(
-        name=layer,
-        channels=nb_channels,
-        gamma=gamma1,
-        beta=beta1,
-        mean=mean,
-        variance=variance,
-        input_name=input_name,
-        output_name=output_name,
-    )
-
-
-def convert_flatten(builder, layer, input_names, output_names, keras_layer):
-    """Convert a flatten layer from keras to coreml.
-
-    Parameters
-    keras_layer: layer
-    ----------
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    """
-    input_name, output_name = (input_names[0], output_names[0])
-
-    # blob_order == 0 if the input blob needs not be rearranged
-    # blob_order == 1 if the input blob needs to be rearranged
-    blob_order = 0
-
-    # using keras_layer.input.shape have a "?" (Dimension[None] at the front),
-    # making a 3D tensor with unknown batch size 4D
-    if len(keras_layer.input.shape) == 4:
-        blob_order = 1
-
-    builder.add_flatten(
-        name=layer, mode=blob_order, input_name=input_name, output_name=output_name
-    )
-
-
-def convert_softmax(builder, layer, input_names, output_names, keras_layer):
-    """Convert a softmax layer from keras to coreml.
-
-    Parameters
-    keras_layer: layer
-    ----------
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    """
-    input_name, output_name = (input_names[0], output_names[0])
-
-    builder.add_softmax(name=layer, input_name=input_name, output_name=output_name)
-
-
-def convert_permute(builder, layer, input_names, output_names, keras_layer):
-    """Convert a softmax layer from keras to coreml.
-
-    Parameters
-    keras_layer: layer
-    ----------
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    """
-    input_name, output_name = (input_names[0], output_names[0])
-
-    keras_dims = keras_layer.dims
-    # Keras permute layer index begins at 1
-    if len(keras_dims) == 3:
-        # Keras input tensor interpret as (H,W,C)
-        x = list(np.array(keras_dims))
-        i1, i2, i3 = x.index(1), x.index(2), x.index(3)
-        x[i1], x[i2], x[i3] = 2, 3, 1
-        # add a sequence axis
-        x = [0] + x
-        dim = tuple(x)
-    elif len(keras_dims) == 4:
-        # Here we use Keras converter as a place holder for inserting
-        # permutations - the values here are not valid Keras dim parameters
-        # but parameters we need to use to convert to CoreML model
-        dim = keras_dims
-    else:
-        raise NotImplementedError("Supports only 3d permutation.")
-
-    builder.add_permute(
-        name=layer, dim=dim, input_name=input_name, output_name=output_name
-    )
-
-
-def convert_embedding(builder, layer, input_names, output_names, keras_layer):
-    """Convert a dense layer from keras to coreml.
-
-    Parameters
-    keras_layer: layer
-    ----------
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    """
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    # Get the weights from keras
-    W = keras_layer.get_weights()[0].T
-
-    # assuming keras embedding layers don't have biases
-    builder.add_embedding(
-        name=layer,
-        W=W,
-        b=None,
-        input_dim=keras_layer.input_dim,
-        output_channels=keras_layer.output_dim,
-        has_bias=False,
-        input_name=input_name,
-        output_name=output_name,
-    )
-
-
-def convert_repeat_vector(builder, layer, input_names, output_names, keras_layer):
-    # Keras RepeatVector only deals with 1D input
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    builder.add_sequence_repeat(
-        name=layer, nrep=keras_layer.n, input_name=input_name, output_name=output_name
-    )
-
-
-def default_skip(builder, layer, input_names, output_names, keras_layer):
-    """ Layers that can be skipped (because they are train time only. """
-    return
diff --git a/coremltools/converters/keras/_layers2.py b/coremltools/converters/keras/_layers2.py
deleted file mode 100644
index d765b87cf..000000000
--- a/coremltools/converters/keras/_layers2.py
+++ /dev/null
@@ -1,1585 +0,0 @@
-# Copyright (c) 2017-2019, Apple Inc. All rights reserved.
-#
-# Use of this source code is governed by a BSD-3-clause license that can be
-# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-from . import _utils
-import logging
-import keras as _keras
-import numpy as _np
-from ...proto import NeuralNetwork_pb2 as _NeuralNetwork_pb2
-
-from distutils.version import StrictVersion as _StrictVersion
-
-if _keras.__version__ >= _StrictVersion("2.2.1"):
-    from keras.layers import DepthwiseConv2D
-elif _keras.__version__ >= _StrictVersion("2.2.0"):
-    from keras.layers import DepthwiseConv2D
-    from keras_applications.mobilenet import relu6
-else:
-    from keras.applications.mobilenet import DepthwiseConv2D, relu6
-
-
-def _get_recurrent_activation_name_from_keras(activation):
-    if activation == _keras.activations.sigmoid:
-        activation_str = "SIGMOID"
-    elif activation == _keras.activations.hard_sigmoid:
-        activation_str = "SIGMOID_HARD"
-    elif activation == _keras.activations.tanh:
-        activation_str = "TANH"
-    elif activation == _keras.activations.relu:
-        activation_str = "RELU"
-    elif activation == _keras.activations.linear:
-        activation_str = "LINEAR"
-    else:
-        raise NotImplementedError(
-            "activation %s not supported for Recurrent layer." % activation
-        )
-
-    return activation_str
-
-
-def _get_activation_name_from_keras_layer(keras_layer):
-    if isinstance(keras_layer, _keras.layers.advanced_activations.LeakyReLU):
-        non_linearity = "LEAKYRELU"
-    elif isinstance(keras_layer, _keras.layers.advanced_activations.PReLU):
-        non_linearity = "PRELU"
-    elif isinstance(keras_layer, _keras.layers.advanced_activations.ELU):
-        non_linearity = "ELU"
-    elif isinstance(keras_layer, _keras.layers.advanced_activations.ThresholdedReLU):
-        non_linearity = "THRESHOLDEDRELU"
-    elif isinstance(keras_layer, _keras.layers.advanced_activations.Softmax):
-        non_linearity = "SOFTMAX"
-    else:
-        act_name = keras_layer.activation.__name__
-
-        if act_name == "softmax":
-            non_linearity = "SOFTMAX"
-        elif act_name == "sigmoid":
-            non_linearity = "SIGMOID"
-        elif act_name == "tanh":
-            non_linearity = "TANH"
-        elif act_name == "relu":
-            non_linearity = "RELU"
-        elif act_name == "relu6":
-            non_linearity = "RELU6"
-        elif act_name == "softplus":
-            non_linearity = "SOFTPLUS"
-        elif act_name == "softsign":
-            non_linearity = "SOFTSIGN"
-        elif act_name == "hard_sigmoid":
-            non_linearity = "SIGMOID_HARD"
-        elif act_name == "elu":
-            non_linearity = "UNIT_ELU"
-        elif act_name == "linear":
-            non_linearity = "LINEAR"
-        elif act_name == "selu":
-            non_linearity = "SELU"
-        else:
-            non_linearity = "CUSTOM"
-
-    return non_linearity
-
-
-def _get_elementwise_name_from_keras_layer(keras_layer):
-    """
-    Get the keras layer name from the activation name.
-    """
-    if isinstance(keras_layer, _keras.layers.Add):
-        return "ADD"
-    elif isinstance(keras_layer, _keras.layers.Multiply):
-        return "MULTIPLY"
-    elif isinstance(keras_layer, _keras.layers.Concatenate):
-        if len(keras_layer.input_shape[0]) == 3 and (
-            keras_layer.axis == 1 or keras_layer.axis == -2
-        ):
-            return "SEQUENCE_CONCAT"
-        if len(keras_layer.input_shape[0]) == 3 and (
-            keras_layer.axis == 2 or keras_layer.axis == -1
-        ):
-            return "CONCAT"
-        elif len(keras_layer.input_shape[0]) == 4 and (
-            keras_layer.axis == 3 or keras_layer.axis == -1
-        ):
-            return "CONCAT"
-        elif len(keras_layer.input_shape[0]) == 2 and (
-            keras_layer.axis == 1 or keras_layer.axis == -1
-        ):
-            return "CONCAT"
-        else:
-            raise ValueError("Only channel and sequence concatenation are supported.")
-    elif isinstance(keras_layer, _keras.layers.Dot):
-        if len(keras_layer.input_shape[0]) == 2:
-            if type(keras_layer.axes) is list or type(keras_layer.axes) is tuple:
-                if len(keras_layer.axes) > 1:
-                    raise ValueError("Only vector dot-product is supported.")
-                else:
-                    axis = keras_layer.axes[0]
-            else:
-                axis = keras_layer.axes
-            if axis != -1 and axis != 1:
-                raise ValueError("Only vector dot-product is supported.")
-
-            if keras_layer.normalize:
-                return "COS"
-            else:
-                return "DOT"
-        else:
-            raise ValueError("Only vector dot-product is supported.")
-    elif isinstance(keras_layer, _keras.layers.Maximum):
-        return "MAX"
-    elif isinstance(keras_layer, _keras.layers.Average):
-        return "AVE"
-    else:
-        _utils.raise_error_unsupported_option(
-            str(type(keras_layer)), "merge", keras_layer.name
-        )
-
-
-def _same_elements_per_channel(x):
-    """ Test if a 3D (H,W,C) matrix x has the same element in each (H,W)
-    matrix for each channel
-    """
-    eps = 1e-5
-    dims = x.shape
-    for c in range(dims[-1]):
-        xc = x[:, :, c].flatten()
-        if not _np.all(_np.absolute(xc - xc[0]) < eps):
-            return False
-    return True
-
-
-def _check_data_format(keras_layer):
-    if hasattr(keras_layer, ("data_format")):
-        if keras_layer.data_format != "channels_last":
-            raise ValueError(
-                "Converter currently supports data_format = " "'channels_last' only."
-            )
-
-
-def convert_dense(
-    builder, layer, input_names, output_names, keras_layer, respect_train
-):
-    """
-    Convert a dense layer from keras to coreml.
-
-    Parameters
-    keras_layer: layer
-    ----------
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    respect_train: boolean
-        Whether or not to carry over Keras' "trainable" parameter.
-    """
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    has_bias = keras_layer.use_bias
-    # Get the weights from keras
-    W = keras_layer.get_weights()[0].T
-    Wb = keras_layer.get_weights()[1].T if has_bias else None
-    output_channels, input_channels = W.shape
-
-    builder.add_inner_product(
-        name=layer,
-        W=W,
-        b=Wb,
-        input_channels=input_channels,
-        output_channels=output_channels,
-        has_bias=has_bias,
-        input_name=input_name,
-        output_name=output_name,
-    )
-    if respect_train and keras_layer.trainable:
-        builder.make_updatable([layer])
-
-
-def convert_embedding(
-    builder, layer, input_names, output_names, keras_layer, respect_train
-):
-    """Convert a dense layer from keras to coreml.
-
-    Parameters
-    keras_layer: layer
-    ----------
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    respect_train: boolean
-        Whether to support Keras' "trainable" flag (unsupported).
-    """
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    # Get the weights from keras
-    W = keras_layer.get_weights()[0].T
-
-    # assuming keras embedding layers don't have biases
-    builder.add_embedding(
-        name=layer,
-        W=W,
-        b=None,
-        input_dim=keras_layer.input_dim,
-        output_channels=keras_layer.output_dim,
-        has_bias=False,
-        input_name=input_name,
-        output_name=output_name,
-    )
-
-    if respect_train and keras_layer.trainable:
-        logging.warning(
-            "Embedding layer '%s' is marked updatable, but Core "
-            "ML does not yet support updating layers of this "
-            "type. The layer will be frozen in Core ML.",
-            layer,
-        )
-
-
-def convert_activation(
-    builder, layer, input_names, output_names, keras_layer, respect_train
-):
-    """
-    Convert an activation layer from keras to coreml.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    respect_train: boolean,
-        Ignored.
-    """
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-    non_linearity = _get_activation_name_from_keras_layer(keras_layer)
-
-    # Add a non-linearity layer
-    if non_linearity == "SOFTMAX":
-        builder.add_softmax(name=layer, input_name=input_name, output_name=output_name)
-        return
-    if non_linearity == "RELU6":
-        # No direct support of RELU with max-activation value - use negate and
-        # clip layers
-        relu_output_name = output_name + "_relu"
-        builder.add_activation(layer, "RELU", input_name, relu_output_name)
-        # negate it
-        neg_output_name = relu_output_name + "_neg"
-        builder.add_activation(
-            layer + "__neg__", "LINEAR", relu_output_name, neg_output_name, [-1.0, 0]
-        )
-        # apply threshold
-        clip_output_name = relu_output_name + "_clip"
-        builder.add_unary(
-            layer + "__clip__",
-            neg_output_name,
-            clip_output_name,
-            "threshold",
-            alpha=-6.0,
-        )
-        # negate it back
-        builder.add_activation(
-            layer + "_neg2", "LINEAR", clip_output_name, output_name, [-1.0, 0]
-        )
-        return
-
-    if non_linearity == "SELU":
-        elu_output_name = output_name + "_elu"
-        builder.add_activation(
-            layer + "__elu__", "ELU", input_name, elu_output_name, params=1.6732
-        )
-        builder.add_elementwise(
-            layer,
-            input_names=elu_output_name,
-            output_name=output_name,
-            mode="MULTIPLY",
-            alpha=1.0507,
-        )
-        return
-
-    params = None
-    if non_linearity == "UNIT_ELU":
-        params = 1.0
-        non_linearity = "ELU"
-    elif non_linearity == "LEAKYRELU":
-        params = [keras_layer.alpha]
-    elif non_linearity == "PRELU":
-        shared_axes = list(keras_layer.shared_axes)
-        if not (shared_axes == [1, 2, 3] or shared_axes == [1, 2]):
-            _utils.raise_error_unsupported_scenario(
-                "Shared axis not being [1,2,3] or [1,2]", "parametric_relu", layer
-            )
-        params = _keras.backend.eval(keras_layer.weights[0])
-    elif non_linearity == "ELU":
-        params = keras_layer.alpha
-    elif non_linearity == "THRESHOLDEDRELU":
-        params = keras_layer.theta
-    else:
-        pass  # do nothing to parameters
-
-    builder.add_activation(
-        name=layer,
-        non_linearity=non_linearity,
-        input_name=input_name,
-        output_name=output_name,
-        params=params,
-    )
-
-
-def convert_advanced_relu(
-    builder, layer, input_names, output_names, keras_layer, respect_train
-):
-    """
-    Convert an ReLU layer with maximum value from keras to coreml.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    respect_train: boolean
-        Ignored.
-    """
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    if keras_layer.max_value is None:
-        builder.add_activation(layer, "RELU", input_name, output_name)
-        return
-
-    # No direct support of RELU with max-activation value - use negate and
-    # clip layers
-    relu_output_name = output_name + "_relu"
-    builder.add_activation(layer, "RELU", input_name, relu_output_name)
-    # negate it
-    neg_output_name = relu_output_name + "_neg"
-    builder.add_activation(
-        layer + "__neg__", "LINEAR", relu_output_name, neg_output_name, [-1.0, 0]
-    )
-    # apply threshold
-    clip_output_name = relu_output_name + "_clip"
-    builder.add_unary(
-        layer + "__clip__",
-        neg_output_name,
-        clip_output_name,
-        "threshold",
-        alpha=-keras_layer.max_value,
-    )
-    # negate it back
-    builder.add_activation(
-        layer + "_neg2", "LINEAR", clip_output_name, output_name, [-1.0, 0]
-    )
-
-
-def convert_convolution(
-    builder, layer, input_names, output_names, keras_layer, respect_train
-):
-    """
-    Convert convolution layer from keras to coreml.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    respect_train: boolean
-        Whether or not to carry over Keras' "trainable" parameter.
-    """
-
-    _check_data_format(keras_layer)
-
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    has_bias = keras_layer.use_bias
-    is_deconv = isinstance(keras_layer, _keras.layers.convolutional.Conv2DTranspose)
-
-    # Get the weights from _keras.
-    weightList = keras_layer.get_weights()
-
-    # Dimensions and weights
-    if is_deconv:
-        height, width, n_filters, channels = weightList[0].shape
-        W = weightList[0].transpose([0, 1, 3, 2])
-        try:
-            output_blob_shape = list(filter(None, keras_layer.output_shape))
-            output_shape = output_blob_shape[:-1]
-        except:
-            output_shape = None
-    else:
-        height, width, channels, n_filters = weightList[0].shape
-        W = weightList[0]
-        output_shape = None
-    b = weightList[1] if has_bias else None
-
-    output_channels = n_filters
-
-    stride_height, stride_width = keras_layer.strides
-
-    # Dilations
-    dilations = [1, 1]
-    if (type(keras_layer.dilation_rate) is list) or (
-        type(keras_layer.dilation_rate) is tuple
-    ):
-        dilations = [keras_layer.dilation_rate[0], keras_layer.dilation_rate[1]]
-    else:
-        dilations = [keras_layer.dilation_rate, keras_layer.dilation_rate]
-    if is_deconv and not dilations == [1, 1]:
-        raise ValueError("Unsupported non-unity dilation for Deconvolution layer")
-
-    groups = 1
-    kernel_channels = channels
-    # depth-wise convolution
-    if isinstance(keras_layer, DepthwiseConv2D):
-        groups = channels
-        kernel_channels = 1
-        depth_multiplier = keras_layer.depth_multiplier
-        W = _np.reshape(W, (height, width, 1, channels * depth_multiplier))
-        output_channels = channels * depth_multiplier
-
-    builder.add_convolution(
-        name=layer,
-        kernel_channels=kernel_channels,
-        output_channels=output_channels,
-        height=height,
-        width=width,
-        stride_height=stride_height,
-        stride_width=stride_width,
-        border_mode=keras_layer.padding,
-        groups=groups,
-        W=W,
-        b=b,
-        has_bias=has_bias,
-        is_deconv=is_deconv,
-        output_shape=output_shape,
-        input_name=input_name,
-        output_name=output_name,
-        dilation_factors=dilations,
-    )
-
-    if respect_train and keras_layer.trainable:
-        builder.make_updatable([layer])
-
-
-def convert_convolution1d(
-    builder, layer, input_names, output_names, keras_layer, respect_train
-):
-    """
-    Convert convolution layer from keras to coreml.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    respect_train: boolean
-        Whether to honor Keras' "trainable" flag.
-    """
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    has_bias = keras_layer.use_bias
-
-    # Get the weights from _keras.
-    # Keras stores convolution weights as list of numpy arrays
-    weightList = keras_layer.get_weights()
-    output_shape = list(filter(None, keras_layer.output_shape))[:-1]
-
-    # Parameter
-    filter_length, input_dim, n_filters = weightList[0].shape
-    stride_width = (
-        keras_layer.strides
-        if type(keras_layer.strides) is int
-        else keras_layer.strides[0]
-    )
-
-    # Weights and bias terms
-    W = _np.expand_dims(weightList[0], axis=0)
-    b = weightList[1] if has_bias else None
-
-    dilations = [1, 1]
-    if (type(keras_layer.dilation_rate) is list) or (
-        type(keras_layer.dilation_rate) is tuple
-    ):
-        dilations = [1, keras_layer.dilation_rate[0]]
-    else:
-        dilations = [1, keras_layer.dilation_rate]
-
-    keras_padding = keras_layer.padding
-    if keras_padding == "causal":
-        builder.add_padding(
-            name=layer + "__causal_pad__",
-            left=filter_length - 1,
-            right=0,
-            top=0,
-            bottom=0,
-            value=0,
-            input_name=input_name,
-            output_name=input_name + "__causal_pad__",
-        )
-        input_name = input_name + "__causal_pad__"
-        keras_padding = "valid"
-
-    builder.add_convolution(
-        name=layer,
-        kernel_channels=input_dim,
-        output_channels=n_filters,
-        height=1,
-        width=filter_length,
-        stride_height=1,
-        stride_width=stride_width,
-        border_mode=keras_padding,
-        groups=1,
-        W=W,
-        b=b,
-        has_bias=has_bias,
-        is_deconv=False,
-        output_shape=output_shape,
-        input_name=input_name,
-        output_name=output_name,
-        dilation_factors=dilations,
-    )
-
-    if respect_train and keras_layer.trainable:
-        builder.make_updatable([layer])
-
-
-def convert_separable_convolution(
-    builder, layer, input_names, output_names, keras_layer, respect_train
-):
-    """
-    Convert separable convolution layer from keras to coreml.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    respect_train: boolean
-        Whether to honor Keras' "trainable" flag.
-    """
-    _check_data_format(keras_layer)
-
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    has_bias = keras_layer.use_bias
-
-    # Get the weights from _keras.
-    weight_list = keras_layer.get_weights()
-    output_blob_shape = list(filter(None, keras_layer.output_shape))
-    output_channels = output_blob_shape[-1]
-
-    # D: depth mutliplier
-    # w[0] is (H,W,Cin,D)
-    # w[1] is (1,1,Cin * D, Cout)
-    W0 = weight_list[0]
-    W1 = weight_list[1]
-    height, width, input_channels, depth_mult = W0.shape
-    b = weight_list[2] if has_bias else None
-
-    W0 = _np.reshape(W0, (height, width, 1, input_channels * depth_mult))
-
-    stride_height, stride_width = keras_layer.strides
-
-    # Dilations
-    if (type(keras_layer.dilation_rate) is list) or (
-        type(keras_layer.dilation_rate) is tuple
-    ):
-        dilations = [keras_layer.dilation_rate[0], keras_layer.dilation_rate[1]]
-    else:
-        dilations = [keras_layer.dilation_rate, keras_layer.dilation_rate]
-
-    intermediate_name = output_name + "_intermin_"
-
-    builder.add_convolution(
-        name=layer + "_step_1",
-        kernel_channels=1,
-        output_channels=input_channels * depth_mult,
-        height=height,
-        width=width,
-        stride_height=stride_height,
-        stride_width=stride_width,
-        border_mode=keras_layer.padding,
-        groups=input_channels,
-        W=W0,
-        b=None,
-        has_bias=False,
-        is_deconv=False,
-        output_shape=None,
-        input_name=input_name,
-        output_name=intermediate_name,
-        dilation_factors=dilations,
-    )
-
-    builder.add_convolution(
-        name=layer + "_step_2",
-        kernel_channels=input_channels * depth_mult,
-        output_channels=output_channels,
-        height=1,
-        width=1,
-        stride_height=1,
-        stride_width=1,
-        border_mode=keras_layer.padding,
-        groups=1,
-        W=W1,
-        b=b,
-        has_bias=has_bias,
-        is_deconv=False,
-        output_shape=None,
-        input_name=intermediate_name,
-        output_name=output_name,
-        dilation_factors=[1, 1],
-    )
-
-    if respect_train and keras_layer.trainable:
-        builder.make_updatable([layer + "_step_1", layer + "_step_2"])
-
-
-def convert_batchnorm(
-    builder, layer, input_names, output_names, keras_layer, respect_train
-):
-    """
-    Convert a Batch Normalization layer.
-
-    Parameters
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    respect_train: boolean
-        Whether to honor Keras' "trainable" flag (unsupported).
-    """
-
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    axis = keras_layer.axis
-    nb_channels = keras_layer.input_shape[axis]
-
-    # Set parameters
-    # Parameter arrangement in Keras: gamma, beta, mean, variance
-    idx = 0
-    gamma, beta = None, None
-    if keras_layer.scale:
-        gamma = keras_layer.get_weights()[idx]
-        idx += 1
-    if keras_layer.center:
-        beta = keras_layer.get_weights()[idx]
-        idx += 1
-    mean = keras_layer.get_weights()[idx]
-    std = keras_layer.get_weights()[idx + 1]
-
-    gamma = _np.ones(mean.shape) if gamma is None else gamma
-    beta = _np.zeros(mean.shape) if beta is None else beta
-
-    # compute adjusted parameters
-    variance = std * std
-    f = 1.0 / _np.sqrt(std + keras_layer.epsilon)
-    gamma1 = gamma * f
-    beta1 = beta - gamma * mean * f
-    mean[:] = 0.0  # mean
-    variance[:] = 1.0 - 0.00001  # stddev
-
-    builder.add_batchnorm(
-        name=layer,
-        channels=nb_channels,
-        gamma=gamma1,
-        beta=beta1,
-        mean=mean,
-        variance=variance,
-        input_name=input_name,
-        output_name=output_name,
-    )
-
-    if respect_train and keras_layer.trainable:
-        logging.warning(
-            "BatchNorm layer '%s' is marked updatable, but Core "
-            "ML does not yet support updating layers of this "
-            "type. The layer will be frozen in Core ML.",
-            layer,
-        )
-
-
-def convert_flatten(
-    builder, layer, input_names, output_names, keras_layer, respect_train
-):
-    """
-    Convert a flatten layer from keras to coreml.
-    ----------
-    Parameters
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    respect_train: boolean
-        Ignored.
-    """
-    input_name, output_name = (input_names[0], output_names[0])
-
-    # blob_order == 0 if the input blob needs not be rearranged
-    # blob_order == 1 if the input blob needs to be rearranged
-    blob_order = 0
-
-    # using keras_layer.input.shape have a "?" (Dimension[None] at the front),
-    # making a 3D tensor with unknown batch size 4D
-
-    try:
-        in_shape = keras_layer.input_shape
-        if len(in_shape) == 4:
-            blob_order = 1
-        if len(in_shape) == 3 and in_shape[0] is None:
-            # handling Keras rank-3 tensor (Batch, Sequence, Channels)
-            permute_output_name = output_name + "__permute__"
-            builder.add_permute(
-                name=layer + "__permute__",
-                dim=(2, 1, 0, 3),
-                input_name=input_name,
-                output_name=permute_output_name,
-            )
-            builder.add_flatten(
-                name=layer,
-                mode=1,
-                input_name=permute_output_name,
-                output_name=output_name,
-            )
-        else:
-            builder.add_flatten(
-                name=layer,
-                mode=blob_order,
-                input_name=input_name,
-                output_name=output_name,
-            )
-    except:
-        builder.add_flatten(
-            name=layer, mode=1, input_name=input_name, output_name=output_name
-        )
-
-
-def convert_merge(
-    builder, layer, input_names, output_names, keras_layer, respect_train
-):
-    """
-    Convert concat layer from keras to coreml.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    respect_train: boolean
-        Ignored.
-    """
-    # Get input and output names
-    output_name = output_names[0]
-
-    mode = _get_elementwise_name_from_keras_layer(keras_layer)
-    builder.add_elementwise(
-        name=layer, input_names=input_names, output_name=output_name, mode=mode
-    )
-
-
-def convert_pooling(
-    builder, layer, input_names, output_names, keras_layer, respect_train
-):
-    """
-    Convert pooling layer from keras to coreml.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    respect_train: boolean
-        Ignored.
-    """
-    _check_data_format(keras_layer)
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    # Pooling layer type
-    if (
-        isinstance(keras_layer, _keras.layers.convolutional.MaxPooling2D)
-        or isinstance(keras_layer, _keras.layers.convolutional.MaxPooling1D)
-        or isinstance(keras_layer, _keras.layers.pooling.GlobalMaxPooling2D)
-        or isinstance(keras_layer, _keras.layers.pooling.GlobalMaxPooling1D)
-    ):
-        layer_type_str = "MAX"
-    elif (
-        isinstance(keras_layer, _keras.layers.convolutional.AveragePooling2D)
-        or isinstance(keras_layer, _keras.layers.convolutional.AveragePooling1D)
-        or isinstance(keras_layer, _keras.layers.pooling.GlobalAveragePooling2D)
-        or isinstance(keras_layer, _keras.layers.pooling.GlobalAveragePooling1D)
-    ):
-        layer_type_str = "AVERAGE"
-    else:
-        raise TypeError("Pooling type %s not supported" % keras_layer)
-
-    # if it's global, set the global flag
-    if isinstance(keras_layer, _keras.layers.pooling.GlobalMaxPooling2D) or isinstance(
-        keras_layer, _keras.layers.pooling.GlobalAveragePooling2D
-    ):
-        # 2D global pooling
-        global_pooling = True
-        height, width = (0, 0)
-        stride_height, stride_width = (0, 0)
-        padding_type = "VALID"
-    elif isinstance(
-        keras_layer, _keras.layers.pooling.GlobalMaxPooling1D
-    ) or isinstance(keras_layer, _keras.layers.pooling.GlobalAveragePooling1D):
-        # 1D global pooling: 1D global pooling seems problematic in the backend,
-        # use this work-around
-        global_pooling = False
-        _, width, channels = keras_layer.input_shape
-        height = 1
-        stride_height, stride_width = height, width
-        padding_type = "VALID"
-    else:
-        global_pooling = False
-        # Set pool sizes and strides
-        # 1D cases:
-        if (
-            isinstance(keras_layer, _keras.layers.convolutional.MaxPooling1D)
-            or isinstance(keras_layer, _keras.layers.pooling.GlobalMaxPooling1D)
-            or isinstance(keras_layer, _keras.layers.convolutional.AveragePooling1D)
-            or isinstance(keras_layer, _keras.layers.pooling.GlobalAveragePooling1D)
-        ):
-            pool_size = (
-                keras_layer.pool_size
-                if type(keras_layer.pool_size) is int
-                else keras_layer.pool_size[0]
-            )
-            height, width = 1, pool_size
-            if keras_layer.strides is not None:
-                strides = (
-                    keras_layer.strides
-                    if type(keras_layer.strides) is int
-                    else keras_layer.strides[0]
-                )
-                stride_height, stride_width = 1, strides
-            else:
-                stride_height, stride_width = 1, pool_size
-        # 2D cases:
-        else:
-            height, width = keras_layer.pool_size
-            if keras_layer.strides is None:
-                stride_height, stride_width = height, width
-            else:
-                stride_height, stride_width = keras_layer.strides
-
-        # Padding
-        padding = keras_layer.padding
-        if keras_layer.padding == "valid":
-            padding_type = "VALID"
-        elif keras_layer.padding == "same":
-            padding_type = "SAME"
-        else:
-            raise TypeError("Border mode %s not supported" % padding)
-
-    builder.add_pooling(
-        name=layer,
-        height=height,
-        width=width,
-        stride_height=stride_height,
-        stride_width=stride_width,
-        layer_type=layer_type_str,
-        padding_type=padding_type,
-        input_name=input_name,
-        output_name=output_name,
-        exclude_pad_area=True,
-        is_global=global_pooling,
-    )
-
-
-def convert_padding(
-    builder, layer, input_names, output_names, keras_layer, respect_train
-):
-    """
-    Convert padding layer from keras to coreml.
-    Keras only supports zero padding at this time.
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    respect_train: boolean
-        Ignored.
-    """
-    _check_data_format(keras_layer)
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    is_1d = isinstance(keras_layer, _keras.layers.ZeroPadding1D)
-
-    padding = keras_layer.padding
-    top = left = bottom = right = 0
-    if is_1d:
-        if type(padding) is int:
-            left = right = padding
-        elif type(padding) is tuple:
-            if type(padding[0]) is int:
-                left, right = padding
-            elif type(padding[0]) is tuple and len(padding[0]) == 2:
-                left, right = padding[0]
-            else:
-                raise ValueError("Unrecognized padding option: %s" % (str(padding)))
-        else:
-            raise ValueError("Unrecognized padding option: %s" % (str(padding)))
-    else:
-        if type(padding) is int:
-            top = left = bottom = right = padding
-        elif type(padding) is tuple:
-            if type(padding[0]) is int:
-                top, left = padding
-                bottom, right = padding
-            elif type(padding[0]) is tuple:
-                top, bottom = padding[0]
-                left, right = padding[1]
-            else:
-                raise ValueError("Unrecognized padding option: %s" % (str(padding)))
-        else:
-            raise ValueError("Unrecognized padding option: %s" % (str(padding)))
-
-    # Now add the layer
-    builder.add_padding(
-        name=layer,
-        left=left,
-        right=right,
-        top=top,
-        bottom=bottom,
-        value=0,
-        input_name=input_name,
-        output_name=output_name,
-    )
-
-
-def convert_cropping(
-    builder, layer, input_names, output_names, keras_layer, respect_train
-):
-    """
-    Convert padding layer from keras to coreml.
-    Keras only supports zero padding at this time.
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    respect_train: boolean
-        Ignored.
-    """
-
-    _check_data_format(keras_layer)
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-    is_1d = isinstance(keras_layer, _keras.layers.Cropping1D)
-
-    cropping = keras_layer.cropping
-    top = left = bottom = right = 0
-    if is_1d:
-        if type(cropping) is int:
-            left = right = cropping
-        elif type(cropping) is tuple:
-            if type(cropping[0]) is int:
-                left, right = cropping
-            elif type(cropping[0]) is tuple and len(cropping[0]) == 2:
-                left, right = cropping[0]
-            else:
-                raise ValueError("Unrecognized cropping option: %s" % (str(cropping)))
-        else:
-            raise ValueError("Unrecognized cropping option: %s" % (str(cropping)))
-    else:
-        if type(cropping) is int:
-            top = left = bottom = right = cropping
-        elif type(cropping) is tuple:
-            if type(cropping[0]) is int:
-                top, left = cropping
-                bottom, right = cropping
-            elif type(cropping[0]) is tuple:
-                top, bottom = cropping[0]
-                left, right = cropping[1]
-            else:
-                raise ValueError("Unrecognized cropping option: %s" % (str(cropping)))
-        else:
-            raise ValueError("Unrecognized cropping option: %s" % (str(cropping)))
-
-    # Now add the layer
-    builder.add_crop(
-        name=layer,
-        left=left,
-        right=right,
-        top=top,
-        bottom=bottom,
-        offset=[0, 0],
-        input_names=[input_name],
-        output_name=output_name,
-    )
-
-
-def convert_upsample(
-    builder, layer, input_names, output_names, keras_layer, respect_train
-):
-    """
-    Convert upsample layer from keras to coreml.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    respect_train: boolean
-        Ignored.
-    """
-    _check_data_format(keras_layer)
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    is_1d = isinstance(keras_layer, _keras.layers.UpSampling1D)
-
-    # Currently, we only support upsample of same dims
-    fh = fw = 1
-    if is_1d:
-        if type(keras_layer.size) is tuple and len(keras_layer.size) == 1:
-            fh, fw = 1, keras_layer.size[0]
-        elif type(keras_layer.size) is int:
-            fh, fw = 1, keras_layer.size
-        else:
-            raise ValueError(
-                "Unrecognized upsample factor format %s" % (str(keras_layer.size))
-            )
-    else:
-        if type(keras_layer.size) is int:
-            fh = fw = keras_layer.size
-        elif len(keras_layer.size) == 2:
-            if keras_layer.size[0] != keras_layer.size[1]:
-                raise ValueError(
-                    "Upsample with different rows and columns not " "supported."
-                )
-            else:
-                fh = keras_layer.size[0]
-                fw = keras_layer.size[1]
-        else:
-            raise ValueError(
-                "Unrecognized upsample factor format %s" % (str(keras_layer.size))
-            )
-
-    kerasmode2coreml = {"nearest": "NN", "bilinear": "BILINEAR"}
-    interpolation = getattr(
-        keras_layer, "interpolation", "nearest"
-    )  # Defaults to 'nearest' for Keras < 2.2.3
-
-    if interpolation not in kerasmode2coreml:
-        raise ValueError(
-            'Only supported "nearest" or "bilinear" interpolation for upsampling layers.'
-        )
-
-    mode = kerasmode2coreml[interpolation]
-
-    builder.add_upsample(
-        name=layer,
-        scaling_factor_h=fh,
-        scaling_factor_w=fw,
-        input_name=input_name,
-        output_name=output_name,
-        mode=mode,
-    )
-
-
-def convert_permute(
-    builder, layer, input_names, output_names, keras_layer, respect_train
-):
-    """
-    Convert a softmax layer from keras to coreml.
-
-    Parameters
-    keras_layer: layer
-    ----------
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    respect_train: boolean
-        Ignored.
-    """
-    input_name, output_name = (input_names[0], output_names[0])
-
-    keras_dims = keras_layer.dims
-    # Keras permute layer index begins at 1
-    if len(keras_dims) == 3:
-        # Keras input tensor interpret as (H,W,C)
-        x = list(_np.array(keras_dims))
-        arr = [2, 3, 1]  # HWC in Keras
-        arr_permuted = [arr[x[0] - 1], arr[x[1] - 1], arr[x[2] - 1]]
-        arr_permuted = [
-            arr_permuted[2],
-            arr_permuted[0],
-            arr_permuted[1],
-        ]  # coreml format: channel first
-        # add a sequence axis
-        dim = [0] + arr_permuted
-        dim = tuple(dim)
-    elif len(keras_dims) == 4:
-        # Here we use Keras converter as a place holder for inserting
-        # permutations - the values here are not valid Keras dim parameters
-        # but parameters we need to use to convert to CoreML model
-        dim = keras_dims
-    else:
-        raise NotImplementedError("Supports only 3d permutation.")
-
-    builder.add_permute(
-        name=layer, dim=dim, input_name=input_name, output_name=output_name
-    )
-
-
-def convert_reshape(
-    builder, layer, input_names, output_names, keras_layer, respect_train
-):
-    """
-    respect_train: boolean
-        Ignored.
-    """
-
-    input_name, output_name = (input_names[0], output_names[0])
-
-    input_shape = keras_layer.input_shape
-    target_shape = keras_layer.target_shape
-
-    def get_coreml_target_shape(target_shape):
-        if len(target_shape) == 1:  # (D,)
-            coreml_shape = (1, target_shape[0], 1, 1)
-        elif len(target_shape) == 2:  # (S,D)
-            coreml_shape = target_shape + (1, 1)
-        elif len(target_shape) == 3:  # (H,W,C)
-            coreml_shape = (1, target_shape[2], target_shape[0], target_shape[1])
-        else:
-            coreml_shape = None
-        return coreml_shape
-
-    def get_mode(input_shape, target_shape):
-        in_shape = input_shape[1:]
-        if len(in_shape) == 3 or len(target_shape) == 3:
-            return 1
-        else:
-            return 0
-
-    new_shape = get_coreml_target_shape(target_shape)
-    if new_shape is not None:
-        mode = get_mode(input_shape, target_shape)
-        builder.add_reshape(
-            name=layer,
-            input_name=input_name,
-            output_name=output_name,
-            target_shape=new_shape,
-            mode=mode,
-        )
-    else:
-        _utils.raise_error_unsupported_categorical_option(
-            "input_shape", str(input_shape), "reshape", layer
-        )
-
-
-def convert_simple_rnn(
-    builder, layer, input_names, output_names, keras_layer, respect_train
-):
-    """
-    Convert an SimpleRNN layer from keras to coreml.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    respect_train: boolean
-        Whether to honor Keras' "trainable" flag (unsupported).
-    """
-    # Get input and output names
-    hidden_size = keras_layer.units
-    input_size = keras_layer.input_shape[-1]
-
-    output_all = keras_layer.return_sequences
-    reverse_input = keras_layer.go_backwards
-
-    W_h = _np.zeros((hidden_size, hidden_size))
-    W_x = _np.zeros((hidden_size, input_size))
-    b = None
-
-    implementation = (
-        keras_layer.implementation if hasattr(keras_layer, "implementation") else 0
-    )
-    if implementation == 0:
-        W_h = keras_layer.get_weights()[1].T
-        W_x = keras_layer.get_weights()[0].T
-        if keras_layer.use_bias:
-            b = keras_layer.get_weights()[2]
-
-    # Set actication type
-    activation_str = _get_recurrent_activation_name_from_keras(keras_layer.activation)
-
-    # Add to the network
-    builder.add_simple_rnn(
-        name=layer,
-        W_h=W_h,
-        W_x=W_x,
-        b=b,
-        hidden_size=hidden_size,
-        input_size=input_size,
-        activation=activation_str,
-        input_names=input_names,
-        output_names=output_names,
-        output_all=output_all,
-        reverse_input=reverse_input,
-    )
-
-    if respect_train and keras_layer.trainable:
-        logging.warning(
-            "RNN layer '%s' is marked updatable, but Core "
-            "ML does not yet support updating layers of this "
-            "type. The layer will be frozen in Core ML.",
-            layer,
-        )
-
-
-def convert_lstm(builder, layer, input_names, output_names, keras_layer, respect_train):
-    """
-    Convert an LSTM layer from keras to coreml.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    respect_train: boolean
-        Whether to honor Keras' "trainable" flag (unsupported).
-    """
-
-    hidden_size = keras_layer.units
-    input_size = keras_layer.input_shape[-1]
-    output_all = keras_layer.return_sequences
-    reverse_input = keras_layer.go_backwards
-
-    # Keras: [W_x, W_h, b] each in I F C O
-    # CoreML: I F O G; W_h and W_x are separated
-    W_h, W_x, b = ([], [], [])
-    keras_W_h = keras_layer.get_weights()[1].T
-    W_h.append(keras_W_h[0 * hidden_size :][:hidden_size])
-    W_h.append(keras_W_h[1 * hidden_size :][:hidden_size])
-    W_h.append(keras_W_h[3 * hidden_size :][:hidden_size])
-    W_h.append(keras_W_h[2 * hidden_size :][:hidden_size])
-
-    keras_W_x = keras_layer.get_weights()[0].T
-    W_x.append(keras_W_x[0 * hidden_size :][:hidden_size])
-    W_x.append(keras_W_x[1 * hidden_size :][:hidden_size])
-    W_x.append(keras_W_x[3 * hidden_size :][:hidden_size])
-    W_x.append(keras_W_x[2 * hidden_size :][:hidden_size])
-    if keras_layer.use_bias:
-        keras_b = keras_layer.get_weights()[2]
-        b.append(keras_b[0 * hidden_size :][:hidden_size])
-        b.append(keras_b[1 * hidden_size :][:hidden_size])
-        b.append(keras_b[3 * hidden_size :][:hidden_size])
-        b.append(keras_b[2 * hidden_size :][:hidden_size])
-    if len(b) == 0:
-        b = None
-
-    # Set activation type
-    inner_activation_str = _get_recurrent_activation_name_from_keras(
-        keras_layer.recurrent_activation
-    )
-    activation_str = _get_recurrent_activation_name_from_keras(keras_layer.activation)
-
-    # Add to the network
-    builder.add_unilstm(
-        name=layer,
-        W_h=W_h,
-        W_x=W_x,
-        b=b,
-        hidden_size=hidden_size,
-        input_size=input_size,
-        input_names=input_names,
-        output_names=output_names,
-        inner_activation=inner_activation_str,
-        cell_state_update_activation=activation_str,
-        output_activation=activation_str,
-        output_all=output_all,
-        forget_bias=keras_layer.unit_forget_bias,
-        reverse_input=reverse_input,
-    )
-
-    if respect_train and keras_layer.trainable:
-        logging.warning(
-            "LSTM layer '%s' is marked updatable, but Core "
-            "ML does not yet support updating layers of this "
-            "type. The layer will be frozen in Core ML.",
-            layer,
-        )
-
-
-def convert_gru(builder, layer, input_names, output_names, keras_layer, respect_train):
-    """
-    Convert a GRU layer from keras to coreml.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    respect_train: boolean
-        Whether to honor Keras' "trainable" flag (unsupported).
-    """
-
-    hidden_size = keras_layer.units
-    input_size = keras_layer.input_shape[-1]
-
-    output_all = keras_layer.return_sequences
-    reverse_input = keras_layer.go_backwards
-
-    # Keras: Z R O
-    # CoreML: Z R O
-    W_h, W_x, b = ([], [], [])
-    keras_W_h = keras_layer.get_weights()[1].T
-    W_h.append(keras_W_h[0 * hidden_size :][:hidden_size])
-    W_h.append(keras_W_h[1 * hidden_size :][:hidden_size])
-    W_h.append(keras_W_h[2 * hidden_size :][:hidden_size])
-
-    keras_W_x = keras_layer.get_weights()[0].T
-    W_x.append(keras_W_x[0 * hidden_size :][:hidden_size])
-    W_x.append(keras_W_x[1 * hidden_size :][:hidden_size])
-    W_x.append(keras_W_x[2 * hidden_size :][:hidden_size])
-
-    if keras_layer.use_bias:
-        keras_b = keras_layer.get_weights()[2]
-        b.append(keras_b[0 * hidden_size :][:hidden_size])
-        b.append(keras_b[1 * hidden_size :][:hidden_size])
-        b.append(keras_b[2 * hidden_size :][:hidden_size])
-    if len(b) == 0:
-        b = None
-
-    # Set actication type
-    inner_activation_str = _get_recurrent_activation_name_from_keras(
-        keras_layer.recurrent_activation
-    )
-    activation_str = _get_recurrent_activation_name_from_keras(keras_layer.activation)
-
-    # Add to the network
-    builder.add_gru(
-        name=layer,
-        W_h=W_h,
-        W_x=W_x,
-        b=b,
-        input_size=input_size,
-        hidden_size=hidden_size,
-        input_names=input_names,
-        output_names=output_names,
-        activation=activation_str,
-        inner_activation=inner_activation_str,
-        output_all=output_all,
-        reverse_input=reverse_input,
-    )
-
-    if respect_train and keras_layer.trainable:
-        logging.warning(
-            "GRU layer '%s' is marked updatable, but Core "
-            "ML does not yet support updating layers of this "
-            "type. The layer will be frozen in Core ML.",
-            layer,
-        )
-
-
-def convert_bidirectional(
-    builder, layer, input_names, output_names, keras_layer, respect_train
-):
-    """
-    Convert a bidirectional layer from keras to coreml.
-    Currently assumes the units are LSTMs.
-
-    Parameters
-    ----------
-    keras_layer: layer
-        A keras layer object.
-
-    builder: NeuralNetworkBuilder
-        A neural network builder object.
-    respect_train: boolean
-        Whether to honor Keras' "trainable" flag (unsupported).
-    """
-
-    input_size = keras_layer.input_shape[-1]
-
-    lstm_layer = keras_layer.forward_layer
-    if type(lstm_layer) != _keras.layers.recurrent.LSTM:
-        raise TypeError("Bidirectional layers only supported with LSTM")
-
-    if lstm_layer.go_backwards:
-        raise TypeError(" 'go_backwards' mode not supported with Bidirectional layers")
-
-    output_all = keras_layer.return_sequences
-    hidden_size = lstm_layer.units
-
-    # Keras: I C F O; W_x, W_h, b
-    # CoreML: I F O G; W_h and W_x are separated
-    # Keras has all forward weights, followed by backward in the same order
-    W_h, W_x, b = ([], [], [])
-    keras_W_h = keras_layer.forward_layer.get_weights()[1].T
-    W_h.append(keras_W_h[0 * hidden_size :][:hidden_size])
-    W_h.append(keras_W_h[1 * hidden_size :][:hidden_size])
-    W_h.append(keras_W_h[3 * hidden_size :][:hidden_size])
-    W_h.append(keras_W_h[2 * hidden_size :][:hidden_size])
-
-    keras_W_x = keras_layer.forward_layer.get_weights()[0].T
-    W_x.append(keras_W_x[0 * hidden_size :][:hidden_size])
-    W_x.append(keras_W_x[1 * hidden_size :][:hidden_size])
-    W_x.append(keras_W_x[3 * hidden_size :][:hidden_size])
-    W_x.append(keras_W_x[2 * hidden_size :][:hidden_size])
-
-    if keras_layer.forward_layer.use_bias:
-        keras_b = keras_layer.forward_layer.get_weights()[2]
-        b.append(keras_b[0 * hidden_size :][:hidden_size])
-        b.append(keras_b[1 * hidden_size :][:hidden_size])
-        b.append(keras_b[3 * hidden_size :][:hidden_size])
-        b.append(keras_b[2 * hidden_size :][:hidden_size])
-
-    if len(b) == 0:
-        b = None
-
-    W_h_back, W_x_back, b_back = ([], [], [])
-    keras_W_h = keras_layer.backward_layer.get_weights()[1].T
-    W_h_back.append(keras_W_h[0 * hidden_size :][:hidden_size])
-    W_h_back.append(keras_W_h[1 * hidden_size :][:hidden_size])
-    W_h_back.append(keras_W_h[3 * hidden_size :][:hidden_size])
-    W_h_back.append(keras_W_h[2 * hidden_size :][:hidden_size])
-
-    keras_W_x = keras_layer.backward_layer.get_weights()[0].T
-    W_x_back.append(keras_W_x[0 * hidden_size :][:hidden_size])
-    W_x_back.append(keras_W_x[1 * hidden_size :][:hidden_size])
-    W_x_back.append(keras_W_x[3 * hidden_size :][:hidden_size])
-    W_x_back.append(keras_W_x[2 * hidden_size :][:hidden_size])
-
-    if keras_layer.backward_layer.use_bias:
-        keras_b = keras_layer.backward_layer.get_weights()[2]
-        b_back.append(keras_b[0 * hidden_size :][:hidden_size])
-        b_back.append(keras_b[1 * hidden_size :][:hidden_size])
-        b_back.append(keras_b[3 * hidden_size :][:hidden_size])
-        b_back.append(keras_b[2 * hidden_size :][:hidden_size])
-    if len(b_back) == 0:
-        b_back = None
-
-    if (b == None and b_back != None) or (b != None and b_back == None):
-        raise ValueError(
-            "Unsupported Bi-directional LSTM configuration. Bias "
-            "must be enabled/disabled for both directions."
-        )
-
-    # Set activation type
-    inner_activation_str = _get_recurrent_activation_name_from_keras(
-        lstm_layer.recurrent_activation
-    )
-    activation_str = _get_recurrent_activation_name_from_keras(lstm_layer.activation)
-
-    output_name_1 = output_names[0]
-    if hasattr(keras_layer, "merge_mode"):
-        merge_mode = keras_layer.merge_mode
-        if merge_mode not in ["concat", "sum", "mul", "ave"]:
-            raise NotImplementedError(
-                "merge_mode '%s' in Bidirectional LSTM "
-                "not supported currently" % merge_mode
-            )
-        if merge_mode != "concat":
-            output_name_1 += "_concatenated_bilstm_output"
-
-    # Add to the network
-    builder.add_bidirlstm(
-        name=layer,
-        W_h=W_h,
-        W_x=W_x,
-        b=b,
-        W_h_back=W_h_back,
-        W_x_back=W_x_back,
-        b_back=b_back,
-        hidden_size=hidden_size,
-        input_size=input_size,
-        input_names=input_names,
-        output_names=[output_name_1] + output_names[1:],
-        inner_activation=inner_activation_str,
-        cell_state_update_activation=activation_str,
-        output_activation=activation_str,
-        forget_bias=lstm_layer.unit_forget_bias,
-        output_all=output_all,
-    )
-
-    if output_name_1 != output_names[0]:
-        mode = "CONCAT"
-        if merge_mode == "sum":
-            mode = "ADD"
-        elif merge_mode == "ave":
-            mode = "AVE"
-        elif merge_mode == "mul":
-            mode = "MULTIPLY"
-        builder.add_split(
-            name=layer + "_split",
-            input_name=output_name_1,
-            output_names=[output_names[0] + "_forward", output_names[0] + "_backward"],
-        )
-        builder.add_elementwise(
-            name=layer + "_elementwise",
-            input_names=[output_names[0] + "_forward", output_names[0] + "_backward"],
-            output_name=output_names[0],
-            mode=mode,
-        )
-
-    if respect_train and keras_layer.trainable:
-        logging.warning(
-            "Bidirectional layer '%s' is marked updatable, but "
-            "Core ML does not yet support updating layers of this "
-            "type. The layer will be frozen in Core ML.",
-            layer,
-        )
-
-
-def convert_repeat_vector(
-    builder, layer, input_names, output_names, keras_layer, respect_train
-):
-    """
-    respect_train: boolean
-        Ignored.
-    """
-    # Keras RepeatVector only deals with 1D input
-    # Get input and output names
-    input_name, output_name = (input_names[0], output_names[0])
-
-    builder.add_sequence_repeat(
-        name=layer, nrep=keras_layer.n, input_name=input_name, output_name=output_name
-    )
-
-
-def default_skip(builder, layer, input_names, output_names, keras_layer, respect_train):
-    """
-    Layers that can be skipped.
-    """
-    return
diff --git a/coremltools/converters/keras/_topology.py b/coremltools/converters/keras/_topology.py
deleted file mode 100644
index d7feb4811..000000000
--- a/coremltools/converters/keras/_topology.py
+++ /dev/null
@@ -1,706 +0,0 @@
-# Copyright (c) 2017, Apple Inc. All rights reserved.
-#
-# Use of this source code is governed by a BSD-3-clause license that can be
-# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-
-import keras as _keras
-import numpy as _np
-
-_KERAS_LAYERS_1D = [
-    _keras.layers.Convolution1D,
-    _keras.layers.AtrousConvolution1D,
-    _keras.layers.UpSampling1D,
-    _keras.layers.ZeroPadding1D,
-    _keras.layers.Cropping1D,
-    _keras.layers.MaxPooling1D,
-    _keras.layers.AveragePooling1D,
-    _keras.layers.GlobalMaxPooling1D,
-    _keras.layers.GlobalAveragePooling1D,
-]
-_KERAS_ACTIVATION_LAYERS = [
-    _keras.layers.Activation,
-    _keras.layers.advanced_activations.LeakyReLU,
-    _keras.layers.advanced_activations.PReLU,
-    _keras.layers.advanced_activations.ELU,
-    _keras.layers.advanced_activations.ParametricSoftplus,
-    _keras.layers.advanced_activations.ThresholdedReLU,
-    _keras.layers.advanced_activations.SReLU,
-]
-
-_KERAS_NORMALIZATION_LAYERS = [
-    _keras.layers.BatchNormalization,
-]
-
-_KERAS_RECURRENT_LAYERS = [
-    _keras.layers.recurrent.LSTM,
-    _keras.layers.recurrent.SimpleRNN,
-    _keras.layers.recurrent.GRU,
-    _keras.layers.wrappers.Bidirectional,
-]
-
-
-def _to_list(x):
-    if type(x) is not list:
-        return [x]
-    else:
-        return x
-
-
-def _insert_to_dict(d, key, e):
-    # d is a dict where key maps to a list
-    if key not in d:
-        d[key] = []
-    if e not in d[key]:
-        d[key].append(e)
-
-
-class NetGraph(object):
-    """
-    Attributes:
-    layer_list - a list of layer names in the Keras model
-    connection_map - a map where the key is a layer, the value is a list of its successors
-    reverse_connection_map - a map where the key is a layer, the value is a list of its predecessors
-    keras_layer_map - a map where the key is a layer name, the value is Keras layer
-    model - a reference of the keras model.
-    blob_names - blob names for each one of the edge.
-    """
-
-    def __init__(self, model):
-        self.layer_list = []
-        self.edge_map = {}
-        self.reverse_edge_map = {}
-        self.keras_layer_map = {}
-
-        self.input_layers = []
-        self.output_layers = []
-        self.layers_inputs = {}  # each layer's input blobs
-        self.layers_outputs = {}  # each layer's output blobs
-
-        # these will be pairs of the form (name, shape) because it'll show up on the interface
-        self.optional_inputs = []
-        self.optional_outputs = []
-        self.layers_optional_inputs = {}
-        self.layers_optional_outputs = {}
-
-        self.model = model
-
-    def _add_layer(self, keras_layer):
-        # add a layer without adding connections.
-        # when a layer exist alreday, this operation won't do anything
-        layer = keras_layer.name
-        if layer not in self.layer_list:
-            self.layer_list.append(layer)
-            self.keras_layer_map[layer] = keras_layer
-
-    def get_predecessors(self, layer_name):
-        if layer_name in self.reverse_edge_map:
-            return self.reverse_edge_map[layer_name][:]  # needs to make a copy
-        else:
-            return []
-
-    def get_successors(self, layer_name):
-        if layer_name in self.edge_map:
-            return self.edge_map[layer_name][:]  # needs to make a copy
-        else:
-            return []
-
-    def get_keras_layer(self, layer_name):
-        return self.keras_layer_map[layer_name]
-
-    def make_input_layers(self):
-        """
-        Extract the ordering of the input layers.
-        """
-        self.input_layers = []
-        if hasattr(self.model, "input_layers"):
-            input_keras_layers = self.model.input_layers[:]
-            self.input_layers = [None] * len(input_keras_layers)
-            for layer in self.layer_list:
-                keras_layer = self.keras_layer_map[layer]
-                if isinstance(keras_layer, _keras.engine.topology.InputLayer):
-                    if keras_layer in input_keras_layers:
-                        idx = input_keras_layers.index(keras_layer)
-                        self.input_layers[idx] = layer
-        elif len(self.model.inbound_nodes) <= 1:
-            for ts in _to_list(self.model.input):
-                # search for the InputLayer that matches this ts
-                for l in self.layer_list:
-                    kl = self.keras_layer_map[l]
-                    if (
-                        isinstance(kl, _keras.engine.topology.InputLayer)
-                        and kl.input == ts
-                    ):
-                        self.input_layers.append(l)
-        else:
-            raise ValueError("Input values cannot be identified.")
-
-    def make_output_layers(self):
-        """
-        Extract the ordering of output layers.
-        """
-        # TODO
-        # use successors == 0 as the criteria for output layer
-        # will fail when some intermediate layers also generate output.
-        # However, because the possibility of having inserted layers,
-        # it's more difficult to tell which layer is the output layer.
-        # Once possible way is to keep track of newly added layers...
-        self.output_layers = []
-        for layer in self.layer_list:
-            if len(self.get_successors(layer)) == 0:
-                self.output_layers.append(layer)
-
-    def get_input_layers(self):
-        return self.input_layers
-
-    def get_output_layers(self):
-        return self.output_layers
-
-    def generate_blob_names(self):
-        """
-        Generate blob names for each one of the edge.  At this time, Keras does not
-        support "fork" operation (a layer with more than 1 blob output). So we just
-        use names of the src layer to identify a blob.  We also assume all neural
-        networks are singly-connected graphs - which should be the case.
-        """
-        # generate blob names that represent edges in blob_name_map
-        # because of the InputLayers, input blobs are also generated.
-
-        # Generate each layer's input / output blob names
-        for layer in self.layer_list:
-            keras_layer = self.keras_layer_map[layer]
-            # no need to generate InputLayers' blobs
-            if not isinstance(keras_layer, _keras.engine.topology.InputLayer):
-                # layer's input blob names depend on predecessors
-                preds = self.get_predecessors(layer)
-                for pred in preds:
-                    blob_name = pred + "_output"
-                    _insert_to_dict(self.layers_inputs, layer, blob_name)
-                # layer's output blob is just named after itself
-                blob_name = layer + "_output"
-                _insert_to_dict(self.layers_outputs, layer, blob_name)
-
-    def get_layer_blobs(self, layer):
-        keras_layer = self.keras_layer_map[layer]
-        if isinstance(keras_layer, _keras.engine.topology.InputLayer):
-            return None, None
-        else:
-            input_blobs = self.layers_inputs[layer]
-            output_blobs = self.layers_outputs[layer]
-            if layer in self.layers_optional_inputs:
-                input_blobs += self.layers_optional_inputs[layer]
-            if layer in self.layers_optional_outputs:
-                output_blobs += self.layers_optional_outputs[layer]
-            return input_blobs, output_blobs
-
-    def reset_model_input_names(self, new_names):
-        # call this method after make_input_layers() is called
-        if new_names is None:
-            return
-        if len(new_names) != len(self.input_layers):
-            print("Input name length mismatch")
-            return
-        for i, in_layer in enumerate(self.input_layers):
-            old_blob_name = in_layer + "_output"
-            new_blob_name = new_names[i]
-            succs = self.get_successors(in_layer)
-            for succ in succs:
-                idx = self.layers_inputs[succ].index(old_blob_name)
-                self.layers_inputs[succ][idx] = new_blob_name
-
-    def reset_model_output_names(self, new_names):
-        if new_names is None:
-            return
-        if len(new_names) != len(self.output_layers):
-            print("Output name length mismatch")
-            return
-        for i, out_layer in enumerate(self.output_layers):
-            new_blob_name = new_names[i]
-            self.layers_outputs[out_layer][0] = new_blob_name
-
-    # need to update both layer's in/out list and graph in/out ports
-    def add_recurrent_optionals(self):
-        # call this after blob names are generated
-        for layer in self.layer_list:
-            keras_layer = self.keras_layer_map[layer]
-            if type(keras_layer) in _KERAS_RECURRENT_LAYERS:
-                if not isinstance(keras_layer, _keras.layers.wrappers.Bidirectional):
-                    hidden_size = keras_layer.output_dim
-                else:
-                    hidden_size = keras_layer.forward_layer.output_dim
-                h_in_name = layer + "_h_in"
-                h_out_name = layer + "_h_out"
-                self.optional_inputs.append((h_in_name, hidden_size))
-                self.optional_outputs.append((h_out_name, hidden_size))
-                _insert_to_dict(self.layers_optional_inputs, layer, h_in_name)
-                _insert_to_dict(self.layers_optional_outputs, layer, h_out_name)
-                if isinstance(keras_layer, _keras.layers.recurrent.LSTM):
-                    c_in_name = layer + "_c_in"
-                    c_out_name = layer + "_c_out"
-                    self.optional_inputs.append((c_in_name, hidden_size))
-                    self.optional_outputs.append((c_out_name, hidden_size))
-                    _insert_to_dict(self.layers_optional_inputs, layer, c_in_name)
-                    _insert_to_dict(self.layers_optional_outputs, layer, c_out_name)
-                elif isinstance(keras_layer, _keras.layers.wrappers.Bidirectional):
-                    c_in_name = layer + "_c_in"
-                    c_out_name = layer + "_c_out"
-                    h_in_name_rev = layer + "_h_in_rev"
-                    c_in_name_rev = layer + "_c_in_rev"
-                    h_out_name_rev = layer + "_h_out_rev"
-                    c_out_name_rev = layer + "_c_out_rev"
-                    self.optional_inputs.append((c_in_name, hidden_size))
-                    self.optional_outputs.append((c_out_name, hidden_size))
-                    self.optional_inputs.append((h_in_name_rev, hidden_size))
-                    self.optional_inputs.append((c_in_name_rev, hidden_size))
-                    self.optional_outputs.append((h_out_name_rev, hidden_size))
-                    self.optional_outputs.append((c_out_name_rev, hidden_size))
-                    _insert_to_dict(self.layers_optional_inputs, layer, c_in_name)
-                    _insert_to_dict(self.layers_optional_outputs, layer, c_out_name)
-                    _insert_to_dict(self.layers_optional_inputs, layer, h_in_name_rev)
-                    _insert_to_dict(self.layers_optional_inputs, layer, c_in_name_rev)
-                    _insert_to_dict(self.layers_optional_outputs, layer, h_out_name_rev)
-                    _insert_to_dict(self.layers_optional_outputs, layer, c_out_name_rev)
-
-    def _get_first_embedded_model(self):
-        for idx, layer in enumerate(self.layer_list):
-            keras_layer = self.keras_layer_map[layer]
-            if isinstance(keras_layer, _keras.models.Sequential) or isinstance(
-                keras_layer, _keras.models.Model
-            ):
-                return idx
-        return -1
-
-    def _get_first_shared_layer(self):
-        for idx, layer in enumerate(self.layer_list):
-            if (
-                (not isinstance(self.keras_layer_map[layer], _keras.layers.Merge))
-                and len(self.get_predecessors(layer)) > 1
-            ):  # weight sharing criteria
-                return idx
-        return -1
-
-    def _get_first_layer_of_type(self, layer_type):
-        for idx, layer in enumerate(self.layer_list):
-            keras_layer = self.keras_layer_map[layer]
-            if isinstance(keras_layer, layer_type):
-                return idx
-        return -1
-
-    def _add_edge(self, src, snk):
-        if src not in self.edge_map:
-            self.edge_map[src] = []
-        if snk not in self.edge_map[src]:
-            self.edge_map[src].append(snk)
-        if snk not in self.reverse_edge_map:
-            self.reverse_edge_map[snk] = []
-        if src not in self.reverse_edge_map[snk]:
-            self.reverse_edge_map[snk].append(src)
-
-    def _remove_edge(self, src, snk):
-        self.edge_map[src].remove(snk)
-        if len(self.edge_map[src]) == 0:
-            self.edge_map.pop(src)
-        self.reverse_edge_map[snk].remove(src)
-        if len(self.reverse_edge_map[snk]) == 0:
-            self.reverse_edge_map.pop(snk)
-
-    def _remove_layer(self, layer):
-        """
-        remove the layer and its input/output edges
-        """
-        successors = self.get_successors(layer)
-        predecessors = self.get_predecessors(layer)
-        # remove all edges
-        for succ in successors:
-            self._remove_edge(layer, succ)
-        for pred in predecessors:
-            self._remove_edge(pred, layer)
-        # remove layer in the data structures
-        self.keras_layer_map.pop(layer)
-        self.layer_list.remove(layer)
-
-    def _remove_layer_and_reconnect(self, layer):
-        """
-        remove the layer, and reconnect each of its predecessor to each of its successor
-        """
-        successors = self.get_successors(layer)
-        predecessors = self.get_predecessors(layer)
-        # remove layer's edges
-        for succ in successors:
-            self._remove_edge(layer, succ)
-        for pred in predecessors:
-            self._remove_edge(pred, layer)
-
-        # connect predecessors and successors
-        for pred in predecessors:
-            for succ in successors:
-                self._add_edge(pred, succ)
-        # remove layer in the data structures
-        self.layer_list.remove(layer)
-        self.keras_layer_map.pop(layer)
-
-    def _remove_old_edges(self, layer):
-        predecessors = self.get_predecessors(layer)
-        successors = self.get_successors(layer)
-        for pred in predecessors:
-            self._remove_edge(pred, layer)
-        for succ in successors:
-            self._remove_edge(layer, succ)
-
-    def _remove_layers_of_type(self, layer_type):
-        idx = self._get_first_layer_of_type(layer_type)
-        while idx >= 0:
-            layer = self.layer_list[idx]
-            self._remove_layer_and_reconnect(layer)
-            idx = self._get_first_layer_of_type(layer_type)
-
-    def remove_skip_layers(self, skip_layers):
-        for skip_layer in skip_layers:
-            self._remove_layers_of_type(skip_layer)
-
-    def remove_internal_input_layers(self):
-        idx, nb_layers = 0, len(self.layer_list)
-        while idx < nb_layers:
-            layer = self.layer_list[idx]
-            keras_layer = self.keras_layer_map[layer]
-            if (
-                isinstance(keras_layer, _keras.engine.topology.InputLayer)
-                and len(self.get_predecessors(layer)) > 0
-            ):
-                # these are internal input layers that needs to be taken out
-                self._remove_layer_and_reconnect(layer)
-                idx -= 1
-                nb_layers -= 1
-            idx += 1
-
-    def _insert_layer_after(self, layer_idx, new_layer, new_keras_layer):
-        """
-        Insert the new_layer after layer, whose position is layer_idx. The new layer's
-        parameter is stored in a Keras layer called new_keras_layer
-        """
-        # reminder: new_keras_layer is not part of the original Keras network,
-        # so it's input / output blob information is missing. It serves only as
-        # a parameter holder.
-        layer = self.layer_list[layer_idx]
-        self.layer_list.insert(layer_idx + 1, new_layer)
-        self.keras_layer_map[new_layer] = new_keras_layer
-        successors = self.get_successors(layer)
-        # add edge layer -> new_layer
-        self._add_edge(layer, new_layer)
-        # add edges new_layer -> layer_successor, remove layer -> successor
-        for succ in successors:
-            self._add_edge(new_layer, succ)
-            self._remove_edge(layer, succ)
-
-    def _insert_layer_between(self, src, snk, new_layer, new_keras_layer):
-        """
-        Insert the new_layer before layer, whose position is layer_idx. The new layer's
-        parameter is stored in a Keras layer called new_keras_layer
-        """
-        if snk is None:
-            insert_pos = self.layer_list.index(src) + 1
-        else:
-            insert_pos = self.layer_list.index(snk)  # insert position
-        self.layer_list.insert(insert_pos, new_layer)
-        self.keras_layer_map[new_layer] = new_keras_layer
-        if src is None:  # snk is an input layer
-            self._add_edge(new_layer, snk)
-        elif snk is None:  # src is an output layer
-            self._add_edge(src, new_layer)
-        else:
-            self._add_edge(src, new_layer)
-            self._add_edge(new_layer, snk)
-            self._remove_edge(src, snk)
-
-    def defuse_activation(self):
-        """
-        Defuse the fused activation layers in the network.
-        """
-        idx, nb_layers = 0, len(self.layer_list)
-        while idx < nb_layers:
-            layer = self.layer_list[idx]
-            k_layer = self.keras_layer_map[layer]
-            # unwrap time-distributed layers
-            if isinstance(k_layer, _keras.layers.TimeDistributed):
-                k_layer = k_layer.layer
-            if (
-                isinstance(k_layer, _keras.layers.convolutional.Convolution2D)
-                or isinstance(k_layer, _keras.layers.convolutional.Convolution1D)
-                or isinstance(k_layer, _keras.layers.core.Dense)
-            ):
-
-                func_name = k_layer.activation.__name__
-
-                if func_name != "linear":
-                    # Create new layer
-                    new_layer = layer + "__activation__"
-                    new_keras_layer = _keras.layers.core.Activation(func_name)
-                    # insert new layer after it
-                    self._insert_layer_after(idx, new_layer, new_keras_layer)
-                    idx += 1
-                    nb_layers += 1
-
-            idx += 1
-
-    def is_activation(self, layer):
-        keras_layer = self.keras_layer_map[layer]
-        for activation_type in _KERAS_ACTIVATION_LAYERS:
-            if isinstance(keras_layer, activation_type):
-                return True
-        return False
-
-    def is_1d_layer(self, layer):
-        keras_layer = self.keras_layer_map[layer]
-        for layer_type in _KERAS_LAYERS_1D:
-            if isinstance(keras_layer, layer_type):
-                return True
-        return False
-
-    def _get_1d_interface_edges(self):
-        """
-        Get edges that represents transition from not 1D to 1D, and 1D to not 1D
-        A 'in_edge e(u,v)' means u operates on non-1D blobs, but v operates on 1D blobs.
-        An 'out_edge e(u,v)' means u operates on 1D blobs, but v operates on non-1D blobs.
-        """
-        in_edges = []
-        for layer in self.layer_list:
-            if not self.is_1d_layer(layer):
-                continue
-            preds = self.get_predecessors(layer)
-            if len(preds) == 0:
-                in_edges.append((None, layer))
-            else:
-                # because 1D layers are all 1-input, there should only be 1 predecessor
-                u, v = preds[0], layer
-                while (u != None) and (
-                    self.is_activation(u) or type(u) in _KERAS_NORMALIZATION_LAYERS
-                ):
-                    preds = self.get_predecessors(u)
-                    v = u
-                    u = preds[0] if len(preds) > 0 else None
-                if u is None or (not self.is_1d_layer(u)):
-                    in_edges.append((u, v))
-
-        out_edges = []
-        for layer in self.layer_list:
-            if not self.is_1d_layer(layer):
-                continue
-            succs = self.get_successors(layer)
-            if len(succs) == 0:
-                out_edges.append((layer, None))
-            elif not self.is_activation(succs[0]):
-                for succ in succs:
-                    if not self.is_1d_layer(succ):
-                        out_edges.append((layer, succ))
-            else:
-                act_layer = succs[0]
-                succs = self.get_successors(act_layer)
-                if len(succs) == 0:
-                    out_edges.append((act_layer, None))
-                else:
-                    for succ in succs:
-                        if not self.is_1d_layer(succ):
-                            out_edges.append((act_layer, succ))
-
-        return in_edges, out_edges
-
-    def insert_1d_permute_layers(self):
-        """
-        Insert permutation layers before a 1D start point or after 1D end point
-        """
-        idx, nb_layers = 0, len(self.layer_list)
-        in_edges, out_edges = self._get_1d_interface_edges()
-
-        # Hacky Warning: (1) use a 4-D permute, which is not likely to happen in Keras,
-        # to represent actual permutation needed for (seq, c, h, w) in CoreML
-        # (2) Assume 2-D input shape has meaning (seq, c), and during CoreML runtime,
-        # it is represented as 4D blob, (seq, c, h, w)
-        for in_edge in in_edges:
-            src, snk = in_edge
-            if src is None:
-                permute_layer = "_permute_" + snk
-            else:
-                permute_layer = src + "_permute_" + snk
-            keras_permute = _keras.layers.Permute(
-                dims=(3, 1, 2, 0)
-            )  # assume w = 1, switch seq and w
-            self._insert_layer_between(src, snk, permute_layer, keras_permute)
-        for out_edge in out_edges:
-            src, snk = out_edge
-            if snk is None:
-                permute_layer = src + "_permute_"
-            else:
-                permute_layer = src + "_permute_" + snk
-            keras_permute = _keras.layers.Permute(
-                dims=(3, 1, 2, 0)
-            )  # assume w = 1, switch seq and w back
-            self._insert_layer_between(src, snk, permute_layer, keras_permute)
-
-    def insert_permute_for_spatial_bn(self):
-
-        # find spatial batchnorm layers
-        spatial_bn_layers = []
-        for layer in self.layer_list:
-            keras_layer = self.keras_layer_map[layer]
-            if (
-                isinstance(keras_layer, _keras.layers.BatchNormalization)
-                and len(keras_layer.input_shape) == 4
-            ):
-                if keras_layer.axis == 1 or keras_layer.axis == 2:
-                    spatial_bn_layers.append(layer)
-
-        for sbn in spatial_bn_layers:
-            axis = self.keras_layer_map[sbn].axis
-            # axis == 1: swap H axis; axis == 2 : swap W axis
-            dims = (0, 2, 1, 3) if axis == 1 else (0, 3, 2, 1)
-            # add permutation before spatial batchnorm
-            pred = self.get_predecessors(sbn)[0]
-            permute_layer = pred + "_permute_" + sbn
-            keras_permute = _keras.layers.Permute(dims=dims)
-            self._insert_layer_between(pred, sbn, permute_layer, keras_permute)
-            # add permutation after spatial batchnorm
-            succs = self.get_successors(sbn)
-            if len(succs) == 0:
-                permute_layer = sbn + "_permute_"
-                keras_permute = _keras.layers.Permute(dims=dims)
-                self._insert_layer_between(sbn, None, permute_layer, keras_permute)
-            else:
-                for succ in succs:
-                    permute_layer = sbn + "_permute_" + succ
-                    keras_permute = _keras.layers.Permute(dims=dims)
-                    self._insert_layer_between(sbn, succ, permute_layer, keras_permute)
-
-    def build(self):
-        # sanity check.
-        model = self.model
-        if not (
-            type(model) == _keras.models.Sequential
-            or type(model) == _keras.models.Model
-        ):
-            raise TypeError("Keras layer of type %s is not supported." % type(model))
-            self = None
-            return
-
-        # build the graph without considering embedded subgraphs
-        for i, layer in enumerate(model.layers):
-            for node in layer.inbound_nodes:
-                for pred in node.inbound_layers:
-                    if pred.name not in self.layer_list:
-                        self.layer_list.append(pred.name)
-                        self.keras_layer_map[pred.name] = pred
-                    self._add_edge(pred.name, layer.name)
-            self.layer_list.append(layer.name)
-            self.keras_layer_map[layer.name] = layer
-
-        # Duplicate models for weight sharing
-        idx = self._get_first_shared_layer()
-        while idx >= 0:
-            layer = self.layer_list[idx]
-            keras_layer = self.keras_layer_map[layer]
-            predecessors = self.reverse_edge_map[layer]
-            successors = self.edge_map[layer]
-            new_layers = [layer + "_" + str(i) for i in range(len(predecessors))]
-            self.layer_list[idx : idx + 1] = new_layers
-            for i, new_layer in enumerate(new_layers):
-                self.edge_map[new_layer] = []
-                self.reverse_edge_map[new_layer] = []
-                self.keras_layer_map[new_layer] = keras_layer
-                pred = predecessors[i]
-                self._add_edge(pred, new_layer)
-                for succ in successors:
-                    self._add_edge(new_layer, succ)
-            self._remove_old_edges(layer)
-            self.keras_layer_map.pop(layer)
-            idx = self._get_first_shared_layer()
-
-        # Expand the sub-models
-        idx = self._get_first_embedded_model()
-        while idx >= 0:
-            # grab the input and output edges of the embedded model
-            embedded_model = self.layer_list[idx]
-            # build the embedded model
-            embedded_keras_model = self.keras_layer_map[embedded_model]
-            embedded_graph = NetGraph(embedded_keras_model)
-            embedded_graph.build()
-            # replace the embedded model with the layers of the embedded graph
-            embedded_layer_list = embedded_graph.layer_list
-            new_layer_list = []
-            for embedded_layer_name in embedded_layer_list:
-                new_layer_name = embedded_model + "_" + embedded_layer_name
-                new_layer_list.append(new_layer_name)
-                self.keras_layer_map[new_layer_name] = embedded_graph.keras_layer_map[
-                    embedded_layer_name
-                ]
-                # add edge [embed_layer -> its succ]
-                embedded_successors = embedded_graph.get_successors(embedded_layer_name)
-                for embed_succ_name in embedded_successors:
-                    new_embed_succ_name = embedded_model + "_" + embed_succ_name
-                    self._add_edge(new_layer_name, new_embed_succ_name)
-                # add edge [pred -> embed_layer]
-                embedded_predecessors = embedded_graph.get_predecessors(
-                    embedded_layer_name
-                )
-                for embed_pred_name in embedded_predecessors:
-                    new_embed_pred_name = embedded_model + "_" + embed_pred_name
-                    self._add_edge(new_embed_pred_name, new_layer_name)
-
-            self.layer_list[idx + 1 : idx + 1] = new_layer_list
-            # replace input / output edges to the model with input/output edges of the embedded layers
-            predecessors = self.get_predecessors(embedded_model)
-            embedded_inputs = embedded_graph.get_input_layers()
-            for i, pred in enumerate(predecessors):
-                embed_input = embedded_inputs[i]
-                new_embed_input = embedded_model + "_" + embed_input
-                self._add_edge(pred, new_embed_input)
-
-            embedded_outputs = embedded_graph.get_output_layers()
-            successors = self.get_successors(embedded_model)
-            for i, succ in enumerate(successors):
-                embed_output = embedded_outputs[i]
-                new_embed_output = embedded_model + "_" + embed_output
-
-                self._add_edge(new_embed_output, succ)
-
-            # clear up the embedded model
-            self._remove_layer(embedded_model)
-            idx = self._get_first_embedded_model()
-
-        self.make_input_layers()
-        self.make_output_layers()
-
-    def print_layer_list(self):
-        print("\n")
-        print("layer_list")
-        print(self.layer_list)
-
-    def print_edge_map(self):
-        print("\n")
-        print("edge map:")
-        for src in self.edge_map:
-            for snk in self.edge_map[src]:
-                print("  ", src, "-->", snk)
-
-    def print_reverse_edge_map(self):
-        print("\n")
-        print("reverse edge map: ")
-        for snk in self.reverse_edge_map:
-            for src in self.reverse_edge_map[snk]:
-                print("  ", snk, "<--", src)
-
-    def print_mapping(self):
-        print("\nmapping:")
-        for key in self.keras_layer_map:
-            print(
-                key,
-                "-->",
-                self.keras_layer_map[key],
-                "(",
-                self.keras_layer_map[key].name,
-                ")",
-            )
-
-    def print_all(self):
-        print("=" * 80)
-        self.print_layer_list()
-        self.print_edge_map()
-        self.print_reverse_edge_map()
-        self.print_mapping()
diff --git a/coremltools/converters/keras/_topology2.py b/coremltools/converters/keras/_topology2.py
deleted file mode 100644
index 705dc301d..000000000
--- a/coremltools/converters/keras/_topology2.py
+++ /dev/null
@@ -1,837 +0,0 @@
-# Copyright (c) 2021, Apple Inc. All rights reserved.
-#
-# Use of this source code is governed by a BSD-3-clause license that can be
-# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-
-import keras as _keras
-import numpy as _np
-
-_KERAS_LAYERS_1D = [
-    _keras.layers.Conv1D,
-    _keras.layers.UpSampling1D,
-    _keras.layers.ZeroPadding1D,
-    _keras.layers.Cropping1D,
-    _keras.layers.MaxPooling1D,
-    _keras.layers.AveragePooling1D,
-    _keras.layers.GlobalMaxPooling1D,
-    _keras.layers.GlobalAveragePooling1D,
-]
-
-_KERAS_ACTIVATION_LAYERS = [
-    _keras.layers.Activation,
-    _keras.layers.advanced_activations.LeakyReLU,
-    _keras.layers.advanced_activations.PReLU,
-    _keras.layers.advanced_activations.ELU,
-    _keras.layers.advanced_activations.ThresholdedReLU,
-]
-
-_KERAS_NORMALIZATION_LAYERS = [
-    _keras.layers.BatchNormalization,
-]
-
-_KERAS_RECURRENT_LAYERS = [
-    _keras.layers.recurrent.LSTM,
-    _keras.layers.recurrent.SimpleRNN,
-    _keras.layers.recurrent.GRU,
-    _keras.layers.wrappers.Bidirectional,
-]
-
-_KERAS_MERGE_LAYERS = [
-    _keras.layers.Add,
-    _keras.layers.Multiply,
-    _keras.layers.Average,
-    _keras.layers.Maximum,
-    _keras.layers.Concatenate,
-    _keras.layers.Dot,
-]
-
-_KERAS_SKIP_LAYERS = [
-    _keras.layers.core.Dropout,
-    _keras.layers.core.SpatialDropout1D,
-    _keras.layers.core.SpatialDropout2D,
-]
-
-from distutils.version import StrictVersion as _StrictVersion
-
-if _keras.__version__ >= _StrictVersion("2.2.0"):
-    from keras.engine.input_layer import InputLayer
-else:
-    from keras.engine.topology import InputLayer
-
-
-def _to_list(x):
-    if type(x) is not list:
-        return [x]
-    else:
-        return x
-
-
-def _insert_to_dict(d, key, e):
-    # d is a dict where key maps to a list
-    if key not in d:
-        d[key] = []
-    if e not in d[key]:
-        d[key].append(e)
-
-
-def _is_merge_layer(layer):
-    for lt in _KERAS_MERGE_LAYERS:
-        if isinstance(layer, lt):
-            return True
-    return False
-
-
-class NetGraph(object):
-    """
-    Attributes:
-    layer_list - a list of layer names in the Keras model
-    connection_map - a map where the key is a layer, the value is a list of its successors
-    reverse_connection_map - a map where the key is a layer, the value is a list of its predecessors
-    keras_layer_map - a map where the key is a layer name, the value is Keras layer
-    model - a reference of the keras model.
-    blob_names - blob names for each one of the edge.
-    """
-
-    def __init__(self, model):
-        self.layer_list = []
-        self.edge_map = {}
-        self.reverse_edge_map = {}
-        self.keras_layer_map = {}
-
-        self.input_layers = []
-        self.output_layers = []
-        self.layers_inputs = {}  # each layer's input blobs
-        self.layers_outputs = {}  # each layer's output blobs
-
-        # these will be pairs of the form (name, shape) because it'll show up on the interface
-        self.optional_inputs = []
-        self.optional_outputs = []
-        self.layers_optional_inputs = {}
-        self.layers_optional_outputs = {}
-
-        self.model = model
-
-    def _add_layer(self, keras_layer):
-        # add a layer without adding connections.
-        # when a layer exist alreday, this operation won't do anything
-        layer = keras_layer.name
-        if layer not in self.layer_list:
-            self.layer_list.append(layer)
-            self.keras_layer_map[layer] = keras_layer
-
-    def _replace_blob_name(self, old_name, new_name):
-        # replace a blob with a new name
-        for l in self.layers_outputs:
-            for idx, b in enumerate(self.layers_outputs[l]):
-                if b == old_name:
-                    self.layers_outputs[l][idx] = new_name
-
-        for l in self.layers_inputs:
-            for idx, b in enumerate(self.layers_inputs[l]):
-                if b == old_name:
-                    self.layers_inputs[l][idx] = new_name
-
-    def get_predecessors(self, layer_name):
-        if layer_name in self.reverse_edge_map:
-            return self.reverse_edge_map[layer_name][:]  # needs to make a copy
-        else:
-            return []
-
-    def get_successors(self, layer_name):
-        if layer_name in self.edge_map:
-            return self.edge_map[layer_name][:]  # needs to make a copy
-        else:
-            return []
-
-    def get_keras_layer(self, layer_name):
-        return self.keras_layer_map[layer_name]
-
-    def get_coreml_layers(self, keras_layer):
-        coreml_layers = []
-        for key in self.keras_layer_map:
-            if self.keras_layer_map[key] == keras_layer:
-                coreml_layers.append(key)
-        return coreml_layers
-
-    def make_input_layers(self):
-        """
-        Extract the ordering of the input layers.
-        """
-        self.input_layers = []
-        in_nodes = (
-            self.model._inbound_nodes
-            if hasattr(self.model, "_inbound_nodes")
-            else self.model.inbound_nodes
-        )
-        if hasattr(self.model, "input_layers"):
-            input_keras_layers = self.model.input_layers[:]
-            self.input_layers = [None] * len(input_keras_layers)
-            for layer in self.layer_list:
-                keras_layer = self.keras_layer_map[layer]
-                if isinstance(keras_layer, InputLayer):
-                    if keras_layer in input_keras_layers:
-                        idx = input_keras_layers.index(keras_layer)
-                        self.input_layers[idx] = layer
-        elif hasattr(self.model, "inputs"):
-            for ts in _to_list(self.model.inputs):
-                # search for the InputLayer that matches this ts
-                for l in self.layer_list:
-                    kl = self.keras_layer_map[l]
-                    if isinstance(kl, InputLayer) and kl.input == ts:
-                        self.input_layers.append(l)
-        elif len(in_nodes) <= 1:
-            for ts in _to_list(self.model.input):
-                # search for the InputLayer that matches this ts
-                for l in self.layer_list:
-                    kl = self.keras_layer_map[l]
-                    if isinstance(kl, InputLayer) and kl.input == ts:
-                        self.input_layers.append(l)
-        else:
-            raise ValueError("Input values cannot be identified.")
-
-    def make_output_layers(self):
-        """
-        Extract the ordering of output layers.
-        """
-        self.output_layers = []
-        # import pytest; pytest.set_trace()
-        if hasattr(self.model, "output_layers"):
-            # find corresponding output layers in CoreML model
-            # assume output layers are not shared
-            # Helper function to recursively extract output layers
-            # even if the model has a layer which is a nested model
-            def extract_output_layers(keras_model):
-                output_layers = []
-                for layer in keras_model.output_layers:
-                    if hasattr(layer, "output_layers"):
-                        output_layers.extend(extract_output_layers(layer))
-                    else:
-                        output_layers.append(layer)
-                return output_layers
-
-            for kl in extract_output_layers(self.model):
-                coreml_layers = self.get_coreml_layers(kl)
-                if len(coreml_layers) > 0:
-                    for cl in coreml_layers:
-                        self.output_layers.append(cl)
-        elif len(self.model.outputs) > 0:
-            for model_output in self.model.outputs:
-                for l in self.layer_list:
-                    k_layer = self.keras_layer_map[l]
-                    in_nodes = (
-                        k_layer._inbound_nodes
-                        if hasattr(k_layer, "_inbound_nodes")
-                        else k_layer.inbound_nodes
-                    )
-                    for idx in range(len(in_nodes)):
-                        out_tensor = k_layer.get_output_at(idx)
-                        if out_tensor == model_output or (
-                            out_tensor.name in model_output.name
-                        ):
-                            self.output_layers.append(l)
-        if len(self.output_layers) == 0:
-            raise ValueError("No outputs can be identified")
-
-    def get_input_layers(self):
-        return self.input_layers
-
-    def get_output_layers(self):
-        return self.output_layers
-
-    def generate_blob_names(self):
-        """
-        Generate blob names for each one of the edge.  At this time, Keras does not
-        support "fork" operation (a layer with more than 1 blob output). So we just
-        use names of the src layer to identify a blob.  We also assume all neural
-        networks are singly-connected graphs - which should be the case.
-        """
-        # generate blob names that represent edges in blob_name_map
-        # because of the InputLayers, input blobs are also generated.
-
-        # Generate each layer's input / output blob names
-        for layer in self.layer_list:
-            keras_layer = self.keras_layer_map[layer]
-            # no need to generate InputLayers' blobs
-            if not isinstance(keras_layer, InputLayer):
-                # layer's input blob names depend on predecessors
-                preds = self.get_predecessors(layer)
-                for pred in preds:
-                    blob_name = pred + "_output"
-                    _insert_to_dict(self.layers_inputs, layer, blob_name)
-                # layer's output blob is just named after itself
-                blob_name = layer + "_output"
-                _insert_to_dict(self.layers_outputs, layer, blob_name)
-
-    def get_layer_blobs(self, layer):
-        keras_layer = self.keras_layer_map[layer]
-        if isinstance(keras_layer, InputLayer):
-            return None, None
-        else:
-            input_blobs = self.layers_inputs[layer]
-            output_blobs = self.layers_outputs[layer]
-            if layer in self.layers_optional_inputs:
-                input_blobs += self.layers_optional_inputs[layer]
-            if layer in self.layers_optional_outputs:
-                output_blobs += self.layers_optional_outputs[layer]
-            return input_blobs, output_blobs
-
-    def reset_model_input_names(self, new_names):
-        # call this method after make_input_layers() is called
-        if new_names is None:
-            return
-        if len(new_names) != len(self.input_layers):
-            print("Input name length mismatch")
-            return
-        for i, in_layer in enumerate(self.input_layers):
-            old_blob_name = in_layer + "_output"
-            new_blob_name = new_names[i]
-            succs = self.get_successors(in_layer)
-            for succ in succs:
-                idx = self.layers_inputs[succ].index(old_blob_name)
-                self.layers_inputs[succ][idx] = new_blob_name
-
-    def reset_model_output_names(self, new_names):
-        if new_names is None:
-            return
-        if len(new_names) != len(self.output_layers):
-            print("Output name length mismatch")
-            return
-        for i, out_layer in enumerate(self.output_layers):
-            old_blob_name = self.layers_outputs[self.output_layers[i]][0]
-            self._replace_blob_name(old_blob_name, new_names[i])
-
-    # need to update both layer's in/out list and graph in/out ports
-    def add_recurrent_optionals(self):
-        # call this after blob names are generated
-        for layer in self.layer_list:
-            keras_layer = self.keras_layer_map[layer]
-            if type(keras_layer) in _KERAS_RECURRENT_LAYERS:
-                if not isinstance(keras_layer, _keras.layers.wrappers.Bidirectional):
-                    hidden_size = keras_layer.units
-                else:
-                    hidden_size = keras_layer.forward_layer.units
-                h_in_name = layer + "_h_in"
-                h_out_name = layer + "_h_out"
-                self.optional_inputs.append((h_in_name, hidden_size))
-                self.optional_outputs.append((h_out_name, hidden_size))
-                _insert_to_dict(self.layers_optional_inputs, layer, h_in_name)
-                _insert_to_dict(self.layers_optional_outputs, layer, h_out_name)
-                if isinstance(keras_layer, _keras.layers.recurrent.LSTM):
-                    c_in_name = layer + "_c_in"
-                    c_out_name = layer + "_c_out"
-                    self.optional_inputs.append((c_in_name, hidden_size))
-                    self.optional_outputs.append((c_out_name, hidden_size))
-                    _insert_to_dict(self.layers_optional_inputs, layer, c_in_name)
-                    _insert_to_dict(self.layers_optional_outputs, layer, c_out_name)
-                elif isinstance(keras_layer, _keras.layers.wrappers.Bidirectional):
-                    c_in_name = layer + "_c_in"
-                    c_out_name = layer + "_c_out"
-                    h_in_name_rev = layer + "_h_in_rev"
-                    c_in_name_rev = layer + "_c_in_rev"
-                    h_out_name_rev = layer + "_h_out_rev"
-                    c_out_name_rev = layer + "_c_out_rev"
-                    self.optional_inputs.append((c_in_name, hidden_size))
-                    self.optional_outputs.append((c_out_name, hidden_size))
-                    self.optional_inputs.append((h_in_name_rev, hidden_size))
-                    self.optional_inputs.append((c_in_name_rev, hidden_size))
-                    self.optional_outputs.append((h_out_name_rev, hidden_size))
-                    self.optional_outputs.append((c_out_name_rev, hidden_size))
-                    _insert_to_dict(self.layers_optional_inputs, layer, c_in_name)
-                    _insert_to_dict(self.layers_optional_outputs, layer, c_out_name)
-                    _insert_to_dict(self.layers_optional_inputs, layer, h_in_name_rev)
-                    _insert_to_dict(self.layers_optional_inputs, layer, c_in_name_rev)
-                    _insert_to_dict(self.layers_optional_outputs, layer, h_out_name_rev)
-                    _insert_to_dict(self.layers_optional_outputs, layer, c_out_name_rev)
-
-    def _get_first_embedded_model(self):
-        for idx, layer in enumerate(self.layer_list):
-            keras_layer = self.keras_layer_map[layer]
-            if isinstance(keras_layer, _keras.models.Sequential) or isinstance(
-                keras_layer, _keras.models.Model
-            ):
-                return idx
-        return -1
-
-    def _get_first_shared_layer(self):
-        for idx, layer in enumerate(self.layer_list):
-            keras_layer = self.keras_layer_map[layer]
-            inbound_nodes = (
-                keras_layer.inbound_nodes
-                if hasattr(keras_layer, "inbound_nodes")
-                else keras_layer._inbound_nodes
-            )
-            inbound_nodes = [
-                node for node in inbound_nodes if len(node.inbound_layers) > 0
-            ]
-            if (
-                not _is_merge_layer(self.keras_layer_map[layer])
-                and len(self.get_predecessors(layer)) > 1
-                and len(inbound_nodes) > 1
-            ):
-                return idx
-        return -1
-
-    def _get_first_layer_of_type(self, layer_type):
-        for idx, layer in enumerate(self.layer_list):
-            keras_layer = self.keras_layer_map[layer]
-            if isinstance(keras_layer, layer_type):
-                return idx
-        return -1
-
-    def _add_edge(self, src, snk):
-        if src not in self.edge_map:
-            self.edge_map[src] = []
-        if snk not in self.edge_map[src]:
-            self.edge_map[src].append(snk)
-        if snk not in self.reverse_edge_map:
-            self.reverse_edge_map[snk] = []
-        if src not in self.reverse_edge_map[snk]:
-            self.reverse_edge_map[snk].append(src)
-
-    def _remove_edge(self, src, snk):
-        self.edge_map[src].remove(snk)
-        if len(self.edge_map[src]) == 0:
-            self.edge_map.pop(src)
-        self.reverse_edge_map[snk].remove(src)
-        if len(self.reverse_edge_map[snk]) == 0:
-            self.reverse_edge_map.pop(snk)
-
-    def _remove_layer(self, layer):
-        """
-        remove the layer and its input/output edges
-        """
-        successors = self.get_successors(layer)
-        predecessors = self.get_predecessors(layer)
-        # remove all edges
-        for succ in successors:
-            self._remove_edge(layer, succ)
-        for pred in predecessors:
-            self._remove_edge(pred, layer)
-        # remove layer in the data structures
-        self.keras_layer_map.pop(layer)
-        self.layer_list.remove(layer)
-
-    def _remove_layer_and_reconnect(self, layer):
-        """ Remove the layer, and reconnect each of its predecessor to each of
-        its successor
-        """
-        successors = self.get_successors(layer)
-        predecessors = self.get_predecessors(layer)
-        # remove layer's edges
-        for succ in successors:
-            self._remove_edge(layer, succ)
-        for pred in predecessors:
-            self._remove_edge(pred, layer)
-
-        # connect predecessors and successors
-        for pred in predecessors:
-            for succ in successors:
-                self._add_edge(pred, succ)
-
-        # remove layer in the data structures
-        self.layer_list.remove(layer)
-        self.keras_layer_map.pop(layer)
-
-        # re-assign input and output layers if layer happens to be an
-        # input / output layer
-        if layer in self.input_layers:
-            idx = self.input_layers.index(layer)
-            self.input_layers.pop(idx)
-            for pred in predecessors:
-                self.input_layers.insert(idx, pred)
-                idx += 1
-        if layer in self.output_layers:
-            idx = self.output_layers.index(layer)
-            self.output_layers.pop(idx)
-            for succ in successors:
-                self.output_layers.insert(idx, succ)
-                idx += 1
-
-    def _remove_old_edges(self, layer):
-        predecessors = self.get_predecessors(layer)
-        successors = self.get_successors(layer)
-        for pred in predecessors:
-            self._remove_edge(pred, layer)
-        for succ in successors:
-            self._remove_edge(layer, succ)
-
-    def _remove_layers_of_type(self, layer_type):
-        idx = self._get_first_layer_of_type(layer_type)
-        while idx >= 0:
-            layer = self.layer_list[idx]
-            self._remove_layer_and_reconnect(layer)
-            idx = self._get_first_layer_of_type(layer_type)
-
-    def remove_skip_layers(self, skip_layers):
-        for skip_layer in skip_layers:
-            self._remove_layers_of_type(skip_layer)
-
-    def remove_internal_input_layers(self):
-        idx, nb_layers = 0, len(self.layer_list)
-        while idx < nb_layers:
-            layer = self.layer_list[idx]
-            keras_layer = self.keras_layer_map[layer]
-            if (
-                isinstance(keras_layer, InputLayer)
-                and len(self.get_predecessors(layer)) > 0
-            ):
-                # these are internal input layers that needs to be taken out
-                self._remove_layer_and_reconnect(layer)
-                idx -= 1
-                nb_layers -= 1
-            idx += 1
-
-    def _insert_layer_after(self, layer_idx, new_layer, new_keras_layer):
-        """ Insert the new_layer, whose parameter is stored in a Keras layer
-        structure new_keras_layer, after the layer whose position is layer_idx.
-        """
-        layer = self.layer_list[layer_idx]
-        self.layer_list.insert(layer_idx + 1, new_layer)
-        self.keras_layer_map[new_layer] = new_keras_layer
-        successors = self.get_successors(layer)
-        # add edge layer -> new_layer
-        self._add_edge(layer, new_layer)
-        # add edges new_layer -> layer_successor, remove layer -> successor
-        for succ in successors:
-            self._add_edge(new_layer, succ)
-            self._remove_edge(layer, succ)
-        # if layer is an output layer, change the output layer tag
-        if layer in self.output_layers:
-            idx = self.output_layers.index(layer)
-            self.output_layers[idx] = new_layer
-
-    def _insert_layer_between(self, src, snk, new_layer, new_keras_layer):
-        """ Insert the new_layer, whose keras layer parameters are stored in
-        new_keras_layer, between src and snk.
-        """
-        if snk is None:
-            insert_pos = self.layer_list.index(src) + 1
-        else:
-            insert_pos = self.layer_list.index(snk)  # insert position
-        self.layer_list.insert(insert_pos, new_layer)
-        self.keras_layer_map[new_layer] = new_keras_layer
-        if src is None:  # snk is an input layer
-            self._add_edge(new_layer, snk)
-        elif snk is None:  # src is an output layer
-            self._add_edge(src, new_layer)
-        else:
-            self._add_edge(src, new_layer)
-            self._add_edge(new_layer, snk)
-            self._remove_edge(src, snk)
-
-        # if layer is an output layer, change the output layer tag
-        if src in self.output_layers:
-            idx = self.output_layers.index(src)
-            self.output_layers[idx] = new_layer
-
-    def defuse_activation(self):
-        """ Defuse the fused activation layers in the network.
-        """
-        idx, nb_layers = 0, len(self.layer_list)
-        while idx < nb_layers:
-            layer = self.layer_list[idx]
-            k_layer = self.keras_layer_map[layer]
-            if isinstance(k_layer, _keras.layers.TimeDistributed):
-                k_layer = k_layer.layer
-            if (
-                isinstance(k_layer, _keras.layers.Conv2D)
-                or isinstance(k_layer, _keras.layers.Conv1D)
-                or isinstance(k_layer, _keras.layers.SeparableConv2D)
-                or isinstance(k_layer, _keras.layers.SeparableConv1D)
-                or isinstance(k_layer, _keras.layers.Dense)
-            ):
-
-                func_name = (
-                    k_layer.activation.__name__
-                )
-
-                if func_name != "linear":
-                    # Create new layer
-                    new_layer = layer + "__activation__"
-                    new_keras_layer = _keras.layers.core.Activation(func_name)
-                    # insert new layer after it
-                    self._insert_layer_after(idx, new_layer, new_keras_layer)
-                    idx += 1
-                    nb_layers += 1
-            idx += 1
-
-    def is_activation(self, layer):
-        keras_layer = self.keras_layer_map[layer]
-        for activation_type in _KERAS_ACTIVATION_LAYERS:
-            if isinstance(keras_layer, activation_type):
-                return True
-        return False
-
-    def is_1d_layer(self, layer):
-        keras_layer = self.keras_layer_map[layer]
-        for layer_type in _KERAS_LAYERS_1D:
-            if isinstance(keras_layer, layer_type):
-                return True
-        return False
-
-    def _get_1d_interface_edges(self):
-        """ Get edges that represents transition from not-1D to 1D, and 1D to
-        not-1D. A 'in_edge e(u,v)' means u operates on non-1D blobs, but v
-        operates on 1D blobs. An 'out_edge e(u,v)' means u operates on 1D
-        blobs, but v operates on non-1D blobs.
-        """
-        in_edges = set()
-        for layer in self.layer_list:
-            if not self.is_1d_layer(layer):
-                continue
-            preds = self.get_predecessors(layer)
-            if len(preds) == 0:
-                in_edges.add((None, layer))
-            else:
-                # because 1D layers are all 1-input,
-                # there should only be 1 predecessor
-                u, v = preds[0], layer
-                while u and (
-                    self.is_activation(u) or type(u) in _KERAS_NORMALIZATION_LAYERS
-                ):
-                    preds = self.get_predecessors(u)
-                    v = u
-                    u = preds[0] if len(preds) > 0 else None
-                if u is None or (not self.is_1d_layer(u)):
-                    in_edges.add((u, v))
-
-        out_edges = set()
-        for layer in self.layer_list:
-            if not self.is_1d_layer(layer):
-                continue
-            # cases for 1d->output
-            if layer in self.output_layers:
-                out_edges.add((layer, None))
-
-            succs = self.get_successors(layer)
-            if len(succs) > 0:
-                # this should be handled in 1d->output already
-                if not self.is_activation(succs[0]):
-                    for succ in succs:
-                        if not self.is_1d_layer(succ):
-                            out_edges.add((layer, succ))
-                else:
-                    act_layer = succs[0]
-                    succs = self.get_successors(act_layer)
-                    if len(succs) == 0:
-                        out_edges.add((act_layer, None))
-                    else:
-                        for succ in succs:
-                            if not self.is_1d_layer(succ):
-                                out_edges.add((act_layer, succ))
-
-        return list(in_edges), list(out_edges)
-
-    def insert_1d_permute_layers(self):
-        """
-        Insert permutation layers before a 1D start point or after 1D end point
-        """
-        idx, nb_layers = 0, len(self.layer_list)
-        in_edges, out_edges = self._get_1d_interface_edges()
-
-        # Hacky Warning: (1) use a 4-D permute, which is not likely to happen in Keras,
-        # to represent actual permutation needed for (seq, c, h, w) in CoreML
-        # (2) Assume 2-D input shape has meaning (seq, c), and during CoreML runtime,
-        # it is represented as 4D blob, (seq, c, h, w)
-        for in_edge in in_edges:
-            src, snk = in_edge
-            if src is None:
-                permute_layer = "_permute_" + snk
-            else:
-                permute_layer = src + "_permute_" + snk
-            keras_permute = _keras.layers.Permute(
-                dims=(3, 1, 2, 0)
-            )  # assume w = 1, switch seq and w
-            self._insert_layer_between(src, snk, permute_layer, keras_permute)
-        for out_edge in out_edges:
-            src, snk = out_edge
-            if snk is None:
-                permute_layer = src + "_permute_"
-            else:
-                permute_layer = src + "_permute_" + snk
-            keras_permute = _keras.layers.Permute(
-                dims=(3, 1, 2, 0)
-            )  # assume w = 1, switch seq and w back
-            self._insert_layer_between(src, snk, permute_layer, keras_permute)
-
-    def insert_permute_for_spatial_bn(self):
-
-        # find spatial batchnorm layers
-        spatial_bn_layers = []
-        for layer in self.layer_list:
-            keras_layer = self.keras_layer_map[layer]
-            if (
-                isinstance(keras_layer, _keras.layers.BatchNormalization)
-                and len(keras_layer.input_shape) == 4
-            ):
-                if keras_layer.axis == 1 or keras_layer.axis == 2:
-                    spatial_bn_layers.append(layer)
-
-        for sbn in spatial_bn_layers:
-            axis = self.keras_layer_map[sbn].axis
-            # axis == 1: swap H axis; axis == 2 : swap W axis
-            dims = (0, 2, 1, 3) if axis == 1 else (0, 3, 2, 1)
-            # add permutation before spatial batchnorm
-            pred = self.get_predecessors(sbn)[0]
-            permute_layer = pred + "_permute_" + sbn
-            keras_permute = _keras.layers.Permute(dims=dims)
-            self._insert_layer_between(pred, sbn, permute_layer, keras_permute)
-            # add permutation after spatial batchnorm
-            succs = self.get_successors(sbn)
-            if len(succs) == 0:
-                permute_layer = sbn + "_permute_"
-                keras_permute = _keras.layers.Permute(dims=dims)
-                self._insert_layer_between(sbn, None, permute_layer, keras_permute)
-            else:
-                for succ in succs:
-                    permute_layer = sbn + "_permute_" + succ
-                    keras_permute = _keras.layers.Permute(dims=dims)
-                    self._insert_layer_between(sbn, succ, permute_layer, keras_permute)
-
-    def build(self, is_top_level=True):
-        # sanity check.
-        model = self.model
-        if not (
-            type(model) == _keras.models.Sequential
-            or type(model) == _keras.models.Model
-        ):
-            raise TypeError("Keras layer of type %s is not supported." % type(model))
-
-        # build the graph without considering embedded subgraphs
-        for i, layer in enumerate(model.layers):
-            in_nodes = (
-                layer._inbound_nodes
-                if hasattr(layer, "_inbound_nodes")
-                else layer.inbound_nodes
-            )
-            for node in in_nodes:
-                for pred in node.inbound_layers:
-                    if pred.name not in self.layer_list:
-                        self.layer_list.append(pred.name)
-                        self.keras_layer_map[pred.name] = pred
-                    self._add_edge(pred.name, layer.name)
-            self.layer_list.append(layer.name)
-            self.keras_layer_map[layer.name] = layer
-
-        # Duplicate models for weight sharing
-        idx = self._get_first_shared_layer()
-        while idx >= 0:
-            layer = self.layer_list[idx]
-            keras_layer = self.keras_layer_map[layer]
-            predecessors = self.reverse_edge_map[layer]
-            successors = self.edge_map[layer]
-            new_layers = [layer + "_" + str(i) for i in range(len(predecessors))]
-            self.layer_list[idx : idx + 1] = new_layers
-            for i, new_layer in enumerate(new_layers):
-                self.edge_map[new_layer] = []
-                self.reverse_edge_map[new_layer] = []
-                self.keras_layer_map[new_layer] = keras_layer
-                pred = predecessors[i]
-                self._add_edge(pred, new_layer)
-                for succ in successors:
-                    self._add_edge(new_layer, succ)
-            self._remove_old_edges(layer)
-            self.keras_layer_map.pop(layer)
-            idx = self._get_first_shared_layer()
-
-        # Expand the sub-models
-        idx = self._get_first_embedded_model()
-        while idx >= 0:
-            # grab the input and output edges of the embedded model
-            embedded_model = self.layer_list[idx]
-            # build the embedded model
-            embedded_keras_model = self.keras_layer_map[embedded_model]
-            embedded_graph = NetGraph(embedded_keras_model)
-            embedded_graph.build(is_top_level=False)
-            # replace the embedded model with the layers of the embedded graph
-            embedded_layer_list = embedded_graph.layer_list
-            new_layer_list = []
-            for embedded_layer_name in embedded_layer_list:
-                new_layer_name = embedded_model + "_" + embedded_layer_name
-                new_layer_list.append(new_layer_name)
-                self.keras_layer_map[new_layer_name] = embedded_graph.keras_layer_map[
-                    embedded_layer_name
-                ]
-                # add edge [embed_layer -> its succ]
-                embedded_successors = embedded_graph.get_successors(embedded_layer_name)
-                for embed_succ_name in embedded_successors:
-                    new_embed_succ_name = embedded_model + "_" + embed_succ_name
-                    self._add_edge(new_layer_name, new_embed_succ_name)
-                # add edge [pred -> embed_layer]
-                embedded_predecessors = embedded_graph.get_predecessors(
-                    embedded_layer_name
-                )
-                for embed_pred_name in embedded_predecessors:
-                    new_embed_pred_name = embedded_model + "_" + embed_pred_name
-                    self._add_edge(new_embed_pred_name, new_layer_name)
-
-            self.layer_list[idx + 1 : idx + 1] = new_layer_list
-            # replace input / output edges to the model with input/output edges of the embedded layers
-            predecessors = self.get_predecessors(embedded_model)
-            embedded_inputs = embedded_graph.get_input_layers()
-            for i, pred in enumerate(predecessors):
-                embed_input = embedded_inputs[i]
-                new_embed_input = embedded_model + "_" + embed_input
-                self._add_edge(pred, new_embed_input)
-
-            embedded_outputs = embedded_graph.get_output_layers()
-            successors = self.get_successors(embedded_model)
-            for i, succ in enumerate(successors):
-                embed_output = embedded_outputs[i]
-                new_embed_output = embedded_model + "_" + embed_output
-
-                self._add_edge(new_embed_output, succ)
-
-            # clear up the embedded model
-            self._remove_layer(embedded_model)
-            idx = self._get_first_embedded_model()
-
-        # tag input layers and and output layers
-        self.make_input_layers()
-        self.make_output_layers()
-
-        # make graph level adjustments - do this only on top level
-        if is_top_level:
-            self.remove_skip_layers(_KERAS_SKIP_LAYERS)  # done 1 pass
-            self.insert_1d_permute_layers()
-            self.insert_permute_for_spatial_bn()
-            self.defuse_activation()
-            self.remove_internal_input_layers()
-
-    def print_layer_list(self):
-        print("\n")
-        print("layer_list")
-        print(self.layer_list)
-
-    def print_edge_map(self):
-        print("\n")
-        print("edge map:")
-        for src in self.edge_map:
-            for snk in self.edge_map[src]:
-                print("  ", src, "-->", snk)
-
-    def print_reverse_edge_map(self):
-        print("\n")
-        print("reverse edge map: ")
-        for snk in self.reverse_edge_map:
-            for src in self.reverse_edge_map[snk]:
-                print("  ", snk, "<--", src)
-
-    def print_mapping(self):
-        print("\nmapping:")
-        for key in self.keras_layer_map:
-            print(
-                key,
-                "-->",
-                self.keras_layer_map[key],
-                "(",
-                self.keras_layer_map[key].name,
-                ")",
-            )
-
-    def print_all(self):
-        print("=" * 80)
-        self.print_layer_list()
-        self.print_edge_map()
-        self.print_reverse_edge_map()
-        self.print_mapping()
diff --git a/coremltools/converters/keras/_utils.py b/coremltools/converters/keras/_utils.py
deleted file mode 100644
index aa5b8cf0d..000000000
--- a/coremltools/converters/keras/_utils.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright (c) 2017, Apple Inc. All rights reserved.
-#
-# Use of this source code is governed by a BSD-3-clause license that can be
-# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-
-
-def raise_error_unsupported_categorical_option(
-    option_name, option_value, layer_type, layer_name
-):
-    """
-    Raise an error if an option is not supported.
-    """
-    raise RuntimeError(
-        "Unsupported option %s=%s in layer %s(%s)"
-        % (option_name, option_value, layer_type, layer_name)
-    )
-
-
-def raise_error_unsupported_option(option, layer_type, layer_name):
-    """
-    Raise an error if an option is not supported.
-    """
-    raise RuntimeError(
-        "Unsupported option =%s in layer %s(%s)" % (option, layer_type, layer_name)
-    )
-
-
-def raise_error_unsupported_scenario(message, layer_type, layer_name):
-    """
-    Raise an error if an scenario is not supported.
-    """
-    raise RuntimeError(
-        "Unsupported scenario '%s' in layer %s(%s)" % (message, layer_type, layer_name)
-    )
diff --git a/coremltools/converters/mil/__init__.py b/coremltools/converters/mil/__init__.py
index 4b484545f..d1d4758dc 100644
--- a/coremltools/converters/mil/__init__.py
+++ b/coremltools/converters/mil/__init__.py
@@ -60,6 +60,7 @@
 
 from .input_types import (
     ClassifierConfig,
+    ColorLayout,
     InputType,
     TensorType,
     ImageType,
diff --git a/coremltools/converters/mil/_deployment_compatibility.py b/coremltools/converters/mil/_deployment_compatibility.py
index d979462a1..4fdc2d933 100644
--- a/coremltools/converters/mil/_deployment_compatibility.py
+++ b/coremltools/converters/mil/_deployment_compatibility.py
@@ -3,20 +3,22 @@
 # Use of this source code is governed by a BSD-3-clause license that can be
 # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
-from enum import Enum
+from enum import IntEnum
 
 from coremltools import (
     _SPECIFICATION_VERSION_IOS_13,
     _SPECIFICATION_VERSION_IOS_14,
-    _SPECIFICATION_VERSION_IOS_15
+    _SPECIFICATION_VERSION_IOS_15,
+    _SPECIFICATION_VERSION_IOS_16,
 )
 
 
-class AvailableTarget(Enum):
+class AvailableTarget(IntEnum):
     # iOS versions
     iOS13 = _SPECIFICATION_VERSION_IOS_13
     iOS14 = _SPECIFICATION_VERSION_IOS_14
     iOS15 = _SPECIFICATION_VERSION_IOS_15
+    iOS16 = _SPECIFICATION_VERSION_IOS_16
 
     # macOS versions (aliases of iOS versions)
     macOS15 = _SPECIFICATION_VERSION_IOS_13
@@ -25,16 +27,19 @@ class AvailableTarget(Enum):
     macOS10_16 = _SPECIFICATION_VERSION_IOS_14
     macOS11 = _SPECIFICATION_VERSION_IOS_14
     macOS12 = _SPECIFICATION_VERSION_IOS_15
+    macOS13 = _SPECIFICATION_VERSION_IOS_16
 
     # watchOS versions (aliases of iOS versions)
     watchOS6 = _SPECIFICATION_VERSION_IOS_13
     watchOS7 = _SPECIFICATION_VERSION_IOS_14
     watchOS8 = _SPECIFICATION_VERSION_IOS_15
+    watchOS9 = _SPECIFICATION_VERSION_IOS_16
 
     # tvOS versions (aliases of iOS versions)
     tvOS13 = _SPECIFICATION_VERSION_IOS_13
     tvOS14 = _SPECIFICATION_VERSION_IOS_14
     tvOS15 = _SPECIFICATION_VERSION_IOS_15
+    tvOS16 = _SPECIFICATION_VERSION_IOS_16
 
 
 _get_features_associated_with = {}
diff --git a/coremltools/converters/mil/backend/backend_helper.py b/coremltools/converters/mil/backend/backend_helper.py
index db0aed949..a138068a8 100644
--- a/coremltools/converters/mil/backend/backend_helper.py
+++ b/coremltools/converters/mil/backend/backend_helper.py
@@ -3,7 +3,9 @@
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+from coremltools.converters.mil.input_types import ColorLayout
 from coremltools.converters.mil.mil.passes.name_sanitization_utils import NameSanitizer
+from coremltools.proto import FeatureTypes_pb2 as ft
 
 def _get_probability_var_for_classifier(prog, classifier_config):
     '''
@@ -39,4 +41,33 @@ def _get_probability_var_for_classifier(prog, classifier_config):
         if probability_var is None:
             msg = "'predicted_probabilities_output', '{}', provided in 'ClassifierConfig', does not exist in the MIL program."
             raise ValueError(msg.format(predicted_probabilities_output))
-    return probability_var
\ No newline at end of file
+    return probability_var
+
+
+def _get_colorspace_enum(color_layout):
+    if color_layout == ColorLayout.GRAYSCALE:
+        return ft.ImageFeatureType.ColorSpace.GRAYSCALE
+    elif color_layout == ColorLayout.GRAYSCALE_FLOAT16:
+        return ft.ImageFeatureType.ColorSpace.GRAYSCALE_FLOAT16
+    elif color_layout == ColorLayout.BGR:
+        return ft.ImageFeatureType.ColorSpace.BGR
+    else:
+        return ft.ImageFeatureType.ColorSpace.RGB
+
+def _validate_image_input_output_shapes(color_layout, shape, name, is_input=True):
+    io_str = "input" if is_input else "output"
+    if len(shape) != 4:
+        raise ValueError("Image {}, '{}', must have rank 4. Instead it has rank {}".
+                         format(io_str, name, len(shape)))
+    if color_layout in (ColorLayout.BGR, ColorLayout.RGB):
+        if shape[1] != 3 or shape[0] != 1:
+            raise ValueError("Shape of the RGB/BGR image {}, '{}', must be of kind (1, 3, H, W), "
+                             "i.e., first two dimensions must be (1, 3), instead they are: {}".
+                             format(io_str, name, shape[:2]))
+    elif color_layout in (ColorLayout.GRAYSCALE, ColorLayout.GRAYSCALE_FLOAT16):
+        if shape[1] != 1 or shape[0] != 1:
+            raise ValueError("Shape of the Grayscale image {}, '{}', must be of kind (1, 1, H, W), "
+                             "i.e., first two dimensions must be (1, 1), instead they are: {}".
+                             format(io_str, name, shape[:2]))
+    else:
+        raise KeyError("Unrecognized color_layout {}".format(color_layout))
\ No newline at end of file
diff --git a/coremltools/converters/mil/backend/mil/helper.py b/coremltools/converters/mil/backend/mil/helper.py
index 138ffa992..fa31469d3 100644
--- a/coremltools/converters/mil/backend/mil/helper.py
+++ b/coremltools/converters/mil/backend/mil/helper.py
@@ -17,7 +17,7 @@
     numpy_type_to_builtin_type,
     builtin_to_string
 )
-from coremltools.converters.mil.backend.nn.op_mapping import to_py_type
+from coremltools.converters.mil.mil.types.type_mapping import np_val_to_py_type
 
 
 def create_valuetype_scalar(data_type):
@@ -98,6 +98,8 @@ def _tensor_field_by_type(tensor_val, builtin_type):
     elif types.is_int(builtin_type):
         if (builtin_type == types.int64 or builtin_type == types.uint64):
             return tensor_val.longInts.values
+        if builtin_type in (types.int8, types.uint8, types.uint32):
+            return tensor_val.bytes.values
         return tensor_val.ints.values
     elif types.is_float(builtin_type):
         if (builtin_type == types.fp64):
@@ -120,6 +122,8 @@ def _set_empty_tensor_field_by_type(tensor_val, builtin_type):
     elif types.is_int(builtin_type):
         if (builtin_type == types.int64 or builtin_type == types.uint64):
             tensor_val.longInts.SetInParent()
+        elif builtin_type in (types.int8, types.uint8, types.uint32):
+            tensor_val.bytes.SetInParent()
         else:
             tensor_val.ints.SetInParent()
     elif types.is_float(builtin_type):
@@ -153,14 +157,11 @@ def create_tensor_value(np_tensor):
         if builtin_type == types.str:
             for x in np.nditer(np_tensor):
                 t_field.append(x.encode("utf-8"))
-        elif builtin_type == types.fp16:
-            bytevals = bytes()
-            for x in np_tensor.flatten():
-                bytevals += to_py_type(x)
-            val.immediateValue.tensor.bytes.values = bytevals
+        elif builtin_type in (types.fp16, types.int8, types.uint8, types.uint32):
+            val.immediateValue.tensor.bytes.values = np_val_to_py_type(np_tensor)
         else:
             for x in np_tensor.flatten():
-                t_field.append(to_py_type(x))
+                t_field.append(np_val_to_py_type(x))
     else:  # This is an "empty" tensor (tensor with a dimension being size 0)
         _set_empty_tensor_field_by_type(t_val, builtin_type)
     return val
@@ -178,12 +179,12 @@ def create_scalar_value(py_scalar):
 
     # Set the tensor value
     t_field = _tensor_field_by_type(t_val, builtin_type)
-    if builtin_type == types.fp16:
-        val.immediateValue.tensor.bytes.values = to_py_type(py_scalar)
+    if builtin_type in (types.fp16, types.int8, types.uint8, types.uint32):
+        val.immediateValue.tensor.bytes.values = np_val_to_py_type(py_scalar)
     else:
         if builtin_type == types.str:
             py_scalar = py_scalar.encode("utf-8")
-        t_field.append(to_py_type(py_scalar))
+        t_field.append(np_val_to_py_type(py_scalar))
 
     return val
 
@@ -284,6 +285,10 @@ def create_file_value(output_var, blob_writer):
     elif output_var.val.dtype.kind == 'f' and output_var.val.dtype.itemsize == 2:
         output_var_fp16_to_bytes_to_uint16 = np.frombuffer(output_var.val.flatten().tobytes(), np.uint16)
         offset = blob_writer.write_fp16_data(output_var_fp16_to_bytes_to_uint16)
+    elif output_var.val.dtype.kind == "u" and output_var.val.dtype.itemsize == 1:
+        offset = blob_writer.write_uint8_data(output_var.val.flatten())
+    elif output_var.val.dtype.kind == "i" and output_var.val.dtype.itemsize == 1:
+        offset = blob_writer.write_int8_data(output_var.val.flatten())
     else:
         raise TypeError("Unsupported type, {}, for net buffer serialization.".format(output_var.val.dtype))
 
@@ -312,6 +317,8 @@ def cast_to_framework_io_dtype(var, is_output):
         return ft.ArrayFeatureType.ArrayDataType.FLOAT32
     elif var.dtype == types.int32:
         return ft.ArrayFeatureType.ArrayDataType.INT32
+    elif var.dtype == types.fp16:
+        return ft.ArrayFeatureType.ArrayDataType.FLOAT16
     else:
         ioname = "Output " if is_output else "Input "
         ioname2 = "outputs" if is_output else "inputs"
diff --git a/coremltools/converters/mil/backend/mil/load.py b/coremltools/converters/mil/backend/mil/load.py
index 1c16855f9..faadf2168 100644
--- a/coremltools/converters/mil/backend/mil/load.py
+++ b/coremltools/converters/mil/backend/mil/load.py
@@ -8,7 +8,9 @@
 import os
 
 from .passes import mil_passes
+from ..backend_helper import _get_colorspace_enum, _validate_image_input_output_shapes
 from coremltools import _SPECIFICATION_VERSION_IOS_15
+from coremltools import _OPSET
 from coremltools.converters.mil.backend.mil.helper import (
     cast_to_framework_io_dtype,
     create_file_value,
@@ -25,7 +27,7 @@
     types
 )
 from coremltools.converters.mil.backend.nn.load import _set_optional_inputs
-from coremltools.converters.mil.input_types import ImageType, TensorType, EnumeratedShapes, RangeDim
+from coremltools.converters.mil.input_types import ColorLayout, ImageType, TensorType, EnumeratedShapes, RangeDim
 from coremltools.converters.mil.mil.ops.registry import SSAOpRegistry
 from coremltools.converters.mil.mil.types.symbolic import (
     any_symbolic,
@@ -54,9 +56,10 @@ def should_use_weight_file(val):
         val is not None
         and isinstance(val, (np.ndarray, np.generic))
         and val.size >= 10
-        and val.dtype in ['float16', 'float32']
+        and val.dtype in ['float16', 'float32', 'uint8', 'int8']
     )
 
+
 def translate_const(op, blob_writer):
     output_var = op.outputs[0]
 
@@ -76,6 +79,32 @@ def translate_const(op, blob_writer):
     )
 
 
+def translate_constexpr(op, blob_writer):
+
+    def get_value(var):
+        if should_use_weight_file(var.val):
+            value = create_file_value(var, blob_writer)
+        else:
+            value = create_immediate_value(var)
+
+        return value
+
+    output_var = op.outputs[0]
+
+    attributes = {"name": create_scalar_value(op.name)}
+    attributes.update({k: get_value(v) for k, v in op.inputs.items()})
+
+    return pm.Operation(
+        type=op.op_type,
+        attributes=attributes,
+        outputs=[
+            pm.NamedValueType(
+                name=output_var.name, type=types_to_proto(output_var.sym_type)
+            )
+        ],
+    )
+
+
 def translate_generic_op(op, parameters, blob_writer, literal_params=[]):
     inputs = {}
     for param_name, vars in op.inputs.items():
@@ -131,8 +160,13 @@ def translate_generic_op(op, parameters, blob_writer, literal_params=[]):
         outputs=outputs,
     )
 
-
 def create_block(block, parameters, blob_writer):
+
+    def feeds_to_only_constexprs(op):
+        return (op.op_type == 'const') \
+               and len(op.outputs[0].child_ops) > 0 \
+               and all((child_op.op_type.startswith("constexpr_")) for child_op in op.outputs[0].child_ops)
+
     proto_ops = []
 
     # Find the const op that generates classify's "label" / "class" string vec.
@@ -150,10 +184,14 @@ def create_block(block, parameters, blob_writer):
     for op in block.operations:
         op_cls_name = type(op).__name__
         if op_cls_name == "const":
+            if feeds_to_only_constexprs(op):
+                continue
             # Do not serialize the const op that creates the var bound to the classifier's "classes" param.
             # The variable's value will be bound directly to classify's "classes" param instead.
             if op != classify_const_classes_op:
                 proto_ops.append(translate_const(op, blob_writer))
+        elif op_cls_name.startswith("constexpr_"):
+            proto_ops.append(translate_constexpr(op, blob_writer))
         elif op_cls_name == "classify":
             # Classify's "classes" param should be serialized as a value literal bound
             # directly to the param, rather than as a const-generated variable.
@@ -172,7 +210,7 @@ def create_block(block, parameters, blob_writer):
     return pm.Block(inputs=inputs, outputs=output_names, operations=proto_ops)
 
 
-def convert_function(function, parameters, blob_writer):
+def convert_function(function, parameters, blob_writer, opset):
     block = create_block(function, parameters, blob_writer)
 
     inputs = []
@@ -180,8 +218,7 @@ def convert_function(function, parameters, blob_writer):
         proto_type = types_to_proto(var.sym_type)
         inputs.append(pm.NamedValueType(name=name, type=proto_type))
 
-    return pm.Function(inputs=inputs, opset="CoreML5", block_specializations={"CoreML5": block})
-
+    return pm.Function(inputs=inputs, opset=opset, block_specializations={opset: block})
 
 # Add a classify op to the output.
 # Replaces the original probabilites output (in the containing MIL block)
@@ -191,6 +228,12 @@ def _add_classify_op(prog, classifier_config):
     '''
     Add a "classify" op to the program, at the end of the main block
     '''
+    def remove_output(block, prob_var):
+        for i in range(len(block.outputs)):
+            if block.outputs[i] is prob_var:
+                block.outputs.pop(i)
+                break
+
     block = prog.functions["main"]
 
     message = "Class labels must be a list of integers / strings or a file path"
@@ -217,7 +260,12 @@ def _add_classify_op(prog, classifier_config):
         if isinstance(classes[0], int):
             classes = [np.int64(x) for x in classes]
         classes_var = mb.const(val=mil_list(classes))
-        out = mb.classify(probabilities=probability_var, classes=classes_var)
+        if probability_var.dtype != types.fp32:
+            remove_output(block, probability_var)
+            probability_var = mb.cast(x=probability_var, dtype="fp32", name=probability_var.name + "_cast_to_fp32")
+        out = mb.classify(probabilities=probability_var,
+                          classes=classes_var
+                          )
 
         predicted_feature_name = "classLabel" if classifier_config.predicted_feature_name is None \
                                               else classifier_config.predicted_feature_name
@@ -225,18 +273,15 @@ def _add_classify_op(prog, classifier_config):
         out[1].name = predicted_feature_name + "_probs"
 
         # Remove probabilities from block outputs, replace with classify's outputs
-        for i in range(0, len(block.outputs)):
-            if block.outputs[i] is probability_var:
-                block.outputs.pop(i)
-                break
+        remove_output(block, probability_var)
         block.outputs[:0] = out
         return out[0].name, out[1].name
 
-def load(prog, weights_dir, resume_on_errors=False, **kwargs):
+def load(prog, weights_dir, resume_on_errors=False, specification_version=_SPECIFICATION_VERSION_IOS_15, **kwargs):
     if "main" not in prog.functions:
         raise ValueError("main function not found in program")
 
-    mil_passes.mil_backend_passes(prog)
+    mil_passes.mil_backend_passes(prog, specification_version)
 
     # if user has specified "ClassifierConfig", then add the "classify" op to the prog
     classifier_config = kwargs.get("classifier_config", None)
@@ -246,18 +291,32 @@ def load(prog, weights_dir, resume_on_errors=False, **kwargs):
         predicted_feature_name, predicted_probabilities_name = _add_classify_op(prog, classifier_config)
 
     input_types = prog.main_input_types
+    output_types = prog.main_output_types
     weight_path = os.path.join(weights_dir, _WEIGHTS_FILE_NAME)
     blob_writer = BlobWriter(weight_path)
 
+    opset = _OPSET[specification_version]
+
     function_protos = {}
     for func_name, func in prog.functions.items():
-        function_protos[func_name] = convert_function(func, prog.parameters, blob_writer)
+        function_protos[func_name] = convert_function(func, prog.parameters, blob_writer, opset)
 
     proto = pm.Program(
         version=1,
         functions=function_protos,
     )
 
+    desc = kwargs.get("model_description", None)
+    if desc and not isinstance(desc, ml.ModelDescription):
+        raise ValueError("Invalid model descriptor")
+
+    if desc:
+        if classifier_config is not None:
+            raise AssertionError("Both model_description and classifier_config can't be provided")
+        model = ml.Model(description=desc, specificationVersion=specification_version)
+        model.mlProgram.CopyFrom(proto)
+        return model
+
     input_features = []
     output_features = []
     symbolic_inputs = []
@@ -303,22 +362,13 @@ def load(prog, weights_dir, resume_on_errors=False, **kwargs):
                 array_type = ft.ArrayFeatureType(shape=shape, dataType=cast_to_framework_io_dtype(var, False))
                 input_feature_type.multiArrayType.CopyFrom(array_type)
             else:
-                if len(shape) < 3:
-                    raise ValueError("Image input, '{}', must have rank at least 3. Instead it has rank {}".
-                                     format(name, len(shape)))
                 # make a feature type of Type "imageType"
                 input_type = image_input_names[name]
+                _validate_image_input_output_shapes(input_type.color_layout, shape, name, is_input=True)
                 if not input_type.channel_first:
                     raise ValueError("Image input, '{}', must be in the channel_first format".
                                      format(name))
-
-                if input_type.color_layout == "G":
-                    clr_space = ft.ImageFeatureType.ColorSpace.GRAYSCALE
-                elif input_type.color_layout == "BGR":
-                    clr_space = ft.ImageFeatureType.ColorSpace.BGR
-                else:
-                    clr_space = ft.ImageFeatureType.ColorSpace.RGB
-
+                clr_space = _get_colorspace_enum(input_type.color_layout)
                 image_type = ft.ImageFeatureType(width=shape[-1],
                                                  height=shape[-2],
                                                  colorSpace=clr_space)
@@ -334,20 +384,44 @@ def load(prog, weights_dir, resume_on_errors=False, **kwargs):
         else:
             raise NotImplementedError()
 
-    for var in prog.functions["main"].outputs:
+    if output_types is not None and classifier_config is None:
+        assert len(output_types) == len(prog.functions["main"].outputs), \
+                "number of mil program outputs do not match the number of outputs provided by the user"
+
+    for i, var in enumerate(prog.functions["main"].outputs):
         output_feature_type = ft.FeatureType()
         if types.is_tensor(var.sym_type) or types.is_primitive(var.sym_type):
-            dataType = None
-            if classifier_config is None or var.name != predicted_feature_name:
-                # Not a classifier output, make sure model output type matches with ML Program type.
-                dataType = cast_to_framework_io_dtype(var, True)
+            if output_types is not None and isinstance(output_types[i], ImageType):
+                if not types.is_tensor(var.sym_type):
+                    raise ValueError("Image output, '{}', is a scalar, but it should be a tensor of rank 4".format(
+                                      var.name))
+                shape = var.sym_type.get_shape()
+                if any_variadic(shape):
+                    raise ValueError("Variable rank model outputs, that are ImageTypes, are not supported")
+                if any([is_symbolic(d) for d in shape]):
+                    raise NotImplementedError("Image output '{}' has symbolic dimensions in its shape".
+                                              format(var.name))
+                _validate_image_input_output_shapes(output_types[i].color_layout, shape, var.name, is_input=False)
+                clr_space = _get_colorspace_enum(output_types[i].color_layout)
+                image_type = ft.ImageFeatureType(width=shape[-1],
+                                                 height=shape[-2],
+                                                 colorSpace=clr_space)
+                output_feature_type.imageType.CopyFrom(image_type)
+                output_features.append(
+                    ml.FeatureDescription(name=var.name, type=output_feature_type)
+                )
             else:
-                # Classifier outputs are set up separately, so default to fp32 for now.
-                dataType = ft.ArrayFeatureType.ArrayDataType.FLOAT32
+                dataType = None
+                if classifier_config is None or var.name != predicted_feature_name:
+                    # Not a classifier output, make sure model output type matches with ML Program type.
+                    dataType = cast_to_framework_io_dtype(var, True)
+                else:
+                    # Classifier outputs are set up separately, so default to fp32 for now.
+                    dataType = ft.ArrayFeatureType.ArrayDataType.FLOAT32
 
-            array_type = ft.ArrayFeatureType(shape=None, dataType=dataType)
-            output_feature_type.multiArrayType.CopyFrom(array_type)
-            output_features.append(ml.FeatureDescription(name=var.name, type=output_feature_type))
+                array_type = ft.ArrayFeatureType(shape=None, dataType=dataType)
+                output_feature_type.multiArrayType.CopyFrom(array_type)
+                output_features.append(ml.FeatureDescription(name=var.name, type=output_feature_type))
         elif (types.is_dict(var.sym_type)):
             output_feature_type.dictionaryType.MergeFromString(b"")
             keytype, valtype = var.sym_type.T
@@ -378,7 +452,7 @@ def load(prog, weights_dir, resume_on_errors=False, **kwargs):
                 break
 
     # Create ML Model
-    model = ml.Model(description=desc, specificationVersion=_SPECIFICATION_VERSION_IOS_15)
+    model = ml.Model(description=desc, specificationVersion=specification_version)
     model.mlProgram.CopyFrom(proto)
 
     # Set symbolic shapes
diff --git a/coremltools/converters/mil/backend/mil/passes/adjust_io_to_supported_types.py b/coremltools/converters/mil/backend/mil/passes/adjust_io_to_supported_types.py
index 84c179824..3a584c2f0 100644
--- a/coremltools/converters/mil/backend/mil/passes/adjust_io_to_supported_types.py
+++ b/coremltools/converters/mil/backend/mil/passes/adjust_io_to_supported_types.py
@@ -3,17 +3,18 @@
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
-import warnings as _warnings
+import logging
 
 from coremltools.converters.mil.mil import Builder as mb, types as types
 from coremltools.converters.mil.mil.passes.pass_registry import register_pass
 from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
+from coremltools.converters.mil._deployment_compatibility import AvailableTarget as target
 
 
 @register_pass(namespace="mil_backend")
 class adjust_io_to_supported_types(AbstractGraphPass):
     """
-    Converts all dTypes to types that are supported by the CoreML runtime.
+    Converts all dtypes to types that are supported by the CoreML runtime.
     The runtime supports only fp16, fp32, int32, str, and bool variables.
 
     General rules:
@@ -23,9 +24,10 @@ class adjust_io_to_supported_types(AbstractGraphPass):
           types are numerical and can be reasonably replaced with 32 bit float types.
 
     The "main" function has additional rules since its I/O is mapped to CoreML model I/O:
-        * Fp16 I/O is replaced with fp32 I/O.
-          Casts (fp32 input -> fp16) are inserted at the beginning of the program to preserve 16 bit inputs.
-          Casts (fp16 -> fp32 output) are inserted at the end of the program to preserve 16 bit computations.
+        * if minimum_deployment_target <  coremltools.target.iOS16, then:
+            * Fp16 I/O is replaced with fp32 I/O.
+                Casts (fp32 input -> fp16) are inserted at the beginning of the program to preserve 16 bit inputs.
+                Casts (fp16 -> fp32 output) are inserted at the end of the program to preserve 16 bit computations.
 
         * All non-integer I/O that is not fp32 is replaced with fp32 I/O.
           A cast (prev input type -> fp32) is inserted at the beginning of the program to preserve non-fp32 inputs.
@@ -61,7 +63,7 @@ class adjust_io_to_supported_types(AbstractGraphPass):
     def apply(self, prog):
         for name, func in prog.functions.items():
             is_main_funtion = name == "main"
-            _adjust_io_to_supported_types(func, is_main_funtion)
+            _adjust_io_to_supported_types(func, is_main_funtion, self.minimun_deployment_target)
 
 __RUNTIME_SUPPORTED_TYPES = [types.fp16, types.fp32, types.int32, types.str, types.bool]
 
@@ -74,7 +76,7 @@ def _adjust_var_dtype_helper(var, dtype):
     else:
         var._sym_type = types.tensor(dtype, var.sym_type.get_shape())
 
-def _adjust_main_inputs(func):
+def _adjust_main_inputs(func, min_deployment_target):
     first_op = func.operations[0] if len(func.operations) > 0 else None
     for input_name, input_var in func.inputs.items():
        if (types.is_tensor(input_var.sym_type) or types.is_scalar(input_var.sym_type)) \
@@ -83,44 +85,69 @@ def _adjust_main_inputs(func):
             input_dtype_str = types.builtin_to_string(input_var.dtype)
             if types.is_int(input_var.dtype):
                 # Replace non-int32 input type with int32.
-                _warnings.warn("Input" + input_var.name + " is of dType " + input_dtype_str +\
+                logging.warning("Input '" + input_var.name + "' is of dtype " + input_dtype_str +\
                                ". Only integer variables of bit width 32 are supported by the CoreML runtime. " +\
-                               "This input will be assigned a dType of int32. " +\
+                               "This input will be assigned a dtype of int32. " +\
                                "No cast will be inserted; the previous dtype will be replaced.")
                 _adjust_var_dtype_helper(input_var, types.int32)
             elif input_var.dtype == types.fp64:
                 # Replace float64 input type with fp32.
-                _warnings.warn("Input" + input_var.name + " is of dtype fp64. 64 bit float inputs are " +\
-                               "not supported by ML program models. This input will be assigned a dType " +\
+                logging.warning("Input '" + input_var.name + "' is of dtype fp64. 64 bit float inputs are " +\
+                               "not supported by ML program models. This input will be assigned a dtype " +\
                                "of fp32. No cast will be inserted; the previous dtype will be replaced.")
                 _adjust_var_dtype_helper(input_var, types.fp32)
+            elif input_var.dtype == types.fp16 \
+                 and min_deployment_target.value>= target.iOS16.value:
+                pass # do nothing, since fp16 is a valid input type for CoreML
             else:
-                # This is some other dType. Change the type to fp32 and add a cast.
+                # This is some other dtype. Change the type to fp32 and add a cast.
                 # This is only a limitation of main--other functions do not represent CoreML model inputs
                 # and do not have the same limitation on input types.
-                _warnings.warn("Input" + input_var.name + " is of dType " + input_dtype_str + ". The " +\
-                               "CoreML runtime does not support inputs with this dType (only fp32 and " +\
-                               "int32 inputs are supported). This input will be assigned a dType of " +\
+                supported_dtypes = "{int32, fp32, fp64}" if min_deployment_target < target.iOS16 else \
+                                    "{int32, fp16, fp32, fp64}"
+                msg = "\nInput '{}' is of dtype {}. The " +\
+                               "CoreML runtime does not support inputs with this dtype " +\
+                               "(supported dtypes are: {}). This input will be assigned a dtype of " +\
                                "fp32. A cast will be inserted at the beginning of the program to " +\
-                               "convert the input to the originally defined dType.")
+                               "convert the input to the originally defined dtype.\n"
+                if input_var.dtype == types.fp16:
+                    msg += "fp16 dtype input is supported if the minimum_deployment_target is chosen to be at least " \
+                           "iOS16/macOS13.\n"
+                logging.warning(msg.format(
+                    input_var.name,
+                    input_dtype_str,
+                    supported_dtypes))
+
                 with func:
                     casted_input_var = mb.cast(x=input_var, dtype=input_dtype_str, before_op=first_op)
                     func.replace_uses_of_var_after_op(anchor_op=casted_input_var.op, old_var=input_var, new_var=casted_input_var)
                     _adjust_var_dtype_helper(input_var, types.fp32)
 
-def _adjust_main_outputs(func):
+def _adjust_main_outputs(func, min_deployment_target):
     new_outputs = []
     for output_var in func.outputs:
         output_type = output_var.sym_type
         if (types.is_tensor(output_type) or types.is_scalar(output_type)) \
             and output_var.dtype != types.fp32 \
-            and output_var.dtype != types.int32:
+            and output_var.dtype != types.int32 \
+            and (min_deployment_target < target.iOS16 or output_var.dtype != types.fp16):
+            # since fp16 is a valid output type for coreml from ios16 spec onwards, no need to cast
             output_dtype_str = types.builtin_to_string(output_var.dtype)
-            _warnings.warn("Output" + output_var.name + " is of dType " + output_dtype_str + ". The " +\
-                           "CoreML runtime does not support outputs with this dType (only int32 and " +\
-                           "fp32 are supported for outputs). This output will be assigned a dType " +\
+            supported_dtypes = "{int32, fp32}" if min_deployment_target < target.iOS16 else \
+                                "{int32, fp16, fp32}"
+            msg = "\nOutput '{}' is of dtype {}. The " +\
+                           "CoreML runtime does not support outputs with this dtype " +\
+                           "(supported dtypes are: {}). This output will be assigned a dtype " +\
                            "of fp32. A cast will be inserted at the end of the program to convert" +\
-                           "the original output dType to the dType supported by the CoreML runtime.")
+                           " the original output dtype to the dtype supported by the CoreML runtime.\n"
+            if output_var.dtype == types.fp16:
+                msg += "fp16 dtype output is supported if the minimum_deployment_target is chosen to be at least " \
+                       "iOS16/macOS13.\n"
+            logging.warning(msg.format(
+                               output_var.name,
+                               output_dtype_str,
+                               supported_dtypes,
+                           ))
 
             output_var_name = output_var.name
             output_var.set_name(output_var_name + "__pre__output__fp32__cast")
@@ -146,16 +173,16 @@ def _adjust_var(var):
         dtype_str = types.builtin_to_string(var.dtype)
         if types.is_int(var.dtype):
             # Replace non-int32 input type with int32.
-            _warnings.warn("Input" + var.name + " is of dType " + dtype_str +\
+            logging.warning("Input '" + var.name + "' is of dtype " + dtype_str +\
                            ". Only integer variables of bit width 32 are supported by the CoreML runtime. " +\
-                           "This input will be assigned a dType of int32. " +\
+                           "This input will be assigned a dtype of int32. " +\
                            "No cast will be inserted; the previous dtype will be replaced.")
             _adjust_var_dtype_helper(var, types.int32)
         else:
-            # This is some other unsupported dType. Change the input type to fp32.
-            _warnings.warn("Var " + var.name + " is of dType " + dtype_str + ". The CoreML runtime " +\
-                           "does not support this dType (only fp16, fp32, bool, and int32 are supported). " +\
-                           "This input will be assigned a dType of fp32. No cast will be inserted; " +\
+            # This is some other unsupported dtype. Change the input type to fp32.
+            logging.warning("Var " + var.name + " is of dtype " + dtype_str + ". The CoreML runtime " +\
+                           "does not support this dtype (only fp16, fp32, bool, and int32 are supported). " +\
+                           "This input will be assigned a dtype of fp32. No cast will be inserted; " +\
                            "the previous dtype will be replaced.")
             _adjust_var_dtype_helper(var, types.fp32)
 
@@ -227,11 +254,11 @@ def _adjust_ops(block):
 #####
 # The Pass
 #####
-def _adjust_io_to_supported_types(func, is_main):
+def _adjust_io_to_supported_types(func, is_main, min_deployment_target):
     if is_main:
-        _adjust_main_inputs(func)
+        _adjust_main_inputs(func, min_deployment_target)
         _adjust_ops(func)
-        _adjust_main_outputs(func)
+        _adjust_main_outputs(func, min_deployment_target)
     else:
         _adjust_func_inputs(func)
         _adjust_ops(func)
diff --git a/coremltools/converters/mil/backend/mil/passes/homogenize_input_dtypes.py b/coremltools/converters/mil/backend/mil/passes/homogenize_input_dtypes.py
index 8617d12e6..f015d9d00 100644
--- a/coremltools/converters/mil/backend/mil/passes/homogenize_input_dtypes.py
+++ b/coremltools/converters/mil/backend/mil/passes/homogenize_input_dtypes.py
@@ -1,15 +1,15 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2021, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+from coremltools.converters.mil.mil import Builder as mb
 from coremltools.converters.mil.mil.ops.defs import elementwise_binary, matmul
-from coremltools.converters.mil.mil.passes.pass_registry import register_pass
+from coremltools.converters.mil.mil.ops.defs.elementwise_unary import cast as cast_op_class
 from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
+from coremltools.converters.mil.mil.passes.pass_registry import register_pass
 from coremltools.converters.mil.mil.types import promote_dtypes, builtin_to_string
-from coremltools.converters.mil.mil import Builder as mb
+
 
 _SUPPORTED_OPS = {
     # Mapping from op_class --> list of those params which needs to be of the same dtype
@@ -28,9 +28,13 @@ def _is_same_dtype(dtype1, dtype2):
     return (dtype1 is dtype2) or (builtin_to_string(dtype1) == builtin_to_string(dtype2))
 
 def _promoted_var(op, var, promoted_dtype):
-    x = mb.cast(
-        x=var, dtype=builtin_to_string(promoted_dtype), name=var.name + "_promoted", before_op=op
-    )
+    if var.val is None:  # no value available on the var: insert a cast op in front of `op`
+        x = mb.cast(
+            x=var, dtype=builtin_to_string(promoted_dtype), name=var.name + "_promoted", before_op=op
+        )
+    else:  # value is available: compute the casted value now and emit a const instead of a cast op
+        const_value_after_cast = cast_op_class.get_cast_value(var, builtin_to_string(promoted_dtype))
+        x = mb.const(val=const_value_after_cast, name=var.name + "_promoted", before_op=op)
     return x
 
 def _homogenize_input_dtypes_block(block):
diff --git a/coremltools/converters/mil/backend/mil/passes/insert_image_preprocessing_op.py b/coremltools/converters/mil/backend/mil/passes/insert_image_preprocessing_op.py
index 772ad02c2..53e1fd956 100644
--- a/coremltools/converters/mil/backend/mil/passes/insert_image_preprocessing_op.py
+++ b/coremltools/converters/mil/backend/mil/passes/insert_image_preprocessing_op.py
@@ -5,7 +5,7 @@
 
 from coremltools.converters.mil.mil.passes.pass_registry import register_pass
 from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
-from coremltools.converters.mil.input_types import ImageType
+from coremltools.converters.mil.input_types import ColorLayout, ImageType
 # import mil internal ops to add it to the builder
 from coremltools.converters.mil.mil.ops import defs as _ops
 from coremltools.converters.mil.mil import Builder as mb
@@ -45,7 +45,7 @@ def _insert_image_preprocessing_ops(block, prog):
                                          y=np.array(input_type.scale, dtype=input_nptype),
                                          before_op=first_op, name=input_var.name + "__scaled__")
                 if has_bias:
-                    if input_type.color_layout == "G":
+                    if input_type.color_layout in (ColorLayout.GRAYSCALE, ColorLayout.GRAYSCALE_FLOAT16):
                         last_output = mb.add(x=last_output,
                                              y=np.array(input_type.bias, dtype=input_nptype),
                                              before_op=first_op, name=input_var.name + "__biased__")
diff --git a/coremltools/converters/mil/backend/mil/passes/mil_passes.py b/coremltools/converters/mil/backend/mil/passes/mil_passes.py
index c5c6ced8a..a7ef65a9c 100644
--- a/coremltools/converters/mil/backend/mil/passes/mil_passes.py
+++ b/coremltools/converters/mil/backend/mil/passes/mil_passes.py
@@ -7,9 +7,11 @@
 
 from coremltools.converters.mil.backend.nn.passes.nn_passes import nn_backend_passes
 from coremltools.converters.mil.mil.passes.pass_registry import PASS_REGISTRY
+from coremltools.converters.mil._deployment_compatibility import AvailableTarget
 
 
-def mil_backend_passes(prog):
+def mil_backend_passes(prog, minimum_spec_version):
+    min_deployment_target = AvailableTarget(minimum_spec_version)
     passes = [
         "common::const_elimination",
         "mil_backend::adjust_io_to_supported_types",
@@ -27,7 +29,6 @@ def mil_backend_passes(prog):
         "mil_backend::sanitize_name_strings",
         "common::dedup_op_and_var_names",
         "nn_backend::handle_unused_inputs",  # must come after dce.
-        "nn_backend::alert_return_type_cast",  # must be at the end.
     ]
 
     _logging.debug("Program before common passes:\n{}".format(prog))
@@ -35,6 +36,7 @@ def mil_backend_passes(prog):
     prog.validate()
     for p in passes:
         _logging.info('Performing passes for mil backend: "{}"'.format(p))
+        PASS_REGISTRY[p].minimun_deployment_target = min_deployment_target
         PASS_REGISTRY[p](prog)
         # No more validation from this point on as prog is not SSA anymore.
 
diff --git a/coremltools/converters/mil/backend/mil/passes/test_passes.py b/coremltools/converters/mil/backend/mil/passes/test_passes.py
index d2886cb2f..f6f34cebc 100644
--- a/coremltools/converters/mil/backend/mil/passes/test_passes.py
+++ b/coremltools/converters/mil/backend/mil/passes/test_passes.py
@@ -14,6 +14,7 @@
 from coremltools.converters.mil.mil.passes.pass_registry import PASS_REGISTRY
 from coremltools.converters.mil import types
 from coremltools.converters.mil.mil.types import string_to_builtin, builtin_to_string, promote_types
+from coremltools.converters.mil._deployment_compatibility import AvailableTarget as target
 
 # Set the testing backend
 import coremltools.converters.mil.testing_reqs as testing_reqs
@@ -148,6 +149,60 @@ def prog(x):
         assert prev_inputs[0][1].name == inputs[0][1].name
         assert inputs[0][1].dtype == types.fp32
 
+
+    @pytest.mark.parametrize(
+        "use_ios16_deployment_target",
+        [False, True],
+    )
+    def test_float16_input_output(self, use_ios16_deployment_target):
+        """
+        Input graph:
+
+        main(%x: (1, 1, 1, 1, fp16)(Tensor)) {
+            block0() {
+                %relu_0: (1, 1, 1, 1, fp16)(Tensor) = relu(x=%x, name="relu_0")
+            } -> (%relu_0)
+        }
+
+        Output graph (if deployment_target < ios16):
+
+        main(%x: (1, 1, 1, 1, fp32)(Tensor)) {
+            block0() {
+                %cast_0: (1, 1, 1, 1, fp16)(Tensor) = cast(x=%x, dtype="fp16", name="cast_0")
+                %relu_0__pre__output__fp32__cast: (1, 1, 1, 1, fp16)(Tensor) = relu(x=%cast_0, name="relu_0")
+                %relu_0: (1, 1, 1, 1, fp32)(Tensor) = cast(x=%relu_0__pre__output__fp32__cast, dtype="fp32", name="cast_1")
+            } -> (%relu_0)
+        }
+
+        Output graph (if deployment_target >= ios16): same as the input graph
+        """
+        @mb.program(input_specs=[mb.TensorSpec(shape=(1, 1, 1, 1), dtype=types.fp16)])
+        def prog(x):
+            return mb.relu(x=x)
+
+        if use_ios16_deployment_target:  # NOTE(review): the target is set on the PASS_REGISTRY entry and never restored here; the False case uses whatever is already set -- confirm test isolation
+            PASS_REGISTRY["mil_backend::adjust_io_to_supported_types"].minimun_deployment_target = target.iOS16
+
+        prev_prog, prev_block, block = apply_pass_and_basic_check(
+            prog, "mil_backend::adjust_io_to_supported_types"
+        )
+
+        prev_inputs = list(prev_block.inputs.items())
+        inputs = list(block.inputs.items())
+        prev_outputs = prev_block.outputs
+        outputs = block.outputs
+        assert prev_inputs[0][1].name == inputs[0][1].name
+        assert outputs[0].name == prev_outputs[0].name
+        if not use_ios16_deployment_target:
+            assert get_op_types_in_program(prog) == ['cast', 'relu', 'cast']
+            assert inputs[0][1].dtype == types.fp32
+            assert outputs[0].dtype == types.fp32
+        else:
+            assert get_op_types_in_program(prog) == ['relu']
+            assert inputs[0][1].dtype == types.fp16
+            assert block.outputs[0].dtype == types.fp16
+
+
     def test_int8_input(self):
         """
         Input graph:
@@ -846,3 +901,20 @@ def prog(x, y):
         assert len(cast.outputs) == 1
         assert len(cast.outputs[0].child_ops) == 1
         assert cast.outputs[0].child_ops[0].op_type == op
+
+    def test_mul_op_fp32_int32_inputs(self):
+        """An int32 const operand of an fp32 mul must be replaced by a new const, not by a cast op."""
+        @mb.program(input_specs=[mb.TensorSpec(shape=(4,), dtype=types.fp32)])
+        def prog(x):
+            const = mb.const(val=5)
+            out = mb.mul(x=x, y=const)
+            return out
+
+        # sanity check: the program reports a single mul before the pass runs
+        assert get_op_types_in_program(prog) == ["mul"]
+        apply_pass_and_basic_check(prog, "mil_backend::homogenize_input_dtypes")
+        # verify that there is no cast op in the program, since the
+        # const input (int32) should have been promoted to a float32 and replaced with a new const
+        assert get_op_types_in_program(prog) == ["mul"]
+
+
diff --git a/coremltools/converters/mil/backend/nn/load.py b/coremltools/converters/mil/backend/nn/load.py
index 6bc2e82ff..8e27b368d 100644
--- a/coremltools/converters/mil/backend/nn/load.py
+++ b/coremltools/converters/mil/backend/nn/load.py
@@ -5,6 +5,7 @@
 
 import coremltools as ct
 from coremltools.converters.mil.input_types import (
+    ColorLayout,
     ImageType,
     EnumeratedShapes,
     Shape,
@@ -31,25 +32,25 @@
 )
 from .op_mapping import convert_ops
 from .passes.nn_passes import nn_backend_passes
-
+from ..backend_helper import _get_colorspace_enum, _validate_image_input_output_shapes
 
 def _convert_to_image_input(proto, inputs, skip_model_load=False):
     tmp_model = MLModel(proto, skip_model_load=skip_model_load)
     for input_type in inputs:
         if isinstance(input_type, ImageType):
-            if input_type.color_layout == "G":
+            if input_type.color_layout in (ColorLayout.GRAYSCALE, ColorLayout.GRAYSCALE_FLOAT16):
                 gray_bias = input_type.bias
                 red_bias, green_bias, blue_bias = 0.0, 0.0, 0.0
-            elif input_type.color_layout == "RGB":
+            elif input_type.color_layout == ColorLayout.RGB:
                 gray_bias = 0.0
                 red_bias, green_bias, blue_bias = input_type.bias
-            elif input_type.color_layout == "BGR":
+            elif input_type.color_layout == ColorLayout.BGR:
                 gray_bias = 0.0
                 blue_bias, green_bias, red_bias = input_type.bias
             tmp_model = neural_network.utils.make_image_input(
                 tmp_model,
                 input_type.name,
-                is_bgr=input_type.color_layout == "BGR",
+                is_bgr=input_type.color_layout == ColorLayout.BGR,
                 image_format="NCHW" if input_type.channel_first else "NHWC",
                 red_bias=red_bias,
                 green_bias=green_bias,
@@ -210,6 +211,7 @@ def load(prog, **kwargs):
 
     nn_backend_passes(prog)
     input_types = prog.main_input_types
+    output_types = prog.main_output_types
 
     v1_inputs = []
     symbolic_inputs = {}
@@ -275,6 +277,28 @@ def load(prog, **kwargs):
         proto = _convert_to_image_input(proto, input_types,
                                         skip_model_load=kwargs.get("skip_model_load", False))
 
+    # image output: fill in the image feature description of every output declared as ImageType
+    if output_types is not None:
+        assert len(output_types) == len(prog.functions["main"].outputs), \
+                "number of mil program outputs do not match the number of outputs provided by the user"
+        for i, output_proto_desc in enumerate(proto.description.output):
+            output_var = prog.functions["main"].outputs[i]
+            if isinstance(output_types[i], ImageType):
+                if not types.is_tensor(output_var.sym_type):
+                    raise ValueError("Image output, '{}', is a scalar, but it should be a tensor of rank 4".format(
+                        output_var.name))
+                shape = output_var.sym_type.get_shape()
+                if any_variadic(shape):
+                    raise ValueError("Variable rank model outputs, that are ImageTypes, are not supported")
+                if any([is_symbolic(d) for d in shape]):
+                    raise NotImplementedError("Image output '{}' has symbolic dimensions in its shape".
+                                              format(output_var.name))
+                _validate_image_input_output_shapes(output_types[i].color_layout, shape, output_var.name, is_input=False)
+                clr_space = _get_colorspace_enum(output_types[i].color_layout)
+                output_proto_desc.type.imageType.colorSpace = clr_space
+                output_proto_desc.type.imageType.width = shape[-1]
+                output_proto_desc.type.imageType.height = shape[-2]
+
     # classifier flag
     classifier_config = kwargs.get("classifier_config", None)
     if classifier_config is not None:
diff --git a/coremltools/converters/mil/backend/nn/op_mapping.py b/coremltools/converters/mil/backend/nn/op_mapping.py
index 9c20b377d..d642d5c36 100644
--- a/coremltools/converters/mil/backend/nn/op_mapping.py
+++ b/coremltools/converters/mil/backend/nn/op_mapping.py
@@ -16,9 +16,9 @@
     any_symbolic,
     is_symbolic,
 )
-from coremltools.converters.mil.mil.types import np_dtype_to_py_type
 from coremltools.converters.mil.mil import types
 from coremltools.converters.mil.mil.ops.registry import SSAOpRegistry
+from coremltools.converters.mil.mil.types.type_mapping import np_val_to_py_type
 from coremltools.models.neural_network.quantization_utils import (
     _convert_array_to_nbit_quantized_bytes,
 )
@@ -78,26 +78,6 @@ def make_input(const_context, builder, variables):
         add_const(const_context, builder, v.name, v.val)
     return v.name
 
-def to_py_type(val):
-    """Convert numpy val to python primitive equivalent. Ex:
-
-    Given: val = np.array([True, False])
-    Returns: [True, False]
-
-    Given: val = np.array(32, dtype=np.int)
-    Returns 32
-    """
-    if not isinstance(val, (_np.ndarray, _np.generic)):
-        return val
-
-    # val is np.ndarray or np.generic
-    is_np_scalar = isinstance(val, _np.generic) or val.shape == ()
-    py_type = np_dtype_to_py_type(val.dtype)
-    if is_np_scalar:
-        return py_type(val)
-    # flatten them to 1D array
-    val = val.flatten()
-    return tuple(py_type(v) for v in val)
 
 def _convert_pool(const_context, builder, op, mode, exclude_padding_from_average=True):
     num_spatial_dimensions = len(op.kernel_sizes.val)
@@ -706,13 +686,13 @@ def _add_elementwise_binary(
         if op.x.val is not None and op.x.rank == 0 and _np.isfinite(op.x.val):
             params["input_names"] = make_input(const_context, builder, [op.y])
             val = op.x.val if not isinstance(op.x.val, _np.float16) else op.x.val.astype(_np.float32)
-            params["alpha"] = to_py_type(val)
+            params["alpha"] = np_val_to_py_type(val)
             builder.add_elementwise(**params)
             return
         elif op.y.val is not None and op.y.rank == 0 and _np.isfinite(op.y.val):
             params["input_names"] = make_input(const_context, builder, [op.x])
             val = op.y.val if not isinstance(op.y.val, _np.float16) else op.y.val.astype(_np.float32)
-            params["alpha"] = to_py_type(val)
+            params["alpha"] = np_val_to_py_type(val)
             builder.add_elementwise(**params)
             return
     elif mode in ["equal", "not_equal"]:
@@ -721,13 +701,13 @@ def _add_elementwise_binary(
         if op.x.val is not None and op.x.rank == 0 and _np.isfinite(op.x.val):
             params["input_names"] = make_input(const_context, builder, [op.y])
             val = op.x.val if not isinstance(op.x.val, _np.float16) else op.x.val.astype(_np.float32)
-            params["alpha"] = to_py_type(val)
+            params["alpha"] = np_val_to_py_type(val)
             add_func(**params)
             return
         elif op.y.val is not None and op.y.rank == 0 and _np.isfinite(op.y.val):
             params["input_names"] = make_input(const_context, builder, [op.x])
             val = op.y.val if not isinstance(op.y.val, _np.float16) else op.y.val.astype(_np.float32)
-            params["alpha"] = to_py_type(val)
+            params["alpha"] = np_val_to_py_type(val)
             add_func(**params)
             return
     elif mode in ["greater_than", "greater_equal", "less_than", "less_equal"]:
@@ -735,7 +715,7 @@ def _add_elementwise_binary(
         if op.x.val is not None and op.x.rank == 0 and _np.isfinite(op.x.val):
             params["input_names"] = make_input(const_context, builder, [op.y])
             val = op.x.val if not isinstance(op.x.val, _np.float16) else op.x.val.astype(_np.float32)
-            params["alpha"] = to_py_type(val)
+            params["alpha"] = np_val_to_py_type(val)
             if "less" in mode:
                 params["use_greater_than_equal"] = mode.endswith("_equal")
                 builder.add_greater_than(**params)
@@ -746,7 +726,7 @@ def _add_elementwise_binary(
         elif op.y.val is not None and op.y.rank == 0 and _np.isfinite(op.y.val):
             params["input_names"] = make_input(const_context, builder, [op.x])
             val = op.y.val if not isinstance(op.y.val, _np.float16) else op.y.val.astype(_np.float32)
-            params["alpha"] = to_py_type(val)
+            params["alpha"] = np_val_to_py_type(val)
             if "greater" in mode:
                 params["use_greater_than_equal"] = mode.endswith("_equal")
                 builder.add_greater_than(**params)
@@ -1215,20 +1195,20 @@ def slice_by_index(const_context, builder, op):
             output_name=op.outputs[0].name,
             begin_ids=op.begin.val,
             end_ids=op.end.val,
-            strides=to_py_type(stride),
-            begin_masks=to_py_type(begin_mask),
-            end_masks=to_py_type(end_mask),
-            squeeze_masks=to_py_type(squeeze_mask),
+            strides=np_val_to_py_type(stride),
+            begin_masks=np_val_to_py_type(begin_mask),
+            end_masks=np_val_to_py_type(end_mask),
+            squeeze_masks=np_val_to_py_type(squeeze_mask),
         )
     else:
         builder.add_slice_dynamic(
             name=op.name,
             input_names=make_input(const_context, builder, [op.x, op.begin, op.end]),
             output_name=op.outputs[0].name,
-            strides=to_py_type(stride),
-            begin_masks=to_py_type(begin_mask),
-            end_masks=to_py_type(end_mask),
-            squeeze_masks=to_py_type(squeeze_mask),
+            strides=np_val_to_py_type(stride),
+            begin_masks=np_val_to_py_type(begin_mask),
+            end_masks=np_val_to_py_type(end_mask),
+            squeeze_masks=np_val_to_py_type(squeeze_mask),
         )
 
 
@@ -2121,6 +2101,101 @@ def space_to_depth(const_context, builder, op):
     )
 
 
+@register_mil_to_nn_mapping
+def batch_to_space(const_context, builder, op):
+    block_size = op.block_shape.val
+    if block_size[0] != block_size[1]:
+        raise ValueError("batch_to_space non-equal block shape is not supported in 'neuralnetwork' backend! Please change the convert_to to 'mlprogram'.")
+    block_size = block_size[0]
+    if block_size == 1:
+        raise ValueError("batch_to_space block shape == 1 not supported in 'neuralnetwork' backend! Please change the convert_to to 'mlprogram'.")
+
+    transpose_1_name = op.name + "_transpose_1"
+    builder.add_transpose(
+        name=transpose_1_name,
+        input_name=make_input(const_context, builder, op.x),
+        axes=[1, 0, 2, 3],
+        output_name=transpose_1_name,
+    )
+    depth_to_space_name = op.name + "_depth_to_space"
+    builder.add_reorganize_data(
+        name=depth_to_space_name,
+        input_name=transpose_1_name,
+        output_name=depth_to_space_name,
+        mode="DEPTH_TO_SPACE",
+        block_size=block_size,
+    )
+    crop_name = op.name + "_crop"
+    crops = op.crops.val
+    builder.add_crop(
+        name=crop_name,
+        input_names=[depth_to_space_name],
+        output_name=crop_name,
+        offset=0,
+        top=crops[0][0],
+        bottom=crops[0][1],
+        left=crops[1][0],
+        right=crops[1][1],
+    )
+    transpose_2_name = op.name + "_transpose_2"
+    builder.add_transpose(
+        name=transpose_2_name,
+        input_name=crop_name,
+        axes=[1, 0, 2, 3],
+        output_name=op.outputs[0].name,
+    )
+
+
+@register_mil_to_nn_mapping
+def space_to_batch(const_context, builder, op):
+    block_size = op.block_shape.val
+    if block_size[0] != block_size[1]:
+        raise ValueError("space_to_batch non-equal block shape is not supported in 'neuralnetwork' backend! Please change the convert_to to 'mlprogram'.")
+    block_size = block_size[0]
+    if block_size == 1:
+        raise ValueError("space_to_batch block shape == 1 not supported in 'neuralnetwork' backend! Please change the convert_to to 'mlprogram'.")
+
+    pad = op.paddings.val.flatten()
+    left, right = pad[2], pad[3]
+    top, bottom = pad[0], pad[1]
+
+    pad_name = op.name + "_pad"
+    builder.add_padding(
+        name=pad_name,
+        left=left,
+        right=right,
+        top=top,
+        bottom=bottom,
+        input_name=make_input(const_context, builder, op.x),
+        output_name=pad_name,
+        padding_type="constant",
+        value=0.,
+    )
+
+    transpose_1_name = op.name + "_transpose_1"
+    builder.add_transpose(
+        name=transpose_1_name,
+        input_name=pad_name,
+        axes=[1, 0, 2, 3],
+        output_name=transpose_1_name,
+    )
+    space_to_depth_name = op.name + "_space_to_depth"
+    builder.add_reorganize_data(
+        name=space_to_depth_name,
+        input_name=transpose_1_name,
+        output_name=space_to_depth_name,
+        mode="SPACE_TO_DEPTH",
+        block_size=block_size,
+    )
+    transpose_2_name = op.name + "_transpose_2"
+    builder.add_transpose(
+        name=transpose_2_name,
+        input_name=space_to_depth_name,
+        axes=[1, 0, 2, 3],
+        output_name=op.outputs[0].name,
+    )
+
+
 @register_mil_to_nn_mapping
 def transpose(const_context, builder, op):
     builder.add_transpose(
diff --git a/coremltools/converters/mil/backend/nn/passes/alert_return_type_cast.py b/coremltools/converters/mil/backend/nn/passes/alert_return_type_cast.py
index 343e72259..26228e664 100644
--- a/coremltools/converters/mil/backend/nn/passes/alert_return_type_cast.py
+++ b/coremltools/converters/mil/backend/nn/passes/alert_return_type_cast.py
@@ -1,15 +1,13 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import logging
 
-from coremltools.converters.mil.mil.passes.pass_registry import register_pass
+from coremltools.converters.mil.mil import types, Var
 from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
-from coremltools.converters.mil.mil import Var, types
-import logging
+from coremltools.converters.mil.mil.passes.pass_registry import register_pass
 
 
 @register_pass(namespace="nn_backend")
diff --git a/coremltools/converters/mil/backend/nn/passes/handle_return_inputs_as_outputs.py b/coremltools/converters/mil/backend/nn/passes/handle_return_inputs_as_outputs.py
index 232315c3c..1a5f42a53 100644
--- a/coremltools/converters/mil/backend/nn/passes/handle_return_inputs_as_outputs.py
+++ b/coremltools/converters/mil/backend/nn/passes/handle_return_inputs_as_outputs.py
@@ -1,14 +1,12 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
-
 from coremltools.converters.mil.mil import Builder as mb
-from coremltools.converters.mil.mil.passes.pass_registry import register_pass
 from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
+from coremltools.converters.mil.mil.passes.pass_registry import register_pass
+
 
 def _handle_return_inputs_as_outputs_func(f):
     returned_inputs = []
diff --git a/coremltools/converters/mil/backend/nn/passes/handle_return_unused_inputs.py b/coremltools/converters/mil/backend/nn/passes/handle_return_unused_inputs.py
index 9b741eee4..3f8e2b9e2 100644
--- a/coremltools/converters/mil/backend/nn/passes/handle_return_unused_inputs.py
+++ b/coremltools/converters/mil/backend/nn/passes/handle_return_unused_inputs.py
@@ -1,14 +1,12 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
-
 from coremltools.converters.mil.mil import Builder as mb
-from coremltools.converters.mil.mil.passes.pass_registry import register_pass
 from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
+from coremltools.converters.mil.mil.passes.pass_registry import register_pass
+
 
 def _handle_return_unused_inputs_func(f):
 
diff --git a/coremltools/converters/mil/backend/nn/passes/test_mlmodel_passes.py b/coremltools/converters/mil/backend/nn/passes/test_mlmodel_passes.py
index 3c8669c6f..a65c24535 100644
--- a/coremltools/converters/mil/backend/nn/passes/test_mlmodel_passes.py
+++ b/coremltools/converters/mil/backend/nn/passes/test_mlmodel_passes.py
@@ -9,6 +9,7 @@
 
 import numpy as np
 
+from coremltools import ComputeUnit
 from coremltools._deps import _IS_MACOS
 import coremltools.models.datatypes as datatypes
 from coremltools.models.utils import _macos_version
@@ -105,11 +106,11 @@ def test_dead_layer_remove_branch(self):
         )
         builder.add_squeeze("out", "input", "out", squeeze_all=True)
 
-        mlmodel = MLModel(builder.spec)
+        mlmodel = MLModel(builder.spec, compute_units=ComputeUnit.CPU_ONLY)
         data = np.random.rand(2,)
         data_dict = {"input": data}
         if _IS_MACOS:
-            before_pass_out = mlmodel.predict(data_dict, useCPUOnly=True)["out"]
+            before_pass_out = mlmodel.predict(data_dict)["out"]
             if DEBUG:
                 print(
                     "\n mlmodel description before remove disconnected layers pass: \n"
@@ -121,8 +122,8 @@ def test_dead_layer_remove_branch(self):
                     "\n mlmodel description after remove disconnected layers pass: \n"
                 )
                 print_network_spec(builder.spec, style="coding")
-            mlmodel = MLModel(builder.spec)
-            after_pass_out = mlmodel.predict(data_dict, useCPUOnly=True)["out"]
+            mlmodel = MLModel(builder.spec, compute_units=ComputeUnit.CPU_ONLY)
+            after_pass_out = mlmodel.predict(data_dict)["out"]
 
             np.testing.assert_almost_equal(before_pass_out, after_pass_out, decimal=2)
             np.testing.assert_equal(len(builder.spec.neuralNetwork.layers), 1)
@@ -159,7 +160,7 @@ def test_dead_layer_partial_branch(self):
         )
         builder.add_squeeze("out", "relu2_out", "out", squeeze_all=True)
 
-        mlmodel = MLModel(builder.spec)
+        mlmodel = MLModel(builder.spec, compute_units=ComputeUnit.CPU_ONLY)
 
         if not _IS_MACOS:
             # Can not get predictions unless on macOS.
@@ -167,7 +168,7 @@ def test_dead_layer_partial_branch(self):
 
         data = np.random.rand(2,)
         data_dict = {"input": data}
-        before_pass_out = mlmodel.predict(data_dict, useCPUOnly=True)["out"]
+        before_pass_out = mlmodel.predict(data_dict)["out"]
         if DEBUG:
             print("\n mlmodel description before remove disconnected layers pass: \n")
             print_network_spec(builder.spec, style="coding")
@@ -176,8 +177,8 @@ def test_dead_layer_partial_branch(self):
         if DEBUG:
             print("\n mlmodel description after remove disconnected layers pass: \n")
             print_network_spec(builder.spec, style="coding")
-        mlmodel = MLModel(builder.spec)
-        after_pass_out = mlmodel.predict(data_dict, useCPUOnly=True)["out"]
+        mlmodel = MLModel(builder.spec, compute_units=ComputeUnit.CPU_ONLY)
+        after_pass_out = mlmodel.predict(data_dict)["out"]
 
         np.testing.assert_almost_equal(before_pass_out, after_pass_out, decimal=2)
         np.testing.assert_equal(
@@ -236,10 +237,10 @@ def test_conv_crop_bn_to_conv_bn_crop(self):
 
         # Predict
         if _IS_MACOS:
-            mlmodel = MLModel(builder.spec)
+            mlmodel = MLModel(builder.spec, compute_units=ComputeUnit.CPU_ONLY)
             data = np.random.rand(1, 10, 10)
             data_dict = {"data": data}
-            before_pass_out = mlmodel.predict(data_dict, useCPUOnly=True)["out"]
+            before_pass_out = mlmodel.predict(data_dict)["out"]
 
         # transform the pattern
         transform_conv_crop(builder.spec)
@@ -249,8 +250,8 @@ def test_conv_crop_bn_to_conv_bn_crop(self):
 
         if _IS_MACOS:
             # Predict
-            mlmodel = MLModel(builder.spec)
-            after_pass_out = mlmodel.predict(data_dict, useCPUOnly=True)["out"]
+            mlmodel = MLModel(builder.spec, compute_units=ComputeUnit.CPU_ONLY)
+            after_pass_out = mlmodel.predict(data_dict)["out"]
             np.testing.assert_almost_equal(before_pass_out, after_pass_out, decimal=3)
 
     def test_conv_crop_bn_relu_to_conv_bn_relu_crop(self):
@@ -306,10 +307,10 @@ def test_conv_crop_bn_relu_to_conv_bn_relu_crop(self):
 
         # Predict
         if _IS_MACOS:
-            mlmodel = MLModel(builder.spec)
+            mlmodel = MLModel(builder.spec, compute_units=ComputeUnit.CPU_ONLY)
             data = np.random.rand(1, 10, 10)
             data_dict = {"data": data}
-            before_pass_out = mlmodel.predict(data_dict, useCPUOnly=True)["out"]
+            before_pass_out = mlmodel.predict(data_dict)["out"]
 
         # transform the pattern
         transform_conv_crop(builder.spec)
@@ -319,9 +320,9 @@ def test_conv_crop_bn_relu_to_conv_bn_relu_crop(self):
         np.testing.assert_equal("crop", spec.layers[3].WhichOneof("layer"))
 
         # Predict
-        mlmodel = MLModel(builder.spec)
+        mlmodel = MLModel(builder.spec, compute_units=ComputeUnit.CPU_ONLY)
         if _IS_MACOS:
-            after_pass_out = mlmodel.predict(data_dict, useCPUOnly=True)["out"]
+            after_pass_out = mlmodel.predict(data_dict)["out"]
             np.testing.assert_almost_equal(before_pass_out, after_pass_out, decimal=3)
 
 
@@ -334,8 +335,8 @@ def _test_builder(self, builder, input_shape, expected_layer_num=None):
         data = np.random.rand(*input_shape)
 
         # Mlmodel before
-        mlmodel = MLModel(builder.spec)
-        output_before = mlmodel.predict({"data": data}, useCPUOnly=True)["out"]
+        mlmodel = MLModel(builder.spec, compute_units=ComputeUnit.CPU_ONLY)
+        output_before = mlmodel.predict({"data": data})["out"]
         num_layers_before = len(builder.spec.neuralNetwork.layers)
 
         remove_redundant_transposes(builder.spec)
@@ -347,8 +348,8 @@ def _test_builder(self, builder, input_shape, expected_layer_num=None):
             self.assertEqual(len(layers), expected_layer_num)
 
         # Mlmodel after
-        mlmodel = MLModel(builder.spec)
-        output_after = mlmodel.predict({"data": data}, useCPUOnly=True)["out"]
+        mlmodel = MLModel(builder.spec, compute_units=ComputeUnit.CPU_ONLY)
+        output_after = mlmodel.predict({"data": data})["out"]
 
         np.testing.assert_almost_equal(output_before, output_after, decimal=3)
 
diff --git a/coremltools/converters/mil/conftest.py b/coremltools/converters/mil/conftest.py
index 00bb3e0dc..236ca03f5 100644
--- a/coremltools/converters/mil/conftest.py
+++ b/coremltools/converters/mil/conftest.py
@@ -3,11 +3,10 @@
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+
 def pytest_make_parametrize_id(config, val, argname):
-    if isinstance(val, (tuple, list)) and all(isinstance(elem, str) for elem in val):
-        return "-".join(val)
-    if isinstance(val, (int,bool,float)):
-        return "[{}={}]".format(argname, val)
-    if isinstance(val, (tuple, list)) and all(isinstance(elem, int) for elem in val):
-        return "[{}=({})]".format(argname, ",".join([str(i) for i in val]))
-    return None
+    '''
+    This function is a hook into pytest. It generates a user friendly string
+    representation of the parameterized values.
+    '''
+    return "{}={}".format(argname, str(val))
diff --git a/coremltools/converters/mil/converter.py b/coremltools/converters/mil/converter.py
index 99aabafd6..a4ec005a9 100644
--- a/coremltools/converters/mil/converter.py
+++ b/coremltools/converters/mil/converter.py
@@ -4,7 +4,6 @@
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
 import os as _os
-import shutil as _shutil
 import stat as _stat
 import tempfile as _tempfile
 import warnings as _warnings
diff --git a/coremltools/converters/mil/frontend/_utils.py b/coremltools/converters/mil/frontend/_utils.py
index 7dd4defc4..88e47482f 100644
--- a/coremltools/converters/mil/frontend/_utils.py
+++ b/coremltools/converters/mil/frontend/_utils.py
@@ -3,6 +3,7 @@
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+from coremltools.converters.mil.input_types import InputType
 from coremltools.converters.mil.mil import Builder as mb, types
 from coremltools.converters.mil.mil.ops.defs._utils import parse_einsum_equation
 from coremltools.converters.mil.mil.types.symbolic import any_symbolic, is_symbolic
@@ -143,3 +144,18 @@ def _does_block_contain_symbolic_shape(block):
         if _does_block_contain_symbolic_shape(f):
             return True
     return False
+
+
+def get_output_names(outputs):
+    """
+    :param: list[ct.TensorType/ct.ImageType]
+    :return: list[str]
+    """
+    output_names = None
+    if outputs is not None:
+        assert all([isinstance(t, InputType) for t in outputs]), \
+            "outputs must be a list of ct.ImageType or ct.TensorType"
+        output_names = [t.name for t in outputs]
+        if all([name is None for name in output_names]):
+            output_names = None
+    return output_names
\ No newline at end of file
diff --git a/coremltools/converters/onnx/_tests/__init__.py b/coremltools/converters/mil/frontend/milproto/__init__.py
similarity index 67%
rename from coremltools/converters/onnx/_tests/__init__.py
rename to coremltools/converters/mil/frontend/milproto/__init__.py
index 8aa13a28b..34ab79f0b 100644
--- a/coremltools/converters/onnx/_tests/__init__.py
+++ b/coremltools/converters/mil/frontend/milproto/__init__.py
@@ -1,4 +1,6 @@
-# Copyright (c) 2017, Apple Inc. All rights reserved.
+# Copyright (c) 2022, Apple Inc. All rights reserved.
 #
 # Use of this source code is governed by a BSD-3-clause license that can be
 # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+from .load import load
diff --git a/coremltools/converters/mil/frontend/milproto/helper.py b/coremltools/converters/mil/frontend/milproto/helper.py
new file mode 100644
index 000000000..85a05413e
--- /dev/null
+++ b/coremltools/converters/mil/frontend/milproto/helper.py
@@ -0,0 +1,70 @@
+#  Copyright (c) 2022, Apple Inc. All rights reserved.
+#
+# Use of this source code is governed by a BSD-3-clause license that can be
+# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+import sys
+
+from coremltools.converters.mil.mil import types
+from coremltools.converters.mil.mil.program import get_new_symbol
+
+
+def opstr_to_opcls(op_str):
+    return getattr(sys.modules["coremltools.converters.mil.mil.ops.defs"], op_str)
+
+
+def get_proto_dim(dim):
+    if dim.WhichOneof("dimension") == "constant":
+        return dim.constant.size
+    else:
+        if not dim.unknown.variadic:
+            return get_new_symbol()
+        raise NotImplementedError("Variadic dimensions not yet implemented.")
+
+
+def proto_to_types(valuetype):
+    """
+    A helper function that maps the proto value type to PyMIL types.
+    """
+    if valuetype.WhichOneof("type") == "tensorType":
+        tensortype = valuetype.tensorType
+        dtype = types.proto_to_builtin_types[tensortype.dataType]
+
+        if tensortype.rank < 0:
+            raise ValueError("Negative or Dynamic ranks not supported")
+        if tensortype.rank != len(tensortype.dimensions):
+            raise ValueError("Rank doesn't match the number of dimensions")
+        if tensortype.attributes != {}:
+            raise ValueError("Attributes on tensorType not supported")
+
+        shape = []
+        for i in range(tensortype.rank):
+            shape.append(get_proto_dim(tensortype.dimensions[i]))
+
+        # For the zero rank tensor, we always convert it back to scalar in PyMIL first
+        if tensortype.rank == 0:
+            return dtype
+            
+        return types.tensor(dtype, shape)
+
+    elif valuetype.WhichOneof("type") == "listType":
+        listtype = valuetype.listType
+        elem_type = proto_to_types(listtype.type)
+        
+        if listtype.length.unknown:
+            init_length = None
+        else:
+            init_length = listtype.length.constant.size
+
+        # In the MIL proto, there is no such thing as "dynamic_length", hence we set it to True when
+        # converting back to PyMIL
+        return types.list(elem_type, init_length, dynamic_length=True)
+
+    elif valuetype.WhichOneof("type") == "dictionaryType":
+        dicttype = valuetype.dictionaryType
+        keytype = proto_to_types(dicttype.keyType)
+        valuetype = proto_to_types(dicttype.valueType)
+        
+        return types.dict(keytype, valuetype)
+    else:
+        raise NotImplementedError("Types {} not yet implemented".format(valuetype.WhichOneof("type")))
diff --git a/coremltools/converters/mil/frontend/milproto/load.py b/coremltools/converters/mil/frontend/milproto/load.py
new file mode 100644
index 000000000..8c7964c88
--- /dev/null
+++ b/coremltools/converters/mil/frontend/milproto/load.py
@@ -0,0 +1,422 @@
+#  Copyright (c) 2022, Apple Inc. All rights reserved.
+#
+# Use of this source code is governed by a BSD-3-clause license that can be
+# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+import os
+import logging
+
+import numpy as np
+
+from coremltools import _OPSET
+from coremltools.converters.mil.mil import (
+    Block,
+    Builder as mb,
+    Function,
+    ListVar,
+    mil_list,
+    Placeholder,
+    Program,
+    TupleInputType,
+    types,
+    Var,
+)
+from coremltools.converters.mil.mil.block import curr_block
+from coremltools.libmilstoragepython import _BlobStorageReader as BlobReader
+from coremltools.proto import MIL_pb2 as pm
+from .helper import proto_to_types, opstr_to_opcls
+
+
+class TranscriptionContext:
+    """
+    Holds shared variables needed for transcription.
+    """
+
+    def __init__(self, weights_dir=""):
+        self.name_to_var = {} # mapping from name -> var object
+        self.blob_reader_from_filename = (
+            {}
+        )  # mapping from filename -> BlobReader object
+        self.weights_dir = weights_dir
+
+    def register_var_with_name(self, name, var):
+        var.name = name
+        if name in self.name_to_var:
+            # Overriding allows us to translate control flow blocks
+            msg = "Var %s is added again. Overriding previous value"
+            logging.info(msg % name)
+        self.name_to_var[name] = var
+
+    def get_var_from_name(self, name):
+        if name not in self.name_to_var:
+            raise KeyError("Var {} not found".format(name))
+        return self.name_to_var[name]
+
+
+def _load_tensorvalue(tensorvalue_spec):
+    if not isinstance(tensorvalue_spec, pm.TensorValue):
+        raise TypeError("Invalid TensorValue spec object")
+
+    if tensorvalue_spec.WhichOneof("value") == "floats":
+        return tensorvalue_spec.floats.values
+    elif tensorvalue_spec.WhichOneof("value") == "ints":
+        return tensorvalue_spec.ints.values
+    elif tensorvalue_spec.WhichOneof("value") == "bools":
+        return tensorvalue_spec.bools.values
+    elif tensorvalue_spec.WhichOneof("value") == "strings":
+        return tensorvalue_spec.strings.values
+    elif tensorvalue_spec.WhichOneof("value") == "longInts":
+        return tensorvalue_spec.longInts.values
+    elif tensorvalue_spec.WhichOneof("value") == "doubles":
+        return tensorvalue_spec.doubles.values
+    elif tensorvalue_spec.WhichOneof("value") == "bytes":
+        return tensorvalue_spec.bytes.values
+    else:
+        raise ValueError("Invalid dtype for TensorValue type")
+
+
+def _load_immediate_value(immediatevalue_spec):
+    if not isinstance(immediatevalue_spec, pm.Value.ImmediateValue):
+        raise TypeError("Invalid ImmedidateValue spec object")
+
+    if immediatevalue_spec.WhichOneof("value") == "tensor":
+        return _load_tensorvalue(immediatevalue_spec.tensor)
+    elif immediatevalue_spec.WhichOneof("value") == "list":
+        return immediatevalue_spec.list.values
+    else:
+        raise NotImplementedError(
+            "Immediate value type not supported yet."
+        )
+
+
+def _load_file_value(context, filevalue_spec, dtype):
+    if not isinstance(filevalue_spec, pm.Value.BlobFileValue):
+        raise TypeError("Invalid BlobFileValue spec object")
+
+    filename = os.path.join(context.weights_dir, filevalue_spec.fileName.split("/")[-1])
+    offset = filevalue_spec.offset
+
+    if filename in context.blob_reader_from_filename:
+        blob_reader = context.blob_reader_from_filename[filename]
+    else:
+        blob_reader = BlobReader(filename)
+        context.blob_reader_from_filename[filename] = blob_reader
+
+    if dtype == types.uint8:
+        np_value = np.array(blob_reader.read_uint8_data(offset), np.uint8)
+    elif dtype == types.int8:
+        np_value = np.array(blob_reader.read_int8_data(offset), np.int8)
+    elif dtype == types.fp16:
+        np_value_uint16 = np.array(blob_reader.read_fp16_data(offset), np.uint16)
+        np_value = np.frombuffer(np_value_uint16.tobytes(), np.float16)
+    elif dtype == types.fp32:
+        np_value = np.array(blob_reader.read_float_data(offset), np.float32)
+    else:
+        raise ValueError("Invalid dtype for blob file value type")
+
+    return np_value
+
+
+def _load_value(context, value_spec):
+    if not isinstance(value_spec, pm.Value):
+        raise TypeError("Invalid Value spec object")
+
+    if value_spec.docString:
+        raise ValueError("Docstring would get lost in the process.")
+
+    if value_spec.type.WhichOneof("type") == "tensorType":
+        valuetype = proto_to_types(value_spec.type)
+
+        is_tensor = types.is_tensor(valuetype)
+
+        dtype = valuetype if not is_tensor else valuetype.get_primitive()
+        shape = () if not is_tensor else valuetype.get_shape()
+
+        if value_spec.WhichOneof("value") == "immediateValue":
+            value = _load_immediate_value(value_spec.immediateValue)
+        else:
+            value = _load_file_value(context, value_spec.blobFileValue, dtype)
+
+        if dtype in (types.fp16, types.int8, types.uint8, types.uint32):
+            value = np.frombuffer(value, types.nptype_from_builtin(dtype)).reshape(
+                shape
+            )
+        elif dtype == types.str and shape == ():
+            value = str(value[0])
+        elif dtype in (types.fp32, types.str, types.bool, types.int32, types.int64):
+            value = (
+                np.array(value).astype(types.nptype_from_builtin(dtype)).reshape(shape)
+            )
+        else:
+            raise ValueError("Invalid dtype for tensor value")
+    else:
+        raise NotImplementedError("Only value of tensorType implemented yet")
+
+    if not is_tensor and not isinstance(value, str):
+        value = types.nptype_from_builtin(dtype)(value.item())
+
+    return value
+
+
+def _create_var_from_spec(spec):
+    """
+    This helper function is used for creating PyMIL Var/ListVar from the proto spec.
+    Mainly used for the construction of the control flow ops.
+    """
+    assert isinstance(spec, pm.NamedValueType)
+    sym_type = proto_to_types(spec.type)
+    name = spec.name
+    if types.is_list(sym_type):
+        var = ListVar(
+            name, 
+            elem_type=sym_type.T[0], 
+            init_length=sym_type.T[1], 
+            dynamic_length=sym_type.T[2])
+    else:
+        var = Var(name, sym_type, None, op=None, op_output_idx=None)
+    return var
+
+def _set_outer_op_for_nested_blocks(blocks, op):
+    """
+    A utility function that sets the outer_op of the blocks for control flow ops.
+    """
+    for block in blocks:
+        block.outer_op = op
+
+def _create_nested_blocks(context, op_spec):
+    """
+    A utility function that creates nested blocks for control flow ops.
+    """
+    if not op_spec.blocks:
+        return []
+
+    blocks = []
+
+    for block_spec in op_spec.blocks:
+        input_vars = [_create_var_from_spec(input) for input in block_spec.inputs]
+
+        # add block input vars to the context
+        for v in input_vars:
+            context.register_var_with_name(v.name, v)
+
+        # In pymil, the outer_op for a block can only be None if the block is a Function.
+        # As a result, we use a dummy outer_op here for block creation, and set it to
+        # the legit op later on in _set_outer_op_for_nested_blocks
+        dummy = mb.const(val=0.)
+        with Block(block_inputs=input_vars, outer_op=dummy._op,
+                   name=Block._get_new_name()) as block:
+            _load_block(context, block_spec)
+
+        blocks.append(block)
+
+    return blocks
+
+def _set_inputs_for_control_flow_op(inputs, blocks, op_type):
+    """
+    A utility function that sets the dummy functional inputs and block inputs for
+    control flow ops.
+    """
+    if op_type == "while_loop":
+        def _dummy_cond(*loop_vars):
+            return None
+        
+        def _dummy_body(*loop_vars):
+            return None
+
+        inputs["_existing_blocks"] = blocks
+        inputs["_cond"] = _dummy_cond
+        inputs["_body"] = _dummy_body
+
+    elif op_type == "cond":
+        def _dummy_true_fn(*loop_vars):
+            return None
+        def _dummy_false_fn(*loop_vars):
+            return None
+
+        inputs["_existing_blocks"] = blocks
+        inputs["_true_fn"] = _dummy_true_fn
+        inputs["_false_fn"] = _dummy_false_fn
+
+
+def _load_operation(context, op_spec):
+    if not isinstance(op_spec, pm.Operation):
+        raise TypeError("Invalid Operation spec object")
+
+    op_type = op_spec.type
+    if op_type == "const" or op_type.startswith("constexpr_"):
+        if op_spec.blocks:
+            raise ValueError("const / constexpr operation can't have any block")
+        if op_spec.inputs:
+            raise ValueError("const / constexpr operation can't have any input")
+
+        inputs = {k: _load_value(context, v) for k, v in op_spec.attributes.items()}
+        pymil_var = getattr(mb, op_type)(**inputs)
+        context.register_var_with_name(op_spec.outputs[0].name, pymil_var)
+
+    else:
+        if op_type == "custom_layer":
+            raise NotImplementedError(
+                "Loading Custom Layer operation not yet implemented"
+            )
+
+        if op_spec.attributes:
+            raise ValueError("Attributes on operation not supported")
+
+        # The conversion steps of an operation proto -> PyMIL operation are as following:
+
+        # (i)   Convert the input arguments:
+        #       In most of the cases, the input variable is already created beforehand, hence we can
+        #       directly access and get them through the TranscriptionContext.
+        #       There are cases, though, where the inputs are literal values. This can happen in the classify op spec.
+        #       For that case, we directly create a constant variable.
+
+        # (ii)  Create nested blocks for control flow operations:
+        #       The Python functinoal input arguments for control flow ops cannot be recovered from milproto -> pymil conversion,
+        #       for instance, the _body, _cond for mb.while_loop and _true_fn, _false_fn for mb.cond are not invertible
+        #       Hence, here we directly create the nested blocks from the proto, and set them to mb.while_loop.blocks / mb.cond.blocks.
+        #       Note that, when creating a block, PyMIL required an outer_op, which should be the control flow operation itself. However,
+        #       in this approach we take, the outer_op hasn't been created at the time when the blocks produced. Here, we make a "dummy outer_op",
+        #       which could pass the check in PyMIL, also it could provide enough information (such as visible variables in the blocks etc.)
+        #       for the creation of the block.
+
+        # (iii) Create PyMIL operation using inputs / blocks
+        #       Note that for the control flow cases, we create dummy functional inputs, and use the exisiting block to create the op.
+
+        # (iv)  Set the outer_op for control flow
+        #       Once the operation is created, we replace the dummy outer_op with the legit one, to make it a valid PyMIL program
+
+        inputs = {}
+        for param_name, argument in op_spec.inputs.items():
+            vars = []
+            for binding in argument.arguments:
+                if binding.WhichOneof("binding") == "name":
+                    vars.append(context.get_var_from_name(binding.name))
+                elif binding.WhichOneof("binding") == "value":
+                    # We only support the list value for now (for the classifier use case)
+                    value_spec = binding.value
+                    assert value_spec.WhichOneof("value") == "immediateValue"
+                    assert value_spec.immediateValue.WhichOneof("value") == "list"
+                    list_value = _load_immediate_value(value_spec.immediateValue)
+                    values = []
+                    for value_spec in list_value:
+                        values.append(_load_value(context, value_spec))
+                    var = mb.const(val=mil_list(values))
+                    vars.append(var)
+                else:
+                    raise NotImplementedError("Binding {} not yet implemented".format(binding_type))
+
+            # TODO: rdar://92930138 (Milproto -> Pymil op translation should take account of the op version)
+            # we need to use the spec version of the function to pick up the correct version of op
+            op_cls = opstr_to_opcls(op_type)
+            if len(vars) == 1 and not isinstance(
+                op_cls.input_spec.input_types[param_name], TupleInputType
+            ):
+                inputs[param_name] = vars[0]
+            else:
+                inputs[param_name] = vars
+
+        blocks = _create_nested_blocks(context, op_spec)
+        _set_inputs_for_control_flow_op(inputs, blocks, op_type)
+
+        output_var = getattr(mb, op_type)(**inputs)
+        if not isinstance(output_var, (tuple, list)):
+            output_var = [output_var]
+
+        if len(output_var) != len(op_spec.outputs):
+            raise AssertionError(
+                "Mismatch between number of outputs in operation specification vs PyMIL outputs"
+            )
+
+        for spec, var in zip(op_spec.outputs, output_var):
+            context.register_var_with_name(spec.name, var)
+
+            pymil_type = var.sym_type
+            proto_type = proto_to_types(spec.type)
+            if not types.is_compatible_type(pymil_type, proto_type):
+                # We allow a corner case where the pymil has an 0 rank tensor and the spec produces a scalar
+                if types.is_tensor(pymil_type) and types.is_scalar(proto_type):
+                    if pymil_type.get_primitive() == proto_type:
+                        continue
+                raise AssertionError(
+                    "Mismatch between var types in specification vs PyMIL"
+                )
+
+        _set_outer_op_for_nested_blocks(blocks, output_var[0].op)
+
+
+def _load_block(context, block_spec):
+    """Load every operation of a milproto Block into the current PyMIL block and set its outputs."""
+    if not isinstance(block_spec, pm.Block):
+        raise TypeError("Invalid Block spec object")
+    if block_spec.attributes:
+        raise ValueError("Attributes on block not supported")
+
+    # Translate the operations first: they register the vars that the block outputs refer to.
+    for operation in block_spec.operations:
+        _load_operation(context, operation)
+
+    resolved_outputs = [
+        context.get_var_from_name(output_name) for output_name in block_spec.outputs
+    ]
+
+    block = curr_block()
+    block.set_outputs(resolved_outputs)
+    return block
+
+
+def _load_function(context, func_spec, spec_version):
+    """Build a PyMIL Function from a milproto Function spec, registering its inputs in ``context``."""
+    if not isinstance(func_spec, pm.Function):
+        raise TypeError("Invalid Function spec object")
+
+    opset = func_spec.opset
+    if _OPSET[spec_version] != opset:
+        raise AssertionError(
+            "Mismatch between provide specification version vs version implied by opset field"
+        )
+    if func_spec.attributes:
+        raise ValueError("Attributes on functions not supported")
+
+    # One Placeholder per declared input; only tensor-typed inputs are accepted.
+    placeholders = {}
+    for input_spec in func_spec.inputs:
+        input_name = input_spec.name
+        input_type = proto_to_types(input_spec.type)
+        if not types.is_tensor(input_type):
+            raise ValueError("Functions inputs can only be tensors")
+        placeholder = Placeholder(
+            sym_shape=input_type.get_shape(), dtype=input_type.get_primitive(), name=input_name
+        )
+        placeholders[input_name] = placeholder
+        context.register_var_with_name(input_name, placeholder.outputs[0])
+
+    specializations = func_spec.block_specializations
+    if opset not in specializations:
+        raise ValueError("Missing block specialization for opset {}".format(opset))
+    with Function(placeholders) as pymil_func:
+        _load_block(context, specializations[opset])
+    return pymil_func
+
+
+def load(program_spec, specification_version, file_weights_dir="", **kwargs):
+    """Convert a milproto Program spec into a PyMIL Program; extra ``kwargs`` are ignored."""
+    if not isinstance(program_spec, pm.Program):
+        raise TypeError("Invalid Program spec object")
+    if program_spec.docString:
+        raise NotImplementedError("Docstring would be lost in the process")
+    if program_spec.version != 1:
+        raise ValueError("Invalid program version")
+
+    context = TranscriptionContext(file_weights_dir)
+    pymil_program = Program()
+    # A single context is shared by all functions of the program.
+    for func_name, func_spec in program_spec.functions.items():
+        loaded_func = _load_function(context, func_spec, specification_version)
+        pymil_program.add_function(func_name, loaded_func)
+
+    # "buildInfo" is the only program-level attribute that is accepted.
+    for attr_name in program_spec.attributes:
+        if attr_name != "buildInfo":
+            raise ValueError("Invalid attribute for program")
+
+    return pymil_program
diff --git a/coremltools/converters/mil/frontend/milproto/test_load.py b/coremltools/converters/mil/frontend/milproto/test_load.py
new file mode 100644
index 000000000..282866e43
--- /dev/null
+++ b/coremltools/converters/mil/frontend/milproto/test_load.py
@@ -0,0 +1,164 @@
+# Copyright (c) 2022, Apple Inc. All rights reserved.
+#
+# Use of this source code is governed by a BSD-3-clause license that can be
+# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+import numpy as np
+import pytest
+import torch
+
+import coremltools as ct
+from coremltools import ComputeUnit
+from coremltools.converters.mil.converter import mil_convert
+from coremltools.converters.mil.frontend.milproto.load import load as milproto_to_pymil
+from coremltools.converters.mil.frontend.torch.test.test_torch_ops import TestScriptedModels as _TestScriptedModels
+from coremltools.converters.mil.frontend.tensorflow.test.test_ops import TestTensorArray as _TestTensorArray
+from coremltools.converters.mil.frontend.tensorflow.test.testing_utils import run_compare_tf
+from coremltools.converters.mil.mil.ops.tests.testing_utils import compare_backend
+from coremltools.converters.mil.testing_utils import get_op_types_in_program
+from coremltools.converters._converters_entry import _get_metadata_from_mlmodel
+
+
+def roundtrip_and_compare_mlmodel(mlmodel, input_dict):
+    """Load ``mlmodel`` back into PyMIL, convert it again and compare predictions on ``input_dict``."""
+    spec = mlmodel.get_spec()
+    if spec.WhichOneof("Type") != "mlProgram":
+        raise ValueError("Only MIL proto based mlmodels can be loaded")
+
+    # milproto -> PyMIL
+    loaded_prog = milproto_to_pymil(
+        program_spec=spec.mlProgram,
+        specification_version=spec.specificationVersion,
+        file_weights_dir=mlmodel.weights_dir,
+    )
+
+    # PyMIL -> milproto backed MLModel, reusing the original model description
+    converted = mil_convert(
+        loaded_prog,
+        convert_to="mlprogram",
+        convert_from="milinternal",
+        compute_units=ComputeUnit.ALL,
+        model_description=spec.description,
+    )
+
+    # set MIL program attributes
+    converted._set_build_info_mil_attributes(_get_metadata_from_mlmodel(mlmodel))
+
+    reference_outputs = mlmodel.predict(input_dict)
+    compare_backend(converted, input_dict, reference_outputs)
+
+
+class TestLoadAPIUsage:
+    """Usage example of the milproto -> PyMIL load API."""
+
+    def test_mil_proto_to_pymil(self):
+        from coremltools.converters.mil import Builder as mb
+
+        # Build a small PyMIL program; every MIL op takes named inputs and `name` is optional.
+        @mb.program(input_specs=[mb.TensorSpec(shape=(1, 3, 100, 100))])
+        def prog(x):
+            relu = mb.relu(x=x, name='relu')
+            conv = mb.conv(x=relu, weight=np.random.rand(10, 3, 2, 2), name="conv")
+            transposed = mb.transpose(x=conv, perm=[0, 3, 1, 2], name='transpose')
+            reduced = mb.reduce_mean(x=transposed, axes=[2, 3], keep_dims=False, name='reduce')
+            return mb.log(x=reduced, name='log')
+
+        # Convert it to a MIL proto backed MLModel.
+        mlmodel = mil_convert(
+            prog,
+            convert_to="mlprogram",
+            convert_from="milinternal",
+            compute_units=ComputeUnit.ALL,
+        )
+
+        # Load the MLModel back to PyMIL.
+        spec = mlmodel.get_spec()
+        reloaded_prog = milproto_to_pymil(
+            program_spec=spec.mlProgram,
+            specification_version=spec.specificationVersion,
+            file_weights_dir=mlmodel.weights_dir,
+        )
+
+        # The reloaded program must report the same op types as the defined one.
+        expected_op_types = get_op_types_in_program(prog)
+        reloaded_op_types = get_op_types_in_program(reloaded_prog)
+        if reloaded_op_types != expected_op_types:
+            raise AssertionError("Mismatch between defined PyMIL prog and loaded PyMIL prog")
+
+
+@pytest.mark.skipif(ct.utils._macos_version() < (12, 0), reason="mlprogram predict available only on macOS12+")
+class TestE2ENumericalCorrectness:
+    """Convert models to mlprogram, round-trip them through the milproto loader and compare predictions."""
+
+    def test_elu(self):
+        input_types = [ct.TensorType(name="data", shape=(2, 3, 1))]
+        tensors = [torch.rand(*t.shape.to_list()) for t in input_types]
+        traced = torch.jit.trace(torch.nn.ELU(inplace=False), tensors)
+        mlmodel = ct.convert(
+            traced, inputs=input_types, convert_to="mlprogram", compute_units=ComputeUnit.CPU_ONLY
+        )
+
+        # Predict with the same tensors that were used for tracing.
+        feed = {t.name: tensor.detach().numpy() for t, tensor in zip(input_types, tensors)}
+        roundtrip_and_compare_mlmodel(mlmodel, feed)
+
+    def test_linear(self):
+        input_types = [ct.TensorType(name="data", shape=(10, 2))]
+        tensors = [torch.rand(*t.shape.to_list()) for t in input_types]
+        linear = torch.nn.Linear(in_features=2, out_features=3, bias=True)
+        traced = torch.jit.trace(linear, tensors)
+        mlmodel = ct.convert(
+            traced, inputs=input_types, convert_to="mlprogram", compute_units=ComputeUnit.CPU_ONLY
+        )
+
+        # Predict with the same tensors that were used for tracing.
+        feed = {t.name: tensor.detach().numpy() for t, tensor in zip(input_types, tensors)}
+        roundtrip_and_compare_mlmodel(mlmodel, feed)
+
+    def test_conv(self):
+        input_types = [ct.TensorType(name="data", shape=(5, 10, 4, 4))]
+        tensors = [torch.rand(*t.shape.to_list()) for t in input_types]
+        conv = torch.nn.Conv2d(in_channels=10, out_channels=20, kernel_size=4)
+        traced = torch.jit.trace(conv, tensors)
+        mlmodel = ct.convert(
+            traced, inputs=input_types, convert_to="mlprogram", compute_units=ComputeUnit.CPU_ONLY
+        )
+
+        # Predict with the same tensors that were used for tracing.
+        feed = {t.name: tensor.detach().numpy() for t, tensor in zip(input_types, tensors)}
+        roundtrip_and_compare_mlmodel(mlmodel, feed)
+
+    def test_while_loop(self):
+        model = _TestScriptedModels.get_while_loop_model()
+        scripted = torch.jit.script(model)
+        data_type = ct.TensorType(name="data", shape=model.input_size, dtype=np.float32)
+        mlmodel = ct.convert(
+            scripted, inputs=[data_type], convert_to="mlprogram", compute_units=ComputeUnit.CPU_ONLY
+        )
+
+        feed = {"data": np.array([10.])}
+        roundtrip_and_compare_mlmodel(mlmodel, feed)
+
+    def test_cond(self):
+        scripted = torch.jit.script(_TestScriptedModels.get_cond_model())
+        data_type = ct.TensorType(name="data", shape=(1,), dtype=np.float32)
+        mlmodel = ct.convert(
+            scripted, inputs=[data_type], convert_to="mlprogram", compute_units=ComputeUnit.CPU_ONLY
+        )
+
+        # NOTE(review): the two values presumably exercise both branches of the cond - confirm.
+        for value in (1., 11.):
+            roundtrip_and_compare_mlmodel(mlmodel, {"data": np.array([value])})
+
+    def test_list(self):
+        model, inputs, outputs = _TestTensorArray.get_dynamic_elem_shape_model()
+        sample = np.random.rand(2, 3)
+        tf_feed = dict(zip(inputs, [sample]))
+
+        # Only the converted mlmodel (second returned item) is needed here.
+        results = run_compare_tf(
+            model, tf_feed, outputs, use_cpu_for_conversion=True, backend=("mlprogram", "fp16")
+        )
+        _, mlmodel, _, _ = results
+
+        roundtrip_and_compare_mlmodel(mlmodel, {"Placeholder": sample})
diff --git a/coremltools/converters/mil/frontend/tensorflow/basic_graph_ops.py b/coremltools/converters/mil/frontend/tensorflow/basic_graph_ops.py
index ed8b3e06e..81d2f72e4 100644
--- a/coremltools/converters/mil/frontend/tensorflow/basic_graph_ops.py
+++ b/coremltools/converters/mil/frontend/tensorflow/basic_graph_ops.py
@@ -196,7 +196,7 @@ def fill_outputs(gd):
             gd[i].outputs.append(v.name)
         for i in v.control_inputs:
             gd[i].control_outputs.append(v.name)
-    get_tuple_ops = ["Split", "SplitV", "LSTMBlock"]
+    get_tuple_ops = ["Split", "SplitV", "LSTMBlock", "NonMaxSuppressionV5"]
     for k, v in gd.items():
         if v.op in get_tuple_ops:
             outputs = [[out, int(gd[out].attr["index"])] for out in v.outputs]
diff --git a/coremltools/converters/mil/frontend/tensorflow/convert_utils.py b/coremltools/converters/mil/frontend/tensorflow/convert_utils.py
index 6c13dcf53..a92f77c24 100644
--- a/coremltools/converters/mil/frontend/tensorflow/convert_utils.py
+++ b/coremltools/converters/mil/frontend/tensorflow/convert_utils.py
@@ -168,7 +168,7 @@ def convert_graph(context, graph, outputs=None):
     # Translate the non-placeholder ops.
     num_nodes = len(nodes)
     for i, node_name in enumerate(
-        _tqdm(nodes, desc="Converting Frontend ==> MIL Ops", unit=" ops")
+        _tqdm(nodes, desc="Converting TF Frontend ==> MIL Ops", unit=" ops")
     ):
         node = graph[node_name]
         if node.op == "return":
diff --git a/coremltools/converters/mil/frontend/tensorflow/converter.py b/coremltools/converters/mil/frontend/tensorflow/converter.py
index f954c9e3e..448119a10 100644
--- a/coremltools/converters/mil/frontend/tensorflow/converter.py
+++ b/coremltools/converters/mil/frontend/tensorflow/converter.py
@@ -8,6 +8,7 @@
 from .basic_graph_ops import simple_topsort
 from .convert_utils import convert_graph
 from .ssa_passes.tf_passes import tensorflow_passes
+from .._utils import get_output_names
 from coremltools.converters.mil.input_types import (
     _get_shaping_class,
     InputType,
@@ -22,7 +23,8 @@
     Builder as mb,
     Function,
     get_new_symbol,
-    Program
+    Program,
+    types,
 )
 from coremltools.converters._profile_utils import _profile
 
@@ -119,14 +121,15 @@ def __init__(self, tfssa, inputs=None, outputs=None, **kwargs):
         """
         tfssa: TensorFlow IR.
         inputs: list of TensorType or ImageType, optional, defaults to None.
-        outputs: list of str or str, optional, defaults to None.
-            A list of names of the output nodes or a str for single output name.
-            If None, the converter will try to extract the output information from
-            TensorFlow model.
+        outputs: list[ct.InputType] or None
+            list of either ct.TensorTypes or ct.ImageTypes (both of which are child classes of InputType)
+            This is the value of the "outputs" argument, passed on by the user in "coremltools.convert" API.
         """
         self.tfssa = tfssa
         self.global_type = {}
         self.inputs = None
+        self.main_output_types = outputs
+        output_names = get_output_names(outputs)
 
         main_func = tfssa.functions["main"]
         graph = main_func.graph
@@ -219,11 +222,11 @@ def __init__(self, tfssa, inputs=None, outputs=None, **kwargs):
             node.attr["_output_shapes"] = [shape]  # list of length 1
 
         # infer outputs if not provided
-        self._validate_outputs(tfssa, outputs)
-        outputs = main_func.outputs if outputs is None else outputs
-        outputs = outputs if isinstance(outputs, (tuple, list)) else [outputs]
-        outputs = [x if isinstance(x, str) else x.name for x in outputs]
-        self.outputs = outputs
+        self._validate_outputs(tfssa, output_names)
+        output_names = main_func.outputs if output_names is None else output_names
+        output_names = output_names if isinstance(output_names, (tuple, list)) else [output_names]
+        output_names = [x if isinstance(x, str) else x.name for x in output_names]
+        self.output_names = output_names
 
         # We would like a stack so that we run conversion sequentially.
         self.graph_stack = self._get_stack(tfssa, root="main")
@@ -293,6 +296,58 @@ def _validate_outputs(self, tfssa, outputs):
             if self._get_tensor_name(n) not in output_nodes + all_nodes:
                 raise KeyError('Output node name "{}" does exist.'.format(n))
 
+    def _validate_and_update_main_output_types(self, prog):
+        """Validate the user-provided output types against the outputs of the "main" function.
+
+        When the outputs are named, ``self.main_output_types`` is rebuilt in program-output order.
+        """
+        assert isinstance(self.main_output_types, list)
+        assert len(self.main_output_types) > 0
+        output_vars = prog.functions["main"].outputs
+        output_vars_names = {var.name for var in output_vars}
+
+        if get_output_names(self.main_output_types) is None:
+            # The user did not provide names for the outputs, so the outputs were
+            # inferred from the TF graph automatically.
+            if len(output_vars) != 1:
+                # With more than 1 sink node (i.e. inferred output), the user must provide
+                # names so that the output types can be correctly mapped.
+                msg = "The list of ct.TensorType()/ct.ImageType() provided in the 'outputs' argument, does not " \
+                      "have names. When names are not provided, the outputs are automatically inferred " \
+                      "from the TF graph. There are {} outputs detected which are more than 1. " \
+                      "In this case, to map the output types correctly, " \
+                      "please provide names for each of the " \
+                      "outputs. The output names inferred from the TF graph are: {} "
+                num_outputs = len(output_vars)
+                raise ValueError(msg.format(num_outputs, output_vars_names))
+
+            # A single inferred output cannot be matched with several unnamed output types.
+            if len(self.main_output_types) > 1:
+                msg = "The list of ct.TensorType()/ct.ImageType() provided in the 'outputs' argument, does not " \
+                      "have names. When more than 1 output is provided for tensorflow conversion, " \
+                      "each entry in the outputs list must have the name specified as well, " \
+                      "via the 'name' argument in ct.TensorType/ct.ImageType"
+                raise ValueError(msg)
+            return
+
+        # The user provided output names, so the appropriate tensors must have been
+        # selected from the TF graph based on those names.
+        assert len(output_vars) == len(self.main_output_types), \
+            "this should match if the outputs were picked correctly from the TF graph"
+
+        # Every requested name has to be the name of one of the program outputs.
+        for output_type in self.main_output_types:
+            if output_type.name not in output_vars_names:
+                msg = "output name, '{}', not found in Tensorflow Graph. Available output names are: {}"
+                raise KeyError(msg.format(output_type.name, output_vars_names))
+
+        # Rebuild the list in the same order as the program outputs.
+        type_by_name = {
+            output_type.name: output_type for output_type in self.main_output_types
+        }
+        self.main_output_types = [type_by_name[var.name] for var in output_vars]
+
+
     def check_placeholder_output(self, prog, outputs_name):
         """
         Handle the cases where placeholder is output.
@@ -301,7 +356,7 @@ def check_placeholder_output(self, prog, outputs_name):
                 block3() {
                 } -> (%Placeholder)
             }
-        But self.outputs = ["Placeholder:0"]
+        But self.output_names = ["Placeholder:0"]
         We need to change the block output to Placeholder:0 by inserting an identity
         """
         block = prog["main"]
@@ -326,8 +381,19 @@ def convert_main_graph(self, prog, graph):
         with Function(func_inputs) as ssa_func:
             # Get the input Var
             for name in func_inputs.keys():
-                self.context.add(name, ssa_func.inputs[name])
-            outputs = convert_graph(self.context, graph, self.outputs)
+                input_var = ssa_func.inputs[name]
+                if (types.is_tensor(input_var.sym_type) or types.is_scalar(input_var.sym_type)) \
+                        and (input_var.dtype == types.fp16 or input_var.dtype == types.fp64):
+                    # cast the input var to float32
+                    # We need to do this because the type inference is very buggy when started from
+                    # float16/float64 typed inputs. Until that is fixed in the following radar
+                    # we cast all inputs of type float16/float64 to float32 as the first step.
+                    # These casts will later get removed, if compute_precision=Float16 is
+                    # provided, which will cause the FP16ComputePrecision pass to run.
+                    # TODO: remove this when this radar is fixed: rdar://93731970
+                    input_var = mb.cast(x=input_var, dtype="fp32", name=name)
+                self.context.add(name, input_var)
+            outputs = convert_graph(self.context, graph, self.output_names)
             ssa_func.set_outputs(outputs)
             prog.add_function("main", ssa_func)
         # check duplicate output
@@ -379,13 +445,19 @@ def convert_main_graph(self, prog, graph):
         # Note: only rename the output if the output is not Placeholder.
 
         input_names = [x.name for x in self.inputs]
-        for v_o, out_name in zip(prog["main"].outputs, self.outputs):
+        for v_o, out_name in zip(prog["main"].outputs, self.output_names):
             if v_o.name != out_name and v_o.name not in input_names:
                 logging.info(
                     "Renaming output var: '{}' -> '{}'".format(v_o.name, out_name)
                 )
                 v_o.name = out_name
-        self.check_placeholder_output(prog, self.outputs)
+        self.check_placeholder_output(prog, self.output_names)
+
+        # verify that if model output dtypes / names are provided by the user, they are valid
+        if self.main_output_types is not None:
+            self._validate_and_update_main_output_types(prog)
+            prog.set_main_output_types(self.main_output_types)
+
 
     @_profile
     def convert(self):
diff --git a/coremltools/converters/mil/frontend/tensorflow/dot_visitor.py b/coremltools/converters/mil/frontend/tensorflow/dot_visitor.py
index f60b611cd..2e4ba5040 100644
--- a/coremltools/converters/mil/frontend/tensorflow/dot_visitor.py
+++ b/coremltools/converters/mil/frontend/tensorflow/dot_visitor.py
@@ -6,7 +6,7 @@
 from coremltools.converters.mil.mil import types
 
 
-class DotVisitor(object):
+class DotVisitor:
     """
     Generates a dot description of a graph in dictionary form.
     """
diff --git a/coremltools/converters/mil/frontend/tensorflow/load.py b/coremltools/converters/mil/frontend/tensorflow/load.py
index 91be00e70..3dd38cbec 100644
--- a/coremltools/converters/mil/frontend/tensorflow/load.py
+++ b/coremltools/converters/mil/frontend/tensorflow/load.py
@@ -28,6 +28,7 @@
 )
 from .tfssa import NetworkEnsemble, SSAFunction
 from .parsed_tf_node import ParsedTFNode
+from .._utils import get_output_names
 from coremltools.converters._profile_utils import _profile
 from coremltools._deps import _get_version
 
@@ -60,7 +61,8 @@ def load(self):
 
         logging.info("Loading TensorFlow model '{}'".format(self.model))
         outputs = self.kwargs.get("outputs", None)
-        self._graph_def = self._graph_def_from_model(outputs)
+        output_names = get_output_names(outputs)
+        self._graph_def = self._graph_def_from_model(output_names)
 
         if self._graph_def is not None and len(self._graph_def.node) == 0:
             msg = "tf.Graph should have at least 1 node, Got empty graph."
@@ -86,7 +88,7 @@ def load(self):
         return program
 
     # @abstractmethod
-    def _graph_def_from_model(self, outputs=None):
+    def _graph_def_from_model(self, output_names=None):
         """Load TensorFlow model into GraphDef. Overwrite for different TF versions."""
         pass
 
@@ -137,15 +139,15 @@ def __init__(self, model, debug=False, **kwargs):
         """
         TFLoader.__init__(self, model, debug, **kwargs)
 
-    def _graph_def_from_model(self, outputs=None):
+    def _graph_def_from_model(self, output_names=None):
         """Overwrites TFLoader._graph_def_from_model()"""
         msg = "Expected model format: [tf.Graph | .pb | SavedModel | tf.keras.Model | .h5], got {}"
         if isinstance(self.model, tf.Graph) and hasattr(self.model, "as_graph_def"):
             graph_def = self.model.as_graph_def(add_shapes=True)
-            return self.extract_sub_graph(graph_def, outputs)
+            return self.extract_sub_graph(graph_def, output_names)
         elif isinstance(self.model, tf.keras.Model):
             graph_def = self._from_tf_keras_model(self.model)
-            return self.extract_sub_graph(graph_def, outputs)
+            return self.extract_sub_graph(graph_def, output_names)
         elif isinstance(self.model, str):
             if not os.path.exists(str(self.model)):
                 raise ValueError('Input model "{}" does not exist'.format(self.model))
@@ -163,13 +165,13 @@ def _graph_def_from_model(self, outputs=None):
                     with tf.Graph().as_default() as graph:
                         tf.graph_util.import_graph_def(gd, name="")
                 graph_def = graph.as_graph_def(add_shapes=True)
-                return self.extract_sub_graph(graph_def, outputs)
+                return self.extract_sub_graph(graph_def, output_names)
             elif os.path.isfile(str(self.model)) and self.model.endswith(".h5"):
                 graph_def = self._from_tf_keras_model(self.model)
-                return self.extract_sub_graph(graph_def, outputs)
+                return self.extract_sub_graph(graph_def, output_names)
             elif os.path.isdir(str(self.model)):
                 graph_def = self._from_saved_model(self.model)
-                return self.extract_sub_graph(graph_def, outputs)
+                return self.extract_sub_graph(graph_def, output_names)
             else:
                 raise NotImplementedError(msg.format(self.model))
         else:
diff --git a/coremltools/converters/mil/frontend/tensorflow/ops.py b/coremltools/converters/mil/frontend/tensorflow/ops.py
index dc0e97f0b..287436046 100644
--- a/coremltools/converters/mil/frontend/tensorflow/ops.py
+++ b/coremltools/converters/mil/frontend/tensorflow/ops.py
@@ -158,6 +158,35 @@ def _get_MFCC_constants(spectrogram_N,
     return weights, mat_weighted, mat_spec_val, cosines
 
 
+def _reshape_remaining_dimensions_to_canonical_shape(x, remaining_rank):
+    """Reshape [batch, spatial_dims, remaining_dim_1, ..., remaining_dim_N] to
+    [batch, spatial_dims, remaining_dim_1 * ... * remaining_dim_N].
+
+    When there are no remaining dimensions, a trailing axis is added instead.
+    """
+    assert remaining_rank != 1
+    if remaining_rank == 0:
+        return mb.expand_dims(x=x, axes=[-1])
+    leading_shape = mb.slice_by_size(x=mb.shape(x=x), begin=[0], size=[x.rank - remaining_rank])
+    target_shape = mb.concat(values=[leading_shape, [-1]], axis=0)
+    return mb.reshape(x=x, shape=target_shape)
+
+
+def _reshape_remaining_dimension_to_original_shape(x, original_shape, remaining_rank):
+    """Reshape [batch_new, spatial_dims_new, remaining_dims] back to the original form
+    [batch_new, spatial_dims_new, remaining_dim_1, ..., remaining_dim_N]."""
+    assert remaining_rank != 1
+    if remaining_rank == 0:
+        return mb.squeeze(x=x, axes=[-1])
+    current_shape = mb.shape(x=x)
+    # Number of leading (batch + spatial) axes, derived from the length of original_shape.
+    num_leading = original_shape.shape[0] - remaining_rank
+    leading_shape = mb.slice_by_size(x=current_shape, begin=[0], size=[num_leading])
+    trailing_shape = mb.slice_by_size(x=original_shape, begin=[num_leading], size=[-1])
+    target_shape = mb.concat(values=[leading_shape, trailing_shape], axis=0)
+    return mb.reshape(x=x, shape=target_shape)
+
+
 @register_tf_op(tf_alias=["BiasAdd", "AddV2"])
 def Add(context, node):
     x = context[node.inputs[0]]
@@ -324,98 +353,86 @@ def AvgPool3D(context, node):
 
 @register_tf_op
 def BatchToSpaceND(context, node):
+    # In tensorflow, the input tensor has the shape of (batch,) + spatial_shape + remaining_shape.
+    # The shape is treated as a combination of 3 components:
+    # 1. A single batch dimension
+    # 2. Spatial dimensions, with a length spatial_rank, which can be either 1 or 2. Also, spatial_rank
+    #    is equal to the length of block_shape
+    # 3. Remaining dimensions, with a length remaining_rank 
+
+    # The logic of translating this op is as follows:
+    # 1. We first reshape the input to a canonical shape (rolling the remaining shape dimensions into a 
+    #    single dimension): (batch,) + spatial_shape + (R), where R = remaining_dim_1 * ... * remaining_dim_n
+    # 2. We support rank 1 and rank 2 spatial shape:
+    #    (i) rank 1: We decompose the BatchToSpace into small basic ops.
+    #    (ii) rank 2: We directly use the built in batch_to_space op.
+    #    The output would have shape (batch_new,) + spatial_shape_new + (R)
+    # 3. We transform the tensor back, by unrolling the remaining shape: (B_new,) + spatial_shape_new + remaining_shape
+
     x = context[node.inputs[0]]
     block_shape = context[node.inputs[1]].val
     crops = context[node.inputs[2]].val
+    original_shape = mb.shape(x=x)
 
-    if x.rank != 3 and x.rank != 4:
-        raise NotImplementedError("rank of input must be 3 or 4!")
-
     if block_shape is None or crops is None:
         raise NotImplementedError(
-            "Not support dynamic block_shape and paddings for BatchToSpaceND!"
+            "Not support dynamic block_shape and crops for BatchToSpaceND!"
         )
 
-    if len(block_shape.flatten()) > 2:
-        raise NotImplementedError("rank of spatial shape > 2 is not yet supported")
+    # Derive the ranks only after the static check: len() on a dynamic (None) block_shape raises TypeError
+    spatial_rank = len(block_shape)
+    remaining_rank = x.rank - 1 - spatial_rank
+    has_non_unity_remaining_dims = remaining_rank != 1
+
+    if has_non_unity_remaining_dims:
+        # Reshape the input tensor to shape [batch, spatial_shape, remaining_dim_1 * ... * remaining_dim_N]
+        x = _reshape_remaining_dimensions_to_canonical_shape(x, remaining_rank)
 
-    if x.rank == 3 or (x.rank == 4 and len(block_shape) == 1):
+    if spatial_rank >= 3:
+        raise NotImplementedError("Rank of spatial shape > 2 is not supported.")
 
-        input_shape = mb.shape(x=x)
-        rank = x.rank
-        spatial_rank = len(block_shape)
+    if spatial_rank == 2:
+        # Tensor has shape [B, H, W, C], we can directly use the batch_to_space op by doing
+        # [B, H, W, C] -> transpose -> [B, C, H, W] -> batch_to_space -> [B_new, C, H_new, W_new] ->
+        # transpose -> [B_new, H_new, W_new, C]
+        x = mb.transpose(x=x, perm=[0, 3, 1, 2])  
+        x = mb.batch_to_space(x=x, block_shape=block_shape, crops=crops, name=node.name)
+        x = mb.transpose(x=x, perm=[0, 2, 3, 1])
+
+    if spatial_rank == 1:
+        # In this case, we decompose batch_to_space into small basic ops
+        # [B, H, C] -> decomposed ops -> [B_new, H_new, C]
 
-        # reshape input to [block_shape] + [batch_size/prod(block_shape)] + x.shape[1:]
+        # reshape input to [block_shape, B/block_shape, H, C]
+        input_shape = mb.shape(x=x)
+        block_shape = block_shape[0]
         batch_size = _value_at(input_shape, 0)
-        block_shape_prod = _np.prod(block_shape)
-        resize_batch_size = mb.real_div(x=batch_size, y=block_shape_prod)
-        resize_batch_size = [mb.cast(x=resize_batch_size, dtype="int32")]
-        remain_dims = [_value_at(input_shape, i) for i in range(1, rank)]
-        block_dims = [dim for dim in block_shape]
-        reshape_values = block_dims + resize_batch_size + remain_dims
+        spatial_size = _value_at(input_shape, 1)
+        channel_size = _value_at(input_shape, 2)
+        new_batch_size = mb.cast(x=mb.real_div(x=batch_size, y=block_shape), dtype="int32")
+        reshape_values = [block_shape, new_batch_size, spatial_size, channel_size]
         reshape_shape = mb.concat(values=reshape_values, axis=0)
-        reshaped = mb.reshape(x=x, shape=reshape_shape)
-
-        # permute the tensor to shape [batch / prod(block_shape)] +
-        #                             [input_shape[1], block_shape[0], ..., input_shape[M], block_shape[M-1]] +
-        #                             [input_shape[M+1], ..., input_shape[N-1]]
-        block_shape_dims = list(range(spatial_rank))
-        batch_dim = [spatial_rank]
-        input_shape_dims = list(range(spatial_rank + 1, reshaped.rank))
-        perm = [batch_dim[0]]
-        for i in range(spatial_rank):
-            perm += [input_shape_dims[i], block_shape_dims[i]]
-        perm += input_shape_dims[spatial_rank:]
-        permuted = mb.transpose(x=reshaped, perm=perm)
-
-        # reshape tensor to shape [batch / prod(block_shape)] +
-        #                         [input_shape[1] * block_shape[0], ..., input_shape[M] * block_shape[M-1]] +
-        #                         [input_shape[M+1], ..., input_shape[N-1]]
-        spatial_dims = []
-        for i in range(spatial_rank):
-            spatial_dims.append(
-                mb.mul(x=_value_at(input_shape, i + 1), y=block_shape[i])
-            )
-        remain_dims = [_value_at(input_shape, i) for i in range(spatial_rank + 1, rank)]
-        reshape_values = resize_batch_size + spatial_dims + remain_dims
-        reshape_shape = mb.concat(values=reshape_values, axis=0)
-        reshape_permuted = mb.reshape(x=permuted, shape=reshape_shape)
-
-        # crop the tensor using stride slice
-        begin = [0]
-        for i in range(spatial_rank):
-            begin.append(crops[i][0])
-        for i in range(spatial_rank + 1, rank):
-            begin.append(0)
-        end = [resize_batch_size[0]]
-        for i in range(spatial_rank):
-            end.append(mb.sub(x=spatial_dims[i], y=crops[i][1]))
-        end += remain_dims
-        end = mb.concat(values=end, axis=0)
-        x = mb.slice_by_index(x=reshape_permuted, begin=begin, end=end, name=node.name)
-    else:
-        if len(block_shape.flatten()) != 2:
-            raise NotImplementedError(
-                "rank of spatial shape != 2 is not yet supported for 4d input."
-            )
-        if block_shape[0] != block_shape[1]:
-            raise NotImplementedError("non-equal block shape is not yet supported")
+        x = mb.reshape(x=x, shape=reshape_shape, name=node.name)
 
-        needs_cropping = any(crops.flatten())
+        # permute the tensor to [B/block_shape, H, block_shape, C]
+        x = mb.transpose(x=x, perm=[1, 2, 0, 3])
 
-        x = mb.transpose(x=x, perm=[3, 0, 1, 2])
+        # reshape the tensor to [B/block_shape, H*block_shape, C]
+        new_spatial_size = mb.cast(x=mb.mul(x=spatial_size, y=block_shape), dtype="int32")
+        reshape_values = [new_batch_size, new_spatial_size, channel_size]
+        reshape_shape = mb.concat(values=reshape_values, axis=0)
+        x = mb.reshape(x=x, shape=reshape_shape)
 
-        x = mb.depth_to_space(x=x, block_size=block_shape[0])
-        if needs_cropping:
-            x = mb.crop(
-                x=x,
-                crop_height=[crops[0][0], crops[0][1]],
-                crop_width=[crops[1][0], crops[1][1]],
-            )
+        # crop the tensor to [B/block_shape, H*block_shape - crops[0][0] - crops[0][1], C]
+        x = mb.crop(x=x, crop_height=crops[0], crop_width=[0, 0])
 
-        x = mb.transpose(x=x, perm=[1, 2, 3, 0], name=node.name)
-    context.add(node.name, x)
+    if has_non_unity_remaining_dims:
+        # Reshape the tensor from shape [batch_new, spatial_shape_new, remaining_dim_1 * ... * remaining_dim_N] back to 
+        # shape [batch_new, spatial_shape_new, remaining_shape]
+        x = _reshape_remaining_dimension_to_original_shape(x, original_shape, remaining_rank)
 
+    context.add(node.name, mb.identity(x=x, name=node.name))
+    
 
 @register_tf_op
 def Ceil(context, node):
@@ -1043,6 +1060,11 @@ def EuclideanNorm(context, node):
     x = mb.reduce_l2_norm(x=x, axes=axes, keep_dims=keep_dims, name=node.name)
     context.add(node.name, x)
 
+@register_tf_op
+def IdentityN(context, node):
+    res = [mb.identity(x=context[x]) for x in node.inputs]
+    context.add(node.name, res)
+
 
 @register_tf_op
 def ExpandDims(context, node):
@@ -1426,6 +1448,17 @@ def Square(context, node):
     x = mb.mul(x=x, y=x, name=node.name)
     context.add(node.name, x)
 
+
+def _softmax_cross_entropy_with_logits(feats, labels, name):
+    # compute the log softmax
+    y = mb.reduce_log_sum_exp(x=feats, axes=[-1], keep_dims=True)
+    log_softmax = mb.sub(x=feats, y=y)
+    loss = mb.mul(x=labels, y=log_softmax)
+    loss = mb.mul(x=-1, y=loss)
+    loss = mb.reduce_sum(x=loss, axes=[-1], name=name)
+    return loss
+
+
 @register_tf_op
 def SparseSoftmaxCrossEntropyWithLogits(context, node):
     feats = context[node.inputs[0]]
@@ -1435,15 +1468,18 @@ def SparseSoftmaxCrossEntropyWithLogits(context, node):
         indices=labels, 
         one_hot_vector_size=class_nums,
         )
+    loss = _softmax_cross_entropy_with_logits(feats, labels, node.name)
+    context.add(node.name, loss)
 
-    # compute the log softmax
-    y = mb.reduce_log_sum_exp(x=feats, axes=[-1], keep_dims=True)
-    log_softmax = mb.sub(x=feats, y=y)
-    loss = mb.mul(x=labels, y=log_softmax)
-    loss = mb.mul(x=-1, y=loss)
-    loss = mb.reduce_sum(x=loss, axes=[-1], name=node.name)
+
+@register_tf_op
+def SoftmaxCrossEntropyWithLogits(context, node):
+    feats = context[node.inputs[0]]
+    labels = context[node.inputs[1]]
+    loss = _softmax_cross_entropy_with_logits(feats, labels, node.name)
     context.add(node.name, loss)
 
+
 @register_tf_op
 def StridedSlice(context, node):
     x = context[node.inputs[0]]
@@ -1942,98 +1978,93 @@ def Softmax(context, node):
 
 
 @register_tf_op
-def SpaceToBatchND(context, node):
+def SpaceToBatchND(context, node):    
+    # In tensorflow, the input tensor has the shape of (batch,) + spatial_shape + remaining_shape.
+    # The shape is treated as a combination of 3 components:
+    # 1. A single batch dimension
+    # 2. Spatial dimensions, with a length spatial_rank, which can be either 1 or 2. Also, spatial_rank
+    #    is equal to the length of block_shape
+    # 3. Remaining dimensions, with a length remaining_rank 
+
+    # The logic of translating this op is as follows:
+    # 1. We first reshape the input to a canonical shape (rolling the remaining shape dimensions into a 
+    #    single dimension): (batch,) + spatial_shape + (R), where R = remaining_dim_1 * ... * remaining_dim_n
+    # 2. We support rank 1 and rank 2 spatial shape:
+    #    (i) rank 1: We decompose the SpaceToBatch into small basic ops.
+    #    (ii) rank 2: We directly use the built in space_to_batch op.
+    #    The output would have shape (batch_new,) + spatial_shape_new + (R)
+    # 3. We transform the tensor back, by unrolling the remaining shape: (B_new,) + spatial_shape_new + remaining_shape
+  
     x = context[node.inputs[0]]
     block_shape = context[node.inputs[1]].val
     paddings = context[node.inputs[2]].val
+    original_shape = mb.shape(x=x)
 
-    if x.rank != 3 and x.rank != 4:
-        raise NotImplementedError("rank of input must be 3 or 4!")
-
     if block_shape is None or paddings is None:
         raise NotImplementedError(
             "Not support dynamic block_shape and paddings for SpaceToBatchND!"
         )
 
+    # Derive the ranks only after the static check: len() on a dynamic (None) block_shape raises TypeError
+    spatial_rank = len(block_shape)
+    remaining_rank = x.rank - 1 - spatial_rank
+    has_non_unity_remaining_dims = remaining_rank != 1
+
-    if len(block_shape.flatten()) > 2:
-        raise NotImplementedError("rank of spatial shape > 2 is not yet supported")
-
-    # use sequence of ops to implement spacetobatch for cases:
-    # (1) x.rank == 3
-    # (2) x.rank == 4 and len(block_shape) == 1
-    if x.rank == 3 or (x.rank == 4 and len(block_shape) == 1):
-
-        rank = x.rank
-        spatial_rank = len(block_shape)
-
-        # expand padding to have shape [x.rank, 2]
-        paddings = _np.concatenate(
-            [[[0, 0]], paddings, _np.zeros(shape=(3, 2), dtype=_np.int32)], axis=0
-        )
-        paddings = paddings[: x.rank, :]
+    if has_non_unity_remaining_dims:
+        # Reshape the input tensor to shape [batch, spatial_shape, remaining_dim_1 * ... * remaining_dim_N]
+        x = _reshape_remaining_dimensions_to_canonical_shape(x, remaining_rank)
+
+    if spatial_rank >= 3:
+        raise NotImplementedError("Rank of spatial shape > 2 is not supported.")
+
+    if spatial_rank == 2:
+        # Tensor has shape [B, H, W, C], we can directly use the space_to_batch op by doing
+        # [B, H, W, C] -> transpose -> [B, C, H, W] -> space_to_batch -> [B_new, C, H_new, W_new] ->
+        # transpose -> [B_new, H_new, W_new, C]
+        x = mb.transpose(x=x, perm=[0, 3, 1, 2])
+        x = mb.space_to_batch(x=x, block_shape=block_shape, paddings=paddings)
+        x = mb.transpose(x=x, perm=[0, 2, 3, 1])
+
+    if spatial_rank == 1:
+        # In this case, we decompose space_to_batch into small basic ops
+        # [B, H, C] -> decomposed ops -> [B_new, H_new, C]
+        
+        # expand padding to shape [3, 2]
+        new_paddings = _np.zeros(shape=(3, 2), dtype=_np.int32)
+        new_paddings[1] = paddings
+        paddings = new_paddings
         needs_paddings = any(paddings.flatten())
         if needs_paddings:
             padded = mb.pad(x=x, pad=paddings.flatten(), mode="constant")
         else:
             padded = x
-        padded_shape = mb.shape(x=padded)
 
-        # padded_shape = [batch_size] + [spatial_dims] + [remaining_dims]
-        batch_size = [_value_at(padded_shape, 0)]
-        spatial_dims = [_value_at(padded_shape, i) for i in range(1, spatial_rank + 1)]
-        remaining_dims = [
-            _value_at(padded_shape, i) for i in range(spatial_rank + 1, rank)
-        ]
+        # padded_shape = [B, H_padded, C]
+        padded_shape = mb.shape(x=padded)
 
-        # padded_shape = [batch_size] + [s0, s1, ..., sm] + [remaining_dims]
-        # reshape_shape = [batch_size] +
-        #                 [s0/block_shape[0],block_shape[0],...,sm/block_shape[m],block_shape[m]] +
-        #                 [remaining_dims]
-        values = []
-        for i in range(spatial_rank):
-            dim = mb.real_div(x=spatial_dims[i], y=block_shape[i])
-            values.append(mb.cast(x=dim, dtype="int32"))
-            values.append(block_shape[i])
-        values = batch_size + values + remaining_dims
-        reshape_shape = mb.concat(values=values, axis=0)
+        # reshape to [B, H_padded/block_shape, block_shape, C]
+        block_shape = block_shape[0]
+        batch_size = _value_at(padded_shape, 0)
+        spatial_dim = mb.real_div(x=_value_at(padded_shape, 1), y=block_shape)
+        spatial_dim = mb.cast(x=spatial_dim, dtype="int32")
+        remain_dim = _value_at(padded_shape, 2)
+        reshape_shape = mb.concat(values=[batch_size, spatial_dim, block_shape, remain_dim], axis=0)
         reshaped_padded = mb.reshape(x=padded, shape=reshape_shape)
 
-        # permute the shape to : [block_shape] + [batch_size] +
-        #                        [s0/block_shape[0],...,sm/block_shape[m]] +
-        #                        [remaining_dims]
-        batch_axis = [0]
-        block_shape_axis = [2 + 2 * i for i in range(spatial_rank)]
-        spatial_axis = [1 + 2 * i for i in range(spatial_rank)]
-        remaining_axis = list(range(block_shape_axis[-1] + 1, len(values)))
-        perm = block_shape_axis + batch_axis + spatial_axis + remaining_axis
-        permuted_reshaped_padded = mb.transpose(x=reshaped_padded, perm=perm)
-
-        # reshape the tensor to [prod(block_shape)*batch_size] +
-        #                       [s0/block_shape[0],...,sm/block_shape[m],block_shape[m]] +
-        #                       [remaining_dims]
-        prod_block_shape = _np.prod(block_shape.flatten())
-        resize_batch_size = [mb.mul(x=values[0], y=prod_block_shape)]
-        resize_spatial_dims = [values[1 + 2 * i] for i in range(spatial_rank)]
-        final_reshape_values = resize_batch_size + resize_spatial_dims + remaining_dims
-        final_shape = mb.concat(values=final_reshape_values, axis=0)
-        x = mb.reshape(x=permuted_reshaped_padded, shape=final_shape, name=node.name)
-    else:
-
-        if block_shape[0] != block_shape[1]:
-            raise NotImplementedError(
-                "non-equal block shape is not yet supported for 4d input."
-            )
-        needs_paddings = any(paddings.flatten())
-
-        x = mb.transpose(x=x, perm=[3, 0, 1, 2])
+        # permute the shape to: [block_shape, B, H_padded/block_shape, C]
+        permuted_reshaped_padded = mb.transpose(x=reshaped_padded, perm=[2, 0, 1, 3])
 
-        if needs_paddings:
-            x = mb.pad(x=x, pad=paddings.flatten(), mode="constant")
+        # reshape the tensor to [block_shape * B, H_padded/block_shape, C]
+        final_reshape_values = [mb.mul(x=batch_size, y=block_shape), spatial_dim, remain_dim]
+        final_shape = mb.concat(values=final_reshape_values, axis=0)
+        x = mb.reshape(x=permuted_reshaped_padded, shape=final_shape)
 
-        x = mb.space_to_depth(x=x, block_size=block_shape[0])
-        x = mb.transpose(x=x, perm=[1, 2, 3, 0], name=node.name)
+    if has_non_unity_remaining_dims:
+        # Reshape the tensor from shape [batch_new, spatial_shape_new, remaining_dim_1 * ... * remaining_dim_N] back to 
+        # shape [batch_new, spatial_shape_new, remaining_shape]
+        x = _reshape_remaining_dimension_to_original_shape(x, original_shape, remaining_rank)
 
-    context.add(node.name, x)
+    context.add(node.name, mb.identity(x=x, name=node.name))
 
 
 @register_tf_op
@@ -2065,6 +2096,25 @@ def TopK(context, node):
     context.add(node.name, x)
 
 
+@register_tf_op(tf_alias=["InTopKV2"])
+def InTopK(context, node):
+    x = context[node.inputs[0]]
+    target = context[node.inputs[1]]
+    k = context[node.inputs[2]].val
+
+    _, class_num = x.shape
+    if not is_symbolic(class_num):
+        k = min(k, class_num)
+
+    _, indices = mb.topk(x=x, k=k, axis=-1)
+    target = mb.expand_dims(x=target, axes=[-1])
+    x = mb.equal(x=target, y=indices)
+    x = mb.cast(x=x, dtype="fp32")
+    x = mb.reduce_sum(x=x, axes=[-1], keep_dims=False)
+    x = mb.cast(x=x, dtype="bool", name=node.name)
+    context.add(node.name, x)
+
+
 @register_tf_op
 def Cumsum(context, node):
     x = context[node.inputs[0]]
@@ -2224,9 +2274,11 @@ def OneHot(context, node):
     )
     context.add(node.name, x)
 
-
-@register_tf_op(tf_alias=["NonMaxSuppressionV3"])
-def NonMaxSuppression(context, node):
+def _get_non_maximum_supression(context, node):
+    """
+    The helper function returns the outputs from mb.non_maximum_suppression,
+    along with the number of boxes and the maximum number of boxes.
+    """
     boxes = context[node.inputs[0]]
     scores = context[node.inputs[1]]
     max_boxes = context[node.inputs[2]]
@@ -2238,7 +2290,7 @@ def NonMaxSuppression(context, node):
         score_threshold = -3.4e38
     boxes = mb.expand_dims(x=boxes, axes=[0])
     scores = mb.expand_dims(x=scores, axes=[0, -1])
-    _, _, x, _ = mb.non_maximum_suppression(
+    coordinates, scores, indices, valid_outputs = mb.non_maximum_suppression(
         boxes=boxes,
         scores=scores,
         max_boxes=max_boxes,
@@ -2246,12 +2298,39 @@ def NonMaxSuppression(context, node):
         score_threshold=score_threshold,
     )
     num_boxes = boxes.shape[1]
-    if not is_symbolic(num_boxes) and num_boxes < max_boxes.val:
-        x = mb.squeeze(x=x, axes=[0])
-        x = mb.slice_by_index(x=x, begin=[0], end=[num_boxes], name=node.name)
+
+    return coordinates, scores, indices, valid_outputs, num_boxes, max_boxes.val
+
+@register_tf_op(tf_alias=["NonMaxSuppressionV3"])
+def NonMaxSuppression(context, node):
+    _, _, indices, _, num_boxes, max_boxes = _get_non_maximum_supression(context, node)
+
+    if not is_symbolic(num_boxes) and num_boxes < max_boxes:
+        indices = mb.squeeze(x=indices, axes=[0])
+        indices = mb.slice_by_index(x=indices, begin=[0], end=[num_boxes], name=node.name)
     else:
-        x = mb.squeeze(x=x, axes=[0], name=node.name)
-    context.add(node.name, x)
+        indices = mb.squeeze(x=indices, axes=[0], name=node.name)
+    context.add(node.name, indices)
+
+
+@register_tf_op
+def NonMaxSuppressionV5(context, node):
+    """
+    Different from NonMaxSuppression/NonMaxSuppressionV3, which only returns the indices of the selected boxes,
+    NonMaxSuppressionV5 returns all indices, scores and number of the selected boxes.
+    """
+    _, scores, indices, valid_outputs, num_boxes, max_boxes = _get_non_maximum_supression(context, node)
+    soft_nms_sigma = context[node.inputs[5]].val
+    if soft_nms_sigma != 0:
+        raise NotImplementedError("NonMaxSuppressionV5 with soft_nms_sigma != 0 not supported.")
+    scores = mb.squeeze(x=scores, axes=[0, -1])
+    indices = mb.squeeze(x=indices, axes=[0])
+    valid_outputs = mb.squeeze(x=valid_outputs, axes=[0])
+    if not is_symbolic(num_boxes) and num_boxes < max_boxes:
+        scores = mb.slice_by_index(x=scores, begin=[0], end=[num_boxes])
+        indices = mb.slice_by_index(x=indices, begin=[0], end=[num_boxes])
+    res = [indices, scores, valid_outputs]
+    context.add(node.name, res)
 
 
 @register_tf_op
@@ -2935,13 +3014,15 @@ def LSTMBlockCell(context, node):
     )
     context.add(node.name, res)
 
-
-@register_tf_op()
+@register_tf_op(tf_alias=["BlockLSTMV2"])
 def BlockLSTM(context, node):
+    # BlockLSTM: https://www.tensorflow.org/api_docs/python/tf/raw_ops/BlockLSTM
+    # BlockLSTMV2: https://www.tensorflow.org/api_docs/python/tf/raw_ops/BlockLSTMV2
     seq_len = context[node.inputs[0]]  # int
     x = context[node.inputs[1]]  # [padded_len, batch, input_dim]
     init_c = context[node.inputs[2]]  # [1, hidden_dim]
     init_h = context[node.inputs[3]]  # [1, hidden_dim]
+    # BlockLSTM: icfo format, BlockLSTMV2: ifco format
     weight = context[node.inputs[4]]  # [input_dim + hidden_dim, 4*hidden_dim]
 
     kwargs = {}
@@ -2954,13 +3035,24 @@ def BlockLSTM(context, node):
         kwargs["weight_peep_f"] = peep_f
         kwargs["weight_peep_o"] = peep_o
 
+    # BlockLSTM: icfo format, BlockLSTMV2: ifco format
     bias = context[node.inputs[8]]  # [4*hidden_dim,]
 
-    forget_bias = node.attr["forget_bias"]
+    # forget bias is always 0 for BlockLSTMV2
+    forget_bias = 0.0 if node.op == "BlockLSTMV2" else node.attr["forget_bias"]
     cell_clip = None
     if node.attr["cell_clip"] is not None and node.attr["cell_clip"] > 0:
         cell_clip = node.attr["cell_clip"]
 
+    if node.op == "BlockLSTMV2":
+        # mb.tf_lstm_block takes weights and bias in icfo format
+        # BlockLSTMV2's weights and bias are in ifco format
+        # convert from ifco to icfo format
+        w_i, w_f, w_c, w_o = mb.split(x=weight, num_splits=4, axis=-1)
+        weight = mb.concat(values=(w_i, w_c, w_f, w_o), axis=1, name=weight.name)
+        b_i, b_f, b_c, b_o = mb.split(x=bias, num_splits=4, axis=-1)
+        bias = mb.concat(values=(b_i, b_c, b_f, b_o), axis=0, name=bias.name)
+
     res = mb.tf_lstm_block(
         seq_len=seq_len,
         x=x,
diff --git a/coremltools/converters/mil/frontend/tensorflow/parse.py b/coremltools/converters/mil/frontend/tensorflow/parse.py
index a513bd414..708824480 100644
--- a/coremltools/converters/mil/frontend/tensorflow/parse.py
+++ b/coremltools/converters/mil/frontend/tensorflow/parse.py
@@ -1,18 +1,16 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import logging as _logging
+
+import numpy as _np
 
 from coremltools.converters.mil.mil import types
 from tensorflow.core.framework.types_pb2 import DataType
 from tensorflow.python.framework.dtypes import _TF_TO_NP
 
-import logging as _logging
-import numpy as _np
-
 
 def parse_type(t):
     mapping = {
diff --git a/coremltools/converters/mil/frontend/tensorflow/ssa_passes/test_passes.py b/coremltools/converters/mil/frontend/tensorflow/ssa_passes/test_passes.py
index 662e39af0..bfc47f4e7 100644
--- a/coremltools/converters/mil/frontend/tensorflow/ssa_passes/test_passes.py
+++ b/coremltools/converters/mil/frontend/tensorflow/ssa_passes/test_passes.py
@@ -13,7 +13,7 @@
 import copy
 import pytest
 
-pytest.importorskip("tensorflow", minversion="1.14.0")
+pytest.importorskip("tensorflow", minversion="1.15.0")
 
 
 def test_backfill_make_list_elem_type():
diff --git a/coremltools/converters/mil/frontend/tensorflow/test/test_composite_ops.py b/coremltools/converters/mil/frontend/tensorflow/test/test_composite_ops.py
index bf400b36d..3771bda4d 100644
--- a/coremltools/converters/mil/frontend/tensorflow/test/test_composite_ops.py
+++ b/coremltools/converters/mil/frontend/tensorflow/test/test_composite_ops.py
@@ -67,7 +67,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
diff --git a/coremltools/converters/mil/frontend/tensorflow/test/test_conversion_api.py b/coremltools/converters/mil/frontend/tensorflow/test/test_conversion_api.py
new file mode 100644
index 000000000..3b226b856
--- /dev/null
+++ b/coremltools/converters/mil/frontend/tensorflow/test/test_conversion_api.py
@@ -0,0 +1,517 @@
+#  Copyright (c) 2022, Apple Inc. All rights reserved.
+#
+#  Use of this source code is governed by a BSD-3-clause license that can be
+#  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+import numpy as np
+from PIL import Image
+import pytest
+
+import coremltools as ct
+from coremltools.converters.mil.testing_utils import (
+    assert_cast_ops_count,
+    assert_input_dtype,
+    assert_ops_in_mil_program,
+    assert_output_dtype,
+    assert_prog_input_type,
+    assert_prog_output_type,
+    assert_spec_input_image_type,
+    assert_spec_output_image_type,
+    verify_prediction,
+)
+from coremltools.proto import FeatureTypes_pb2 as ft
+from coremltools._deps import _HAS_TF_2
+
+tf = pytest.importorskip("tensorflow")
+
+@pytest.fixture
+def int32_input_model():
+    """TF graph: int32 placeholder "input" of shape [10, 20] plus the int32 constant 5, as op "output"."""
+    with tf.Graph().as_default() as tf_graph:
+        tf.add(tf.placeholder(tf.int32, shape=[10, 20], name="input"), tf.constant(5, dtype=tf.int32), name="output")
+    return tf_graph
+
+@pytest.fixture
+def float32_input_model_add_op():
+    """TF graph: float32 placeholder "input" of shape [10, 20] plus the float32 constant 5.5, as op "output"."""
+    with tf.Graph().as_default() as tf_graph:
+        tf.add(tf.placeholder(tf.float32, shape=[10, 20], name="input"), tf.constant(5.5, dtype=tf.float32), name="output")
+    return tf_graph
+
+@pytest.fixture
+def float32_input_model_relu_ops():
+    """TF graph: float32 placeholder "input" of shape [10, 20] fed through two chained relus; the last is "output"."""
+    with tf.Graph().as_default() as tf_graph:
+        inp = tf.placeholder(tf.float32, shape=[10, 20], name="input")
+        tf.nn.relu(tf.nn.relu(inp), name="output")
+    return tf_graph
+
+@pytest.fixture
+def int64_input_model():
+    """TF graph: int64 placeholder "input" of shape [10, 20] plus the int64 constant 5, as op "output"."""
+    with tf.Graph().as_default() as tf_graph:
+        tf.add(tf.placeholder(tf.int64, shape=[10, 20], name="input"), tf.constant(5, dtype=tf.int64), name="output")
+    return tf_graph
+
+@pytest.fixture
+def float32_two_input_model():
+    """TF graph: sum of two float32 [10, 20] placeholders "input1" and "input2", as op "output"."""
+    with tf.Graph().as_default() as tf_graph:
+        lhs = tf.placeholder(tf.float32, shape=[10, 20], name="input1")
+        tf.add(lhs, tf.placeholder(tf.float32, shape=[10, 20], name="input2"), name="output")
+    return tf_graph
+
+@pytest.fixture
+def float32_two_output_model():  # TF graph with two named results: relu6(input) and relu(relu(input))
+    with tf.Graph().as_default() as tf_graph:
+        inp = tf.placeholder(tf.float32, shape=[10, 20], name="input")
+        rectified = tf.nn.relu(inp)
+        tf.nn.relu6(inp, name="output2")  # "output2" is created before "output1"
+        tf.nn.relu(rectified, name="output1")
+        return tf_graph
+
+@pytest.fixture
+def rank3_input_model():
+    """TF graph: float32 placeholder "input" of shape [1, 10, 20] plus the float32 constant 5, as op "output"."""
+    with tf.Graph().as_default() as tf_graph:
+        tf.add(tf.placeholder(tf.float32, shape=[1, 10, 20], name="input"), tf.constant(5, dtype=tf.float32), name="output")
+    return tf_graph
+
+@pytest.fixture
+def rank4_input_model():
+    """TF graph: float32 placeholder "input" of shape [1, 10, 20, 3] plus the float32 constant 5, as op "output"."""
+    with tf.Graph().as_default() as tf_graph:
+        tf.add(tf.placeholder(tf.float32, shape=[1, 10, 20, 3], name="input"), tf.constant(5, dtype=tf.float32), name="output")
+    return tf_graph
+
+@pytest.fixture
+def rank4_input_model_with_channel_first_output():
+    """TF graph: [1, 10, 20, 3] float32 "input" plus 5, transposed with perm [0, 3, 1, 2] into op "output"."""
+    with tf.Graph().as_default() as tf_graph:
+        inp = tf.placeholder(tf.float32, shape=[1, 10, 20, 3], name="input")
+        tf.transpose(tf.add(inp, tf.constant(5, dtype=tf.float32)), perm=[0, 3, 1, 2], name="output")
+    return tf_graph
+
+@pytest.fixture
+def rank4_grayscale_input_model():
+    """TF graph: float32 placeholder "input" of shape [1, 10, 20, 1] plus the float32 constant 5, as op "output"."""
+    with tf.Graph().as_default() as tf_graph:
+        tf.add(tf.placeholder(tf.float32, shape=[1, 10, 20, 1], name="input"), tf.constant(5, dtype=tf.float32), name="output")
+    return tf_graph
+
+@pytest.fixture
+def rank4_grayscale_input_model_with_channel_first_output():
+    """TF graph: [1, 10, 20, 1] float32 "input" plus 5, transposed with perm [0, 3, 1, 2] into op "output"."""
+    with tf.Graph().as_default() as tf_graph:
+        inp = tf.placeholder(tf.float32, shape=[1, 10, 20, 1], name="input")
+        tf.transpose(tf.add(inp, tf.constant(5, dtype=tf.float32)), perm=[0, 3, 1, 2], name="output")
+    return tf_graph
+
+@pytest.fixture
+def linear_model():
+    """TF graph matmul -> add -> relu on a [1, 2] float32 "input"; used to test the fuse_matmul_weight_bias pass."""
+    with tf.Graph().as_default() as tf_graph:
+        inp = tf.placeholder(tf.float32, shape=[1, 2], name="input")
+        product = tf.matmul(inp, tf.constant([1, 2], shape=(2, 4), dtype=tf.float32))
+        biased = tf.add(product, tf.constant([1, 2, 3, 4], shape=(4,), dtype=tf.float32))
+        tf.nn.relu(biased)
+    return tf_graph
+
+
+@pytest.mark.skipif(ct.utils._macos_version() < (13, 0), reason='Tests are for deployment target ios16/macos13')
+class TestInputOutputConversionAPI:
+
+    def test_input_dtype_inferred(self, int32_input_model):
+        # No `inputs` are passed, so the input dtype must be picked up from the TF graph (int32 here)
+        mlmodel = ct.convert(int32_input_model,
+                             minimum_deployment_target=ct.target.macOS12)
+        assert_input_dtype(mlmodel, expected_type_str="int32")
+        verify_prediction(mlmodel)
+
+    def test_unsupported_input_dtype_in_tf_graph(self, int64_input_model):
+        # No error should be raised when the user provides no dtype and the
+        # TF graph's input dtype (int64 here) is not supported.
+        # The input is then expected to be mapped to the closest supported dtype, int32.
+        mlmodel = ct.convert(int64_input_model,
+                             minimum_deployment_target=ct.target.macOS12)
+        assert_input_dtype(mlmodel, expected_type_str="int32")
+        verify_prediction(mlmodel)
+
+    def test_input_dtype_user_provided(self, int32_input_model):
+        # The dtype passed via `inputs` (fp32) overrides the TF graph's int32 input dtype; the output is checked to be fp32 too
+        mlmodel = ct.convert(int32_input_model,
+                             inputs=[ct.TensorType(dtype=np.float32)],
+                             minimum_deployment_target=ct.target.macOS12)
+        assert_input_dtype(mlmodel, expected_type_str="fp32")
+        assert_output_dtype(mlmodel, expected_type_str="fp32")
+        verify_prediction(mlmodel)
+
+    def test_invalid_input_dtype(self, int32_input_model):
+        # A TypeError is expected when the user asks for an input dtype that is not supported (int16 here)
+        with pytest.raises(TypeError,
+                           match="is unsupported for inputs/outputs of the model"
+                           ):
+            mlmodel = ct.convert(int32_input_model,
+                                 inputs=[ct.TensorType(dtype=np.int16)],
+                                 minimum_deployment_target=ct.target.macOS12)
+
+        with pytest.raises(TypeError,  # float16 inputs combined with a macOS12 target must be rejected as well
+                           match="float16 dtype for inputs is only supported for deployment target >= iOS16/macOS13"
+                           ):
+            mlmodel = ct.convert(int32_input_model,
+                                 inputs=[ct.TensorType(dtype=np.float16)],
+                                 minimum_deployment_target=ct.target.macOS12)
+
+    def test_fp16_input_dtype(self, float32_input_model_add_op, float32_input_model_relu_ops, int32_input_model):
+        """
+        Providing an fp16 input dtype works with macOS13 for the add, relu and int32 graphs; outputs stay fp32.
+        """
+        mlmodel = ct.convert(float32_input_model_add_op,
+                             inputs=[ct.TensorType(dtype=np.float16)],
+                             minimum_deployment_target=ct.target.macOS13
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["add", "cast"])
+        assert_input_dtype(mlmodel, expected_type_str="fp16")
+        assert_output_dtype(mlmodel, expected_type_str="fp32")
+        verify_prediction(mlmodel)
+
+        mlmodel = ct.convert(float32_input_model_relu_ops,
+                             inputs=[ct.TensorType(dtype=np.float16)],
+                             minimum_deployment_target=ct.target.macOS13
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["relu", "relu", "cast"])
+        assert_input_dtype(mlmodel, expected_type_str="fp16")
+        assert_output_dtype(mlmodel, expected_type_str="fp32")
+        verify_prediction(mlmodel)
+
+        mlmodel = ct.convert(int32_input_model,  # the graph's own input dtype is int32; fp16 is requested instead
+                             inputs=[ct.TensorType(dtype=np.float16)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["add", "cast"])
+        assert_input_dtype(mlmodel, expected_type_str="fp16")
+        assert_output_dtype(mlmodel, expected_type_str="fp32")
+        verify_prediction(mlmodel)
+
+    def test_fp16_input_dtype_fp32_precision(self, float32_input_model_add_op, float32_input_model_relu_ops,
+                                             int32_input_model):  # NOTE(review): int32_input_model is requested but never used here
+        """
+        Same conversions as test_fp16_input_dtype for the add and relu graphs, but with compute_precision=FLOAT32
+        """
+        mlmodel = ct.convert(float32_input_model_add_op,
+                             inputs=[ct.TensorType(dtype=np.float16)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             compute_precision=ct.precision.FLOAT32,
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "add"])
+        assert_input_dtype(mlmodel, expected_type_str="fp16")
+        assert_output_dtype(mlmodel, expected_type_str="fp32")
+        verify_prediction(mlmodel)
+
+        mlmodel = ct.convert(float32_input_model_relu_ops,
+                             inputs=[ct.TensorType(dtype=np.float16)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             compute_precision=ct.precision.FLOAT32,
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "relu", "relu"])
+        assert_input_dtype(mlmodel, expected_type_str="fp16")
+        assert_output_dtype(mlmodel, expected_type_str="fp32")  # NOTE(review): no verify_prediction for this case, unlike the first one
+
+    def test_two_input_model(self, float32_two_input_model):
+        # force only "input1" to int32; "input2" and the output are expected to stay fp32
+        mlmodel = ct.convert(float32_two_input_model,
+                             inputs=[ct.TensorType(name="input1", dtype=np.int32)],
+                             minimum_deployment_target=ct.target.macOS12)
+        assert_input_dtype(mlmodel, expected_type_str="int32", expected_name="input1")
+        assert_input_dtype(mlmodel, expected_type_str="fp32", expected_name="input2")
+        assert_output_dtype(mlmodel, expected_type_str="fp32")
+
+        # force both inputs to int32; the output is then expected to be int32 as well
+        mlmodel = ct.convert(float32_two_input_model,
+                             inputs=[ct.TensorType(name="input1", dtype=np.int32),
+                                     ct.TensorType(name="input2", dtype=np.int32),
+                                     ],
+                             minimum_deployment_target=ct.target.macOS12)
+        assert_input_dtype(mlmodel, expected_type_str="int32", expected_name="input1")
+        assert_input_dtype(mlmodel, expected_type_str="int32", expected_name="input2")
+        assert_output_dtype(mlmodel, expected_type_str="int32")
+
+        # with more than one input, a ValueError is expected if the TensorTypes carry no names
+        with pytest.raises(ValueError):
+            mlmodel = ct.convert(float32_two_input_model,
+                                 inputs=[ct.TensorType(dtype=np.int32),
+                                         ct.TensorType(dtype=np.int32),
+                                         ],
+                                 minimum_deployment_target=ct.target.macOS12)
+
+        # force both inputs to float16 (macOS13 target); the output stays fp32 and exactly one cast op is expected
+        mlmodel = ct.convert(float32_two_input_model,
+                             inputs=[ct.TensorType(name="input1", dtype=np.float16),
+                                     ct.TensorType(name="input2", dtype=np.float16),
+                                     ],
+                             minimum_deployment_target=ct.target.macOS13)
+        assert_input_dtype(mlmodel, expected_type_str="fp16", expected_name="input1")
+        assert_input_dtype(mlmodel, expected_type_str="fp16", expected_name="input2")
+        assert_output_dtype(mlmodel, expected_type_str="fp32")
+        assert_cast_ops_count(mlmodel, expected_count=1)
+        verify_prediction(mlmodel)
+
+    def test_single_output_model(self, int32_input_model, float32_input_model_relu_ops):
+        # without an `outputs` argument the output dtype follows the graph (int32) and no extra op is added
+        mlmodel = ct.convert(int32_input_model,
+                             minimum_deployment_target=ct.target.macOS12)
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["add"])
+        assert_output_dtype(mlmodel, expected_type_str="int32")
+
+        # an error must be raised when an output of unknown name is requested
+        with pytest.raises(Exception):
+            # the output name "z" does not exist in the model
+            mlmodel = ct.convert(int32_input_model,
+                                 outputs=["z"],
+                                 minimum_deployment_target=ct.target.macOS12)
+
+        # an error must be raised when two outputs are provided without names
+        with pytest.raises(ValueError, match=", does not have names"):
+            mlmodel = ct.convert(int32_input_model,
+                                 outputs=[ct.TensorType(dtype=np.float32), ct.TensorType(dtype=np.float32)],
+                                 minimum_deployment_target=ct.target.macOS12)
+
+        # an error must be raised when a shape is provided for the output
+        with pytest.raises(ValueError):
+            mlmodel = ct.convert(int32_input_model,
+                                 outputs=[ct.TensorType(dtype=np.float32, shape=(10, 20))],
+                                 minimum_deployment_target=ct.target.macOS12)
+
+        # the output dtype provided by the user (fp32) is applied during conversion; a cast op is expected after the add
+        mlmodel = ct.convert(int32_input_model,
+                             outputs=[ct.TensorType(dtype=np.float32)],
+                             minimum_deployment_target=ct.target.macOS12)
+        assert_output_dtype(mlmodel, expected_type_str="fp32", expected_name="Identity" if _HAS_TF_2 else "output")
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["add", "cast"])
+
+        # an output dtype of float16 must be rejected when the deployment target is too low (macOS12)
+        with pytest.raises(TypeError,
+                           match="float16 dtype for outputs is only supported for deployment target >= iOS16/macOS13"
+                           ):
+            ct.convert(float32_input_model_relu_ops,
+                       outputs=[ct.TensorType(dtype=np.float16)],
+                       minimum_deployment_target=ct.target.macOS12,
+                       )
+
+        # with a macOS13 target the float16 output dtype is applied; a single cast is expected ahead of the relus
+        mlmodel = ct.convert(float32_input_model_relu_ops,
+                             outputs=[ct.TensorType(dtype=np.float16)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_output_dtype(mlmodel, expected_type_str="fp16", expected_name="Identity" if _HAS_TF_2 else "output")
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "relu", "relu"])
+
+        # with both input and output set to float16, no cast op is expected at all
+        mlmodel = ct.convert(float32_input_model_relu_ops,
+                             inputs=[ct.TensorType(dtype=np.float16)],
+                             outputs=[ct.TensorType(dtype=np.float16)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_input_dtype(mlmodel, expected_type_str="fp16")
+        assert_output_dtype(mlmodel, expected_type_str="fp16", expected_name="Identity" if _HAS_TF_2 else "output")
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["relu", "relu"])
+        verify_prediction(mlmodel)
+
+    def test_multi_output_model(self, float32_two_output_model):
+        # a ValueError is expected when a single unnamed output type is given for the two-output graph
+        with pytest.raises(ValueError, match="please provide names for each of the outputs"):
+            mlmodel = ct.convert(float32_two_output_model,
+                                 outputs=[ct.TensorType(dtype=np.float16)],
+                                 minimum_deployment_target=ct.target.macOS13,
+                                 )
+
+        # the same ValueError is expected when several outputs are provided without names
+        with pytest.raises(ValueError, match="please provide names for each of the outputs"):
+            mlmodel = ct.convert(float32_two_output_model,
+                                 outputs=[ct.TensorType(dtype=np.float16), ct.TensorType(dtype=np.float32)],
+                                 minimum_deployment_target=ct.target.macOS13,
+                                 )
+
+        # set one output to float16 and the other to float32; the expected output names differ between TF1 and TF2
+        output1_name = "Identity" if _HAS_TF_2 else "output1"
+        output2_name = "Identity_1" if _HAS_TF_2 else "output2"
+        mlmodel = ct.convert(float32_two_output_model,
+                             inputs=[ct.TensorType(dtype=np.float16)],
+                             outputs=[ct.TensorType(name=output2_name, dtype=np.float16),
+                                      ct.TensorType(name=output1_name, dtype=np.float32)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_cast_ops_count(mlmodel, expected_count=1)
+        assert_output_dtype(mlmodel, expected_type_str="fp16", expected_name=output2_name, index=0)
+        assert_output_dtype(mlmodel, expected_type_str="fp32", expected_name=output1_name, index=1)
+        assert_input_dtype(mlmodel, expected_type_str="fp16")
+        verify_prediction(mlmodel)
+
+        # only one of the two graph outputs is requested here, so only that single output will be selected
+        mlmodel = ct.convert(float32_two_output_model,
+                             inputs=[ct.TensorType(dtype=np.float16)],
+                             outputs=[ct.TensorType(name=output2_name, dtype=np.float16)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_cast_ops_count(mlmodel, expected_count=0)
+        assert_output_dtype(mlmodel, expected_type_str="fp16", expected_name=output2_name, index=0)
+        assert_input_dtype(mlmodel, expected_type_str="fp16")
+        verify_prediction(mlmodel)
+
+    def test_color_input(self, rank4_input_model, rank3_input_model):  # RGB ImageType input: rank-4 graph converts, rank-3 graph is rejected
+        mlmodel = ct.convert(rank4_input_model,
+                             inputs=[ct.ImageType(color_layout=ct.colorlayout.RGB)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "transpose", "add", "cast"])
+        assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.RGB)
+        assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp32")
+        assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp32")
+        verify_prediction(mlmodel)
+
+        with pytest.raises(ValueError, match="must have rank 4"):  # the rank-3 graph input cannot be an image input
+            mlmodel = ct.convert(rank3_input_model,
+                                 inputs=[ct.ImageType(color_layout=ct.colorlayout.RGB)],
+                                 minimum_deployment_target=ct.target.macOS12,
+                                 )
+
+    def test_grayscale_input(self, rank4_input_model, rank3_input_model, rank4_grayscale_input_model):
+        with pytest.raises(ValueError, match="must have rank 4"):  # the rank-3 graph input cannot be an image input
+            mlmodel = ct.convert(rank3_input_model,
+                                 inputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)],
+                                 minimum_deployment_target=ct.target.macOS13,
+                                 )
+
+        # invalid shape: the [1, 10, 20, 3] input of rank4_input_model is used with a GRAYSCALE layout
+        with pytest.raises(ValueError):
+            mlmodel = ct.convert(rank4_input_model,
+                                 inputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)],
+                                 minimum_deployment_target=ct.target.macOS13,
+                                 )
+
+        mlmodel = ct.convert(rank4_grayscale_input_model,
+                             inputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "transpose", "add", "cast"])
+        assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE)
+        assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp32")
+        assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp32")
+        verify_prediction(mlmodel)
+
+        with pytest.raises(TypeError, match="float16 dtype for inputs is only supported for deployment target >= iOS16/macOS13"):
+            mlmodel = ct.convert(rank4_grayscale_input_model,
+                                 inputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)],
+                                 minimum_deployment_target=ct.target.macOS12,
+                                 )
+
+        # GRAYSCALE_FLOAT16 must also be rejected when no deployment target is given (neural network conversion, per the original note)
+        with pytest.raises(TypeError, match="float16 dtype for inputs is only supported for deployment target >= iOS16/macOS13"):
+            mlmodel = ct.convert(rank4_grayscale_input_model,
+                                 inputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)],
+                                 )
+
+        mlmodel = ct.convert(rank4_grayscale_input_model,
+                             inputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)],
+                             outputs=[ct.TensorType(dtype=np.float16)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["transpose", "add"])
+        assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE_FLOAT16)
+        assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp16")
+        assert_output_dtype(mlmodel, expected_type_str="fp16")
+        # TODO: re-enable the prediction check below once rdar://92239179 is fixed
+        # verify_prediction(mlmodel)
+
+    def test_color_output(self, rank4_input_model, rank4_input_model_with_channel_first_output):
+        # an error must be raised if the output shape is not of the form (1, 3, H, W); rank4_input_model has no transpose
+        with pytest.raises(ValueError, match="Shape of the RGB/BGR image output,"):
+            mlmodel = ct.convert(rank4_input_model,
+                                 inputs=[ct.ImageType(color_layout=ct.colorlayout.RGB)],
+                                 outputs=[ct.ImageType(color_layout=ct.colorlayout.RGB)],
+                                 minimum_deployment_target=ct.target.macOS13,
+                                 )
+
+        mlmodel = ct.convert(rank4_input_model_with_channel_first_output,  # BGR image in, RGB image out
+                             inputs=[ct.ImageType(color_layout=ct.colorlayout.BGR)],
+                             outputs=[ct.ImageType(color_layout=ct.colorlayout.RGB)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "add", "cast"])
+        assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.BGR)
+        assert_spec_output_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.RGB)
+        assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp32")
+        assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp32")
+        verify_prediction(mlmodel)
+
+        # check neural network conversion (no minimum_deployment_target is passed); only the add op is expected
+        mlmodel = ct.convert(rank4_input_model_with_channel_first_output,
+                             inputs=[ct.ImageType(color_layout=ct.colorlayout.RGB)],
+                             outputs=[ct.ImageType(color_layout=ct.colorlayout.BGR)],
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["add"])
+        assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.RGB)
+        assert_spec_output_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.BGR)
+        verify_prediction(mlmodel)
+
+    def test_grayscale_output(self, rank4_grayscale_input_model, rank4_grayscale_input_model_with_channel_first_output):
+        # an error must be raised if the output shape is not of the form (1, 1, H, W); this graph has no transpose
+        with pytest.raises(ValueError, match="Shape of the Grayscale image output,"):
+            mlmodel = ct.convert(rank4_grayscale_input_model,
+                                 inputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)],
+                                 outputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)],
+                                 )
+
+        with pytest.raises(TypeError, match="float16 dtype for outputs is only supported for deployment target >= iOS16/macOS13"):
+            mlmodel = ct.convert(rank4_grayscale_input_model_with_channel_first_output,
+                                 outputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)],
+                                 minimum_deployment_target=ct.target.macOS12,
+                                 )
+
+        mlmodel = ct.convert(rank4_grayscale_input_model_with_channel_first_output,  # no deployment target given
+                             inputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)],
+                             outputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)],
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["add"])
+        assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE)
+        assert_spec_output_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE)
+        verify_prediction(mlmodel)
+
+        mlmodel = ct.convert(rank4_grayscale_input_model_with_channel_first_output,  # fp16 image in and out: no cast expected
+                             inputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)],
+                             outputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_cast_ops_count(mlmodel, expected_count=0)
+        assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE_FLOAT16)
+        assert_spec_output_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE_FLOAT16)
+        assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp16")
+        assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp16")
+        # TODO: re-enable the prediction check below once rdar://92239179 is fixed
+        # verify_prediction(mlmodel)
+
+        mlmodel = ct.convert(rank4_grayscale_input_model_with_channel_first_output,  # fp32 image in, fp16 image out
+                             inputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)],
+                             outputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "add"])
+        assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE)
+        assert_spec_output_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE_FLOAT16)
+        assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp32")
+        assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp16")
+        # TODO: re-enable the prediction check below once rdar://92239179 is fixed
+        # verify_prediction(mlmodel)
+
+
+    def test_linear_model(self, linear_model):
+        # exercises the fuse_matmul_weight_bias pass with float16 inputs: matmul + add should end up as one "linear" op
+        mlmodel = ct.convert(linear_model,
+                             inputs=[ct.TensorType(dtype=np.float16)],
+                             outputs=[ct.TensorType(dtype=np.float16)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_input_dtype(mlmodel, expected_type_str="fp16")
+        assert_output_dtype(mlmodel, expected_type_str="fp16")
+        assert_ops_in_mil_program(mlmodel, ["linear", "relu"])
+        verify_prediction(mlmodel)
diff --git a/coremltools/converters/mil/frontend/tensorflow/test/test_custom_ops.py b/coremltools/converters/mil/frontend/tensorflow/test/test_custom_ops.py
index 22b3af499..bb7b13476 100644
--- a/coremltools/converters/mil/frontend/tensorflow/test/test_custom_ops.py
+++ b/coremltools/converters/mil/frontend/tensorflow/test/test_custom_ops.py
@@ -86,7 +86,6 @@ def type_inference(self):
             # For illustration purpose, assumming getting valid shape
             # Ideally, should consider transpose_?, ?_is_sparse parameters into consideration
             # for computing output shape
-            ret_shape = [x_shape[0], y_shape[1]]
             return types.tensor(x_type, [x_shape[0], y_shape[1]])
 
     # TensorFlow Sparse Matmul Op
diff --git a/coremltools/converters/mil/frontend/tensorflow/test/test_graphs.py b/coremltools/converters/mil/frontend/tensorflow/test/test_graphs.py
index 3edbc0fbd..9923fc4a2 100644
--- a/coremltools/converters/mil/frontend/tensorflow/test/test_graphs.py
+++ b/coremltools/converters/mil/frontend/tensorflow/test/test_graphs.py
@@ -41,7 +41,7 @@ def build_model(input):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
diff --git a/coremltools/converters/mil/frontend/tensorflow/test/test_load.py b/coremltools/converters/mil/frontend/tensorflow/test/test_load.py
index 6bd87c5a1..445455f60 100644
--- a/coremltools/converters/mil/frontend/tensorflow/test/test_load.py
+++ b/coremltools/converters/mil/frontend/tensorflow/test/test_load.py
@@ -3,7 +3,6 @@
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
-
 import os
 import tempfile
 import shutil
@@ -12,8 +11,8 @@
 import pytest
 
 import coremltools as ct
-import coremltools.converters as converter
 from coremltools._deps import _IS_MACOS
+import coremltools.converters as converter
 import coremltools.proto.FeatureTypes_pb2 as ft
 from coremltools import TensorType, ImageType, RangeDim, EnumeratedShapes
 from coremltools.converters.mil.testing_utils import random_gen
@@ -151,11 +150,11 @@ def build_model(x):
 
         with pytest.raises(ValueError) as e:
             converter.convert(model, minimum_deployment_target=target)
-        e.match(
-            r"Provided minimum deployment target requires model to be of version 4 but converted model "
-            r"uses following features which are available from version 5 onwards. "
-            r"Please use a higher minimum deployment target to convert. \n    1. Cumsum operation\n"
-        )
+        e.match(  # kept outside the `with` body: statements after the raising call inside it never execute
+            r"Provided minimum deployment target requires model to be of version 4 but converted model "
+            r"uses following features which are available from version 5 onwards. "
+            r"Please use a higher minimum deployment target to convert. \n    1. Cumsum operation\n"
+        )
 
     @pytest.mark.parametrize(
         "target",
@@ -261,7 +260,7 @@ def build_flexible_model(x):
             np.allclose(ret[output_name], np.maximum(input_values[0], 0.0))
 
         if _IS_MACOS:
-            with pytest.raises(RuntimeError) as e:
+            with pytest.raises(RuntimeError):
                 input_values = [random_gen((5, 4, 5), -10.0, 10.0)]
                 input_dict = {input_name: input_values[0]}
                 ret = mlmodel.predict(input_dict)
@@ -323,18 +322,18 @@ def test_graph_def(self):
         with tf.Graph().as_default() as graph:
             x = tf.placeholder(tf.float32, shape=(3, 4, 5))
             out = tf.nn.relu(x)
-        mlmodel = converter.convert(
-            graph, inputs=[TensorType(x.op.name, (3, 4, 5))], outputs=[out.op.name]
-        )
-        assert mlmodel is not None
+            mlmodel = converter.convert(
+                graph, inputs=[TensorType(x.op.name, (3, 4, 5))], outputs=[out.op.name]
+            )
+            assert mlmodel is not None
 
     def test_graph_def_file(self):
         with tf.Graph().as_default() as graph:
             x = tf.placeholder(tf.float32, shape=(3, 4, 5))
             out = tf.nn.relu(x)
-        tf.io.write_graph(
-            graph, self.saved_model_dir, self.model_path_pb, as_text=False
-        )
+            tf.io.write_graph(
+                graph, self.saved_model_dir, self.model_path_pb, as_text=False
+            )
         mlmodel = converter.convert(
             self.model_path_pb,
             inputs=[TensorType(x.op.name, (3, 4, 5))],
@@ -385,18 +384,18 @@ def test_model_metadata(self):
         with tf.Graph().as_default() as graph:
             x = tf.placeholder(tf.float32, shape=(3, 4, 5))
             out = tf.nn.relu(x)
-        mlmodel = converter.convert(
-            graph, inputs=[TensorType(x.op.name, (3, 4, 5))], outputs=[out.op.name]
-        )
-        metadata_keys = mlmodel.get_spec().description.metadata.userDefined
-        assert "com.github.apple.coremltools.version" in metadata_keys
-        assert "com.github.apple.coremltools.source" in metadata_keys
-        assert "tensorflow==1." in metadata_keys["com.github.apple.coremltools.source"]
+            mlmodel = converter.convert(
+                graph, inputs=[TensorType(x.op.name, (3, 4, 5))], outputs=[out.op.name]
+            )
+            metadata_keys = mlmodel.get_spec().description.metadata.userDefined
+            assert "com.github.apple.coremltools.version" in metadata_keys
+            assert "com.github.apple.coremltools.source" in metadata_keys
+            assert "tensorflow==1." in metadata_keys["com.github.apple.coremltools.source"]
 
     def test_invalid_format_none(self):
         with pytest.raises(NotImplementedError) as e:
             converter.convert(None, source="tensorflow")
-        e.match(r"Expected model format: .* .pb")
+            e.match(r"Expected model format: .* .pb")
 
     def test_invalid_format_invalid_extension(self):
         _, invalid_filename = tempfile.mkstemp(
@@ -404,35 +403,35 @@ def test_invalid_format_invalid_extension(self):
         )
         with pytest.raises(NotImplementedError) as e:
             converter.convert(invalid_filename, source="tensorflow")
-        e.match(r"Expected model format: .* .pb")
+            e.match(r"Expected model format: .* .pb")
 
     def test_invalid_converter_source(self):
         with pytest.raises(ValueError) as e:
             converter.convert(None, source="invalid")
-        expected_msg = r'Unrecognized value of argument "source": .*'
-        e.match(expected_msg)
+            expected_msg = r'Unrecognized value of argument "source": .*'
+            e.match(expected_msg)
 
     def test_invalid_converter_minimum_deployment_flag(self):
         with pytest.raises(TypeError) as e:
             converter.convert(
                 None, source="tensorflow", minimum_deployment_target="iOs14"
             )
-        expected_msg = (
-            "Unrecognized value of argument 'minimum_deployment_target': iOs14. "
-            "It needs to be a member of 'coremltools.target' enumeration"
-        )
+            expected_msg = (
+                "Unrecognized value of argument 'minimum_deployment_target': iOs14. "
+                "It needs to be a member of 'coremltools.target' enumeration"
+            )
 
-        e.match(expected_msg)
+            e.match(expected_msg)
 
     def test_invalid_converter_target(self):
         with tf.Graph().as_default() as graph:
             x = tf.placeholder(tf.float32, shape=(3, 4, 5))
         with pytest.raises(NotImplementedError) as e:
             converter.convert(graph, convert_to="invalid", source="tensorflow")
-        e.match(r"Backend converter .* not implemented")
+            e.match(r"Backend converter .* not implemented")
 
     def test_invalid_format_non_exist(self):
         non_exist_filename = self.model_path_pb.replace(".pb", "_non_exist.pb")
         with pytest.raises(ValueError) as e:
             converter.convert(non_exist_filename, source="tensorflow")
-        e.match(r"Input model .* does not exist")
+            e.match(r"Input model .* does not exist")
diff --git a/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py b/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py
index e0727ff06..b16441aaa 100644
--- a/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py
+++ b/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py
@@ -4,13 +4,15 @@
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
 import itertools
-import numpy as np
 import math
 import os
-import pytest
 import shutil
 import tempfile
 
+import numpy as np
+import pytest
+
+from coremltools import TensorType, RangeDim
 from coremltools.converters.mil import testing_reqs
 from coremltools.converters.mil.testing_utils import random_gen
 from coremltools.converters.mil.frontend.tensorflow.test.testing_utils import (
@@ -64,7 +66,7 @@ def build_model(x, warp):
         ]
         input_dict = dict(zip(inputs, input_values))
         self.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend,
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend,
         )
 
 
@@ -94,7 +96,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -117,7 +119,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -140,7 +142,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -166,7 +168,7 @@ def build_model(x, y):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -196,7 +198,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -224,7 +226,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -250,7 +252,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -279,7 +281,7 @@ def build_model(*inputs):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -330,7 +332,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -356,7 +358,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -382,7 +384,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -414,7 +416,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -442,7 +444,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -468,7 +470,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -499,7 +501,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -525,7 +527,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -551,7 +553,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -577,7 +579,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -622,7 +624,7 @@ def build_model_select(cond, a, b):
         model, inputs, outputs = build_model_select
         inputs_dic = dict(zip(inputs, [cond_val, a_val, b_val]))
         TensorFlowBaseTest.run_compare_tf(
-            model, inputs_dic, outputs, use_cpu_only=use_cpu_for_conversion, backend=backend,
+            model, inputs_dic, outputs, backend=backend,
             use_cpu_for_conversion=use_cpu_for_conversion,
         )
 
@@ -641,7 +643,7 @@ def test_where_1_input(self, use_cpu_only, backend, rank):
                 graph,
                 {x: x_val},
                 tf.where(x),
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 backend=backend,
             )
 
@@ -663,7 +665,7 @@ def test_where(self, use_cpu_only, backend, rank):
                 graph,
                 {cond: cond_val, a: a_val, b: b_val},
                 ref,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 backend=backend,
             )
 
@@ -689,10 +691,25 @@ def build_model(x):
             return y
 
         model, inputs, outputs = build_model
-        input_values = [random_gen(shape, -100, 100)]
+        min_range, max_range = -100, 100
+        input_values = [random_gen(shape, min_range, max_range)]
+
+        # When running on GPU with the neuralnetwork backend, which uses FP16 precision,
+        # make sure no input value lies too close to its ceiling or floor;
+        # for instance, 24.993 or -13.985 is rejected and re-sampled.
+        if not use_cpu_only and dtype == "int32":
+            TOR_THRESHOLD = 0.03
+            value = input_values[0].flatten()
+            for i, v in enumerate(value):
+                while abs(math.ceil(v) - v) < TOR_THRESHOLD or abs(math.floor(v) - v) < TOR_THRESHOLD:
+                    v = random_gen((1,), min_range, max_range)[0]
+                value[i] = v
+            value = np.reshape(value, shape)
+            input_values = [value]
+
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(model, input_dict, outputs,
-                       use_cpu_only=use_cpu_only,
+                       use_cpu_for_conversion=use_cpu_only,
                        backend=backend)
 
 
@@ -715,16 +732,13 @@ def build_model(x, y):
         input_dict = dict(zip(inputs, input_values))
 
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
         "use_cpu_for_conversion, backend", itertools.product([True, False], backends,)
     )
     def test_cond(self, use_cpu_for_conversion, backend):
-        if backend[0] == "mlprogram" and not use_cpu_for_conversion:
-            pytest.xfail("rdar://77441664")
-
         @make_tf_graph([(1,), (1,)])
         def build_model(x, y):
             z = tf.multiply(x, y)
@@ -738,17 +752,13 @@ def build_model(x, y):
         ]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_for_conversion, backend=backend,
-            use_cpu_for_conversion=use_cpu_for_conversion,
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_for_conversion, backend=backend,
         )
 
     @pytest.mark.parametrize(
         "use_cpu_for_conversion, backend", itertools.product([True, False], backends,)
     )
     def test_cond_multi_returns(self, use_cpu_for_conversion, backend):
-        if backend[0] == "mlprogram" and not use_cpu_for_conversion:
-            pytest.xfail("rdar://77441664")
-
         @make_tf_graph([(1,), (1,)])
         def build_model(x, y):
             z = tf.multiply(x, y)
@@ -769,17 +779,13 @@ def false_fn():
         ]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_for_conversion, backend=backend,
-            use_cpu_for_conversion=use_cpu_for_conversion,
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_for_conversion, backend=backend,
         )
 
     @pytest.mark.parametrize(
         "use_cpu_for_conversion, backend", itertools.product([True, False], backends,)
     )
     def test_cond_with_identity(self, use_cpu_for_conversion, backend):
-        if backend[0] == "mlprogram" and not use_cpu_for_conversion:
-            pytest.xfail("rdar://77441664")
-
         @make_tf_graph([(1,), (1,)])
         def build_model(x, y):
             z = tf.multiply(x, y)
@@ -793,17 +799,13 @@ def build_model(x, y):
         ]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_for_conversion, backend=backend,
-            use_cpu_for_conversion=use_cpu_for_conversion,
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_for_conversion, backend=backend,
         )
 
     @pytest.mark.parametrize(
         "use_cpu_for_conversion, backend", itertools.product([True, False], backends,)
     )
     def test_cond_multi_returns_with_identity(self, use_cpu_for_conversion, backend):
-        if backend[0] == "mlprogram" and not use_cpu_for_conversion:
-            pytest.xfail("rdar://77441664")
-
         @make_tf_graph([(1,), (1,)])
         def build_model(x, y):
             z = tf.multiply(x, y)
@@ -824,17 +826,13 @@ def false_fn():
         ]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_for_conversion, backend=backend,
-            use_cpu_for_conversion=use_cpu_for_conversion,
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_for_conversion, backend=backend
         )
 
     @pytest.mark.parametrize(
         "use_cpu_for_conversion, backend", itertools.product([True, False], backends,)
     )
     def test_cond_nested_0(self, use_cpu_for_conversion, backend):
-        if backend[0] == "mlprogram" and not use_cpu_for_conversion:
-            pytest.xfail("rdar://77441664")
-
         if backend == ("mlprogram", "fp16"):
             pytest.xfail("rdar://80660074 (Cond mlprogram FP16 tests falling in TF1 converter with numerical errors)")
 
@@ -856,17 +854,13 @@ def build_model(x, y):
         ]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_for_conversion, backend=backend,
-            use_cpu_for_conversion=use_cpu_for_conversion,
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_for_conversion, backend=backend,
         )
 
     @pytest.mark.parametrize(
         "use_cpu_for_conversion, backend", itertools.product([True, False], backends,)
     )
     def test_cond_nested_1(self, use_cpu_for_conversion, backend):
-        if backend[0] == "mlprogram" and not use_cpu_for_conversion:
-            pytest.xfail("rdar://77441664")
-
         if backend == ("mlprogram", "fp16"):
             pytest.xfail("rdar://80660074 (Cond mlprogram FP16 tests falling in TF1 converter with numerical errors)")
 
@@ -888,26 +882,26 @@ def build_model(x, y):
         ]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_for_conversion, backend=backend,
-            use_cpu_for_conversion=use_cpu_for_conversion,
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_for_conversion, backend=backend,
         )
 
 
 class TestWhileLoop(TensorFlowBaseTest):
     @pytest.mark.parametrize(
-        "use_cpu_only, backend", itertools.product([True, False], backends))
+        "use_cpu_only, backend", itertools.product([True, False], backends)
+    )
     def test_while_loop_with_changing_shape(self, use_cpu_only, backend):
-        @make_tf_graph([(2,1),(2,1)])
-        def build_model(x,y):
-            c = lambda i,j: tf.less(tf.shape(j)[1], 5)
-            b = lambda i,j: (i, tf.concat([i,j], axis=1))
-            return tf.while_loop(c, b, [x,y], shape_invariants=[x.get_shape(), tf.TensorShape([2, None])])
+        @make_tf_graph([(2, 1), (2, 1)])
+        def build_model(x, y):
+            c = lambda i, j: tf.less(tf.shape(j)[1], 5)
+            b = lambda i, j: (i, tf.concat([i, j], axis=1))
+            return tf.while_loop(c, b, [x, y], shape_invariants=[x.get_shape(), tf.TensorShape([2, None])])
 
         model, inputs, outputs = build_model
-        input_values = [np.array([[1],[2]], dtype=np.float32),np.array([[1],[2]], dtype=np.float32)]
+        input_values = [np.array([[1], [2]], dtype=np.float32), np.array([[1], [2]], dtype=np.float32)]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(model, input_dict, outputs,
-                                          use_cpu_only=use_cpu_only,
+                                          use_cpu_for_conversion=use_cpu_only,
                                           backend=backend)
 
     @pytest.mark.parametrize(
@@ -924,7 +918,7 @@ def build_model(x):
         input_values = [np.array([5], dtype=np.float32)]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
@@ -941,7 +935,7 @@ def build_model(x):
         input_values = [np.array([10], dtype=np.float32)]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
@@ -962,7 +956,7 @@ def build_model(x, y):
         input_dict = dict(zip(inputs, input_values))
 
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
@@ -982,7 +976,7 @@ def build_model(x, y):
         ]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
@@ -1005,7 +999,7 @@ def build_model(x, y, z):
         ]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
@@ -1029,7 +1023,7 @@ def build_model(x, y, z, m):
         ]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
@@ -1068,7 +1062,7 @@ def body1(i, j):
         ]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
@@ -1111,7 +1105,7 @@ def body1(i, j):
         ]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
 
@@ -1246,7 +1240,7 @@ def build_model_static_weights(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -1321,7 +1315,7 @@ def build_model_static_weights(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
             frontend_only=False,
             atol=1e-03,  # default 1e-04
@@ -1406,14 +1400,13 @@ def build_model_static_weights(x):
             input_values = [(np.random.rand(*input_shape).astype(np.float32))]
             input_dict = dict(zip(inputs, input_values))
 
-            proto,_,_,_,_,_ = TensorFlowBaseTest.run_compare_tf(
+            proto, _, _, _, _, _ = TensorFlowBaseTest.run_compare_tf(
                 model,
                 input_dict,
                 outputs,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 backend=backend,
                 frontend_only=False,
-                use_cpu_for_conversion=use_cpu_for_conversion,
             )
 
             if backend[0] == 'nnv1_proto':
@@ -1443,14 +1436,13 @@ def build_model_dynamic_weights(x, W):
                 model,
                 input_dict,
                 outputs,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 backend=backend,
                 frontend_only=False,
-                use_cpu_for_conversion=use_cpu_for_conversion,
             )
 
         if backend[0] == "neuralnetwork" and dynamic_weights:
-            pytest.xfail("dynamic conv with groups > 1 is not supported on the neuralnetwork backend")
+             pytest.xfail("dynamic conv with groups > 1 is not supported on the neuralnetwork backend")
 
         # We do not support dynamic weight when dilations != 1.
         test_dynamic_W() if dynamic_weights and dilations == (1, 1) else test_static_W()
@@ -1537,10 +1529,9 @@ def build_model_dynamic_weights(x, depthwise_filter, pointwise_filter):
                 model,
                 input_dict,
                 outputs,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 backend=backend,
                 frontend_only=False,
-                use_cpu_for_conversion=use_cpu_for_conversion,
             )
 
         def test_static_W():
@@ -1572,10 +1563,9 @@ def build_model_static_weights(x):
                 model,
                 input_dict,
                 outputs,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 backend=backend,
                 frontend_only=False,
-                use_cpu_for_conversion=use_cpu_for_conversion,
             )
 
         test_static_W()
@@ -1706,7 +1696,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
             frontend_only=False,
         )
@@ -1818,7 +1808,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
             frontend_only=False,
         )
@@ -1902,7 +1892,7 @@ def build_model(x, y):
         input_values = [x_val, y_val]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
@@ -1963,8 +1953,7 @@ def build_model(x, y):
         ]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend,
-            use_cpu_for_conversion=use_cpu_for_conversion,
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend,
         )
 
     @pytest.mark.parametrize(
@@ -2019,8 +2008,7 @@ def build_model(x, y):
         ]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend,
-            use_cpu_for_conversion=use_cpu_for_conversion,
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend,
         )
 
 class TestEinsum(TensorFlowBaseTest):
@@ -2082,8 +2070,7 @@ def build_model(x, y):
 
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_for_conversion, backend=backend,
-            use_cpu_for_conversion=use_cpu_for_conversion,
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_for_conversion, backend=backend,
         )
 
 
@@ -2288,11 +2275,10 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=True,
+            use_cpu_for_conversion=True,
             backend=backend,
             atol=atol,
             rtol=rtol,
-            use_cpu_for_conversion=use_cpu_for_conversion,
         )
 
 
@@ -2336,7 +2322,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -2374,7 +2360,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )[0]
         # also check if the scale factor are integers
@@ -2439,9 +2425,8 @@ def build_model(x):
                 model,
                 input_dict,
                 outputs,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 backend=backend,
-                use_cpu_for_conversion=use_cpu_for_conversion,
             )
 
         def test_dynamic():
@@ -2461,9 +2446,8 @@ def build_model(x, boxes_pl, box_indices_pl):
                 model,
                 input_dict,
                 outputs,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 backend=backend,
-                use_cpu_for_conversion=use_cpu_for_conversion,
             )
 
         test_dynamic() if dynamic else test_static()
@@ -2513,7 +2497,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -2566,8 +2550,8 @@ def build_model(x):
 
         input_dict = dict(zip(inputs, input_values))
 
-        proto,_,_,_,_,_ = TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+        proto, _, _, _, _, _ = TensorFlowBaseTest.run_compare_tf(
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
         for layer in proto.neuralNetwork.layers:
@@ -2622,7 +2606,7 @@ def build_model(x, m, v, o, s):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
             atol=.2,
             rtol=1e-4,
@@ -2707,7 +2691,7 @@ def build_model(x, m, v, b):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
             atol=0.2,
             rtol=1e-4,
@@ -2751,7 +2735,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
             atol=1e-2,
             rtol=1e-3,
@@ -2785,7 +2769,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
             atol=0.05,
             rtol=1e-4,
@@ -2820,7 +2804,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
             atol=1e-2,
             rtol=1e-3,
@@ -2848,7 +2832,7 @@ def build_model(x):
         input_dict = dict(zip(inputs, input_values))
 
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
@@ -2871,7 +2855,7 @@ def build_model(x):
         input_dict = dict(zip(inputs, input_values))
 
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
 
@@ -2900,7 +2884,7 @@ def build_model(x):
         input_dict = dict(zip(inputs, input_values))
 
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
@@ -2927,7 +2911,7 @@ def build_model(x):
         input_dict = dict(zip(inputs, input_values))
 
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
 
@@ -2959,7 +2943,7 @@ def build_model(x):
         input_dict = dict(zip(inputs, input_values))
 
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
@@ -2989,7 +2973,7 @@ def build_model(x):
         input_dict = dict(zip(inputs, input_values))
 
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
 
@@ -3012,7 +2996,7 @@ def build_model(x):
         input_dict = dict(zip(inputs, input_value))
 
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
 
@@ -3047,7 +3031,7 @@ def test_random_binomial(self, use_cpu_only, backend, size, rank, constant):
                 graph,
                 {x: np.random.rand(*shape)},
                 ref,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 backend=backend,
             )
 
@@ -3065,7 +3049,7 @@ def test_random_categorical(self, use_cpu_only, backend, size):
                 graph,
                 {x: np.random.rand(*shape)},
                 ref,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 validate_shapes_only=True,
                 backend=backend,
             )
@@ -3100,7 +3084,7 @@ def test_random_normal(self, use_cpu_only, backend, mean, rank, constant):
                 graph,
                 {x: np.random.rand(*shape)},
                 ref,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 backend=backend,
             )
 
@@ -3137,7 +3121,7 @@ def test_keras_random_normal(self, use_cpu_only, backend, mean, rank, constant):
                 graph,
                 {x: np.random.rand(*shape)},
                 ref,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 backend=backend,
             )
 
@@ -3172,7 +3156,7 @@ def test_random_uniform(self, use_cpu_only, backend, low, high, rank, constant):
                 graph,
                 {x: np.random.rand(*shape)},
                 ref,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 backend=backend,
             )
 
@@ -3213,7 +3197,7 @@ def test_keras_random_uniform(
                 graph,
                 {x: np.random.rand(*shape)},
                 ref,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 backend=backend,
             )
 
@@ -3287,9 +3271,8 @@ def build_model(x):
                 model,
                 input_dict,
                 outputs,
-                use_cpu_only=use_cpu_for_conversion,
-                backend=backend,
                 use_cpu_for_conversion=use_cpu_for_conversion,
+                backend=backend,
             )
 
         def test_tf_argmin():
@@ -3304,9 +3287,8 @@ def build_model(x):
                 model,
                 input_dict,
                 outputs,
-                use_cpu_only=use_cpu_for_conversion,
-                backend=backend,
                 use_cpu_for_conversion=use_cpu_for_conversion,
+                backend=backend,
             )
 
         def test_tf_reduction():
@@ -3340,9 +3322,8 @@ def build_model(x):
                 model,
                 input_dict,
                 outputs,
-                use_cpu_only=use_cpu_for_conversion,
-                backend=backend,
                 use_cpu_for_conversion=use_cpu_for_conversion,
+                backend=backend,
             )
 
         if tf_op in {tf.math.argmax}:
@@ -3401,7 +3382,7 @@ def build_model(x, indices):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -3455,7 +3436,7 @@ def build_model(x, indices):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -3497,7 +3478,7 @@ def build_model(indices, updates, shape):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -3638,7 +3619,7 @@ def build_model(x, begin, end):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -3704,7 +3685,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -3741,7 +3722,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -3768,7 +3749,7 @@ def build_model(x, y):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -3813,7 +3794,7 @@ def build_model(*args):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -3880,7 +3861,7 @@ def build_model_dynamic_size(x, begin, size):
                     model,
                     input_dict,
                     outputs,
-                    use_cpu_only=use_cpu_only,
+                    use_cpu_for_conversion=use_cpu_only,
                     frontend_only=False,
                     backend=backend,
                 )
@@ -3922,7 +3903,7 @@ def build_model_dynamic_size(x, begin, size):
                 model,
                 input_dict,
                 outputs,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 frontend_only=False,
                 backend=backend,
             )
@@ -3970,7 +3951,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -4000,7 +3981,7 @@ def build_model(x):
             model,
             {inputs[0]: random_gen(shape, rand_min=-100, rand_max=100)},
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -4030,7 +4011,7 @@ def build_model(x):
             TensorFlowBaseTest.run_compare_tf(model,
                            input_dict,
                            outputs,
-                           use_cpu_only=use_cpu_only,
+                           use_cpu_for_conversion=use_cpu_only,
                            frontend_only=False,
                            backend=backend)
 
@@ -4135,7 +4116,7 @@ def model(x):
             graph,
             input_dict,
             ["quantize/quantized_model/conv2d/Conv2D"],
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
             tf_outputs=tf_outs,
@@ -4167,7 +4148,7 @@ def build_model(x):
             TensorFlowBaseTest.run_compare_tf(model,
                            input_dict,
                            outputs,
-                           use_cpu_only=use_cpu_only,
+                           use_cpu_for_conversion=use_cpu_only,
                            frontend_only=False,
                            backend=backend)
 
@@ -4184,7 +4165,7 @@ def build_model(x):
             TensorFlowBaseTest.run_compare_tf(model,
                            input_dict,
                            outputs,
-                           use_cpu_only=use_cpu_only,
+                           use_cpu_for_conversion=use_cpu_only,
                            frontend_only=False,
                            backend=backend)
 
@@ -4201,6 +4182,7 @@ class TestNonMaximumSuppression(TensorFlowBaseTest):
                 "max_boxes",
                 "iou_threshold",
                 "score_threshold",
+                "use_V5",
             ]
         ),
         itertools.product(
@@ -4210,6 +4192,7 @@ class TestNonMaximumSuppression(TensorFlowBaseTest):
             [5, 20, 100],
             [1.0, 0.99],
             [float("-inf"), -200.0],
+            [True, False],
         ),
     )
     def test_non_max_suppression(
@@ -4220,6 +4203,7 @@ def test_non_max_suppression(
         max_boxes,
         iou_threshold,
         score_threshold,
+        use_V5,
     ):
         if backend == ("mlprogram", "fp16") and not use_cpu_only:
             pytest.xfail("rdar://80661262 ([GPU failures ] NonMaximumSuppression FP16 coremltools unit tests)")
@@ -4231,13 +4215,23 @@ def test_non_max_suppression(
 
         @make_tf_graph([boxes_val.shape, scores_val.shape])
         def build_model(boxes, scores):
-            ret = tf.image.non_max_suppression(
-                boxes=boxes,
-                scores=scores,
-                max_output_size=max_boxes,
-                iou_threshold=iou_threshold,
-                score_threshold=score_threshold,
-            )
+            if use_V5:
+                ret = tf.raw_ops.NonMaxSuppressionV5(
+                    boxes=boxes,
+                    scores=scores,
+                    max_output_size=max_boxes,
+                    iou_threshold=iou_threshold,
+                    score_threshold=score_threshold,
+                    soft_nms_sigma=0.,
+                )
+            else:
+                ret = tf.image.non_max_suppression(
+                    boxes=boxes,
+                    scores=scores,
+                    max_output_size=max_boxes,
+                    iou_threshold=iou_threshold,
+                    score_threshold=score_threshold,
+                )
             return ret
 
         model, inputs, outputs = build_model
@@ -4246,7 +4240,7 @@ def build_model(boxes, scores):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -4296,14 +4290,15 @@ def build_model(x, depth_input):
 
             model, inputs, outputs = build_model
             input_values = [np.random.randint(0, depth, size=x_shape).astype(np.int32),
-                    np.array([depth]).astype(np.int32)]
+                            np.array([depth]).astype(np.int32)]
             input_dict = dict(zip(inputs, input_values))
 
         TensorFlowBaseTest.run_compare_tf(model, input_dict, outputs,
-               use_cpu_only=use_cpu_only,
-               frontend_only=False, backend=backend)
+                                          use_cpu_for_conversion=use_cpu_only,
+                                          frontend_only=False, backend=backend)
+
 
-class TestSparseSoftmaxCrossEntropyWithLogits(TensorFlowBaseTest):
+class TestSoftmaxCrossEntropyWithLogits(TensorFlowBaseTest):
     
     @pytest.mark.parametrize("use_cpu_only, backend, class_num",
                              itertools.product(
@@ -4312,7 +4307,7 @@ class TestSparseSoftmaxCrossEntropyWithLogits(TensorFlowBaseTest):
                                  [1, 3],
                              )
                              )   
-    def test(self, use_cpu_only, backend, class_num):
+    def test_sparse_softmax_cross_entropy_with_logits(self, use_cpu_only, backend, class_num):
         batch_size = 2
         feature_shape = [batch_size, class_num]
         label_shape = [batch_size, tf.int32]
@@ -4322,14 +4317,67 @@ def build_model(feat, label):
             return tf.raw_ops.SparseSoftmaxCrossEntropyWithLogits(features=feat, labels=label)[0]
             
         model, inputs, outputs = build_model
-        features = np.random.rand(batch_size, class_num)
+        features = random_gen(feature_shape, rand_min=0, rand_max=1)
         labels = np.random.randint(low=0, high=class_num, size=(batch_size,), dtype=np.int32)
         input_values = [features, labels]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(model, input_dict, outputs,
-                       use_cpu_only=use_cpu_only,
+                       use_cpu_for_conversion=use_cpu_only,
                        frontend_only=False, backend=backend)
 
+    @pytest.mark.parametrize("use_cpu_only, backend, class_num",
+                             itertools.product(
+                                 [True, False],
+                                 backends,
+                                 [1, 3],
+                             )
+                             )   
+    def test_softmax_cross_entropy_with_logits(self, use_cpu_only, backend, class_num):
+        batch_size = 2
+        feature_shape = [batch_size, class_num]
+        label_shape = [batch_size, class_num]
+
+        @make_tf_graph([feature_shape, label_shape])
+        def build_model(feat, label):
+            return tf.raw_ops.SoftmaxCrossEntropyWithLogits(features=feat, labels=label)[0]
+            
+        model, inputs, outputs = build_model
+        input_values = [
+            random_gen(feature_shape, rand_min=0, rand_max=1),
+            random_gen(label_shape, rand_min=0, rand_max=1),
+        ]
+        input_dict = dict(zip(inputs, input_values))
+        TensorFlowBaseTest.run_compare_tf(model, input_dict, outputs,
+                                          use_cpu_for_conversion=use_cpu_only,
+                                          frontend_only=False, backend=backend)
+
+
+class TestIdentityN(TensorFlowBaseTest):
+    
+    @pytest.mark.parametrize("use_cpu_only, backend",
+                             itertools.product([True, False],backends,)
+                             )   
+    def test(self, use_cpu_only, backend):
+        shape_1 = [1,]
+        shape_2 = [3, 4]
+        shape_3 = [5, 6, 7]
+
+        @make_tf_graph([shape_1, shape_2, shape_3])
+        def build_model(x, y ,z):
+            return tf.raw_ops.IdentityN(input=[x, y, z])
+            
+        model, inputs, outputs = build_model
+        input_values = [
+            random_gen(shape_1, rand_min=0, rand_max=1), 
+            random_gen(shape_2, rand_min=0, rand_max=1), 
+            random_gen(shape_3, rand_min=0, rand_max=1),
+        ]
+        input_dict = dict(zip(inputs, input_values))
+        TensorFlowBaseTest.run_compare_tf(model, input_dict, outputs,
+                                          use_cpu_for_conversion=use_cpu_only,
+                                          frontend_only=False, backend=backend)
+
+
 class TestPad(TensorFlowBaseTest):
     @pytest.mark.parametrize("use_cpu_only, backend, rank, mode, dynamic, trial",
                              itertools.product(
@@ -4377,7 +4425,7 @@ def build_model(x):
             input_dict = dict(zip(inputs, input_values))
 
         TensorFlowBaseTest.run_compare_tf(model, input_dict, outputs,
-                       use_cpu_only=use_cpu_only,
+                       use_cpu_for_conversion=use_cpu_only,
                        frontend_only=False, backend=backend)
 
 
@@ -4417,7 +4465,7 @@ def build_model(x):
             input_values = [random_gen(input_shape, rand_min=0.2, rand_max=1000)]
             input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(model, input_dict, outputs,
-                       use_cpu_only=use_cpu_only,
+                       use_cpu_for_conversion=use_cpu_only,
                        frontend_only=False, backend=backend)
 
 
@@ -4454,7 +4502,7 @@ def build_model(limit):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -4470,7 +4518,7 @@ def build_model(delta):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -4486,7 +4534,7 @@ def build_model(begin):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -4526,7 +4574,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -4546,7 +4594,7 @@ def test_tile(self, use_cpu_only, backend, rank):
                 graph,
                 {x: np.random.rand(*x_shape), reps: reps_val},
                 res,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 frontend_only=False,
                 backend=backend,
             )
@@ -4578,10 +4626,40 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
+    @pytest.mark.parametrize(
+        "use_cpu_only, backend, shape, k",
+        itertools.product(
+            [True, False], 
+            backends, 
+            [(1, 3), (1, 10), (3, 50)], 
+            [1, 3, 20],
+        ),
+    )
+    def test_in_top_k(self, use_cpu_only, backend, shape, k):
+        # TensorFlow only supports last dimension (axis = -1).
+        batch_size, class_num = shape
+
+        @make_tf_graph([shape, (batch_size, tf.int32)])
+        def build_model(predictions, targets):
+            return tf.math.in_top_k(predictions=predictions, targets=targets, k=k)
+
+        model, inputs, outputs = build_model
+        pred_values = random_gen(shape, rand_min=-2, rand_max=2)
+        target_values = np.random.randint(class_num, size=batch_size).astype(np.int32)
+        input_values = [pred_values, target_values]
+
+        input_dict = dict(zip(inputs, input_values))
+        TensorFlowBaseTest.run_compare_tf(
+            model,
+            input_dict,
+            outputs,
+            use_cpu_for_conversion=use_cpu_only,
+            backend=backend,
+        )
 
 class TestConcat(TensorFlowBaseTest):
     @pytest.mark.parametrize("use_cpu_only, backend, op_version, rank, num_inputs",
@@ -4626,8 +4704,8 @@ def build_model(*inputs):
             input_values = [random_gen(shape) for shape in input_shapes]
             input_dict = dict(zip(inputs, input_values))
             TensorFlowBaseTest.run_compare_tf(model, input_dict, outputs,
-                           use_cpu_only=use_cpu_only,
-                           frontend_only=False, backend=backend)
+                                              use_cpu_for_conversion=use_cpu_only,
+                                              frontend_only=False, backend=backend)
 
 
 class TestSplit(TensorFlowBaseTest):
@@ -4676,9 +4754,8 @@ def build_model(x):
                     model,
                     input_dict,
                     outputs,
-                    use_cpu_only=use_cpu_for_conversion,
-                    backend=backend,
                     use_cpu_for_conversion=use_cpu_for_conversion,
+                    backend=backend,
                 )
 
     @pytest.mark.parametrize(
@@ -4702,7 +4779,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -4724,7 +4801,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -4748,7 +4825,7 @@ def build_model(x, y):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -4768,7 +4845,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -4788,7 +4865,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -4818,7 +4895,7 @@ def build_model(*inputs):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -4837,12 +4914,8 @@ class TestArgSort(TensorFlowBaseTest):
     )
     def test_argsort(self, use_cpu_only, backend, rank, axis, direction):
         shape = np.random.randint(low=1, high=4, size=rank)
-        if use_cpu_only:
-            dtype = np.float32
-            tf_dtype = tf.float32
-        else:
-            dtype = np.float16
-            tf_dtype = tf.float16
+        dtype = np.float32
+        tf_dtype = tf.float32
 
         @make_tf_graph([list(shape) + [tf_dtype]])
         def build_model(x):
@@ -4855,7 +4928,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend
         )
 
@@ -4884,7 +4957,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -4918,7 +4991,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -4944,7 +5017,7 @@ def build_model(x):
                 model,
                 input_dict,
                 outputs,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 frontend_only=False,
                 backend=backend,
             )
@@ -4974,7 +5047,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -5008,7 +5081,7 @@ def build_model(x, y):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -5034,7 +5107,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -5060,7 +5133,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -5100,8 +5173,8 @@ def build_model(x):
 
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(model, input_dict, outputs,
-                       use_cpu_only=use_cpu_only,
-                       frontend_only=False, backend=backend)
+                                          use_cpu_for_conversion=use_cpu_only,
+                                          frontend_only=False, backend=backend)
 
 
 class TestReverse(TensorFlowBaseTest):
@@ -5144,7 +5217,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -5174,7 +5247,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -5202,7 +5275,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -5244,7 +5317,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -5286,7 +5359,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
         )
 
@@ -5311,7 +5384,7 @@ def build_model(x):
                 model,
                 input_dict,
                 outputs,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 backend=backend,
             )
 
@@ -5327,7 +5400,7 @@ def build_model(x, tf_perm):
                 model,
                 input_dict,
                 outputs,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 backend=backend,
             )
 
@@ -5369,7 +5442,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -5401,7 +5474,47 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
+            frontend_only=False,
+            backend=backend,
+        )
+
+    @pytest.mark.parametrize(
+        "use_cpu_only, backend, shape_block_paddings, dynamic",
+        itertools.product(
+            [True, False],
+            backends,
+            [
+                [(1, 4, 6, 2, 2), [2, 3], [[2, 0],[3, 6]]], 
+                [(2, 4, 6, 1), [1, 2], [[2, 1], [3, 3]]],
+                [(2, 4, 6, 1, 2), [2, 1], [[0, 0],[0, 0]]],
+                [(2, 4, 6, 1, 2), [2], [[0, 0]]],
+            ],
+            [True, False],
+        ),
+    )
+    def test_smoke_new_op(self, use_cpu_only, backend, shape_block_paddings, dynamic):
+        input_shape, block_shape, paddings = shape_block_paddings
+        
+        # The neuralnetwork backend doesn't support these tests
+        if backend[0] == "neuralnetwork":
+            return
+
+        tf_input_shape = input_shape if not dynamic else [None] * len(input_shape)
+        @make_tf_graph([tf_input_shape])
+        def build_model(x):
+            return tf.raw_ops.SpaceToBatchND(
+                input=x, block_shape=block_shape, paddings=paddings
+            )
+
+        model, inputs, outputs = build_model
+        input_values = [random_gen(input_shape)]
+        input_dict = dict(zip(inputs, input_values))
+        TensorFlowBaseTest.run_compare_tf(
+            model,
+            input_dict,
+            outputs,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -5424,6 +5537,13 @@ def test_programmatic(
         # generate data
         input_shape = np.random.randint(low=1, high=4, size=input_rank)
         block_shape = np.random.randint(low=1, high=3, size=block_rank)
+
+        if backend[0] == "neuralnetwork":
+            if block_rank == 2 and block_shape[0] != block_shape[1]:
+                pytest.xfail("neuralnetwork backend doesn't support unequal block shape.")
+            if block_shape[0] == 1:
+                pytest.xfail("neuralnetwork backend doesn't support unity block shape.")
+
         paddings = []
         for i in range(block_rank):
             while True:
@@ -5456,7 +5576,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -5488,7 +5608,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -5510,6 +5630,13 @@ def test_programmatic(
         # generate data
         input_shape = np.random.randint(low=1, high=4, size=input_rank)
         block_shape = np.random.randint(low=1, high=3, size=block_rank)
+
+        if backend[0] == "neuralnetwork":
+            if block_rank == 2 and block_shape[0] != block_shape[1]:
+                pytest.xfail("neuralnetwork backend doesn't support unequal block shape.")
+            if block_shape[0] == 1:
+                pytest.xfail("neuralnetwork backend doesn't support unity block shape.")
+        
         input_shape[0] = input_shape[0] * np.prod(block_shape)
         crops = []
         for i in range(block_rank):
@@ -5536,6 +5663,64 @@ def build_model(x):
                     input=x, block_shape=block_shape, crops=crops
                 )
 
+        model, inputs, outputs = build_model
+        input_values = [random_gen(input_shape)]
+        input_dict = dict(zip(inputs, input_values))
+
+        # Until rdar://93071454 (batch_to_space errors out in espresso for Core ML models with dynamic inputs) is fixed,
+        # we need to specify the default shape for the dynamic model by setting inputs_for_conversion.
+        if dynamic:
+            shape = tuple([RangeDim(default=dim) for dim in input_shape])
+            inputs_for_conversion = [TensorType(shape=shape, dtype=np.float32)]
+        else:
+            inputs_for_conversion = None
+
+        TensorFlowBaseTest.run_compare_tf(
+            model,
+            input_dict,
+            outputs,
+            inputs_for_conversion=inputs_for_conversion,
+            use_cpu_for_conversion=use_cpu_only,
+            frontend_only=False,
+            backend=backend,
+        )
+
+    @pytest.mark.parametrize(
+        "use_cpu_only, backend, shape_block_crops, dynamic",
+        itertools.product(
+            [True, False],
+            backends,
+            [
+                [(6, 4, 6, 2, 2), [2, 3], [[2, 0],[3, 6]]], 
+                [(4, 4, 6, 1), [1, 2], [[2, 1], [3, 3]]],
+                [(4, 4, 6, 1, 2), [2, 1], [[0, 0],[0, 0]]],
+                [(4, 4, 6, 1, 2), [2], [[0, 0]]],
+            ],
+            [True, False],
+        ),
+    )
+    def test_smoke_new_op(self, use_cpu_only, backend, shape_block_crops, dynamic):
+        input_shape, block_shape, crops = shape_block_crops
+        
+        # The neuralnetwork backend doesn't support these tests
+        if backend[0] == "neuralnetwork":
+            return
+
+        tf_input_shape = input_shape if not dynamic else [None] * len(input_shape)
+        @make_tf_graph([tf_input_shape])
+        def build_model(x):
+            return tf.raw_ops.BatchToSpaceND(
+                input=x, block_shape=block_shape, crops=crops
+            )
+
+        # Until rdar://93071454 (batch_to_space errors out in espresso for Core ML models with dynamic inputs) is fixed,
+        # we need to specify the default shape for the dynamic model by setting inputs_for_conversion.
+        if dynamic:
+            shape = tuple([RangeDim(default=dim) for dim in input_shape])
+            inputs_for_conversion = [TensorType(shape=shape, dtype=np.float32)]
+        else:
+                        inputs_for_conversion = None
+
         model, inputs, outputs = build_model
         input_values = [random_gen(input_shape)]
         input_dict = dict(zip(inputs, input_values))
@@ -5543,13 +5728,29 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
+            inputs_for_conversion=inputs_for_conversion,
             frontend_only=False,
             backend=backend,
         )
 
 
 class TestTensorArray(TensorFlowBaseTest):
+
+    @staticmethod
+    def get_dynamic_elem_shape_model():
+        elem_shape = (None, None)
+        @make_tf_graph([elem_shape])
+        def build_model(x):
+            ta = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
+            ta = ta.write(10, x)
+            ta = ta.write(9, x)
+            ta = ta.scatter([3], tf.expand_dims(x, 0))
+            ta = ta.scatter([8], tf.expand_dims(x, 0))
+
+            return ta.stack()
+        return build_model
+
     @pytest.mark.parametrize(
         "use_cpu_only, backend", itertools.product([True, False], backends,)
     )
@@ -5586,7 +5787,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -5601,25 +5802,13 @@ def test_tf_dynamic_elem_shape(self, use_cpu_only, backend):
         # TF2: TensorListReserve, TensorListLength, TensorListSetItem,
         #      TensorListScatterIntoExistingList, TensorListStack,
         #      TensorListResize
-        elem_shape = (None, None)
-
-        @make_tf_graph([elem_shape])
-        def build_model(x):
-            ta = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
-            ta = ta.write(10, x)
-            ta = ta.write(9, x)
-            ta = ta.scatter([3], tf.expand_dims(x, 0))
-            ta = ta.scatter([8], tf.expand_dims(x, 0))
-
-            return ta.stack()
-
-        model, inputs, outputs = build_model
+        model, inputs, outputs = TestTensorArray.get_dynamic_elem_shape_model()
         input_values = [random_gen((2,3))]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
             model,
             input_dict, outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False, backend=backend)
 
     @pytest.mark.skip(
@@ -5649,7 +5838,7 @@ def cond(i, num_iters, array, update):
         input_values = [random_gen(shape=(3, 2))]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
 
@@ -5700,7 +5889,7 @@ def build_model(x, shape):
 
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
 
@@ -5757,7 +5946,7 @@ def test_tf_no_variable(
                 graph,
                 {x: np.random.rand(*x_shape).astype(np.float32),},
                 res,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 frontend_only=False,
                 backend=backend,
             )
@@ -5787,7 +5976,7 @@ def test_tf_lstm_block_cell(self, use_cpu_only, backend, batch):
                 graph,
                 {x: np.random.rand(*x_shape).astype(np.float32),},
                 res,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 frontend_only=False,
                 backend=backend,
                 # variable needs to be frozen
@@ -5826,7 +6015,7 @@ def test_tf_lstm_block_fused_cell(self, use_cpu_only, backend, batch_size):
                 graph,
                 {x: np.random.rand(*x_shape).astype(np.float32),},
                 output,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 frontend_only=False,
                 backend=backend,
                 # variable needs to be frozen
@@ -5873,7 +6062,7 @@ def test_tf_multiple_lstm_block_fused_cell(self, use_cpu_only, backend):
                 graph,
                 {x: np.random.rand(*x_shape).astype(np.float32),},
                 x5,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 frontend_only=False,
                 backend=backend,
                 # variable needs to be frozen
@@ -5906,7 +6095,7 @@ def test_tf_no_variable(self, use_cpu_only, backend):
                 graph,
                 {x: np.random.rand(1).astype(np.float32),},
                 res,
-                use_cpu_only=use_cpu_only,
+                use_cpu_for_conversion=use_cpu_only,
                 frontend_only=False,
                 backend=backend,
             )
@@ -5949,7 +6138,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -6017,7 +6206,6 @@ def build_model(x):
             frontend_only=False,
             backend=backend,
             use_cpu_for_conversion=use_cpu_for_conversion,
-            use_cpu_only=use_cpu_for_conversion,
         )
 
 class TestLogSoftMax(TensorFlowBaseTest):
@@ -6040,7 +6228,7 @@ def build_model(x):
         input_values = [input_value]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(model, input_dict, outputs,
-                       use_cpu_only=use_cpu_only,
+                       use_cpu_for_conversion=use_cpu_only,
                        frontend_only=False, backend=backend)
 
 
@@ -6068,8 +6256,8 @@ def build_model(x):
         input_values = [input_value]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(model, input_dict, outputs,
-                       use_cpu_only=use_cpu_only,
-                       frontend_only=False, backend=backend)
+                                          use_cpu_for_conversion=use_cpu_only,
+                                          frontend_only=False, backend=backend)
 
 
 class TestSize(TensorFlowBaseTest):
@@ -6107,8 +6295,8 @@ def build_model(x):
             input_values = [input_value]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(model, input_dict, outputs,
-                       use_cpu_only=use_cpu_only,
-                       frontend_only=False, backend=backend)
+                                          use_cpu_for_conversion=use_cpu_only,
+                                          frontend_only=False, backend=backend)
 
 class TestAudioSpectrogram(TensorFlowBaseTest):
     @pytest.mark.parametrize(
@@ -6144,7 +6332,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
@@ -6195,7 +6383,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             frontend_only=False,
             backend=backend,
         )
diff --git a/coremltools/converters/mil/frontend/tensorflow/test/test_parse.py b/coremltools/converters/mil/frontend/tensorflow/test/test_parse.py
index 892ffa680..178cc451b 100644
--- a/coremltools/converters/mil/frontend/tensorflow/test/test_parse.py
+++ b/coremltools/converters/mil/frontend/tensorflow/test/test_parse.py
@@ -6,7 +6,7 @@
 import unittest
 import pytest
 
-pytest.importorskip("tensorflow", minversion="1.14.0")
+pytest.importorskip("tensorflow", minversion="1.15.0")
 from tensorflow.core.framework import attr_value_pb2 as attr_value
 from tensorflow.core.framework import tensor_shape_pb2 as tensor_shape
 from tensorflow.core.framework import types_pb2 as types
diff --git a/coremltools/converters/mil/frontend/tensorflow/test/test_parsed_tf_node.py b/coremltools/converters/mil/frontend/tensorflow/test/test_parsed_tf_node.py
index 355c48bb5..52a0425f3 100644
--- a/coremltools/converters/mil/frontend/tensorflow/test/test_parsed_tf_node.py
+++ b/coremltools/converters/mil/frontend/tensorflow/test/test_parsed_tf_node.py
@@ -6,7 +6,7 @@
 import unittest
 import pytest
 
-pytest.importorskip("tensorflow", minversion="1.14.0")
+pytest.importorskip("tensorflow", minversion="1.15.0")
 from tensorflow.core.framework import node_def_pb2 as node_def
 from tensorflow.core.framework import tensor_shape_pb2 as tensor_shape
 from tensorflow.core.framework import types_pb2 as types
diff --git a/coremltools/converters/mil/frontend/tensorflow/test/testing_utils.py b/coremltools/converters/mil/frontend/tensorflow/test/testing_utils.py
index f56a6f5bb..d4a3f3a7b 100644
--- a/coremltools/converters/mil/frontend/tensorflow/test/testing_utils.py
+++ b/coremltools/converters/mil/frontend/tensorflow/test/testing_utils.py
@@ -2,18 +2,20 @@
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-import numpy as np
+
 import os
 import pytest
 import tempfile
 
+import numpy as np
+
 from coremltools import TensorType
 import coremltools.models.utils as coremltoolsutils
 from coremltools.converters.mil.testing_utils import compare_shapes, \
     compare_backend, run_core_ml_predict, ct_convert
 from coremltools.converters.mil.testing_reqs import ct
 
-tf = pytest.importorskip("tensorflow", minversion="1.14.0")
+tf = pytest.importorskip("tensorflow", minversion="1.15.0")
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.tools.freeze_graph import freeze_graph as freeze_g
@@ -112,8 +114,9 @@ def get_tf_node_names(tf_nodes, mode="inputs"):
 
 
 def tf_graph_to_mlmodel(
-    graph, feed_dict, output_nodes, frontend="tensorflow", backend=("neuralnetwork", "fp32"),
-    use_cpu_for_conversion=False,
+    graph, feed_dict, output_nodes, frontend="tensorflow",
+    backend=("neuralnetwork", "fp32"), use_cpu_for_conversion=False,
+    inputs_for_conversion=None,
 ):
     """
     Parameters
@@ -130,8 +133,9 @@ def tf_graph_to_mlmodel(
         Backend to convert to.
     use_cpu_for_conversion: bool
         Argument which is passed as is to the unified converter API.
-        That is, "ct.convert(...., useCPUOnly=use_cpu_for_conversion)"
         It forces the model to be loaded on the CPU context, post conversion.
+    inputs_for_conversion: list of coremltools.TensorType() or coremltools.ImageType() objects
+        Defaults to None. It is passed as is to the "inputs" argument of the converter.
     -----------
     Returns MLModel, Input Values, Output Names
     """
@@ -145,9 +149,16 @@ def tf_graph_to_mlmodel(
     output_names = get_tf_node_names(output_nodes, mode="outputs")
     input_values = {name: val for name, val in zip(input_names, feed_dict.values())}
 
+    if use_cpu_for_conversion:
+        compute_unit = ct.ComputeUnit.CPU_ONLY
+    else:
+        compute_unit = ct.ComputeUnit.ALL
+        
+    inputs = inputs_for_conversion if inputs_for_conversion is not None else None
+
     mlmodel = ct_convert(
-        graph, inputs=None, outputs=output_names, source=frontend, convert_to=backend,
-        useCPUOnly=use_cpu_for_conversion,
+        graph, inputs=inputs, outputs=output_names, source=frontend, convert_to=backend,
+        compute_units=compute_unit,
     )
 
     return mlmodel, input_values, output_names, output_nodes
@@ -175,7 +186,7 @@ def run_compare_tf(
     graph,
     feed_dict,
     output_nodes,
-    use_cpu_only=False,
+    inputs_for_conversion=None,
     use_cpu_for_conversion=False,
     frontend_only=False,
     frontend="tensorflow",
@@ -197,21 +208,10 @@ def run_compare_tf(
         Dict of placeholder and value pairs representing inputs.
     output_nodes: tf.node or list[tf.node]
         List of names representing outputs.
-    use_cpu_only: bool
-        If true, use CPU only for prediction, otherwise, use GPU also.
+    inputs_for_conversion: list of coremltools.TensorType() or coremltools.ImageType() objects
+        Defaults to None. It is passed as is to the "inputs" argument of the converter.
     use_cpu_for_conversion: bool
-        If true, the converter is invoked using "ct.convert(...., useCPUOnly=True)",
-        which in turn forces the model to be loaded with the CPU context, which happens
-        when the converter loads the ML model object from the proto spec
-        using "ct.models.MLModel(proto_spec, useCPUOnly=True)".
-        The other argument, i.e., "use_cpu_only" on the other hand refers to only the compute engine
-        for prediction purposes. For a model that is loaded on a non-CPU context, it can still be forced
-        to execute on the CPU at the time of prediction. Hence,
-        "use_cpu_for_conversion = False && use_cpu_only = True" is valid and results in a case when a model is
-        loaded for GPU but executed on the CPU.
-        The scenario, "use_cpu_for_conversion = True && use_cpu_only = False" is invalid though,
-        since once a model is loaded on a CPU context its context cannot be changed to a non CPU device
-        at the time of prediction.
+        If True, the model is loaded with the CPU context.
     frontend_only: bool
         If true, skip the prediction call, only validate conversion.
     frontend: str
@@ -223,7 +223,7 @@ def run_compare_tf(
     rtol: float
         The relative tolerance parameter.
     validate_shapes_only: bool
-        If true, skip element-wise value comparision.
+        If True, skip element-wise value comparison.
     freeze_graph: bool
         If True, use the "tensorflow.python.tools.freeze_graph" function
         to freeze the TF graph prior to conversion. This will ensure that
@@ -234,10 +234,6 @@ def run_compare_tf(
     Return:
         Proto, mlmodel, input dictionay, prediction(if possible)
     """
-    if use_cpu_for_conversion and not use_cpu_only:
-        # use_cpu_for_conversion = True && use_cpu_only = False
-        raise ValueError("use_cpu_for_conversion = True && use_cpu_only = False is an invalid test case")
-
     if not isinstance(output_nodes, (tuple, list)):
         output_nodes = [output_nodes]
 
@@ -272,7 +268,8 @@ def run_compare_tf(
             graph = load_tf_pb(static_model_file)
 
     mlmodel, input_key_values, output_names, output_nodes = tf_graph_to_mlmodel(
-        graph, feed_dict, output_nodes, frontend, backend, use_cpu_for_conversion=use_cpu_for_conversion,
+        graph, feed_dict, output_nodes, frontend, backend,
+        use_cpu_for_conversion=use_cpu_for_conversion, inputs_for_conversion=inputs_for_conversion,
     )
 
     if frontend_only or coremltoolsutils._macos_version() < (10, 13) \
@@ -290,22 +287,19 @@ def run_compare_tf(
         if isinstance(v, np.ndarray) and issubclass(v.dtype.type, np.integer):
             input_key_values[k] = v.astype(np.float) # Core ML only accepts floats
 
+    pred = None
     if validate_shapes_only:
-        compare_shapes(mlmodel, input_key_values, expected_outputs, use_cpu_only)
-    else:
-        compare_backend(
-            mlmodel,
-            input_key_values,
-            expected_outputs,
-            use_cpu_only,
-            atol=atol,
-            rtol=rtol,
-            also_compare_shapes=True,
-            dtype=backend[1],
+        compare_shapes(mlmodel, input_key_values, expected_outputs)
+    elif not coremltoolsutils._has_custom_layer(mlmodel._spec):
+        pred = compare_backend(
+                mlmodel,
+                input_key_values,
+                expected_outputs,
+                atol=atol,
+                rtol=rtol,
+                also_compare_shapes=True,
+                dtype=backend[1],
         )
-    pred=None
-    if not coremltoolsutils._has_custom_layer(mlmodel.get_spec()):
-        pred = run_core_ml_predict(mlmodel, input_key_values, use_cpu_only)
     else:
         print('Skipping model prediction as it has a custom nn layer!')
     return mlmodel._spec, mlmodel, input_key_values, pred
@@ -328,35 +322,42 @@ def layer_counts(spec, layer_type):
     return n
 
 
-class TensorFlowBaseTest(object):
+class TensorFlowBaseTest:
     testclassname=''
     testmodelname=''
+
     @pytest.fixture(autouse=True)
     def store_testname_with_args(self, request):
         TensorFlowBaseTest.testclassname = type(self).__name__
         TensorFlowBaseTest.testmodelname = request.node.name
 
-    def teardown_method(self, method):
-        pass
-
     @staticmethod
-    def run_compare_tf(graph, feed_dict, output_nodes, use_cpu_only=False,
+    def run_compare_tf(graph, feed_dict, output_nodes,
+                       inputs_for_conversion=None,
                        use_cpu_for_conversion=False,
                        frontend_only=False, frontend="tensorflow",
                        backend=("neuralnetwork", "fp32"), atol=1e-04, rtol=1e-05,
                        validate_shapes_only=False, freeze_graph=False,
                        tf_outputs=None):
-        res = run_compare_tf(graph, feed_dict, output_nodes,
-                             use_cpu_only=use_cpu_only,
+
+        res = run_compare_tf(graph,
+                             feed_dict,
+                             output_nodes,
+                             inputs_for_conversion=inputs_for_conversion,
                              use_cpu_for_conversion=use_cpu_for_conversion,
-                             frontend_only=frontend_only, frontend=frontend,
+                             frontend_only=frontend_only,
+                             frontend=frontend,
                              backend=backend, atol=atol,
                              rtol=rtol,
                              validate_shapes_only=validate_shapes_only,
-                             freeze_graph=freeze_graph, tf_outputs=tf_outputs)
+                             freeze_graph=freeze_graph,
+                             tf_outputs=tf_outputs
+        )
+
         alist = []
         if res is not None:
             alist = list(res)
         alist.append(TensorFlowBaseTest.testclassname)
         alist.append(TensorFlowBaseTest.testmodelname)
+
         return tuple(alist)
diff --git a/coremltools/converters/mil/frontend/tensorflow/tf_graph_pass/__init__.py b/coremltools/converters/mil/frontend/tensorflow/tf_graph_pass/__init__.py
index 346172a3e..5a0b09694 100644
--- a/coremltools/converters/mil/frontend/tensorflow/tf_graph_pass/__init__.py
+++ b/coremltools/converters/mil/frontend/tensorflow/tf_graph_pass/__init__.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
diff --git a/coremltools/converters/mil/frontend/tensorflow/tf_graph_pass/cond_to_where.py b/coremltools/converters/mil/frontend/tensorflow/tf_graph_pass/cond_to_where.py
index 465cd7bd2..5f23600f3 100644
--- a/coremltools/converters/mil/frontend/tensorflow/tf_graph_pass/cond_to_where.py
+++ b/coremltools/converters/mil/frontend/tensorflow/tf_graph_pass/cond_to_where.py
@@ -1,14 +1,12 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import logging
+
 from ..basic_graph_ops import delete_node, disconnect_edge
 from .visitors import FindAllUpstreamTerminals
-
-import logging
 from coremltools._deps import _HAS_TF_2
 
 
@@ -34,7 +32,7 @@ def compute_max_rank(graph):
     return ret
 
 
-class CondToWhere(object):
+class CondToWhere:
     @staticmethod
     def _search(g, node_name):
         """
diff --git a/coremltools/converters/mil/frontend/tensorflow/tf_graph_pass/functionalize_loops.py b/coremltools/converters/mil/frontend/tensorflow/tf_graph_pass/functionalize_loops.py
index 6428ffa8c..ecec028fa 100644
--- a/coremltools/converters/mil/frontend/tensorflow/tf_graph_pass/functionalize_loops.py
+++ b/coremltools/converters/mil/frontend/tensorflow/tf_graph_pass/functionalize_loops.py
@@ -3,6 +3,8 @@
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import logging
+
 from ..parsed_tf_node import ParsedTFNode
 from ..basic_graph_ops import (
     connect_dests,
@@ -20,10 +22,9 @@
     FindImmediateUpstreamNodes,
     FindSubgraph,
 )
-import logging
 
 
-class FunctionalizeLoops(object):
+class FunctionalizeLoops:
     """
     Turns while loops in TensorFlow dataflow graph into the functional form:
     while(cond_function, body_function)
diff --git a/coremltools/converters/mil/frontend/tensorflow/tf_graph_pass/insert_get_tuple.py b/coremltools/converters/mil/frontend/tensorflow/tf_graph_pass/insert_get_tuple.py
index 32d026a2b..676b6df3f 100644
--- a/coremltools/converters/mil/frontend/tensorflow/tf_graph_pass/insert_get_tuple.py
+++ b/coremltools/converters/mil/frontend/tensorflow/tf_graph_pass/insert_get_tuple.py
@@ -1,13 +1,12 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
-from ..parsed_tf_node import ParsedTFNode
 import copy
 
+from ..parsed_tf_node import ParsedTFNode
+
 
 def insert_get_tuple(gddict):
     """
@@ -64,7 +63,7 @@ def make_op(input_node, index, new_node_name, gto_make_op_cache):
         "TensorArrayV3",
         "Const",
     ]
-    inclusions = ["Split", "SplitV", "LSTMBlockCell", "TopK", "TopKV2", "Unpack", "BlockLSTM"]
+    inclusions = ["Split", "SplitV", "LSTMBlockCell", "TopK", "TopKV2", "Unpack", "BlockLSTM", "BlockLSTMV2", "NonMaxSuppressionV5"]
     gto_make_op_cache = {}
     for name in list(gddict.keys()):
         new_node = ParsedTFNode()
diff --git a/coremltools/converters/mil/frontend/tensorflow/tf_graph_pass/visitors.py b/coremltools/converters/mil/frontend/tensorflow/tf_graph_pass/visitors.py
index a02f8517f..516963b61 100644
--- a/coremltools/converters/mil/frontend/tensorflow/tf_graph_pass/visitors.py
+++ b/coremltools/converters/mil/frontend/tensorflow/tf_graph_pass/visitors.py
@@ -6,7 +6,7 @@
 from ..parsed_tf_node import ParsedTFNode
 
 
-class FindAllDownstreamTerminals(object):
+class FindAllDownstreamTerminals:
     # Find all nodes matching a particular function
     # which is downstream reachable from a set of nodes.
     def __init__(self, fn):
@@ -40,7 +40,7 @@ def get_result(self):
         return self.result
 
 
-class FindAllReachableNodes(object):
+class FindAllReachableNodes:
     # Find all nodes reachable from a set of nodes which satisfy a criteria
     def __init__(self, fn):
         self.result = []
@@ -75,7 +75,7 @@ def get_result(self):
         return self.result
 
 
-class FindImmediateUpstreamNodes(object):
+class FindImmediateUpstreamNodes:
     # Find all nodes matching a particular function which is immediately above a set of nodes
     def __init__(self, fn):
         self.result = []
@@ -100,7 +100,7 @@ def get_result(self):
         return self.result
 
 
-class FindImmediateDownstreamNodes(object):
+class FindImmediateDownstreamNodes:
     # Find all nodes matching a particular function which is immediately above a set of nodes
     def __init__(self, fn):
         self.result = []
@@ -126,7 +126,7 @@ def get_result(self):
         return self.result
 
 
-class FindAllUpstreamTerminals(object):
+class FindAllUpstreamTerminals:
     # Find the "upstream frontier" of nodes passing some predicate.
     # In other words, perform a pre-order traversal of a node and its inputs, collecting all nodes
     # passing a given predicate as we go along. Terminate the search along a given branch as soon
@@ -166,7 +166,7 @@ def get_result(self):
         return self.result
 
 
-class FindSubgraph(object):
+class FindSubgraph:
     # Find all nodes between a set of sources and a set of terminals
     # Sources are not returned, but reached terminals are returned
     def __init__(self, terminal_nodes):
diff --git a/coremltools/converters/mil/frontend/tensorflow/tfssa.py b/coremltools/converters/mil/frontend/tensorflow/tfssa.py
index aabe7ca4e..18f1677e8 100644
--- a/coremltools/converters/mil/frontend/tensorflow/tfssa.py
+++ b/coremltools/converters/mil/frontend/tensorflow/tfssa.py
@@ -12,7 +12,7 @@
 from .naming_utils import escape_fn_name
 
 
-class ParsedNode(object):
+class ParsedNode:
     """
     Node class for the tfssa graph.
 
@@ -69,7 +69,7 @@ def copy(self):
         return self.__copy__()
 
 
-class SSAFunction(object):
+class SSAFunction:
     __slots__ = ["graph", "inputs", "input_types", "outputs", "output_types", "ret"]
 
     def __init__(self, gdict=None, inputs=None, outputs=None, ret=None):
@@ -179,7 +179,7 @@ def copy(self):
         return self.__copy__()
 
 
-class NetworkEnsemble(object):
+class NetworkEnsemble:
     __slots__ = ["functions", "variables", "global_resource"]
 
     def __init__(self, instance=None):
diff --git a/coremltools/converters/mil/frontend/tensorflow2/load.py b/coremltools/converters/mil/frontend/tensorflow2/load.py
index 84fe27722..546c6ee75 100644
--- a/coremltools/converters/mil/frontend/tensorflow2/load.py
+++ b/coremltools/converters/mil/frontend/tensorflow2/load.py
@@ -132,10 +132,10 @@ def _get_concrete_functions_and_graph_def(self):
         
         return cfs, graph_def
 
-    def _graph_def_from_model(self, outputs=None):
+    def _graph_def_from_model(self, output_names=None):
         """Overwrites TFLoader._graph_def_from_model()"""
         _, graph_def = self._get_concrete_functions_and_graph_def()
-        return self.extract_sub_graph(graph_def, outputs)
+        return self.extract_sub_graph(graph_def, output_names)
 
     def _tf_ssa_from_graph_def(self, fn_name="main"):
         """Overwrites TFLoader._tf_ssa_from_graph_def()"""
diff --git a/coremltools/converters/mil/frontend/tensorflow2/ssa_passes/remove_vacuous_cond.py b/coremltools/converters/mil/frontend/tensorflow2/ssa_passes/remove_vacuous_cond.py
index 6a96e89e5..6d16e3561 100644
--- a/coremltools/converters/mil/frontend/tensorflow2/ssa_passes/remove_vacuous_cond.py
+++ b/coremltools/converters/mil/frontend/tensorflow2/ssa_passes/remove_vacuous_cond.py
@@ -1,15 +1,14 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import logging
 
 from coremltools.converters.mil.mil import Builder as mb
 from coremltools.converters.mil.mil.passes.pass_registry import register_pass
 from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
-import logging
+
 
 def _remove_vacuous_cond_block(block):
     num_changes = 0
diff --git a/coremltools/converters/mil/frontend/tensorflow2/test/test_conversion_api.py b/coremltools/converters/mil/frontend/tensorflow2/test/test_conversion_api.py
new file mode 100644
index 000000000..f6eedceab
--- /dev/null
+++ b/coremltools/converters/mil/frontend/tensorflow2/test/test_conversion_api.py
@@ -0,0 +1,92 @@
+#  Copyright (c) 2020, Apple Inc. All rights reserved.
+#
+#  Use of this source code is governed by a BSD-3-clause license that can be
+#  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+import pytest
+
+tf = pytest.importorskip("tensorflow", minversion="2.1.0")
+
+@pytest.fixture
+def int32_input_model():
+    x = tf.keras.Input(batch_input_shape=(10, 20), name="input", dtype=tf.int32)
+    out = tf.add(x, tf.constant(5, dtype=tf.int32), name="output")
+    return tf.keras.Model(inputs=x, outputs=out)
+
+@pytest.fixture
+def float32_input_model_add_op():
+    x = tf.keras.Input(batch_input_shape=(10, 20), name="input", dtype=tf.float32)
+    out = tf.add(x, tf.constant(5.5, dtype=tf.float32), name="output")
+    return tf.keras.Model(inputs=x, outputs=out)
+
+@pytest.fixture
+def float32_input_model_relu_ops():
+    x = tf.keras.Input(batch_input_shape=(10, 20), name="input", dtype=tf.float32)
+    x1 = tf.keras.layers.ReLU()(x)
+    out = tf.keras.layers.ReLU(name="output")(x1)
+    return tf.keras.Model(inputs=x, outputs=out)
+
+@pytest.fixture
+def int64_input_model():
+    x = tf.keras.Input(batch_input_shape=(10, 20), name="input", dtype=tf.int64)
+    out = tf.add(x, tf.constant(5, dtype=tf.int64), name="output")
+    return tf.keras.Model(inputs=x, outputs=out)
+
+@pytest.fixture
+def float32_two_input_model():
+    x = tf.keras.Input(batch_input_shape=(10, 20), name="input1", dtype=tf.float32)
+    y = tf.keras.Input(batch_input_shape=(10, 20), name="input2", dtype=tf.float32)
+    out = tf.add(x, y, name="output")
+    return tf.keras.Model(inputs=[x, y], outputs=out)
+
+@pytest.fixture
+def float32_two_output_model():
+    x = tf.keras.Input(batch_input_shape=(10, 20), name="input", dtype=tf.float32)
+    y = tf.nn.relu(x)
+    out2 = tf.nn.relu6(x, name="output2")
+    out1 = tf.nn.relu(y, name="output1")
+    return tf.keras.Model(inputs=x, outputs=[out1, out2])
+
+@pytest.fixture
+def rank3_input_model():
+    x = tf.keras.Input(batch_input_shape=(1, 10, 20), name="input", dtype=tf.float32)
+    out = tf.add(x, tf.constant(5, dtype=tf.float32), name="output")
+    return tf.keras.Model(inputs=x, outputs=out)
+
+@pytest.fixture
+def rank4_input_model():
+    x = tf.keras.Input(batch_input_shape=(1, 10, 20, 3), name="input", dtype=tf.float32)
+    out = tf.add(x, tf.constant(5, dtype=tf.float32), name="output")
+    return tf.keras.Model(inputs=x, outputs=out)
+
+@pytest.fixture
+def rank4_input_model_with_channel_first_output():
+    x = tf.keras.Input(batch_input_shape=(1, 10, 20, 3), name="input", dtype=tf.float32)
+    y = tf.add(x, tf.constant(5, dtype=tf.float32))
+    out = tf.transpose(y, perm=[0, 3, 1, 2], name="output")
+    return tf.keras.Model(inputs=x, outputs=out)
+
+@pytest.fixture
+def rank4_grayscale_input_model():
+    x = tf.keras.Input(batch_input_shape=(1, 10, 20, 1), name="input", dtype=tf.float32)
+    out = tf.add(x, tf.constant(5, dtype=tf.float32), name="output")
+    return tf.keras.Model(inputs=x, outputs=out)
+
+@pytest.fixture
+def rank4_grayscale_input_model_with_channel_first_output():
+    x = tf.keras.Input(batch_input_shape=(1, 10, 20, 1), name="input", dtype=tf.float32)
+    y = tf.add(x, tf.constant(5, dtype=tf.float32))
+    out = tf.transpose(y, perm=[0, 3, 1, 2], name="output")
+    return tf.keras.Model(inputs=x, outputs=out)
+
+@pytest.fixture
+def linear_model():
+    # this model will test the fuse_matmul_weight_bias pass
+    x = tf.keras.Input(batch_input_shape=(1, 10), name="input", dtype=tf.float32)
+    y = tf.keras.layers.Dense(4)(x)
+    y = tf.add(y, tf.constant([1, 2, 3, 4], shape=(4,), dtype=tf.float32))
+    out = tf.nn.relu(y)
+    return tf.keras.Model(inputs=x, outputs=out)
+
+
+from coremltools.converters.mil.frontend.tensorflow.test.test_conversion_api import TestInputOutputConversionAPI
\ No newline at end of file
diff --git a/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_load.py b/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_load.py
index ea440a276..28b1e2b80 100644
--- a/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_load.py
+++ b/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_load.py
@@ -72,7 +72,7 @@ def test_keras_saved_model_file(self):
         )
         keras_model.save(self.saved_model_dir, save_format="tf")
         mlmodel = converter.convert(
-            self.saved_model_dir, outputs="Identity", source=frontend
+            self.saved_model_dir, outputs=["Identity"], source=frontend
         )
         assert mlmodel is not None
 
@@ -105,7 +105,7 @@ def test_concrete_function_list_from_tf_low_level_api(self):
             tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY
         ]
         mlmodel = converter.convert(
-            [concrete_func], outputs="Identity", source=frontend
+            [concrete_func], outputs=["Identity"], source=frontend
         )
         assert mlmodel is not None
 
diff --git a/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops.py b/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops.py
index 638d9d0a1..e27da5da4 100644
--- a/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops.py
+++ b/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops.py
@@ -70,6 +70,7 @@
     TestFill,
     TestGather,
     TestIdentity,
+    TestIdentityN,
     TestImageResizing,
     TestIsFinite,
     TestL2Normalization,
@@ -100,6 +101,7 @@
     TestSize,
     TestSliceByIndex,
     TestSliceBySize,
+    TestSoftmaxCrossEntropyWithLogits,
     TestSpaceToBatchND,
     TestSpaceToDepth,
     TestSplit,
@@ -156,7 +158,7 @@ def build_model(x, warp):
         ]
         input_dict = dict(zip(inputs, input_values))
         self.run_compare_tf2(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend,
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend,
         )
 
 
@@ -206,7 +208,7 @@ def build_model(x):
         ]
         input_dict = dict(zip(inputs, input_values))
         self.run_compare_tf2(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend,
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend,
         )
 
 
@@ -239,7 +241,7 @@ def build_model(x):
         ]
         input_dict = dict(zip(inputs, input_values))
         self.run_compare_tf2(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend,
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend,
         )
 
 
@@ -293,7 +295,7 @@ def build_model(x):
         input_values = [random_gen(input_shape, -100, 100)]
         input_dict = dict(zip(inputs, input_values))
         self.run_compare_tf2(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend,
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend,
         )
 
     @pytest.mark.parametrize(
@@ -316,7 +318,7 @@ def build_model(x):
         input_values = [random_gen(x_shape, -100, 100)]
         input_dict = dict(zip(inputs, input_values))
         self.run_compare_tf2(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend,
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend,
         )
 
     @pytest.mark.parametrize(
@@ -350,7 +352,7 @@ def build_model(x):
         ]
         input_dict = dict(zip(inputs, input_values))
         self.run_compare_tf2(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend,
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend,
         )
 
 
@@ -393,7 +395,7 @@ def build_model(x):
             model,
             input_dict,
             outputs,
-            use_cpu_only=use_cpu_only,
+            use_cpu_for_conversion=use_cpu_only,
             backend=backend,
             atol=1e-2,
             rtol=1e-3,
@@ -433,7 +435,7 @@ def build_model(x, y):
         input_dict = dict(zip(inputs, input_values))
 
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
 
@@ -454,7 +456,7 @@ def build_model(x):
         input_values = [np.array([0.7], dtype=np.float32)]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
@@ -473,7 +475,7 @@ def build_model(x):
         input_values = [np.array([2], dtype=np.float32)]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
@@ -498,7 +500,7 @@ def build_model(x, y):
         ]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
@@ -517,7 +519,7 @@ def build_model(x):
         input_values = [np.array([2.0], dtype=np.float32)]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
@@ -536,7 +538,7 @@ def build_model(x):
         input_values = [np.array([2.0], dtype=np.float32)]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
@@ -558,7 +560,7 @@ def build_model(x):
         input_values = [np.array([9.0], dtype=np.float32)]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
 @pytest.mark.xfail(reason="rdar://76293949 (TF2 unit test InvalidArgumentError)", run=False)
@@ -593,7 +595,7 @@ def build_model(x, y):
         ]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
@@ -622,7 +624,7 @@ def build_model(x):
         input_values = [np.array([[3.14], [6.17], [12.14]], dtype=np.float32)]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
@@ -655,7 +657,7 @@ def build_model(x, y):
         ]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
@@ -684,7 +686,7 @@ def build_model(x):
         input_values = [np.array([[3.14], [6.17], [12.14]], dtype=np.float32)]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
 
     @pytest.mark.parametrize(
@@ -709,5 +711,5 @@ def build_model(x):
         input_values = [np.random.rand(3, 1, 8).astype(np.float32)]
         input_dict = dict(zip(inputs, input_values))
         TensorFlowBaseTest.run_compare_tf(
-            model, input_dict, outputs, use_cpu_only=use_cpu_only, backend=backend
+            model, input_dict, outputs, use_cpu_for_conversion=use_cpu_only, backend=backend
         )
diff --git a/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops_tf_keras.py b/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops_tf_keras.py
index 4cd7d8996..0cdffc01a 100644
--- a/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops_tf_keras.py
+++ b/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops_tf_keras.py
@@ -19,7 +19,10 @@
 from coremltools.converters.mil.frontend.tensorflow.test.testing_utils import (
     TensorFlowBaseTest
 )
-from coremltools.converters.mil.testing_utils import random_gen
+from coremltools.converters.mil.testing_utils import (
+    get_op_types_in_program,
+    random_gen,
+)
 from ..._utils import is_symbolic_dim_in_prog
 
 TensorFlowBaseTest.run_compare_tf_keras = \
@@ -101,6 +104,29 @@ def test_activation(self, use_cpu_only, backend, rank, op):
             **kwargs
         )
 
+    @pytest.mark.parametrize("backend", backends)
+    def test_conv2d_prelu_fusion(self, backend):
+        x_shape = (1, 10, 10, 32)
+        x = tf.keras.Input(batch_input_shape=x_shape)  # (B, H, W, C)
+        x1 = tf.keras.layers.Conv2D(16, kernel_size=1)(x)
+        x1 =  tf.keras.layers.PReLU(alpha_initializer='glorot_uniform', shared_axes=[1, 2])(x1)
+        x1 = tf.keras.layers.Conv2D(16, kernel_size=1)(x1)
+        x1 = tf.keras.layers.PReLU(alpha_initializer='glorot_uniform', shared_axes=[1, 2])(x1)
+        keras_model = tf.keras.Model(inputs=x, outputs=x1)
+
+        res = TensorFlowBaseTest.run_compare_tf_keras(
+            keras_model,
+            [random_gen(x_shape, -1, 1)],
+            use_cpu_only=True,
+            backend=backend,
+        )
+        coreml_model = res[1]
+        mil_prog = coreml_model._get_mil_internal()
+        # assert that "prelu" ops are present in the mil program,
+        # which should be the case if the "fuse_prelu" pass worked correctly
+        assert len(mil_prog.find_ops(op_type="prelu")) == 2
+        assert "relu" not in get_op_types_in_program(mil_prog)
+
 
 class TestBinary(TensorFlowBaseTest):
     @pytest.mark.parametrize(
@@ -1323,13 +1349,13 @@ def _test_for_symbolic_shapes(keras_input_shape, input_shape_for_conversion, are
                                       are_symbols_expected=True)
 
     @pytest.mark.parametrize(
-        "use_cpu_only, backend",
-        itertools.product([True, False], backends,),
+        "use_cpu_only, tf_raw_lstm_op, backend",
+        itertools.product([True, False], [tf.raw_ops.BlockLSTM, tf.raw_ops.BlockLSTMV2], backends,),
     )
-    def test_lstm_block_fused_op(self, use_cpu_only, backend):
+    def test_lstm_block_fused_op(self, use_cpu_only, tf_raw_lstm_op, backend):
         '''
-        Define a model with custom LSTM ops that uses tf.raw_ops.BlockLSTM and
-        verify that it converts to a fused lstm op.
+        Define a model with custom LSTM ops that uses tf.raw_ops.BlockLSTM / tf.raw_ops.BlockLSTMV2
+        and verify that it converts to a fused lstm op.
 
         %x (shape: (Seq, Batch, idim) == (5, 2, 4))
         %x1 = LSTM(h=10) (%input) # shape = (5, 2, 10)
@@ -1357,7 +1383,7 @@ def build(self, input_shape):
                 self.init_c = tf.constant(np.zeros((self.batch_size, self.hidden_dim)).astype(np.float32))
 
             def call(self, inputs):
-                _, output_state, _, _, _, _, output = tf.raw_ops.BlockLSTM(
+                _, output_state, _, _, _, _, output = tf_raw_lstm_op(
                     seq_len_max=self.seq_length,
                     x=inputs,
                     cs_prev=self.init_c,
diff --git a/coremltools/converters/mil/frontend/tensorflow2/test/testing_utils.py b/coremltools/converters/mil/frontend/tensorflow2/test/testing_utils.py
index e916bdc66..6129d9e56 100644
--- a/coremltools/converters/mil/frontend/tensorflow2/test/testing_utils.py
+++ b/coremltools/converters/mil/frontend/tensorflow2/test/testing_utils.py
@@ -2,14 +2,16 @@
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-import numpy as np
+
 import os
+
+import numpy as np
 import pytest
 
 tf = pytest.importorskip("tensorflow", minversion="2.1.0")
 from tensorflow.python.framework import dtypes
 
-from coremltools.converters.mil.testing_reqs import ct
+import coremltools as ct
 from coremltools.converters.mil.frontend.tensorflow.test.testing_utils import (
     get_tf_node_names,
     TensorFlowBaseTest
@@ -24,6 +26,7 @@
 import coremltools.models.utils as coremltoolsutils
 from coremltools.models.utils import _macos_version
 
+
 def make_tf2_graph(input_types):
     """
     Decorator to help construct TensorFlow 2.x model.
@@ -70,7 +73,7 @@ def run_compare_tf2(
     model,
     input_dict,
     output_names,
-    use_cpu_only=False,
+    inputs_for_conversion=None,
     use_cpu_for_conversion=False,
     frontend_only=False,
     frontend="tensorflow",
@@ -88,46 +91,35 @@ def run_compare_tf2(
         Dict of name and value pairs representing inputs.
     output_names: list of str
         List of output node names.
-    use_cpu_only: bool
-        If true, use CPU only for prediction, otherwise, use GPU also.
+    inputs_for_conversion: list of coremltools.TensorType() or coremltools.ImageType() objects
+        Defaults to None. It is passed as is to the "inputs" argument of the converter.
     use_cpu_for_conversion: bool
-        If true, the converter is invoked using "ct.convert(...., useCPUOnly=True)",
-        which in turn forces the model to be loaded with the CPU context, which happens
-        when the converter loads the ML model object from the proto spec
-        using "ct.models.MLModel(proto_spec, useCPUOnly=True)".
-        The other argument, i.e., "use_cpu_only" on the other hand refers to only the compute engine
-        for prediction purposes. For a model that is loaded on a non-CPU context, it can still be forced
-        to execute on the CPU at the time of prediction. Hence,
-        "use_cpu_for_conversion = False && use_cpu_only = True" is valid and results in a case when a model is
-        loaded for GPU but executed on the CPU.
-        The scenario, "use_cpu_for_conversion = True && use_cpu_only = False" is invalid though,
-        since once a model is loaded on a CPU context its context cannot be changed to a non CPU device
-        at the time of prediction.
+        If True, forces the model to be loaded with the CPU context.
     frontend_only: bool
-        If true, skip the prediction call, only validate conversion.
+        If True, skip the prediction call, only validate conversion.
     frontend: str
         Frontend to convert from.
     backend: str
         Backend to convert to.
     debug: bool
-        If true, print verbose information and plot intermediate graphs.
+        If True, print verbose information and plot intermediate graphs.
     atol: float
         The absolute tolerance parameter.
     rtol: float
         The relative tolerance parameter.
     """
-    if use_cpu_for_conversion and not use_cpu_only:
-        # use_cpu_for_conversion = True && use_cpu_only = False
-        raise ValueError("use_cpu_for_conversion = True && use_cpu_only = False is an invalid test case")
-
     inputs = []
-    cf_inputs = [t for t in model[0].inputs if t.dtype != dtypes.resource]
-    for t in cf_inputs:
-        name = get_tf_node_names(t.name)[0]
-        shape = [RangeDim() if s is None or s == -1 else s \
-                for s in list(t.get_shape())]
-        inputs.append(TensorType(name=name, shape=shape,
-                                 dtype=t.dtype.as_numpy_dtype))
+    if inputs_for_conversion is None:
+        cf_inputs = [t for t in model[0].inputs if t.dtype != dtypes.resource]
+        for t in cf_inputs:
+            name = get_tf_node_names(t.name)[0]
+            shape = [RangeDim() if s is None or s == -1 else s \
+                    for s in list(t.get_shape())]
+            inputs.append(TensorType(name=name, shape=shape,
+                                     dtype=t.dtype.as_numpy_dtype))
+    else:
+        inputs = inputs_for_conversion
+        
     outputs = []
     for t in output_names:
         name = get_tf_node_names(t)[0]
@@ -142,6 +134,11 @@ def run_compare_tf2(
         ref = [tf_outputs.numpy()]
     expected_outputs = {n: v for n, v in zip(outputs, ref)}
 
+    if use_cpu_for_conversion:
+        compute_unit = ct.ComputeUnit.CPU_ONLY
+    else:
+        compute_unit = ct.ComputeUnit.ALL
+    
     mlmodel = ct_convert(
         model,
         source=frontend,
@@ -149,7 +146,7 @@ def run_compare_tf2(
         outputs=outputs,
         convert_to=backend,
         debug=debug,
-        useCPUOnly=use_cpu_for_conversion,
+        compute_units=compute_unit,
     )
 
     for k,v in input_dict.items():
@@ -160,20 +157,17 @@ def run_compare_tf2(
        or (mlmodel.is_package and _macos_version() < (12, 0)):
         return mlmodel._spec, mlmodel, input_dict, None
 
-    compare_backend(
-        mlmodel,
-        input_dict,
-        expected_outputs,
-        use_cpu_only,
-        atol=atol,
-        rtol=rtol,
-        also_compare_shapes=True,
-        dtype=backend[1],
-    )
-
     pred = None
-    if not coremltoolsutils._has_custom_layer(mlmodel.get_spec()):
-        pred = run_core_ml_predict(mlmodel, input_dict, use_cpu_only)
+    if not coremltoolsutils._has_custom_layer(mlmodel._spec):
+        pred = compare_backend(
+                mlmodel,
+                input_dict,
+                expected_outputs,
+                atol=atol,
+                rtol=rtol,
+                also_compare_shapes=True,
+                dtype=backend[1],
+        )
     else:
         print('Skipping model prediction as it has a custom nn layer!')
     return mlmodel._spec, mlmodel, input_dict, pred
@@ -200,9 +194,9 @@ def run_compare_tf_keras(
     inputs_for_conversion: list of coremltools.TensorType() or coremltools.ImageType() objects
         Defaults to None. It is passed as is to the "inputs" argument of the converter.
     use_cpu_only: bool
-        If true, use CPU only for prediction, otherwise, use GPU also.
+        If True, use CPU only for prediction.
     frontend_only: bool
-        If true, skip the prediction call, only validate conversion.
+        If True, skip the prediction call, only validate conversion.
     frontend: str
         Frontend to convert from.
     backend: str
@@ -212,10 +206,16 @@ def run_compare_tf_keras(
     rtol: float
         The relative tolerance parameter.
     """
-    mlmodel = ct_convert(model, inputs=inputs_for_conversion, source=frontend, convert_to=backend)
+    if use_cpu_only:
+        compute_unit = ct.ComputeUnit.CPU_ONLY
+    else:
+        compute_unit = ct.ComputeUnit.ALL
+
+    mlmodel = ct_convert(model, inputs=inputs_for_conversion, source=frontend, convert_to=backend,
+                         compute_units=compute_unit)
 
     # assumes conversion preserve the i/o names
-    proto = mlmodel.get_spec()
+    proto = mlmodel._spec
     inputs = [i.name.split(":")[0].strip() for i in model.inputs]
     outputs = [str(o.name) for o in proto.description.output]
 
@@ -231,20 +231,17 @@ def run_compare_tf_keras(
        or (mlmodel.is_package and _macos_version() < (12, 0)):
         return proto, mlmodel, input_key_values, None
 
-    compare_backend(
-        mlmodel,
-        input_key_values,
-        expected_outputs,
-        use_cpu_only,
-        atol=atol,
-        rtol=rtol,
-        also_compare_shapes=True,
-        dtype=backend[1]
-    )
-
     pred = None
     if not coremltoolsutils._has_custom_layer(proto):
-        pred = run_core_ml_predict(mlmodel, input_key_values, use_cpu_only)
+        pred = compare_backend(
+                mlmodel,
+                input_key_values,
+                expected_outputs,
+                atol=atol,
+                rtol=rtol,
+                also_compare_shapes=True,
+                dtype=backend[1]
+        )
     else:
         print('Skipping model prediction as it has a custom nn layer!')
     return proto, mlmodel, input_key_values, pred
@@ -256,7 +253,7 @@ class TensorFlow2BaseTest(TensorFlowBaseTest):
     def run_compare_tf2(model,
                         input_dict,
                         output_names,
-                        use_cpu_only=False,
+                        inputs_for_conversion=None,
                         use_cpu_for_conversion=False,
                         frontend_only=False,
                         frontend="tensorflow",
@@ -264,10 +261,11 @@ def run_compare_tf2(model,
                         debug=False,
                         atol=1e-04,
                         rtol=1e-05):
+
         res = run_compare_tf2(model,
                               input_dict,
                               output_names,
-                              use_cpu_only=use_cpu_only,
+                              inputs_for_conversion=inputs_for_conversion,
                               use_cpu_for_conversion=use_cpu_for_conversion,
                               frontend_only=frontend_only,
                               frontend=frontend,
@@ -284,6 +282,7 @@ def run_compare_tf2(model,
     def run_compare_tf_keras(model, input_values, inputs_for_conversion=None, use_cpu_only=False,
                              frontend_only=False, frontend="tensorflow",
                              backend=("neuralnetwork", "fp32"), atol=1e-04, rtol=1e-05):
+
         res = run_compare_tf_keras(model, input_values,
                                    inputs_for_conversion=inputs_for_conversion,
                                    use_cpu_only=use_cpu_only,
diff --git a/coremltools/converters/mil/frontend/torch/converter.py b/coremltools/converters/mil/frontend/torch/converter.py
index e2e530564..36d716795 100644
--- a/coremltools/converters/mil/frontend/torch/converter.py
+++ b/coremltools/converters/mil/frontend/torch/converter.py
@@ -27,6 +27,7 @@
     remove_getattr_nodes
 )
 from .ssa_passes.torch_passes import torch_passes
+from .._utils import get_output_names
 
 torch_to_mil_types = {
     _torch.bool: types.bool,
@@ -139,7 +140,7 @@ def __init__(
         Arguments:
             torchscript: torch.jit.ScriptModule object representing the model to convert.
             inputs: Input values and optional names. See kwarg in load.py for full description.
-            outputs: Names of the graph's outputs. See kwarg in load.py for full description.
+            outputs: List of outputs as ct.InputType. See kwarg in load.py for full description.
             cut_at_symbols: A list of internal symbol name strings. Graph conversion will
                 terminate once these symbols have been generated. For debugging use
                 only. See kwarg in load.py.
@@ -151,7 +152,8 @@ def __init__(
             if isinstance(inp, ImageType) and self.inputs[idx].channel_first is None:
                 self.inputs[idx].channel_first = True
         self.torchscript = torchscript
-        self.output_names = outputs
+        self.outputs = outputs
+        self.output_names = get_output_names(self.outputs)
         self.context = TranscriptionContext()
         raw_graph, params_dict = self._expand_and_optimize_ir(self.torchscript)
         self.params_dict = params_dict
@@ -242,7 +244,18 @@ def convert(self):
             for internal_name, users_name in zip(
                 self.graph.inputs.keys(), ssa_func_inputs.keys()
             ):
-                self.context.add(ssa_func.inputs[users_name], torch_name=internal_name)
+                input_var = ssa_func.inputs[users_name]
+                if (types.is_tensor(input_var.sym_type) or types.is_scalar(input_var.sym_type)) \
+                    and (input_var.dtype == types.fp16 or input_var.dtype == types.fp64):
+                    # Cast the input var to float32.
+                    # We need to do this because type inference is very buggy when it starts from
+                    # float16/float64 typed inputs. Until that is fixed (see the radar below),
+                    # we cast all inputs of type float16/float64 to float32 as the first step.
+                    # These casts will later be removed if compute_precision=Float16 is
+                    # provided, which causes the FP16ComputePrecision pass to run.
+                    # TODO: remove this when this radar is fixed: rdar://93731970
+                    input_var = mb.cast(x=input_var, dtype="fp32")
+                self.context.add(input_var, torch_name=internal_name)
 
             self.convert_const()
 
@@ -260,13 +273,23 @@ def convert(self):
             graph_outputs = [g for g in graph_outputs if g is not None]
 
             # Output renaming occurs
+            if self.outputs is not None:
+                if len(self.outputs) != len(graph_outputs):
+                    msg = "Number of outputs provided, {}, do not match the number of outputs detected in the model, {}."
+                    raise ValueError(msg.format(
+                        len(self.outputs),
+                        len(graph_outputs),
+                    ))
             if self.output_names:
                 for index, var in enumerate(graph_outputs):
-                    output_rename = self.output_names[index]
-                    var.name = output_rename
+                    if self.output_names[index] is not None:
+                        output_rename = self.output_names[index]
+                        var.name = output_rename
 
             ssa_func.set_outputs(graph_outputs)
             prog.add_function("main", ssa_func)
+            if self.outputs is not None:
+                prog.set_main_output_types(self.outputs)
         self.torch_passes(prog)
         return prog
 
@@ -401,7 +424,6 @@ def _lower_graph_block(graph):
 
         return graph, params_dict
 
-
     @staticmethod
     def _expand_and_optimize_ir(torchscript):
         """
diff --git a/coremltools/converters/mil/frontend/torch/dialect_ops.py b/coremltools/converters/mil/frontend/torch/dialect_ops.py
index bdafa7e84..9d4f7a9be 100644
--- a/coremltools/converters/mil/frontend/torch/dialect_ops.py
+++ b/coremltools/converters/mil/frontend/torch/dialect_ops.py
@@ -42,8 +42,8 @@ class torch_upsample_nearest_neighbor(Operation):
 
     Parameters
     ----------
-    x: tensor<[\*D, H1, W1],T>  (Required)
-        * Must be at least rank ``3``.
+    x: tensor<[b, C, H1, W1],T>  (Required)
+        * Must be rank ``4``.
     output_height: i32
         * Output height for the height dimension.
     output_width: i32
@@ -51,7 +51,7 @@ class torch_upsample_nearest_neighbor(Operation):
 
     Returns
     -------
-    tensor<[\*D, H2, W2],T>
+    tensor<[b, C, H2, W2],T>
         * Tensor with same type as the input.
         * ``H2`` = output_height
         * ``W2`` = output_width
@@ -70,13 +70,13 @@ def __init__(self, **kwargs):
         super(torch_upsample_nearest_neighbor, self).__init__(**kwargs)
 
     def type_inference(self):
-        if self.x.rank < 3:
+        if self.x.rank != 4:
             raise ValueError(
-                'input to the "torch_upsample_nearest_neighbor" op must have rank at least 3'
+                'input to the "torch_upsample_nearest_neighbor" op must have rank 4'
             )
         ret_shape = list(self.x.shape)
-        ret_shape[-1] = get_new_symbol()
-        ret_shape[-2] = get_new_symbol()
+        ret_shape[2] = get_new_symbol()
+        ret_shape[3] = get_new_symbol()
         return types.tensor(self.x.dtype, ret_shape)
 
 # torch_upsample_bilinear is dealing with upsample layer which has flexible input shape,
@@ -92,8 +92,8 @@ class torch_upsample_bilinear(Operation):
 
     Parameters
     ----------
-    x: tensor<[\*D, H1, W1],T>  (Required)
-        * Must be rank ``3``.
+    x: tensor<[b, C, H1, W1],T>  (Required)
+        * Must be rank ``4``.
     output_height: i32
         * Output height for the height dimension.
     output_width: i32
@@ -103,7 +103,7 @@ class torch_upsample_bilinear(Operation):
 
     Returns
     -------
-    tensor<[\*D, H2, W2],T>
+    tensor<[b, C, H2, W2],T>
         * Tensor with same type as the input.
         * ``H2`` = output_height
         * ``W2`` = output_width
@@ -128,13 +128,13 @@ def __init__(self, **kwargs):
         super(torch_upsample_bilinear, self).__init__(**kwargs)
 
     def type_inference(self):
-        if self.x.rank < 3:
+        if self.x.rank != 4:
             raise ValueError(
-                'input to the "torch_upsample_bilinear" op must have rank at least 3'
+                'input to the "torch_upsample_bilinear" op must have rank 4'
             )
         ret_shape = list(self.x.shape)
-        ret_shape[-1] = get_new_symbol()
-        ret_shape[-2] = get_new_symbol()
+        ret_shape[2] = get_new_symbol()
+        ret_shape[3] = get_new_symbol()
         return types.tensor(self.x.dtype, ret_shape)
 
 # torch_tensor_assign is dealing with the tensor assignment operation
diff --git a/coremltools/converters/mil/frontend/torch/internal_graph.py b/coremltools/converters/mil/frontend/torch/internal_graph.py
index 6d5e81843..9829cd233 100644
--- a/coremltools/converters/mil/frontend/torch/internal_graph.py
+++ b/coremltools/converters/mil/frontend/torch/internal_graph.py
@@ -239,7 +239,7 @@ def __init__(
                 their numpy value.
             inputs: If @raw_graph is None, the OrderedDict mapping input names
                 to their example values.
-            outputs: If @raw_graph is None, the list of outputs from the graph.
+            outputs: list[str], If @raw_graph is None, the list of outputs from the graph.
         """
 
         self.nodes = []
diff --git a/coremltools/converters/mil/frontend/torch/load.py b/coremltools/converters/mil/frontend/torch/load.py
index 3d6ad4a7a..a77d730d7 100644
--- a/coremltools/converters/mil/frontend/torch/load.py
+++ b/coremltools/converters/mil/frontend/torch/load.py
@@ -3,7 +3,9 @@
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import logging as _logging
 import os.path as _os_path
+
 import torch as _torch
 
 from .converter import TorchConverter, torch_to_mil_types
@@ -31,15 +33,18 @@ def load(model_spec, debug=False, **kwargs):
         If names are not specified: input keys for calling predict on the converted model
         will be internal symbols of the input to the graph.
         User can specify a subset of names.
-    outputs (optional): List of output name strings. If specified: keys of output dictionary
-        will be these names in order of flattened returned outputs. If not specified:
-        output dictionary keys will be the internal output symbols in the graph.
-        User can specify a subset of names.
+    outputs (optional): list[ct.InputType] or None
+        List of ct.TensorType or ct.ImageType objects (both are child classes of InputType).
+        This is the value of the "outputs" argument passed by the user to the "coremltools.convert" API.
     cut_at_symbols (optional): List of internal symbol name strings. Graph conversion will
         terminate once these symbols have been generated. For debugging use
         only.
     """
     torchscript = _torchscript_from_model(model_spec)
+
+    if type(torchscript) == _torch.jit._script.RecursiveScriptModule:
+        _logging.warning("Support for converting Torch Script Models is experimental. "
+                         "If possible you should use a traced model for conversion.")
     inputs = _convert_to_torch_inputtype(kwargs["inputs"])
     outputs = kwargs.get("outputs", None)
     cut_at_symbols = kwargs.get("cut_at_symbols", None)
@@ -66,6 +71,8 @@ def _convert_to_torch_inputtype(inputs):
         if isinstance(_input, (list, tuple)):
             input_type.append(_convert_to_torch_inputtype(_input))
         elif isinstance(_input, InputType):
+            if _input.shape is None:
+                raise ValueError("'shape' must be provided in the 'inputs' argument for pytorch conversion")
             input_type.append(_input)
         elif isinstance(_input, _torch.Tensor):
             input_type.append(
diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 5ec94aef8..75546452c 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -73,7 +73,7 @@ def convert_nodes(context, graph):
             assign node outputs.
         graph: An InternalTorchIRGraph or InternalTorchIRBlock object.
     """
-    for node in _tqdm(graph.nodes, desc="Converting Frontend ==> MIL Ops", unit=" ops"):
+    for node in _tqdm(graph.nodes, desc="Converting PyTorch Frontend ==> MIL Ops", unit=" ops"):
         op_lookup = node.kind
         if op_lookup.endswith("_"):
             # This is an "in place" op.
@@ -832,14 +832,18 @@ def prelu(context, node):
     alpha = inputs[1]
     # In the MIL backend, it assumes that the inputs of prelu should have
     # at least rank 3, i.e. [batch, channel, spatial_dims*].
-    if x.rank < 3:
-        x = mb.expand_dims(x=x, axes=[1])
+    if x.rank >= 2:
+        alpha = alpha.val
+        alpha = _np.ones((x.shape[1],))*alpha
+
+    if x.rank <= 2:
+        axes = [1, 2] if x.rank == 1 else [2]
+        x = mb.expand_dims(x=x, axes=axes)
         x = mb.prelu(x=x, alpha=alpha)
-        res = mb.squeeze(x=x, axes=[1], name=node.name)
+        res = mb.squeeze(x=x, axes=axes, name=node.name)
     else:
-        alpha = alpha.val
-        alpha_vec = _np.ones((x.shape[1],))*alpha
-        res = mb.prelu(x=x, alpha=alpha_vec, name=node.name)
+        res = mb.prelu(x=x, alpha=alpha, name=node.name)
+        
     context.add(res)
 
 @register_torch_op
@@ -1381,7 +1385,6 @@ def batch_norm(context, node):
     running_var = inputs[4]
     training = inputs[5].val
     eps = inputs[7]
-    name = node.name
 
     # If training = True, the mean and variance of the current batch of data are used to normalize the input data.
     # If training = False, data statistics running_mean and running_var are used instead.
@@ -1527,6 +1530,8 @@ def embedding(context, node):
             "will be ignored."
         )
 
+    indices = mb.cast(x=indices, dtype="int32")
+
     #  Changing the axis from 0 is not an option in torch, so we don't expose it
     gather = mb.gather(x=_input, indices=indices, name=node.name)
     context.add(gather)
@@ -2094,7 +2099,6 @@ def _add_mil_lstm(input, initial_h, initial_c, weights, has_bias, bidirectional,
             f_b = mb.add(x=biases[0], y=biases[1], )
             r_b = mb.add(x=biases[2], y=biases[3], )
 
-
         # (3.)
         f_ih_w = _ifzo_to_ifoz(
             weights[0], name=name + "_lstm_forward_ih_weights_ifoz_to_ifzo",
@@ -2315,37 +2319,108 @@ def lstm(context, node):
 def _get_scales_from_output_size(output_size, input_shape):
     scales = []
     if output_size is not None:
-        # @output_size will be a list if scales was provided or a
-        # single var if output size was provided
+        # output_size will be one of:
+        # (1) A list of Var, where each Var indicates the output size for that dimension
+        # (2) A single Var which holds the whole output size
+        # (3) A numpy array
+
         if isinstance(output_size, list):
-            output_size = [output_size[0].val, output_size[1].val]
+            output_size = [x.val for x in output_size]
         if isinstance(output_size, Var):
-            output_size = [output_size.val[0], output_size.val[1]]
-
-        # output size is computed using the formula
-        # floor (scale * input_size) in Core ML (and PyTorch)
-        # Thus, when computing the scales from the output size,
-        # add a small positive constant to the output size,
-        # to make sure that the floor formula results in the correct output
-        # size and not 1 unit smaller, due to float precision issues
-        # e.g. if output size = 5 and input size = 2, then scale will be
-        # 2.5, which can get represented as 2.49999, resulting in an output size of 4
-        # instead of 5, without this correction.
-        Hout, Wout = output_size[0], output_size[1]
-        Hin, Win = input_shape[-2], input_shape[-1]
-        scales_h = Hout / Hin if Hout % Hin == 0 else (Hout + 1e-4) / Hin
-        scales_w = Wout / Win if Wout % Win == 0 else (Wout + 1e-4) / Win
-
-        scales = [scales_h, scales_w]
+            output_size = list(output_size.val)
+        if isinstance(output_size, _np.ndarray):
+            output_size = output_size.tolist()
+
+        # output size is computed using the formula floor (scale * input_size) in Core ML (and PyTorch).
+        # Thus, when computing the scales from the output size, we add a small positive constant to the output size
+        # to make sure that the floor formula results in the correct output size and not 1 unit smaller.
+        # For instance, if output size = 5 and input size = 2, then scale will be 2.5, which can get
+        # represented as 2.49999 due to float precision issues, and this might result in an output size of 4
+        # instead of 5, without the epsilon correction.
+
+        if len(output_size) == 1:
+            # 1d upsampling: the scale is returned as a bare scalar, not a list
+            Hout = output_size[0]
+            Hin = input_shape[-1]
+            scales_h = Hout / Hin if Hout % Hin == 0 else (Hout + 1e-4) / Hin
+            scales = scales_h
+        elif len(output_size) == 2:
+            # 2d upsampling: the scales are returned as [height_scale, width_scale]
+            Hout, Wout = output_size[0], output_size[1]
+            Hin, Win = input_shape[-2], input_shape[-1]
+            scales_h = Hout / Hin if Hout % Hin == 0 else (Hout + 1e-4) / Hin
+            scales_w = Wout / Win if Wout % Win == 0 else (Wout + 1e-4) / Win
+            scales = [scales_h, scales_w]
+        else:
+            msg = "Only 1d and 2d upsampling are supported."
+            raise NotImplementedError(msg)
+
     return scales
 
+def _is_float_value(x, threshold=0.001):
+    return x - _math.floor(x) > threshold
 
 @register_torch_op
-def upsample_bilinear2d(context, node):
+def upsample_linear1d(context, node):
+    inputs = _get_inputs(context, node)
+    x = inputs[0]
+    output_size = inputs[1]
+    align_corners = bool(inputs[2].val)
+    scale = inputs[3]
 
-    def _is_float_value(x, threshold=0.001):
-        return x - _math.floor(x) > threshold
+    scale_factor = None
+
+    if scale is not None and scale.val is not None and scale.shape == (1,):
+        # Get the scale factor from provided inputs
+        # This happens when recompute_scale_factor = False
+        scale_factor = scale.val[0]
+
+        # Currently, we are not supporting recompute_scale_factor = False, align_corners = False with float output size
+        _, _, h = x.shape
+        if not is_symbolic(h):
+            # For the static input shape, we can compute the output size beforehand, and check if it is a float value
+            output_size = h * scale_factor
+            is_float = _is_float_value(output_size)
+        else:
+            # For the dynamic input shape, we check if the scale factor itself is float
+            is_float = _is_float_value(scale_factor)
 
+        if is_float and not align_corners:
+            msg = "recompute_scale_factor = False, align_corners = False with float output size is " + \
+                                            "not supported for the upsample op {}".format(node.name)
+            raise NotImplementedError(msg)
+
+    elif isinstance(output_size, list):
+        # When the input shape is dynamic and recompute_scale_factor = True,
+        # we need to trace the graph to find the scale factor.
+        x = mb.expand_dims(x=x, axes=[3])
+        x = mb.torch_upsample_bilinear(
+            x=x,
+            output_height=output_size[0],
+            output_width=1.,
+            align_corners=align_corners,
+        )
+        x = mb.squeeze(x=x, axes=[3], name=node.name)
+        context.add(x)
+        return
+
+    elif output_size.val is not None:
+        # Infer the scale factor from the provided output size
+        scale_factor = _get_scales_from_output_size(output_size, x.shape)
+
+    # Expand the input to a 4d tensor, and use MIL's upsample_bilinear op
+    x = mb.expand_dims(x=x, axes=[3])
+    x = mb.upsample_bilinear(
+        x=x,
+        scale_factor_height=scale_factor,
+        scale_factor_width=1.,
+        align_corners=align_corners,
+    )
+    x = mb.squeeze(x=x, axes=[3], name=node.name)
+    context.add(x)
+
+@register_torch_op
+def upsample_bilinear2d(context, node):
     inputs = _get_inputs(context, node)
     _input = inputs[0]
     output_size = inputs[1]
@@ -2418,6 +2493,44 @@ def _is_float_value(x, threshold=0.001):
     )
     context.add(upsample_bilinear)
 
+@register_torch_op
+def upsample_nearest1d(context, node):
+    inputs = _get_inputs(context, node)
+    x = inputs[0]
+    output_size = inputs[1]
+    scale = inputs[2]
+
+    scale_factor = None
+
+    if scale is not None and scale.val is not None and scale.shape == (1,):
+        # Get the scale factor from provided inputs
+        # This happens when recompute_scale_factor = False
+        scale_factor = scale.val[0]  
+
+    elif isinstance(output_size, list):
+        # When the input shape is dynamic and recompute_scale_factor = True,
+        # we need to trace the graph to find the scale factor.
+        x = mb.expand_dims(x=x, axes=[3])
+        x = mb.torch_upsample_nearest_neighbor(
+            x=x,
+            output_height=output_size[0],
+            output_width=1.,
+        )
+        x = mb.squeeze(x=x, axes=[3], name=node.name)
+        context.add(x)
+        return
+    else:
+        # Infer scale factors from output sizes
+        scale_factor = _get_scales_from_output_size(output_size, x.shape)
+    
+    x = mb.expand_dims(x=x, axes=[3])
+    x = mb.upsample_nearest_neighbor(
+        x=x,
+        scale_factor_height=scale_factor,
+        scale_factor_width=1.,
+    )
+    x = mb.squeeze(x=x, axes=[3], name=node.name)
+    context.add(x)
 
 @register_torch_op
 def upsample_nearest2d(context, node):
@@ -3626,7 +3739,7 @@ def argmax(context, node):
     res = mb.reduce_argmax(x=x, axis=axis, keep_dims=keep_dims, name=node.name)
     context.add(res)
 
-@register_torch_op
+@register_torch_op(torch_alias=["empty_like"])
 def zeros_like(context, node):
     inputs = _get_inputs(context, node, expected=6)
     x = inputs[0]
@@ -4399,16 +4512,39 @@ def broadcast_tensors(context, node):
     inputs = _get_inputs(context, node)
     context.add(_broadcast_tensors(inputs[0]), node.name)
 
-@register_torch_op
-def scatter_add(context, node):
-    inputs = _get_inputs(context, node)
+
+def _scatter(context, inputs, mode, name):
     data = inputs[0]
     axis = inputs[1].val
     indices = inputs[2]
     updates = inputs[3]
-    result = mb.scatter_along_axis(data=data, indices=indices, updates=updates, axis=axis, mode="add", name=node.name)
+    result = mb.scatter_along_axis(data=data, indices=indices, updates=updates,
+                                   axis=axis, mode=mode, name=name)
     context.add(result)
 
+@register_torch_op
+def scatter(context, node):
+    inputs = _get_inputs(context, node)
+    assert len(inputs) in (4, 5)
+
+    # Determine reduce/mode parameter
+    if len(inputs) == 5:
+        mode = inputs[4].val
+        if mode == 'multiply':
+            mode = 'mul'
+        else:
+            assert mode == 'add'
+    else:
+        mode = 'update'
+
+    _scatter(context, inputs, mode, node.name)
+
+
+@register_torch_op
+def scatter_add(context, node):
+    inputs = _get_inputs(context, node)
+    _scatter(context, inputs, 'add', node.name)
+
 @register_torch_op()
 def roi_align(context, node):
     """
diff --git a/coremltools/converters/mil/frontend/torch/ssa_passes/torch_upsample_to_core_upsample.py b/coremltools/converters/mil/frontend/torch/ssa_passes/torch_upsample_to_core_upsample.py
index a5ad224ad..1622ce1a9 100644
--- a/coremltools/converters/mil/frontend/torch/ssa_passes/torch_upsample_to_core_upsample.py
+++ b/coremltools/converters/mil/frontend/torch/ssa_passes/torch_upsample_to_core_upsample.py
@@ -9,12 +9,18 @@
 from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
 from coremltools.converters.mil.mil.passes.pass_registry import register_pass
 
+target_ops = [
+    "torch_upsample_nearest_neighbor", 
+    "torch_upsample_bilinear",
+]
 
 @register_pass(namespace="torch")
 class torch_upsample_to_core_upsample(AbstractGraphPass):
     """
-    Try to map Torch dialect ops `torch_upsample_nearest_neighbor` or `torch_upsample_bilinear` to
-    `upsample_nearest_neighbor` or `upsample_bilinear` in the core op set if compatible.
+    Try to map Torch dialect ops 
+    1. `torch_upsample_nearest_neighbor`
+    2. `torch_upsample_bilinear` 
+    to `upsample_nearest_neighbor` or `upsample_bilinear` in the core op set if compatible.
 
     Inputs:
 
@@ -30,7 +36,7 @@ def _torch_upsample_to_core_upsample_block(block):
         for b in op.blocks:
             _torch_upsample_to_core_upsample_block(b)
 
-        if op.op_type in ["torch_upsample_nearest_neighbor", "torch_upsample_bilinear"]:
+        if op.op_type in target_ops:
             if _try_replace_with_core_upsample(op):
                 logging.info("Successfully map {} to core upsample".format(op.op_type))
             else:
@@ -38,6 +44,13 @@ def _torch_upsample_to_core_upsample_block(block):
 
 
 def _try_get_upsample_factor(output_size):
+    op = output_size
+    # If the output has a known value, then the upsample op itself is derived from the upsample_1d op,
+    # so we can just return scale factor 1 for that case
+    if op.outputs[0].val is not None:
+        assert op.outputs[0].val == 1.
+        return 1.
+
     # output_size = [
     #       (torch.floor((input.size(i + 2).float() * torch.tensor(scale_factors[i], dtype=torch.float32)).float()))
     #        for i in range(dim)
@@ -47,7 +60,6 @@ def _try_get_upsample_factor(output_size):
     # The whole sequence is mul(input_size, scale_factor) -> cast(fp32) -> floor() -> cast(int32)
 
     # 1. check if the output_size is type 'cast' with dtype 'int32'
-    op = output_size
     if op.op_type != "cast" or op.dtype.val != "int32":
         return None
 
@@ -74,41 +86,47 @@ def _try_replace_with_core_upsample(op):
     """
     Inputs:
 
-    op (Operation): op.op_type must be 'torch_upsample_nearest_neighbor' or 'torch_upsample_bilinear'
+    op (Operation): op.op_type must be either
+    1. `torch_upsample_nearest_neighbor`
+    2. `torch_upsample_bilinear`
 
     Returns:
 
     True if op can be represented by mb.upsample_nearest_neighbor or mb.upsample_bilinear op in SSA.
     False otherwise
     """
-    scales_h = _try_get_upsample_factor(op.output_height.op)
-    scales_w = _try_get_upsample_factor(op.output_width.op)
-
-    if scales_h is None or scales_w is None:
-        return False
-
-    old_upsample = op.outputs[0]
-    block = op.enclosing_block
-
-    with block:
-        if op.op_type == "torch_upsample_nearest_neighbor":
-            new_upsample = mb.upsample_nearest_neighbor(
-                x=op.x,
-                scale_factor_height=scales_h,
-                scale_factor_width=scales_w,
-                name=op.name,
-                before_op=op,
-            )
-        elif op.op_type == "torch_upsample_bilinear":
-            new_upsample = mb.upsample_bilinear(
-                x=op.x,
-                scale_factor_height=scales_h,
-                scale_factor_width=scales_w,
-                align_corners=op.align_corners,
-                name=op.name,
-                before_op=op,
-            )
-        block.replace_uses_of_var_after_op(anchor_op=op, old_var=old_upsample, new_var=new_upsample)
-    block.remove_ops([op])
+    assert op.op_type in target_ops
+    
+    # 2d upsampling
+    if op.op_type in ["torch_upsample_nearest_neighbor", "torch_upsample_bilinear"]:
+        scales_h = _try_get_upsample_factor(op.output_height.op)
+        scales_w = _try_get_upsample_factor(op.output_width.op)
+
+        if scales_h is None or scales_w is None:
+            return False
+
+        old_upsample = op.outputs[0]
+        block = op.enclosing_block
+
+        with block:
+            if op.op_type == "torch_upsample_nearest_neighbor":
+                new_upsample = mb.upsample_nearest_neighbor(
+                    x=op.x,
+                    scale_factor_height=scales_h,
+                    scale_factor_width=scales_w,
+                    name=op.name,
+                    before_op=op,
+                )
+            elif op.op_type == "torch_upsample_bilinear":
+                new_upsample = mb.upsample_bilinear(
+                    x=op.x,
+                    scale_factor_height=scales_h,
+                    scale_factor_width=scales_w,
+                    align_corners=op.align_corners,
+                    name=op.name,
+                    before_op=op,
+                )
+            block.replace_uses_of_var_after_op(anchor_op=op, old_var=old_upsample, new_var=new_upsample)
+        block.remove_ops([op])
 
     return True
diff --git a/coremltools/converters/mil/frontend/torch/test/test_conversion_api.py b/coremltools/converters/mil/frontend/torch/test/test_conversion_api.py
new file mode 100644
index 000000000..8762d8796
--- /dev/null
+++ b/coremltools/converters/mil/frontend/torch/test/test_conversion_api.py
@@ -0,0 +1,573 @@
+#  Copyright (c) 2022, Apple Inc. All rights reserved.
+#
+#  Use of this source code is governed by a BSD-3-clause license that can be
+#  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+import numpy as np
+from PIL import Image
+import pytest
+
+import coremltools as ct
+from coremltools.converters.mil.testing_utils import (
+    assert_cast_ops_count,
+    assert_input_dtype,
+    assert_ops_in_mil_program,
+    assert_output_dtype,
+    assert_prog_input_type,
+    assert_prog_output_type,
+    assert_spec_input_image_type,
+    assert_spec_output_image_type,
+    verify_prediction,
+)
+from coremltools.proto import FeatureTypes_pb2 as ft
+
+torch = pytest.importorskip("torch")
+
+
+@pytest.fixture
+def int32_input_model():
+    class Model(torch.nn.Module):
+        def forward(self, x):
+            return x + 5
+    example_input = torch.randint(0, 100, (10, 20), dtype=torch.int32)
+    return torch.jit.trace(Model().eval(), example_input)
+
+@pytest.fixture
+def int64_input_model():
+    class Model(torch.nn.Module):
+        def forward(self, x):
+            return x + 5
+    example_input = torch.randint(0, 100, (10, 20), dtype=torch.int64)
+    return torch.jit.trace(Model().eval(), example_input)
+
+@pytest.fixture
+def float32_input_model_add_op():
+    class Model(torch.nn.Module):
+        def forward(self, x):
+            return x + 5.5
+    example_input = torch.randint(0, 100, (10, 20), dtype=torch.float32)
+    return torch.jit.trace(Model().eval(), example_input)
+
+@pytest.fixture
+def float32_input_model_relu_ops():
+    class Model(torch.nn.Module):
+        def forward(self, x):
+            x = torch.nn.ReLU()(x)
+            return torch.nn.ReLU()(x)
+    example_input = torch.randint(0, 100, (10, 20), dtype=torch.float32)
+    return torch.jit.trace(Model().eval(), example_input)
+
+@pytest.fixture
+def float32_two_input_model():
+    class Model(torch.nn.Module):
+        def forward(self, x, y):
+            return x + y
+    example_input = torch.randint(0, 100, (10, 20), dtype=torch.float32)
+    return torch.jit.trace(Model().eval(), [example_input, example_input])
+
+@pytest.fixture
+def float32_two_output_model():
+    class Model(torch.nn.Module):
+        def forward(self, x):
+            y = torch.nn.ReLU()(x)
+            out1 = torch.nn.ReLU()(y)
+            out2 = torch.nn.ReLU6()(x)
+            return out1, out2
+    example_input = torch.randint(0, 100, (10, 20), dtype=torch.float32)
+    return torch.jit.trace(Model().eval(), example_input)
+
+@pytest.fixture
+def rank3_input_model():
+    class Model(torch.nn.Module):
+        def forward(self, x):
+            return x + 5.5
+    example_input = torch.randint(0, 100, (1, 10, 20), dtype=torch.float32)
+    return torch.jit.trace(Model().eval(), example_input)
+
+@pytest.fixture
+def rank4_input_model():
+    class Model(torch.nn.Module):
+        def forward(self, x):
+            return x + 5.5
+    example_input = torch.randint(0, 100, (1, 3, 10, 20), dtype=torch.float32)
+    return torch.jit.trace(Model().eval(), example_input)
+
+@pytest.fixture
+def rank4_grayscale_input_model():
+    class Model(torch.nn.Module):
+        def forward(self, x):
+            return x + 5.5
+    example_input = torch.randint(0, 100, (1, 1, 10, 20), dtype=torch.float32)
+    return torch.jit.trace(Model().eval(), example_input)
+
+@pytest.fixture
+def linear_model():
+    # this model will test the fuse_linear_bias pass
+    class Model(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.linear = torch.nn.Linear(10, 15, bias=False)
+            self.constant_tensor = torch.ones((15,), dtype=torch.float32)
+
+        def forward(self, x):
+            x = self.linear(x)
+            x = x - self.constant_tensor
+            x = torch.nn.ReLU()(x)
+            return x
+    example_input = torch.randint(0, 10, (1, 10), dtype=torch.float32)
+    return torch.jit.trace(Model().eval(), example_input)
+
+
+@pytest.mark.skipif(ct.utils._macos_version() < (13, 0), reason='Tests are for deployment target ios16/macos13')
+class TestInputOutputConversionAPI:
+
+    def test_input_dtype_default(self, int32_input_model):
+        # If dtype is not provided, it defaults to float32.
+        mlmodel = ct.convert(int32_input_model,
+                             inputs=[ct.TensorType(shape=(10, 20))],
+                             minimum_deployment_target=ct.target.macOS12)
+        assert_input_dtype(mlmodel, expected_type_str="fp32")
+        verify_prediction(mlmodel)
+
+    def test_input_shape_missing_error(self, float32_input_model_add_op):
+        with pytest.raises(ValueError,
+                           match="'shape' must be provided in the 'inputs' argument for pytorch conversion"):
+            mlmodel = ct.convert(float32_input_model_add_op,
+                                 inputs=[ct.TensorType(dtype=np.int32)],
+                                 minimum_deployment_target=ct.target.macOS12)
+
+    def test_unsupported_input_dtype_in_torch_model(self, int64_input_model):
+        # test that no error is raised when no dtype is provided by the user,
+        # and the Torch model's input dtype is not supported.
+        # In this case, it will be mapped to the default dtype which is float32
+        mlmodel = ct.convert(int64_input_model,
+                             inputs=[ct.TensorType(shape=(10, 20))],
+                             minimum_deployment_target=ct.target.macOS12)
+        assert_input_dtype(mlmodel, expected_type_str="fp32")
+        verify_prediction(mlmodel)
+
+    def test_input_dtype_user_provided(self, float32_input_model_add_op):
+        # test that provided dtype in the api is applied
+        mlmodel = ct.convert(float32_input_model_add_op,
+                             inputs=[ct.TensorType(shape=(10, 20), dtype=np.int32)],
+                             minimum_deployment_target=ct.target.macOS12)
+        assert_input_dtype(mlmodel, expected_type_str="int32")
+        assert_output_dtype(mlmodel, expected_type_str="fp32")
+        verify_prediction(mlmodel)
+
+    def test_invalid_input_dtype(self, int32_input_model):
+        with pytest.raises(TypeError,
+                           match="is unsupported for inputs/outputs of the model"
+                           ):
+            mlmodel = ct.convert(int32_input_model,
+                                 inputs=[ct.TensorType(dtype=np.int16)],
+                                 minimum_deployment_target=ct.target.macOS12)
+
+        with pytest.raises(TypeError,
+                           match="float16 dtype for inputs is only supported for deployment target >= iOS16/macOS13"
+                           ):
+            mlmodel = ct.convert(int32_input_model,
+                                 inputs=[ct.TensorType(dtype=np.float16)],
+                                 minimum_deployment_target=ct.target.macOS12)
+
+    def test_fp16_input_dtype(self, float32_input_model_add_op, float32_input_model_relu_ops, int32_input_model):
+        """
+        Test that providing fp16 input dtype works with macOS13.
+        """
+        mlmodel = ct.convert(float32_input_model_add_op,
+                             inputs=[ct.TensorType(shape=(10, 20), dtype=np.float16)],
+                             minimum_deployment_target=ct.target.macOS13
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["add", "cast"])
+        assert_input_dtype(mlmodel, expected_type_str="fp16")
+        assert_output_dtype(mlmodel, expected_type_str="fp32")
+        verify_prediction(mlmodel)
+
+        mlmodel = ct.convert(float32_input_model_relu_ops,
+                             inputs=[ct.TensorType(shape=(10, 20), dtype=np.float16)],
+                             minimum_deployment_target=ct.target.macOS13
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["relu", "relu", "cast"])
+        assert_input_dtype(mlmodel, expected_type_str="fp16")
+        assert_output_dtype(mlmodel, expected_type_str="fp32")
+        verify_prediction(mlmodel)
+
+        mlmodel = ct.convert(int32_input_model,
+                             inputs=[ct.TensorType(shape=(10, 20), dtype=np.float16)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["add", "cast"])
+        assert_input_dtype(mlmodel, expected_type_str="fp16")
+        assert_output_dtype(mlmodel, expected_type_str="fp32")
+        verify_prediction(mlmodel)
+
+    def test_fp16_input_dtype_fp32_precision(self, float32_input_model_add_op, float32_input_model_relu_ops,
+                                             int32_input_model):
+        """
+        Same test as test_fp16_input_dtype, but with Float32 precision
+        """
+        mlmodel = ct.convert(float32_input_model_add_op,
+                             inputs=[ct.TensorType(shape=(10, 20), dtype=np.float16)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             compute_precision=ct.precision.FLOAT32,
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "add"])
+        assert_input_dtype(mlmodel, expected_type_str="fp16")
+        assert_output_dtype(mlmodel, expected_type_str="fp32")
+        verify_prediction(mlmodel)
+
+        # Although no FP16ComputePrecision is applied, the float16 input
+        # propagates through the network.
+        # NOTE(review): verify_prediction is not called for this case -- confirm intent.
+        mlmodel = ct.convert(float32_input_model_relu_ops,
+                             inputs=[ct.TensorType(shape=(10, 20), dtype=np.float16)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             compute_precision=ct.precision.FLOAT32,
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "relu", "relu"])
+        assert_input_dtype(mlmodel, expected_type_str="fp16")
+        assert_output_dtype(mlmodel, expected_type_str="fp32")
+
+    def test_input_name_specified_by_user(self, float32_input_model_relu_ops,
+                                          float32_two_input_model):
+        mlmodel = ct.convert(float32_input_model_relu_ops,
+                             inputs=[ct.TensorType(shape=(10, 20), name="my_custom_input_name")],
+                             minimum_deployment_target=ct.target.macOS12)
+        assert_input_dtype(mlmodel, expected_type_str="fp32", expected_name="my_custom_input_name")
+
+        mlmodel = ct.convert(float32_two_input_model,
+                             inputs=[ct.TensorType(shape=(10, 20), name="user_provided_name_1"),
+                                     ct.TensorType(shape=(10, 20), name="user_provided_name_2")],
+                             minimum_deployment_target=ct.target.macOS12)
+        assert_input_dtype(mlmodel, expected_type_str="fp32", expected_name="user_provided_name_1", index=0)
+        assert_input_dtype(mlmodel, expected_type_str="fp32", expected_name="user_provided_name_2", index=1)
+
+    def test_two_input_model(self, float32_two_input_model):
+        # test that error is raised if only 1 input is provided
+        with pytest.raises(ValueError):
+            ct.convert(float32_two_input_model,
+                       inputs=[ct.TensorType(shape=(10, 20), dtype=np.int32)],
+                       minimum_deployment_target=ct.target.macOS12)
+
+
+        # test forcing 1st input to type int32
+        mlmodel = ct.convert(float32_two_input_model,
+                             inputs=[ct.TensorType(shape=(10, 20), dtype=np.int32),
+                                     ct.TensorType(shape=(10, 20))],
+                             minimum_deployment_target=ct.target.macOS12)
+        assert_input_dtype(mlmodel, expected_type_str="int32", index=0)
+        assert_input_dtype(mlmodel, expected_type_str="fp32", index=1)
+        assert_output_dtype(mlmodel, expected_type_str="fp32")
+
+        # test forcing both inputs to be int32
+        mlmodel = ct.convert(float32_two_input_model,
+                             inputs=[ct.TensorType(shape=(10, 20), dtype=np.int32),
+                                     ct.TensorType(shape=(10, 20), dtype=np.int32),
+                                     ],
+                             minimum_deployment_target=ct.target.macOS12)
+        assert_input_dtype(mlmodel, expected_type_str="int32", index=0)
+        assert_input_dtype(mlmodel, expected_type_str="int32", index=1)
+        assert_output_dtype(mlmodel, expected_type_str="int32")
+
+        # test forcing both inputs to be float16
+        mlmodel = ct.convert(float32_two_input_model,
+                             inputs=[ct.TensorType(shape=(10, 20), dtype=np.float16),
+                                     ct.TensorType(shape=(10, 20), dtype=np.float16),
+                                     ],
+                             minimum_deployment_target=ct.target.macOS13)
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["add", "cast"])
+        assert_input_dtype(mlmodel, expected_type_str="fp16", index=0)
+        assert_input_dtype(mlmodel, expected_type_str="fp16", index=1)
+        assert_output_dtype(mlmodel, expected_type_str="fp32")
+        verify_prediction(mlmodel)
+
+    def test_output_name_specified_by_user(self, float32_input_model_relu_ops, float32_two_output_model):
+        mlmodel = ct.convert(float32_input_model_relu_ops,
+                             inputs=[ct.TensorType(shape=(10, 20), name="custom_input_name")],
+                             outputs=[ct.TensorType(name="custom_output_name")],
+                             minimum_deployment_target=ct.target.macOS12)
+        assert_input_dtype(mlmodel, expected_type_str="fp32", expected_name="custom_input_name")
+        assert_output_dtype(mlmodel, expected_type_str="fp32", expected_name="custom_output_name")
+
+        mlmodel = ct.convert(float32_two_output_model,
+                             inputs=[ct.TensorType(shape=(10, 20), name="custom_input_name")],
+                             outputs=[ct.TensorType(name="custom_output1_name"),
+                                      ct.TensorType(name="custom_output2_name")],
+                             minimum_deployment_target=ct.target.macOS12)
+        assert_input_dtype(mlmodel, expected_type_str="fp32", expected_name="custom_input_name")
+        assert_output_dtype(mlmodel, expected_type_str="fp32", expected_name="custom_output1_name", index=0)
+        assert_output_dtype(mlmodel, expected_type_str="fp32", expected_name="custom_output2_name", index=1)
+
+    def test_single_output_model(self, int32_input_model, float32_input_model_relu_ops):
+        # test output type: if not provided, it should be the default which is float32
+        mlmodel = ct.convert(int32_input_model,
+                             inputs=[ct.TensorType(shape=(10, 20))],
+                             minimum_deployment_target=ct.target.macOS12)
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "add", "cast"])
+        assert_input_dtype(mlmodel, expected_type_str="fp32")
+        assert_output_dtype(mlmodel, expected_type_str="fp32")
+
+        # test that the output dtype provided by the user is applied during conversion
+        mlmodel = ct.convert(float32_input_model_relu_ops,
+                             inputs=[ct.TensorType(shape=(10, 20))],
+                             outputs=[ct.TensorType(dtype=np.int32)],
+                             minimum_deployment_target=ct.target.macOS12)
+        assert_input_dtype(mlmodel, expected_type_str="fp32")
+        assert_output_dtype(mlmodel, expected_type_str="int32")
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "relu", "relu", "cast", "cast"])
+
+        # test that an error is raised when shape is provided for the output
+        with pytest.raises(ValueError):
+            mlmodel = ct.convert(int32_input_model,
+                                 inputs=[ct.TensorType(shape=(10, 20))],
+                                 outputs=[ct.TensorType(dtype=np.float32, shape=(10, 20))],
+                                 minimum_deployment_target=ct.target.macOS12)
+
+        # test that output dtype of float16 is rejected when deployment target is low
+        with pytest.raises(TypeError,
+                           match="float16 dtype for outputs is only supported for deployment target >= iOS16/macOS13"
+                           ):
+            ct.convert(float32_input_model_relu_ops,
+                       inputs=[ct.TensorType(shape=(10, 20))],
+                       outputs=[ct.TensorType(dtype=np.float16)],
+                       minimum_deployment_target=ct.target.macOS12,
+                       )
+
+        # test that output type float16 is applied correctly
+        mlmodel = ct.convert(float32_input_model_relu_ops,
+                             inputs=[ct.TensorType(shape=(10, 20))],
+                             outputs=[ct.TensorType(dtype=np.float16)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_output_dtype(mlmodel, expected_type_str="fp16")
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "relu", "relu"])
+
+        # test that input and output types float16 are applied correctly
+        mlmodel = ct.convert(float32_input_model_relu_ops,
+                             inputs=[ct.TensorType(shape=(10, 20), dtype=np.float16)],
+                             outputs=[ct.TensorType(dtype=np.float16)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_input_dtype(mlmodel, expected_type_str="fp16")
+        assert_output_dtype(mlmodel, expected_type_str="fp16")
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["relu", "relu"])
+        verify_prediction(mlmodel)
+
+    def test_multi_output_model(self, float32_two_output_model):
+        # check that error is raised when only 1 output provided
+        with pytest.raises(ValueError, match="Number of outputs provided, 1, "
+                                        "do not match the number of outputs detected in the model, 2"):
+            ct.convert(float32_two_output_model,
+                       inputs=[ct.TensorType(shape=(10, 20))],
+                       outputs=[ct.TensorType()],
+                       minimum_deployment_target=ct.target.macOS12)
+
+        # set 1 output to float16 and the other to float32
+        mlmodel = ct.convert(float32_two_output_model,
+                             inputs=[ct.TensorType(shape=(10, 20), dtype=np.float16)],
+                             outputs=[ct.TensorType(name="out1", dtype=np.float16),
+                                      ct.TensorType(name="out2", dtype=np.float32)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_cast_ops_count(mlmodel, expected_count=1)
+        assert_input_dtype(mlmodel, expected_type_str="fp16")
+        assert_output_dtype(mlmodel, expected_type_str="fp16", expected_name="out1" ,index=0)
+        assert_output_dtype(mlmodel, expected_type_str="fp32", expected_name="out2", index=1)
+        verify_prediction(mlmodel)
+
+    def test_color_input(self, rank4_input_model, rank3_input_model):
+        mlmodel = ct.convert(rank4_input_model,
+                             inputs=[ct.ImageType(shape=(1, 3, 10, 20), color_layout=ct.colorlayout.RGB)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "add", "cast"])
+        assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.RGB)
+        assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp32")
+        assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp32")
+        verify_prediction(mlmodel)
+
+        with pytest.raises(ValueError, match="must have rank 4"):
+            mlmodel = ct.convert(rank3_input_model,
+                                 inputs=[ct.ImageType(shape=(1, 10, 20), color_layout=ct.colorlayout.RGB)],
+                                 minimum_deployment_target=ct.target.macOS12,
+                                 )
+
+    def test_grayscale_input(self, rank4_input_model, rank3_input_model, rank4_grayscale_input_model):
+        with pytest.raises(ValueError, match="must have rank 4"):
+            mlmodel = ct.convert(rank3_input_model,
+                                 inputs=[ct.ImageType(shape=(1, 10, 20), color_layout=ct.colorlayout.GRAYSCALE)],
+                                 minimum_deployment_target=ct.target.macOS13,
+                                 )
+
+        # invalid shape
+        with pytest.raises(ValueError):
+            mlmodel = ct.convert(rank4_input_model,
+                                 inputs=[ct.ImageType(shape=(1, 3, 10, 20), color_layout=ct.colorlayout.GRAYSCALE)],
+                                 minimum_deployment_target=ct.target.macOS13,
+                                 )
+
+        mlmodel = ct.convert(rank4_grayscale_input_model,
+                             inputs=[ct.ImageType(shape=(1, 1, 10, 20), color_layout=ct.colorlayout.GRAYSCALE)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "add", "cast"])
+        assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE)
+        assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp32")
+        assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp32")
+        verify_prediction(mlmodel)
+
+        with pytest.raises(TypeError, match="float16 dtype for inputs is only supported for deployment target >= iOS16/macOS13"):
+            mlmodel = ct.convert(rank4_grayscale_input_model,
+                                 inputs=[ct.ImageType(shape=(1, 1, 10, 20),
+                                                      color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)],
+                                 minimum_deployment_target=ct.target.macOS12,
+                                 )
+
+        # test that grayscale_16 raises error when used with neural network
+        with pytest.raises(TypeError, match="float16 dtype for inputs is only supported for deployment target >= iOS16/macOS13"):
+            mlmodel = ct.convert(rank4_grayscale_input_model,
+                                 inputs=[ct.ImageType(shape=(1, 1, 10, 20),
+                                                      color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)],
+                                 )
+
+        mlmodel = ct.convert(rank4_grayscale_input_model,
+                             inputs=[ct.ImageType(shape=(1, 1, 10, 20),
+                                                  color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)],
+                             outputs=[ct.TensorType(dtype=np.float16)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["add"])
+        assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE_FLOAT16)
+        assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp16")
+        assert_output_dtype(mlmodel, expected_type_str="fp16")
+        # TODO: uncomment the following when rdar://92239179 is fixed
+        # verify_prediction(mlmodel)
+
+    def test_color_output(self, rank4_input_model, float32_input_model_add_op):
+        # check that an error is raised if the output shape is not of form (1, 3, H, W)
+        with pytest.raises(ValueError, match="must have rank 4. Instead it has rank 2"):
+            ct.convert(float32_input_model_add_op,
+                       inputs=[ct.TensorType(shape=(10, 20))],
+                       outputs=[ct.ImageType(color_layout=ct.colorlayout.RGB)],
+                       minimum_deployment_target=ct.target.macOS13)
+
+        mlmodel = ct.convert(rank4_input_model,
+                             inputs=[ct.ImageType(shape=(1, 3, 10, 20),
+                                                  color_layout=ct.colorlayout.BGR)],
+                             outputs=[ct.ImageType(color_layout=ct.colorlayout.RGB)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "add", "cast"])
+        assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.BGR)
+        assert_spec_output_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.RGB)
+        assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp32")
+        assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp32")
+        verify_prediction(mlmodel)
+
+        # check neural network conversion
+        mlmodel = ct.convert(rank4_input_model,
+                             inputs=[ct.ImageType(shape=(1, 3, 10, 20),
+                                                  color_layout=ct.colorlayout.RGB)],
+                             outputs=[ct.ImageType(color_layout=ct.colorlayout.BGR)],
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["add"])
+        assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.RGB)
+        assert_spec_output_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.BGR)
+        verify_prediction(mlmodel)
+
+    def test_grayscale_output(self, rank4_grayscale_input_model):
+        with pytest.raises(TypeError, match="float16 dtype for outputs is only supported for deployment target >= iOS16/macOS13"):
+            mlmodel = ct.convert(rank4_grayscale_input_model,
+                                 inputs=[ct.TensorType(shape=(1, 1, 10, 20))],
+                                 outputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)],
+                                 minimum_deployment_target=ct.target.macOS12,
+                                 )
+
+        mlmodel = ct.convert(rank4_grayscale_input_model,
+                             inputs=[ct.ImageType(shape=(1, 1, 10, 20),
+                                                  color_layout=ct.colorlayout.GRAYSCALE)],
+                             outputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)],
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["add"])
+        assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE)
+        assert_spec_output_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE)
+        verify_prediction(mlmodel)
+
+        mlmodel = ct.convert(rank4_grayscale_input_model,
+                             inputs=[ct.ImageType(shape=(1, 1, 10, 20),
+                                                  color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)],
+                             outputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["add"])
+        assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE_FLOAT16)
+        assert_spec_output_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE_FLOAT16)
+        assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp16")
+        assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp16")
+        # TODO: uncomment the following when rdar://92239179 is fixed
+        # verify_prediction(mlmodel)
+
+        mlmodel = ct.convert(rank4_grayscale_input_model,
+                             inputs=[ct.ImageType(shape=(1, 1, 10, 20),
+                                                  color_layout=ct.colorlayout.GRAYSCALE)],
+                             outputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "add"])
+        assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE)
+        assert_spec_output_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE_FLOAT16)
+        assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp32")
+        assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp16")
+        # TODO: uncomment the following when rdar://92239179 is fixed
+        # verify_prediction(mlmodel)
+
+    def test_linear_model(self, linear_model):
+        # this will test the fuse_linear_bias pass, when the inputs are of type float16
+        mlmodel = ct.convert(linear_model,
+                             inputs=[ct.TensorType(shape=(1, 10), dtype=np.float16)],
+                             outputs=[ct.TensorType(dtype=np.float16)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             )
+        assert_input_dtype(mlmodel, expected_type_str="fp16")
+        assert_output_dtype(mlmodel, expected_type_str="fp16")
+        assert_ops_in_mil_program(mlmodel, ["linear", "relu"])
+        verify_prediction(mlmodel)
+
+
+    def test_classifier(self):
+        torch_model = torch.nn.ReLU().eval()
+        traced_model = torch.jit.trace(torch_model, torch.rand(3,))
+        model = ct.convert(
+            traced_model,
+            inputs=[ct.TensorType(shape=(3,), dtype=np.float16)],
+            outputs=[ct.TensorType(dtype=np.float16)],
+            classifier_config = ct.ClassifierConfig(['a', 'b', 'c']),
+            convert_to='mlprogram',
+            minimum_deployment_target=ct.target.macOS13,
+        )
+        assert_input_dtype(model, expected_type_str="fp16")
+        assert_ops_in_mil_program(model, ["relu", "cast", "classify"])
+        spec = model.get_spec()
+        input_name = spec.description.input[0].name
+        out_dict = model.predict({input_name : np.array([1.0, 2.0, 3.0])})
+        assert 'classLabel' in out_dict
+        assert out_dict['classLabel'] == 'c'
+        assert len(spec.description.output) == 2
+        assert "classLabel_probs" in out_dict
+        assert isinstance(out_dict["classLabel_probs"], dict)
+
+    def test_prediction_with_fp16_io(self):
+        torch_model = torch.nn.Linear(30, 5).eval()
+        traced_model = torch.jit.trace(torch_model, torch.rand(1, 30))
+        mlmodel = ct.convert(traced_model,
+                             inputs=[ct.TensorType(name="input", shape=(1, 30), dtype=np.float32)],
+                             outputs=[ct.TensorType(dtype=np.float32)],
+                             minimum_deployment_target=ct.target.macOS13,
+                             compute_units=ct.ComputeUnit.CPU_ONLY,
+                             )
+        # test prediction. NOTE(review): the test name says fp16 I/O, but both dtypes above are np.float32 -- confirm intent
+        sample_input = np.random.rand(1, 30).astype(np.float32) * 10
+        model_output = mlmodel.predict({"input": sample_input})[mlmodel._spec.description.output[0].name]
+        reference_output = traced_model(torch.from_numpy(sample_input)).detach().numpy()
+        np.testing.assert_allclose(reference_output, model_output, rtol=1e-2, atol=1e-2)
+
diff --git a/coremltools/converters/mil/frontend/torch/test/test_internal_graph.py b/coremltools/converters/mil/frontend/torch/test/test_internal_graph.py
index 497d581d7..1f7332ccf 100644
--- a/coremltools/converters/mil/frontend/torch/test/test_internal_graph.py
+++ b/coremltools/converters/mil/frontend/torch/test/test_internal_graph.py
@@ -1724,10 +1724,8 @@ def test_max(self, context, input_size, dim, keepdim):
         node = InternalTorchIRNode(
             kind="max", inputs=input_list, outputs=["out1", "out2"],
         )
-        ssa = self._construct_test_graph(context, ops.max, node, constants=constants)
-        index_result = context["out1"].val
-        max_result = context["out2"].val
-        expected_index, expected_max = torch.max(test_input, dim=dim, keepdim=keepdim)
+        self._construct_test_graph(context, ops.max, node, constants=constants)
+        torch.max(test_input, dim=dim, keepdim=keepdim)
 
     @pytest.mark.parametrize(
         "input_size, dim, descending",
diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
index 769a6cbc7..11535f371 100644
--- a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
+++ b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
@@ -3,8 +3,9 @@
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
-import sys
 import itertools
+import sys
+
 import numpy as np
 import pytest
 import torch.nn as nn
@@ -15,13 +16,18 @@
     ModuleWrapper,
     TorchBaseTest
 )
-from coremltools import RangeDim
-from coremltools.models.utils import _python_version
-from coremltools.models.utils import _macos_version
-from coremltools.converters.mil import testing_reqs
-
-from coremltools import TensorType
+import coremltools as ct
+from coremltools import (
+    RangeDim,
+    Shape,
+    TensorType
+)
 from coremltools._deps import version_lt
+from coremltools.converters.mil import testing_reqs
+from coremltools.models.utils import (
+    _macos_version,
+    _python_version
+)
 
 
 backends = testing_reqs.backends
@@ -36,15 +42,33 @@
 
 class TestScriptedModels(TorchBaseTest):
 
-    @pytest.mark.parametrize(
-        "use_cpu_for_conversion, backend", itertools.product([True, False], backends)
-    )
-    def test_cond(self, use_cpu_for_conversion, backend):
-        if backend[0] == "mlprogram":
-            pytest.skip("rdar://81169758 (Cond tests hang on mlprogram backend)")
-        if backend[0] == "mlprogram" and not use_cpu_for_conversion:
-            pytest.xfail("rdar://78343191 ((MIL GPU) Core ML Tools Unit Test failures [failure to load or Seg fault])")
+    @staticmethod
+    def get_while_loop_model():
+        class TestLayer(nn.Module):
+            def __init__(self):
+                super(TestLayer, self).__init__()
+
+            def forward(self, x):
+                x = 0.5 * x
+                return x
+
+        class TestNet(nn.Module):
+            input_size = (1,)
+
+            def __init__(self):
+                super(TestNet, self).__init__()
+                layer = TestLayer()
+                self.layer = torch.jit.trace(layer, torch.rand(self.input_size))
+
+            def forward(self, x):
+                while x > 0.01:
+                    x = self.layer(x)
+                return x
 
+        return TestNet().eval()
+
+    @staticmethod
+    def get_cond_model():
         class TestNet(nn.Module):
             def forward(self, x):
                 if torch.squeeze(x) < 10.:
@@ -52,7 +76,18 @@ def forward(self, x):
                 else:
                     return x*2.
 
-        torch_model = TestNet().eval()
+        return TestNet().eval()
+
+    @pytest.mark.parametrize("backend", backends)
+    def test_while_loop(self, backend):
+        model = TestScriptedModels.get_while_loop_model()
+        self.run_compare_torch(model.input_size, model, backend=backend, use_scripting=True)
+
+    @pytest.mark.parametrize(
+        "use_cpu_for_conversion, backend", itertools.product([True, False], backends)
+    )
+    def test_cond(self, use_cpu_for_conversion, backend):
+        torch_model = TestScriptedModels.get_cond_model()
 
         self.run_compare_torch(torch.tensor([1.]), torch_model,
             input_as_shape=False, backend=backend,
@@ -88,38 +123,8 @@ def forward(self, x):
         
         self.run_compare_torch(model.input_size, model, backend=backend, use_scripting=True)
 
-    @pytest.mark.parametrize("backend", backends)
-    def test_while_loop(self, backend):
-        class TestLayer(nn.Module):
-            def __init__(self):
-                super(TestLayer, self).__init__()
-
-            def forward(self, x):
-                x = 0.5 * x
-                return x
-
-        class TestNet(nn.Module):
-            input_size = (1,)
-
-            def __init__(self):
-                super(TestNet, self).__init__()
-                layer = TestLayer()
-                self.layer = torch.jit.trace(layer, torch.rand(self.input_size))
-
-            def forward(self, x):
-                while x > 0.01:
-                    x = self.layer(x)
-                return x
-
-        model = TestNet().eval()
-
-        self.run_compare_torch(model.input_size, model, backend=backend, use_scripting=True)
-
     @pytest.mark.parametrize("backend", backends)
     def test_if(self, backend):
-        if backend[0] == 'mlprogram':
-            pytest.xfail("Not supported on ML Program backend")
-
         class TestLayer(nn.Module):
             def __init__(self):
                 super(TestLayer, self).__init__()
@@ -182,6 +187,27 @@ def test_conv(self, backend):
         )
 
 
+class TestMean(TorchBaseTest):
+    @pytest.mark.parametrize(
+        "backend",
+        backends,
+    )
+    def test_with_flexible_shape(self, backend):
+        class Model(nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                return torch.mean(x, dim=(2, 3), keepdim=True)
+
+        model = Model()
+        shape = (1, 3, 256, 256)
+        converter_input_type = [TensorType(shape=Shape(shape=[1, 3, RangeDim(), RangeDim()],
+                                                       default=shape))]
+        self.run_compare_torch(shape, model, backend=backend,
+                               converter_input_type=converter_input_type)
+
+
 class TestAffineGrid(TorchBaseTest):
     @pytest.mark.parametrize(
         "backend, x_shape_and_target_size, "
@@ -1013,7 +1039,143 @@ def test_convolution_transpose3d(
         self.run_compare_torch((1, in_channels, depth, height, width), model,
                            backend=backend)
 
+def _is_float_value(x, threshold=0.001):  # True when the fractional part of x exceeds threshold
+    return x - np.floor(x) > threshold
+
 class TestUpsample(TorchBaseTest):
+    @pytest.mark.parametrize(
+        "output_size, align_corners, backend",
+        itertools.product(
+            [1, 3, 10, 190],
+            [True, False],
+            backends,
+        )
+    )
+    def test_upsample_linear1d_with_output_size(
+        self, output_size, align_corners, backend
+    ):
+        """Linear 1-D interpolate to each parametrized ``output_size`` must match torch."""
+        input_shape = (1, 3, 10)
+        model = ModuleWrapper(
+            nn.functional.interpolate,
+            {"size": output_size, "mode": "linear", "align_corners": align_corners,},
+        )
+        self.run_compare_torch(input_shape, model, backend=backend)
+
+    @pytest.mark.parametrize(
+        "scale, align_corners, recompute_scale_factor, backend",
+        itertools.product(
+            [2, 0.5, 5.3], [True, False], [True, False], backends
+        )
+    )
+    def test_upsample_linear1d_with_scales(
+        self, scale, align_corners, recompute_scale_factor, backend
+    ):
+        Height = 8
+        input_shape = (1, 3, Height)
+        output_h = Height * scale
+        is_h_float = _is_float_value(output_h)
+        
+        if is_h_float and not align_corners and not recompute_scale_factor:
+            pytest.xfail("rdar://81124053 (Support recompute_scale_factor)")
+
+        model = ModuleWrapper(
+            nn.functional.interpolate,
+            {
+                "scale_factor": scale,
+                "mode": "linear",
+                "align_corners": align_corners,
+                "recompute_scale_factor": recompute_scale_factor,
+            },
+        )
+        self.run_compare_torch(input_shape, model, backend=backend)
+
+    @pytest.mark.parametrize(
+        "scales, align_corners, recompute_scale_factor, backend",
+        itertools.product(
+           [2, 0.7, 3.6], [True, False], [True, False], backends
+        )
+    )
+    def test_upsample_linear1d_with_scales_dynamic(
+        self, scales, align_corners, recompute_scale_factor, backend
+    ):
+
+        is_float = _is_float_value(scales)
+        input_shape = (1, 3, 22)
+
+        if is_float and not align_corners and not recompute_scale_factor:
+            pytest.xfail("rdar://81124053 (Support recompute_scale_factor)")
+
+        model = ModuleWrapper(
+            nn.functional.interpolate,
+            {
+                "scale_factor": scales,
+                "mode": "linear",
+                "align_corners": align_corners,
+                "recompute_scale_factor": recompute_scale_factor,
+            },
+        )
+        converter_input_type = [TensorType(shape=(1, 3, RangeDim(default=22)), dtype=np.float32)]
+        mlmodel = self.run_compare_torch(input_shape, model,
+                               backend=backend,
+                               converter_input_type=converter_input_type)[1]
+
+        # Also check that the scale factors are integers
+        if backend[0] == 'neuralnetwork' and not is_float:
+            for layer in mlmodel._spec.neuralNetwork.layers:
+                if layer.WhichOneof('layer') == "upsample":
+                    assert len(layer.upsample.fractionalScalingFactor) == 0
+
+    @pytest.mark.parametrize(
+        "output_size, backend",
+        itertools.product(
+            [10, 170], backends
+        )
+    )
+    def test_upsample_nearest1d_with_output_size(self, output_size, backend):
+        input_shape = (1, 3, 10)
+        model = ModuleWrapper(
+            nn.functional.interpolate, {"size": output_size, "mode": "nearest"},
+        )
+        self.run_compare_torch(input_shape, model, backend=backend)
+
+    @pytest.mark.parametrize(
+        "scales, backend",
+        itertools.product([2, 3, 4.5], backends),
+    )
+    def test_upsample_nearest1d_with_scales(self, scales, backend):
+        if backend[0] == "neuralnetwork" and isinstance(scales, float):
+            # Report the case as skipped instead of letting it silently pass.
+            pytest.skip("Fractional scale factors are not tested for neuralnetwork")
+
+        input_shape = (1, 3, 10)
+        model = ModuleWrapper(
+            nn.functional.interpolate,
+            {"scale_factor": scales, "mode": "nearest"},
+        )
+        self.run_compare_torch(input_shape, model, backend=backend)
+
+    @pytest.mark.parametrize(
+        "scales, backend",
+        itertools.product([2, 3], backends),
+    )
+    def test_upsample_nearest1d_with_scales_dynamic(self, scales, backend):
+        input_shape = (1, 3, 10)
+        model = ModuleWrapper(
+            nn.functional.interpolate,
+            {"scale_factor": scales, "mode": "nearest", "recompute_scale_factor": True,},
+        )
+        converter_input_type = [TensorType(shape=(1, 3, RangeDim()), dtype=np.float32)]
+        mlmodel = self.run_compare_torch(input_shape, model,
+                               backend=backend,
+                               converter_input_type=converter_input_type)[1]
+
+        # Also check that the scale factors are integers
+        if backend[0] == 'neuralnetwork':
+            for layer in mlmodel._spec.neuralNetwork.layers:
+                if layer.WhichOneof('layer') == "upsample":
+                    assert len(layer.upsample.fractionalScalingFactor) == 0
+
     @pytest.mark.parametrize(
         "output_size, align_corners, backend",
         itertools.product(
@@ -1045,8 +1207,6 @@ def test_upsample_bilinear2d_with_output_size(
     def test_upsample_bilinear2d_with_scales(
         self, scales_h, scales_w, align_corners, recompute_scale_factor, backend
     ):
-        def _is_float_value(x, threshold=0.001):
-            return x - np.floor(x) > threshold
 
         Height = 8
         Width = 22
@@ -1130,8 +1290,6 @@ def test_upsample_nearest2d_with_scales_dynamic(self, scales_h, scales_w, backen
     def test_upsample_bilinear2d_with_scales_dynamic(
         self, scales_h, scales_w, align_corners, recompute_scale_factor, backend
     ):
-        def _is_float_value(x, threshold=0.001):
-            return x - np.floor(x) > threshold
 
         is_h_float = _is_float_value(scales_h)
         is_w_float = _is_float_value(scales_w)
@@ -1161,6 +1319,26 @@ def _is_float_value(x, threshold=0.001):
                     assert len(layer.upsample.fractionalScalingFactor) == 0
 
 
+class TestEmptyLike(TorchBaseTest):
+    @pytest.mark.parametrize(
+        "shape, backend",
+        itertools.product(
+            COMMON_SHAPES,
+            backends
+        )
+    )
+    def test_empty_like(self, backend, shape):
+        class TestModel(nn.Module):
+            def __init__(self):
+                super(TestModel, self).__init__()
+
+            def forward(self, x):
+                y = torch.empty_like(x)
+                return torch.Tensor([len(y)])
+
+        self.run_compare_torch(shape, TestModel(), backend=backend)
+
+
 class TestAvgPool(TorchBaseTest):
 
     @pytest.mark.parametrize(
@@ -2043,7 +2221,7 @@ class TestTypeAs(TorchBaseTest):
     @pytest.mark.parametrize("backend, type",
         itertools.product(
             backends,
-            ["int32", "float16", "float32", "bool"]
+            ["int32", "float32", "bool"]
             )
         )
     def test_type_as(self, backend, type):
@@ -2241,7 +2419,6 @@ def forward(self, x):
         ),
     )
     def test_linspace_dynamic(self, backend, start_end, steps):
-        input_shape = tuple([steps])
         start, end = start_end
         class Model(nn.Module):
             def __init__(self):
@@ -2296,7 +2473,6 @@ def forward(self, x):
         ),
     )
     def test_arange_dynamic(self, backend, start_end_step):
-        input_shape = tuple([1,])
         start, end, step = start_end_step
         class Model(nn.Module):
             def __init__(self):
@@ -2309,6 +2485,7 @@ def forward(self, x):
         inputs = [torch.tensor([start, end, step])]
         self.run_compare_torch(inputs, model, backend=backend, input_as_shape=False)
 
+
 class TestEinsum(TorchBaseTest):
     @pytest.mark.parametrize(
         "backend, equation, reverse_input_order",
@@ -2470,17 +2647,23 @@ def test_relu6(self, backend, shape):
         )
 
     @pytest.mark.parametrize(
-        "backend, alpha, shape", 
+        "backend, alpha, shape, single_alpha", 
         itertools.product(
             backends, 
             [0.25, 2.0],
-            [(1, 5, 6, 7), (1, 128)],
+            [(3,), (2, 6), (2, 3, 4), (2, 5, 6, 7), (2, 3, 4, 5, 6)],
+            [True, False],
         ),
     )
-    def test_prelu(self, backend, alpha, shape):
+    def test_prelu(self, backend, alpha, shape, single_alpha):
+        if (backend[0] == "mlprogram" and backend[1] == "fp16"):
+            pytest.xfail("rdar://92175249 ([MIL] TestActivation::test_prelu[backend=(mlprogram, fp16)] CI failure)")
+
         input_shape = shape
-        C = input_shape[1] if len(input_shape) >= 3 else 1
-        model = nn.PReLU(C, alpha).eval()
+        num_parameters = input_shape[1] if len(input_shape) >= 2 else 1
+        if single_alpha:
+            num_parameters = 1
+        model = nn.PReLU(num_parameters, alpha).eval()
         self.run_compare_torch(
             input_shape, model, backend=backend,
         )
@@ -2730,9 +2913,16 @@ def test_clamp(self, backend, shape, clamp_range):
     )
     def test_threshold(self, backend, shape, threshold):
         model = torch.nn.Threshold(threshold[0], threshold[1]).eval()
+        input_value = torch.rand(np.prod(shape))
+        # make sure the values are not too close to the threshold
+        for i in range(len(input_value)):
+            if abs(input_value[i]-threshold[0]) < 0.005:
+                input_value[i] += 0.05
+        input_value = torch.reshape(input_value, shape)
         self.run_compare_torch(
-            shape, model, backend=backend,
+            input_value, model, backend=backend,
             use_cpu_for_conversion=True, # TODO: change this to False (rdar://78343191)
+            input_as_shape=False,
         )
 
     @pytest.mark.parametrize(
@@ -3965,13 +4155,13 @@ def __init__(self):
 
             def forward(self, x):
                 if len(shape) == 3:
-                    index_1 = torch.tensor([0,0,0,0,0,0,0,0]).view(2,4)
-                    index_2 = torch.tensor([1,0,0,2,2,1,1,1]).view(2,4)
-                    return x[index_1, :,index_2]
+                    index_1 = torch.tensor([0, 0, 0, 0, 0, 0, 0, 0]).view(2, 4)
+                    index_2 = torch.tensor([1, 0, 0, 2, 2, 1, 1, 1]).view(2, 4)
+                    return x[index_1, :, index_2]
 
                 elif len(shape) == 4:
-                    index_1 = torch.tensor([0,1,1,1,1,1,0,0]).view(4,2)
-                    index_2 = torch.tensor([0,1,2,3,4,0,1,2]).view(4,2)
+                    index_1 = torch.tensor([0, 1, 1, 1, 1, 1, 0, 0]).view(4,2)
+                    index_2 = torch.tensor([0, 1, 2, 3, 4, 0, 1, 2]).view(4,2)
                     return x[index_1, :, :, index_2]
 
         model = IndexModel()
@@ -4035,6 +4225,7 @@ def forward(self, x):
             shape, model, backend=backend,
         )
 
+
 class TestPad(TorchBaseTest):
     @pytest.mark.parametrize(
         "backend, rank, mode",
@@ -4094,6 +4285,7 @@ def test_constant_pad_3d(self, backend):
         model = torch.nn.ConstantPad3d((5, 6, 3, 8, 2, 4), 3.5).eval()
         self.run_compare_torch(input_shape, model, backend=backend)
 
+
 class TestMeshgrid(TorchBaseTest):
     @pytest.mark.parametrize(
         "rows, cols, dtype, inp_mode, backend",
@@ -4131,7 +4323,8 @@ def forward(self, rows, cols):
             inputs, model, expected_results, input_as_shape=False, backend=backend,
         )
 
-class TestSacatterAdd(TorchBaseTest):
+
+class TestScatter(TorchBaseTest):
     @pytest.mark.parametrize(
         "shapes_dims, backend",
         itertools.product(
@@ -4143,20 +4336,80 @@ class TestSacatterAdd(TorchBaseTest):
             backends
         ),
     )
-    def test_scatter_add(self, shapes_dims, backend):
+    def test_scatter(self, shapes_dims, backend):
+        class TestModel(nn.Module):
+            def __init__(self, dim, shapes):
+                super(TestModel, self).__init__()
+                self.dim = dim
+                self.source = torch.rand(*(shapes))
+                self.index = torch.randint(0, shapes[dim], size=shapes)
+                
+            def forward(self, x):
+                return x.scatter_(self.dim, self.index, self.source)
+
         shapes, dims = shapes_dims
         for dim in dims:
+            m = TestModel(dim, shapes)  # scatter along each parametrized dim, not always axis 0
+            self.run_compare_torch(shapes, m, backend=backend)
 
-            class TestModel(nn.Module):
-                def __init__(self):
-                    super(TestModel, self).__init__()
-                    self.source = torch.rand(*(shapes))
-                    self.index = torch.randint(0, shapes[dim], size=shapes)
 
-                def forward(self, x):
-                    return x.scatter_add_(dim, self.index, self.source)
+    @pytest.mark.parametrize(
+        "shapes_dims, mode, backend",
+        itertools.product(
+            [
+                [(10,), (0, -1)],
+                [(2, 3), (1, -1)],
+                [(2, 3, 4, 5), (0, -2)],
+            ],
+            ['add', 'multiply'],
+            backends
+        ),
+    )
+    def test_scatter_with_reduce(self, shapes_dims, mode, backend):
+        class TestModel(nn.Module):
+            def __init__(self, dim, shapes, mode):
+                super(TestModel, self).__init__()
+                self.dim = dim
+                self.mode = mode
+                self.source = torch.rand(*(shapes))
+                self.index = torch.randint(0, shapes[dim], size=shapes)
+
+            def forward(self, x):
+                return x.scatter_(self.dim, self.index, self.source, reduce=self.mode)
+
+        shapes, dims = shapes_dims
+        for dim in dims:
+            m = TestModel(dim, shapes, mode)  # scatter along each parametrized dim, not always axis 0
+            self.run_compare_torch(shapes, m, backend=backend)
+            
+
+    @pytest.mark.parametrize(
+        "shapes_dims, backend",
+        itertools.product(
+            [
+                [(10,), (0, -1)],
+                [(2, 3), (1, -1)],
+                [(2, 3, 4, 5), (0, -2)],
+            ],
+            backends
+        ),
+    )
+    def test_scatter_add(self, shapes_dims, backend):
+        class TestModel(nn.Module):
+            def __init__(self, dim, shapes):
+                super(TestModel, self).__init__()
+                self.dim = dim
+                self.source = torch.rand(*(shapes))
+                self.index = torch.randint(0, shapes[dim], size=shapes)
+                
+            def forward(self, x):
+                return x.scatter_add_(self.dim, self.index, self.source)
+
+        shapes, dims = shapes_dims
+        for dim in dims:
+            m = TestModel(dim, shapes)
+            self.run_compare_torch(shapes, m, backend=backend)
 
-            self.run_compare_torch(shapes, TestModel().eval(), backend=backend)
 
 class TestBroadcastTensors(TorchBaseTest):
     @pytest.mark.parametrize(
@@ -4215,3 +4468,43 @@ def __init__(self):
             def forward(self, a, b, c, d):
                 return torch.broadcast_tensors(a, b, c, d)
         self.run_compare_torch(shapes, TestModel().eval(), backend=backend)
+
+
+class TestEmbedding(TorchBaseTest):
+    @pytest.mark.parametrize(
+        "backend, input_dtype",
+        itertools.product(
+            backends,
+            [np.int32, np.float32],
+        )
+    )
+    def test_embedding(self, backend, input_dtype):
+        num_embeddings = 4
+        embedding_size = 10
+        B = 2
+        dim = 5
+        converter_input_type = [TensorType(shape=(B, dim), dtype=input_dtype)]
+
+        # input shape: (B, dim)
+        # output shape : (B, dim, embedding_size)
+        # shape of weights : (num_embeddings, embedding_size)
+        class EmbeddingModel(nn.Module):
+            def __init__(self):
+                super(EmbeddingModel, self).__init__()
+                self.embedding = torch.nn.Embedding(num_embeddings, embedding_size)
+
+            def forward(self, x):
+                return self.embedding(x)
+
+        input_data = np.random.randint(low=0, high=num_embeddings, size=(B, dim))
+        input_data = torch.from_numpy(input_data)
+        model = EmbeddingModel()
+        expected_results = model(input_data)
+        self.run_compare_torch(
+            input_data,
+            model,
+            expected_results=expected_results,
+            input_as_shape=False,
+            backend=backend,
+            converter_input_type=converter_input_type,
+        )
diff --git a/coremltools/converters/mil/frontend/torch/test/testing_utils.py b/coremltools/converters/mil/frontend/torch/test/testing_utils.py
index 0636c9569..88b8c5b3f 100644
--- a/coremltools/converters/mil/frontend/torch/test/testing_utils.py
+++ b/coremltools/converters/mil/frontend/torch/test/testing_utils.py
@@ -9,7 +9,7 @@
 import torch.nn as nn
 
 from ..converter import torch_to_mil_types
-from coremltools import TensorType, RangeDim
+from coremltools import ComputeUnit, TensorType, RangeDim
 from coremltools._deps import _IS_MACOS
 import coremltools.models.utils as coremltoolsutils
 from coremltools.converters.mil.mil.types.type_mapping import nptype_from_builtin
@@ -33,9 +33,9 @@ def forward(self, *args):
 np.random.seed(1984)
 
 
-def _flatten(object):
+def _flatten(objects):
     flattened_list = []
-    for item in object:
+    for item in objects:
         if isinstance(item, (list, tuple)):
             flattened_list.extend(_flatten(item))
         else:
@@ -71,7 +71,7 @@ def convert_to_coreml_inputs(input_description, inputs):
 
 
 def convert_to_mlmodel(model_spec, tensor_inputs, backend=("neuralnetwork", "fp32"),
-                       converter_input_type=None,use_cpu_for_conversion=False):
+                       converter_input_type=None, use_cpu_for_conversion=False):
     def _convert_to_inputtype(inputs):
         if isinstance(inputs, list):
             return [_convert_to_inputtype(x) for x in inputs]
@@ -90,8 +90,14 @@ def _convert_to_inputtype(inputs):
         inputs = list(_convert_to_inputtype(tensor_inputs))
     else:
         inputs = converter_input_type
+
+    if use_cpu_for_conversion:
+        compute_unit = ComputeUnit.CPU_ONLY
+    else:
+        compute_unit = ComputeUnit.ALL
+        
     return ct_convert(model_spec, inputs=inputs, convert_to=backend,
-                      source="pytorch", useCPUOnly=use_cpu_for_conversion)
+                      source="pytorch", compute_units=compute_unit)
 
 
 def generate_input_data(input_size, rand_range=(0, 1)):
@@ -137,7 +143,7 @@ def convert_and_compare(input_data, model_spec,
                         expected_results=None, atol=1e-4,
                         backend=("neuralnetwork", "fp32"),
                         converter_input_type=None,
-                        use_cpu_for_conversion=False):
+                        use_cpu_for_conversion=True):
     """
     If expected results is not set, it will by default
     be set to the flattened output of the torch model.
@@ -147,7 +153,6 @@ def convert_and_compare(input_data, model_spec,
     - input_data: torch.tensor or list[torch.tensor]
     - use_cpu_for_conversion: bool
         Argument which is passed as is to the unified converter API.
-        That is, "ct.convert(...., useCPUOnly=use_cpu_for_conversion)"
         It forces the model to be loaded on the CPU context, post conversion.
     """
     if isinstance(model_spec, str):
@@ -175,8 +180,8 @@ def convert_and_compare(input_data, model_spec,
     if dtype == "fp16":
         atol = max(atol * 100.0, 5e-1)
 
-    if not coremltoolsutils._has_custom_layer(mlmodel.get_spec()):
-        coreml_results = mlmodel.predict(coreml_inputs, useCPUOnly=True)
+    if not coremltoolsutils._has_custom_layer(mlmodel._spec):
+        coreml_results = mlmodel.predict(coreml_inputs)
         sorted_coreml_results = [
             coreml_results[key] for key in sorted(coreml_results.keys())
         ]
@@ -190,7 +195,7 @@ def convert_and_compare(input_data, model_spec,
     return model_spec, mlmodel, coreml_inputs, coreml_results
 
 
-class TorchBaseTest(object):
+class TorchBaseTest:
     testclassname = ''
     testmodelname = ''
 
@@ -199,16 +204,13 @@ def store_testname_with_args(self, request):
         TorchBaseTest.testclassname = type(self).__name__
         TorchBaseTest.testmodelname = request.node.name
 
-    def teardown_method(self, method):
-        pass
-
     @staticmethod
     def run_compare_torch(
             input_data, model, expected_results=None, places=5,
             input_as_shape=True, backend=("neuralnetwork", "fp32"),
             rand_range=(0.0, 1.0), use_scripting=False,
             converter_input_type=None,
-            use_cpu_for_conversion=False,
+            use_cpu_for_conversion=True,
     ):
         """
         Traces a model and runs a numerical test.
diff --git a/coremltools/converters/mil/frontend/torch/torchir_passes.py b/coremltools/converters/mil/frontend/torch/torchir_passes.py
index de2af795a..00a565976 100644
--- a/coremltools/converters/mil/frontend/torch/torchir_passes.py
+++ b/coremltools/converters/mil/frontend/torch/torchir_passes.py
@@ -8,6 +8,7 @@
 
 from .internal_graph import InternalTorchIRNode, InternalTorchIRGraph
 
+
 def generate_tensor_assignment_ops(graph):
     """
     This graph pass handles inplace tensor assignements, specifically it handles:
@@ -88,7 +89,7 @@ def _construct_nodes_to_fuse_inputs(nodes_to_fuse):
         return inputs
 
     tensor_to_node_sequence_mapping = {}
-    updated_tensor_count = defaultdict(lambda : 0)
+    updated_tensor_count = defaultdict(lambda: 0)
 
     for i in range(len(graph.nodes)):
         node = graph.nodes[i]
diff --git a/coremltools/converters/mil/input_types.py b/coremltools/converters/mil/input_types.py
index b0d9138bd..c26a767af 100644
--- a/coremltools/converters/mil/input_types.py
+++ b/coremltools/converters/mil/input_types.py
@@ -3,6 +3,7 @@
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+from enum import Enum
 import numpy as np
 
 from coremltools.converters.mil.mil.types.symbolic import is_symbolic
@@ -12,8 +13,13 @@
     is_builtin,
 )
 
+class ColorLayout(Enum):
+    RGB = "RGB"
+    BGR = "BGR"
+    GRAYSCALE = "G"
+    GRAYSCALE_FLOAT16 = "G_FLOAT16"
 
-class ClassifierConfig(object):
+class ClassifierConfig:
     def __init__(
         self,
         class_labels,
@@ -48,8 +54,8 @@ def __init__(
         self.predicted_probabilities_output = predicted_probabilities_output
 
 
-class InputType(object):
-    def __init__(self, name=None, shape=None, dtype=types.fp32):
+class InputType:
+    def __init__(self, name=None, shape=None, dtype=None):
         """
         The Input Type for inputs fed into the model.
 
@@ -79,7 +85,7 @@ def __init__(
         shape=None,
         scale=1.0,
         bias=None,
-        color_layout="RGB",
+        color_layout=ColorLayout.RGB,
         channel_first=None,
     ):
         """
@@ -95,13 +101,20 @@ def __init__(
 
             If `color_layout` is ``'RGB'`` or ``'BGR'``, bias would be a list of ``float``.
 
-        color_layout: string
+        color_layout: string or ct.colorlayout enumeration member
             Color layout of the image.
 
             Valid values:
-                * ``'G'``: Grayscale
-                * ``'RGB'``: [Red, Green, Blue]
-                * ``'BGR'``: [Blue, Green, Red]
+            enumeration (recommended):
+                * ct.colorlayout.RGB
+                * ct.colorlayout.BGR
+                * ct.colorlayout.GRAYSCALE
+                * ct.colorlayout.GRAYSCALE_FLOAT16
+
+            string values (older way to specify):
+                * ``'G'``: Grayscale (maps to ct.colorlayout.GRAYSCALE)
+                * ``'RGB'``: [Red, Green, Blue] (maps to ct.colorlayout.RGB)
+                * ``'BGR'``: [Blue, Green, Red] (maps to ct.colorlayout.BGR)
 
         channel_first: (bool) or None
             Set to ``True`` if input format is channel first.
@@ -113,16 +126,24 @@ def __init__(
         """
         super(ImageType, self).__init__(name, shape)
         self.scale = scale
-        if color_layout not in ["G", "RGB", "BGR"]:
-            raise ValueError(
-                "color_layout should be one of ['G', 'RGB', 'BGR'], got '{}' instead".format(
-                    color_layout
-                )
-            )
-        self.color_layout = color_layout
+        msg = "color_layout should be an enum of type ct.colorlayout, i.e. one of: " \
+              "{ct.colorlayout.RGB, ct.colorlayout.BGR, " \
+              "ct.colorlayout.GRAYSCALE, ct.colorlayout.GRAYSCALE_FLOAT16}"
+        if not (isinstance(color_layout, str) or isinstance(color_layout, ColorLayout)):
+            raise ValueError(msg)
+        if isinstance(color_layout, str):
+            if color_layout not in ("G", "RGB", "BGR"):
+                raise ValueError(msg)
+            color_layout = ColorLayout(color_layout)
 
+        self.color_layout = color_layout
+        if color_layout == ColorLayout.GRAYSCALE_FLOAT16:
+            self.dtype = types.fp16
         if bias is None:
-            self.bias = 0.0 if color_layout == "G" else [0.0, 0.0, 0.0]
+            if color_layout in (ColorLayout.GRAYSCALE, ColorLayout.GRAYSCALE_FLOAT16):
+                self.bias = 0.0
+            else:
+                self.bias = [0.0, 0.0, 0.0]
         else:
             self.bias = bias
         self.channel_first = channel_first
@@ -134,12 +155,11 @@ def __str__(self):
         str_repr = 'ImageType[name={}, shape={}, scale={}, bias={}, ' +\
                 'color_layout={}, channel_first={}]'
         return str_repr.format(self.name, self.shape, self.scale, self.bias,
-                self.color_layout, self.channel_first)
+                               self.color_layout, self.channel_first)
 
 
 class TensorType(InputType):
-    def __init__(self, name=None, shape=None, dtype=None,
-        default_value=None):
+    def __init__(self, name=None, shape=None, dtype=None, default_value=None):
         """
         Specify a (dense) tensor input.
 
@@ -163,7 +183,7 @@ def __init__(self, name=None, shape=None, dtype=None,
               * The ``shape`` is required.
 
         dtype: np.generic or mil.type type
-            Numpy ``dtype`` (for example, ``np.int32``). Default is ``np.float32``.
+            For example, ``np.int32`` or ``coremltools.converters.mil.mil.types.fp32``
 
         default_value: np.ndarray
             If provided, the input is considered optional. At runtime, if the
@@ -187,16 +207,21 @@ def __init__(self, name=None, shape=None, dtype=None,
           dtype=ct.converters.mil.types.fp32)``
         """
         super(TensorType, self).__init__(name, shape)
-        if dtype is None:
-            self.dtype = types.fp32
-        elif is_builtin(dtype):
-            self.dtype = dtype
-        else:
-            # Assume dtype is numpy type
-            try:
-                self.dtype = numpy_type_to_builtin_type(dtype)
-            except TypeError:
-                raise TypeError("dtype={} is unsupported".format(dtype))
+        if dtype is not None:
+            if is_builtin(dtype):
+                self.dtype = dtype
+                if dtype not in (types.fp16, types.fp32, types.fp64, types.int32, types.int64, types.bool):
+                    raise TypeError("dtype={} is unsupported for inputs/outputs of the model".format(dtype))
+            else:
+                # Assume dtype is numpy type
+                try:
+                    self.dtype = numpy_type_to_builtin_type(dtype)
+                except TypeError:
+                    raise TypeError("dtype={} is unsupported".format(dtype))
+                # Builtins float/int/bool replace the np.float/np.int/np.bool aliases removed in NumPy 1.24
+                if dtype not in (np.float16, np.float32, np.float64, float,
+                                 np.int32, np.int64, int, bool, np.bool_):
+                    raise TypeError("dtype={} is unsupported for inputs/outputs of the model".format(dtype))
 
         if default_value is not None:
             if isinstance(shape, EnumeratedShapes):
@@ -217,11 +242,14 @@ def __init__(self, name=None, shape=None, dtype=None,
                     'TensorType.shape {}'
                 raise ValueError(msg.format(name, default_value.shape,
                     self.shape.to_list()))
-            if numpy_type_to_builtin_type(default_value.dtype) != self.dtype:
+            if self.dtype is not None and \
+                    numpy_type_to_builtin_type(default_value.dtype) != self.dtype:
                 msg = 'TensorType {} default_value dtype {} != ' +\
                     'TensorType.dtype {}'
                 raise ValueError(msg.format(name, default_value.dtype,
                     self.dtype.__type_info__()))
+            else:
+                self.dtype = numpy_type_to_builtin_type(default_value.dtype)
 
         self.default_value = default_value
 
@@ -230,10 +258,11 @@ def __repr__(self):
 
     def __str__(self):
         return 'TensorType[name={}, shape={}, dtype={}]'.format(self.name,
-                self.shape, self.dtype)
+                                                                self.shape,
+                                                                self.dtype)
 
 
-class RangeDim(object):
+class RangeDim:
     def __init__(self, lower_bound=1, upper_bound=-1, default=None,
             symbol=None):
         """
@@ -293,7 +322,7 @@ def __str__(self):
             self.lower_bound, self.upper_bound, self.default, self.symbol)
 
 
-class Shape(object):
+class Shape:
     def __init__(self, shape, default=None):
         """
         The basic shape class to be set in InputType.
@@ -371,7 +400,7 @@ def to_list(self, allow_symbolic=False):
         return self.symbolic_shape
 
 
-class EnumeratedShapes(object):
+class EnumeratedShapes:
     def __init__(self, shapes, default=None):
         """
         A shape class that is used for setting multiple valid shape in InputType.
diff --git a/coremltools/converters/mil/mil/__init__.py b/coremltools/converters/mil/mil/__init__.py
index b51b488ba..8f6824ae2 100644
--- a/coremltools/converters/mil/mil/__init__.py
+++ b/coremltools/converters/mil/mil/__init__.py
@@ -43,4 +43,4 @@
 from .var import ListVar, Var
 
 from .builder import Builder
-from .ops.defs._op_reqs import register_op
+from .ops.defs._op_reqs import register_op
\ No newline at end of file
diff --git a/coremltools/converters/mil/mil/block.py b/coremltools/converters/mil/mil/block.py
index a71798c32..4a7c5d397 100644
--- a/coremltools/converters/mil/mil/block.py
+++ b/coremltools/converters/mil/mil/block.py
@@ -31,7 +31,7 @@ class InvalidBlockStateError(Exception):
     pass
 
 
-class Block(object):
+class Block:
     __slots__ = [
         "name",
         "_block_inputs",
@@ -507,6 +507,19 @@ def _replace_var(
             self._block_inputs = tuple(self._block_inputs)
 
         # If old_var is block's output, replace as well.
+        self.replace_block_output_var(old_var, new_var)
+
+        return num_ops_affected
+
+    def replace_block_output_var(
+            self,
+            old_var,
+            new_var,
+    ):
+        """
+        If old_var is in the list of block's outputs,
+        replace old_var with the new_var.
+        """
         if old_var in self._outputs:
             idx = self._outputs.index(old_var)
             self._outputs[idx] = new_var
@@ -515,11 +528,10 @@ def _replace_var(
             # This block no longer uses `old_var` as its outputs
             old_var.consuming_blocks.remove(self)
 
-            # if rename_new_var_if_fn_output:
             # Ensure output name is consistent
             if isinstance(self, Function):
                 new_var.name = old_var.name
-        return num_ops_affected
+
 
     def replace_uses_of_var_after_op(
         self,
@@ -850,7 +862,7 @@ def __init__(self, inputs):
                 for s in shapes:
                     if is_symbolic(s):
                         k_used_symbols.add(s)
-        super(Function, self).__init__()
+        super().__init__()
 
     # Override Block's input
     @property
diff --git a/coremltools/converters/mil/mil/builder.py b/coremltools/converters/mil/mil/builder.py
index 47647338d..e904a1152 100644
--- a/coremltools/converters/mil/mil/builder.py
+++ b/coremltools/converters/mil/mil/builder.py
@@ -4,9 +4,9 @@
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
 from collections import defaultdict
-import copy
 import logging
 import numbers
+
 import numpy as np
 
 from coremltools.converters.mil.mil.types.symbolic import any_symbolic
@@ -54,7 +54,7 @@ class Builder:
     >>> func_inputs = {"x": mb.placeholder(shape=[2,3]),
     >>>                "y": mb.placeholder(shape=[2,3])}
     >>> with Function(func_inputs) as ssa_fun:
-    >>>   x, y = ssa_fun.inputs['x'], ssa_fun.inputs['x']
+    >>>   x, y = ssa_fun.inputs['x'], ssa_fun.inputs['y']
     >>>   res_var = mb.add(x=x, y=y) # created within ssa_fun block
     >>>   ssa_fun.set_outputs([res_var])
     >>> prog.add_function("main", ssa_fun)
diff --git a/coremltools/converters/mil/mil/input_type.py b/coremltools/converters/mil/mil/input_type.py
index 5565a0c17..386966b36 100644
--- a/coremltools/converters/mil/mil/input_type.py
+++ b/coremltools/converters/mil/mil/input_type.py
@@ -10,21 +10,21 @@
 
 
 SUPPORT_INT_TYPES = [
-                        types.uint8,
-                        types.int8,
-                        types.uint16,
-                        types.int16,
-                        types.uint32,
-                        types.int32,
-                        types.uint64,
-                        types.int64,
-                    ]
+    types.uint8,
+    types.int8,
+    types.uint16,
+    types.int16,
+    types.uint32,
+    types.int32,
+    types.uint64,
+    types.int64,
+]
 
 SUPPORT_FLOAT_TYPES = [
-                        types.fp16,
-                        types.fp32,
-                        types.fp64,
-                    ]
+    types.fp16,
+    types.fp32,
+    types.fp64,
+]
 
 
 class DefaultInputs:
@@ -113,9 +113,13 @@ def validate_inputs(self, op_name, op_type, candidate_kvs):
             if input_type.const and \
                 not isinstance(input_type, InternalInputType) \
                 and var.val is None:
-                msg = msg_prefix + \
-                    'Input {} must be const at compile time'
-                raise ValueError(msg.format(name), name, var.name)
+
+                if var.op and var.op.op_type.startswith("constexpr_"):
+                    pass  # Output of constexprs qualifies as const but gets materialized after load time
+                else:
+                    msg = msg_prefix + \
+                        'Input {} must be const at compile time'
+                    raise ValueError(msg.format(name), name, var.name)
 
             if not isinstance(var, InternalVar) and \
                 not input_type.is_compatible(var):
@@ -125,7 +129,6 @@ def validate_inputs(self, op_name, op_type, candidate_kvs):
                             var.sym_type.__type_info__()))
 
 
-
 class _InputType:
     """
     (Untyped) input containing fundamental properties of all inputs to an
@@ -184,7 +187,7 @@ def type_str(self):
 
 class ListInputType(_InputType):
     def __init__(self, **kwargs):
-        super(ListInputType, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def _is_compatible(self, v):
         return types.is_list(v.sym_type)
@@ -195,20 +198,29 @@ def type_str(self):
 
 
 class ScalarOrTensorInputType(_InputType):
-    def __init__(self, **kwargs):
-        super(ScalarOrTensorInputType, self).__init__(**kwargs)
+    def __init__(self, type_domain=None, **kwargs):
+        self.type_domain = []
+        if type_domain is not None:
+            for dtype in type_domain:
+                if not isinstance(type, type(dtype)):
+                    raise ValueError("Type domain should be an iterable of numpy dtypes.")
+                self.type_domain.append(types.type_to_builtin_type(dtype))
+        super().__init__(**kwargs)
 
     def _is_compatible(self, v):
-        return types.is_scalar(v.dtype) or types.is_tensor(v.dtype)
+        result = types.is_scalar(v.dtype) or types.is_tensor(v.dtype)
+        if self.type_domain:
+            result = result and (v.dtype in self.type_domain)
+        return result
 
     @property
     def type_str(self):
-        return 'tensor or scalar'
+        return 'tensor or scalar of dtype from type domain ' + str([types.builtin_to_string(v) for v in self.type_domain])
 
 
 class ListOrScalarOrTensorInputType(_InputType):
     def __init__(self, **kwargs):
-        super(ListOrScalarOrTensorInputType, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def _is_compatible(self, v):
         return (
@@ -233,7 +245,7 @@ class IntInputType(ScalarOrTensorInputType):
     """
 
     def __init__(self, **kwargs):
-        super(IntInputType, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def _is_compatible(self, v):
         return v.dtype in SUPPORT_INT_TYPES
@@ -254,7 +266,7 @@ class BoolInputType(ScalarOrTensorInputType):
     """
 
     def __init__(self, **kwargs):
-        super(BoolInputType, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def _is_compatible(self, v):
         return v.dtype == types.bool
@@ -275,7 +287,7 @@ class FloatInputType(ScalarOrTensorInputType):
     """
 
     def __init__(self, **kwargs):
-        super(FloatInputType, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def _is_compatible(self, v):
         return v.dtype in SUPPORT_FLOAT_TYPES
@@ -296,7 +308,7 @@ class IntOrFloatInputType(ScalarOrTensorInputType):
     """
 
     def __init__(self, **kwargs):
-        super(IntOrFloatInputType, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def _is_compatible(self, v):
         return v.dtype in SUPPORT_INT_TYPES + SUPPORT_FLOAT_TYPES
@@ -318,7 +330,7 @@ class IntOrFloatOrBoolInputType(ScalarOrTensorInputType):
     """
 
     def __init__(self, **kwargs):
-        super(IntOrFloatOrBoolInputType, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def _is_compatible(self, v):
         return v.dtype in SUPPORT_INT_TYPES + SUPPORT_FLOAT_TYPES + [types.bool]
@@ -337,7 +349,7 @@ class TensorInputType(ScalarOrTensorInputType):
     """
 
     def __init__(self, **kwargs):
-        super(TensorInputType, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def _is_compatible(self, v):
         # We only support scalar string type.
@@ -357,7 +369,7 @@ class FloatTensorInputType(ScalarOrTensorInputType):
     """
 
     def __init__(self, **kwargs):
-        super(FloatTensorInputType, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def _is_compatible(self, v):
         return types.is_tensor(v.sym_type) and v.dtype in SUPPORT_FLOAT_TYPES
@@ -375,7 +387,7 @@ class IntTensorInputType(ScalarOrTensorInputType):
     """
 
     def __init__(self, **kwargs):
-        super(IntTensorInputType, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def _is_compatible(self, v):
         return types.is_tensor(v.sym_type) and v.dtype in SUPPORT_INT_TYPES
@@ -385,7 +397,7 @@ def type_str(self):
 
 class BoolTensorInputType(ScalarOrTensorInputType):
     def __init__(self, **kwargs):
-        super(BoolTensorInputType, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def _is_compatible(self, v):
         return types.is_tensor(v.sym_type) and v.dtype == types.bool
@@ -396,7 +408,7 @@ def type_str(self):
 
 class StringInputType(ScalarOrTensorInputType):
     def __init__(self, **kwargs):
-        super(StringInputType, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def _is_compatible(self, v):
         return types.is_str(v.sym_type)
@@ -407,7 +419,7 @@ def type_str(self):
 
 class TupleInputType(_InputType):
     def __init__(self, **kwargs):
-        super(TupleInputType, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def _is_compatible(self, v):
         # We don't check the detail types within the tuple.
@@ -426,7 +438,7 @@ class InternalInputType(_InputType):
     """
 
     def __init__(self, **kwargs):
-        super(InternalInputType, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def _is_compatible(self, v):
         return True  # skip type check by default for InternalInputType.
@@ -438,7 +450,7 @@ class PyFunctionInputType(InternalInputType):
     """
 
     def __init__(self, **kwargs):
-        super(PyFunctionInputType, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     # def _is_compatible(self, v):
     #    return callable(v.val)
@@ -446,7 +458,7 @@ def __init__(self, **kwargs):
 
 class InternalStringInputType(InternalInputType):
     def __init__(self, **kwargs):
-        super(InternalStringInputType, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     # def _is_compatible(self, v):
     #    return types.is_str(v.sym_type)
@@ -454,7 +466,7 @@ def __init__(self, **kwargs):
 
 class InternalScalarOrTensorInputType(InternalInputType):
     def __init__(self, **kwargs):
-        super(InternalScalarOrTensorInputType, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     # def _is_compatible(self, v):
     #    return types.is_scalar(v.dtype) or types.is_tensor(v.dtype)
diff --git a/coremltools/converters/mil/mil/operation.py b/coremltools/converters/mil/mil/operation.py
index 75d3cfaea..aa44e4643 100644
--- a/coremltools/converters/mil/mil/operation.py
+++ b/coremltools/converters/mil/mil/operation.py
@@ -117,7 +117,7 @@ def is_internal_input(arg_name):
     return arg_name[0] == "_"
 
 
-class mil_list(object):
+class mil_list:
     '''
     A wrapper around python list
     '''
@@ -128,7 +128,7 @@ def __init__(self, ls=None):
             raise TypeError("Type of 'ls' must be list in the 'mil_list' class")
 
 
-class Operation(object):
+class Operation:
     """
     Represents Operation in MIL.
 
@@ -165,8 +165,8 @@ def __init__(self, **kwargs):
         self._check_expected_inputs(kwargs)
 
         # Set inputs from kwargs
-        input_kv = {k: v for k, v in kwargs.items() \
-            if k in self._input_types and v is not None}
+        input_kv = {k: v for k, v in kwargs.items()
+                    if k in self._input_types and v is not None}
         self._validate_and_set_inputs(input_kv)
         self._ensure_required_inputs()
 
@@ -193,12 +193,13 @@ def _check_expected_inputs(self, kwargs):
                 raise ValueError(
                     "Unknown input '{}' for op '{}'".format(
                       k, self.op_type)
-                    )
+                )
 
     def set_inputs(self,
-        no_check_var_types=False,
-        type_inference=False,
-        **input_kvs):
+                   no_check_var_types=False,
+                   type_inference=False,
+                   **input_kvs
+    ):
         """
         Parameters
         ----------
@@ -245,7 +246,7 @@ def type_value_inference(self, overwrite_output=False):
             output_names = self.output_names()
             if not isinstance(output_names, tuple):
                 output_names = (output_names,)
-        except NotImplementedError as e:
+        except NotImplementedError:
             if len(output_types) > 1:
                 output_names = tuple(str(i) for i, _ in enumerate(output_types))
             else:
@@ -328,7 +329,7 @@ def _auto_val(self, output_types):
             # Is self.value_inference implemented for corresponding input?
             try:
                 vals = self.value_inference()
-            except NotImplementedError as e:
+            except NotImplementedError:
                 do_auto_val = False
 
         if not do_auto_val:
diff --git a/coremltools/converters/mil/mil/ops/defs/__init__.py b/coremltools/converters/mil/mil/ops/defs/__init__.py
index 676738078..d3830fb5b 100644
--- a/coremltools/converters/mil/mil/ops/defs/__init__.py
+++ b/coremltools/converters/mil/mil/ops/defs/__init__.py
@@ -25,6 +25,13 @@
 
 from .classify import classify
 
+from .constexpr_ops import (
+    constexpr_affine_dequantize,
+    constexpr_cast,
+    constexpr_lut_to_dense,
+    constexpr_sparse_to_dense,
+)
+
 from .control_flow import (
     cond,
     const,
@@ -193,6 +200,7 @@
     slice_by_index,
     slice_by_size,
     space_to_depth,
+    space_to_batch,
     squeeze,
     transpose,
     pixel_shuffle,
diff --git a/coremltools/converters/mil/mil/ops/defs/activation.py b/coremltools/converters/mil/mil/ops/defs/activation.py
index d863e5b77..2a8fc843c 100644
--- a/coremltools/converters/mil/mil/ops/defs/activation.py
+++ b/coremltools/converters/mil/mil/ops/defs/activation.py
@@ -50,7 +50,7 @@ class clamped_relu(Operation):
     )
 
     def __init__(self, **kwargs):
-        super(clamped_relu, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -94,7 +94,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(elu, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -159,7 +159,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(gelu, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -213,7 +213,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(leaky_relu, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -259,7 +259,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(linear_activation, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -276,17 +276,19 @@ class prelu(Operation):
 
     Parameters
     ----------
-    x: tensor<[b, C, n, m], T> (Required)
+    x: tensor<[B, C, 1..3], T> (Required)
+        * x must have rank 4 or rank 3 or rank 5, i.e. a shape of (B,C,H) or (B,C,H,W) or (B,C,D,H,W)
     alpha: const tensor<[C], T>, (Required)
+        * The length of alpha must match the second dimension of x (channel dimension)
 
     Returns
     -------
-    tensor<[b, C, n, m], fp32>
+    tensor<[B, C, 1..3], T>
         * A tensor of the same shape as ``x``.
 
     Attributes
     ----------
-    T: fp16, fp32
+    T: fp32, fp16
     """
 
     input_spec = InputSpec(
@@ -294,7 +296,7 @@ class prelu(Operation):
         alpha=TensorInputType(const=True),)
 
     def __init__(self, **kwargs):
-        super(prelu, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -306,8 +308,8 @@ def value_inference(self):
         return x_pos + b * alpha_br
 
     def type_inference(self):
-        if len(self.x.shape) < 3:
-            raise ValueError("x should be at least rank 3")
+        if self.x.rank not in (3, 4, 5):
+            raise ValueError("prelu op: x must be rank 3 or 4 or 5, instead it is of rank {}".format(len(self.x.shape)))
         if len(self.alpha.val.shape) != 1:
             raise ValueError("alpha should be rank 1")
         if self.x.shape[1] != self.alpha.val.shape[0]:
@@ -315,6 +317,12 @@ def type_inference(self):
                 "Size of dimension 1 of alpha should be the same as "
                 + "the size of dimension 1 of x."
             )
+        if self.x.rank in (3, 5):
+            # check whether all alpha values are the same or not
+            are_values_same = np.where(np.abs(self.alpha.val - self.alpha.val[0]) > 1e-5)[0].size == 0
+            if not are_values_same:
+                raise ValueError("prelu op: rank 3 or rank 5 input is only supported when all the values of alpha are same,"
+                                 "which is not the case here")
         return self.x.sym_type
 
 
@@ -338,7 +346,7 @@ class relu(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(relu, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -365,7 +373,7 @@ class relu6(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(relu6, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -409,7 +417,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(scaled_tanh, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -439,7 +447,7 @@ class sigmoid(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(sigmoid, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -482,7 +490,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(sigmoid_hard, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -493,6 +501,7 @@ def value_inference(self):
     def type_inference(self):
         return self.x.sym_type
 
+
 @register_op(doc_str="")
 class silu(Operation):
     """
@@ -514,11 +523,12 @@ class silu(Operation):
     input_spec = InputSpec(x=TensorInputType(),)
 
     def __init__(self, **kwargs):
-        super(silu, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         return types.tensor(self.x.dtype, tuple(self.x.shape))
 
+
 @register_op(doc_str="")
 class softplus(elementwise_unary):
     """
@@ -539,7 +549,7 @@ class softplus(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(softplus, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -574,7 +584,7 @@ class softplus_parametric(Operation):
     )
 
     def __init__(self, **kwargs):
-        super(softplus_parametric, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -637,7 +647,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(softmax, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         return self.x.sym_type
@@ -650,6 +660,7 @@ def value_inference(self):
         temp = np.exp(x - max_vals)
         return temp / np.sum(temp, axis=axis, keepdims=True)
 
+
 @register_op(doc_str="")
 class softsign(elementwise_unary):
     """
@@ -670,7 +681,7 @@ class softsign(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(softsign, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -709,7 +720,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(thresholded_relu, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         return self.x.sym_type
diff --git a/coremltools/converters/mil/mil/ops/defs/classify.py b/coremltools/converters/mil/mil/ops/defs/classify.py
index 4a2b73fae..a6d63bafe 100644
--- a/coremltools/converters/mil/mil/ops/defs/classify.py
+++ b/coremltools/converters/mil/mil/ops/defs/classify.py
@@ -52,7 +52,7 @@ class classify(Operation):
     )
 
     def __init__(self, **kwargs):
-        super(classify, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         # check the type of "classes"
@@ -60,6 +60,11 @@ def type_inference(self):
             msg = "'classes' in the op 'classify' must be of type list. Instead it is {}."
             raise ValueError(msg.format(self.classes.sym_type.__type_info__()))
 
+        # check the type of "probabilities"
+        if self.probabilities.dtype != types.fp32:
+            msg = "classify op: input probabilities must be of type fp32. Instead it is of type {}"
+            raise TypeError(msg.format(self.probabilities.sym_type.get_primitive().__type_info__()))
+
         classes_elem_type = self.classes.elem_type
         if classes_elem_type not in {types.str, types.int64}:
             msg = "Type of elements in 'classes' in the op 'classify' must be either str or int64. Instead it is {}."
diff --git a/coremltools/converters/mil/mil/ops/defs/constexpr_ops.py b/coremltools/converters/mil/mil/ops/defs/constexpr_ops.py
new file mode 100644
index 000000000..56c589436
--- /dev/null
+++ b/coremltools/converters/mil/mil/ops/defs/constexpr_ops.py
@@ -0,0 +1,377 @@
+# Copyright (c) 2022, Apple Inc. All rights reserved.
+import numpy as np
+
+from coremltools.converters.mil.mil import types
+from coremltools.converters.mil.mil.input_type import (
+    InputSpec,
+    ScalarOrTensorInputType,
+)
+from coremltools.converters.mil.mil.operation import Operation
+from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op
+
+
+@register_op(doc_str="")
+class constexpr_affine_dequantize(Operation):
+    """
+    A compile-time operation that returns a constant output value upon dequantizing its constant inputs.
+
+    This operation is used to represent constant 8 bit quantized data with affine/linear quantization.
+    The quantized data is stored in the parameter "quantized_data".
+    The other parameters, scale, zero_point and axis describe how unquantized values can be extracted from it,
+    using the equation for affine/linear quantization:
+
+                unquantized_data = scale * (quantized_data - zero_point)
+
+    Although all the parameters of this op are constants, this op is not constant folded to a single const op, at the time of model serialization.
+    The unquantized output will be decompressed later, based on the implementation detail (either at model load time or runtime)
+
+    Parameters
+    ----------
+    quantized_data: const tensor<SrcT, [1..]> (Required)
+
+    zero_point: const tensor<SrcT, [0..1]> (Required)
+
+    scale: const tensor<DstT, [0..1]> (Required)
+
+    axis: const tensor<int32, []> (Required)
+
+    scale can be either a scalar or a vector
+    zero_point can be either a scalar or a vector
+    If scale is a vector, for implementation, it gets broadcasted to following shape
+        - Rank of scale becomes same as the rank of quantized_data
+        - Constraint: size(scale-vector) == quantized_data.shape[axis]
+        - For i == axis, scale.shape[i] == quantized_data.shape[i]
+        - For i != axis, scale.shape == 1
+
+    For example:
+        Let's say quantized_data.shape = (2, 3, 4, 5) and axis = 1
+        if scale is a vector, then scale.size needs to be equals to quantized_data.shape[axis] i.e = 3,
+        which we would get broadcasted to (1, 3, 1, 1)
+
+    zero_point follows similar broadcasting rules and size constraints as scale
+
+    Returns
+    -------
+    const tensor<DstT, [1..]>
+
+    Attributes
+    ----------
+    SrcT: uint8, int8
+    DstT: fp16, fp32
+    """
+
+    input_spec = InputSpec(
+        quantized_data=ScalarOrTensorInputType(
+            const=True, type_domain=(np.uint8, np.int8)
+        ),
+        zero_point=ScalarOrTensorInputType(const=True, type_domain=(np.uint8, np.int8)),
+        scale=ScalarOrTensorInputType(const=True, type_domain=(np.float16, np.float32)),
+        axis=ScalarOrTensorInputType(const=True, type_domain=(np.int32,)),
+    )
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def type_inference(self):
+        """Validate the dequantization parameters; the output type takes scale's dtype and quantized_data's shape."""
+        def assert_is_scalar_or_vector(param, name):
+            if param.rank not in (0, 1):
+                raise ValueError(
+                    "Parameter {} needs to be either a scalar or vector".format(name)
+                )
+
+        def assert_vector_size_same_as_axial_dimension(param, axis_dim_size, name):
+            if param.rank == 1 and param.shape[0] != axis_dim_size:
+                raise ValueError(
+                    "Parameter {}, if vector, needs to have same size as the dimension size along the parameter quantized_data".format(
+                        name
+                    )
+                )
+
+        if self.zero_point.dtype != self.quantized_data.dtype:
+            raise ValueError(
+                "Parameters quantized_data and zero_point needs to be of the same dtype"
+            )
+        # Negative axis values are accepted as long as they stay within [-rank, rank).
+        rank = self.quantized_data.rank
+        if self.axis.val < -rank or self.axis.val >= rank:
+            raise ValueError(
+                "Parameter axis needs to be in the range -quantized_data.rank <= axis < quantized_data.rank"
+            )
+        # scale and zero_point: either a scalar, or a vector sized like quantized_data along `axis`.
+        assert_is_scalar_or_vector(self.scale, "scale")
+        assert_is_scalar_or_vector(self.zero_point, "zero_point")
+        assert_vector_size_same_as_axial_dimension(
+            self.scale, self.quantized_data.shape[self.axis.val], "scale"
+        )
+        assert_vector_size_same_as_axial_dimension(
+            self.zero_point, self.quantized_data.shape[self.axis.val], "zero_point"
+        )
+
+        dtype = self.scale.dtype
+        shape = self.quantized_data.shape
+        return types.tensor(dtype, shape)
+
+    def value_inference(self):
+        return None  # Needs to be None to avoid decompression
+
+    def get_decompressed_value(self):
+        return self.decompress(
+            self.quantized_data.val, 
+            self.zero_point.val, 
+            self.scale.val, 
+            self.axis.val
+        )
+
+    @staticmethod
+    def decompress(quantized_data, zero_point, scale, axis):
+        """Compute ``scale * (quantized_data - zero_point)``; vector params apply per slice along ``axis``."""
+        # Vector params are reshaped to -1 on `axis` (negative values wrap) and 1 elsewhere;
+        # scalar params need no reshaping because NumPy broadcasts them against any shape.
+        vec_shape = [-1 if i == axis % quantized_data.ndim else 1 for i in range(quantized_data.ndim)]
+        sc = np.reshape(scale, vec_shape) if np.ndim(scale) == 1 else np.asarray(scale)
+        zp = np.reshape(zero_point, vec_shape) if np.ndim(zero_point) == 1 else np.asarray(zero_point)
+        return (sc * (quantized_data.astype(np.float32) - zp.astype(np.float32))).astype(sc.dtype)
+
+
+@register_op(doc_str="")
+class constexpr_cast(Operation):
+    """
+    A compile-time operation that returns a constant output value upon casting its constant input.
+
+    Expression: output = constexpr_cast(source_val, output_dtype="fp32")
+
+    Parameters
+    ----------
+    source_val: const tensor<SrcT, [...]> (Required)
+
+    output_dtype: const tensor<string, []> (Required)
+
+    Returns
+    -------
+    const tensor<DstT, [...]>
+
+    Attributes
+    ----------
+    SrcT: fp16
+    DstT: fp32
+    """
+
+    input_spec = InputSpec(
+        source_val=ScalarOrTensorInputType(const=True, type_domain=(np.float16,)),
+        output_dtype=ScalarOrTensorInputType(const=True, type_domain=(str,)),
+    )
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def type_inference(self):
+        """Return a tensor type with the requested dtype and the shape of ``source_val``."""
+        dtype = types.string_to_builtin(self.output_dtype.val)
+        if dtype != types.fp32:
+            raise NotImplementedError("Only output_dtype = fp32 is supported")
+        # The output keeps source_val's shape; only the element type differs.
+        shape = self.source_val.shape
+        return types.tensor(dtype, shape)
+
+    def value_inference(self):
+        return None  # Needs to be None to avoid decompression
+
+
+@register_op(doc_str="")
+class constexpr_lut_to_dense(Operation):
+    """
+    A compile-time operation that returns a constant output value upon decompressing look-up-table to a dense tensor.
+
+    This operation is used to store constant weights in a look up table format (aka palettized weights).
+    Look up table (LUT) is a mapping from index to values.
+    Weights are quantized and stored as indices (or keys) into LUT.
+    Before computation, these keys are mapped to corresponding values in LUT.
+
+    Parameters
+    ----------
+    indices: const tensor<uint8, [M]> (Required)
+
+    lut: const tensor<T, [NUM_PALETTES]> (Required)
+
+    shape: const tensor<uint32, [K]> (Required)
+
+    Any data is packed and read in a row-major order
+    NUM_PALETTES can be one of {2, 4, 16, 64 or 256}
+    n_bits = log2(NUM_PALETTES) can thus be one of {1, 2, 4, 6, 8}
+    indices are packed in bytes of size M, where M = ceil(n_bits * product(shape) / 8)
+    The bit fields are packed one byte at a time, starting with the least significant bit and
+    moving upwards towards the most significant bit. It follows naturally, if an index is split
+    across two bytes, LSB bits of that index gets filled over MSB bits of current byte and the remaining
+    bits of the same index gets filled in the LSB bits of the next byte.
+
+    For example:
+        if n_bits = 2, shape = (5,) => M = 2 bytes
+
+                    MSB             LSB
+                     |               |
+        indices =  | 01   10   11   00 | xx   xx   xx   11 |      <== packed elements
+                   | i3 | i2 | i1 | i0 | -- | -- | -- | i4 |      <== tagged element ids
+                   |      byte 0       |       byte 1      |      <== tagged bytes
+
+    Returns
+    -------
+    const tensor<T, [...]>
+
+    Attributes
+    ----------
+    T: uint8, int8, fp16, fp32
+    """
+
+    input_spec = InputSpec(
+        indices=ScalarOrTensorInputType(const=True, type_domain=(np.uint8,)),
+        lut=ScalarOrTensorInputType(
+            const=True, type_domain=(np.int8, np.uint8, np.float16, np.float32)
+        ),
+        shape=ScalarOrTensorInputType(const=True, type_domain=(np.uint32,)),
+    )
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def type_inference(self):
+        """Validate ``indices`` and ``lut`` against ``shape``; the output type takes lut's dtype and ``shape``."""
+        def assert_is_vector(param, name):
+            if param.rank != 1:
+                raise ValueError("Parameter {} needs to have rank == 1".format(name))
+
+        assert_is_vector(self.indices, "indices")
+        assert_is_vector(self.lut, "lut")
+        if self.lut.shape[0] not in (2, 4, 16, 64, 256):
+            raise ValueError(
+                "Parameter lut should be a vector of size from one of {2, 4, 16, 64, 256}"
+            )
+        # The packed index buffer must be exactly ceil(n_bits * product(shape) / 8) bytes long.
+        nbits = int(np.log2(self.lut.shape[0]))
+        output_size = np.prod(self.shape.val)
+        if self.indices.shape[0] != np.ceil(nbits * (output_size / 8.0)):
+            raise AssertionError(
+                "Constraint violated, M = ceil(n_bits * product(shape) / 8) where M = indices.size"
+            )
+
+        dtype = self.lut.dtype
+        shape = self.shape.val
+        return types.tensor(dtype, shape)
+
+    def value_inference(self):
+        return None  # Needs to be None to avoid decompression
+
+    def get_decompressed_value(self):
+        return self.decompress(
+                self.lut.val,
+                self.indices.val,
+                self.shape.val,
+            )
+
+    @staticmethod
+    def decompress(lut, indices, shape):
+        """Unpack the bit-packed ``indices`` and look each one up in ``lut``; returns an array of ``shape``."""
+        # Bits are stored least-significant-bit first within each byte.
+        bitarray = np.unpackbits(indices, bitorder="little")
+        nbits = np.log2(lut.size).astype(np.int32)
+        # Zero-pad up to a whole number of nbits-wide fields; reachable only when nbits does
+        # not divide 8 (e.g. nbits == 6). The pad length is the complement of the remainder,
+        # and the pad must keep the uint8 dtype because np.packbits rejects float input.
+        remainder = bitarray.size % nbits
+        if remainder != 0:
+            padding = np.zeros(nbits - remainder, dtype=bitarray.dtype)
+            bitarray = np.concatenate([bitarray, padding])
+        # One row per index; rows past product(shape) are only alignment padding.
+        bitarray = bitarray.reshape(-1, nbits)[: np.prod(shape), :]
+        indices = np.packbits(bitarray, bitorder="little", axis=-1).reshape(-1)
+        return lut[indices].reshape(shape)
+
+
+@register_op(doc_str="")
+class constexpr_sparse_to_dense(Operation):
+    """
+    A compile-time operation that returns a constant output value upon de-sparsification of its constant inputs.
+
+    This operation represents unstructured sparsity and uses bit mask binary representation.
+    If a bit is set, then the corresponding element in output tensor is non-zero and the value is read from `nonzero_data` attribute.
+    Likewise, if bit is not set, then the corresponding element in output tensor is zero.
+
+    Parameters
+    ----------
+    nonzero_data: const tensor<T, [D]> (Required)
+
+    mask: const tensor<uint8, [M]> (Required)
+
+    shape: const tensor<uint32, [K]> (Required)
+
+    Any data is packed and read in a row-major order
+    Mask contains M bytes where M = ceil( product(shape) / 8) i.e each bit field corresponds to one element in output tensor
+    D == total number of set bits in Mask
+    The bit fields are packed one byte at a time, starting with the least significant bit and
+    moving upwards towards the most significant bit.
+
+    For example:
+        shape = (5,) => M = 1 bytes
+
+                   MSB                  LSB
+                    |                    |
+        mask    =  |x  x  x  0  1  1  0  0 |      <== packed elements
+                   |--|--|--|i4|i3|i2|i1|i0|      <== tagged element ids
+                   |      byte 0           |      <== tagged bytes
+
+    Returns
+    -------
+    const tensor<T, [...]>
+
+    Attributes
+    ----------
+    T: uint8, int8, fp16, fp32
+    """
+
+    input_spec = InputSpec(
+        nonzero_data=ScalarOrTensorInputType(
+            const=True, type_domain=(np.int8, np.uint8, np.float16, np.float32)
+        ),
+        mask=ScalarOrTensorInputType(const=True, type_domain=(np.uint8,)),
+        shape=ScalarOrTensorInputType(const=True, type_domain=(np.uint32,)),
+    )
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def type_inference(self):
+        """Validate ``nonzero_data`` and ``mask`` against ``shape``; the output type takes nonzero_data's dtype and ``shape``."""
+        def assert_is_vector(param, name):
+            if param.rank != 1:
+                raise ValueError("Parameter {} needs to have rank == 1".format(name))
+
+        assert_is_vector(self.nonzero_data, "nonzero_data")
+        assert_is_vector(self.mask, "mask")
+        # Count the set bits of every mask byte via its binary string representation.
+        if sum(bin(x).count("1") for x in self.mask.val) != self.nonzero_data.shape[0]:
+            raise AssertionError(
+                "Number of set bits in mask needs to be equal to number of elements in parameter nonzero_data"
+            )
+        # The mask carries one bit per output element, rounded up to whole bytes.
+        output_size = np.prod(self.shape.val)
+        if self.mask.shape[0] != np.ceil(output_size / 8.0):
+            raise AssertionError(
+                "Constraint Violated: M = ceil( product(shape) / 8) where M = mask.size"
+            )
+        dtype = self.nonzero_data.dtype
+        shape = self.shape.val
+        return types.tensor(dtype, shape)
+
+    def value_inference(self):
+        return None  # Needs to be None to avoid decompression
+
+    def get_decompressed_value(self):
+        return self.decompress(self.nonzero_data.val, self.mask.val, self.shape.val)
+
+    @staticmethod
+    def decompress(nonzero_data, mask, shape):
+        """Scatter ``nonzero_data`` into zeros of ``shape`` wherever the LSB-first ``mask`` bit is set."""
+        dense = np.zeros(shape, dtype=nonzero_data.dtype).flatten()
+        set_positions = np.flatnonzero(np.unpackbits(mask, bitorder="little"))
+        dense[set_positions] = nonzero_data
+        return dense.reshape(shape)
diff --git a/coremltools/converters/mil/mil/ops/defs/control_flow.py b/coremltools/converters/mil/mil/ops/defs/control_flow.py
index 96847f649..13a386085 100644
--- a/coremltools/converters/mil/mil/ops/defs/control_flow.py
+++ b/coremltools/converters/mil/mil/ops/defs/control_flow.py
@@ -2,10 +2,12 @@
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
 import copy
-import numpy as np
 import logging
 
+import numpy as np
+
 from coremltools.converters.mil.mil import (
     Block,
     get_new_symbol,
@@ -26,6 +28,7 @@
     TensorInputType,
     TupleInputType,
     StringInputType,
+    InternalInputType,
 )
 from coremltools.converters.mil.mil.operation import (
     mil_list,
@@ -37,6 +40,8 @@
 )
 from coremltools.converters.mil.mil.types import is_compatible_type
 from coremltools.converters.mil.mil.types.type_mapping import (
+    builtin_to_string,
+    numpy_type_to_builtin_type,
     numpy_val_to_builtin_val,
     is_subtype,
 )
@@ -64,6 +69,12 @@ class cond(Operation):
         * It must take zero input (i.e. no input), and have return types that match those of the
           ``if`` branch.
 
+    _existing_blocks: list[Block] (Optional)
+        * Python list of ``Block``.
+        * For internal use only. When converting a milproto, we already got existing blocks,
+          and the ``build_nested_blocks`` function can use them directly.
+        * When ``_existing_blocks`` is set, ``_true_fn`` and ``_false_fn`` must be dummy functions which returns ``None``. 
+
     Returns
     -------
     tuple
@@ -74,12 +85,22 @@ class cond(Operation):
         pred=BoolInputType(),
         _true_fn=PyFunctionInputType(),
         _false_fn=PyFunctionInputType(),
+        _existing_blocks=InternalInputType(optional=True),
     )
 
     def __init__(self, **kwargs):
-        super(cond, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def build_nested_blocks(self):
+        # If the front end is milproto, we already have the well-constructed true/false blocks.
+        # For this case, we set self.blocks directly.
+        # We also check that _true_fn and _false_fn are both dummy functions (return None).
+        if self._existing_blocks is not None and self._existing_blocks.val is not None:
+            assert self._true_fn.val([]) is None
+            assert self._false_fn.val([]) is None
+            self.blocks = self._existing_blocks.val
+            return
+
         # Cond block
         true_block_name = self.name + "_true"
         with Block(name=true_block_name, outer_op=self) as true_block:
@@ -153,7 +174,7 @@ class Const(Operation):
     )
 
     def __init__(self, **kwargs):
-        super(Const, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         builtin_type, _ = self._get_type_val(self.val.val)
@@ -175,9 +196,9 @@ def _get_type_val(self, value):
             value = np.array(value)
 
             # For the int type, we use int32 by default
-            if value.dtype in [np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.uint64, np.int64]:
+            if value.dtype in [np.uint16, np.int16, np.uint64, np.int64]:
                 if value.dtype in [np.uint64, np.int64]:
-                    msg = "Downcast const op {} data int64 as int32".format(self.name)
+                    msg = "Downcast const op {} data ".format(self.name) + builtin_to_string(numpy_type_to_builtin_type(value.dtype)) + " as int32"
                     logging.debug(msg)
                 value = value.astype(np.int32)
 
@@ -209,17 +230,18 @@ def _get_type_val(self, value):
         _, builtin_type = numpy_val_to_builtin_val(value)
         return builtin_type, value
 
+
 @register_op(doc_str="")
 class const(Const):
     def __init__(self, **kwargs):
-        super(const, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
 
 # Internal const can have symbolic value (for testing purpose)
 @register_op(doc_str="")
 class _const_symbolic(const):
     def __init__(self, **kwargs):
-        super(_const_symbolic, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         builtin_type, _ = self._get_type_val(self.val.sym_val)
@@ -274,7 +296,7 @@ class select(Operation):
     )
 
     def __init__(self, **kwargs):
-        super(select, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         a_type = self.a.sym_type
@@ -315,6 +337,12 @@ class while_loop(Operation):
     loop_vars: tuple (Required)
         * Python tuple of ``Variables``.
 
+    _existing_blocks: list[Block] (Optional)
+        * Python list of ``Block``.
+        * For internal use only. When converting a milproto, we already got existing blocks,
+          and the ``build_nested_blocks`` function can use them directly.
+        * When ``_existing_blocks`` is set, ``_cond`` and ``_body`` must be dummy functions that return ``None``.
+
     Returns
     -------
     tuple
@@ -326,10 +354,11 @@ class while_loop(Operation):
         _cond=PyFunctionInputType(),
         _body=PyFunctionInputType(),
         loop_vars=TupleInputType(),
+        _existing_blocks=InternalInputType(optional=True),
     )
 
     def __init__(self, **kwargs):
-        super(while_loop, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @staticmethod
     def _check_equal_value(val1, val2):
@@ -409,6 +438,15 @@ def build_nested_blocks(self):
         # if `loop_cond` is True do we execute the body block. This is the
         # semantics of tf.while_loop.
 
+        # If the front end is milproto, we already have the well constructed cond/body block.
+        # For this case, we set self.blocks directly.
+        # We also check that _cond and _body are both dummy functions (return None).
+        if self._existing_blocks is not None and self._existing_blocks.val is not None:
+            assert self._cond.val([]) is None
+            assert self._body.val([]) is None
+            self.blocks = self._existing_blocks.val
+            return
+
         block_inputs = tuple(copy.copy(v) for v in self.loop_vars)
         _, visible_vars = self.enclosing_block._visible_vars_in_block()
         name_count = {v.name: 1 for v in visible_vars}
@@ -553,7 +591,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(make_list, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         builtin_dtype = types.string_to_builtin(self.dtype.val)
@@ -608,7 +646,7 @@ class list_length(Operation):
     input_spec = InputSpec(ls=ListInputType(),)
 
     def __init__(self, **kwargs):
-        super(list_length, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         return types.int32
@@ -652,7 +690,7 @@ class list_write(Operation):
     )
 
     def __init__(self, **kwargs):
-        super(list_write, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         list_elem_type = self.ls.elem_type
@@ -706,7 +744,7 @@ class list_read(Operation):
         )
 
     def __init__(self, **kwargs):
-        super(list_read, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         list_elem_type = self.ls.elem_type
@@ -748,7 +786,7 @@ class list_gather(Operation):
         )
 
     def __init__(self, **kwargs):
-        super(list_gather, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         list_elem_type = self.ls.elem_type
@@ -800,7 +838,7 @@ class list_scatter(Operation):
     )
 
     def __init__(self, **kwargs):
-        super(list_scatter, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         num_indices = self.indices.shape[0]
diff --git a/coremltools/converters/mil/mil/ops/defs/conv.py b/coremltools/converters/mil/mil/ops/defs/conv.py
index f9a28608c..1629882b5 100644
--- a/coremltools/converters/mil/mil/ops/defs/conv.py
+++ b/coremltools/converters/mil/mil/ops/defs/conv.py
@@ -149,7 +149,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(conv, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         inshape = self.x.shape
@@ -247,7 +247,7 @@ def default_inputs(self):
                 )
 
     def __init__(self, **kwargs):
-        super(conv_quantized, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
 
 @register_op(doc_str="")
@@ -357,7 +357,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(conv_transpose, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         # Input shape is [n, C_in, spatial_dims]
diff --git a/coremltools/converters/mil/mil/ops/defs/elementwise_binary.py b/coremltools/converters/mil/mil/ops/defs/elementwise_binary.py
index 6e815c369..1d2473f32 100644
--- a/coremltools/converters/mil/mil/ops/defs/elementwise_binary.py
+++ b/coremltools/converters/mil/mil/ops/defs/elementwise_binary.py
@@ -24,7 +24,7 @@ class elementwise_binary(Operation):
     input_spec = InputSpec(x=ScalarOrTensorInputType(), y=ScalarOrTensorInputType(),)
 
     def __init__(self, **kwargs):
-        super(elementwise_binary, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         typea = self.x.sym_type
@@ -106,7 +106,7 @@ class add(elementwise_binary):
     """
     
     def __init__(self, **kwargs):
-        super(add, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def get_operator(self):
         return operator.add
@@ -138,7 +138,7 @@ class equal(elementwise_binary):
     """
 
     def __init__(self, **kwargs):
-        super(equal, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def get_operator(self):
         return np.equal
diff --git a/coremltools/converters/mil/mil/ops/defs/elementwise_unary.py b/coremltools/converters/mil/mil/ops/defs/elementwise_unary.py
index 394c1f9ff..a6968d300 100644
--- a/coremltools/converters/mil/mil/ops/defs/elementwise_unary.py
+++ b/coremltools/converters/mil/mil/ops/defs/elementwise_unary.py
@@ -34,7 +34,7 @@ class elementwise_unary(Operation):
     input_spec = InputSpec(x=ScalarOrTensorInputType(),)
 
     def __init__(self, **kwargs):
-        super(elementwise_unary, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         return self.x.sym_type
@@ -63,7 +63,7 @@ class abs(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(abs, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -91,7 +91,7 @@ class acos(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(acos, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -119,7 +119,7 @@ class asin(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(asin, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -147,7 +147,7 @@ class atan(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(atan, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -176,7 +176,7 @@ class atanh(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(atanh, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -204,7 +204,7 @@ class ceil(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(ceil, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -242,7 +242,7 @@ class clip(Operation):
     )
 
     def __init__(self, **kwargs):
-        super(clip, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         return self.x.sym_type
@@ -272,7 +272,7 @@ class cos(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(cos, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -300,7 +300,7 @@ class cosh(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(cosh, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -328,7 +328,7 @@ class erf(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(erf, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -356,7 +356,7 @@ class exp(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(exp, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -384,7 +384,7 @@ class exp2(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(exp2, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -413,7 +413,7 @@ class floor(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(floor, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -455,7 +455,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(inverse, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         return self.x.sym_type
@@ -497,7 +497,7 @@ def default_inputs(self):
             epsilon=1e-45)
 
     def __init__(self, **kwargs):
-        super(log, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         return self.x.sym_type
@@ -529,7 +529,7 @@ class logical_not(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(logical_not, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -557,7 +557,7 @@ class round(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(round, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -599,7 +599,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(rsqrt, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         return self.x.sym_type
@@ -632,7 +632,7 @@ class sign(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(sign, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -660,7 +660,7 @@ class sin(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(sin, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -688,7 +688,7 @@ class sinh(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(sinh, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -716,7 +716,7 @@ class sqrt(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(sqrt, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -744,7 +744,7 @@ class square(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(square, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -772,7 +772,7 @@ class tan(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(tan, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -801,7 +801,7 @@ class tanh(elementwise_unary):
     """
 
     def __init__(self, **kwargs):
-        super(tanh, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -835,7 +835,7 @@ class threshold(Operation):
     )
 
     def __init__(self, **kwargs):
-        super(threshold, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         return self.x.sym_type
@@ -871,7 +871,7 @@ class cast(Operation):
     )
 
     def __init__(self, **kwargs):
-        super(cast, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         type_map = {
@@ -897,6 +897,10 @@ def type_inference(self):
 
     @precondition(allow=VALUE | SYMBOL)
     def value_inference(self):
+        return self.get_cast_value(self.x, self.dtype.val)
+
+    @staticmethod
+    def get_cast_value(input_var, dtype_val):
         type_map = {
             "int32": np.int32,
             "int64": np.int64,
@@ -906,19 +910,19 @@ def value_inference(self):
             "bool": np.bool,
         }
 
-        if self.dtype.val not in type_map.keys():
+        if dtype_val not in type_map.keys():
             raise NotImplementedError(
                 "Parameter dtype of the cast operation can be one of the {}. "
-                "Provided {}".format(type_map.keys(), self.dtype.val)
+                "Provided {}".format(type_map.keys(), dtype_val)
             )
 
-        if self.x.val is None:
-            if self.x.sym_val is not None and not is_symbolic(self.x.sym_val) and len(self.x.sym_val.shape) == 1:
-                result = [np.array(val).astype(dtype=type_map[self.dtype.val]).item() if not is_symbolic(val) else val for val in self.x.sym_val]
+        if input_var.val is None:
+            if input_var.sym_val is not None and not is_symbolic(input_var.sym_val) and len(input_var.sym_val.shape) == 1:
+                result = [np.array(val).astype(dtype=type_map[dtype_val]).item() if not is_symbolic(val) else val for val in input_var.sym_val]
                 return np.array(result)
             return None
 
-        if not types.is_tensor(self.x.sym_type):
-            return self.x.val.astype(dtype=type_map[self.dtype.val])
+        if not types.is_tensor(input_var.sym_type):
+            return input_var.val.astype(dtype=type_map[dtype_val])
         else:
-            return np.array(self.x.val).astype(dtype=type_map[self.dtype.val])
+            return np.array(input_var.val).astype(dtype=type_map[dtype_val])
diff --git a/coremltools/converters/mil/mil/ops/defs/image_resizing.py b/coremltools/converters/mil/mil/ops/defs/image_resizing.py
index 967620601..25e02ff9b 100644
--- a/coremltools/converters/mil/mil/ops/defs/image_resizing.py
+++ b/coremltools/converters/mil/mil/ops/defs/image_resizing.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
@@ -20,9 +19,9 @@
     TensorInputType,
     types,
 )
-
 from coremltools.converters.mil.mil.types.symbolic import is_symbolic
 
+
 @register_op(doc_str="")
 class affine(Operation):
     """
@@ -104,7 +103,7 @@ class affine(Operation):
     )
 
     def __init__(self, **kwargs):
-        super(affine, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         if self.x.rank != 4:
@@ -200,7 +199,7 @@ def default_inputs(self):
         )
 
     def __init__(self, **kwargs):
-        super(upsample_nearest_neighbor, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         if self.x.rank < 3:
@@ -301,7 +300,7 @@ class resample(Operation):
     )
 
     def __init__(self, **kwargs):
-        super(resample, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         if self.x.rank != 4:
@@ -390,7 +389,7 @@ class resize_nearest_neighbor(Operation):
     )
 
     def __init__(self, **kwargs):
-        super(resize_nearest_neighbor, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         if self.x.rank < 3:
@@ -501,7 +500,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(upsample_bilinear, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         if self.x.rank < 3:
@@ -626,7 +625,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(resize_bilinear, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         if self.x.rank < 3:
@@ -763,7 +762,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(crop_resize, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         if self.x.rank != 4:
@@ -834,7 +833,7 @@ class crop(Operation):
     )
 
     def __init__(self, **kwargs):
-        super(crop, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         if self.x.rank < 3:
diff --git a/coremltools/converters/mil/mil/ops/defs/linear.py b/coremltools/converters/mil/mil/ops/defs/linear.py
index f66b03ca2..af65b4889 100644
--- a/coremltools/converters/mil/mil/ops/defs/linear.py
+++ b/coremltools/converters/mil/mil/ops/defs/linear.py
@@ -20,6 +20,7 @@
 from ._op_reqs import register_op
 from ._utils import broadcast_shapes, parse_einsum_equation
 
+
 @register_op(doc_str="")
 class linear(Operation):
     """
@@ -54,10 +55,10 @@ def default_inputs(self):
         Dout = self.weight.shape[0]
         return DefaultInputs(
             bias=[0.]*Dout,
-            )
+        )
 
     def __init__(self, **kwargs):
-        super(linear, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         x_type = self.x.dtype
@@ -174,10 +175,10 @@ def default_inputs(self):
         return DefaultInputs(
             transpose_x=False,
             transpose_y=False,
-            )
+        )
 
     def __init__(self, **kwargs):
-        super(matmul, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         x_type = self.x.dtype
diff --git a/coremltools/converters/mil/mil/ops/defs/normalization.py b/coremltools/converters/mil/mil/ops/defs/normalization.py
index 3acd794c4..de6174580 100644
--- a/coremltools/converters/mil/mil/ops/defs/normalization.py
+++ b/coremltools/converters/mil/mil/ops/defs/normalization.py
@@ -78,7 +78,7 @@ def default_inputs(self):
         )
 
     def __init__(self, **kwargs):
-        super(batch_norm, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         x_shape = self.x.shape
@@ -130,7 +130,7 @@ def default_inputs(self):
         )
 
     def __init__(self, **kwargs):
-        super(instance_norm, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         x_shape = self.x.shape
@@ -149,19 +149,22 @@ class l2_norm(Operation):
 
     Parameters
     ----------
-    x: tensor<[*D,C,H,W], T> (Required)
+    x: tensor<[\*B, \*D], T> (Required)
         * Input tensor, ``rank(x) >= 3``.
-        * ``*D`` refers to the spatial dimensions, ``rank(*D) >= 0``.
-        * ``n`` is the batch dimension.
-        * For ranks greater than 3, the leading dimensions, starting from ``0`` to ``-4`` (inclusive),
-          are all treated as batch.
+        * ``*B`` refers to the leading dimensions.
+        * ``*D`` refers to the spatial dimensions to be normalized. Must be rank 3: ``rank(*D) == 3``.
+        * When ``rank(x) == 3``, in which ``rank(*B) == 0 and rank(*D) == 3``, the input is divided by
+          the square root of the sum of squares of all elements.
+        * For ranks greater than 3, in which ``rank(*B) >= 1 and rank(*D) == 3``,
+          the leading dimensions \*B, starting from ``0`` to ``-4`` (inclusive),
+          are all treated as batch. The L2 normalization is done batch-wise.
     epsilon: const fp32 (Optional)
         * Small constant to avoid division by ``0``.
         * Optional, defaults to ``1e-6``.
 
     Returns
     -------
-    tensor<[\*D,C,H,W], T>
+    tensor<[\*B, \*D], T>
         * Same type and shape as the input tensor ``x``.
 
     Attributes
@@ -180,12 +183,33 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(l2_norm, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
+        if self.x.rank < 3:
+            msg = "Input rank of l2_norm must be at least 3. Got {}".format(self.x.rank)
+            raise ValueError(msg)
         x_shape = self.x.shape
         return types.tensor(self.x.dtype, tuple(x_shape))
 
+    @precondition(allow=VALUE)
+    def value_inference(self):
+        val = self.x.val
+        eps = self.epsilon.val
+        shape = self.x.shape
+        rank = self.x.rank
+        batch_dims = rank - 3
+        if batch_dims == 0:
+            square_sum = np.sum(val**2)
+            output = val/np.power(square_sum + eps, 0.5)
+        else:
+            batch_dim_prod = np.prod(shape[:batch_dims])
+            reshape_val = np.reshape(val, (batch_dim_prod, -1))
+            square_sum = np.sum(reshape_val * reshape_val, axis=1, keepdims=True) + eps
+            output = reshape_val/np.power(square_sum, 0.5)
+            output = np.reshape(output, shape)
+        return output
+
 
 @register_op(doc_str="")
 class layer_norm(Operation):
@@ -247,7 +271,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(layer_norm, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @staticmethod
     def _is_compatible_shape(shapea, shapeb):
@@ -355,7 +379,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(local_response_norm, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         x_shape = self.x.shape
diff --git a/coremltools/converters/mil/mil/ops/defs/pool.py b/coremltools/converters/mil/mil/ops/defs/pool.py
index 48aea5b70..94d9eb8bc 100644
--- a/coremltools/converters/mil/mil/ops/defs/pool.py
+++ b/coremltools/converters/mil/mil/ops/defs/pool.py
@@ -38,7 +38,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(Pooling, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         ksize = self.kernel_sizes.val
@@ -170,7 +170,7 @@ def default_inputs(self):
                 )
 
     def __init__(self, **kwargs):
-        super(avg_pool, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
 
 @register_op(doc_str="")
@@ -210,7 +210,7 @@ class l2_pool(Pooling):
     """
     
     def __init__(self, **kwargs):
-        super(l2_pool, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
 
 @register_op(doc_str="")
@@ -253,4 +253,4 @@ class max_pool(Pooling):
     """
     
     def __init__(self, **kwargs):
-        super(max_pool, self).__init__(**kwargs)
+        super().__init__(**kwargs)
diff --git a/coremltools/converters/mil/mil/ops/defs/random.py b/coremltools/converters/mil/mil/ops/defs/random.py
index aa6cd9ba1..563cdaf92 100644
--- a/coremltools/converters/mil/mil/ops/defs/random.py
+++ b/coremltools/converters/mil/mil/ops/defs/random.py
@@ -26,7 +26,7 @@ class RandomDistribution(Operation):
     out_dtype = types.fp32
 
     def __init__(self, **kwargs):
-        super(RandomDistribution, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         if any_symbolic(self.shape.shape):
@@ -100,7 +100,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(random_bernoulli, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         self.out_dtype = self.prob.dtype
@@ -157,7 +157,7 @@ def default_inputs(self):
         )
 
     def __init__(self, **kwargs):
-        super(random_categorical, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         self.out_dtype = self.x.dtype
@@ -217,7 +217,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(random_normal, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         if self.mean.dtype != self.stddev.dtype:
@@ -287,7 +287,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(random_uniform, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         if self.low.dtype != self.high.dtype:
diff --git a/coremltools/converters/mil/mil/ops/defs/recurrent.py b/coremltools/converters/mil/mil/ops/defs/recurrent.py
index f1d1acefa..6962d3caa 100644
--- a/coremltools/converters/mil/mil/ops/defs/recurrent.py
+++ b/coremltools/converters/mil/mil/ops/defs/recurrent.py
@@ -117,7 +117,7 @@ def default_inputs(self):
         )
 
     def __init__(self, **kwargs):
-        super(gru, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         if self.x.rank != 3:
@@ -345,7 +345,7 @@ def default_inputs(self):
             clip=None)
 
     def __init__(self, **kwargs):
-        super(lstm, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         if self.x.rank != 3:
@@ -474,7 +474,7 @@ def default_inputs(self):
             activation="tanh")
 
     def __init__(self, **kwargs):
-        super(rnn, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         if self.x.rank != 3:
diff --git a/coremltools/converters/mil/mil/ops/defs/reduction.py b/coremltools/converters/mil/mil/ops/defs/reduction.py
index 059a1d504..4328fde32 100644
--- a/coremltools/converters/mil/mil/ops/defs/reduction.py
+++ b/coremltools/converters/mil/mil/ops/defs/reduction.py
@@ -38,7 +38,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(ReductionAxes, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         x_type = self.x.dtype
@@ -85,7 +85,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(ReductionAxis, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def _find_reduced_shape(self):
         x_shape = self.x.shape
@@ -119,7 +119,7 @@ def get_operator(self):
 @register_op(doc_str="")
 class reduce_arg(ReductionAxis):
     def __init__(self, **kwargs):
-        super(reduce_arg, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         x_shape = self.x.shape
@@ -144,35 +144,34 @@ class reduce_argmax(reduce_arg):
     """
     Computes the indices of the maximum value across dimensions of a tensor.
     In case of ties, the identity of the return value is not guaranteed.
-    
+
     Parameters
     ----------
     x: <\*,T> (Required)
         * Must be 1-dimensional or higher.
-    
+
     axis: const<i32> (Optional)
         * The dimension to reduce. Default is ``-1``.
-    
+
     keep_dims: const<bool> (Optional, default=False)
         * If ``False``, the rank is reduced by ``1`` by removing the dimension
           specified in ``axis``. If ``True``, retain reduced axis with length ``1``.
-    
+
     Returns
     -------
     <\*, int32>
-    
+
     Attributes
     ----------
-    T: i32, fp16, fp32
-    
+    T: fp32, i32
+
     References
     ----------
     See `tf.math.argmax <https://www.tensorflow.org/api_docs/python/tf/math/argmax>`_.
-    
     """
-    
+
     def __init__(self, **kwargs):
-        super(reduce_argmax, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def get_operator(self):
         return np.argmax
@@ -183,35 +182,35 @@ class reduce_argmin(reduce_arg):
     """
     Computes the indices of the minimum value across dimensions of a tensor.
     In case of ties, the identity of the return value is not guaranteed.
-    
+
     Parameters
     ----------
     x: <\*,T> (Required)
         * Must be 1-dimensional or higher.
-    
+
     axis: const<i32> (Optional)
         * The dimension to reduce. Default is ``-1``.
-    
+
     keep_dims: const<bool> (Optional, default=False)
         * If ``False``, the rank is reduced by ``1`` by removing the dimension specified
           in ``axis``, otherwise retain reduced axis with length ``1``.
-    
+
     Returns
     -------
     <\*, int32>
-    
+
     Attributes
     ----------
-    T: i32, fp16, fp32
-    
+    T: fp32, i32
+
     References
     ----------
     See `tf.math.argmin <https://www.tensorflow.org/api_docs/python/tf/math/argmin>`_.
-    
+
     """
-    
+
     def __init__(self, **kwargs):
-        super(reduce_argmin, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def get_operator(self):
         return np.argmin
@@ -250,7 +249,7 @@ class reduce_l1_norm(ReductionAxes):
     """
     
     def __init__(self, **kwargs):
-        super(reduce_l1_norm, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def get_operator(self):
         def l1_norm(x, axis=None, keepdims=False):
@@ -287,7 +286,7 @@ class reduce_l2_norm(ReductionAxes):
     """
     
     def __init__(self, **kwargs):
-        super(reduce_l2_norm, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def get_operator(self):
         def l2_norm(x, axis=None, keepdims=False):
@@ -325,7 +324,7 @@ class reduce_log_sum(ReductionAxes):
     """
     
     def __init__(self, **kwargs):
-        super(reduce_log_sum, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def get_operator(self):
         def log_sum(x, axis=None, keepdims=False):
@@ -371,7 +370,7 @@ class reduce_log_sum_exp(ReductionAxes):
     """
     
     def __init__(self, **kwargs):
-        super(reduce_log_sum_exp, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def get_operator(self):
         def operator(a, axis=None, keepdims=False):
@@ -416,7 +415,7 @@ class reduce_max(ReductionAxes):
     """
     
     def __init__(self, **kwargs):
-        super(reduce_max, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def get_operator(self):
         return np.max
@@ -454,7 +453,7 @@ class reduce_mean(ReductionAxes):
     """
     
     def __init__(self, **kwargs):
-        super(reduce_mean, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def get_operator(self):
         return np.mean
@@ -486,9 +485,9 @@ class reduce_min(ReductionAxes):
     ----------
     T: i32, fp16, fp32
     """
-    
+
     def __init__(self, **kwargs):
-        super(reduce_min, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def get_operator(self):
         return np.min
@@ -498,31 +497,32 @@ def get_operator(self):
 class reduce_prod(ReductionAxes):
     """
     Computes the product of elements across given dimensions of the input tensor.
-    
+
     Parameters
     ----------
     x: <\*,T> (Required)
         * Must be 1-dimensional or higher.
-    
+
     axes: const<K,i32> (Optional, default="None", reduce on all axes.)
         * The dimensions to reduce.
-    
+
     keep_dims: const<bool> (Optional, default=False)
         * If ``False``, the rank is reduced by ``1`` for each entry in ``axes``,
           otherwise retain reduced axes with length ``1``.
-    
+
     Returns
     -------
     <\*,T>
         * Scalar or tensor: The reduced tensor.
-    
+
     Attributes
     ----------
     T:  i32, fp16, fp32
+
     """
-    
+
     def __init__(self, **kwargs):
-        super(reduce_prod, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def get_operator(self):
         return np.prod
@@ -532,31 +532,31 @@ def get_operator(self):
 class reduce_sum(ReductionAxes):
     """
     Computes the sum of elements across given dimensions of the input tensor.
-    
+
     Parameters
     ----------
     x: <\*,T> (Required)
         * Must be 1-dimensional or higher.
-    
+
     axes: const<K,i32> (Optional, default="None", reduce on all axes.)
         * The dimensions to reduce.
 
     keep_dims: const<bool> (Optional, default=False)
         * If ``False``, the rank is reduced by ``1`` for each entry in ``axes``,
           otherwise retain reduced axes with length ``1``.
-    
+
     Returns
     -------
     <\*,T>
         * Scalar or tensor: The reduced tensor.
-    
+
     Attributes
     ----------
     T: i32, fp16, fp32
     """
 
     def __init__(self, **kwargs):
-        super(reduce_sum, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def get_operator(self):
         return np.sum
@@ -566,31 +566,31 @@ def get_operator(self):
 class reduce_sum_square(ReductionAxes):
     """
     Computes the sum of squares of elements across given dimensions of the input tensor.
-    
+
     Parameters
     ----------
     x: <\*,T> (Required)
         * Must be 1-dimensional or higher.
-    
+
     axes: const<K,i32> (Optional, default="None", reduce on all axes.)
         * The dimensions to reduce.
-    
+
     keep_dims: const<bool> (Optional, default=False)
         * If ``False``, the rank is reduced by ``1`` for each entry in ``axes``,
           otherwise retain reduced axes with length ``1``.
-    
+
     Returns
     -------
     <\*,T>
         * Scalar or tensor: The reduced tensor.
-    
+
     Attributes
     ----------
     T: i32, fp16, fp32
     """
-    
+
     def __init__(self, **kwargs):
-        super(reduce_sum_square, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def get_operator(self):
         def sum_squre(x, axis=None, keepdims=False):
diff --git a/coremltools/converters/mil/mil/ops/defs/scatter_gather.py b/coremltools/converters/mil/mil/ops/defs/scatter_gather.py
index 739ccfe61..813944993 100644
--- a/coremltools/converters/mil/mil/ops/defs/scatter_gather.py
+++ b/coremltools/converters/mil/mil/ops/defs/scatter_gather.py
@@ -92,7 +92,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(gather, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE | SYMBOL)
     def value_inference(self):
@@ -195,7 +195,17 @@ class scatter(Operation):
 
     Attributes
     ----------
-    T: fp16, fp32, i32
+    T: fp32
+
+    For example:
+        data = [[1, 2, 3], [4, 5, 6]]
+        indices = [1, 0]
+        updates = [[5, 6, 7], [8, 9, 10]]
+        axis = 0
+        mode = "update"
+
+    produces:
+       [[9, 11, 13], [9, 11, 13]]
     """
 
     input_spec = InputSpec(
@@ -213,7 +223,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(scatter, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         if self.axis.val < -self.data.rank or self.axis.val >= self.data.rank:
@@ -262,7 +272,7 @@ class gather_along_axis(Operation):
 
     Attributes
     ----------
-    U: fp16, fp32, i32
+    T: fp16, fp32, i32
     """
 
     input_spec = InputSpec(
@@ -277,7 +287,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(gather_along_axis, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -316,8 +326,8 @@ def type_inference(self):
 @register_op(doc_str="")
 class scatter_along_axis(Operation):
     """
-    Scatter ``updates`` to ``data`` at locations ``indices`` at dimension ``axis``
-    by operation ``mode``.
+    Scatter ``updates`` to ``data`` at locations ``indices`` along ``axis`` dimension
+    using ``mode`` operation.
 
     Example: ``mode == update``.
 
@@ -378,7 +388,7 @@ class scatter_along_axis(Operation):
 
     Attributes
     ----------
-    T: fp16, fp32, i32
+    U: fp16, fp32, i32
     """
 
     input_spec = InputSpec(
@@ -396,7 +406,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(scatter_along_axis, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -469,7 +479,7 @@ class gather_nd(Operation):
         )
 
     def __init__(self, **kwargs):
-        super(gather_nd, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         assert self.indices.shape[-1] <= self.x.rank
@@ -532,7 +542,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(scatter_nd, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         assert self.indices.shape[-1] <= self.data.rank
diff --git a/coremltools/converters/mil/mil/ops/defs/tensor_operation.py b/coremltools/converters/mil/mil/ops/defs/tensor_operation.py
index 3e837f799..fcd7f703d 100644
--- a/coremltools/converters/mil/mil/ops/defs/tensor_operation.py
+++ b/coremltools/converters/mil/mil/ops/defs/tensor_operation.py
@@ -88,7 +88,7 @@ def default_inputs(self):
             upper=-1)
 
     def __init__(self, **kwargs):
-        super(band_part, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         return self.x.sym_type
@@ -140,7 +140,7 @@ def default_inputs(self):
             reverse=False)
 
     def __init__(self, **kwargs):
-        super(cumsum, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -203,7 +203,7 @@ def default_inputs(self):
             value=0.)
 
     def __init__(self, **kwargs):
-        super(fill, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         if any_symbolic(self.shape.shape):
@@ -283,7 +283,7 @@ def default_inputs(self):
             per_class_suppression=False)
 
     def __init__(self, **kwargs):
-        super(non_maximum_suppression, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         boxes_dtype = self.boxes.dtype
@@ -324,7 +324,7 @@ class non_zero(Operation):
     input_spec = InputSpec(x=TensorInputType())
 
     def __init__(self, **kwargs):
-        super(non_zero, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         shape = tuple([get_new_symbol(), self.x.rank])
@@ -380,10 +380,10 @@ def default_inputs(self):
             axis=-1,
             on_value=1,
             off_value=0,
-            )
+        )
 
     def __init__(self, **kwargs):
-        super(one_hot, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         on_type = self.on_value.dtype
@@ -474,7 +474,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(pad, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         in_shape = self.x.shape
@@ -486,8 +486,8 @@ def type_inference(self):
             raise ValueError("Pad mode should be one of {'constant', 'reflect', 'replicate'}")
 
         if pad.val is None:
-            for i in range(self.pad.shape[0]//2):
-                ret_shape[-self.pad.shape[0]//2+i] = get_new_symbol()
+            for i in range(self.pad.shape[0] // 2):
+                ret_shape[-self.pad.shape[0] // 2 + i] = get_new_symbol()
         else:
             pad = pad.val
             pad = pad.copy()
@@ -558,7 +558,7 @@ class range_1d(Operation):
     )
 
     def __init__(self, **kwargs):
-        super(range_1d, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     @precondition(allow=VALUE)
     def value_inference(self):
@@ -622,7 +622,7 @@ class tile(Operation):
     input_spec = InputSpec(x=TensorInputType(), reps=TensorInputType(),)
 
     def __init__(self, **kwargs):
-        super(tile, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         x_type = self.x.dtype
@@ -704,7 +704,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(argsort, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         return types.tensor(types.int32, self.x.shape)
@@ -764,7 +764,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(topk, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         x_type = self.x.dtype
@@ -880,7 +880,7 @@ class shape(Operation):
     input_spec = InputSpec(x = ScalarOrTensorInputType())
 
     def __init__(self, **kwargs):
-        super(shape, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         input_rank = self.x.rank
@@ -953,10 +953,10 @@ class concat(Operation):
     def default_inputs(self):
         return DefaultInputs(
             interleave=False,
-            )
+        )
 
     def __init__(self, **kwargs):
-        super(concat, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         concat_dim_len = 0
@@ -1111,7 +1111,7 @@ class split(Operation):
     )
 
     def __init__(self, **kwargs):
-        super(split, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         num_splits, sizes = self._get_num_splits_and_sizes()
@@ -1214,7 +1214,7 @@ class stack(Operation):
                            axis=IntInputType(const=True),)
 
     def __init__(self, **kwargs):
-        super(stack, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
 
@@ -1279,7 +1279,7 @@ class identity(Operation):
     input_spec = InputSpec(x=ListOrScalarOrTensorInputType())
 
     def __init__(self, **kwargs):
-        super(identity, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         return self.x.sym_type
diff --git a/coremltools/converters/mil/mil/ops/defs/tensor_transformation.py b/coremltools/converters/mil/mil/ops/defs/tensor_transformation.py
index 3e18d7a33..3859e7341 100644
--- a/coremltools/converters/mil/mil/ops/defs/tensor_transformation.py
+++ b/coremltools/converters/mil/mil/ops/defs/tensor_transformation.py
@@ -26,6 +26,7 @@
     InputSpec,
     IntInputType,
     IntTensorInputType,
+    FloatTensorInputType,
     ScalarOrTensorInputType,
     TensorInputType
 )
@@ -137,7 +138,7 @@ class depth_to_space(Operation):
         )
 
     def __init__(self, **kwargs):
-        super(depth_to_space, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         x_type = self.x.dtype
@@ -178,7 +179,7 @@ class expand_dims(Operation):
     )
 
     def __init__(self, **kwargs):
-        super(expand_dims, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         x_rank = self.x.rank
@@ -276,7 +277,7 @@ class reshape(Operation):
         )
 
     def __init__(self, **kwargs):
-        super(reshape, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         if any_symbolic(self.shape.shape):
@@ -404,7 +405,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(reverse, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         return self.x.sym_type
@@ -469,7 +470,7 @@ def default_inputs(self):
             batch_axis=0)
 
     def __init__(self, **kwargs):
-        super(reverse_sequence, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         return self.x.sym_type
@@ -541,7 +542,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(slice_by_index, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
 
@@ -660,7 +661,7 @@ class slice_by_size(Operation):
     )
 
     def __init__(self, **kwargs):
-        super(slice_by_size, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         if self.begin.rank != 1:
@@ -756,7 +757,7 @@ class space_to_depth(Operation):
         block_size=IntInputType(const=True),)
 
     def __init__(self, **kwargs):
-        super(space_to_depth, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         x_type = self.x.dtype
@@ -765,6 +766,161 @@ def type_inference(self):
         ret_shape = (n, c * (bs * bs), h // bs, w // bs)
         return types.tensor(x_type, ret_shape)
 
+@register_op(doc_str="")
+class space_to_batch(Operation):
+    """
+    Rearrange elements in a tensor from spatial into batch dimension.
+
+    Parameters
+    ----------
+    x: tensor<[n, C, H, W], T> (Required)
+        * Input tensor must have rank 4.
+        * The first and the second dimension are batch, channel, respectively
+        * The remaining dimensions (H, W) are treated as "spatial dimensions"
+    block_shape: const tensor<[2], i32> (Required)
+        * The length of the block_shape must be `2`
+        * It defines the shapes of the block in which the spatial dimensions are divided
+    paddings: const tensor<[2, 2], i32> (Required)
+        * It must have shape `(2, 2)`
+        * It defines the padding for each spatial dimension
+
+    Returns
+    -------
+    tensor<[new_n, C, new_H, new_W], T>
+        * new_n = n * block_shape[0] * block_shape[1]
+        * new_H = (H + paddings[0][0] + paddings[0][1])/block_shape[0]
+        * new_W = (W + paddings[1][0] + paddings[1][1])/block_shape[1]
+        * The output has the same rank as the input
+
+    Attributes
+    ----------
+    T: fp16, fp32
+    """
+
+    input_spec = InputSpec(
+        x=FloatTensorInputType(),
+        block_shape=IntInputType(const=True),
+        paddings=IntInputType(const=True),
+        )
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def type_inference(self):
+        x_shape = self.x.shape
+        block_shape = self.block_shape.val
+        paddings = self.paddings.val
+
+        if self.x.rank != 4:
+            msg = "Input to space_to_batch op must be rank 4. Instead got an input with rank {}".format(self.x.rank)
+            raise ValueError(msg)
+
+        if paddings.shape != (block_shape.shape[0], 2):
+            msg = "block_shape and paddings must have shape [2], [2, 2] accordingly in the space_to_batch op. "\
+            "Got {}, {}.".format(block_shape.shape, paddings.shape)
+            raise ValueError(msg)
+
+        m = block_shape.shape[0]
+        if m != 2:
+            msg = "space_to_batch op only supports spatial dimensions = 2. Got {}".format(m)
+            raise ValueError(msg)
+
+        b = x_shape[0]
+        c = x_shape[1]
+        spatial_shape = x_shape[2:2+m]
+
+        if self.x.rank != m + 2:  # NOTE(review): cannot fire here; rank == 4 and m == 2 were enforced above
+            raise ValueError("The input rank of space_to_batch op must exactly be " \
+                             "len(block_shape){} + 2! Got {}".format(self.block_shape.val, self.x.rank))
+
+        padded_spatial_shape = [x + paddings[i][0] + paddings[i][1] for i, x in enumerate(spatial_shape)]
+        new_b = b * np.prod(block_shape)
+        new_spatial_shape = [padded_spatial_shape[i]/block_shape[i] for i in range(m)]
+        ret_shape = [new_b, c] + new_spatial_shape
+        x_type = self.x.dtype
+
+        return types.tensor(x_type, ret_shape)
+
+
+@register_op(doc_str="")
+class batch_to_space(Operation):
+    """
+    Rearrange elements in a tensor from batch into spatial dimension.
+
+    Parameters
+    ----------
+    x: tensor<[n, C, H, W], T> (Required)
+        * Input tensor must have rank 4.
+        * The first and the second dimension are batch, channel, respectively
+        * The remaining dimensions (H, W) are treated as "spatial dimensions"
+    block_shape: const tensor<[2], i32> (Required)
+        * The length of the block_shape must be `2`
+        * It defines the shapes of the block in which the spatial dimensions are multiplied
+    crops: const tensor<[2, 2], i32> (Required)
+        * It must have shape `(2, 2)`
+        * It defines the amount to crop from each spatial dimension
+
+    Returns
+    -------
+    tensor<[new_n, C, new_H, new_W], T>
+        * new_n = n / (block_shape[0] * block_shape[1])
+        * new_H = (H * block_shape[0]) - crops[0][0] - crops[0][1]
+        * new_W = (W * block_shape[1]) - crops[1][0] - crops[1][1]
+        * The output has the same rank as the input
+
+    Attributes
+    ----------
+    T: fp16, fp32
+    """
+
+    input_spec = InputSpec(
+        x=FloatTensorInputType(),
+        block_shape=IntInputType(const=True),
+        crops=IntInputType(const=True),
+        )
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def type_inference(self):
+        x_shape = self.x.shape
+        block_shape = self.block_shape.val
+        crops = self.crops.val
+
+        if self.x.rank != 4:
+            msg = "Input to batch_to_space op must be rank 4. Instead got an input with rank {}".format(self.x.rank)
+            raise ValueError(msg)
+
+        if crops.shape != (block_shape.shape[0], 2):
+            msg = "block_shape and crops must have shape [2], [2, 2] accordingly in the batch_to_space op. "\
+            "Got {}, {}.".format(block_shape.shape, crops.shape)
+            raise ValueError(msg)
+
+        m = block_shape.shape[0]
+        if m != 2:
+            msg = "batch_to_space op only supports spatial dimensions = 2. Got {}".format(m)
+            raise ValueError(msg)
+
+        b = x_shape[0]
+        c = x_shape[1]
+        spatial_shape = x_shape[2:2+m]
+
+        if self.x.rank != m + 2:  # NOTE(review): cannot fire here; rank == 4 and m == 2 were enforced above
+            raise ValueError("The input rank of batch_to_space op must exactly be " \
+                             "len(block_shape){} + 2! Got {}".format(self.block_shape.val, self.x.rank))
+
+        if not is_symbolic(b) and b % np.prod(block_shape) != 0:
+            msg = ("Batch size must be perfectly divided by the product of block_shape. Got batch size {}, and block_shape {}"
+            ).format(b, block_shape)
+            raise ValueError(msg)
+
+        new_b = b / np.prod(block_shape)
+        new_spatial_shape = [spatial_shape[i] * block_shape[i] for i in range(m)]
+        cropped_spatial_shape = [x - crops[i][0] - crops[i][1] for i, x in enumerate(new_spatial_shape)]
+        ret_shape = [new_b, c] + cropped_spatial_shape
+        x_type = self.x.dtype
+
+        return types.tensor(x_type, ret_shape)
 
 @register_op(doc_str="")
 class squeeze(Operation):
@@ -800,7 +956,7 @@ def default_inputs(self):
             )
 
     def __init__(self, **kwargs):
-        super(squeeze, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         x_type = self.x.dtype
@@ -862,7 +1018,7 @@ class transpose(Operation):
         perm=IntTensorInputType(const=True),)
 
     def __init__(self, **kwargs):
-        super(transpose, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         x_type = self.x.dtype
@@ -916,7 +1072,7 @@ class pixel_shuffle(Operation):
     )
 
     def __init__(self, **kwargs):
-        super(pixel_shuffle, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         x_type = self.x.dtype
@@ -969,7 +1125,7 @@ def default_inputs(self):
         return DefaultInputs(stride=1)
 
     def __init__(self, **kwargs):
-        super(sliding_windows, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
     def type_inference(self):
         x_shape = self.x.shape
diff --git a/coremltools/converters/mil/mil/ops/tests/test_activation.py b/coremltools/converters/mil/mil/ops/tests/test_activation.py
index d6a57a8ea..8b1b6a0c2 100644
--- a/coremltools/converters/mil/mil/ops/tests/test_activation.py
+++ b/coremltools/converters/mil/mil/ops/tests/test_activation.py
@@ -325,21 +325,49 @@ def build(x):
 
 class TestPReLU:
     @pytest.mark.parametrize(
-        "use_cpu_only, backend", itertools.product([True, False], backends,)
+        "rank, alpha_values, use_cpu_only, backend", itertools.product(
+            [3, 4, 5],
+            [[1.0, 2.0, 3.0], [4.0, 4.0, 4.0]],
+            [True, False],
+            backends,
+        )
     )
-    def test_builder_to_backend_smoke(self, use_cpu_only, backend):
-        t = np.array([[[[-1, 3, 6]], [[-1, 2, -3]], [[4, -5, 6]]]], dtype=np.float32)
-        input_placeholders = {"x": mb.placeholder(shape=t.shape)}
-        input_values = {"x": t}
+    def test_builder_to_backend_smoke(self, rank, alpha_values, use_cpu_only, backend):
+        if (backend[0] == "mlprogram" and backend[1] == "fp16"):
+            pytest.xfail("rdar://92175249 ([MIL] TestActivation::test_prelu[backend=(mlprogram, fp16)] CI failure)")
 
-        def build(x):
-            return mb.prelu(x=x, alpha=np.array([1, 2, 3], dtype=np.float32))
+        alpha = np.array(alpha_values, dtype=np.float32)
 
-        expected_output_types = (1, 3, 1, 3, types.fp32)
+        if rank == 3 or rank == 5:
+            are_alpha_values_same = np.where(np.abs(alpha - alpha[0]) > 1e-5)[0].size == 0
+            if not are_alpha_values_same:
+                pytest.xfail("rdar://91442339")
+
+        t = np.array([[[[-1, 3]], [[-1, 2]], [[4, -5]]]], dtype=np.float32)
         expected_outputs = np.array(
-            [[[[-1, 3, 6]], [[-2, 2, -6]], [[4, -15, 6]]]], dtype=np.float32
+            [[[[-1 * alpha[0], 3]], [[-1 * alpha[1], 2]], [[4, -5 * alpha[2]]]]], dtype=np.float32
         )
 
+        shape = None
+        if rank == 3:
+            shape = (1, 3, 2)
+        elif rank == 4:
+            shape = (1, 3, 1, 2)
+        elif rank == 5:
+            shape = (1, 3, 1, 1, 2)
+        else:
+            raise ValueError("rank not supported")
+
+        t = np.reshape(t, shape)
+        expected_outputs = np.reshape(expected_outputs, shape)
+        expected_output_types = tuple([s for s in shape]) + (types.fp32,)
+
+        input_placeholders = {"x": mb.placeholder(shape=t.shape)}
+        input_values = {"x": t}
+
+        def build(x):
+            return mb.prelu(x=x, alpha=alpha)
+
         run_compare_builder(
             build,
             input_placeholders,
@@ -371,19 +399,18 @@ def test_builder_eval(self):
     def test_builder_eval1(self):
         x_val = np.array([[[-1, 3, 6]], [[-1, 2, -3]], [[4, -5, 6]]], dtype=np.float32)
         with pytest.raises(ValueError, match=r".* dimension 1 .*"):
-            v = mb.prelu(x=x_val, alpha=np.array([1, 2], dtype=np.float32))
+            mb.prelu(x=x_val, alpha=np.array([1, 2], dtype=np.float32))
 
     @ssa_fn
     def test_builder_eval2(self):
         x_val = np.array([[[-1, 3, 6]], [[-1, 2, -3]], [[4, -5, 6]]], dtype=np.float32)
         with pytest.raises(ValueError, match=r"alpha .* rank 1"):
-            v = mb.prelu(x=x_val, alpha=np.array([[1, 2, 3]], dtype=np.float32))
+            mb.prelu(x=x_val, alpha=np.array([[1, 2, 3]], dtype=np.float32))
 
     @ssa_fn
     def test_builder_eval3(self):
-        x_val = np.array([[[-1, 3, 6]], [[-1, 2, -3]], [[4, -5, 6]]], dtype=np.float32)
         with pytest.raises(ValueError, match=r"x .* rank 3"):
-            v = mb.prelu(x=[1], alpha=np.array([[1, 2, 3]], dtype=np.float32))
+            mb.prelu(x=[1], alpha=np.array([[1, 2, 3]], dtype=np.float32))
 
     @pytest.mark.parametrize(
         "use_cpu_only, backend, dim, chan",
@@ -782,7 +809,7 @@ def test_builder_eval(self):
     def test_builder_eval2(self):
         x_val = np.array([[[-1, 3, 6]], [[-1, 2, -3]], [[4, -5, 6]]], dtype=np.float32)
         with pytest.raises(ValueError, match=r".* dimension 1 .*"):
-            v = mb.softplus_parametric(
+            mb.softplus_parametric(
                 x=x_val,
                 alpha=np.array([1, 2], dtype=np.float32),
                 beta=np.array([4, 5, 6], dtype=np.float32),
@@ -792,7 +819,7 @@ def test_builder_eval2(self):
     def test_builder_eval3(self):
         x_val = np.array([[[-1, 3, 6]], [[-1, 2, -3]], [[4, -5, 6]]], dtype=np.float32)
         with pytest.raises(ValueError, match=r"alpha .* rank 1"):
-            v = mb.softplus_parametric(
+            mb.softplus_parametric(
                 x=x_val,
                 alpha=np.array([[1, 2, 3]], dtype=np.float32),
                 beta=np.array([4, 5, 6], dtype=np.float32),
@@ -801,7 +828,7 @@ def test_builder_eval3(self):
     @ssa_fn
     def test_builder_eval4(self):
         with pytest.raises(ValueError, match=r"x .* rank 3"):
-            v = mb.softplus_parametric(
+            mb.softplus_parametric(
                 x=[1],
                 alpha=np.array([[1, 2, 3]], dtype=np.float32),
                 beta=np.array([4, 5, 6], dtype=np.float32),
@@ -811,7 +838,7 @@ def test_builder_eval4(self):
     def test_builder_eval5(self):
         x_val = np.array([[[-1, 3, 6]], [[-1, 2, -3]], [[4, -5, 6]]], dtype=np.float32)
         with pytest.raises(ValueError, match=r".* dimension 1 .*"):
-            v = mb.softplus_parametric(
+            mb.softplus_parametric(
                 x=x_val,
                 alpha=np.array([1, 2, 3], dtype=np.float32),
                 beta=np.array([5, 6], dtype=np.float32),
@@ -821,7 +848,7 @@ def test_builder_eval5(self):
     def test_builder_eval6(self):
         x_val = np.array([[[[-1, 3, 6]], [[-1, 2, -3]], [[4, -5, 6]]]], dtype=np.float32)
         with pytest.raises(ValueError, match=r"beta .* rank 1"):
-            v = mb.softplus_parametric(
+            mb.softplus_parametric(
                 x=x_val,
                 alpha=np.array([1, 2, 3], dtype=np.float32),
                 beta=np.array([[4, 5, 6]], dtype=np.float32),
diff --git a/coremltools/converters/mil/mil/ops/tests/test_const.py b/coremltools/converters/mil/mil/ops/tests/test_const.py
index 735cadce0..3d6ade7d6 100644
--- a/coremltools/converters/mil/mil/ops/tests/test_const.py
+++ b/coremltools/converters/mil/mil/ops/tests/test_const.py
@@ -2,14 +2,16 @@
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
 import itertools
+
 import numpy as np
 import pytest
 
+from .testing_utils import run_compare_builder
 from coremltools.converters.mil import testing_reqs
 from coremltools.converters.mil.mil import Builder as mb, types
 
-from .testing_utils import run_compare_builder
 
 backends = testing_reqs.backends
 
@@ -36,6 +38,8 @@ class TestConst:
     def test_builder_to_backend_smoke(self, use_cpu_for_conversion, backend, dtype):
         if backend[0] == "mlprogram" and not use_cpu_for_conversion:
             pytest.xfail("rdar://78343191 ((MIL GPU) Core ML Tools Unit Test failures [failure to load or Seg fault])")
+        if backend[0] == "mlprogram" and dtype in [np.uint8, np.int8, np.uint32]:
+            pytest.xfail("Data type not supported")
 
         t = np.random.randint(0, 5, (4, 2)).astype(np.float32)
         constant = np.random.randint(0, 5, (4, 2)).astype(dtype)
@@ -62,5 +66,4 @@ def build(x):
             use_cpu_only=use_cpu_for_conversion,
             frontend_only=False,
             backend=backend,
-            use_cpu_for_conversion=use_cpu_for_conversion,
         )
diff --git a/coremltools/converters/mil/mil/ops/tests/test_constexpr_ops.py b/coremltools/converters/mil/mil/ops/tests/test_constexpr_ops.py
new file mode 100644
index 000000000..48474c53e
--- /dev/null
+++ b/coremltools/converters/mil/mil/ops/tests/test_constexpr_ops.py
@@ -0,0 +1,203 @@
+#  Copyright (c) 2020, Apple Inc. All rights reserved.
+#
+#  Use of this source code is governed by a BSD-3-clause license that can be
+#  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+import itertools
+
+import numpy as np
+import pytest
+
+import coremltools as ct
+from coremltools.converters.mil.mil import Builder as mb, types
+from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder
+
+backends = [("mlprogram", "fp32"), ("mlprogram", "fp16")]
+
+@pytest.mark.skipif(
+    ct.utils._macos_version() < (13, 0),
+    reason="ConstExpr ops available from macOS13 onwards.",
+)
+class TestConstexprAffineDequantize:
+    @pytest.mark.parametrize(
+        "use_cpu_for_conversion, backend", itertools.product([True, False], backends)
+    )
+    def test_builder_to_backend_smoke(self, use_cpu_for_conversion, backend):
+
+        t = np.array(range(4)).reshape(1, 1, 2, 2).astype(np.float32)
+        decompressed_constant = (
+            np.array([1, 2, 3, 4]).reshape(1, 1, 2, 2).astype(np.float32)
+        )
+        input_placeholders = {
+            "x": mb.placeholder(shape=t.shape),
+        }
+        input_values = {"x": t}
+
+        def build(x):
+            quantized_data = np.array([3, 5, 5, 6]).reshape(1, 1, 2, 2).astype(np.uint8)
+            scale = np.array([1, 2]).astype(np.float32)
+            zero_point = np.array([2, 4]).astype(np.uint8)
+            axis = 3
+            y = mb.constexpr_affine_dequantize(
+                quantized_data=quantized_data,
+                zero_point=zero_point,
+                scale=scale,
+                axis=axis,
+            )
+            return mb.add(x=x, y=y)
+
+        expected_output_types = (1, 1, 2, 2, types.fp32)
+        expected_outputs = t + decompressed_constant.astype(np.float32)
+
+        run_compare_builder(
+            build,
+            input_placeholders,
+            input_values,
+            expected_output_types,
+            expected_outputs,
+            use_cpu_only=use_cpu_for_conversion,
+            frontend_only=False,
+            backend=backend,
+            converter=ct.convert,
+            minimum_deployment_target=ct.target.iOS16,
+        )
+
+@pytest.mark.skipif(
+    ct.utils._macos_version() < (13, 0),
+    reason="ConstExpr ops available from macOS13 onwards.",
+)
+class TestConstexprCast:
+    @pytest.mark.parametrize(
+        "use_cpu_for_conversion, backend", itertools.product([True, False], backends)
+    )
+    def test_builder_to_backend_smoke(self, use_cpu_for_conversion, backend):
+
+        t = np.array(range(4)).reshape(4, 1).astype(np.float32)
+        decompressed_constant = np.array([1, 2, 3, 4]).reshape(4, 1).astype(np.float32)
+        input_placeholders = {
+            "x": mb.placeholder(shape=t.shape),
+        }
+        input_values = {"x": t}
+
+        def build(x):
+            source_val = np.array([1, 2, 3, 4]).reshape(4, 1).astype(np.float16)
+            y = mb.constexpr_cast(source_val=source_val, output_dtype="fp32")
+            return mb.add(x=x, y=y)
+
+        expected_output_types = (4, 1, types.fp32)
+        expected_outputs = t + decompressed_constant.astype(np.float32)
+
+        run_compare_builder(
+            build,
+            input_placeholders,
+            input_values,
+            expected_output_types,
+            expected_outputs,
+            use_cpu_only=use_cpu_for_conversion,
+            frontend_only=False,
+            backend=backend,
+            converter=ct.convert,
+            minimum_deployment_target=ct.target.iOS16,
+        )
+
+
+@pytest.mark.skipif(
+    ct.utils._macos_version() < (13, 0),
+    reason="ConstExpr ops available from macOS13 onwards.",
+)
+class TestConstexprLutToDense:
+    @pytest.mark.parametrize(
+        "use_cpu_for_conversion, backend", itertools.product([True, False], backends)
+    )
+    def test_builder_to_backend_smoke(self, use_cpu_for_conversion, backend):
+
+        t = np.array(range(4)).reshape(4, 1).astype(np.float32)
+        decompressed_constant = np.array([1, 2, 3, 4]).reshape(4, 1).astype(np.float32)
+        input_placeholders = {
+            "x": mb.placeholder(shape=t.shape),
+        }
+        input_values = {"x": t}
+
+        def build(x):
+            lut_data = np.array(
+                [
+                    -19.0,
+                    4.0,
+                    0.0,
+                    -1.0,
+                    1.0,
+                    3.0,
+                    5.0,
+                    -8.0,
+                    19,
+                    13,
+                    42,
+                    4.5,
+                    5.4,
+                    2.0,
+                    -6,
+                    -7,
+                ]
+            ).astype(np.float32)
+            indices = np.array([212, 21]).astype(np.uint8)
+            shape = np.array([4, 1]).astype(np.uint32)
+            y = mb.constexpr_lut_to_dense(lut=lut_data, indices=indices, shape=shape)
+            return mb.add(x=x, y=y)
+
+        expected_output_types = (4, 1, types.fp32)
+        expected_outputs = t + decompressed_constant.astype(np.float32)
+
+        run_compare_builder(
+            build,
+            input_placeholders,
+            input_values,
+            expected_output_types,
+            expected_outputs,
+            use_cpu_only=use_cpu_for_conversion,
+            frontend_only=False,
+            backend=backend,
+            converter=ct.convert,
+            minimum_deployment_target=ct.target.iOS16,
+        )
+
+@pytest.mark.skipif(
+    ct.utils._macos_version() < (13, 0),
+    reason="ConstExpr ops available from macOS13 onwards.",
+)
+class TestConstexprSparseToDense:
+    @pytest.mark.parametrize(
+        "use_cpu_for_conversion, backend", itertools.product([True, False], backends)
+    )
+    def test_builder_to_backend_smoke(self, use_cpu_for_conversion, backend):
+
+        t = np.array(range(4)).reshape(4, 1).astype(np.float32)
+        decompressed_constant = np.array([1, 2, 0, 4]).reshape(4, 1).astype(np.float32)
+        input_placeholders = {
+            "x": mb.placeholder(shape=t.shape),
+        }
+        input_values = {"x": t}
+
+        def build(x):
+            nonzero_data = np.array([1, 2, 4]).astype(np.float32)
+            mask = np.array([11]).astype(np.uint8)
+            shape = np.array([4, 1]).astype(np.uint32)
+            y = mb.constexpr_sparse_to_dense(
+                nonzero_data=nonzero_data, mask=mask, shape=shape
+            )
+            return mb.add(x=x, y=y)
+
+        expected_output_types = (4, 1, types.fp32)
+        expected_outputs = t + decompressed_constant.astype(np.float32)
+
+        run_compare_builder(
+            build,
+            input_placeholders,
+            input_values,
+            expected_output_types,
+            expected_outputs,
+            use_cpu_only=use_cpu_for_conversion,
+            frontend_only=False,
+            backend=backend,
+            converter=ct.convert,
+            minimum_deployment_target=ct.target.iOS16,
+        )
diff --git a/coremltools/converters/mil/mil/ops/tests/test_control_flow.py b/coremltools/converters/mil/mil/ops/tests/test_control_flow.py
index 4beb14a45..9d1febbd9 100644
--- a/coremltools/converters/mil/mil/ops/tests/test_control_flow.py
+++ b/coremltools/converters/mil/mil/ops/tests/test_control_flow.py
@@ -2,7 +2,9 @@
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
 import itertools
+
 import pytest
 import numpy as np
 
@@ -150,7 +152,6 @@ def false_fn():
             use_cpu_only=use_cpu_for_conversion,
             frontend_only=False,
             backend=backend,
-            use_cpu_for_conversion=use_cpu_for_conversion,
         )
 
 
diff --git a/coremltools/converters/mil/mil/ops/tests/test_conv.py b/coremltools/converters/mil/mil/ops/tests/test_conv.py
index a0ef9c496..f4111c379 100644
--- a/coremltools/converters/mil/mil/ops/tests/test_conv.py
+++ b/coremltools/converters/mil/mil/ops/tests/test_conv.py
@@ -556,7 +556,6 @@ def build(x, input_weight):
             use_cpu_only=use_cpu_only,
             frontend_only=False,
             backend=backend,
-            use_cpu_for_conversion=use_cpu_only,
         )
 
     @pytest.mark.parametrize(
diff --git a/coremltools/converters/mil/mil/ops/tests/test_elementwise_binary.py b/coremltools/converters/mil/mil/ops/tests/test_elementwise_binary.py
index bfe4560e0..f66d9c3d1 100644
--- a/coremltools/converters/mil/mil/ops/tests/test_elementwise_binary.py
+++ b/coremltools/converters/mil/mil/ops/tests/test_elementwise_binary.py
@@ -499,7 +499,6 @@ def build(x, y):
             use_cpu_only=use_cpu_for_conversion,
             frontend_only=False,
             backend=backend,
-            use_cpu_for_conversion=use_cpu_for_conversion,
         )
 
     @ssa_fn
diff --git a/coremltools/converters/mil/mil/ops/tests/test_elementwise_unary.py b/coremltools/converters/mil/mil/ops/tests/test_elementwise_unary.py
index 2215383d6..6252b9cbc 100644
--- a/coremltools/converters/mil/mil/ops/tests/test_elementwise_unary.py
+++ b/coremltools/converters/mil/mil/ops/tests/test_elementwise_unary.py
@@ -4,11 +4,12 @@
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
 import itertools
+
 import numpy as np
 import pytest
 import scipy
 from scipy import special
-
+from .testing_utils import run_compare_builder
 from coremltools.converters.mil import testing_reqs
 from coremltools.converters.mil.mil import (
     Builder as mb,
@@ -20,7 +21,6 @@
 from coremltools.converters.mil.mil.types.symbolic import is_compatible_symbolic_vector
 from coremltools.converters.mil.testing_utils import ssa_fn
 
-from .testing_utils import run_compare_builder
 
 backends = testing_reqs.backends
 
@@ -285,7 +285,6 @@ def test_builder_to_backend_smoke(self, use_cpu_for_conversion, backend, mode):
             use_cpu_only=use_cpu_for_conversion,
             frontend_only=False,
             backend=backend,
-            use_cpu_for_conversion=use_cpu_for_conversion,
         )
 
     @ssa_fn
@@ -686,11 +685,10 @@ def build(x):
             use_cpu_only=use_cpu_for_conversion,
             frontend_only=False,
             backend=backend,
-            use_cpu_for_conversion=use_cpu_for_conversion,
         )
 
     def test_erf_value_inference(self):
-        INPUT_SIZE=(2,3,4)
+        INPUT_SIZE=(2, 3, 4)
         rs = np.random.RandomState(1234)
         x = rs.random(INPUT_SIZE)
 
diff --git a/coremltools/converters/mil/mil/ops/tests/test_image_resizing.py b/coremltools/converters/mil/mil/ops/tests/test_image_resizing.py
index 92c14fed7..c838ccca3 100644
--- a/coremltools/converters/mil/mil/ops/tests/test_image_resizing.py
+++ b/coremltools/converters/mil/mil/ops/tests/test_image_resizing.py
@@ -5,6 +5,7 @@
 
 import functools
 import itertools
+
 import numpy as np
 import pytest
 
@@ -291,7 +292,6 @@ def build(x):
             expected_outputs,
             use_cpu_only=use_cpu_for_conversion,
             backend=backend,
-            use_cpu_for_conversion=use_cpu_for_conversion,
         )
 
 
diff --git a/coremltools/converters/mil/mil/ops/tests/test_linear.py b/coremltools/converters/mil/mil/ops/tests/test_linear.py
index bb37ed611..515bb06d4 100644
--- a/coremltools/converters/mil/mil/ops/tests/test_linear.py
+++ b/coremltools/converters/mil/mil/ops/tests/test_linear.py
@@ -57,6 +57,9 @@ def test_builder_eval(self):
         itertools.product([True, False], backends, [2, 3, 5]),
     )
     def test_builder_to_backend_stress(self, use_cpu_only, backend, rank):
+        if backend[0] == "mlprogram" and rank == 5 and not use_cpu_only:
+            pytest.xfail("rdar://94199353 (TestLinear.test_builder_to_backend_stress failing on the CI)")
+
         x_shape = np.random.randint(low=1, high=3, size=(rank,))
         x_val = np.random.rand(*x_shape)
         out_channels = 3
diff --git a/coremltools/converters/mil/mil/ops/tests/test_normalization.py b/coremltools/converters/mil/mil/ops/tests/test_normalization.py
index 4981ea56a..b8229f54d 100644
--- a/coremltools/converters/mil/mil/ops/tests/test_normalization.py
+++ b/coremltools/converters/mil/mil/ops/tests/test_normalization.py
@@ -5,7 +5,6 @@
 
 import itertools
 import numpy as np
-from numpy import linalg as la
 import pytest
 
 from .testing_utils import UNK_SYM, run_compare_builder
@@ -250,6 +249,23 @@ def build(x):
 
 
 class TestNormalizationL2Norm:
+
+    @staticmethod
+    def _compute_l2_norm(val, eps):
+        shape = val.shape
+        rank = len(shape)
+        batch_dims = rank - 3
+        if batch_dims == 0:
+            square_sum = np.sum(val**2)
+            output = val/np.power(square_sum + eps, 0.5)
+        else:
+            batch_dim_prod = np.prod(shape[:batch_dims])
+            reshape_val = np.reshape(val, (batch_dim_prod, -1))
+            square_sum = np.sum(reshape_val * reshape_val, axis=1, keepdims=True) + eps
+            output = reshape_val/np.power(square_sum, 0.5)
+            output = np.reshape(output, shape)
+        return output
+
     @pytest.mark.parametrize(
         "use_cpu_only, backend", itertools.product([True, False], backends,)
     )
@@ -286,29 +302,18 @@ def build(x):
         )
 
     @pytest.mark.parametrize(
-        "use_cpu_only, backend, rank", itertools.product([True, False], backends, [3, 4, 5])
+        "use_cpu_only, backend, rank, epsilon", itertools.product([True, False], backends, [3, 4, 5], [1e-4, 5.7])
     )
-    def test_builder_to_backend_stress(self, use_cpu_only, backend, rank):
+    def test_builder_to_backend_stress(self, use_cpu_only, backend, rank, epsilon):
         shape = np.random.randint(low=2, high=6, size=rank)
-        x_val = random_gen(shape=shape, rand_min=-10.0, rand_max=10.0)
+        x_val = random_gen(shape=shape, rand_min=-1.0, rand_max=1.0)
         input_placeholders = {"x": mb.placeholder(shape=shape)}
         input_values = {"x": x_val}
 
         def build(x):
-            return [mb.l2_norm(x=x, epsilon=1e-12)]
-
-        # compute for the answer
-        batch_dims = rank - 3
-        if batch_dims == 0:
-            norm = la.norm(x_val)
-            output = x_val/norm
-        else:
-            batch_dim_prod = np.prod(shape[:batch_dims])
-            reshape_x_val = np.reshape(x_val, (batch_dim_prod, -1))
-            norm = la.norm(reshape_x_val, axis=1, keepdims=True)
-            output = reshape_x_val/norm
-            output = np.reshape(output, shape)
+            return [mb.l2_norm(x=x, epsilon=epsilon)]
 
+        output = TestNormalizationL2Norm._compute_l2_norm(x_val, epsilon)
         expected_output_types = [list(output.shape) + [types.fp32]]
         expected_outputs = [
             output
@@ -324,11 +329,25 @@ def build(x):
             backend=backend,
         )
 
+    @pytest.mark.parametrize("rank, epsilon",
+        itertools.product(
+            [3, 4, 5],
+            [1e-4, 11.2],
+        ),
+    )
+    def test_builder_eval_stress(self, rank, epsilon):
+        shape = np.random.randint(low=2, high=6, size=rank)
+        x_val = random_gen(shape=shape, rand_min=-1, rand_max=1)
+        with Function({}):
+            res = mb.l2_norm(x=x_val, epsilon=epsilon)
+            ref = TestNormalizationL2Norm._compute_l2_norm(x_val, epsilon)
+            np.testing.assert_allclose(ref, res.val, atol=1e-6, rtol=1e-5)
+
 
 class TestNormalizationLayerNorm:
 
     @staticmethod
-    def _keras_layer_norm( x, axes, epsilon):
+    def _keras_layer_norm(x, axes, epsilon):
         layer = tf.keras.layers.LayerNormalization(axis=axes, epsilon=epsilon)
         data = tf.constant(x, dtype=tf.float32)
         output = layer(data)
@@ -377,9 +396,9 @@ def build(x):
             np.array(
                 [
                     [
-                        [ 0.9999969,  -0.9999969 ],
-                        [ 0.99999833, -0.99999833],
-                        [ 0.99995005, -0.99995005],
+                        [0.9999969,  -0.9999969 ],
+                        [0.99999833, -0.99999833],
+                        [0.99995005, -0.99995005],
                     ]
                 ],
                 dtype=np.float32,
@@ -387,8 +406,8 @@ def build(x):
             np.array(
                 [
                     [
-                        [ 0.82687193, -1.06312108],
-                        [ 1.77186835, -0.82687193],
+                        [0.82687193, -1.06312108],
+                        [1.77186835, -0.82687193],
                         [-0.11812456, -0.59062278],
                     ]
                 ],
@@ -397,9 +416,9 @@ def build(x):
             np.array(
                 [
                     [
-                        [ 1.9999969,  -0.9999969 ],
-                        [ 1.99999833, -0.99999833],
-                        [ 1.99995005, -0.99995005],
+                        [1.9999969,  -0.9999969 ],
+                        [1.99999833, -0.99999833],
+                        [1.99995005, -0.99995005],
                     ]
                 ],
                 dtype=np.float32,
diff --git a/coremltools/converters/mil/mil/ops/tests/test_random.py b/coremltools/converters/mil/mil/ops/tests/test_random.py
index 7bc172e1b..84ddecf0a 100644
--- a/coremltools/converters/mil/mil/ops/tests/test_random.py
+++ b/coremltools/converters/mil/mil/ops/tests/test_random.py
@@ -4,9 +4,10 @@
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
 import itertools
+import unittest
+
 import numpy as np
 import pytest
-import unittest
 
 from .testing_utils import UNK_SYM, run_compare_builder
 from coremltools.converters.mil.mil import Builder as mb, types
diff --git a/coremltools/converters/mil/mil/ops/tests/test_recurrent.py b/coremltools/converters/mil/mil/ops/tests/test_recurrent.py
index 7014db34c..0772c3e92 100644
--- a/coremltools/converters/mil/mil/ops/tests/test_recurrent.py
+++ b/coremltools/converters/mil/mil/ops/tests/test_recurrent.py
@@ -4,6 +4,7 @@
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
 import itertools
+
 import numpy as np
 import pytest
 
diff --git a/coremltools/converters/mil/mil/ops/tests/test_scatter_gather.py b/coremltools/converters/mil/mil/ops/tests/test_scatter_gather.py
index 447e815f3..65f6542b1 100644
--- a/coremltools/converters/mil/mil/ops/tests/test_scatter_gather.py
+++ b/coremltools/converters/mil/mil/ops/tests/test_scatter_gather.py
@@ -4,8 +4,9 @@
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
 import itertools
-import pytest
+
 import numpy as np
+import pytest
 
 from .testing_utils import run_compare_builder
 from coremltools._deps import _HAS_TF_1, MSG_TF1_NOT_FOUND
@@ -438,45 +439,6 @@ def build(indices):
             backend=backend,
         )
 
-    @pytest.mark.parametrize(
-        "use_cpu_only, backend", itertools.product([True, False], backends,)
-    )
-    def test_embedding_builder_to_backend_smoke(self, use_cpu_only, backend):
-        x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
-        indices = np.array([1, 0], dtype=np.int32)
-        input_placeholders = {
-            "indices": mb.placeholder(shape=indices.shape, dtype=types.int32),
-        }
-
-        input_values = {"indices": indices}
-
-        def build(indices):
-            return [
-                mb.gather(x=x, indices=indices, axis=0),
-                mb.gather(x=x, indices=indices, axis=-2),
-            ]
-
-        expected_output_types = [
-            (2, 3, types.fp32),
-            (2, 3, types.fp32),
-        ]
-
-        expected_outputs = [
-            np.array([[4, 5, 6], [1, 2, 3]], dtype=np.float32),
-            np.array([[4, 5, 6], [1, 2, 3]], dtype=np.float32),
-        ]
-
-        run_compare_builder(
-            build,
-            input_placeholders,
-            input_values,
-            expected_output_types,
-            expected_outputs,
-            use_cpu_only=use_cpu_only,
-            frontend_only=False,
-            backend=backend,
-        )
-
     @ssa_fn
     def test_builder_eval(self):
         x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
@@ -590,7 +552,6 @@ def build(x, indices):
             use_cpu_only=use_cpu_for_conversion,
             frontend_only=False,
             backend=backend,
-            use_cpu_for_conversion=use_cpu_for_conversion,
         )
 
 
diff --git a/coremltools/converters/mil/mil/ops/tests/test_tensor_operation.py b/coremltools/converters/mil/mil/ops/tests/test_tensor_operation.py
index 7e3dd02c8..97caac4ee 100644
--- a/coremltools/converters/mil/mil/ops/tests/test_tensor_operation.py
+++ b/coremltools/converters/mil/mil/ops/tests/test_tensor_operation.py
@@ -170,43 +170,43 @@ def test_invalid_reverse1(self):
     def test_invalid_reverse2(self):
         x_val = random_gen(shape=(1, 2, 3, 4, 5), rand_min=-100, rand_max=100)
         with pytest.raises(ValueError):
-            pred = mb.cumsum(x=x_val, reverse=0)
+            mb.cumsum(x=x_val, reverse=0)
 
     @ssa_fn
     def test_invalid_reverse3(self):
         x_val = random_gen(shape=(1, 2, 3, 4, 5), rand_min=-100, rand_max=100)
         with pytest.raises(ValueError):
-            pred = mb.cumsum(x=x_val, reverse=1)
+            mb.cumsum(x=x_val, reverse=1)
 
     @ssa_fn
     def test_invalid_exclusive1(self):
         x_val = random_gen(shape=(1, 2, 3, 4, 5), rand_min=-100, rand_max=100)
         with pytest.raises(ValueError):
-            pred = mb.cumsum(x=x_val, exclusive="")
+            mb.cumsum(x=x_val, exclusive="")
 
     @ssa_fn
     def test_invalid_exclusive2(self):
         x_val = random_gen(shape=(1, 2, 3, 4, 5), rand_min=-100, rand_max=100)
         with pytest.raises(ValueError):
-            pred = mb.cumsum(x=x_val, exclusive=0)
+            mb.cumsum(x=x_val, exclusive=0)
 
     @ssa_fn
     def test_invalid_exclusive3(self):
         x_val = random_gen(shape=(1, 2, 3, 4, 5), rand_min=-100, rand_max=100)
         with pytest.raises(ValueError):
-            pred = mb.cumsum(x=x_val, exclusive=1)
+            mb.cumsum(x=x_val, exclusive=1)
 
     @ssa_fn
     def test_invalid_input1(self):
         x_val = 1
         with pytest.raises(ValueError):
-            pred = mb.cumsum(x=x_val)
+            mb.cumsum(x=x_val)
 
     @ssa_fn
     def test_invalid_input2(self):
         x_val = ["1"]
         with pytest.raises(ValueError):
-            pred = mb.cumsum(x=x_val)
+            mb.cumsum(x=x_val)
 
 
 class TestFill:
@@ -562,6 +562,7 @@ def test_builder_to_backend_stress(
             iou_threshold = np.maximum(np.min(iou_matrix) - 0.01, 0.0)
         else:
             iou_threshold = np.percentile(iou_matrix, iou_threshold_percentile) + 0.01
+        iou_threshold = np.maximum(iou_threshold, 1e-8)
 
         (
             tf_boxes,
@@ -984,7 +985,6 @@ def build(x, y, z):
         "use_cpu_only, backend", itertools.product([True, False], backends,)
     )
     def test_large_array(self, use_cpu_only, backend):
-        
         input_placeholders = {
             "x": mb.placeholder(shape=(1,)), # dummpy input
         }
diff --git a/coremltools/converters/mil/mil/ops/tests/test_tensor_transformation.py b/coremltools/converters/mil/mil/ops/tests/test_tensor_transformation.py
index 02f3e7bff..a0b9ad910 100644
--- a/coremltools/converters/mil/mil/ops/tests/test_tensor_transformation.py
+++ b/coremltools/converters/mil/mil/ops/tests/test_tensor_transformation.py
@@ -49,6 +49,82 @@ def build(x):
         )
 
 
+class TestSpaceToBatch:
+    @pytest.mark.parametrize(
+        "use_cpu_only, backend", itertools.product([True, False], backends,)
+    )
+    def test_builder_to_backend_smoke(self, use_cpu_only, backend):
+        # original input type is (2, 1, 2, 4, fp32)
+        val = np.array([[[[ 1,  2,  3,  4],
+                          [ 5,  6,  7,  8]]],
+                        [[[ 9, 10, 11, 12],
+                          [13, 14, 15, 16]]]], dtype=np.float32)
+        input_placeholders = {"x": mb.placeholder(shape=val.shape)}
+        input_values = {"x": val}
+
+        def build(x):
+            return [mb.space_to_batch(x=x, block_shape=[2, 2], paddings=[[0, 0], [2, 0]])]
+
+        expected_output_types = (8, 1, 1, 3, types.fp32)
+        expected_outputs = np.array([[[[ 0,  1,  3]]],
+                                     [[[ 0,  9, 11]]],
+                                     [[[ 0,  2,  4]]],
+                                     [[[ 0, 10, 12]]],
+                                     [[[ 0,  5,  7]]],
+                                     [[[ 0, 13, 15]]],
+                                     [[[ 0,  6,  8]]],
+                                     [[[ 0, 14, 16]]]], dtype=np.float32)
+
+        run_compare_builder(
+            build,
+            input_placeholders,
+            input_values,
+            expected_output_types,
+            expected_outputs,
+            use_cpu_only=use_cpu_only,
+            frontend_only=False,
+            backend=backend,
+        )
+
+
+class TestBatchToSpace:
+    @pytest.mark.parametrize(
+        "use_cpu_only, backend", itertools.product([True, False], backends,)
+    )    
+    def test_builder_to_backend_smoke(self, use_cpu_only, backend):
+        # original input type is (8, 1, 1, 3, fp32)
+        val = np.array([[[[ 0,  1,  3]]],
+                       [[[ 0,  9, 11]]],
+                       [[[ 0,  2,  4]]],
+                       [[[ 0, 10, 12]]],
+                       [[[ 0,  5,  7]]],
+                       [[[ 0, 13, 15]]],
+                       [[[ 0,  6,  8]]],
+                       [[[ 0, 14, 16]]]], dtype=np.float32)
+        input_placeholders = {"x": mb.placeholder(shape=val.shape)}
+        input_values = {"x": val}
+
+        def build(x):
+            return [mb.batch_to_space(x=x, block_shape=[2, 2], crops=[[0, 0], [2, 0]])]
+
+        expected_output_types = (2, 1, 2, 4, types.fp32)
+        expected_outputs = np.array([[[[ 1,  2,  3,  4],
+                                       [ 5,  6,  7,  8]]],
+                                     [[[ 9, 10, 11, 12],
+                                       [13, 14, 15, 16]]]], dtype=np.float32)
+
+        run_compare_builder(
+            build,
+            input_placeholders,
+            input_values,
+            expected_output_types,
+            expected_outputs,
+            use_cpu_only=use_cpu_only,
+            frontend_only=False,
+            backend=backend,
+        )
+
+
 class TestExpandDims:
     @pytest.mark.parametrize(
         "use_cpu_only, backend", itertools.product([True, False], backends,)
@@ -645,7 +721,6 @@ def build(x):
             expected_outputs,
             use_cpu_only=use_cpu_for_conversion,
             backend=backend,
-            use_cpu_for_conversion=use_cpu_for_conversion,
         )
 
     @ssa_fn
diff --git a/coremltools/converters/mil/mil/ops/tests/testing_utils.py b/coremltools/converters/mil/mil/ops/tests/testing_utils.py
index 96f0240ae..6831399ef 100644
--- a/coremltools/converters/mil/mil/ops/tests/testing_utils.py
+++ b/coremltools/converters/mil/mil/ops/tests/testing_utils.py
@@ -4,13 +4,13 @@
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
 import logging
-import os
-import pytest
 
-from coremltools.converters.mil.mil.types.symbolic import is_symbolic
+import coremltools as ct
 from coremltools.converters.mil.mil import Program, Function
+from coremltools.converters.mil.mil.types.symbolic import is_symbolic
 from coremltools.converters.mil.testing_utils import compare_backend, ct_convert
 
+
 UNK_VARIADIC = "*s_unk"
 UNK_SYM = "s_unk"
 
@@ -28,7 +28,8 @@ def run_compare_builder(
     rtol=1e-05,
     inputs=None,
     also_compare_shapes=False,
-    use_cpu_for_conversion=False,
+    converter=ct.convert,
+    minimum_deployment_target=None,
 ):
     """
     Inputs:
@@ -53,10 +54,12 @@ def run_compare_builder(
 
         - inputs: type of inputs (either None (defaults to tensor) or [ct.ImageType])
 
-        - use_cpu_for_conversion: bool
-            Argument which is passed as is to the unified converter API.
-            That is, "ct.convert(...., useCPUOnly=use_cpu_for_conversion)"
-            It forces the model to be loaded on the CPU context, post conversion.
+        - converter: function
+            Reference to convert function to be used.
+            Default: ct.convert
+
+        - minimum_deployment_target : coremltools.target enumeration (optional)
+            A member of the ``coremltools.target`` enum.
 
     Returns:
         The converted mlmodel
@@ -118,8 +121,19 @@ def run_compare_builder(
         if output_shape != expected_shape:
             raise ValueError(msg)
 
-    mlmodel = ct_convert(prog, source="milinternal", convert_to=backend, inputs=inputs,
-                         useCPUOnly=use_cpu_for_conversion)
+    if use_cpu_only:
+        compute_unit = ct.ComputeUnit.CPU_ONLY
+    else:
+        compute_unit = ct.ComputeUnit.ALL
+
+    mlmodel = ct_convert(prog,
+                         converter=converter,
+                         source="milinternal",
+                         convert_to=backend,
+                         inputs=inputs,
+                         compute_units=compute_unit,
+                         minimum_deployment_target=minimum_deployment_target
+    )
 
     if frontend_only:
         return mlmodel
@@ -139,7 +153,6 @@ def run_compare_builder(
         mlmodel=mlmodel,
         input_key_values=input_values,
         expected_outputs=expected_outputs,
-        use_cpu_only=use_cpu_only,
         atol=atol,
         rtol=rtol,
         also_compare_shapes=also_compare_shapes,
diff --git a/coremltools/converters/mil/mil/passes/__init__.py b/coremltools/converters/mil/mil/passes/__init__.py
index 7cba6372f..a1772f38e 100644
--- a/coremltools/converters/mil/mil/passes/__init__.py
+++ b/coremltools/converters/mil/mil/passes/__init__.py
@@ -31,6 +31,7 @@
     noop_elimination,
     onehot_matmul_to_gather,
     pad_conv_connect,
+    prelu_fusion,
     quantization_passes,
     rank0_expand_dims_swap,
     reduce_mean_fusion,
@@ -40,7 +41,8 @@
     replace_stack_reshape,
     sanitize_input_output_names,
     topological_reorder,
-    use_reflection_padding
+    use_reflection_padding,
+    update_output_dtypes,
 )
 
 from coremltools.converters.mil.experimental.passes import (
diff --git a/coremltools/converters/mil/mil/passes/apply_common_pass_pipeline.py b/coremltools/converters/mil/mil/passes/apply_common_pass_pipeline.py
index e8ca03a2d..6866121e9 100644
--- a/coremltools/converters/mil/mil/passes/apply_common_pass_pipeline.py
+++ b/coremltools/converters/mil/mil/passes/apply_common_pass_pipeline.py
@@ -36,6 +36,7 @@ def _apply(passes, name="common"):
         return
 
     common_passes = [
+        "common::update_output_dtypes",
         "common::cast_optimization",
         "common::const_elimination",
         "common::sanitize_input_output_names",
@@ -66,8 +67,10 @@ def _apply(passes, name="common"):
         "common::fuse_conv_batchnorm", # should come after fuse_elementwise_to_batchnorm
         "common::fuse_conv_scale", # Re-run the fuse conv scale pass after the conv and batch_norm are fused
         "common::fuse_conv_bias", # Re-run the fuse conv bias pass after the conv and batch_norm are fused
+        "common::fuse_conv_batchnorm", # In some cases, we need to run conv / batch_norm fusion again after the fuse_conv_scale and fuse_conv_bias passes
         "common::detect_concat_interleave",
         "common::concat_to_pixel_shuffle", # should come after detect_concat_interleave and after replace_stack_reshape
+        "common::fuse_prelu", # reduce_transpose pass should run before and after this pass (the one after will be run during the cleanup passes stage)
         # "remove_redundant_ops" pass should be applied towards the end, once other graph passes have done their optimizations.
         # For instance, it should come after passes such as "reduce_transpose" that can introduce redundant transposes
         # in the network (while reducing the total number of transposes), and after passes such as "fuse_layernorm_or_instancenorm"
diff --git a/coremltools/converters/mil/mil/passes/cast_optimization.py b/coremltools/converters/mil/mil/passes/cast_optimization.py
index 691363100..c8835537f 100644
--- a/coremltools/converters/mil/mil/passes/cast_optimization.py
+++ b/coremltools/converters/mil/mil/passes/cast_optimization.py
@@ -58,7 +58,14 @@ def apply(self, prog):
             while block_changed:
                 block_changed = _fuse_or_cancel_consecutive_casts_block(f, cached_vars)
 
-class Node(object):
+        # main function's output_vars are treated differently, which are not handled by the method
+        # above, "_fuse_or_cancel_consecutive_casts_block".
+        # For that, we invoke another method
+        block_changed = True
+        while block_changed:
+            block_changed = _cancel_consecutive_casts_connected_to_outputs(prog.functions["main"])
+
+class Node:
     def __init__(self, op_type, match_criterion=None):
         """
 
@@ -189,3 +196,47 @@ def _fuse_or_cancel_consecutive_casts_block(block, cached_vars):
             if block_changed:
                 return block_changed
     return block_changed
+
+
+def _cancel_consecutive_casts_connected_to_outputs(block):
+    """
+    Let's say the ops in the block have the following pattern
+    "some_op"---->{var1}---->"cast_op1"---->"cast_op2"--->{var2}
+    , where var2 is one of the outputs in block.outputs
+
+    If cast_op1 and cast_op2 can be cancelled, this means, var1 and var2 are duplicates
+    of each other. The program can then be updated to
+    "some_op"---->{var1}
+    where var1 replaces var2 in block.outputs
+    This also requires replacing var1's name with var2's so that the model output names remain unchanged
+
+    Outputs that do not match the pattern (including outputs that have no producing
+    op) are kept in block.outputs unchanged. Returns True if any output was replaced.
+    """
+    new_output_vars = []
+    block_changed = False
+    for output_var in block.outputs:
+        cast_op2 = output_var.op
+        # An output without a producing op must still be kept, not dropped.
+        if cast_op2 is None or cast_op2.op_type != "cast":
+            new_output_vars.append(output_var)
+            continue
+        cast_op1 = cast_op2.x.op
+        if cast_op1 is None or cast_op1.op_type != "cast":
+            new_output_vars.append(output_var)
+            continue
+        var1 = cast_op1.x
+        # var1 can stand in for the output only if it has the output's dtype.
+        if var1.op is None or var1.dtype != output_var.dtype:
+            new_output_vars.append(output_var)
+            continue
+        var1.set_name(output_var.name)
+        new_output_vars.append(var1)
+        block_changed = True
+
+    if block_changed:
+        block.set_outputs(new_output_vars)
+
+    return block_changed
+
+
diff --git a/coremltools/converters/mil/mil/passes/compression_passes.py b/coremltools/converters/mil/mil/passes/compression_passes.py
new file mode 100644
index 000000000..f6008b8c6
--- /dev/null
+++ b/coremltools/converters/mil/mil/passes/compression_passes.py
@@ -0,0 +1,474 @@
+#  Copyright (c) 2022, Apple Inc. All rights reserved.
+
+import numpy as np
+from enum import Enum
+import logging as _logging
+
+from coremltools.converters.mil.backend.mil.load import should_use_weight_file
+from coremltools.converters.mil.mil import Builder as mb
+from coremltools.converters.mil.mil.passes.quantization_passes import AbstractQuantizationPass
+from coremltools.models.neural_network.quantization_utils import _get_kmeans_lookup_table_and_weight
+from coremltools.converters.mil.mil.ops.defs.constexpr_ops import (
+    constexpr_affine_dequantize,
+    constexpr_lut_to_dense,
+    constexpr_sparse_to_dense,
+)
+
+class CompressionWeightMode(Enum):
+    @classmethod
+    def has_mode(cls, value):
+        if not isinstance(value, str):
+            return False
+        return value.upper() in cls._member_names_
+
+    @classmethod
+    def get_mode(cls):
+        return list(cls._member_names_)
+
+class SparseMode(CompressionWeightMode):
+    THRESHOLD_BASED = 1
+    PERCENTILE_BASED = 2
+
+class AffineQuantizeMode(CompressionWeightMode):
+    LINEAR = 1
+    LINEAR_SYMMETRIC = 2
+
+class PalettizeMode(CompressionWeightMode):
+    KMEANS = 1
+    UNIFORM = 2
+    UNIQUE = 3
+    CUSTOM = 4
+
+class SparseParams:
+    def __init__(self, nonzero_data=None, mask=None, shape=None):
+        self.nonzero_data = nonzero_data
+        self.mask = mask
+        self.shape = shape
+
+
+class WeightSparsifier(AbstractQuantizationPass):
+    """
+    This transform does the following, for each const op and if the "op_selector" return True:
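+    - Values with small absolute value are zeroed out: those <= threshold in "threshold_based" mode, or those at or below the target_percentile quantile of absolute values in "percentile_based" mode.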
+    - If fake_compression=False,  Zeroed-Out Value is encoded via constexpr_sparse_to_dense op
+    - If fake_compression=True,   Zeroed-Out Value is encoded via const op
+    - Old const is replaced by a new operation with zeroed-out value.
+    """
+
+    def __init__(self, mode="threshold_based", threshold=1e-3, target_percentile=1.0, fake_compression=False, op_selector=None):
+        super().__init__(op_selector=op_selector)
+        self.fake_compression = fake_compression
+        self.mode = mode
+        self.threshold = threshold
+        self.target_percentile = target_percentile
+
+        if not SparseMode.has_mode(self.mode):
+            msg = "Only mode {} supported for weight sparsification. Got mode {}.".format(SparseMode.get_mode(), self.mode)
+            raise ValueError(msg)
+
+        self.mode = SparseMode[self.mode.upper()]
+
+        if self.target_percentile < 0 or self.target_percentile > 1:
+            raise ValueError("Invalid value of target_percentile: {}. Needs to be in [0, 1]".format(self.target_percentile))
+
+        if self.threshold < 0:
+            raise ValueError("Invalid value of threshold: {}. Needs to be in [0, inf)".format(self.threshold))
+
+    def is_valid_op(self, op):
+        if op.op_type == "const" and should_use_weight_file(op.val.val):
+            return True
+        return False
+
+    @staticmethod
+    def compress(val, mode, target_percentile, threshold):
+        """
+        Zero out the small-magnitude entries of ``val`` (flattened) and return a
+        SparseParams with the surviving values, a packed bit mask of their positions
+        and the original shape. Raises ValueError if ``val`` is not a numpy value.
+        """
+        if not isinstance(val, (np.ndarray, np.generic)):
+            raise ValueError("Only numpy arrays are supported")
+
+        flat = val.flatten()
+
+        # Entries whose magnitude is at or below the cutoff are replaced by 0.
+        if mode == SparseMode.PERCENTILE_BASED:
+            cutoff = np.percentile(np.abs(flat), target_percentile * 100)
+            flat = np.where(np.abs(flat) <= cutoff, 0, flat)
+        elif mode == SparseMode.THRESHOLD_BASED:
+            flat = np.where(np.abs(flat) <= threshold, 0, flat)
+
+        keep = flat != 0
+        return SparseParams(
+            nonzero_data=flat[np.where(keep)],
+            mask=np.packbits(np.where(keep, 1, 0), bitorder="little"),
+            shape=val.shape,
+        )
+
+    @staticmethod
+    def decompress(params):
+        if not isinstance(params, SparseParams):
+            raise ValueError("Invalid type of params")
+        return constexpr_sparse_to_dense.decompress(params.nonzero_data, params.mask, params.shape)
+
+    def transform_op(self, op):
+        block = op.enclosing_block
+        sparse_params = self.compress(op.val.val, self.mode, self.target_percentile, self.threshold)
+
+        with block:
+            if not self.fake_compression:
+                new_var = mb.constexpr_sparse_to_dense(
+                    nonzero_data=sparse_params.nonzero_data,
+                    mask=sparse_params.mask,
+                    shape=np.uint32(sparse_params.shape),
+                    before_op=op,
+                    name=op.name + "_sparsified",
+                )
+            else:
+                decompressed_val = self.decompress(sparse_params)
+                new_var = mb.const(
+                    val=decompressed_val,
+                    before_op=op,
+                    name=op.name + "_fake_sparsified",
+                )
+
+            op.enclosing_block.replace_uses_of_var_after_op(
+                anchor_op=op,
+                old_var=op.outputs[0],
+                new_var=new_var,
+                no_check_var_types=True,
+            )
+
+            block.remove_ops([op])
+
+
+class LutParams:
+    def __init__(self, lut=None, indices=None, shape=None):
+        self.lut = lut
+        self.indices = indices
+        self.shape = shape
+
+
+class WeightPalettizer(AbstractQuantizationPass):
+    """
+    This transform does the following, for each const op and if the "op_selector" return True:
+    - A linear look up table with 2**(nbits) entries is created and value is represented via indexing into this look up table.
+    - If fake_compression=False,  compressed value is encoded via constexpr_lut_to_dense op
+    - If fake_compression=True,   compressed value is decompressed and then encoded via const op
+    - Old const op is replaced by a newly created operation.
+    """
+
+    def __init__(self, nbits, fake_compression=False, op_selector=None, mode="kmeans", lut_function=None):
+        """
+        Validate and store the palettization settings; raises ValueError when mode,
+        nbits and lut_function are not a supported combination.
+        """
+        super().__init__(op_selector=op_selector)
+        self.fake_compression = fake_compression
+        self.nbits = nbits
+        self.mode = mode
+        self.lut_function = lut_function
+
+        if not PalettizeMode.has_mode(self.mode):
+            msg = "Only mode {} supported for weight palettization. Got mode {}.".format(PalettizeMode.get_mode(), self.mode)
+            raise ValueError(msg)
+        self.mode = PalettizeMode[self.mode.upper()]
+
+        if nbits is None and self.mode in (PalettizeMode.KMEANS, PalettizeMode.UNIFORM):
+            msg = "nbits must be provided for mode {}".format(mode)
+            raise ValueError(msg)
+        if nbits is not None and self.mode in (PalettizeMode.UNIQUE, PalettizeMode.CUSTOM):
+            msg = "nbits must NOT be provided for mode {}".format(mode)
+            raise ValueError(msg)
+        if self.nbits is not None and self.nbits not in (1, 2, 4, 6, 8):
+            # Literal braces must be doubled, otherwise str.format raises KeyError here.
+            raise ValueError("Invalid value of nbits ({}) for palettization. Supported bits are {{1, 2, 4, 6, 8}}".format(nbits))
+        if (self.mode == PalettizeMode.CUSTOM) ^ (lut_function is not None):
+            msg = "lut_function must be None if mode is not custom, and that it cannot be None when the mode is custom."
+            raise ValueError(msg)
+        if self.mode == PalettizeMode.CUSTOM and not callable(self.lut_function):
+            msg = "A function object must be provided as lut_function. Got a lut_functions as type {}".format(type(self.lut_function))
+            raise ValueError(msg)
+
+    def is_valid_op(self, op):
+        if op.op_type == "const" and should_use_weight_file(op.val.val):
+            return True
+        return False
+
+    @staticmethod
+    def compress(val, nbits, mode, lut_function):
+
+        def compress_kmeans(val, nbits):
+            lut, indices = _get_kmeans_lookup_table_and_weight(nbits, val)
+            lut = lut.astype(val.dtype)
+            indices = indices.astype(np.uint8)
+            return lut, indices
+
+        def compress_uniform(val, nbits):
+            # Spread 2**nbits LUT entries evenly over [min, max] and snap each value to the nearest.
+            val = val.flatten()
+            val_min, val_max = np.amin(val), np.amax(val)
+            levels = (1 << nbits) - 1
+            # A constant tensor has zero range; divide by 1 so every index is 0 rather than NaN.
+            val_range = val_max - val_min if val_max != val_min else 1
+            indices = np.round(((val - val_min) / val_range) * levels).astype(np.uint8)
+            lut = np.array(range(0, 1 << nbits)) * ((val_max - val_min) / levels) + val_min
+            lut = lut.astype(val.dtype)
+            return lut, indices
+
+        def get_nbits_for_unique_mode(val):
+            val = val.flatten()
+            unique_vals = np.unique(val).tolist()
+            for nbits in (1, 2, 4, 6, 8):
+                if len(unique_vals) <= 1 << nbits:
+                    return nbits
+            msg = "weight value cannot be represented in an 8 bits palettization. Skipped."
+            _logging.warning(msg)
+            return None
+
+        def compress_unique(val, nbits):
+            """
+            Build a LUT from the distinct values of ``val``, zero-padded to 2**nbits
+            entries, and return it with the uint8 index of every element of ``val``.
+            Raises ValueError if ``val`` has more than 2**nbits distinct values.
+            """
+            val = val.flatten()
+            # return_inverse maps each element to its slot in the sorted unique values, so
+            # the zero padding at the end of the LUT can never be matched by mistake.
+            unique_vals, indices = np.unique(val, return_inverse=True)
+            if len(unique_vals) > 1 << nbits:
+                msg = "Too many unique values {} in the weight. Couldn't represented in {} bits.".format(len(unique_vals), nbits)
+                raise ValueError(msg)
+            lut = np.zeros(1 << nbits, dtype=val.dtype)
+            lut[:len(unique_vals)] = unique_vals
+            indices = indices.reshape(-1).astype(np.uint8)
+            return lut, indices
+
+        def pack_indices_into_bytes_array(indices, nbits):
+            bitarray = np.unpackbits(indices.reshape(-1, 1), bitorder="little", axis=-1)[
+                :, :nbits
+            ]
+            return np.packbits(bitarray.flatten(), bitorder="little")
+
+        def check_lut_parameters_are_valid(val, lut, indices):
+            if not isinstance(lut, np.ndarray) or not isinstance(indices, np.ndarray):
+                raise ValueError("LUT and indices must be type of numpy array.")
+
+            if indices.size != val.size:
+                msg = "Indices size ({}) mismatched with the original weight({}).".format(indices.size, val.size)
+                raise ValueError(msg)
+
+            if len(indices.shape) != 1 or indices.dtype != np.uint8:
+                msg = "Indices must be a numpy vector of type uint8. Found shape {} with type {}".format(indices.shape, indices.dtype)
+                raise ValueError(msg)
+
+            if lut.dtype != val.dtype:
+                msg = "Dtype mismatched between LUT ({}) and weight ({})".format(lut.dtype, val.dtype)
+                raise ValueError(msg)
+
+        if not isinstance(val, (np.ndarray, np.generic)):
+            raise ValueError("Only numpy arrays are supported")
+
+        if mode == PalettizeMode.KMEANS:
+            lut, indices = compress_kmeans(val, nbits)
+        elif mode == PalettizeMode.UNIFORM:
+            lut, indices = compress_uniform(val, nbits)
+        elif mode == PalettizeMode.UNIQUE:
+            nbits = get_nbits_for_unique_mode(val)
+            if nbits is None:
+                return None
+            lut, indices = compress_unique(val, nbits)
+        elif mode == PalettizeMode.CUSTOM:
+            lut, indices = lut_function(val)
+
+        check_lut_parameters_are_valid(val, lut, indices)
+
+        params = LutParams()
+        params.lut = lut
+        params.shape = val.shape
+        params.indices = pack_indices_into_bytes_array(indices, int(np.log2(lut.shape[0])))
+
+        return params
+
+    @staticmethod
+    def decompress(params):
+        if not isinstance(params, LutParams):
+            raise ValueError("Invalid type of params")
+        return constexpr_lut_to_dense.decompress(params.lut, params.indices, params.shape)
+
+    def transform_op(self, op):
+        block = op.enclosing_block
+        lut_params = self.compress(op.val.val, self.nbits, self.mode, self.lut_function)
+        
+        if lut_params is None:
+            return
+
+        with block:
+            if not self.fake_compression:
+                new_var = mb.constexpr_lut_to_dense(
+                    indices=lut_params.indices,
+                    lut=lut_params.lut,
+                    shape=np.uint32(lut_params.shape),
+                    before_op=op,
+                    name=op.name + "_palettized",
+                )
+            else:
+                decompressed_val = self.decompress(lut_params)
+                new_var = mb.const(
+                    val=decompressed_val,
+                    before_op=op,
+                    name=op.name + "_fake_palettized",
+                )
+
+            op.enclosing_block.replace_uses_of_var_after_op(
+                anchor_op=op,
+                old_var=op.outputs[0],
+                new_var=new_var,
+                no_check_var_types=True,
+            )
+
+            block.remove_ops([op])
+
+
+class AffineQuantParams:
+    def __init__(self, quantized_data=None, zero_point=None, scale=None, axis=None):
+        self.quantized_data = quantized_data
+        self.zero_point = zero_point
+        self.scale = scale
+        self.axis = axis
+
+
+class WeightAffineQuantizer(AbstractQuantizationPass):
+    """
+    This transform does the following, for each const op and if the "op_selector" return True:
+    - Values are linearly quantized into unsigned 8-bits.
+    - If fake_compression=False,  compressed value is encoded via constexpr_affine_dequantize op
+    - If fake_compression=True,   compressed value is decompressed and then encoded via const op
+    - Old const is replaced by a newly created operation.
+    """
+
+    def __init__(self, fake_compression=False, op_selector=None, mode="linear"):
+        super().__init__(op_selector=op_selector)
+        self.fake_compression = fake_compression
+        self.mode = mode
+
+        if not AffineQuantizeMode.has_mode(self.mode):
+            msg = "Only mode {} supported for weight affine quantization. Got mode {}."\
+            .format(AffineQuantizeMode.get_mode(), self.mode)
+            raise ValueError(msg)
+        self.mode = AffineQuantizeMode[self.mode.upper()]
+
+    def is_valid_op(self, op):
+        if op.op_type == "const" and should_use_weight_file(op.val.val):
+            return True
+        return False
+
+    @staticmethod
+    def _get_axis(op):
+        axis = 0
+        var = op.outputs[0]
+        if len(var.child_ops) == 1 and var.child_ops[0].op_type == "conv_transpose":
+            axis = 1
+        return axis
+
+    @staticmethod
+    def compress(val, axis, mode):
+        """
+        Quantize ``val`` to uint8 with one scale and zero point per slice along ``axis``.
+        Returns an AffineQuantParams; raises ValueError if ``val`` is not a numpy value.
+        """
+        if not isinstance(val, (np.ndarray, np.generic)):
+            raise ValueError("Only numpy arrays are supported")
+
+        axes = tuple([i for i in range(len(val.shape)) if i != axis])
+        val_min = np.amin(val, axis=axes, keepdims=True)
+        val_max = np.amax(val, axis=axes, keepdims=True)
+        val_range = 255
+        if mode == AffineQuantizeMode.LINEAR_SYMMETRIC:
+            # For the linear_symmetric mode, the range is symmetrical to 0
+            max_abs = np.maximum(np.abs(val_min), np.abs(val_max))
+            val_min, val_max, val_range = -max_abs, max_abs, 254
+        else:
+            # The range must contain 0, or the zero point leaves [0, 255] and wraps in uint8.
+            val_min, val_max = np.minimum(0.0, val_min), np.maximum(0.0, val_max)
+        # An all-zero slice has an empty range; divide by 1 there so it quantizes to 0, not NaN.
+        span = val_max - val_min
+        span = np.where(span == 0, np.ones_like(span), span)
+
+        scale = (span / val_range).astype(val.dtype).squeeze()
+        quantized_data = np.round(((val - val_min) / span) * val_range).astype(np.uint8)
+        zero_point = np.round((-val_min / span) * val_range).astype(np.uint8).squeeze()
+        return AffineQuantParams(quantized_data, zero_point, scale, axis)
+
+    @staticmethod
+    def decompress(params):
+        if not isinstance(params, AffineQuantParams):
+            raise ValueError("Invalid type of params")
+        return constexpr_affine_dequantize.decompress(params.quantized_data, params.zero_point, params.scale, params.axis)
+
+    def transform_op(self, op):
+        block = op.enclosing_block
+        quant_params = self.compress(op.val.val, self._get_axis(op), self.mode)
+
+        with block:
+            if not self.fake_compression:
+                new_var = mb.constexpr_affine_dequantize(
+                    quantized_data=quant_params.quantized_data,
+                    zero_point=quant_params.zero_point,
+                    scale=quant_params.scale,
+                    axis=quant_params.axis,
+                    before_op=op,
+                    name=op.name + "_affine_quantized",
+                )
+            else:
+                decompressed_val = self.decompress(quant_params)
+                new_var = mb.const(
+                    val=decompressed_val,
+                    before_op=op,
+                    name=op.name + "_fake_affine_quantized",
+                )
+
+            op.enclosing_block.replace_uses_of_var_after_op(
+                anchor_op=op,
+                old_var=op.outputs[0],
+                new_var=new_var,
+                no_check_var_types=True,
+            )
+
+            block.remove_ops([op])
+
+
+class WeightDecompressor(AbstractQuantizationPass):
+    """
+    This graph pass transforms the constexpr ops back into mb.const op.
+    constexpr ops includes:
+    (1) constexpr_affine_dequantize
+    (2) constexpr_lut_to_dense
+    (3) constexpr_sparse_to_dense
+    """
+
+    def __init__(self, op_selector):
+        super().__init__(op_selector=op_selector)
+
+    def is_valid_op(self, op):
+        return op.op_type in ["constexpr_affine_dequantize", "constexpr_lut_to_dense", "constexpr_sparse_to_dense"]
+
+    def transform_op(self, op):
+        block = op.enclosing_block
+        
+        with block:
+            decompressed_val = op.get_decompressed_value()
+            new_var = mb.const(
+                val=decompressed_val,
+                before_op=op,
+                name=op.name,
+            )
+
+            op.enclosing_block.replace_uses_of_var_after_op(
+                anchor_op=op,
+                old_var=op.outputs[0],
+                new_var=new_var,
+                no_check_var_types=True,
+            )
+
+            block.remove_ops([op])
diff --git a/coremltools/converters/mil/mil/passes/const_elimination.py b/coremltools/converters/mil/mil/passes/const_elimination.py
index f81b566d7..ebb3d993b 100644
--- a/coremltools/converters/mil/mil/passes/const_elimination.py
+++ b/coremltools/converters/mil/mil/passes/const_elimination.py
@@ -55,7 +55,7 @@ def _const_elimination_block(self, block):
                         anchor_op=op, old_var=o, new_var=res
                     )
                     # rename the const output
-                    o.set_name(o.name+'_ignored')
+                    o.set_name(o.name + '_ignored')
                 else:
                     all_outputs_are_const = False
 
diff --git a/coremltools/converters/mil/mil/passes/conv_batchnorm_fusion.py b/coremltools/converters/mil/mil/passes/conv_batchnorm_fusion.py
index 29bbfcbe1..3481c9ba1 100644
--- a/coremltools/converters/mil/mil/passes/conv_batchnorm_fusion.py
+++ b/coremltools/converters/mil/mil/passes/conv_batchnorm_fusion.py
@@ -1,15 +1,14 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2021, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import numpy as np
 
-from coremltools.converters.mil.mil.passes.pass_registry import register_pass
-from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
 from coremltools.converters.mil.mil import Builder as mb
-import numpy as np
+from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
+from coremltools.converters.mil.mil.passes.pass_registry import register_pass
+
 
 def _try_to_transform(conv_op, bn_op, block):
 
diff --git a/coremltools/converters/mil/mil/passes/conv_bias_fusion.py b/coremltools/converters/mil/mil/passes/conv_bias_fusion.py
index fe6443097..3ed8b1927 100644
--- a/coremltools/converters/mil/mil/passes/conv_bias_fusion.py
+++ b/coremltools/converters/mil/mil/passes/conv_bias_fusion.py
@@ -1,18 +1,16 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import logging
+import numpy as np
 
-from coremltools.converters.mil.mil.passes.pass_registry import register_pass
-from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
-from coremltools.converters.mil.mil import Builder as mb
-from coremltools.converters.mil.mil import types
 from .helper import _check_child_op_type
-import numpy as np
-import logging
+from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
+from coremltools.converters.mil.mil.passes.pass_registry import register_pass
+from coremltools.converters.mil.mil import Builder as mb, types
+
 
 child_op_types = ["add", "sub"]
 
@@ -325,4 +323,4 @@ def _fuse_conv_bias_block(self, block):
                 if fusion_status:
                     return fusion_status
 
-        return fusion_status
\ No newline at end of file
+        return fusion_status
diff --git a/coremltools/converters/mil/mil/passes/conv_scale_fusion.py b/coremltools/converters/mil/mil/passes/conv_scale_fusion.py
index f71a61f85..154534374 100644
--- a/coremltools/converters/mil/mil/passes/conv_scale_fusion.py
+++ b/coremltools/converters/mil/mil/passes/conv_scale_fusion.py
@@ -5,8 +5,8 @@
 
 import numpy as np
 
-from coremltools.converters.mil.mil.passes.pass_registry import register_pass
 from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
+from coremltools.converters.mil.mil.passes.pass_registry import register_pass
 from coremltools.converters.mil.mil import Builder as mb
 
 
@@ -68,7 +68,7 @@ def _try_to_transform(conv_op, scale_op, block):
 
     # transform the scale to 1./scale for the real_div case
     if scale_op.op_type == "real_div":
-        scale = 1./scale
+        scale = 1. / scale
 
     # get the type of the conv weight
     conv_weight_type = conv_weight.dtype
diff --git a/coremltools/converters/mil/mil/passes/elementwise_batchnorm_fusion.py b/coremltools/converters/mil/mil/passes/elementwise_batchnorm_fusion.py
index bccacdc43..c794718d4 100644
--- a/coremltools/converters/mil/mil/passes/elementwise_batchnorm_fusion.py
+++ b/coremltools/converters/mil/mil/passes/elementwise_batchnorm_fusion.py
@@ -1,15 +1,14 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import numpy as np
 
-from coremltools.converters.mil.mil.passes.pass_registry import register_pass
-from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
 from coremltools.converters.mil.mil import Builder as mb
-import numpy as np
+from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
+from coremltools.converters.mil.mil.passes.pass_registry import register_pass
+
 
 def _match_pattern(op):
     if op.outputs[0] in op.enclosing_block.outputs:
diff --git a/coremltools/converters/mil/mil/passes/gelu_tanh_approximation_fusion.py b/coremltools/converters/mil/mil/passes/gelu_tanh_approximation_fusion.py
index 21f94e217..c70838549 100644
--- a/coremltools/converters/mil/mil/passes/gelu_tanh_approximation_fusion.py
+++ b/coremltools/converters/mil/mil/passes/gelu_tanh_approximation_fusion.py
@@ -18,7 +18,7 @@ def is_var_constraint_satisifed(pattern):
 
     passed = passed and (
                         _check_var_scalar_value(pattern.mul_1.y, 0.044715) or
-                        _check_var_scalar_value(pattern.mul_1.x,  0.044715)
+                        _check_var_scalar_value(pattern.mul_1.x, 0.044715)
                         )
 
     passed = passed and (
diff --git a/coremltools/converters/mil/mil/passes/graph_pass.py b/coremltools/converters/mil/mil/passes/graph_pass.py
index b5f8727dd..7c64f59e8 100644
--- a/coremltools/converters/mil/mil/passes/graph_pass.py
+++ b/coremltools/converters/mil/mil/passes/graph_pass.py
@@ -3,15 +3,32 @@
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
-class AbstractGraphPass():
+from abc import ABC, abstractmethod
+from coremltools.converters.mil._deployment_compatibility import AvailableTarget as target
 
-	def __call__(self, prog):
-		self.apply(prog)
+class AbstractGraphPass(ABC):
 
-	def __str__(self):
-		return type(self).__name__
+    def __init__(self, minimun_deployment_target=target.iOS13):
+        self._minimum_deployment_target = minimun_deployment_target
 
-	def apply(self, prog):
-		raise NotImplementedError(
-			'Graph pass transformation not implemented for "{}".'.format(self)
-		)
\ No newline at end of file
+    def __call__(self, prog):
+        if not prog.skip_all_passes:
+            self.apply(prog)
+
+    def __str__(self):
+        return type(self).__name__
+
+    @property
+    def minimun_deployment_target(self):
+        return self._minimum_deployment_target
+
+    @minimun_deployment_target.setter
+    def minimun_deployment_target(self, t):
+        if not isinstance(t, target):
+            raise TypeError("minimun_deployment_target must be an enumeration from Enum class AvailableTarget")
+        self._minimum_deployment_target = t
+
+
+    @abstractmethod
+    def apply(self, prog):
+        pass
diff --git a/coremltools/converters/mil/mil/passes/helper.py b/coremltools/converters/mil/mil/passes/helper.py
index ba11cbeb2..389f0e6fc 100644
--- a/coremltools/converters/mil/mil/passes/helper.py
+++ b/coremltools/converters/mil/mil/passes/helper.py
@@ -121,4 +121,4 @@ def _are_vars_identical(var1, var2):
 
     assert len(op1.blocks) == 0, "this method does not handle ops that have blocks in it"
     assert len(op2.blocks) == 0, "this method does not handle ops that have blocks in it"
-    return True
\ No newline at end of file
+    return True
diff --git a/coremltools/converters/mil/mil/passes/linear_bias_fusion.py b/coremltools/converters/mil/mil/passes/linear_bias_fusion.py
index 5d049b3b7..7a8fe53d5 100644
--- a/coremltools/converters/mil/mil/passes/linear_bias_fusion.py
+++ b/coremltools/converters/mil/mil/passes/linear_bias_fusion.py
@@ -1,15 +1,15 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2021, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
-from coremltools.converters.mil.mil.passes.pass_registry import register_pass
-from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
-from coremltools.converters.mil.mil import Builder as mb
 import numpy as np
 
+from coremltools.converters.mil.mil import Builder as mb
+from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
+from coremltools.converters.mil.mil.passes.pass_registry import register_pass
+
+
 def _try_to_transform(linear_op, add_or_sub_op, block):
 
     if add_or_sub_op.x.val is None and add_or_sub_op.y.val is None:
diff --git a/coremltools/converters/mil/mil/passes/matmul_weight_bias_fusion.py b/coremltools/converters/mil/mil/passes/matmul_weight_bias_fusion.py
index 6e4fbed72..1470bd6c2 100644
--- a/coremltools/converters/mil/mil/passes/matmul_weight_bias_fusion.py
+++ b/coremltools/converters/mil/mil/passes/matmul_weight_bias_fusion.py
@@ -1,15 +1,14 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
-
 import numpy as np
-from coremltools.converters.mil.mil.passes.pass_registry import register_pass
-from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
+
 from coremltools.converters.mil.mil import Builder as mb
+from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
+from coremltools.converters.mil.mil.passes.pass_registry import register_pass
+
 
 child_op_types = ["add", "sub"]
 
diff --git a/coremltools/converters/mil/mil/passes/merge_consecutive_paddings.py b/coremltools/converters/mil/mil/passes/merge_consecutive_paddings.py
index 52546487a..3c3320e7f 100644
--- a/coremltools/converters/mil/mil/passes/merge_consecutive_paddings.py
+++ b/coremltools/converters/mil/mil/passes/merge_consecutive_paddings.py
@@ -39,7 +39,7 @@ def _match_pattern(block, padding_op):
             return False
     else:
         # if the padding is not constant, then we can't merge if both pads affected the same side of the image
-        if any(i != 0 and j != 0 for (i,j) in zip(first_pad, child_pad)):
+        if any(i != 0 and j != 0 for (i, j) in zip(first_pad, child_pad)):
             return False
 
     return _replace_ops(block, padding_op, child_padding_op, final_pad)
@@ -68,6 +68,7 @@ def _merge_padding_block(block):
             return True
     return False
 
+
 @register_pass(namespace="common")
 class merge_consecutive_paddings(AbstractGraphPass):
     """
diff --git a/coremltools/converters/mil/mil/passes/name_sanitization_utils.py b/coremltools/converters/mil/mil/passes/name_sanitization_utils.py
index 12498feb6..108a51c48 100644
--- a/coremltools/converters/mil/mil/passes/name_sanitization_utils.py
+++ b/coremltools/converters/mil/mil/passes/name_sanitization_utils.py
@@ -10,7 +10,7 @@
 from coremltools.converters.mil.mil import Function
 
 
-class NameSanitizer(object):
+class NameSanitizer:
 
     def __init__(self, prefix=None):
         # to hold all names encountered,
diff --git a/coremltools/converters/mil/mil/passes/pad_conv_connect.py b/coremltools/converters/mil/mil/passes/pad_conv_connect.py
index 83d5b731f..867e3e044 100644
--- a/coremltools/converters/mil/mil/passes/pad_conv_connect.py
+++ b/coremltools/converters/mil/mil/passes/pad_conv_connect.py
@@ -1,16 +1,15 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import copy
+import numpy as np
 
-from coremltools.converters.mil.mil.passes.pass_registry import register_pass
-from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
 from coremltools.converters.mil.mil import Builder as mb
-import numpy as np
-import copy
+from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
+from coremltools.converters.mil.mil.passes.pass_registry import register_pass
+
 
 def _match_pattern(op):
     ret = set([])
diff --git a/coremltools/converters/mil/mil/passes/prelu_fusion.py b/coremltools/converters/mil/mil/passes/prelu_fusion.py
new file mode 100644
index 000000000..9986352d3
--- /dev/null
+++ b/coremltools/converters/mil/mil/passes/prelu_fusion.py
@@ -0,0 +1,198 @@
+#  Copyright (c) 2020, Apple Inc. All rights reserved.
+#
+#  Use of this source code is governed by a BSD-3-clause license that can be
+#  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+import numpy as np
+
+from coremltools.converters.mil import Builder as mb
+from coremltools.converters.mil.experimental.passes.generic_pass_infrastructure import fuse_all_blocks
+from coremltools.converters.mil.mil import get_new_symbol
+from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
+from coremltools.converters.mil.mil.passes.helper import _check_var_scalar_value
+from coremltools.converters.mil.mil.passes.pass_registry import register_pass
+
+
+def _prelu_pattern(x):
+    # MIL operation takes named inputs (instead of positional inputs).
+    # Here `name` argument is MANDATORY.
+    neg = mb.mul(x=x, y=-1, name="neg")
+    relu1 = mb.relu(x=neg, name="relu1")
+    # use any constant here to match, rank and shape will be verified in "is_var_constraint_satisifed" method
+    mul = mb.mul(x=relu1, y=np.random.rand(2, 2, 2, 2), name="alpha_mul")
+    relu2 = mb.relu(x=x, name="relu2")
+    out = mb.add(x=relu2, y=mul, name="out_op")
+    return out
+
+
+
+class Pattern1:
+    @staticmethod
+    def is_var_constraint_satisifed(pattern):
+        # input must be rank 4
+        if pattern.root_var.rank != 4:
+            return False
+        # output must be rank 4
+        if pattern.out_op.outputs[0].rank != 4:
+            return False
+        if not (_check_var_scalar_value(pattern.neg.y, -1) or _check_var_scalar_value(pattern.neg.x, -1)):
+            return False
+        if pattern.alpha_mul.x.val is not None:
+            alpha = pattern.alpha_mul.x.val
+        elif pattern.alpha_mul.y.val is not None:
+            alpha = pattern.alpha_mul.y.val
+        else:
+            return False
+        # alpha must be of shape (1, C, 1, 1) or (C, 1, 1)
+        if len(alpha.shape) not in (3, 4):
+            return False
+        if alpha.size != alpha.shape[-3]:
+            return False
+
+        return True
+
+    @staticmethod
+    def transform_pattern(pattern):
+        # remove all the ops, and replace with a prelu op
+        out_var = pattern.out_op.outputs[0]
+        if pattern.alpha_mul.x.val is not None:
+            alpha = pattern.alpha_mul.x.val
+        else:
+            alpha = pattern.alpha_mul.y.val
+
+        alpha_vector = -1 * alpha.flatten()
+        x = mb.prelu(x=pattern.root_var, alpha=alpha_vector, name=out_var.name, before_op=pattern.out_op)
+        pattern.out_op.enclosing_block.replace_uses_of_var_after_op(
+            anchor_op=pattern.out_op, old_var=out_var, new_var=x
+        )
+        # Remove all the ops at once
+        pattern.block.remove_ops(pattern.op_list())
+
+    @staticmethod
+    def get_prelu_pattern():
+        """
+        y = a * relu(-1 * x) + relu(x)
+
+        when x is rank 4, and "a" is of shape (1, C, 1, 1) or (C, 1, 1),
+        this is equivalent to prelu with alpha = -a.flatten().
+        """
+        @mb.program(input_specs=[mb.TensorSpec(shape=([get_new_symbol(), get_new_symbol(),
+                                                       get_new_symbol(), get_new_symbol()])), ])
+        def prelu_pattern(x):
+            return _prelu_pattern(x)
+
+        return prelu_pattern
+
+
+
+class Pattern2:
+    @staticmethod
+    def is_var_constraint_satisifed(pattern):
+        perm = pattern.transpose.perm.val
+        if not np.array_equal(perm, np.array([0,2,3,1])):
+            return False
+        # output must be rank 4
+        if pattern.out_op.outputs[0].rank != 4:
+            return False
+        if not (_check_var_scalar_value(pattern.neg.y, -1) or _check_var_scalar_value(pattern.neg.x, -1)):
+            return False
+        if pattern.alpha_mul.x.val is not None:
+            alpha = pattern.alpha_mul.x.val
+        elif pattern.alpha_mul.y.val is not None:
+            alpha = pattern.alpha_mul.y.val
+        else:
+            return False
+        # alpha must be of shape (C,) or (1,C) or (1,1,C) or (1,1,1,C)
+        if alpha.size != alpha.shape[-1]:
+            return False
+
+        return True
+
+    @staticmethod
+    def transform_pattern(pattern):
+        # remove all the ops, and replace with a prelu op + transpose op
+        perm = pattern.transpose.perm.val
+        out_var = pattern.out_op.outputs[0]
+        if pattern.alpha_mul.x.val is not None:
+            alpha = pattern.alpha_mul.x.val
+        else:
+            alpha = pattern.alpha_mul.y.val
+
+        alpha_vector = -1 * alpha.flatten()
+        x = mb.prelu(x=pattern.root_var, alpha=alpha_vector, before_op=pattern.out_op)
+        x = mb.transpose(x=x, perm=perm, name=out_var.name, before_op=pattern.out_op)
+        pattern.out_op.enclosing_block.replace_uses_of_var_after_op(
+            anchor_op=pattern.out_op, old_var=out_var, new_var=x
+        )
+        # Remove all the ops at once
+        pattern.block.remove_ops(pattern.op_list())
+
+    @staticmethod
+    def get_prelu_pattern():
+        """
+        x1 = transpose(perm=(0,2,3,1))(x)
+        y = a * relu(-1 * x1) + relu(x1)
+
+        when x is rank 4, and "a" is of shape (C,) or (1, C) or (1,1,C) or (1,1,1,C),
+        this is equivalent to prelu with alpha = -a.flatten(), followed by a transpose
+        with perm (0,2,3,1)
+        """
+        @mb.program(input_specs=[mb.TensorSpec(shape=([get_new_symbol(), get_new_symbol(),
+                                                       get_new_symbol(), get_new_symbol()])), ])
+        def prelu_pattern(x):
+            # perm value can be anything, it will be checked in "is_var_constraint_satisifed" method
+            x = mb.transpose(x=x, perm=[0,1,2,3], name="transpose")
+            return _prelu_pattern(x)
+
+        return prelu_pattern
+
+
+@register_pass(namespace="common")
+class fuse_prelu(AbstractGraphPass):
+    """
+    Detect the following patterns that can be mapped to a prelu op.
+    Essentially prelu op can be broken down into the following ops: y = a * relu(-1 * x) + relu(x)
+
+    Pattern 1:
+
+
+                      | ------------> relu --------------------|
+                      |                                        V
+       x (BCHW) ------|                                       add -----> y (BCHW)
+                      |                                        ^
+                      --------> mul -------> relu -----> mul---|
+                                ^                         ^
+                                |                         |
+                            Const(val=-1)               Const(name=a, shape=(C,1,1) or (1,C,1,1))
+
+    This will be mapped to:
+        x (BCHW) ------> prelu(alpha=-a, shape=(C,)) --------> y (BCHW)
+
+
+    Pattern 2:
+
+                                      | ------------> relu --------------------|
+                                      |                                        V
+      x (BCHW) -->transpose(BHWC)---->|                                       add -----> y (BHWC)
+                                      |                                        ^
+                                      --------> mul -------> relu -----> mul---|
+                                                 ^                        ^
+                                                 |                        |
+                                        Const(val=-1)    Const(shape=(C,) or (1,C) or (1,1,C) or (1,1,1,C))
+
+    This will be mapped to:
+        x (BCHW) ------> prelu ---------> transpose ------> y (BHWC)
+    """
+
+
+    def apply(self, prog):
+        for pattern in (Pattern1, Pattern2):
+            fuse_all_blocks(ops_arrangement=pattern.get_prelu_pattern(),
+                            var_constraints=pattern.is_var_constraint_satisifed,
+                            transform_pattern=pattern.transform_pattern,
+                            prog=prog)
+
+
+
+
+
diff --git a/coremltools/converters/mil/mil/passes/quantization_passes.py b/coremltools/converters/mil/mil/passes/quantization_passes.py
index 69071c0f6..057a16322 100644
--- a/coremltools/converters/mil/mil/passes/quantization_passes.py
+++ b/coremltools/converters/mil/mil/passes/quantization_passes.py
@@ -16,6 +16,7 @@ class ComputePrecision(_Enum):
     FLOAT16 = "float16"
     FLOAT32 = "float32"
 
+
 type_eps = {}
 type_min = {}
 type_negmin = {}
diff --git a/coremltools/converters/mil/mil/passes/reduce_transposes.py b/coremltools/converters/mil/mil/passes/reduce_transposes.py
index 747b3aa09..6e60c8dcd 100644
--- a/coremltools/converters/mil/mil/passes/reduce_transposes.py
+++ b/coremltools/converters/mil/mil/passes/reduce_transposes.py
@@ -1,20 +1,20 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+from collections import defaultdict
+import copy
+import logging
+
+import numpy as np
 
-from coremltools.converters.mil.mil.passes.pass_registry import register_pass
-from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
 from coremltools.converters.mil.mil import Builder as mb
+from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
+from coremltools.converters.mil.mil.passes.pass_registry import register_pass
 from coremltools.converters.mil.mil.types.symbolic import any_symbolic
 from coremltools.converters.mil.mil.var import Var
-import logging
-import numpy as np
-import copy
-from collections import defaultdict
+
 
 DEBUG = False  # set to true to plot the block before and after the transformation
 
@@ -243,7 +243,7 @@ def class_wrapper(op_update_cls):
     return class_wrapper
 
 
-class transform_axis_update_ops(object):
+class transform_axis_update_ops:
     """
     Parent class for every axis update op's class
 
@@ -672,7 +672,7 @@ def update(self):
             )
 
 
-class HypotheticalValue(object):
+class HypotheticalValue:
     # A hypothetical value.
     # Simply wraps a Var.
     # Actual Var it wraps doesn't really matter, its mainly for debugging.
@@ -682,7 +682,7 @@ def __init__(self, var=None):
         self.value = var  # type : Var
 
 
-class LazyTransposeHypotheticalValue(object):
+class LazyTransposeHypotheticalValue:
     # a hypothetical value that represents a transpose op on top of a hypothetical value,
     # or a collection of transpose_ops, which have the same "perm" parameter
 
@@ -712,7 +712,7 @@ def __init__(self, hypothetical_value, transpose_ops, perm):
         self.transpose_ops = transpose_ops  # type : Set(op)
 
 
-class TransposeOptimization(object):
+class TransposeOptimization:
     def __init__(self, block):
         self.block = block
 
diff --git a/coremltools/converters/mil/mil/passes/test_cast_optimization.py b/coremltools/converters/mil/mil/passes/test_cast_optimization.py
index 5f08c2bd9..8b2e330df 100644
--- a/coremltools/converters/mil/mil/passes/test_cast_optimization.py
+++ b/coremltools/converters/mil/mil/passes/test_cast_optimization.py
@@ -6,7 +6,7 @@
 import numpy as np
 import unittest
 
-from coremltools.converters.mil.mil import Builder as mb
+from coremltools.converters.mil.mil import Builder as mb, types as types
 from coremltools.converters.mil.testing_utils import (
     assert_model_is_valid,
     get_op_types_in_program,
@@ -180,7 +180,7 @@ def prog(x):
 
         assert_model_is_valid(
             prog,
-            {"x": (10,20)},
+            {"x": (10, 20)},
             expected_output_shapes={
                 block.outputs[0].name: (10, 20),
                 block.outputs[1].name: (10, 20),
@@ -364,3 +364,43 @@ def prog(x):
                 block.outputs[2].name: (10, 20),
             },
         )
+
+    """
+    Input graph:
+    input(dtype="fp16")---->relu----->relu
+                                      |
+                              --------|
+                              |
+                              V 
+                             cast(dtype="fp32")---->cast(dtype="fp16")
+                                                      |
+                                ----------------------|
+                                |
+                                V 
+                             cast(dtype="fp32")---->cast(dtype="fp16")---->output(dtype="fp16")
+    
+    Output graph:
+    input(dtype="fp16")---->relu----->relu---->output(dtype="fp16")
+    """
+    def test_two_casts_at_the_end(self):
+        @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20), dtype=types.fp16)])
+        def prog(x):
+            x = mb.relu(x=x)
+            x = mb.relu(x=x)
+            x = mb.cast(x=x, dtype="fp32")
+            x = mb.cast(x=x, dtype="fp16")
+            x = mb.cast(x=x, dtype="fp32")
+            x = mb.cast(x=x, dtype="fp16", name="original_output_name")
+            return x
+
+        self.assertEqual(get_op_types_in_program(prog),
+                         ['relu', 'relu', 'cast', 'cast', 'cast', 'cast'])
+        apply_pass_and_basic_check(prog, "common::cast_optimization")
+        _, prev_block, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination")
+        self.assertEqual(get_op_types_in_program(prog),
+                         ['relu', 'relu'])
+        self.assertEqual(prev_block.outputs[0].name, "original_output_name")
+        self.assertEqual(block.outputs[0].name, "original_output_name")
+        self.assertEqual(block.outputs[0].dtype, types.fp16)
+
+
diff --git a/coremltools/converters/mil/mil/passes/test_concat_to_pixel_shuffle.py b/coremltools/converters/mil/mil/passes/test_concat_to_pixel_shuffle.py
index cbed7c208..8c3640499 100644
--- a/coremltools/converters/mil/mil/passes/test_concat_to_pixel_shuffle.py
+++ b/coremltools/converters/mil/mil/passes/test_concat_to_pixel_shuffle.py
@@ -3,6 +3,11 @@
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import unittest
+
+import numpy as np
+
+import coremltools as ct
 from coremltools._deps import _IS_MACOS
 from coremltools.converters.mil.mil import Builder as mb
 from coremltools.converters.mil.testing_utils import (
@@ -10,11 +15,7 @@
     get_op_types_in_program,
     apply_pass_and_basic_check,
 )
-from coremltools.converters.mil.testing_reqs import ct
 
-import unittest
-
-import numpy as np
 
 np.random.seed(1984)
 
@@ -67,7 +68,11 @@ def prog(x1, x2, x3, x4):
             expected_output_shapes={block.outputs[0].name: (1, 2, 6, 8)},
         )
 
-        mlmodel = ct.convert(prog, source="milinternal", convert_to="neuralnetwork")
+        mlmodel = ct.convert(prog,
+                             source="milinternal",
+                             convert_to="neuralnetwork",
+                             compute_units=ct.ComputeUnit.CPU_ONLY
+        )
 
         if not _IS_MACOS:
             # Can not get predictions unless on macOS.
@@ -85,7 +90,7 @@ def prog(x1, x2, x3, x4):
         cd = np.reshape(np.stack((input_dict["x3"], input_dict["x4"]), axis=3), newshape=[1, 2, 6, 4])
         old_prediction = np.reshape(np.stack((ab, cd), axis=4), newshape=[1, 2, 6, 8])        
 
-        prediction = mlmodel.predict(input_dict, useCPUOnly=True)
+        prediction = mlmodel.predict(input_dict)
         np.testing.assert_allclose(old_prediction, prediction[output_name], atol=1e-04, rtol=1e-05)
 
     def test_nested(self):
@@ -141,10 +146,14 @@ def prog(x1, x2, x3, x4, x5, x6, x7, x8):
 
         old_prediction = np.concatenate((x, y), axis=1)
 
-        mlmodel = ct.convert(prog, source="milinternal", convert_to="neuralnetwork")
+        mlmodel = ct.convert(prog,
+                             source="milinternal",
+                             convert_to="neuralnetwork",
+                             compute_units=ct.ComputeUnit.CPU_ONLY
+        )
 
         if _IS_MACOS:
-            prediction = mlmodel.predict(input_dict, useCPUOnly=True)
+            prediction = mlmodel.predict(input_dict)
             np.testing.assert_allclose(old_prediction, prediction[output_name], atol=1e-04, rtol=1e-05)
 
     def test_failure_0(self):
diff --git a/coremltools/converters/mil/mil/passes/test_conv_scale_fusion.py b/coremltools/converters/mil/mil/passes/test_conv_scale_fusion.py
index 7ca1df6ec..c910105e6 100644
--- a/coremltools/converters/mil/mil/passes/test_conv_scale_fusion.py
+++ b/coremltools/converters/mil/mil/passes/test_conv_scale_fusion.py
@@ -205,9 +205,6 @@ def prog(x):
         assert get_op_types_in_program(prog) == ["conv"]
 
         # validate graph pass
-        input_dict = {
-            "x": np.random.rand(*input_shape),
-        }
         output_shape = (2, Cout, 19) if rank == 3 else (2, Cout, 19, 22)
         assert_model_is_valid(
             prog,
diff --git a/coremltools/converters/mil/mil/passes/test_elementwise_fusions.py b/coremltools/converters/mil/mil/passes/test_elementwise_fusions.py
index e717cb672..32f8d2cef 100644
--- a/coremltools/converters/mil/mil/passes/test_elementwise_fusions.py
+++ b/coremltools/converters/mil/mil/passes/test_elementwise_fusions.py
@@ -3,17 +3,18 @@
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import itertools
+
+import numpy as np
+import pytest
+
 from coremltools.converters.mil.mil import Builder as mb
 from coremltools.converters.mil.testing_utils import (
-    assert_op_count_match,
     assert_model_is_valid,
     get_op_types_in_program,
     apply_pass_and_basic_check,
 )
 
-import pytest
-import numpy as np
-import itertools
 
 np.random.seed(1984)
 
diff --git a/coremltools/converters/mil/mil/passes/test_fp16_compute_precision.py b/coremltools/converters/mil/mil/passes/test_fp16_compute_precision.py
index 9f92b4e34..28bf90c27 100644
--- a/coremltools/converters/mil/mil/passes/test_fp16_compute_precision.py
+++ b/coremltools/converters/mil/mil/passes/test_fp16_compute_precision.py
@@ -3,6 +3,11 @@
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import unittest
+
+import numpy as np
+
+import coremltools as ct
 from coremltools._deps import _IS_MACOS
 from coremltools.converters.mil.mil import Builder as mb
 from coremltools.converters.mil.mil.passes import quantization_passes as transform
@@ -11,9 +16,7 @@
     get_op_types_in_program,
     apply_pass_and_basic_check,
 )
-import unittest
-import numpy as np
-import coremltools as ct
+
 
 np.random.seed(1984)
 
@@ -84,11 +87,11 @@ def prog(x):
             prog, transform.FP16ComputePrecision(op_selector=lambda op: True)
         )
 
-        mlmodel = ct.convert(prog, source="milinternal")
+        mlmodel = ct.convert(prog, source="milinternal", compute_units=ct.ComputeUnit.CPU_ONLY)
         input_dict = {"x": np.random.rand(10, 20)}
 
         if _IS_MACOS:
-            prediction = mlmodel.predict(input_dict, useCPUOnly=True)
+            prediction = mlmodel.predict(input_dict)
             assert(not np.isnan(prediction['real_div_0']).any())
             assert(np.isfinite(prediction['real_div_0']).all())
 
diff --git a/coremltools/converters/mil/mil/passes/test_noop_elimination.py b/coremltools/converters/mil/mil/passes/test_noop_elimination.py
index 5b694eb62..151958649 100644
--- a/coremltools/converters/mil/mil/passes/test_noop_elimination.py
+++ b/coremltools/converters/mil/mil/passes/test_noop_elimination.py
@@ -4,18 +4,25 @@
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
 import itertools
+
 import numpy as np
 import pytest
 
 from coremltools.converters.mil.mil import Builder as mb
 from coremltools.converters.mil.testing_utils import (
+    apply_pass_and_basic_check,
     assert_model_is_valid,
     get_op_types_in_program,
-    apply_pass_and_basic_check,
 )
 
 
-@pytest.mark.parametrize("op_type, pos, val", itertools.product(['add', 'mul', 'floor_div', 'pow', 'real_div', 'sub'], ['x', 'y'], [0, 1, [0, 0, 0, 0], [1, 1, 1, 1]]))
+@pytest.mark.parametrize(
+    "op_type, pos, val", itertools.product(
+        ['add', 'mul', 'floor_div', 'pow', 'real_div', 'sub'],
+        ['x', 'y'],
+        [0, 1, [0, 0, 0, 0], [1, 1, 1, 1]]
+    )
+)
 def test_elementwise_elimination(op_type, pos, val):
     if 'div' in op_type and np.prod(val) == 0:
         return
@@ -89,7 +96,7 @@ def test_reshape_elimination():
     @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))])
     def prog(x):
         r1 = mb.reshape(x=x, shape=[1, 8])
-        r2 = mb.reshape(x=r1, shape=[1, 8])
+        mb.reshape(x=r1, shape=[1, 8])
         return mb.relu(x=r1)
 
     prev_prog, prev_block, block = apply_pass_and_basic_check(
@@ -400,5 +407,3 @@ def prog(x):
         {"x": (2, 3, 4)},
         expected_output_shapes={block.outputs[0].name: (2, 3, 4)},
     )
-
-
diff --git a/coremltools/converters/mil/mil/passes/test_passes.py b/coremltools/converters/mil/mil/passes/test_passes.py
index 937ff4eb9..3ba008ae6 100644
--- a/coremltools/converters/mil/mil/passes/test_passes.py
+++ b/coremltools/converters/mil/mil/passes/test_passes.py
@@ -3,8 +3,21 @@
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import copy
+import itertools
+
+import numpy as np
+import pytest
+
 import coremltools as ct
-from coremltools.converters.mil.mil import Builder as mb
+from coremltools.converters.mil.mil import (
+    Builder as mb,
+    Function,
+    get_new_symbol,
+    Program,
+    Symbol,
+    types
+)
 from coremltools.converters.mil.testing_utils import (
     assert_op_count_match,
     assert_model_is_valid,
@@ -12,16 +25,10 @@
     get_op_types_in_program,
     apply_pass_and_basic_check,
 )
-from coremltools.converters.mil.mil import Function, get_new_symbol, Program, Symbol, types
 from coremltools.converters.mil.experimental.passes.generic_pass_infrastructure import register_generic_pass
 from coremltools.converters.mil.mil.passes.helper import _check_var_scalar_value
 from coremltools.converters.mil.mil.passes.pass_registry import PASS_REGISTRY
-import copy
-import pytest
-import itertools
-import os
 
-import numpy as np
 
 np.random.seed(1984)
 validate_model = True
@@ -119,7 +126,7 @@ def program1(x):
         bias = mb.const(val=bias_val)
 
         # unused op and its inputs should be eliminated
-        weights_for_matmul = mb.transpose(x=weights, perm=[1,0])
+        weights_for_matmul = mb.transpose(x=weights, perm=[1, 0])
         mb.matmul(x=x, y=weights_for_matmul)
 
         return mb.linear(x=x, weight=weights, bias=bias)
@@ -1566,3 +1573,201 @@ def prog(shape):
         assert get_op_types_in_program(prog) == ["cast", "random_uniform", "random_uniform", "add"]
 
 
+class TestPreluFusion:
+
+    @pytest.mark.parametrize(
+        "swap_input_order, alpha_rank",
+        itertools.product(
+            [True, False],
+            [3, 4],
+        )
+    )
+    def test_channel_first_pattern(self, swap_input_order, alpha_rank):
+        """
+        Input:
+                          | ------------> relu --------------------|
+                          |                                        V
+           x (BCHW) ------|                                       add -----> y (BCHW)
+                          |                                        ^
+                          --------> mul -------> relu -----> mul---|
+                                    ^                         ^
+                                    |                         |
+                                Const(val=-1)               Const(name=a, shape=(1,C,1,1))
+
+        Output:
+            x (BCHW) ------> prelu(alpha=-a, shape=(C,)) --------> y (BCHW)
+        """
+        B, C, H, W = 2, 3, 5, 6
+
+        if alpha_rank == 3:
+            alpha = np.random.rand(C, 1, 1)
+        elif alpha_rank == 4:
+            alpha = np.random.rand(1, C, 1, 1)
+        else:
+            raise NotImplementedError("alpha rank must be 3 or 4")
+
+        @mb.program(input_specs=[mb.TensorSpec(shape=(B, C, H, W))])
+        def prog(x):
+            if swap_input_order:
+                neg = mb.mul(x=x, y=-1)
+            else:
+                neg = mb.mul(x=-1, y=x)
+            relu1 = mb.relu(x=neg)
+            if swap_input_order:
+                mul = mb.mul(x=relu1, y=alpha)
+            else:
+                mul = mb.mul(x=alpha, y=relu1)
+            relu2 = mb.relu(x=x)
+            if swap_input_order:
+                out = mb.add(x=relu2, y=mul)
+            else:
+                out = mb.add(x=mul, y=relu2)
+            return out
+
+        prev_prog, _, _ = apply_pass_and_basic_check(
+            prog, "common::fuse_prelu",
+        )
+        assert get_op_types_in_program(prev_prog) == ["mul", "relu", "mul", "relu", "add"]
+        assert get_op_types_in_program(prog) == ["prelu"]
+
+
+    @pytest.mark.parametrize(
+        "swap_input_order, alpha_rank",
+        itertools.product(
+            [True, False],
+            [1, 2, 3],
+        )
+    )
+    def test_channel_last_transpose_pattern(self, swap_input_order, alpha_rank):
+        """
+        Input:
+
+                                                        | ------------> relu --------------------|
+                                                        |                                        V
+        x (shape=BCHW)--->transpose(out_shape=BHWC)---->|                                       add -----> y (BHWC)
+                                                        |                                        ^
+                                                        --------> mul -------> relu -----> mul---|
+                                                                   ^                        ^
+                                                                   |                        |
+                                                           Const(val=-1)             Const(shape=(1,1,C))
+
+        Output:
+            x (BCHW) ------> prelu ---------> transpose ------> y (BHWC)
+        """
+        B, C, H, W = 2, 3, 5, 6
+        if alpha_rank == 1:
+            alpha = np.random.rand(C)
+        elif alpha_rank == 2:
+            alpha = np.random.rand(1, C)
+        elif alpha_rank == 3:
+            alpha = np.random.rand(1, 1, C)
+        else:
+            raise NotImplementedError("alpha rank must be 1 or 2 or 3")
+
+        @mb.program(input_specs=[mb.TensorSpec(shape=(B, C, H, W))])
+        def prog(x):
+            x = mb.transpose(x=x, perm=[0,2,3,1])
+            if swap_input_order:
+                neg = mb.mul(x=x, y=-1)
+            else:
+                neg = mb.mul(x=-1, y=x)
+            relu1 = mb.relu(x=neg)
+            if swap_input_order:
+                mul = mb.mul(x=relu1, y=alpha)
+            else:
+                mul = mb.mul(x=alpha, y=relu1)
+            relu2 = mb.relu(x=x)
+            if swap_input_order:
+                out = mb.add(x=relu2, y=mul)
+            else:
+                out = mb.add(x=mul, y=relu2)
+            return out
+
+        prev_prog, _, block = apply_pass_and_basic_check(
+            prog, "common::fuse_prelu",
+        )
+        assert get_op_types_in_program(prev_prog) == ["transpose", "mul", "relu", "mul", "relu", "add"]
+        assert get_op_types_in_program(prog) == ["prelu", "transpose"]
+        assert_model_is_valid(
+            prog,
+            {"x": (B, C, H, W)},
+            expected_output_shapes={block.outputs[0].name: (B, H, W, C)},
+        )
+
+
+class TestUpdateOutputDtypePass:
+
+    def test_single_output(self):
+        """
+        Given:
+        ------
+        main(%input: (1, 20, int32)(Tensor)) {
+          block0() {
+            %relu: (1, 20, int32)(Tensor) = relu(x=%input, name="relu")
+            %output_relu6: (1, 20, int32)(Tensor) = relu6(x=%input, name="output_relu6")
+          } -> (%output_relu6)
+        }
+        prog.main_output_types = [ct.TensorType(dtype=np.float16)]
+
+        Result:
+        ------
+        main(%input: (1, 20, int32)(Tensor)) {
+          block0() {
+            %relu: (1, 20, int32)(Tensor) = relu(x=%input, name="relu")
+            %output_relu6_type_int32: (1, 20, int32)(Tensor) = relu6(x=%input, name="output_relu6")
+            %output_relu6: (1, 20, fp16)(Tensor) = cast(x=%output_relu6_type_int32, dtype="fp16", name="cast_0")
+          } -> (%output_relu6)
+        }
+        """
+        @mb.program(input_specs=[mb.TensorSpec(shape=(1, 20), dtype=types.int32)])
+        def prog(input):
+            x = mb.relu(x=input, name="relu")
+            x = mb.relu6(x=input, name="output_relu6")
+            return x
+
+        prog.set_main_output_types([ct.TensorType(dtype=np.float16)])
+        prev_prog, prev_block, block = apply_pass_and_basic_check(
+            prog, "common::update_output_dtypes"
+        )
+        assert get_op_types_in_program(prev_prog) == ["relu", "relu6"]
+        assert prev_block.outputs[0].dtype == types.int32
+        assert get_op_types_in_program(prog) == ["relu", "relu6", "cast"]
+        assert block.outputs[0].dtype == types.fp16
+        assert block.outputs[0].name == "output_relu6"
+
+    def test_multiple_outputs(self):
+        """
+        Given:
+        -----
+        main(%input: (1, 20, int32)(Tensor)) {
+          block0() {
+            %split_0: (1, 10, int32)(Tensor), %split_1: (1, 10, int32)(Tensor) = split(x=%input, num_splits=2, axis=1, name="split")
+          } -> (%split_0, %split_1)
+        }
+        prog.main_output_types = [ct.TensorType(), ct.TensorType(dtype=np.float16)]
+
+        Result:
+        ------
+        main(%input: (1, 20, int32)(Tensor)) {
+          block0() {
+            %split_0: (1, 10, int32)(Tensor), %split_1_type_int32: (1, 10, int32)(Tensor) = split(x=%input, num_splits=2, axis=1, name="split")
+            %split_1: (1, 10, fp16)(Tensor) = cast(x=%split_1_type_int32, dtype="fp16", name="cast_0")
+          } -> (%split_0, %split_1)
+        }
+
+        """
+        @mb.program(input_specs=[mb.TensorSpec(shape=(1, 20), dtype=types.int32)])
+        def prog(input):
+            x1, x2 = mb.split(x=input, num_splits=2, axis=1, name="split")
+            return x1, x2
+
+        prog.set_main_output_types([ct.TensorType(), ct.TensorType(dtype=np.float16)])
+        _, _, block = apply_pass_and_basic_check(
+            prog, "common::update_output_dtypes"
+        )
+        assert get_op_types_in_program(prog) == ["split", "cast"]
+        assert block.outputs[1].dtype == types.fp16
+        assert block.outputs[1].name == "split_1"
+
+
+
diff --git a/coremltools/converters/mil/mil/passes/test_reduce_transposes_pass.py b/coremltools/converters/mil/mil/passes/test_reduce_transposes_pass.py
index 62b6abef6..57e4f4774 100644
--- a/coremltools/converters/mil/mil/passes/test_reduce_transposes_pass.py
+++ b/coremltools/converters/mil/mil/passes/test_reduce_transposes_pass.py
@@ -3,19 +3,21 @@
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
-from coremltools.converters.mil.mil import Builder as mb
-from coremltools.converters.mil.mil import get_new_symbol
+import numpy as np
+import pytest
+import unittest
+
+from .reduce_transposes import _find_transpose_compliment
+from coremltools.converters.mil.mil import (
+    Builder as mb,
+    get_new_symbol
+)
 from coremltools.converters.mil.mil.passes.pass_registry import PASS_REGISTRY
 from coremltools.converters.mil.testing_utils import (
-    assert_op_count_match,
+    apply_pass_and_basic_check,
     assert_model_is_valid,
     get_op_types_in_program,
-    apply_pass_and_basic_check,
 )
-import numpy as np
-import pytest
-from .reduce_transposes import _find_transpose_compliment
-import unittest
 
 
 np.random.seed(1984)
@@ -1723,9 +1725,9 @@ def prog(x):
     def test_materialized_output_reuse(self):
         @mb.program(input_specs=[mb.TensorSpec(shape=(2, 5))])
         def prog(x):
-            x1 = mb.transpose(x=x, perm=[1,0])
+            x1 = mb.transpose(x=x, perm=[1, 0])
             y1 = mb.relu(x=x1)
-            y2 = mb.transpose(x=y1, perm=[1,0])
+            y2 = mb.transpose(x=y1, perm=[1, 0])
             return y1, y2
 
         prev_prog, prev_block, block = apply_pass_and_basic_check(
@@ -1746,8 +1748,8 @@ def prog(x):
         assert_model_is_valid(
             prog,
             {'x': (2, 5)},
-            expected_output_shapes={block.outputs[0].name: (5,2),
-                                    block.outputs[1].name: (2,5)}
+            expected_output_shapes={block.outputs[0].name: (5, 2),
+                                    block.outputs[1].name: (2, 5)}
         )
 
     """
diff --git a/coremltools/converters/mil/mil/passes/test_replace_stack_reshape.py b/coremltools/converters/mil/mil/passes/test_replace_stack_reshape.py
index e5b11f7db..641c60419 100644
--- a/coremltools/converters/mil/mil/passes/test_replace_stack_reshape.py
+++ b/coremltools/converters/mil/mil/passes/test_replace_stack_reshape.py
@@ -3,9 +3,11 @@
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
-import numpy as np
 import unittest
 
+import numpy as np
+
+import coremltools as ct
 from coremltools._deps import _IS_MACOS
 from coremltools.converters.mil.mil import Builder as mb
 from coremltools.converters.mil.testing_utils import (
@@ -13,7 +15,6 @@
     get_op_types_in_program,
     apply_pass_and_basic_check,
 )
-from coremltools.converters.mil.testing_reqs import ct
 
 
 np.random.seed(1984)
@@ -59,7 +60,11 @@ def prog(x1, x2):
 
         output_name = block.outputs[0].name
 
-        mlmodel = ct.convert(prog, source="milinternal", convert_to="neuralnetwork")
+        mlmodel = ct.convert(prog,
+                             source="milinternal",
+                             convert_to="neuralnetwork",
+                             compute_units=ct.ComputeUnit.CPU_ONLY
+        )
 
         if not _IS_MACOS:
             # Can not get predictions unless on macOS.
@@ -71,7 +76,7 @@ def prog(x1, x2):
 
         old_prediction = np.reshape(np.stack([input_dict["x1"], input_dict["x2"]], axis=2), newshape=[1, 10, 3, 4])
 
-        prediction = mlmodel.predict(input_dict, useCPUOnly=True)
+        prediction = mlmodel.predict(input_dict)
 
         np.testing.assert_allclose(old_prediction, prediction[output_name], atol=1e-04, rtol=1e-05)
 
@@ -113,7 +118,11 @@ def prog(x1, x2):
 
         output_name = block.outputs[0].name
 
-        mlmodel = ct.convert(prog, source="milinternal", convert_to="neuralnetwork")
+        mlmodel = ct.convert(prog,
+                             source="milinternal",
+                             convert_to="neuralnetwork",
+                             compute_units=ct.ComputeUnit.CPU_ONLY
+        )
 
         if not _IS_MACOS:
             # Can not get predictions unless on macOS.
@@ -125,7 +134,7 @@ def prog(x1, x2):
 
         old_prediction = np.reshape(np.stack([input_dict["x1"], input_dict["x2"]], axis=1), newshape=[1, 10, 3, 4])
 
-        prediction = mlmodel.predict(input_dict, useCPUOnly=True)
+        prediction = mlmodel.predict(input_dict)
         np.testing.assert_allclose(old_prediction, prediction[output_name], atol=1e-04, rtol=1e-05)
 
     def test_multiple(self):
@@ -160,7 +169,11 @@ def prog(x1, x2, x3, x4):
 
         output_name = block.outputs[0].name
 
-        mlmodel = ct.convert(prog, source="milinternal", convert_to="neuralnetwork")
+        mlmodel = ct.convert(prog,
+                             source="milinternal",
+                             convert_to="neuralnetwork",
+                             compute_units=ct.ComputeUnit.CPU_ONLY
+        )
 
         if not _IS_MACOS:
             # Can not get predictions unless on macOS.
@@ -174,7 +187,7 @@ def prog(x1, x2, x3, x4):
         branch_2 = np.reshape(np.stack([input_dict['x3'], input_dict['x4']], axis=1), newshape=[1, 4, 3, 4])
         old_prediction = np.reshape(np.stack([branch_1, branch_2], axis=2), newshape=[1, 4, 6, 4])
 
-        prediction = mlmodel.predict(input_dict, useCPUOnly=True)
+        prediction = mlmodel.predict(input_dict)
 
         np.testing.assert_allclose(old_prediction, prediction[output_name], atol=1e-04, rtol=1e-05)
 
diff --git a/coremltools/converters/mil/mil/passes/test_use_reflection_padding.py b/coremltools/converters/mil/mil/passes/test_use_reflection_padding.py
index 854839f9f..576dc2199 100644
--- a/coremltools/converters/mil/mil/passes/test_use_reflection_padding.py
+++ b/coremltools/converters/mil/mil/passes/test_use_reflection_padding.py
@@ -17,7 +17,7 @@
 
 
 class TestUseReflectionPadding:
-    
+
     def test_success_w_axis(self):
         @mb.program(input_specs=[mb.TensorSpec(shape=(1, 2, 6, 8))])
         def prog(x1):
@@ -86,7 +86,7 @@ def prog(x1):
             prog,
             inputs,
             expected_output_shapes={block.outputs[0].name: (1, 2, 8, 8)},
-        )        
+        )
 
 
     def test_failure_wrong_concat_order(self):
@@ -94,7 +94,7 @@ def test_failure_wrong_concat_order(self):
         def prog(x1):
             left = mb.slice_by_index(x=x1, begin=[0, 0, 1, 0], end=[0, 0, 2, 0], end_mask=[True, True, False, True])
             right = mb.slice_by_index(x=x1, begin=[0, 0, -2, 0], end=[0, 0, -1, 0], end_mask=[True, True, False, True])
-            # Concat is not in correct order 
+            # Concat is not in correct order
             x = mb.concat(values=[left, right, x1], axis=2)
 
             return x
@@ -110,7 +110,7 @@ def prog(x1):
             prog,
             inputs,
             expected_output_shapes={block.outputs[0].name: (1, 2, 8, 8)},
-        )        
+        )
 
 
     def test_failure_wrong_concat_order_2(self):
@@ -160,7 +160,7 @@ def prog(x1):
             prog,
             inputs,
             expected_output_shapes={block.outputs[0].name: (1, 2, 9, 8)},
-        )        
+        )
 
 
     def test_failure_not_all_same_input(self):
diff --git a/coremltools/converters/mil/mil/passes/update_output_dtypes.py b/coremltools/converters/mil/mil/passes/update_output_dtypes.py
new file mode 100644
index 000000000..a7b942dba
--- /dev/null
+++ b/coremltools/converters/mil/mil/passes/update_output_dtypes.py
@@ -0,0 +1,53 @@
+#  Copyright (c) 2021, Apple Inc. All rights reserved.
+#
+#  Use of this source code is governed by a BSD-3-clause license that can be
+#  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+from coremltools.converters.mil.mil import Builder as mb, types as types
+from coremltools.converters.mil.mil.passes.pass_registry import register_pass
+from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
+
+@register_pass(namespace="common")
+class update_output_dtypes(AbstractGraphPass):
+    """
+    Update the dtypes of output vars of the main block to match the dtypes
+    provided in prog.main_output_types, which in turn is populated by the
+    "outputs" argument provided by the user in the coremltools.convert() API.
+    This graph pass assumes that the list of outputs in prog.main_output_types (if not None)
+    is in the same order as the output vars.
+    """
+
+    def apply(self, prog):
+        user_provided_output_types = prog.main_output_types
+        main_func = prog.functions["main"]
+        output_vars = main_func.outputs
+        if user_provided_output_types is None or len(user_provided_output_types) == 0:
+            return
+        if len(output_vars) != len(user_provided_output_types):
+            msg = "Number of outputs provided by the user, which is {}, " \
+                  "does not match the number of outputs generated by the model, which is {}"
+            raise ValueError(msg.format(len(user_provided_output_types), len(output_vars)))
+
+        new_outputs = []
+        for i, output_type in enumerate(user_provided_output_types):
+            required_output_dtype = output_type.dtype
+            output_var = output_vars[i]
+            if required_output_dtype is None or \
+                not (types.is_tensor(output_var.sym_type) or types.is_scalar(output_var.sym_type)) or \
+                required_output_dtype == output_var.dtype:
+                # no need to update the output var's dtype in this case
+                new_outputs.append(output_var)
+            else:
+                output_var_name = output_var.name
+                output_var.set_name(output_var_name + "_type_" + types.builtin_to_string(output_var.dtype))
+                with main_func:
+                    output_var = mb.cast(x=output_var, dtype=types.builtin_to_string(required_output_dtype))
+                    output_var.set_name(output_var_name)
+                new_outputs.append(output_var)
+
+        main_func.set_outputs(new_outputs)
+
+
+
+
+
diff --git a/coremltools/converters/mil/mil/passes/use_reflection_padding.py b/coremltools/converters/mil/mil/passes/use_reflection_padding.py
index dec0099ed..29ca65665 100644
--- a/coremltools/converters/mil/mil/passes/use_reflection_padding.py
+++ b/coremltools/converters/mil/mil/passes/use_reflection_padding.py
@@ -18,12 +18,12 @@ def _match_pattern(concat_op, block):
         return False
 
     # The original input will need to be in the middle of the concatenated inputs
-    original_input = concat_inputs[len(concat_inputs)//2]
+    original_input = concat_inputs[len(concat_inputs) // 2]
 
     axis = None
     slice_ops_out = []
     end_mask = None
-    begin_index = len(concat_inputs)//2
+    begin_index = len(concat_inputs) // 2
 
     for slice_op in concat_inputs:
 
diff --git a/coremltools/converters/mil/mil/program.py b/coremltools/converters/mil/mil/program.py
index 7a9c3a083..68708f312 100644
--- a/coremltools/converters/mil/mil/program.py
+++ b/coremltools/converters/mil/mil/program.py
@@ -17,8 +17,10 @@
 class Program:
     def __init__(self):
         self.main_input_types = []
+        self.main_output_types = None
         self.functions = {}
         self.parameters = {}
+        self.skip_all_passes = False
 
     def add_function(self, name, ssa_func):
         if not isinstance(ssa_func, Function):
@@ -35,6 +37,13 @@ def set_main_input_types(self, inputs):
             raise ValueError("main inputs should be tuple of InputSpec")
         self.main_input_types = inputs
 
+    def set_main_output_types(self, outputs=None):
+        if outputs is not None:
+            if not (isinstance(outputs, list) and all([isinstance(out, InputType) for out in outputs])):
+                raise TypeError("main outputs should be a list of type ct.TensorType or ct.ImageType")
+        self.main_output_types = outputs
+
+
     def find_ops(self, prefix=None, op_type=None, exactly_one=False):
         """
         Return list of ops with name matching `prefix` if specified, and
diff --git a/coremltools/converters/mil/mil/tests/test_block.py b/coremltools/converters/mil/mil/tests/test_block.py
index 6e15058f8..b246c6b3b 100644
--- a/coremltools/converters/mil/mil/tests/test_block.py
+++ b/coremltools/converters/mil/mil/tests/test_block.py
@@ -1,18 +1,17 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import copy
 
 from coremltools.converters.mil.mil import Builder as mb
 from coremltools.converters.mil.testing_utils import (
-    get_op_types_in_program,
-    assert_same_output_shapes,
     assert_same_output_names,
+    assert_same_output_shapes,
+    get_op_types_in_program,
 )
-import copy
+
 
 """
 Test manipulating variable and operations in the Block.
diff --git a/coremltools/converters/mil/mil/types/get_type_info.py b/coremltools/converters/mil/mil/types/get_type_info.py
index 017c72f3a..74d471397 100644
--- a/coremltools/converters/mil/mil/types/get_type_info.py
+++ b/coremltools/converters/mil/mil/types/get_type_info.py
@@ -1,11 +1,8 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
-
 from .type_spec import FunctionType, Type
 from .type_void import void
 
diff --git a/coremltools/converters/mil/mil/types/global_methods.py b/coremltools/converters/mil/mil/types/global_methods.py
index 701527519..b739beed6 100644
--- a/coremltools/converters/mil/mil/types/global_methods.py
+++ b/coremltools/converters/mil/mil/types/global_methods.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
diff --git a/coremltools/converters/mil/mil/types/type_bool.py b/coremltools/converters/mil/mil/types/type_bool.py
index 6689115a1..9c74bd3de 100644
--- a/coremltools/converters/mil/mil/types/type_bool.py
+++ b/coremltools/converters/mil/mil/types/type_bool.py
@@ -1,11 +1,9 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
-from .annotate import class_annotate, annotate, delay_type
+from .annotate import annotate, class_annotate, delay_type
 from .type_spec import Type
 
 
diff --git a/coremltools/converters/mil/mil/types/type_dict.py b/coremltools/converters/mil/mil/types/type_dict.py
index 932637149..4a6ca452f 100644
--- a/coremltools/converters/mil/mil/types/type_dict.py
+++ b/coremltools/converters/mil/mil/types/type_dict.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
diff --git a/coremltools/converters/mil/mil/types/type_double.py b/coremltools/converters/mil/mil/types/type_double.py
index bbdbf53a1..08f8c9a8c 100644
--- a/coremltools/converters/mil/mil/types/type_double.py
+++ b/coremltools/converters/mil/mil/types/type_double.py
@@ -1,14 +1,12 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import logging
 
-import numpy as np
 import math
-import logging
+import numpy as np
 
 from .annotate import class_annotate, annotate, delay_type
 from .type_bool import bool
@@ -38,13 +36,18 @@ def val(self, v):
             )
 
             if not isinstance(v, np.generic):
-                raise ValueError(
-                    "types should have value of numpy type, got {} instead".format(
-                        type(v)
+
+                if isinstance(v, np.ndarray) and v.ndim == 0:
+                    # Rank zero tensor case. Use as a scalar.
+                    self._val = v.item()
+                else:
+                    raise ValueError(
+                        "types should have value of numpy type, got {} instead".format(
+                            type(v)
+                        )
                     )
-                )
 
-            if isinstance(v, np.floating):
+            elif isinstance(v, np.floating):
                 v_type = numpy_type_to_builtin_type(v.dtype)
                 if v_type.get_bitwidth() <= self.get_bitwidth():
                     self._val = v
diff --git a/coremltools/converters/mil/mil/types/type_globals_pseudo_type.py b/coremltools/converters/mil/mil/types/type_globals_pseudo_type.py
index 3c51f91a8..b849ba95e 100644
--- a/coremltools/converters/mil/mil/types/type_globals_pseudo_type.py
+++ b/coremltools/converters/mil/mil/types/type_globals_pseudo_type.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
diff --git a/coremltools/converters/mil/mil/types/type_int.py b/coremltools/converters/mil/mil/types/type_int.py
index 23e6ea793..db5f8873c 100644
--- a/coremltools/converters/mil/mil/types/type_int.py
+++ b/coremltools/converters/mil/mil/types/type_int.py
@@ -1,15 +1,13 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import logging
+import math
 
 import numpy as np
 import sympy as sm
-import math
-import logging
 
 from .annotate import class_annotate, annotate, delay_type
 from .type_bool import bool
diff --git a/coremltools/converters/mil/mil/types/type_list.py b/coremltools/converters/mil/mil/types/type_list.py
index 76d738d0c..cb8247bbb 100644
--- a/coremltools/converters/mil/mil/types/type_list.py
+++ b/coremltools/converters/mil/mil/types/type_list.py
@@ -1,15 +1,13 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
-from .annotate import annotate
-from .type_void import void
 from . import type_int
-from .type_spec import Type
+from .annotate import annotate
 from .get_type_info import get_type_info
+from .type_spec import Type
+from .type_void import void
 
 
 def memoize(f):
diff --git a/coremltools/converters/mil/mil/types/type_mapping.py b/coremltools/converters/mil/mil/types/type_mapping.py
index 59be98ced..479ff2590 100644
--- a/coremltools/converters/mil/mil/types/type_mapping.py
+++ b/coremltools/converters/mil/mil/types/type_mapping.py
@@ -2,6 +2,8 @@
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+import numpy as _np
+
 import coremltools.proto.MIL_pb2 as _mil_pm
 from coremltools.converters.mil.mil import types as _mil_types
 
@@ -70,8 +72,6 @@ def np_dtype_to_py_type(np_dtype):
         return bool
     if np_dtype in [np.float32, np.float64]:
         return float
-    if np_dtype == np.float16:
-        return bytes
     raise NotImplementedError('{} is not supported'.format(np_dtype))
 
 _STRINGS_TO_types = {v: k for k, v in _types_TO_STRINGS.items()}
@@ -172,9 +172,9 @@ def is_primitive(btype):
 
 def is_scalar(btype):
     """
-    Is the given builtin type a scalar integer, float, or boolean?
+    Is the given builtin type a scalar integer, float, boolean or string?
     """
-    return is_bool(btype) or is_int(btype) or is_float(btype)
+    return is_bool(btype) or is_int(btype) or is_float(btype) or is_str(btype)
 
 
 def is_tensor(tensor_type):
@@ -376,3 +376,24 @@ def is_subtype(type1, type2):
 }
 
 proto_to_builtin_types = {v: k for k, v in builtin_to_proto_types.items()}
+
+
+def np_val_to_py_type(val):
+    """Convert numpy val to python primitive equivalent. Ex:
+
+    Given: val = np.array([True, False])
+    Returns: [True, False]
+
+    Given: val = np.array(32, dtype=int)
+    Returns: 32
+    """
+    if not isinstance(val, (_np.ndarray, _np.generic)):
+        return val
+
+    if val.dtype in [_np.float16, _np.uint8, _np.int8, _np.uint32]:
+        return val.tobytes()
+    else:
+        # val is np.ndarray or np.generic
+        is_np_scalar = isinstance(val, _np.generic) or val.shape == ()
+        py_type = np_dtype_to_py_type(val.dtype)
+        return py_type(val) if is_np_scalar else tuple(py_type(v) for v in val.flatten())
\ No newline at end of file
diff --git a/coremltools/converters/mil/mil/types/type_spec.py b/coremltools/converters/mil/mil/types/type_spec.py
index 1039b757a..ef46bd898 100644
--- a/coremltools/converters/mil/mil/types/type_spec.py
+++ b/coremltools/converters/mil/mil/types/type_spec.py
@@ -1,12 +1,8 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
-
-
 class Type:
     """
      - Type.name : A string with the name of the object
diff --git a/coremltools/converters/mil/mil/types/type_str.py b/coremltools/converters/mil/mil/types/type_str.py
index f3c98d341..11e26774e 100644
--- a/coremltools/converters/mil/mil/types/type_str.py
+++ b/coremltools/converters/mil/mil/types/type_str.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
diff --git a/coremltools/converters/mil/mil/types/type_tensor.py b/coremltools/converters/mil/mil/types/type_tensor.py
index 6602af7e3..67110ba17 100644
--- a/coremltools/converters/mil/mil/types/type_tensor.py
+++ b/coremltools/converters/mil/mil/types/type_tensor.py
@@ -1,14 +1,12 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import logging
 
 import numpy as np
 import sympy as sm
-import logging
 
 from .type_spec import Type
 from .get_type_info import get_type_info
diff --git a/coremltools/converters/mil/mil/types/type_tuple.py b/coremltools/converters/mil/mil/types/type_tuple.py
index 60e88b0cf..1bb7435ef 100644
--- a/coremltools/converters/mil/mil/types/type_tuple.py
+++ b/coremltools/converters/mil/mil/types/type_tuple.py
@@ -1,15 +1,13 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+from . import type_int, type_unknown
 from .annotate import annotate
-from . import type_int
-from . import type_unknown
-from .type_spec import Type
 from .get_type_info import get_type_info
+from .type_spec import Type
+
 
 _global_tuple = tuple
 
diff --git a/coremltools/converters/mil/mil/types/type_unknown.py b/coremltools/converters/mil/mil/types/type_unknown.py
index 7bf05f033..af9402291 100644
--- a/coremltools/converters/mil/mil/types/type_unknown.py
+++ b/coremltools/converters/mil/mil/types/type_unknown.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
diff --git a/coremltools/converters/mil/mil/types/type_void.py b/coremltools/converters/mil/mil/types/type_void.py
index 392b481a7..7abb90088 100644
--- a/coremltools/converters/mil/mil/types/type_void.py
+++ b/coremltools/converters/mil/mil/types/type_void.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
 #  Copyright (c) 2020, Apple Inc. All rights reserved.
 #
 #  Use of this source code is governed by a BSD-3-clause license that can be
diff --git a/coremltools/converters/mil/mil/var.py b/coremltools/converters/mil/mil/var.py
index 4f5568035..57260933f 100644
--- a/coremltools/converters/mil/mil/var.py
+++ b/coremltools/converters/mil/mil/var.py
@@ -8,7 +8,7 @@
 from coremltools.converters.mil.mil.types.symbolic import any_symbolic
 
 
-class Var(object):
+class Var:
     """
     Var represents the outputs of an Operation. Most Vars are derived from an
     Operation (including const), and all Vars must have `sym_type`.
@@ -206,7 +206,7 @@ def __init__(
 
         sym_val: value of the list, if available
         """
-        super(ListVar, self).__init__(
+        super().__init__(
             name=name,
             sym_type=types.list(elem_type, init_length, dynamic_length),
             sym_val=sym_val,
@@ -268,6 +268,6 @@ class InternalVar(Var):
     """
 
     def __init__(self, val, name=None):
-        super(InternalVar, self).__init__(
+        super().__init__(
             name=name, sym_type=types.unknown, sym_val=types.unknown(val)
         )
diff --git a/coremltools/converters/mil/mil/visitors/dot_visitor.py b/coremltools/converters/mil/mil/visitors/dot_visitor.py
index 3c511c205..4471f61b1 100644
--- a/coremltools/converters/mil/mil/visitors/dot_visitor.py
+++ b/coremltools/converters/mil/mil/visitors/dot_visitor.py
@@ -32,7 +32,7 @@ def _get_input_vars(op, only_nonconst_vars=False):
     return input_vars
 
 
-class DotVisitor(object):
+class DotVisitor:
     """
     Generates a dot description of a ssa block
     """
diff --git a/coremltools/converters/mil/test_flexible_shape_inputs.py b/coremltools/converters/mil/test_flexible_shape_inputs.py
index f927ebd1a..9920b758b 100644
--- a/coremltools/converters/mil/test_flexible_shape_inputs.py
+++ b/coremltools/converters/mil/test_flexible_shape_inputs.py
@@ -23,6 +23,7 @@ def __init__(self, in_channels=3, out_channels=10, kernel_size=3):
         def forward(self, x):
             return self.conv(x)
 
+
 def _numpy_array_to_pil_image(x):
     """
     convert x of shape (1, 3, H, W) to PIL image
diff --git a/coremltools/converters/mil/testing_utils.py b/coremltools/converters/mil/testing_utils.py
index 40058acf8..cf9e91784 100644
--- a/coremltools/converters/mil/testing_utils.py
+++ b/coremltools/converters/mil/testing_utils.py
@@ -4,9 +4,10 @@
 #  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
 import copy
-import logging
+from functools import partial
 import os
 from pathlib import Path
+from PIL import Image
 import re
 
 import numpy as np
@@ -18,14 +19,20 @@
 from coremltools.converters.mil.mil.passes.quantization_passes import AbstractQuantizationPass
 from coremltools.converters.mil.mil.passes.pass_registry import PASS_REGISTRY
 import coremltools.models.utils as coremltoolsutils
+from coremltools.proto import FeatureTypes_pb2 as ft
 
 
 np.random.seed(10)
 
+DTYPE_TO_FEATURE_TYPE_MAP = {"int32": ft.ArrayFeatureType.INT32,
+                             "fp32": ft.ArrayFeatureType.FLOAT32,
+                             "fp16": ft.ArrayFeatureType.FLOAT16,
+                             }
+
 def _serialize_current_pytest(mlmodel):
     class_name = os.environ.get('PYTEST_CURRENT_TEST').split("::")[1].strip()
     test_name = "::".join(os.environ.get('PYTEST_CURRENT_TEST').split("::")[2:]).split("(call)")[0].strip()
-    mlpackage_path = "/tmp/pytest_failures/{}/{}/model.mlpackage".format(class_name,test_name)
+    mlpackage_path = "/tmp/pytest_failures/{}/{}/model.mlpackage".format(class_name, test_name)
     Path(mlpackage_path).mkdir(parents=True, exist_ok=True)
     mlmodel.save(mlpackage_path)
 
@@ -65,7 +72,8 @@ def assert_model_is_valid(
     for name, shape in inputs.items():
         input_dict[name] = np.random.rand(*shape)
 
-    mlmodel = ct_convert(program, source="milinternal", convert_to=backend, useCPUOnly=True)
+    mlmodel = ct_convert(program, source="milinternal", convert_to=backend,
+                         compute_units=ct.ComputeUnit.CPU_ONLY)
     assert mlmodel is not None
 
     if verbose:
@@ -73,7 +81,7 @@ def assert_model_is_valid(
         print_network_spec(mlmodel.get_spec(), style="coding")
 
     if _IS_MACOS and (not mlmodel.is_package or coremltoolsutils._macos_version() >= (12, 0)):
-        prediction = mlmodel.predict(input_dict, useCPUOnly=True)
+        prediction = mlmodel.predict(input_dict)
         assert prediction is not None
         if expected_output_shapes is not None:
             for out_name, out_shape in expected_output_shapes.items():
@@ -168,7 +176,7 @@ def to_tuple(v):
     return tuple(v)
 
 
-def run_core_ml_predict(mlmodel, input_key_values, use_cpu_only=True):
+def run_core_ml_predict(mlmodel, input_key_values):
     for k, v in input_key_values.items():
         if isinstance(v, PIL.Image.Image):
             continue
@@ -176,7 +184,7 @@ def run_core_ml_predict(mlmodel, input_key_values, use_cpu_only=True):
             input_key_values[k] = v.astype(np.float32)
         else:
             input_key_values[k] = np.array([v], dtype=np.float32)
-    return mlmodel.predict(input_key_values, useCPUOnly=use_cpu_only)
+    return mlmodel.predict(input_key_values)
 
 def _get_coreml_out_from_dict(out_dict, out_name):
     if out_name in out_dict:
@@ -190,7 +198,6 @@ def compare_backend(
     mlmodel,
     input_key_values,
     expected_outputs,
-    use_cpu_only=True,
     dtype = "fp32",
     atol=1e-04,
     rtol=1e-05,
@@ -204,35 +211,42 @@ def compare_backend(
           input_placeholders.
 
         - expected_outputs: dict[str, np.array]. Required iff
-          frontend_only == False
-
-        - use_cpu_only: True/False.
+          frontend_only is False
     """
     if _IS_MACOS and (not mlmodel.is_package or coremltoolsutils._macos_version() >= (12, 0)):
 
         if dtype not in ["fp32", "fp16"]:
             raise ValueError("Unsupported dtype config")
 
-        pred = run_core_ml_predict(mlmodel, input_key_values,
-                                   use_cpu_only=use_cpu_only)
+        pred = run_core_ml_predict(mlmodel, input_key_values)
         if also_compare_shapes:
             compare_shapes(
                 mlmodel,
                 input_key_values,
                 expected_outputs,
-                use_cpu_only=use_cpu_only,
                 pred=pred,
             )
-        if not use_cpu_only or (dtype == "fp16"):
+        if mlmodel.compute_unit != ct.ComputeUnit.CPU_ONLY or (dtype == "fp16"):
             atol = max(atol * 100.0, 5e-1)
             rtol = max(rtol * 100.0, 5e-2)
         for o, expected in expected_outputs.items():
             coreml_out = _get_coreml_out_from_dict(pred, o)
-            np.testing.assert_allclose(coreml_out, expected, atol=atol, rtol=rtol)
+
+            if isinstance(coreml_out, np.ndarray):
+                np.testing.assert_allclose(coreml_out, expected, atol=atol, rtol=rtol)
+            elif isinstance(coreml_out, dict):
+                for k, v in coreml_out.items():
+                    assert k in expected
+                    assert expected[k] == v
+            else:
+                assert coreml_out == expected
+
+        return pred
+    return None
 
 
 def compare_shapes(
-    mlmodel, input_key_values, expected_outputs, use_cpu_only=True, pred=None
+    mlmodel, input_key_values, expected_outputs, pred=None
 ):
     """
     Inputs:
@@ -243,36 +257,47 @@ def compare_shapes(
 
         - expected_outputs: dict[str, np.array].
 
-        - use_cpu_only: True/False.
-
         - pred: Prediction to use, if it has already been computed.
     """
 
     if _IS_MACOS:
         if not pred:
-            pred = run_core_ml_predict(mlmodel, input_key_values,
-                                       use_cpu_only)
+            pred = run_core_ml_predict(mlmodel, input_key_values)
         for o, expected in expected_outputs.items():
             coreml_out = _get_coreml_out_from_dict(pred, o)
-            msg = "Output: {}. expected shape {} != actual shape {}".format(
-                o, expected.shape, coreml_out.shape
-            )
-            # Core ML does not support scalar as output
-            # remove this special case when support is added
-            if expected.shape == () and coreml_out.shape == (1,):
+            
+            # output is dictionary (for classifier)
+            if isinstance(coreml_out, dict) and isinstance(expected, dict):
+                assert len(coreml_out) == len(expected)
+                continue
+
+            # output is numpy objects
+            np_types = (np.generic, np.ndarray)
+            if isinstance(coreml_out, np_types) and isinstance(expected, np_types):
+                msg = "Output: {}. expected shape {} != actual shape {}".format(
+                    o, expected.shape, coreml_out.shape
+                )
+                # Core ML does not support scalar as output
+                # remove this special case when support is added
+                if expected.shape == () and coreml_out.shape == (1,):
+                    continue
+                assert coreml_out.shape == expected.shape, msg
                 continue
-            assert coreml_out.shape == expected.shape, msg
+
+            # output is other types (for classifier)
+            assert type(coreml_out) == type(expected)
 
 def ct_convert(
     program,
-    source = "auto",
-    inputs = None,
-    outputs = None,
-    classifier_config = None,
-    minimum_deployment_target = None,
-    convert_to = None,
-    compute_precision = None,
-    skip_model_load = False,
+    source="auto",
+    inputs=None,
+    outputs=None,
+    classifier_config=None,
+    minimum_deployment_target=None,
+    convert_to=None,
+    compute_precision=None,
+    skip_model_load=False,
+    converter=ct.convert,
     **kwargs,
 ):
 
@@ -282,6 +307,9 @@ def ct_convert(
     Ex: ("neuralnetwork", "fp32"), ("mlprogram", "fp16")
     """
 
+    if isinstance(converter, partial):
+        raise ValueError("Partial function is not supported for function-parameter 'converter' since its keywords arguments could get overriden.")
+
     target, dtype = convert_to
 
     if dtype not in ["fp32", "fp16"]:
@@ -291,16 +319,16 @@ def ct_convert(
     if target == "neuralnetwork":
         compute_precision = None
 
-    mlmodel = ct.convert(
+    mlmodel = converter(
                 program,
-                source = source,
-                inputs = inputs,
-                outputs = outputs,
-                classifier_config = classifier_config,
-                minimum_deployment_target = minimum_deployment_target,
-                convert_to = target,
-                compute_precision = compute_precision,
-                skip_model_load = skip_model_load,
+                source=source,
+                inputs=inputs,
+                outputs=outputs,
+                classifier_config=classifier_config,
+                minimum_deployment_target=minimum_deployment_target,
+                convert_to=target,
+                compute_precision=compute_precision,
+                skip_model_load=skip_model_load,
                 **kwargs
     )
 
@@ -311,8 +339,8 @@ def ct_convert(
     return mlmodel
 
 def get_core_ml_prediction(
-    build, input_placeholders, input_values, use_cpu_only=True,
-    backend=("neuralnetwork", "fp32")):
+        build, input_placeholders, input_values, use_cpu_only=True,
+        backend=("neuralnetwork", "fp32")):
     """
     Return predictions of the given model.
     """
@@ -326,9 +354,14 @@ def get_core_ml_prediction(
         ssa_func.set_outputs(output_vars)
         program.add_function("main", ssa_func)
 
+    if use_cpu_only:
+        compute_unit = ct.ComputeUnit.CPU_ONLY
+    else:
+        compute_unit = ct.ComputeUnit.ALL
+
     mlmodel = ct_convert(program, source="milinternal",
-                         convert_to=backend, useCPUOnly=use_cpu_only)
-    return mlmodel.predict(input_values, useCPUOnly=use_cpu_only)
+                         convert_to=backend,  compute_units=compute_unit)
+    return mlmodel.predict(input_values)
 
 
 def apply_pass_and_basic_check(prog, pass_name, skip_output_name_check=False):
@@ -344,3 +377,94 @@ def apply_pass_and_basic_check(prog, pass_name, skip_output_name_check=False):
         assert_same_output_names(prev_prog, prog)
     assert_same_output_shapes(prev_prog, prog)
     return prev_prog, prev_block, block
+
+
+def assert_prog_input_type(prog, expected_dtype_str, expected_name=None, index=0):
+    block = prog.functions["main"]
+    if expected_name is None:
+        input_var = list(block.inputs.values())[index]
+        assert input_var.is_tensor_or_scalar_of(dtype=expected_dtype_str)
+    else:
+        for input_var in block.inputs.values():
+            if input_var.name == expected_name:
+                assert input_var.is_tensor_or_scalar_of(dtype=expected_dtype_str)
+
+def assert_spec_input_type(spec, expected_feature_type, expected_name=None, index=0):
+    if expected_name is None:
+        assert spec.description.input[index].type.multiArrayType.dataType == expected_feature_type
+    else:
+        for input in spec.description.input:
+            if input.name == expected_name:
+                assert input.type.multiArrayType.dataType == expected_feature_type
+
+def assert_input_dtype(mlmodel, expected_type_str, expected_name=None, index=0):
+    assert_prog_input_type(mlmodel._mil_program, expected_type_str,
+                           expected_name=expected_name, index=index)
+    assert_spec_input_type(mlmodel._spec, DTYPE_TO_FEATURE_TYPE_MAP[expected_type_str],
+                           expected_name=expected_name, index=index)
+
+def assert_spec_output_type(spec, expected_feature_type, expected_name=None, index=0):
+    assert spec.description.output[index].type.multiArrayType.dataType == expected_feature_type
+    if expected_name is not None:
+        assert spec.description.output[index].name == expected_name
+
+def assert_prog_output_type(prog, expected_dtype_str, expected_name=None, index=0):
+    block = prog.functions["main"]
+    output_var = block.outputs[index]
+    assert output_var.is_tensor_or_scalar_of(dtype=expected_dtype_str)
+    if expected_name is not None:
+        assert output_var.name == expected_name
+
+def assert_output_dtype(mlmodel, expected_type_str, expected_name=None, index=0):
+    assert_prog_output_type(mlmodel._mil_program, expected_type_str,
+                            expected_name=expected_name, index=index)
+    assert_spec_output_type(mlmodel._spec, DTYPE_TO_FEATURE_TYPE_MAP[expected_type_str],
+                            expected_name=expected_name, index=index)
+
+def random_gen_input_feature_type(input_desc):
+    if input_desc.type.WhichOneof("Type") == "multiArrayType":
+        shape = [s for s in input_desc.type.multiArrayType.shape]
+        if input_desc.type.multiArrayType.dataType == ft.ArrayFeatureType.FLOAT32:
+            dtype = np.float32
+        elif input_desc.type.multiArrayType.dataType == ft.ArrayFeatureType.INT32:
+            dtype = np.int32
+        elif input_desc.type.multiArrayType.dataType == ft.ArrayFeatureType.FLOAT16:
+            dtype = np.float16
+        elif input_desc.type.multiArrayType.dataType == ft.ArrayFeatureType.FLOAT64:
+            dtype = np.float64
+        else:
+            raise ValueError("unsupported type")
+        return np.random.rand(*shape).astype(dtype)
+    elif input_desc.type.WhichOneof("Type") == "imageType":
+        if input_desc.type.imageType.colorSpace in (ft.ImageFeatureType.BGR, ft.ImageFeatureType.RGB):
+            shape = [3, input_desc.type.imageType.height, input_desc.type.imageType.width]
+            x = np.random.randint(low=0, high=256, size=shape)
+            return Image.fromarray(np.transpose(x, [1, 2, 0]).astype(np.uint8))
+        else:
+            shape = [input_desc.type.imageType.height, input_desc.type.imageType.width]
+            x = np.random.randint(low=0, high=256, size=shape)
+            return Image.fromarray(x.astype(np.uint8), 'L')
+    else:
+        raise ValueError('unsupported type')
+
+def verify_prediction(mlmodel, multiarray_type=None):
+    spec = mlmodel._spec
+    input_dict = {}
+    for input_desc in spec.description.input:
+        input_dict[input_desc.name] = random_gen_input_feature_type(input_desc)
+        if multiarray_type is not None:
+            input_dict[input_desc.name] = input_dict[input].astype(multiarray_type)
+    mlmodel.predict(input_dict)
+
+def assert_spec_input_image_type(spec, expected_feature_type):
+    assert spec.description.input[0].type.imageType.colorSpace == expected_feature_type
+
+def assert_spec_output_image_type(spec, expected_feature_type):
+    assert spec.description.output[0].type.imageType.colorSpace == expected_feature_type
+
+def assert_cast_ops_count(mlmodel, expected_count):
+    block = mlmodel._mil_program.functions["main"]
+    assert len(block.find_ops(op_type="cast")) == expected_count
+
+def assert_ops_in_mil_program(mlmodel, expected_op_list):
+    assert expected_op_list == get_op_types_in_program(mlmodel._mil_program)
\ No newline at end of file
diff --git a/coremltools/converters/onnx/_backend.py b/coremltools/converters/onnx/_backend.py
deleted file mode 100644
index 602ea3b35..000000000
--- a/coremltools/converters/onnx/_backend.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-from typing import Any, Text, Dict, Tuple
-from onnx import ModelProto
-from onnx.backend.base import Backend
-from ._backend_rep import CoreMLRep
-from ._converter import convert
-import onnx
-from ._graph import _input_from_onnx_input, EdgeInfo
-
-DEBUG = False
-
-
-def _get_onnx_outputs_info(model):  # type: (...) -> Dict[Text, EdgeInfo]
-    """
-    Takes in an onnx model and returns a dictionary
-    of onnx output names mapped to a tuple that is (output_name, type, shape)
-    """
-    if isinstance(model, str):
-        onnx_model = onnx.load(model)
-    elif isinstance(model, onnx.ModelProto):
-        onnx_model = model
-
-    graph = onnx_model.graph
-    onnx_output_dict = {}
-    for o in graph.output:
-        out = _input_from_onnx_input(o)
-        onnx_output_dict[out[0]] = out
-    return onnx_output_dict
-
-
-class CoreMLBackend(Backend):
-    @classmethod
-    def prepare(
-        cls,
-        model,  # type: ModelProto
-        device="CPU",  # type: Text
-        minimum_ios_deployment_target="12",  # type: str
-        **kwargs  # type: Any
-    ):
-        # type: (...) -> CoreMLRep
-        super(CoreMLBackend, cls).prepare(model, device, **kwargs)
-        if DEBUG:
-            with open("/tmp/node_model.onnx", "wb") as f:
-                s = model.SerializeToString()
-                f.write(s)
-        coreml_model = convert(
-            model, minimum_ios_deployment_target=minimum_ios_deployment_target
-        )
-        if DEBUG:
-            coreml_model.save("/tmp/node_model.mlmodel")
-        onnx_outputs_info = _get_onnx_outputs_info(model)
-        return CoreMLRep(
-            coreml_model,
-            onnx_outputs_info,
-            device == "CPU",
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
-
-    @classmethod
-    def is_compatible(
-        cls,
-        model,  # type: ModelProto
-        device="CPU",  # type: Text
-        **kwargs  # type: Any
-    ):  # type: (...) -> bool
-        # Return whether the model is compatible with CoreML.
-        """
-         This function will gradually grow to cover more cases.
-         Need to be careful of false negatives. There are some cases that seemingly
-         are not supported on CoreML, which the graph transformer optimizes and converts to
-         a graph that can be converted to CoreML.
-
-         1. Check whether the layers for which CoreML expects constant weights are in
-            the list of initializers in the onnx graph
-         2. unsupported ops like "And", "Or" etc
-
-         """
-
-        node_set = set()
-        initializer_set = set()
-        graph = model.graph
-        for t in graph.initializer:
-            initializer_set.add(t.name)
-        for node in graph.node:
-            if node.op_type in [
-                "ConvTranspose",
-                "Conv",
-                "BatchNormalization",
-                "InstanceNormalization",
-                "PRelu",
-            ]:
-                if len(node.input) > 1 and node.input[1] not in initializer_set:
-                    return False
-            node_set.add(node.op_type)
-
-        # unsupported ops remove
-        for node in graph.node:
-            if node.op_type in [
-                "Cast",
-                "And",
-                "Or",
-                "Xor",
-                "Not",
-                "Less",
-                "Greater",
-                "Equal",
-                "Ceil",
-                "Floor",
-            ]:
-                return False
-
-        return True
-
-    @classmethod
-    def supports_device(
-        cls, device,  # type: Text
-    ):
-        # type: (...) -> bool
-        return device == "CPU"
-
-
-class CoreMLBackendND(Backend):
-    @classmethod
-    def prepare(
-        cls,
-        model,  # type: ModelProto
-        device="CPU",  # type: Text
-        minimum_ios_deployment_target="13",  # type: str
-        **kwargs  # type: Any
-    ):
-        # type: (...) -> CoreMLRep
-        super(CoreMLBackendND, cls).prepare(model, device, **kwargs)
-        if DEBUG:
-            with open("/tmp/node_model.onnx", "wb") as f:
-                s = model.SerializeToString()
-                f.write(s)
-        coreml_model = convert(
-            model, minimum_ios_deployment_target=minimum_ios_deployment_target
-        )
-        if DEBUG:
-            coreml_model.save("/tmp/node_model.mlmodel")
-        onnx_outputs_info = _get_onnx_outputs_info(model)
-        return CoreMLRep(
-            coreml_model,
-            onnx_outputs_info,
-            device == "CPU",
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
-
-    @classmethod
-    def is_compatible(
-        cls,
-        model,  # type: ModelProto
-        device="CPU",  # type: Text
-        **kwargs  # type: Any
-    ):  # type: (...) -> bool
-        # Return whether the model is compatible with CoreML.
-        """
-        This function will gradually grow to cover more cases.
-        Need to be careful of false negatives. There are some cases that seemingly
-        are not supported on CoreML, which the graph transformer optimizes and converts to
-        a graph that can be converted to CoreML.
-
-        2. Unsupported ops: If graph has one of unsupported op, exit
-
-        """
-        ## TODO: Add un-supported ops
-        unsupported_ops = []
-        graph = model.graph
-        for node in graph.node:
-            if node.op_type in unsupported_ops:
-                return False
-        return True
-
-    @classmethod
-    def supports_device(
-        cls, device,  # type: Text
-    ):
-        # type: (...) -> bool
-        return device == "CPU"
diff --git a/coremltools/converters/onnx/_backend_rep.py b/coremltools/converters/onnx/_backend_rep.py
deleted file mode 100644
index d5044e0ea..000000000
--- a/coremltools/converters/onnx/_backend_rep.py
+++ /dev/null
@@ -1,120 +0,0 @@
-
-# from __future__ import unicode_literals
-
-import numpy as np
-from typing import Any, Sequence, List
-from onnx.backend.base import BackendRep, namedtupledict
-from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE
-from coremltools.proto import FeatureTypes_pb2 as ft  # type: ignore
-from coremltools.models import MLModel  # type: ignore
-from typing import Dict, Any, Text, Tuple
-from onnx import TensorProto
-from ._graph import EdgeInfo
-from ._converter import SupportedVersion
-
-
-def _set_dtypes(
-    input_dict,  # type: Dict[Text, np._ArrayLike[Any]]
-    model,  # type: MLModel
-):
-    # type: (...) -> None
-    spec = model.get_spec()
-    for input_ in spec.description.input:
-        if input_.type.HasField("multiArrayType") and input_.name in input_dict:
-            if input_.type.multiArrayType.dataType == ft.ArrayFeatureType.INT32:
-                input_dict[input_.name] = input_dict[input_.name].astype(np.int32)
-            if input_.type.multiArrayType.dataType == ft.ArrayFeatureType.FLOAT32:
-                input_dict[input_.name] = input_dict[input_.name].astype(np.float32)
-            if input_.type.multiArrayType.dataType == ft.ArrayFeatureType.DOUBLE:
-                input_dict[input_.name] = input_dict[input_.name].astype(np.float64)
-
-
-class CoreMLRep(BackendRep):
-    def __init__(
-        self,
-        coreml_model,  # type: MLModel
-        onnx_outputs_info,  # type: Dict[Text, EdgeInfo]
-        useCPUOnly=False,  # type: bool
-        minimum_ios_deployment_target="12",  # type: str
-    ):
-        # type: (...) -> None
-        super(CoreMLRep, self).__init__()
-        self.model = coreml_model
-        self.useCPUOnly = useCPUOnly
-        self.minimum_ios_deployment_target = minimum_ios_deployment_target
-
-        spec = coreml_model.get_spec()
-        self.input_names = [str(i.name) for i in spec.description.input]
-        self.output_names = [str(o.name) for o in spec.description.output]
-        self.onnx_outputs_info = onnx_outputs_info  # type: Dict[Text, EdgeInfo]
-
-    def run(
-        self,
-        inputs,  # type: Any
-        **kwargs  # type: Any
-    ):
-        # type: (...) -> Tuple[Any, ...]
-        super(CoreMLRep, self).run(inputs, **kwargs)
-        inputs_ = inputs
-        _reshaped = False
-        if not SupportedVersion.is_nd_array_supported(
-            self.minimum_ios_deployment_target
-        ):
-            for i, input_ in enumerate(inputs_):
-                shape = input_.shape
-                if len(shape) == 4 or len(shape) == 2:
-                    inputs_[i] = input_[np.newaxis, :]
-                    _reshaped = True
-                elif len(shape) == 3:
-                    spec = self.model.get_spec()
-                    spec_shape = [
-                        int(k)
-                        for k in spec.description.input[i].type.multiArrayType.shape
-                    ]
-                    prod = spec_shape[0] * spec_shape[1] * spec_shape[2]
-                    onnx_shape = list(shape)
-                    if onnx_shape != spec_shape:
-                        if onnx_shape[2] == prod:
-                            inputs_[i] = np.reshape(
-                                inputs_[i], [onnx_shape[0], onnx_shape[1]] + spec_shape
-                            )
-                        elif onnx_shape[1] * onnx_shape[2] == prod:
-                            inputs_[i] = np.reshape(
-                                inputs_[i], [1, onnx_shape[0]] + spec_shape
-                            )
-        input_dict = dict(zip(self.input_names, map(np.array, inputs_)))
-        _set_dtypes(input_dict, self.model)  # type: ignore
-
-        prediction = self.model.predict(input_dict, self.useCPUOnly)
-        output_values = [prediction[name] for name in self.output_names]
-
-        if not SupportedVersion.is_nd_array_supported(
-            self.minimum_ios_deployment_target
-        ):
-            for i, output_ in enumerate(output_values):
-                shape = output_.shape
-                # reshape the CoreML output to match Onnx's output shape
-                try:
-                    output_values[i] = np.reshape(output_, self.onnx_outputs_info[self.output_names[i]][2])  # type: ignore
-                except RuntimeError:
-                    print(
-                        "Output '%s' shape incompatible between CoreML (%s) and onnx (%s)"
-                        % (
-                            self.output_names[i],
-                            output_.shape,
-                            self.onnx_outputs_info[self.output_names[i]],
-                        )
-                    )
-
-        ## Type Cast to ONNX expected output types
-        for i, output_ in enumerate(output_values):
-            output_type = self.onnx_outputs_info[self.output_names[i]][1]
-            if TENSOR_TYPE_TO_NP_TYPE[output_type] != output_values[i].dtype:
-                output_values[i] = output_values[i].astype(
-                    TENSOR_TYPE_TO_NP_TYPE[output_type]
-                )
-
-        result = namedtupledict("Outputs", self.output_names)(
-            *output_values
-        )  # type: Tuple[Any, ...]
-        return result
diff --git a/coremltools/converters/onnx/_converter.py b/coremltools/converters/onnx/_converter.py
deleted file mode 100644
index 3708dd9e6..000000000
--- a/coremltools/converters/onnx/_converter.py
+++ /dev/null
@@ -1,941 +0,0 @@
-# Copyright (c) 2021, Apple Inc. All rights reserved.
-#
-# Use of this source code is governed by a BSD-3-clause license that can be
-# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-import numpy as np
-
-from ...models._deprecation import deprecated as _deprecated
-from coremltools.models.neural_network import NeuralNetworkBuilder  # type: ignore
-from coremltools.models import datatypes, MLModel  # type: ignore
-from coremltools.proto import FeatureTypes_pb2 as ft  # type: ignore
-from coremltools import (
-    _MINIMUM_CUSTOM_LAYER_SPEC_VERSION as IOS_11_2_SPEC_VERSION,
-)  # iOS 11.2
-from coremltools import (
-    _MINIMUM_CUSTOM_MODEL_SPEC_VERSION as IOS_12_SPEC_VERSION,
-)  # iOS 12.0
-from coremltools import _MINIMUM_NDARRAY_SPEC_VERSION as IOS_13_SPEC_VERSION  # iOS 13.0
-from coremltools import __version__ as ct_version
-from coremltools.models import _METADATA_VERSION, _METADATA_SOURCE
-
-from coremltools._deps import _HAS_ONNX
-
-# ML model passes
-from coremltools.converters.mil.backend.nn.passes.mlmodel_passes import (
-    remove_disconnected_layers,
-    transform_conv_crop,
-)
-
-if _HAS_ONNX:
-    import onnx
-    from onnx import shape_inference
-    from onnx import TensorProto
-
-    from typing import Tuple
-    from typing import Text, Union, Optional, Dict, Any, Iterable, Sequence, Callable, List
-
-    from ._operators import (
-        _convert_node,
-        _SEQUENCE_LAYERS_REGISTRY,
-        _ONNX_NODE_REGISTRY,
-        _add_const_inputs_if_required,
-    )
-    from ._operators_nd import _ONNX_NODE_REGISTRY_ND, _convert_node_nd
-
-    from ._graph import Graph, EdgeInfo, Transformer
-
-    from ._transformers import (
-        ConvAddFuser,
-        DropoutRemover,
-        ReshapeInitTensorFuser,
-        BNBroadcastedMulFuser,
-        BNBroadcastedAddFuser,
-        PixelShuffleFuser,
-        OutputRenamer,
-        AddModelInputsOutputs,
-        ConstantsToInitializers,
-        ImageScalerRemover,
-        ShapeOpRemover,
-        ConstantRemover,
-        ConstantFillToInitializers,
-        ReshapeTransposeReshape_pattern1,
-        CastOpRemover,
-        DeadCodeElimination,
-        PaddingOpRemover,
-    )
-
-    from ._error_utils import ErrorHandling
-    from ._graph_viz import plot_graph  # type: ignore
-
-
-USE_SHAPE_MAPPING = True
-
-DEBUG = False
-
-
-class SupportedVersion:
-    # Supported iOS Version
-    # New OS Version must be added at the end to maintain backward version index
-    supported_ios_version = ["11.2", "12", "13"]
-    IOS_13_VERSION = supported_ios_version.index("13")
-    ND_ARRARY_SUPPORT = IOS_13_VERSION
-
-    @staticmethod
-    def ios_support_check(minimum_ios_deployment_target):
-        return minimum_ios_deployment_target in SupportedVersion.supported_ios_version
-
-    @staticmethod
-    def is_nd_array_supported(minimum_ios_deployment_target):
-        if not SupportedVersion.ios_support_check(minimum_ios_deployment_target):
-            raise TypeError(
-                "{} not supported. Please provide one of target iOS: {}".format(
-                    minimum_ios_deployment_target,
-                    SupportedVersion.supported_ios_version
-                )
-            )
-
-        minimum_ios_deployment_target_index = SupportedVersion.supported_ios_version.index(
-            minimum_ios_deployment_target
-        )
-        return SupportedVersion.ND_ARRARY_SUPPORT <= minimum_ios_deployment_target_index
-
-    @staticmethod
-    def get_supported_ios():
-        return SupportedVersion.supported_ios_version
-
-    @staticmethod
-    def get_specification_version(minimum_ios_deployment_target):
-        if not SupportedVersion.ios_support_check(minimum_ios_deployment_target):
-            raise TypeError(
-                "{} not supported. Please provide one of target iOS: {}",
-                minimum_ios_deployment_target,
-                SupportedVersion.supported_ios_version,
-            )
-
-        if minimum_ios_deployment_target == "11.2":
-            return IOS_11_2_SPEC_VERSION
-        elif minimum_ios_deployment_target == "12":
-            return IOS_12_SPEC_VERSION
-        else:
-            return IOS_13_SPEC_VERSION
-
-
-"""
-inputs: list of tuples.
-      [Tuple]: [(name, type, shape)]
-"""
-
-
-def _make_coreml_input_features(
-    graph, onnx_coreml_input_shape_map, disable_coreml_rank5_mapping=False
-):  # type: (...) -> Sequence[Tuple[Text, datatypes.Array]]
-    """
-    If "disable_coreml_rank5_mapping" is False, then:
-
-    ONNX shapes to CoreML static shapes mapping
-    length==1: [C]
-    length==2: [B,C]
-    length==3: [C,H,W] or [Seq,B,C]
-    length==4: [B,C,H,W]
-
-    If "disable_coreml_rank5_mapping" is True, then
-    onnx shapes are mapped "as is" to CoreML.
-    """
-    inputs = graph.inputs
-    op_types = graph.blob_to_op_type
-    features = []
-    for input_ in inputs:
-        shape = input_[2]
-        if disable_coreml_rank5_mapping:
-            if len(shape) > 5:
-                raise ValueError(
-                    "ONNX input %s has a rank greater than 5, which is not supported in CoreML framework"
-                    % str(input_[0])
-                )
-            else:
-                features.append((str(input_[0]), datatypes.Array(*shape)))
-            continue
-
-        if USE_SHAPE_MAPPING and input_[0] in onnx_coreml_input_shape_map:
-            mapp = onnx_coreml_input_shape_map[input_[0]]
-            if len(mapp) != len(shape):
-                raise ValueError(
-                    "Incorrect value in onnx_coreml_input_shape_map argument"
-                )
-            graph.onnx_coreml_shape_mapping[input_[0]] = mapp
-            coreml_shape = [1, 1, 1]
-            for i in range(3):
-                if (i + 2) in mapp:
-                    coreml_shape[i] = shape[mapp.index(i + 2)]
-            shape = coreml_shape
-        else:
-            if len(shape) == 0:
-                shape = [1, 1, 1]
-            elif len(shape) == 1:
-                # assume [C]
-                if USE_SHAPE_MAPPING:
-                    graph.onnx_coreml_shape_mapping[input_[0]] = [2]
-            elif len(shape) == 2:
-                # assume [Batch,C]
-                shape = [shape[1]]
-                if USE_SHAPE_MAPPING:
-                    graph.onnx_coreml_shape_mapping[input_[0]] = [1, 2]
-            elif len(shape) == 3:
-                # assume [C,H,W] unless its connected an op that bestows another mapping
-                if input_[0] in op_types and len(op_types[input_[0]]) == 1:
-                    if str(op_types[input_[0]][0]) in _SEQUENCE_LAYERS_REGISTRY:
-                        # (Seq,B,C)
-                        shape = [shape[2]]
-                        if USE_SHAPE_MAPPING:
-                            graph.onnx_coreml_shape_mapping[input_[0]] = [0, 1, 2]
-                    elif str(op_types[input_[0]][0]) in [
-                        "MaxPool",
-                        "AveragePool",
-                        "BatchNormalization",
-                        "GlobalAveragePool",
-                        "GlobalLpPool",
-                        "GlobalMaxPool",
-                        "InstanceNormalization",
-                        "LRN",
-                        "LpPool",
-                        "Conv",
-                        "ConvTranspose",
-                    ]:
-                        # (B,C,W)
-                        shape = [shape[1], 1, shape[2]]
-                        if USE_SHAPE_MAPPING:
-                            graph.onnx_coreml_shape_mapping[input_[0]] = [1, 2, 4]
-                    else:
-                        if USE_SHAPE_MAPPING:
-                            graph.onnx_coreml_shape_mapping[input_[0]] = [2, 3, 4]
-                else:
-                    if USE_SHAPE_MAPPING:
-                        graph.onnx_coreml_shape_mapping[input_[0]] = [2, 3, 4]
-            elif len(shape) == 4:  # (B,C,H,W) --> (C,H,W)
-                shape = shape[1:]
-                if USE_SHAPE_MAPPING:
-                    graph.onnx_coreml_shape_mapping[input_[0]] = [1, 2, 3, 4]
-            else:
-                raise ValueError(
-                    "CoreML input cannot be more than rank 4. Input shape: %s, input: '%s' "
-                    % (str(shape), str(input_[0]))
-                )
-        features.append((str(input_[0]), datatypes.Array(*shape)))
-    return features
-
-
-"""
-outputs: list of tuples.
-      [Tuple]: [(name, type, shape)]
-"""
-
-
-def _make_coreml_output_features(
-    graph, forceShape=False, disable_coreml_rank5_mapping=False
-):  # type: (...) -> Sequence[Tuple[Text, datatypes.Array]]
-    features = []
-    outputs = graph.outputs
-    op_types = graph.blob_from_op_type
-    ops_allowing_zerod_output = {"Size"}
-
-    for output_ in outputs:
-        if op_types[output_[0]] in ops_allowing_zerod_output and len(output_[2]) == 0:
-            output_ = list(output_)
-            output_[2] = (1,)
-
-        if disable_coreml_rank5_mapping:
-            shape = output_[2]
-            if len(shape) > 5:
-                raise ValueError(
-                    "ONNX output %s has a rank greater than 5, which is not supported in CoreML framework"
-                    % str(output_[0])
-                )
-            else:
-                features.append((str(output_[0]), datatypes.Array(*shape)))
-            continue
-
-        if not forceShape:
-            features.append((str(output_[0]), None))
-        else:
-            shape = output_[2]
-            if len(shape) == 0:
-                shape = [1, 1, 1]
-            elif len(shape) == 1:
-                pass
-            elif len(shape) == 3:
-                if (
-                    output_[0] in op_types
-                    and str(op_types[output_[0]]) in _SEQUENCE_LAYERS_REGISTRY
-                ):
-                    # onnx shape: (Seq,B,C)
-                    shape = [shape[2]]
-            elif len(shape) == 4:  # (B,C,H,W) --> (C,H,W)
-                shape = shape[1:]
-            else:
-                shape = None  # output shape need not be specified for CoreML.
-            if shape is None:
-                features.append((str(output_[0]), shape))
-            else:
-                features.append((str(output_[0]), datatypes.Array(*shape)))
-    return features
-
-
-def _check_unsupported_ops(
-    nodes, disable_coreml_rank5_mapping=False
-):  # type: (...) -> None
-    unsupported_op_types = []  # type: List[Text]
-    for node in nodes:
-
-        if disable_coreml_rank5_mapping:
-            if (
-                node.op_type not in _ONNX_NODE_REGISTRY_ND
-                and node.op_type not in unsupported_op_types
-            ):
-                unsupported_op_types.append(node.op_type)
-            continue
-
-        if (
-            node.op_type not in _ONNX_NODE_REGISTRY
-            and node.op_type not in unsupported_op_types
-        ):
-            unsupported_op_types.append(node.op_type)
-
-    coreml_3_rerun_message = ""
-    if not disable_coreml_rank5_mapping:
-        coreml_3_rerun_message = (
-            "\nPlease try converting again by providing the additonal argument, "
-            "minimum_ios_deployment_target=13"
-            " and making sure you have the latest coremltools package"
-        )
-    if len(unsupported_op_types) > 0:
-        raise NotImplementedError(
-            "Unsupported ONNX ops of type: %s %s"
-            % (",".join(unsupported_op_types), coreml_3_rerun_message)
-        )
-
-
-def _update_multiarray_to_float32(
-    feature,  # type: Any
-):  # type : (...) -> None
-    if feature.type.HasField("multiArrayType"):
-        feature.type.multiArrayType.dataType = ft.ArrayFeatureType.FLOAT32
-
-
-def _update_multiarray_to_int32(
-    feature,  # type: Any
-):  # type : (...) -> None
-    if feature.type.HasField("multiArrayType"):
-        feature.type.multiArrayType.dataType = ft.ArrayFeatureType.INT32
-
-
-def _transform_coreml_dtypes(
-    builder,  # type : NeuralNetworkBuilder
-    inputs,  # type: List[EdgeInfo]
-    outputs,  # type: List[EdgeInfo]
-):
-    # type: (...) -> None
-
-    """ Make sure ONNX input/output data types are mapped to the equivalent CoreML types
-    """
-    for i, input_ in enumerate(inputs):
-        onnx_type = input_[1]
-        if onnx_type == TensorProto.FLOAT:
-            _update_multiarray_to_float32(builder.spec.description.input[i])
-        elif onnx_type == TensorProto.DOUBLE:
-            continue
-        elif onnx_type == TensorProto.INT32 or onnx_type == TensorProto.INT64:
-            _update_multiarray_to_int32(builder.spec.description.input[i])
-        elif onnx_type == TensorProto.BOOL:
-            _update_multiarray_to_float32(builder.spec.description.input[i])
-        else:
-            raise TypeError("Input must be of of type FLOAT, DOUBLE, INT32 or INT64")
-
-    for i, output_ in enumerate(outputs):
-        onnx_type = output_[1]
-        if onnx_type == TensorProto.FLOAT:
-            _update_multiarray_to_float32(builder.spec.description.output[i])
-        elif onnx_type == TensorProto.DOUBLE:
-            continue
-        elif onnx_type == TensorProto.INT32 or onnx_type == TensorProto.INT64:
-            _update_multiarray_to_int32(builder.spec.description.output[i])
-        elif onnx_type == TensorProto.BOOL:
-            _update_multiarray_to_float32(builder.spec.description.output[i])
-        else:
-            raise TypeError("Output must be of of type FLOAT, DOUBLE, INT32 or INT64")
-
-
-def _convert_multiarray_output_to_image(
-    spec,  # type: Any
-    feature_name,  # type: Text
-    is_bgr=False,  # type: bool
-):
-    # type: (...) -> None
-    for output in spec.description.output:
-        if output.name != feature_name:
-            continue
-        if output.type.WhichOneof("Type") != "multiArrayType":
-            raise ValueError("{} is not a multiarray type".format(output.name,))
-        array_shape = tuple(output.type.multiArrayType.shape)
-        if len(array_shape) == 2:
-            height, width = array_shape
-            output.type.imageType.colorSpace = ft.ImageFeatureType.ColorSpace.Value(
-                "GRAYSCALE"
-            )
-        else:
-            if len(array_shape) == 4:
-                if array_shape[0] != 1:
-                    raise ValueError(
-                        "Shape {} is not supported for image output".format(
-                            array_shape,
-                        )
-                    )
-                array_shape = array_shape[1:]
-
-            channels, height, width = array_shape
-
-            if channels == 1:
-                output.type.imageType.colorSpace = ft.ImageFeatureType.ColorSpace.Value(
-                    "GRAYSCALE"
-                )
-            elif channels == 3:
-                if is_bgr:
-                    output.type.imageType.colorSpace = ft.ImageFeatureType.ColorSpace.Value(
-                        "BGR"
-                    )
-                else:
-                    output.type.imageType.colorSpace = ft.ImageFeatureType.ColorSpace.Value(
-                        "RGB"
-                    )
-            else:
-                raise ValueError(
-                    "Channel Value {} is not supported for image output".format(
-                        channels,
-                    )
-                )
-
-        output.type.imageType.width = width
-        output.type.imageType.height = height
-
-
-def _set_deprocessing(
-    is_grayscale,  # type: bool
-    builder,  # type: NeuralNetworkBuilder
-    deprocessing_args,  # type: Dict[Text, Any]
-    input_name,  # type: Text
-    output_name,  # type: Text
-):
-    # type: (...) -> None
-    is_bgr = deprocessing_args.get("is_bgr", False)
-
-    image_scale = deprocessing_args.get("image_scale", 1.0)
-
-    if is_grayscale:
-        gray_bias = deprocessing_args.get("gray_bias", 0.0)
-        W = np.array([image_scale])
-        b = np.array([gray_bias])
-    else:
-        W = np.array([image_scale, image_scale, image_scale])
-
-        red_bias = deprocessing_args.get("red_bias", 0.0)
-        green_bias = deprocessing_args.get("green_bias", 0.0)
-        blue_bias = deprocessing_args.get("blue_bias", 0.0)
-
-        if not is_bgr:
-            b = np.array([red_bias, green_bias, blue_bias,])
-        else:
-            b = np.array([blue_bias, green_bias, red_bias,])
-    builder.add_scale(
-        name=input_name,
-        W=W,
-        b=b,
-        has_bias=True,
-        shape_scale=W.shape,
-        shape_bias=b.shape,
-        input_name=input_name,
-        output_name=output_name,
-    )
-
-
-def _prepare_onnx_graph(
-    graph, transformers, onnx_ir_version
-):  # type: (Graph, Iterable[Transformer]) -> Graph
-    graph_ = Graph.from_onnx(graph, onnx_ir_version)
-    if DEBUG:
-        plot_graph(graph_, graph_img_path="/tmp/graph_raw.pdf")
-    graph_ = graph_.transformed(transformers)
-    if DEBUG:
-        plot_graph(graph_, graph_img_path="/tmp/graph_opt.pdf")
-    return graph_
-
-
-@_deprecated()
-def convert(
-    model,  # type: Union[onnx.ModelProto, Text]
-    mode=None,  # type: Optional[Text]
-    image_input_names=[],  # type: Sequence[Text]
-    preprocessing_args={},  # type: Dict[Text, Any]
-    image_output_names=[],  # type: Sequence[Text]
-    deprocessing_args={},  # type: Dict[Text, Any]
-    class_labels=None,  # type: Union[Text, Iterable[Text], None]
-    predicted_feature_name="classLabel",  # type: Text
-    add_custom_layers=False,  # type: bool
-    custom_conversion_functions={},  # type: Dict[Text, Any]
-    onnx_coreml_input_shape_map={},  # type: Dict[Text, List[int,...]]
-    minimum_ios_deployment_target="12",
-):
-    # type: (...) -> MLModel
-    """
-    WARNING: This function is deprecated. It will be removed in the 6.0.
-
-    Convert ONNX model to CoreML.
-    
-    Parameters
-    ----------
-    model:
-        An ONNX model with parameters loaded in the ONNX package, or path to file
-        with models.
-        
-    mode: 'classifier', 'regressor' or None
-    
-        Mode of the converted coreml model:
-        
-        * ``'classifier'``: a NeuralNetworkClassifier spec will be constructed.
-        * ``'regressor'``: a NeuralNetworkRegressor spec will be constructed.
-        
-    preprocessing_args:
-        The ``'is_bgr'``, ``'red_bias'``, ``'green_bias'``, ``'blue_bias'``, ``'gray_bias'``,
-        and ``'image_scale'`` keys have the same meaning as the pre-processing arguments for
-        `NeuralNetworkBuilder <https://coremltools.readme.io/reference/modelsneural_network>`_.
-    
-    deprocessing_args:
-        Same as ``'preprocessing_args'`` but for de-processing.
-    
-    class_labels:
-        * As a string, it represents the name of the file which contains
-          the classification labels (one per line).
-        * As a list of strings, it represents a list of categories that map
-          the index of the output of a neural network to labels in a classifier.
-    
-    predicted_feature_name:
-        Name of the output feature for the class labels exposed in the Core ML
-        model (applies to classifiers only). Defaults to ``'classLabel'``.
-    
-    add_custom_layers: bool
-        Flag to turn on additional custom CoreML layers for unsupported ONNX ops or
-    	attributes within a supported op.
-    
-    custom_conversion_functions: dict()
-        * A dictionary with keys corresponding to the names/types of ONNX ops and values as 
-          functions taking an object of the ``coreml-tools`` class:
-          ``'NeuralNetworkBuilder'``, ``'Graph'`` (see ``onnx-coreml/_graph.Graph``),
-          ``'Node'`` (see ``onnx-coreml/_graph.Node``), and 
-          ``'ErrorHandling'`` (see ``onnx-coreml/_error_utils.ErrorHandling``).
-        * This custom conversion function gets full control and responsibility for 
-          converting a given ONNX op.
-        * The function returns nothing and is responsible for adding an equivalent CoreML
-          layer via ``'NeuralNetworkBuilder'``.
-    
-    onnx_coreml_input_shape_map: dict() (Optional) 
-        * A dictionary with keys corresponding to the model input names.
-        * Values are a list of integers that specify how the shape of the input is mapped
-          to CoreML.
-        * Convention used for CoreML shapes is ``0: Sequence``, ``1: Batch``,
-          ``2: channel``, ``3: height``, ``4: width``. For example, an input of rank 2
-          could be mapped as ``[3,4]`` (H,W) or ``[1,2]`` (B,C), and so on. This is
-          ignored if ``minimum_ios_deployment_target`` is set to ``13``.
-    
-    minimum_ios_deployment_target: str
-        Target Deployment iOS Version (default: ``'12'``). Supported iOS version options:
-        ``'11.2'``, ``'12'``, ``'13'``. CoreML model produced by the converter will be
-        compatible with the iOS version specified in this argument. For example, if
-        ``minimum_ios_deployment_target = '12'``, the converter would utilize only CoreML
-        features released up to version iOS12 (equivalent to macOS 10.14, watchOS 5, and
-        so on). iOS 11.2 (CoreML 0.8) does not support ``resize_bilinear`` and
-        ``crop_resize`` layers. See `supported v0.8 features <https://github.com/apple/coremltools/releases/tag/v0.8>`_.
-        iOS 12 (CoreML 2.0), see `supported v2.0 features <https://github.com/apple/coremltools/releases/tag/v2.0>`_.
-        iSO 13 (CoreML 3.0), see `supported v3.0 features <https://github.com/apple/coremltools/releases/tag/3.0-beta6>`_.
-    
-    
-    Returns
-    -------
-    model: A coreml model.
-    """
-    if not _HAS_ONNX:
-        raise ModuleNotFoundError("Missing ONNX package.")
-
-    if isinstance(model, Text):
-        onnx_model = onnx.load(model)
-    elif isinstance(model, onnx.ModelProto):
-        onnx_model = model
-    else:
-        raise TypeError("Model must be file path to .onnx file or onnx loaded model")
-
-    if not SupportedVersion.ios_support_check(minimum_ios_deployment_target):
-        raise TypeError(
-            "{} not supported. Please provide one of target iOS: {}",
-            minimum_ios_deployment_target,
-            SupportedVersion.get_supported_ios(),
-        )
-
-    global USE_SHAPE_MAPPING
-    disable_coreml_rank5_mapping = False
-    if SupportedVersion.is_nd_array_supported(minimum_ios_deployment_target):
-        disable_coreml_rank5_mapping = True
-
-    if disable_coreml_rank5_mapping:
-        USE_SHAPE_MAPPING = False
-    else:
-        USE_SHAPE_MAPPING = True
-
-    """
-    First, apply a few optimizations to the ONNX graph,
-    in preparation for conversion to CoreML.
-    """
-
-    # Using Dummy transformation to conditionally disable certain transformation
-    class DummyTransformation(object):
-        def __call__(self, graph):
-            return graph
-
-    transformers = [
-        ConstantsToInitializers(),
-        ShapeOpRemover(),
-        ConstantRemover(),
-        CastOpRemover(),
-        PaddingOpRemover(),
-        ReshapeInitTensorFuser(),
-        DropoutRemover(),
-        DeadCodeElimination(),
-        ConvAddFuser(),
-        BNBroadcastedMulFuser(),
-        BNBroadcastedAddFuser(),
-        ReshapeTransposeReshape_pattern1(),
-        PixelShuffleFuser(),
-        AddModelInputsOutputs()
-        if not disable_coreml_rank5_mapping
-        else DummyTransformation(),
-        ConstantFillToInitializers(),
-    ]  # type: Iterable[Transformer]
-
-    onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
-    graph = _prepare_onnx_graph(onnx_model.graph, transformers, onnx_model.ir_version)
-
-    """
-    Check for ImageScalar nodes in ONNX, this will indicate whether input image preprocessing needs
-    to be added to the CoreML graph or not.
-    """
-    # are there ImageScaler nodes in the Graph?
-    # If yes then add the info from it to the "preprocessing_args" dictionary, if the dictionary is not
-    # already provided by the user
-    if not bool(preprocessing_args):
-        for node in graph.nodes:
-            if node.op_type == "ImageScaler":
-                inp_name = node.inputs[0]
-                scale = node.attrs.get("scale", 1.0)
-                bias = node.attrs.get("bias", [0, 0, 0])
-                if not (len(bias) == 1 or len(bias) == 3):
-                    continue
-                if "image_scale" in preprocessing_args:
-                    preprocessing_args["image_scale"][inp_name] = scale
-                else:
-                    preprocessing_args["image_scale"] = {inp_name: scale}
-                if len(bias) == 3:
-                    for i, color in enumerate(["red", "green", "blue"]):
-                        if color + "_bias" in preprocessing_args:
-                            preprocessing_args[color + "_bias"][inp_name] = bias[i]
-                        else:
-                            preprocessing_args[color + "_bias"] = {inp_name: bias[i]}
-                else:
-                    if "gray_bias" in preprocessing_args:
-                        preprocessing_args["gray_bias"][inp_name] = bias[0]
-                    else:
-                        preprocessing_args["gray_bias"] = {inp_name: bias[0]}
-                if inp_name not in image_input_names:
-                    image_input_names.append(inp_name)  # type: ignore
-
-    # remove all ImageScaler ops
-    graph = graph.transformed([ImageScalerRemover()])
-
-    """
-    Gather information (name, shape) for model inputs and outputs
-    This information is then used to initialize the neural network builder object of coremltools.
-    The builder object is later used to add layers to the CoreML model.
-    """
-
-    # Make CoreML input and output features by gathering shape info and
-    # interpreting it for CoreML
-    input_features = _make_coreml_input_features(
-        graph, onnx_coreml_input_shape_map, disable_coreml_rank5_mapping
-    )
-    if len(image_output_names) > 0:
-        output_features = _make_coreml_output_features(
-            graph,
-            forceShape=True,
-            disable_coreml_rank5_mapping=disable_coreml_rank5_mapping,
-        )
-    else:
-        output_features = _make_coreml_output_features(
-            graph, disable_coreml_rank5_mapping=disable_coreml_rank5_mapping
-        )
-
-    builder = NeuralNetworkBuilder(
-        input_features,
-        output_features,
-        mode=mode,
-        disable_rank5_shape_mapping=disable_coreml_rank5_mapping,
-    )
-
-    # TODO: To be removed once, auto-downgrading of spec version is enabled
-    builder.spec.specificationVersion = SupportedVersion.get_specification_version(
-        minimum_ios_deployment_target
-    )
-
-    """
-    Set CoreML input,output types (float, double, int) same as onnx types, if supported
-    """
-    _transform_coreml_dtypes(builder, graph.inputs, graph.outputs)
-
-    """what follows is some book-keeping to support outputs of type image.
-    """
-
-    is_deprocess_bgr_only = (len(deprocessing_args) == 1) and (
-        "is_bgr" in deprocessing_args
-    )
-    add_deprocess = (
-        (len(image_output_names) > 0)
-        and (len(deprocessing_args) > 0)
-        and (not is_deprocess_bgr_only)
-    )
-
-    if add_deprocess:
-        mapping = {}
-        for f in output_features:
-            output_name = f[0]
-            mapping[output_name] = graph.get_unique_edge_name(output_name)
-        graph = OutputRenamer(mapping)(graph)
-
-    if len(image_input_names) > 0:
-        builder.set_pre_processing_parameters(
-            image_input_names=image_input_names,
-            is_bgr=preprocessing_args.get("is_bgr", False),
-            red_bias=preprocessing_args.get("red_bias", 0.0),
-            green_bias=preprocessing_args.get("green_bias", 0.0),
-            blue_bias=preprocessing_args.get("blue_bias", 0.0),
-            gray_bias=preprocessing_args.get("gray_bias", 0.0),
-            image_scale=preprocessing_args.get("image_scale", 1.0),
-        )
-
-    preprocessing_args.clear()
-
-    if len(image_output_names) > 0:
-        for f in output_features:
-            f_name = f[0]
-            if f_name in image_output_names:
-                is_bgr = deprocessing_args.get("is_bgr", False)
-                _convert_multiarray_output_to_image(builder.spec, f_name, is_bgr=is_bgr)
-
-    """
-    Iterate through all the ONNX ops and translate them to CoreML layers, one by one.
-    """
-
-    """
-    before proceeding to start the layer translation process,
-    check whether there is an op in the ONNX graph, whose translation function is not yet
-    implemented in the converter or which is not supported in the CoreML framework. If so,
-    raise an error before starting the process.
-    (if the user desires to add a custom layer then this check is not required)
-    """
-    if not add_custom_layers:
-        _check_unsupported_ops(graph.nodes, disable_coreml_rank5_mapping)
-
-    """
-    ErrorHandling is a generic class, useful to store a variety of parameters during the conversion process
-    """
-    err = ErrorHandling(add_custom_layers, custom_conversion_functions)
-
-    for i, node in enumerate(graph.nodes):
-        print(
-            "%d/%d: Converting Node Type %s" % (i + 1, len(graph.nodes), node.op_type)
-        )
-        if disable_coreml_rank5_mapping:
-            _convert_node_nd(builder, node, graph, err)
-        else:
-            _add_const_inputs_if_required(builder, node, graph, err)
-            _convert_node(builder, node, graph, err)
-
-    if DEBUG:
-        plot_graph(
-            graph,
-            graph_img_path="/tmp/after_conversion.pdf",
-            show_coreml_mapped_shapes=not disable_coreml_rank5_mapping,
-        )
-
-    if add_deprocess:
-        for f in output_features:
-            output_name = f[0]
-            if output_name not in image_output_names:
-                continue
-            output_shape = f[1].dimensions
-            if len(output_shape) == 2 or output_shape[0] == 1:
-                is_grayscale = True
-            elif output_shape[0] == 3:
-                is_grayscale = False
-            else:
-                raise ValueError("Output must be RGB image or Grayscale")
-            _set_deprocessing(
-                is_grayscale,
-                builder,
-                deprocessing_args,
-                mapping[output_name],
-                output_name,
-            )
-
-    if class_labels is not None:
-        if isinstance(class_labels, Text):
-            labels = [
-                l.strip() for l in open(class_labels).readlines()
-            ]  # type: Sequence[Text]
-        elif isinstance(class_labels, list):
-            labels = class_labels
-        else:
-            raise TypeError(
-                "synset variable of unknown type. Type found: {}. \
-                Expected either string or list of strings.".format(
-                    type(class_labels),
-                )
-            )
-
-        builder.set_class_labels(
-            class_labels=labels, predicted_feature_name=predicted_feature_name
-        )
-
-    def _add_informative_description(feature, raise_error=True):
-        if feature.type.WhichOneof("Type") == "multiArrayType":
-            if (
-                feature.name in graph.onnx_coreml_shape_mapping
-                and feature.name in graph.shape_dict
-            ):
-                mapp = graph.onnx_coreml_shape_mapping[feature.name]
-                onnx_shape = graph.shape_dict[feature.name]
-                if raise_error:
-                    assert len(mapp) == len(onnx_shape), "Something wrong in shape"
-                if len(mapp) == len(onnx_shape):
-                    shape = []
-                    for i in range(5):
-                        if i in mapp:
-                            shape += [int(onnx_shape[mapp.index(i)])]
-                        else:
-                            shape += [1]
-                    msg = "MultiArray of shape {}. The first and second dimensions correspond to sequence and batch size, respectively".format(
-                        str(tuple(shape))
-                    )
-                    feature.shortDescription += msg
-
-    optional_input_names = []
-    for tup in graph.optional_inputs:
-        optional_input_names.append(tup[0])
-    optional_output_names = []
-    for tup in graph.optional_outputs:
-        optional_output_names.append(tup[0])
-
-    # add description for inputs and outputs shapes
-    remove_input_id = []
-    for i, input_ in enumerate(builder.spec.description.input):
-        if input_.name not in optional_input_names:
-            if not disable_coreml_rank5_mapping:
-                _add_informative_description(input_)
-        else:
-            remove_input_id.append(i)
-    remove_output_id = []
-    for i, output_ in enumerate(builder.spec.description.output):
-        if output_.name not in optional_output_names:
-            if not disable_coreml_rank5_mapping:
-                _add_informative_description(output_, raise_error=False)
-        else:
-            remove_output_id.append(i)
-
-    for index in sorted(remove_input_id, reverse=True):
-        del builder.spec.description.input[index]
-    for index in sorted(remove_output_id, reverse=True):
-        del builder.spec.description.output[index]
-
-    if len(graph.optional_inputs) > 0 or len(graph.optional_outputs):
-        builder.add_optionals(graph.optional_inputs, graph.optional_outputs)
-
-    # Check for specification version and target ios compatibility
-    if (
-        minimum_ios_deployment_target == "11.2"
-        and builder.spec.WhichOneof("Type") == "neuralNetwork"
-    ):
-        nn_spec = builder.spec.neuralNetwork
-        for layer in nn_spec.layers:
-            if (
-                layer.WhichOneof("layer") == "resizeBilinear"
-                or layer.WhichOneof("layer") == "cropResize"
-            ):
-                raise TypeError(
-                    "{} not supported with target iOS 11.2 please provide higher target iOS".format(
-                        layer.WhichOneof("layer")
-                    )
-                )
-
-    # Optimize ML Model Spec
-    ml_model_passes = [remove_disconnected_layers, transform_conv_crop]
-    for opt in ml_model_passes:
-        opt(builder.spec)
-
-    print("Translation to CoreML spec completed. Now compiling the CoreML model.")
-    try:
-        if DEBUG:
-            import coremltools
-
-            coremltools.models.utils.save_spec(
-                builder.spec, "/tmp/node_model_raw_spec.mlmodel"
-            )
-            from coremltools.models.neural_network.printer import print_network_spec
-
-            print_network_spec(builder.spec, style="coding")
-        mlmodel = MLModel(builder.spec)
-    except RuntimeError as e:
-        raise ValueError("Compilation failed: {}".format(str(e)))
-    print("Model Compilation done.")
-
-    # print information about all ops for which custom layers have been added
-    if len(err.custom_layer_nodes) > 0:
-        print("\n")
-        print(
-            "Custom layers have been added to the CoreML model "
-            "corresponding to the following ops in the onnx model: "
-        )
-        for i, node in enumerate(err.custom_layer_nodes):
-            input_info = []
-            for input_ in node.inputs:
-                input_info.append(
-                    (
-                        str(input_),
-                        graph.shape_dict.get(input_, str("Shape not available")),
-                    )
-                )
-            output_info = []
-            for output_ in node.outputs:
-                output_info.append(
-                    (
-                        str(output_),
-                        graph.shape_dict.get(output_, str("Shape not available")),
-                    )
-                )
-            print(
-                "{}/{}: op type: {}, op input names and shapes: {}, op output names and shapes: {}".format(
-                    i + 1,
-                    len(err.custom_layer_nodes),
-                    node.op_type,
-                    str(input_info),
-                    str(output_info),
-                )
-            )
-
-    mlmodel.user_defined_metadata[_METADATA_VERSION] = ct_version
-    mlmodel.user_defined_metadata[_METADATA_SOURCE] = "onnx=={0}".format(
-        onnx.__version__
-    )
-    return mlmodel
diff --git a/coremltools/converters/onnx/_error_utils.py b/coremltools/converters/onnx/_error_utils.py
deleted file mode 100644
index aaa1d017a..000000000
--- a/coremltools/converters/onnx/_error_utils.py
+++ /dev/null
@@ -1,102 +0,0 @@
-
-from typing import Dict, Text, Any, Callable
-from coremltools.models.neural_network import NeuralNetworkBuilder  # type: ignore
-from ._graph import Node, Graph
-
-
-class ErrorHandling(object):
-    """
-  To handle errors and addition of custom layers
-  """
-
-    def __init__(
-        self,
-        add_custom_layers=False,  # type: bool
-        custom_conversion_functions=dict(),  # type: Dict[Text, Any]
-        custom_layer_nodes=[],  # type : List[Node]
-    ):
-        # type: (...) -> None
-        self.add_custom_layers = add_custom_layers
-        self.custom_conversion_functions = custom_conversion_functions
-        self.custom_layer_nodes = custom_layer_nodes
-
-        self.rerun_suggestion = (
-            "\n Please try converting with higher minimum_ios_deployment_target.\n"
-            "You can also provide custom function/layer to convert the model."
-        )
-
-    def unsupported_op(
-        self, node,  # type: Node
-    ):
-        # type: (...) -> Callable[[Any, Node, Graph, ErrorHandling], None]
-        """
-      Either raise an error for an unsupported op type or return custom layer add function
-      """
-        if self.add_custom_layers:
-            from ._operators import _convert_custom
-
-            return _convert_custom
-        else:
-            raise TypeError(
-                "ONNX node of type {} is not supported. {}\n".format(
-                    node.op_type, self.rerun_suggestion
-                )
-            )
-
-    def unsupported_op_configuration(
-        self,
-        builder,  # type: NeuralNetworkBuilder
-        node,  # type: Node
-        graph,  # type: Graph
-        err_message,  # type: Text
-    ):
-        # type: (...) -> None
-        """
-      Either raise an error for an unsupported attribute or add a custom layer.
-      """
-        if self.add_custom_layers:
-            from ._operators import _convert_custom
-
-            _convert_custom(builder, node, graph, self)
-        else:
-            raise TypeError(
-                "Error while converting op of type: {}. Error message: {} {}\n".format(
-                    node.op_type, err_message, self.rerun_suggestion
-                )
-            )
-
-    def missing_initializer(
-        self,
-        node,  # type: Node
-        err_message,  # type: Text
-    ):
-        # type: (...) -> None
-        """
-      Missing initializer error
-      """
-        raise ValueError(
-            "Missing initializer error in op of type {}, with input name = {}, "
-            "output name = {}. Error message: {} {}\n".format(
-                node.op_type,
-                node.inputs[0],
-                node.outputs[0],
-                err_message,
-                self.rerun_suggestion,
-            )
-        )
-
-    def unsupported_feature_warning(
-        self,
-        node,  # type: Node
-        warn_message,  # type: Text
-    ):
-        # type: (...) -> None
-        """
-      Unsupported feature warning
-      """
-        print(
-            "Warning: Unsupported Feature in op of type {}, with input name = {}, "
-            "output name = {}. Warning message: {}\n".format(
-                node.op_type, node.inputs[0], node.outputs[0], warn_message
-            )
-        )
diff --git a/coremltools/converters/onnx/_graph.py b/coremltools/converters/onnx/_graph.py
deleted file mode 100644
index db8a20ab7..000000000
--- a/coremltools/converters/onnx/_graph.py
+++ /dev/null
@@ -1,313 +0,0 @@
-
-from onnx import (
-    numpy_helper,
-    ValueInfoProto,
-    AttributeProto,
-    GraphProto,
-    NodeProto,
-    TensorProto,
-    TensorShapeProto,
-)
-from typing import Any, Text, Iterable, List, Dict, Sequence, Optional, Tuple, Union
-from typing_extensions import Protocol
-import numpy as np
-
-
-class Transformer(Protocol):
-    def __call__(self, graph):  # type: (Graph) -> Graph
-        pass
-
-
-EdgeInfo = Tuple[Text, Any, TensorShapeProto]
-AttributeValue = Any  # TODO Union[Sequence[float], Sequence[int], Sequence[Text], Sequence[TensorProto], Sequence[GraphProto]]
-
-
-def _input_from_onnx_input(input):  # type: (ValueInfoProto) -> EdgeInfo
-    name = input.name
-    type = input.type.tensor_type.elem_type
-    shape = tuple([d.dim_value for d in input.type.tensor_type.shape.dim])
-    return (name, type, shape)
-
-
-def _convertAttributeProto(onnx_arg):  # type: (AttributeProto) -> AttributeValue
-    """
-    Convert an ONNX AttributeProto into an appropriate Python object
-    for the type.
-    NB: Tensor attribute gets returned as numpy array
-    """
-    if onnx_arg.HasField("f"):
-        return onnx_arg.f
-    elif onnx_arg.HasField("i"):
-        return onnx_arg.i
-    elif onnx_arg.HasField("s"):
-        return onnx_arg.s
-    elif onnx_arg.HasField("t"):
-        return numpy_helper.to_array(onnx_arg.t)
-    elif len(onnx_arg.floats):
-        return list(onnx_arg.floats)
-    elif len(onnx_arg.ints):
-        return list(onnx_arg.ints)
-    elif len(onnx_arg.strings):
-        return list(onnx_arg.strings)
-    else:
-        return None
-
-
-def _extract_node_names(graph):  # type : (Graph) -> List[Text]
-    node_names = []
-    for node in graph.nodes:
-        node_names.append(node.name)
-    return node_names
-
-
-def _apply_graph_transformations(
-    graph, transformers
-):  # (Graph, Iterable[Transformer]) -> Graph
-    old_node_names = _extract_node_names(graph)  # type: ignore
-    while True:
-        for transformer in transformers:
-            graph = transformer(graph)
-        new_node_names = _extract_node_names(graph)  # type: ignore
-        if new_node_names == old_node_names:
-            break
-        old_node_names = new_node_names
-    return graph
-
-
-class Attributes(Dict[Text, Any]):
-    @staticmethod
-    def from_onnx(args):  # type: (Iterable[AttributeProto]) -> Attributes
-        d = Attributes()
-        for arg in args:
-            val = _convertAttributeProto(arg)
-            if val is not None:
-                d[arg.name] = val
-        return d
-
-
-class Node(object):
-    def __init__(
-        self,
-        name,  # type: Optional[Text]
-        op_type,  # type: Text
-        attrs,  # type: Dict[Text, AttributeValue]
-        inputs,  # type: List[Text]
-        outputs,  # type: List[Text]
-    ):
-        # type: (...) -> None
-        self.name = name
-        self.op_type = op_type
-        self.attrs = attrs
-        self.inputs = inputs
-        self.outputs = outputs
-        self.input_tensors = {}  # type: Dict[Text, np._ArrayLike[Any]]
-        self.parents = []  # type: List[Node]
-        self.children = []  # type: List[Node]
-        self.metadata = {}  # type: Dict[Any, Any]
-
-    def add_parent(self, parent_node):  # type: (Node) -> None
-        assert parent_node not in self.parents
-        self.parents.append(parent_node)
-        if self not in parent_node.children:
-            parent_node.children.append(self)
-
-    def add_child(self, child_node):  # type: (Node) -> None
-        assert child_node not in self.children
-        self.children.append(child_node)
-        if self not in child_node.parents:
-            child_node.parents.append(self)
-
-    def get_only_parent(self):  # type: () -> Node
-        if len(self.parents) != 1:
-            raise ValueError(
-                "Node ({}) expected to have 1 parent. Found {}.".format(
-                    self, len(self.parents)
-                )
-            )
-        return self.parents[0]
-
-    @staticmethod
-    def from_onnx(node):  # type: (NodeProto) -> Node
-        attrs = Attributes.from_onnx(node.attribute)
-        name = Text(node.name)
-        if len(name) == 0:
-            name = "_".join(node.output)
-        return Node(name, node.op_type, attrs, list(node.input), list(node.output))
-
-
-class Graph(object):
-    def __init__(
-        self,
-        nodes,  # type: List[Node]
-        inputs,  # type: List[EdgeInfo]
-        outputs,  # type: List[EdgeInfo]
-        shape_dict,  # type: Dict[Text,Tuple[int,...]]
-        onnx_ir_version,  # type: int
-    ):
-        # type: (...) -> None
-        self.nodes = nodes
-        self.inputs = inputs
-        self.outputs = outputs
-        self.shape_dict = shape_dict  # data blob name to its shape
-        self.constants_loaded = set()  # set of constants present in graph as node
-        self.onnx_ir_version = onnx_ir_version  # ONNX IR Version for current graph
-
-        self.optional_inputs = (
-            []
-        )  # list of tuple(str, tuple(int)), use with recurrent layers
-        self.optional_outputs = (
-            []
-        )  # list of tuple(str,tuple(int)), use with recurrent layers
-
-        """
-        All axes in CoreML Tensor shapes are annotated. That is,
-        0: Sequence
-        1: Batch
-        2: Channel
-        3: Height
-        4: Width
-        This dictionary "onnx_coreml_shape_mapping" records onnx shape to coreml shape mapping for
-        every tensor (including intermediate tensors) in the onnx graph.
-        The requirement is to only know the "rank" (i.e. number of dimensions) of the onnx tensor, not its actual shape, during conversion time.
-
-        The Dict is "str" -> List of ints
-
-        e.g. "x" -> [1,3] carries the following information:
-        - "x" is rank 2
-        - "x" in Coreml will have the shape [Seq=1, B=x.shape[0], C=1, H=x.shape[1], W=1]
-
-        e.g. "x" -> [1,3,2] carries the following information:
-        - "x" is rank 3
-        - "x" in Coreml will have the shape [Seq=1, B=x.shape[0], C=x.shape[2], H=x.shape[1], W=1]
-
-        The dictionary "onnx_coreml_shape_mapping" is progressively built as the onnx graph is converted to CoreML graph.
-        The op to layer conversion functions use the information in this dict to correctly set the parameters of the CoreML layer
-        to be added and at the end they update the dict with that layer's output(s).
-        """
-        self.onnx_coreml_shape_mapping = {}  # type: Dict[Text, List[int,...]]
-
-        # data blob name to the list of op types it feeds into
-        self.blob_to_op_type = {}  # type: Dict[Text, List[Text]]
-        # data blob name to the op_type that generates it
-        self.blob_from_op_type = {}  # type: Dict[Text, Text]
-
-        self.constant_layers_added = {}  # type: Dict[Text, bool]
-
-        for node_ in nodes:
-            for input_ in node_.inputs:
-                if input_ in self.blob_to_op_type:
-                    self.blob_to_op_type[input_].append(node_.op_type)
-                else:
-                    self.blob_to_op_type[input_] = [node_.op_type]
-            for output_ in node_.outputs:
-                if output_ in self.blob_from_op_type:
-                    raise ValueError(
-                        "Data blob: %s, is generated by more than 1 op" % (output_)
-                    )
-                self.blob_from_op_type[output_] = node_.op_type
-
-    def create_graph(
-        self,
-        nodes=None,
-        inputs=None,
-        outputs=None,
-        shape_dict=None,
-        onnx_ir_version=None,
-    ):
-        node = self.nodes if nodes is None else nodes
-        inputs = self.inputs if inputs is None else inputs
-        outputs = self.outputs if outputs is None else outputs
-        shape_dict = self.shape_dict if shape_dict is None else shape_dict
-        onnx_ir_version = (
-            self.onnx_ir_version if onnx_ir_version is None else onnx_ir_version
-        )
-        return Graph(nodes, inputs, outputs, shape_dict, onnx_ir_version)
-
-    def transformed(self, transformers):  # type: (Iterable[Transformer]) -> Graph
-        graph = self
-        return _apply_graph_transformations(graph, transformers)  # type: ignore
-
-    def has_edge_name(self, name):  # type: (Text) -> bool
-        """
-        Check if name is already used for graph inputs/outputs or for nodes
-        inputs/outputs
-        """
-        names = set()
-        for input in self.inputs:
-            names.add(input[0])
-        for output in self.outputs:
-            names.add(output[0])
-        for node in self.nodes:
-            names.update(node.inputs)
-            names.update(node.outputs)
-        return name in names
-
-    def get_unique_edge_name(self, name):  # type: (Text) -> Text
-        n_ = name
-        i = 0
-        while self.has_edge_name(n_):
-            n_ = "{}_{}".format(name, i)
-            i += 1
-        return n_
-
-    @staticmethod
-    def from_onnx(graph, onnx_ir_version):  # type: (GraphProto) -> Graph
-        input_tensors = {t.name: numpy_helper.to_array(t) for t in graph.initializer}
-        nodes_ = []
-        nodes_by_input = {}  # type: Dict[Text, List[Node]]
-        nodes_by_output = {}
-        for node in graph.node:
-            node_ = Node.from_onnx(node)
-            for input_ in node_.inputs:
-                if input_ in input_tensors:
-                    node_.input_tensors[input_] = input_tensors[input_]
-                else:
-                    if input_ in nodes_by_input:
-                        input_nodes = nodes_by_input[input_]
-                    else:
-                        input_nodes = []
-                        nodes_by_input[input_] = input_nodes
-                    input_nodes.append(node_)
-            for output_ in node_.outputs:
-                nodes_by_output[output_] = node_
-            nodes_.append(node_)
-
-        inputs = []
-        for i in graph.input:
-            if i.name not in input_tensors:
-                inputs.append(_input_from_onnx_input(i))
-
-        outputs = []
-        for o in graph.output:
-            outputs.append(_input_from_onnx_input(o))
-
-        for node_ in nodes_:
-            for input_ in node_.inputs:
-                if input_ in nodes_by_output:
-                    node_.parents.append(nodes_by_output[input_])
-            for output_ in node_.outputs:
-                if output_ in nodes_by_input:
-                    node_.children.extend(nodes_by_input[output_])
-
-        # Dictionary to hold the "value_info" field from ONNX graph
-        shape_dict = {}  # type: Dict[Text,Tuple[int,...]]
-
-        def extract_value_info(
-            shape_dict,  # type: Dict[Text,Tuple[int,...]]
-            value_info,  # type: ValueInfoProto[...]
-        ):
-            # type: (...) -> None
-            t = tuple(
-                [int(dim.dim_value) for dim in value_info.type.tensor_type.shape.dim]
-            )
-            if t:
-                shape_dict[value_info.name] = t
-
-        for value_info in graph.value_info:
-            extract_value_info(shape_dict, value_info)
-        for value_info in graph.input:
-            extract_value_info(shape_dict, value_info)
-        for value_info in graph.output:
-            extract_value_info(shape_dict, value_info)
-
-        return Graph(nodes_, inputs, outputs, shape_dict, onnx_ir_version)
diff --git a/coremltools/converters/onnx/_graph_viz.py b/coremltools/converters/onnx/_graph_viz.py
deleted file mode 100644
index e6c896471..000000000
--- a/coremltools/converters/onnx/_graph_viz.py
+++ /dev/null
@@ -1,126 +0,0 @@
-import os
-
-
-def _shape_notation(int_shape):
-    X = ["S", "B", "C", "H", "W"]
-    return [X[i] for i in int_shape]
-
-
-def plot_graph(graph, graph_img_path="graph.png", show_coreml_mapped_shapes=False):
-    """
-    Plot graph using pydot
-
-    It works in two steps:
-    1. Add nodes to pydot
-    2. connect nodes added in pydot
-
-    :param graph
-    :return: writes down a png/pdf file using dot
-    """
-
-    try:
-        # pydot-ng is a fork of pydot that is better maintained.
-        import pydot_ng as pydot  # type: ignore
-    except:
-        # pydotplus is an improved version of pydot
-        try:
-            import pydotplus as pydot  # type: ignore
-        except:
-            # Fall back on pydot if necessary.
-            try:
-                import pydot  # type: ignore
-            except:
-                return None
-
-    dot = pydot.Dot()
-    dot.set("rankdir", "TB")
-    dot.set("concentrate", True)
-    dot.set_node_defaults(shape="record")
-
-    # Add nodes corresponding to graph inputs
-    graph_inputs = []
-    for input_ in graph.inputs:
-        if show_coreml_mapped_shapes:
-            if input_[0] in graph.onnx_coreml_shape_mapping:
-                shape = tuple(
-                    _shape_notation(graph.onnx_coreml_shape_mapping[input_[0]])
-                )
-            else:
-                shape = "NA, "
-        else:
-            shape = tuple(input_[2])
-        label = "%s\n|{|%s}|{{%s}|{%s}}" % ("Input", input_[0], "", str(shape))
-        pydot_node = pydot.Node(input_[0], label=label)
-        dot.add_node(pydot_node)
-        graph_inputs.append(input_[0])
-
-    # Traverse graph and add nodes to pydot
-    for node in graph.nodes:
-        inputlabels = ""
-        for input_ in node.inputs:
-            if show_coreml_mapped_shapes:
-                if input_ in graph.onnx_coreml_shape_mapping:
-                    inputlabels += (
-                        str(
-                            tuple(
-                                _shape_notation(graph.onnx_coreml_shape_mapping[input_])
-                            )
-                        )
-                        + ", "
-                    )
-                else:
-                    inputlabels += "NA, "
-            else:
-                if input_ in graph.shape_dict:
-                    inputlabels += str(tuple(graph.shape_dict[input_])) + ", "
-                else:
-                    inputlabels += "NA, "
-        outputlabels = ""
-        for output_ in node.outputs:
-            if show_coreml_mapped_shapes:
-                if output_ in graph.onnx_coreml_shape_mapping:
-                    outputlabels += (
-                        str(
-                            tuple(
-                                _shape_notation(
-                                    graph.onnx_coreml_shape_mapping[output_]
-                                )
-                            )
-                        )
-                        + ", "
-                    )
-                else:
-                    outputlabels += "NA, "
-            else:
-                if output_ in graph.shape_dict:
-                    outputlabels += str(tuple(graph.shape_dict[output_])) + ", "
-                else:
-                    outputlabels += "NA, "
-        output_names = ", ".join([output_ for output_ in node.outputs])
-        input_names = ", ".join([input_ for input_ in node.inputs])
-        label = "%s\n|{{%s}|{%s}}|{{%s}|{%s}}" % (
-            node.op_type,
-            input_names,
-            output_names,
-            inputlabels,
-            outputlabels,
-        )
-        pydot_node = pydot.Node(node.name, label=label)
-        dot.add_node(pydot_node)
-
-    # add edges
-    for node in graph.nodes:
-        for child in node.children:
-            # add edge in pydot
-            dot.add_edge(pydot.Edge(node.name, child.name))
-        for input_ in node.inputs:
-            if input_ in graph_inputs:
-                dot.add_edge(pydot.Edge(input_, node.name))
-
-    # write out the image file
-    _, extension = os.path.splitext(graph_img_path)
-    if not extension:
-        extension = "pdf"
-    else:
-        extension = extension[1:]
-    dot.write(graph_img_path, format=extension)
diff --git a/coremltools/converters/onnx/_operators.py b/coremltools/converters/onnx/_operators.py
deleted file mode 100644
index 4333709a4..000000000
--- a/coremltools/converters/onnx/_operators.py
+++ /dev/null
@@ -1,2666 +0,0 @@
-
-import numpy as np
-import copy
-
-from typing import Sequence, Callable, List, Tuple, Optional, Text, Any
-from coremltools.models.neural_network import NeuralNetworkBuilder  # type: ignore
-from ._graph import Node, Graph
-from coremltools.proto import NeuralNetwork_pb2  # type: ignore
-from ._error_utils import ErrorHandling
-
-INT_MAX = 2 ** 30
-
-"""
-General common functions
-"""
-
-
-def _compare(a, b, encoding="utf8"):  # type: (Text, Text, Text) -> bool
-    if isinstance(a, bytes):
-        a = a.decode(encoding)
-    if isinstance(b, bytes):
-        b = b.decode(encoding)
-    return a == b
-
-
-def _is_input_shape_mapping_defined(node, graph):  # type: (Node, Graph) -> bool
-    if node.inputs[0] in graph.onnx_coreml_shape_mapping:
-        return True
-    else:
-        return False
-
-
-def _update_shape_mapping_unchanged(
-    node, graph, err
-):  # type: (Node, Graph, ErrorHandling) -> None
-    if _is_input_shape_mapping_defined(node, graph):
-        graph.onnx_coreml_shape_mapping[
-            node.outputs[0]
-        ] = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-
-
-def _convert_broadcast_op(
-    builder, node, graph, err, mode
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling, Text) -> None
-    if node.op_type == "Max" or node.op_type == "Min" or node.op_type == "Mean":
-        if len(node.inputs) == 1:
-            inputs = [node.inputs[0], node.inputs[0]]
-        else:
-            inputs = node.inputs
-    else:
-        inputs = node.inputs
-
-    if node.op_type == "Sub":
-        builder.add_elementwise(
-            name=node.name + "_neg",
-            input_names=[inputs[1]],
-            output_name=inputs[1] + "_neg",
-            mode="MULTIPLY",
-            alpha=-1.0,
-        )
-        builder.add_elementwise(
-            name=node.name,
-            input_names=[inputs[0], inputs[1] + "_neg"],
-            output_name=node.outputs[0],
-            mode=mode,
-        )
-    else:
-        builder.add_elementwise(
-            name=node.name, input_names=inputs, output_name=node.outputs[0], mode=mode
-        )
-
-    if _is_input_shape_mapping_defined(node, graph):
-        ranks = [len(graph.onnx_coreml_shape_mapping[input_]) for input_ in node.inputs]
-        max_id = np.argmax(np.array(ranks))
-        graph.onnx_coreml_shape_mapping[
-            node.outputs[0]
-        ] = graph.onnx_coreml_shape_mapping[node.inputs[max_id]]
-
-
-def _get_coreml_target_shape(target_shape, builder, node, graph, err):
-    # type: (Tuple[int, ...], NeuralNetworkBuilder, node, Graph, ErrorHandling) -> Optional[Tuple[int, ...]]
-
-    if len(target_shape) == 1:  # (D,)
-        coreml_shape = (1, target_shape[0], 1, 1)  # type: Optional[Tuple[int, ...]]
-        if _is_input_shape_mapping_defined(node, graph):
-            graph.onnx_coreml_shape_mapping[node.outputs[0]] = [2]
-    elif len(target_shape) == 2:  # (S,D)
-        coreml_shape = target_shape + (1, 1)
-        if _is_input_shape_mapping_defined(node, graph):
-            graph.onnx_coreml_shape_mapping[node.outputs[0]] = [0, 2]
-    elif len(target_shape) == 3:  # (C,H,W)
-        coreml_shape = (1, target_shape[0], target_shape[1], target_shape[2])
-        if _is_input_shape_mapping_defined(node, graph):
-            graph.onnx_coreml_shape_mapping[node.outputs[0]] = [2, 3, 4]
-    elif len(target_shape) == 4:
-        coreml_shape = target_shape
-        if _is_input_shape_mapping_defined(node, graph):
-            mapp = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-            if mapp[0] == 1 and coreml_shape[0] == 1:
-                graph.onnx_coreml_shape_mapping[node.outputs[0]] = [1, 2, 3, 4]
-            else:
-                graph.onnx_coreml_shape_mapping[node.outputs[0]] = [0, 2, 3, 4]
-    elif len(target_shape) > 4:
-        # return err.unsupported_op_configuration(builder, node, graph, "Supports tensors not more than 4d")  # type: ignore
-        diff = len(target_shape) - 4
-        if all([d == 1 for d in target_shape[:diff]]):
-            coreml_shape = target_shape[diff:]
-        else:
-            err.unsupported_op_configuration(builder, node, graph, "Tensors more than rank 4 are not supported")  # type: ignore
-        if _is_input_shape_mapping_defined(node, graph):
-            if target_shape[0] == 1 and len(target_shape) == 5:
-                graph.onnx_coreml_shape_mapping[node.outputs[0]] = [1, 0, 2, 3, 4]
-        else:
-            return err.unsupported_op_configuration(builder, node, graph, "Supports tensors not more than 4d")  # type: ignore
-    else:
-        coreml_shape = None
-    return coreml_shape
-
-
-def _get_coreml_axis(
-    axes, builder, node, graph, err
-):  # type: (List[int], NeuralNetworkBuilder, node, Graph, ErrorHandling) -> Text
-    coreml_axis = ""
-    if node.inputs[0] not in graph.shape_dict:
-        return err.unsupported_op_configuration(
-            builder, node, graph, "Failed to translate axis"
-        )
-    input_shape = graph.shape_dict[node.inputs[0]]
-    if len(input_shape) == 1:
-        coreml_axis = "C"
-    elif len(input_shape) == 2:
-        if len(axes) == 1 and axes[0] == 1:
-            coreml_axis = "C"
-    elif len(input_shape) == 3:
-        for ind in [["C", "H", "W"][i] for i in axes]:
-            coreml_axis += ind
-    elif len(input_shape) == 4:
-        for ind in [["B", "C", "H", "W"][i] for i in axes]:
-            coreml_axis += ind
-    return coreml_axis
-
-
-def _add_transpose_before_after(
-    layer_func,  # function for layer conversion
-    input_names,  # List[str]
-    output_names,  # List[str]
-    transpose_dims,  # List[int]
-    **kwargs
-):  # type: ignore
-
-    for i, input_ in enumerate(input_names):
-        kwargs["builder"].add_permute(
-            name=kwargs["node"].name + "_input_transpose" + str(i),
-            dim=transpose_dims,
-            input_name=input_,
-            output_name=kwargs["node"].name + "_" + input_ + "_transpose",
-        )
-
-    new_input_names = [
-        kwargs["node"].name + "_" + input_ + "_transpose" for input_ in input_names
-    ]
-    new_output_names = [output_ + "_transpose" for output_ in output_names]
-    layer_func(new_input_names, new_output_names, **kwargs)
-
-    for i, output_ in enumerate(output_names):
-        kwargs["builder"].add_permute(
-            name=kwargs["node"].name + "_output_transpose" + str(i),
-            dim=transpose_dims,
-            input_name=output_ + "_transpose",
-            output_name=output_,
-        )
-
-
-def _add_inner_product(input_names, output_names, **kwargs):
-    node = kwargs["node"]
-    builder = kwargs["builder"]
-    builder.add_inner_product(
-        name=node.name,
-        W=kwargs["W"],
-        b=kwargs["b"],
-        input_channels=kwargs["W"].shape[1],
-        output_channels=kwargs["W"].shape[0],
-        has_bias=kwargs["b"] is not None,
-        input_name=input_names[0],
-        output_name=output_names[0],
-    )
-
-
-def _add_conv_like_op(
-    add_func, get_params_func, params_dict, builder, node, graph, err
-):
-    if node.inputs[0] in graph.onnx_coreml_shape_mapping:
-        mapp = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-
-        r = len(mapp)
-        if not (r == 3 or r == 4):
-            return err.unsupported_op_configuration(
-                builder, node, graph, "more than 4 axes not supported"
-            )
-        if r == 4:
-            if not (mapp == [1, 2, 3, 4] or mapp == [0, 2, 3, 4]):
-                return err.unsupported_op_configuration(
-                    builder,
-                    node,
-                    graph,
-                    "error in axes alignment between onnx and coreml",
-                )
-            get_params_func(builder, node, graph, err, params_dict)
-            add_func(
-                node.inputs,
-                node.outputs,
-                params_dict=params_dict,
-                node=node,
-                builder=builder,
-                graph=graph,
-                err=err,
-            )
-        if r == 3:
-            if mapp == [1, 2, 3]:  # [B,C,H]
-                # spatial dimension: height
-                get_params_func(builder, node, graph, err, params_dict, axis="height")
-                add_func(
-                    node.inputs,
-                    node.outputs,
-                    params_dict=params_dict,
-                    node=node,
-                    builder=builder,
-                    graph=graph,
-                    err=err,
-                )
-            elif mapp == [1, 2, 4]:  # [B,C,W]
-                # spatial dimension: width
-                get_params_func(builder, node, graph, err, params_dict, axis="width")
-                add_func(
-                    node.inputs,
-                    node.outputs,
-                    params_dict=params_dict,
-                    node=node,
-                    builder=builder,
-                    graph=graph,
-                    err=err,
-                )
-            elif mapp == [
-                2,
-                3,
-                4,
-            ]:  # [C,H,W] in CoreML, but it represents [B,C,D] in ONNX.
-                # spatial dimension: sequence
-                get_params_func(builder, node, graph, err, params_dict, axis="width")
-                node.inputs = [node.inputs[0]]
-                _add_transpose_before_after(
-                    add_func,
-                    node.inputs,
-                    node.outputs,
-                    [0, 2, 1, 3],  # swap C & H
-                    builder=builder,
-                    node=node,
-                    params_dict=params_dict,
-                    graph=graph,
-                    err=err,
-                )
-
-            elif mapp == [1, 2, 0]:  # [B,C,S]
-                # spatial dimension: sequence
-                get_params_func(builder, node, graph, err, params_dict, axis="width")
-                node.inputs = [node.inputs[0]]
-                _add_transpose_before_after(
-                    add_func,
-                    node.inputs,
-                    node.outputs,
-                    [3, 1, 2, 0],
-                    builder=builder,
-                    node=node,
-                    params_dict=params_dict,
-                    graph=graph,
-                    err=err,
-                )
-            else:
-                return err.unsupported_op_configuration(
-                    builder,
-                    node,
-                    graph,
-                    "error in axes alignment between onnx and coreml",
-                )
-
-    else:
-        get_params_func(builder, node, graph, err, params_dict)
-        add_func(
-            node.inputs,
-            node.outputs,
-            params_dict=params_dict,
-            builder=builder,
-            node=node,
-            graph=graph,
-            err=err,
-        )
-
-
-def _is_no_op(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> bool
-
-    if node.inputs[0] in graph.shape_dict and node.outputs[0] in graph.shape_dict:
-        if graph.shape_dict[node.inputs[0]] == graph.shape_dict[node.outputs[0]]:
-            builder.add_activation(
-                name=node.name,
-                non_linearity="LINEAR",
-                input_name=node.inputs[0],
-                output_name=node.outputs[0],
-                params=[1.0, 0.0],
-            )
-            _update_shape_mapping_unchanged(node, graph, err)
-            return True
-
-    return False
-
-
-"""
-Layer conversion functions
-"""
-
-
-def _convert_abs(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    builder.add_unary(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        mode="abs",
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_add(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-
-    # check if its equivalent to a bias layer
-    if len(node.inputs) > 1:
-        if node.inputs[1] in node.input_tensors:
-            second_input = np.squeeze(node.input_tensors[node.inputs[1]])
-            if len(second_input.shape) == 1:
-                builder.add_bias(
-                    name=node.name,
-                    b=second_input,
-                    input_name=node.inputs[0],
-                    output_name=node.outputs[0],
-                    shape_bias=[second_input.shape[0]],
-                )
-                return
-    """
-    Supported shapes by CoreML 2.0 for broadcasting (-1 means it can be 1 or greater than 1):
-    (i.e. all of the outputs must have one of these shapes for broadcasting support)
-    - (S=-1,B=-1,1,1,1)
-    - (S=-1,B=-1,C,1,1)
-    - (S=-1,B=-1,1,H,W)
-    - (S=-1,B=-1,C,H,W)
-    Unsupported:
-    - (S=-1,B=-1,1,1,W)
-    - (S=-1,B=-1,1,H,1)
-    - (S=-1,B=-1,C,1,W)
-    - (S=-1,B=-1,C,H,1)
-    """
-    _convert_broadcast_op(builder, node, graph, err, "ADD")
-
-
-def _convert_sub(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    _convert_broadcast_op(builder, node, graph, err, "ADD")
-
-
-def _get_conv_params(builder, node, graph, err, params_dict, axis=None):
-    if "dilations" not in node.attrs:
-        params_dict["dilations"] = [1, 1]
-    else:
-        if axis == "height":
-            params_dict["dilations"] = node.attrs["dilations"]
-            params_dict["dilations"].append(1)
-        elif axis == "width":
-            params_dict["dilations"] = node.attrs["dilations"]
-            params_dict["dilations"].insert(0, 1)
-        else:
-            params_dict["dilations"] = node.attrs["dilations"]
-
-    if "pads" not in node.attrs:
-        params_dict["pads"] = [0, 0, 0, 0]
-    else:
-        pads = node.attrs["pads"]
-        if axis == "height":
-            pads = [pads[0], 0, pads[1], 0]
-        elif axis == "width":
-            pads = [0, pads[0], 0, pads[1]]
-        params_dict["pads"] = pads
-
-    if "kernel_shape" in node.attrs:
-        params_dict["kernel_shape"] = node.attrs["kernel_shape"]
-    else:
-        # w_shape is ONNX format shape
-        w_shape = params_dict["w_shape"]
-        if len(w_shape) == 4:
-            params_dict["kernel_shape"] = [w_shape[-2], w_shape[-1]]
-        else:
-            params_dict["kernel_shape"] = [w_shape[-1]]
-    params_dict["strides"] = node.attrs.get("strides", [1, 1] if axis is None else [1])
-
-    if axis == "height":
-        if params_dict["W"] is not None:
-            params_dict["W"] = np.expand_dims(params_dict["W"], axis=-1)
-        params_dict["kernel_shape"].append(1)
-        params_dict["strides"].append(1)
-    elif axis == "width":
-        if params_dict["W"] is not None:
-            params_dict["W"] = np.expand_dims(params_dict["W"], axis=-2)
-        params_dict["strides"].insert(0, 1)
-        params_dict["kernel_shape"].insert(0, 1)
-
-    params_dict["out_shape"] = None
-    params_dict["padding_type"] = "valid"
-    params_dict["same_padding_asymmetry_mode"] = "BOTTOM_RIGHT_HEAVY"
-
-    if params_dict["W"] is not None:
-        if not params_dict["is_deconv"]:
-            params_dict["W"] = params_dict["W"].transpose((2, 3, 1, 0))  # type: ignore
-        else:
-            params_dict["W"] = params_dict["W"].transpose((2, 3, 0, 1))  # type: ignore
-
-    if "auto_pad" in node.attrs and \
-        not (_compare(node.attrs["auto_pad"], 'VALID') or _compare(node.attrs["auto_pad"], 'NOTSET')):
-        params_dict["padding_type"] = "same"
-        if _compare(node.attrs["auto_pad"], "SAME_LOWER"):
-            params_dict["same_padding_asymmetry_mode"] = "TOP_LEFT_HEAVY"
-
-    if params_dict["is_deconv"]:
-        if "output_shape" in node.attrs:
-            if axis == "height":
-                params_dict["out_shape"] = (
-                    node.attrs["output_shape"][-1],
-                    1,
-                )  # (Hout, wout)
-            elif axis == "width":
-                params_dict["out_shape"] = (
-                    1,
-                    node.attrs["output_shape"][-1],
-                )  # (Hout, wout)
-            else:
-                params_dict["out_shape"] = (
-                    node.attrs["output_shape"][-2],
-                    node.attrs["output_shape"][-1],
-                )  # (Hout, wout)
-        elif "output_padding" in node.attrs:
-            params_dict["crops"] = copy.copy(params_dict["pads"])
-            params_dict["pads"] = [0, 0, 0, 0]
-            post_pads = node.attrs["output_padding"]
-            if sum(post_pads) != 0:
-                t = l = b = r = 0
-                if len(post_pads) == 1:
-                    if axis == "height":
-                        b = post_pads[0]
-                    elif axis == "width":
-                        r = post_pads[0]
-                    else:
-                        err.unsupported_op_configuration(
-                            builder,
-                            node,
-                            graph,
-                            "length 1 output padding attribute only supported for 1D conv",
-                        )
-                elif len(post_pads) == 2:
-                    if axis == "height":
-                        b, r = post_pads
-                    elif axis == "width":
-                        r, b = post_pads
-                    else:
-                        b, r = post_pads
-                elif len(post_pads) == 4:
-                    b, r, t, l = post_pads
-                else:
-                    return err.unsupported_op_configuration(
-                        builder,
-                        node,
-                        graph,
-                        "Supports only length 1 or 2 or 4 output padding attribute",
-                    )
-
-                def _update_crop_pad(idx, v):
-                    if params_dict["crops"][idx] >= v:
-                        params_dict["crops"][idx] -= v
-                    else:
-                        params_dict["pads"][idx] = v - params_dict["crops"][idx]
-
-                _update_crop_pad(0, t)
-                _update_crop_pad(1, l)
-                _update_crop_pad(2, b)
-                _update_crop_pad(3, r)
-                params_dict["is_post_crop"] = (
-                    True if sum(params_dict["crops"]) > 0 else False
-                )
-                params_dict["is_pre_pad"] = (
-                    True if sum(params_dict["pads"]) > 0 else False
-                )
-
-
-def _add_conv(input_names, output_names, **kwargs):
-    params_dict = kwargs["params_dict"]
-    node = kwargs["node"]
-    builder = kwargs["builder"]
-    graph = kwargs["graph"]
-    err = kwargs["err"]
-
-    W_shape = params_dict["w_shape"]
-
-    output_name = output_names[0]
-    pre_padding_input_name = input_names[0]
-
-    if params_dict.get("is_post_crop", False):
-        output_name += "_conv_tranpose_post_crop"
-    if params_dict.get("is_pre_pad", False):
-        input_names[0] += "_conv_tranpose_pre_pad"
-
-    if params_dict["W"] is None and len(node.inputs) == 1:
-        return err.unsupported_op_configuration(
-            builder, node, graph, "Kernel weight missing"
-        )
-
-    if params_dict["is_deconv"]:
-        oc = W_shape[1] * params_dict["groups"]
-        kc = W_shape[0]
-    else:
-        oc = W_shape[0]
-        kc = W_shape[1]
-
-    if params_dict.get("is_pre_pad", False):
-        builder.add_padding(
-            name=node.name + "_pre_pad",  # type: ignore
-            left=params_dict["pads"][1],
-            right=params_dict["pads"][3],
-            top=params_dict["pads"][0],
-            bottom=params_dict["pads"][2],
-            input_name=pre_padding_input_name,
-            output_name=input_names[0],
-            value=0,
-        )
-    builder.add_convolution(
-        name=node.name,
-        kernel_channels=kc,
-        output_channels=oc,
-        height=params_dict["kernel_shape"][0],
-        width=params_dict["kernel_shape"][1],
-        stride_height=params_dict["strides"][0],
-        stride_width=params_dict["strides"][1],
-        border_mode=params_dict["padding_type"],
-        same_padding_asymmetry_mode=params_dict["same_padding_asymmetry_mode"],
-        groups=params_dict["groups"],
-        W=params_dict["W"],
-        b=params_dict["bias"],
-        has_bias=params_dict["bias"] is not None,
-        is_deconv=params_dict["is_deconv"],
-        output_shape=params_dict["out_shape"],
-        input_name=input_names[0]
-        if params_dict["W"] is not None
-        else [input_names[0], input_names[1]],
-        output_name=output_name,
-        dilation_factors=params_dict["dilations"],
-        padding_top=params_dict["pads"][0],
-        padding_bottom=params_dict["pads"][2],
-        padding_left=params_dict["pads"][1],
-        padding_right=params_dict["pads"][3],
-    )
-    if params_dict.get("is_post_crop", False):
-        builder.add_crop(
-            name=node.name + "_post_crop",  # type: ignore
-            left=params_dict["crops"][1],
-            right=params_dict["crops"][3],
-            top=params_dict["crops"][0],
-            bottom=params_dict["crops"][2],
-            input_names=[output_name],
-            output_name=output_names[0],
-            offset=[0, 0],
-        )
-
-
-def _convert_conv(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    params_dict = dict()
-    # get weights for convolution
-    weight_name = node.inputs[1]
-    W = None
-    if weight_name in node.input_tensors:
-        W = node.input_tensors[weight_name]
-        params_dict["w_shape"] = W.shape
-    else:
-        err.missing_initializer(
-            node,
-            "Weight tensor: {} not found in the graph initializer".format(weight_name,),
-        )
-    params_dict["W"] = W
-
-    params_dict["is_deconv"] = False
-    if node.op_type.endswith("Transpose"):
-        params_dict["is_deconv"] = True
-    bias = None
-    if len(node.inputs) > 2:
-        bias = node.input_tensors[node.inputs[2]]
-    params_dict["bias"] = bias
-    params_dict["groups"] = node.attrs.get("group", 1)
-
-    _add_conv_like_op(
-        _add_conv, _get_conv_params, params_dict, builder, node, graph, err
-    )
-
-    # update map
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_relu(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    builder.add_activation(
-        name=node.name,
-        non_linearity="RELU",
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_thresholdedrelu(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    alpha = node.attrs.get("alpha", 1.0)
-    builder.add_activation(
-        name=node.name,
-        non_linearity="THRESHOLDEDRELU",
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        params=alpha,
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_reshape(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-
-    shape = tuple(node.attrs.get("shape", ()))  # type: (Tuple[int, ...])
-    if len(shape) == 0:
-        shape_name = node.inputs[1]
-        if shape_name in node.input_tensors:
-            shape = tuple(node.input_tensors[shape_name].astype(int))  # type: ignore
-        else:
-            err.missing_initializer(
-                node,
-                "CoreML only supports Reshape layer when the target shape is static and known apriori",
-            )
-
-    # check if all entries in shape are 1/-1
-    is_flatten = True
-    for s in shape:
-        if abs(s) != 1:
-            is_flatten = False
-            break
-    if is_flatten:
-        builder.add_flatten(
-            name=node.name,
-            input_name=node.inputs[0],
-            output_name=node.outputs[0],
-            mode=0,
-        )
-        if _is_input_shape_mapping_defined(node, graph):
-            mapp = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-            if len(shape) == 4:
-                mapp_out = [mapp[0], 2, 3, 4]
-            elif len(shape) == 3:
-                mapp_out = [2, 3, 4]
-            elif len(shape) == 2:
-                mapp_out = [mapp[0], 2]
-            elif len(shape) == 1:
-                mapp_out = [2]
-            else:
-                return err.unsupported_op_configuration(
-                    builder, node, graph, "Supports only less than equal to 4d tensors"
-                )
-            graph.onnx_coreml_shape_mapping[node.outputs[0]] = mapp_out
-        return
-
-    new_shape = _get_coreml_target_shape(shape, builder, node, graph, err)
-
-    if new_shape is None:
-        return err.unsupported_op_configuration(
-            builder, node, graph, "Unsupported shape for reshape"
-        )
-
-    builder.add_reshape(
-        name=node.name,
-        target_shape=new_shape,
-        mode=0,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-    )
-
-
-def _convert_transpose(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-
-    if _is_input_shape_mapping_defined(node, graph):
-        mapp = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-        r = len(mapp)
-        default_perm = list(range(r))
-        default_perm.reverse()
-        perm = node.attrs.get("perm", default_perm)
-        coreml_perm = []
-        for p in perm:
-            coreml_perm.append(mapp[p])
-        if 1 in mapp:
-            batch_index = mapp.index(1)
-            batch_index_new = coreml_perm.index(1)
-            if batch_index != batch_index_new:
-                return err.unsupported_op_configuration(
-                    builder, node, graph, "cannot transpose batch dimension"
-                )
-        perm_translated = []
-        for c in coreml_perm:
-            if c == 0:
-                perm_translated.append(c)
-            elif c == 1:
-                continue
-            else:
-                perm_translated.append(c - 1)
-        perm_final = [
-            -1,
-            -1,
-            -1,
-            -1,
-        ]  # has to be of length 4 corresponding to [S,C,H,W]
-        for i in range(4):
-            if i not in perm_translated:
-                perm_final[i] = i
-        if perm_final.count(-1) != len(perm_translated):
-            return err.unsupported_op_configuration(
-                builder, node, graph, "unable to translate transpose op to CoreML"
-            )
-        ctr = 0
-        for i, v in enumerate(perm_final):
-            if v == -1:
-                perm_final[i] = perm_translated[ctr]
-                ctr += 1
-        perm = tuple(perm_final)
-    else:
-        perm = node.attrs.get("perm", [0, 3, 2, 1])
-        if len(perm) > 4:
-            diff = len(perm) - 4
-            if all([perm[i] == i for i in range(diff)]):
-                perm = [p - diff for p in perm[diff:]]
-            else:
-                return err.unsupported_op_configuration(
-                    builder, node, graph, "Supports only 4d tensors"
-                )
-        elif len(perm) < 4:
-            diff = 4 - len(perm)
-            perm = [d for d in range(diff)] + [d + diff for d in perm]
-        perm = tuple(perm)
-
-    builder.add_permute(
-        name=node.name, dim=perm, input_name=node.inputs[0], output_name=node.outputs[0]
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _get_pool_params(builder, node, graph, err, params_dict, axis=None):
-    (
-        params_dict["pad_b"],
-        params_dict["pad_l"],
-        params_dict["pad_r"],
-        params_dict["pad_t"],
-    ) = (0, 0, 0, 0)
-    params_dict["stride_height"], params_dict["stride_width"] = 1, 1
-    params_dict["padding_type"] = "VALID"
-    params_dict["same_padding_asymmetry_mode"] = "BOTTOM_RIGHT_HEAVY"
-
-    if params_dict["is_global"]:
-        params_dict["height"], params_dict["width"] = 0, 0
-        params_dict["stride_height"], params_dict["stride_width"] = 1, 1
-    else:
-        kernel_shape = node.attrs["kernel_shape"]
-        if axis == "height":
-            params_dict["height"] = kernel_shape[0]
-        elif axis == "width":
-            params_dict["width"] = kernel_shape[0]
-        else:
-            params_dict["height"] = kernel_shape[0]
-            params_dict["width"] = kernel_shape[1]
-
-        pads = node.attrs.get("pads", None)
-        if pads:
-            if axis == "height":
-                params_dict["pad_t"] = pads[0]
-                params_dict["pad_b"] = pads[1]
-            elif axis == "width":
-                params_dict["pad_l"] = pads[0]
-                params_dict["pad_r"] = pads[1]
-            else:
-                params_dict["pad_t"] = pads[0]
-                params_dict["pad_l"] = pads[1]
-                params_dict["pad_b"] = pads[2]
-                params_dict["pad_r"] = pads[3]
-
-        strides = node.attrs.get("strides", [1, 1])
-        if axis == "height":
-            params_dict["stride_height"] = strides[0]
-        elif axis == "width":
-            params_dict["stride_width"] = strides[0]
-        else:
-            params_dict["stride_height"] = strides[0]
-            params_dict["stride_width"] = strides[1]
-
-        if "auto_pad" in node.attrs and \
-            not (_compare(node.attrs["auto_pad"], 'VALID') or _compare(node.attrs["auto_pad"], 'NOTSET')):
-            params_dict["padding_type"] = "SAME"
-            if _compare(node.attrs["auto_pad"], "SAME_LOWER"):
-                params_dict["same_padding_asymmetry_mode"] = "TOP_LEFT_HEAVY"
-
-    params_dict["exclude_pad_area"] = node.attrs.get("count_include_pad", 0) == 0
-
-
-def _add_pool(input_names, output_names, **kwargs):
-    params_dict = kwargs["params_dict"]
-    node = kwargs["node"]
-    kwargs["builder"].add_pooling(
-        name=node.name,
-        height=params_dict.get("height", 1),
-        width=params_dict.get("width", 1),
-        stride_height=params_dict.get("stride_height", 1),
-        stride_width=params_dict.get("stride_width", 1),
-        layer_type=params_dict["layer_type"],
-        padding_type=params_dict["padding_type"],
-        exclude_pad_area=params_dict["exclude_pad_area"],
-        is_global=params_dict["is_global"],
-        input_name=input_names[0],
-        output_name=output_names[0],
-        padding_top=params_dict.get("pad_t", 0),
-        padding_bottom=params_dict.get("pad_b", 0),
-        padding_left=params_dict.get("pad_l", 0),
-        padding_right=params_dict.get("pad_r", 0),
-        same_padding_asymmetry_mode=params_dict["same_padding_asymmetry_mode"],
-    )
-
-
-def _convert_pool(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    input_name = node.inputs[0]
-    output_name = node.outputs[0]
-    params_dict = dict()
-    params_dict["is_global"] = False
-    if node.op_type.startswith("Global"):
-        params_dict["is_global"] = True
-    if node.op_type.endswith("MaxPool"):
-        params_dict["layer_type"] = "MAX"
-    elif node.op_type.endswith("AveragePool"):
-        params_dict["layer_type"] = "AVERAGE"
-    else:
-        return err.unsupported_op_configuration(
-            builder, node, graph, "Unsupported pool type"
-        )
-
-    if len(node.outputs) == 2:
-        return err.unsupported_op_configuration(
-            builder, node, graph, "argmax with pool unsupported"
-        )
-
-    if "ceil_mode" in node.attrs and node.attrs["ceil_mode"] == 1:
-        return err.unsupported_op_configuration(
-            builder, node, graph, "ceil_mod=1 not supported"
-        )
-
-    if "dilations" in node.attrs:
-        return err.unsupported_op_configuration(
-            builder, node, graph, "dilations not supported"
-        )
-
-    _add_conv_like_op(
-        _add_pool, _get_pool_params, params_dict, builder, node, graph, err
-    )
-
-    # update map
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_bn(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    def add_bn(input_names, output_names, **kwargs):
-        kwargs["builder"].add_batchnorm(
-            name=node.name,
-            input_name=input_names[0],
-            output_name=output_names[0],
-            channels=kwargs["channels"][0],
-            gamma=kwargs["scale"],
-            beta=kwargs["bias"],
-            mean=kwargs["mean"],
-            variance=kwargs["var"],
-            epsilon=kwargs["epsilon"],
-        )
-
-    if len(node.outputs) > 1:
-        return err.unsupported_op_configuration(
-            builder,
-            node,
-            graph,
-            "This converter only supports BatchNormalization with one output",
-        )
-
-    epsilon = node.attrs.get("epsilon", 1e-5)
-    channels = set()
-    for v in node.input_tensors.values():
-        channels.add(v.shape)
-    assert len(channels) == 1
-    channels = channels.pop()
-    scale = (
-        node.input_tensors[node.inputs[1]]
-        if node.inputs[1] in node.input_tensors
-        else np.ones(shape=channels, dtype=np.float32)
-    )
-    bias = (
-        node.input_tensors[node.inputs[2]]
-        if node.inputs[2] in node.input_tensors
-        else np.zeros(shape=channels, dtype=np.float32)
-    )
-    mean = (
-        node.input_tensors[node.inputs[3]]
-        if node.inputs[3] in node.input_tensors
-        else np.zeros(shape=channels, dtype=np.float32)
-    )
-    var = (
-        node.input_tensors[node.inputs[4]]
-        if node.inputs[4] in node.input_tensors
-        else np.ones(shape=channels, dtype=np.float32)
-    )
-
-    mapp = graph.onnx_coreml_shape_mapping.get(node.inputs[0], None)
-    if mapp == [2, 3, 4]:
-        _add_transpose_before_after(
-            add_bn,
-            [node.inputs[0]],
-            node.outputs,
-            [0, 2, 1, 3],
-            builder=builder,
-            node=node,
-            scale=scale,
-            bias=bias,
-            mean=mean,
-            var=var,
-            epsilon=epsilon,
-            channels=channels,
-        )
-    else:
-        builder.add_batchnorm(
-            name=node.name,
-            channels=channels[0],
-            gamma=scale,
-            beta=bias,
-            mean=mean,
-            variance=var,
-            input_name=node.inputs[0],
-            output_name=node.outputs[0],
-            epsilon=epsilon,
-        )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_instancenorm(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    epsilon = node.attrs.get("epsilon", 1e-5)
-    scale = node.input_tensors[node.inputs[1]]
-    bias = node.input_tensors[node.inputs[2]]
-
-    builder.add_batchnorm(
-        name=node.name,
-        channels=scale.shape[0],
-        gamma=scale,
-        beta=bias,
-        compute_mean_var=True,
-        instance_normalization=True,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        epsilon=epsilon,
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_mul(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    _convert_broadcast_op(builder, node, graph, err, "MULTIPLY")
-
-
-def _convert_mean(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    _convert_broadcast_op(builder, node, graph, err, "AVE")
-
-
-def _convert_div(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    builder.add_unary(
-        name=node.name + "_inverse",  # type: ignore
-        input_name=node.inputs[1],
-        output_name=node.inputs[1] + "_inverse",
-        mode="inverse",
-    )
-    builder.add_elementwise(
-        name=node.name,
-        input_names=[node.inputs[0], node.inputs[1] + "_inverse"],
-        output_name=node.outputs[0],
-        mode="MULTIPLY",
-    )
-    if _is_input_shape_mapping_defined(node, graph):
-        ranks = [len(graph.onnx_coreml_shape_mapping[input_]) for input_ in node.inputs]
-        max_id = np.argmax(np.array(ranks))
-        graph.onnx_coreml_shape_mapping[
-            node.outputs[0]
-        ] = graph.onnx_coreml_shape_mapping[node.inputs[max_id]]
-
-
-def _convert_leaky_relu(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    alpha = node.attrs.get("alpha", 0.01)
-    builder.add_activation(
-        name=node.name,
-        non_linearity="LEAKYRELU",
-        params=[alpha],
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_concat(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    def _add_concat(input_names, output_names, **kwargs):
-        kwargs["builder"].add_elementwise(
-            name=kwargs["node"].name,
-            input_names=input_names,
-            output_name=output_names[0],
-            mode=kwargs["mode"],
-        )
-
-    axis = node.attrs.get("axis", 1)
-    parent_op_type = graph.blob_from_op_type.get(node.inputs[0], None)
-
-    if _is_input_shape_mapping_defined(node, graph):
-        mapp = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-        caxis = mapp[axis]
-        if caxis == 0:
-            _add_concat(
-                node.inputs,
-                node.outputs,
-                node=node,
-                builder=builder,
-                mode="SEQUENCE_CONCAT",
-            )
-        elif caxis == 2:
-            _add_concat(
-                node.inputs, node.outputs, node=node, builder=builder, mode="CONCAT"
-            )
-        elif caxis == 3:
-            _add_transpose_before_after(
-                _add_concat,
-                node.inputs,
-                node.outputs,
-                [0, 2, 1, 3],
-                mode="CONCAT",
-                node=node,
-                builder=builder,
-            )
-        elif caxis == 4:
-            _add_transpose_before_after(
-                _add_concat,
-                node.inputs,
-                node.outputs,
-                [0, 3, 2, 1],
-                mode="CONCAT",
-                node=node,
-                builder=builder,
-            )
-        else:
-            return err.unsupported_op_configuration(
-                builder, node, graph, "Concat not supported along batch axis"
-            )
-    else:
-        mode = None
-        first_input_shape = None
-        if node.inputs[0] in graph.shape_dict:
-            first_input_shape = graph.shape_dict[node.inputs[0]]
-            if (
-                parent_op_type in _SEQUENCE_LAYERS_REGISTRY
-                and len(first_input_shape) == 3
-            ):
-                if axis == 0:
-                    mode = "SEQUENCE_CONCAT"
-                if axis == 2:
-                    mode = "CONCAT"
-            elif (
-                (len(first_input_shape) == 1 and axis == 0)
-                or (len(first_input_shape) == 3 and axis == 0)
-                or (len(first_input_shape) == 4 and axis == 1)
-                or (len(first_input_shape) == 2 and axis == 1)
-            ):
-                mode = "CONCAT"
-        else:  # shape info is not available. Fall back to guessing (ideally this should not happen)
-            if axis == 0:
-                mode = "SEQUENCE_CONCAT"
-            elif axis == 1:
-                mode = "CONCAT"
-        if mode is None:
-            return err.unsupported_op_configuration(
-                builder,
-                node,
-                graph,
-                "Unsupported axis {} in input of shape {}".format(
-                    axis, str(first_input_shape)
-                ),
-            )
-        _add_concat(node.inputs, node.outputs, node=node, builder=builder, mode=mode)
-
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_split(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    def _add_split(input_names, output_names, **kwargs):
-        kwargs["builder"].add_split(
-            name=kwargs["node"].name,
-            input_name=input_names[0],
-            output_names=output_names,
-        )
-
-    axis = node.attrs.get("axis", 0)
-    splits = node.attrs.get("split", None)
-    # check that splits are equal
-    if splits:
-        if splits.count(splits[0]) != len(splits):
-            return err.unsupported_op_configuration(
-                builder, node, graph, "Only Equal splits are supported"
-            )
-
-    if _is_input_shape_mapping_defined(node, graph):
-        mapp = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-        if mapp[axis] == 2:
-            _add_split(node.inputs, node.outputs, node=node, builder=builder)
-        elif mapp[axis] == 0:
-            _add_transpose_before_after(
-                _add_split,
-                node.inputs,
-                node.outputs,
-                [1, 0, 2, 3],
-                builder=builder,
-                node=node,
-            )
-        elif mapp[axis] == 3:
-            _add_transpose_before_after(
-                _add_split,
-                node.inputs,
-                node.outputs,
-                [0, 2, 1, 3],
-                builder=builder,
-                node=node,
-            )
-        elif mapp[axis] == 4:
-            _add_transpose_before_after(
-                _add_split,
-                node.inputs,
-                node.outputs,
-                [0, 3, 2, 1],
-                builder=builder,
-                node=node,
-            )
-        else:
-            err.unsupported_op_configuration(
-                builder, node, graph, "Split along Batch axis not supported"
-            )
-    else:
-        if not (axis == 0 or axis == 1):
-            return err.unsupported_op_configuration(
-                builder, node, graph, "Unsupported axis {}".format(axis,)
-            )
-        _add_split(node.inputs, node.outputs, node=node, builder=builder)
-
-    if _is_input_shape_mapping_defined(node, graph):
-        for out_ in node.outputs:
-            graph.onnx_coreml_shape_mapping[out_] = graph.onnx_coreml_shape_mapping[
-                node.inputs[0]
-            ]
-
-
-def _convert_argmax(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    def _add_argmax_or_argmin(input_names, output_names, **kwargs):
-        input_name = input_names[0]
-        output_name = output_names[0]
-        if kwargs["node"].op_type == "ArgMin":
-            kwargs["builder"].add_elementwise(
-                name=kwargs["node"].name + "_multiply_minus_1",  # type: ignore
-                input_names=[input_name],
-                output_name=input_name + "_multiply_minus_1",
-                mode="MULTIPLY",
-                alpha=-1,
-            )
-            input_name += "_multiply_minus_1"
-        kwargs["builder"].add_reduce(
-            name=kwargs["node"].name,
-            input_name=input_name,
-            output_name=output_name,
-            axis=kwargs["coreml_axis"],
-            mode="argmax",
-        )
-
-    """
-    Conversion
-    """
-    axis = node.attrs.get("axis", 0)
-    keepdims = node.attrs.get("keepdims", 1)
-
-    input_name = node.inputs[0]
-    output_name = node.outputs[0]
-
-    if _is_input_shape_mapping_defined(node, graph):
-        mapp = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-        coreml_axis = mapp[axis]
-        coreml_axis_string = "C"
-        if coreml_axis == 1:  # coreml_axis corresponds to the batch dimension
-            return err.unsupported_op_configuration(
-                builder, node, graph, "Cannot apply operation along Batch axis"
-            )
-        if coreml_axis != 0:
-            coreml_axis_string = ["C", "H", "W"][coreml_axis - 2]
-            _add_argmax_or_argmin(
-                [input_name],
-                [output_name],
-                builder=builder,
-                node=node,
-                coreml_axis=coreml_axis_string,
-            )
-        else:  # coreml_axis corresponds to the sequence dimension
-            _add_transpose_before_after(
-                _add_argmax_or_argmin,
-                [input_name],
-                [output_name],
-                [1, 0, 2, 3],
-                builder=builder,
-                node=node,
-                coreml_axis=coreml_axis_string,
-            )
-
-    else:
-        coreml_axis_string = _get_coreml_axis([axis], builder, node, graph, err)
-        if coreml_axis_string not in ["C", "H", "W", "HW", "CHW"]:
-            return err.unsupported_op_configuration(
-                builder,
-                node,
-                graph,
-                "Unable to translate axes attribute to CoreML axis parameter for %s"
-                % axis,
-            )
-        _add_argmax_or_argmin(
-            [input_name],
-            [output_name],
-            builder=builder,
-            node=node,
-            coreml_axis=coreml_axis_string,
-        )
-
-    """
-    update output shape map
-    """
-    if _is_input_shape_mapping_defined(node, graph):
-        mapp = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-        if keepdims == 1:
-            graph.onnx_coreml_shape_mapping[node.outputs[0]] = mapp
-        else:
-            graph.onnx_coreml_shape_mapping[node.outputs[0]] = (
-                mapp[:axis] + mapp[axis + 1 :]
-            )
-
-
-def _convert_reduce(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-
-    # CoreML reduction supported along: C, H, W, CHW, HW
-
-    def _add_reduce(input_names, output_names, **kwargs):
-        input_name = input_names[0]
-        output_name = output_names[0]
-
-        if "add_log" in kwargs and kwargs["add_log"]:
-            if kwargs["node"].op_type == "ReduceLogSum":
-                output_name = output_names[0] + "_before_log"
-
-        kwargs["builder"].add_reduce(
-            name=kwargs["node"].name + "_" + output_name,
-            input_name=input_name,
-            output_name=output_name,
-            axis=kwargs["coreml_axis"],
-            mode=kwargs["mode"],
-        )
-
-        if "add_log" in kwargs and kwargs["add_log"]:
-            if node.op_type == "ReduceLogSum":
-                kwargs["builder"].add_unary(
-                    name=kwargs["node"].name + "_log",
-                    input_name=output_name,
-                    output_name=output_names[0],
-                    mode="log",
-                )
-
-    """
-    Conversion
-    """
-    input_name = node.inputs[0]
-    output_name = node.outputs[0]
-
-    axes = node.attrs.get("axes", None)
-    keepdims = node.attrs.get("keepdims", 1)
-
-    if axes is None:
-        if node.inputs[0] in graph.onnx_coreml_shape_mapping:
-            axes = range(0, len(graph.onnx_coreml_shape_mapping[node.inputs[0]]))
-        elif node.inputs[0] in graph.shape_dict:
-            axes = range(0, len(graph.shape_dict[node.inputs[0]]))
-        else:
-            return err.unsupported_op_configuration(
-                builder, node, graph, "Shape inference failed for reduce op"
-            )
-
-    if node.op_type == "ReduceMean":
-        mode = "avg"
-    elif node.op_type == "ReduceL1":
-        mode = "L1"
-    elif node.op_type == "ReduceL2":
-        mode = "L2"
-    elif node.op_type == "ReduceLogSum":
-        mode = "sum"
-    elif node.op_type == "ReduceMax":
-        mode = "max"
-    elif node.op_type == "ReduceMin":
-        mode = "min"
-    elif node.op_type == "ReduceProd":
-        mode = "prod"
-    elif node.op_type == "ReduceSum":
-        mode = "sum"
-    elif node.op_type == "ReduceSumSquare":
-        mode = "sumsquare"
-    else:
-        return err.unsupported_op_configuration(builder, node, graph, "Unsupported op")
-
-    if _is_input_shape_mapping_defined(node, graph):
-        mapp = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-        coreml_axis = ""
-        for ind in [["S", "B", "C", "H", "W"][mapp[i]] for i in axes]:
-            coreml_axis += ind
-        coreml_axis = "".join(sorted(coreml_axis))
-    else:
-        coreml_axis = _get_coreml_axis(axes, builder, node, graph, err)
-
-    if coreml_axis in ["C", "H", "W", "HW", "CHW"]:
-        _add_reduce(
-            [input_name],
-            [output_name],
-            builder=builder,
-            node=node,
-            coreml_axis=coreml_axis,
-            mode=mode,
-            add_log=True,
-        )
-    else:
-        if node.op_type in ["ReduceMean"]:
-            return err.unsupported_op_configuration(
-                builder,
-                node,
-                graph,
-                "Unable to translate axes attribute to CoreML axis parameter for %s"
-                % axes,
-            )
-        n = len(coreml_axis)
-        for i, ax in enumerate(coreml_axis):
-            if ax not in ["C", "H", "W"]:
-                return err.unsupported_op_configuration(
-                    builder,
-                    node,
-                    graph,
-                    "Unable to translate axes attribute to CoreML axis parameter for %s"
-                    % axes,
-                )
-            else:
-                if i == 0:
-                    iname = input_name
-                else:
-                    iname = input_name + str(i)
-                if i == n - 1:
-                    oname = output_name
-                else:
-                    oname = input_name + str(i + 1)
-                if i < n - 1:
-                    _add_reduce(
-                        [iname],
-                        [oname],
-                        builder=builder,
-                        node=node,
-                        coreml_axis=ax,
-                        mode=mode,
-                        add_log=False,
-                    )
-                else:
-                    _add_reduce(
-                        [iname],
-                        [oname],
-                        builder=builder,
-                        node=node,
-                        coreml_axis=ax,
-                        mode=mode,
-                        add_log=True,
-                    )
-
-    """
-    update output shape map
-    """
-    if _is_input_shape_mapping_defined(node, graph):
-        mapp = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-        if keepdims == 1:
-            graph.onnx_coreml_shape_mapping[node.outputs[0]] = mapp
-        else:
-            out_mapp = []
-            for i, m in enumerate(mapp):
-                if i not in axes:
-                    out_mapp.append(m)
-            if len(out_mapp) == 0:
-                out_mapp = [2]
-            graph.onnx_coreml_shape_mapping[node.outputs[0]] = out_mapp
-
-
-def _convert_softmax(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    def _add_softmax(input_names, output_names, **kwargs):
-        node = kwargs["node"]
-        builder = kwargs["builder"]
-
-        if node.op_type == "LogSoftmax":
-            builder.add_softmax(
-                name=node.name + "_softmax",  # type: ignore
-                input_name=node.inputs[0],
-                output_name=node.outputs[0] + "_softmax",
-            )
-            builder.add_unary(
-                name=node.name,
-                input_name=node.outputs[0] + "_softmax",
-                output_name=node.outputs[0],
-                mode="log",
-            )
-        else:
-            builder.add_softmax(
-                name=node.name, input_name=input_names[0], output_name=output_names[0]
-            )
-
-    axis = node.attrs.get("axis", 1)
-    if axis != 1:
-        return err.unsupported_op_configuration(
-            builder, node, graph, "Unsupported axis {} for softmax".format(axis,)
-        )
-
-    _add_softmax(node.inputs, node.outputs, node=node, builder=builder)
-
-    if node.inputs[0] in graph.onnx_coreml_shape_mapping:
-        mapp = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-        graph.onnx_coreml_shape_mapping[node.outputs[0]] = mapp
-
-
-def _convert_gemm(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-
-    """
-    operation:  alpha * (A * B) + beta * C
-    so far the case only handled is :
-    - B is a constant matrix
-    - C is a constant vector
-    - alpha == beta == 1.0
-    - transA is off
-    """
-
-    if node.attrs.get("transA", 0) != 0:
-        return err.unsupported_op_configuration(
-            builder,
-            node,
-            graph,
-            "This Gemm layer cannot be converted to CoreML inner_product layer",
-        )
-
-    if (
-        abs(node.attrs.get("alpha", 1.0) - 1.0) > 1e-3
-        or abs(node.attrs.get("beta", 1.0) - 1.0) > 1e-3
-    ):
-        return err.unsupported_op_configuration(
-            builder,
-            node,
-            graph,
-            "This Gemm layer cannot be converted to CoreML inner_product layer",
-        )
-
-    weight_name = node.inputs[1]
-    if weight_name in node.input_tensors:
-        W = node.input_tensors[weight_name]
-        if not node.attrs.get("transB", 0):
-            W = np.transpose(W)
-    else:
-        err.missing_initializer(node, "Second input to Gemm layer must be a constant")
-
-    b = None
-    if len(node.inputs) > 2:
-        b = (node.input_tensors[node.inputs[2]]).flatten()
-    if len(W.shape) != 2 or (b is not None and len(b.shape) != 1):
-        return err.unsupported_op_configuration(
-            builder,
-            node,
-            graph,
-            "This Gemm layer cannot be converted to CoreML inner_product layer",
-        )
-
-    if b is not None:
-        if W.shape[0] != b.shape[0]:
-            return err.unsupported_op_configuration(
-                builder,
-                node,
-                graph,
-                "This Gemm layer cannot be converted to CoreML inner_product layer",
-            )
-
-    if node.inputs[0] in graph.onnx_coreml_shape_mapping:
-        mapp = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-        if mapp == [1, 2] or mapp == [0, 2]:  # [B,C] or [S,C]
-            _add_inner_product(
-                [node.inputs[0]], node.outputs, W=W, b=b, node=node, builder=builder
-            )
-        elif mapp == [3, 4]:  # [H,W]
-            _add_transpose_before_after(
-                _add_inner_product,
-                [node.inputs[0]],
-                node.outputs,
-                [2, 3, 0, 1],
-                W=W,
-                b=b,
-                node=node,
-                builder=builder,
-            )
-        elif mapp == [2, 3]:  # (C,H)
-            _add_transpose_before_after(
-                _add_inner_product,
-                [node.inputs[0]],
-                node.outputs,
-                [1, 2, 0, 3],
-                W=W,
-                b=b,
-                node=node,
-                builder=builder,
-            )
-        elif mapp == [2, 4]:  # (C,W)
-            _add_transpose_before_after(
-                _add_inner_product,
-                [node.inputs[0]],
-                node.outputs,
-                [1, 3, 2, 0],
-                W=W,
-                b=b,
-                node=node,
-                builder=builder,
-            )
-        else:
-            return err.unsupported_op_configuration(
-                builder, node, graph, "CoreML incompatible axis placement"
-            )
-    else:
-        _add_inner_product(
-            [node.inputs[0]], node.outputs, W=W, b=b, node=node, builder=builder
-        )
-
-    if node.inputs[0] in graph.onnx_coreml_shape_mapping:
-        graph.onnx_coreml_shape_mapping[
-            node.outputs[0]
-        ] = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-
-
-def _convert_matmul(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-
-    weight_name = node.inputs[1]
-    if weight_name in node.input_tensors:
-        W = node.input_tensors[weight_name]
-    else:
-        err.missing_initializer(node, "Second input to Matmul layer must be a constant")
-
-    if len(W.shape) != 2:
-        return err.unsupported_op_configuration(
-            builder,
-            node,
-            graph,
-            "This Matmul layer cannot be converted to CoreML inner_product layer",
-        )
-
-    W = np.transpose(W)
-
-    if node.inputs[0] in graph.onnx_coreml_shape_mapping:
-        mapp = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-        if mapp == [1, 2] or mapp == [0, 2]:  # [B,C] or [S,C]
-            _add_inner_product(
-                [node.inputs[0]], node.outputs, W=W, b=None, node=node, builder=builder
-            )
-        elif mapp == [3, 4]:  # [H,W]
-            _add_transpose_before_after(
-                _add_inner_product,
-                [node.inputs[0]],
-                node.outputs,
-                [2, 3, 0, 1],
-                W=W,
-                b=None,
-                node=node,
-                builder=builder,
-            )
-        elif mapp == [2, 3]:  # (C,H)
-            _add_transpose_before_after(
-                _add_inner_product,
-                [node.inputs[0]],
-                node.outputs,
-                [1, 2, 0, 3],
-                W=W,
-                b=None,
-                node=node,
-                builder=builder,
-            )
-        elif mapp == [2, 4]:  # (C,W)
-            _add_transpose_before_after(
-                _add_inner_product,
-                [node.inputs[0]],
-                node.outputs,
-                [1, 3, 2, 0],
-                W=W,
-                b=None,
-                node=node,
-                builder=builder,
-            )
-        else:
-            return err.unsupported_op_configuration(
-                builder, node, graph, "CoreML incompatible axis placement"
-            )
-    else:
-        _add_inner_product(
-            [node.inputs[0]], node.outputs, W=W, b=None, node=node, builder=builder
-        )
-
-    if node.inputs[0] in graph.onnx_coreml_shape_mapping:
-        graph.onnx_coreml_shape_mapping[
-            node.outputs[0]
-        ] = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-
-
-def _convert_lrn(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    alpha = node.attrs.get("alpha", 1.0e-4)
-    beta = node.attrs.get("beta", 0.75)
-    bias = node.attrs.get("bias", 1.0)
-    size = node.attrs["size"]
-    builder.add_lrn(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        alpha=alpha,
-        beta=beta,
-        k=bias,
-        local_size=size,
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_sigmoid(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    builder.add_activation(
-        name=node.name,
-        non_linearity="SIGMOID",
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_sign(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    builder.add_activation(
-        name=node.name,
-        non_linearity="SIGMOID_HARD",
-        input_name=node.inputs[0],
-        output_name=node.outputs[0] + "_step",
-        params=[10000, 0],
-    )
-    builder.add_elementwise(
-        name=node.name + "_subtract_half",
-        input_names=node.outputs[0] + "_step",
-        output_name=node.outputs[0] + "_step_half",
-        mode="ADD",
-        alpha=-0.5,
-    )
-    builder.add_elementwise(
-        name=node.name + "_multiply_2",
-        input_names=node.outputs[0] + "_step_half",
-        output_name=node.outputs[0],
-        mode="MULTIPLY",
-        alpha=2,
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_elu(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    alpha = node.attrs.get("alpha", 1.0)
-    builder.add_activation(
-        name=node.name,
-        non_linearity="ELU",
-        params=alpha,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_selu(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    alpha = node.attrs.get("alpha", 1.6732)
-    gamma = node.attrs.get("gamma", 1.0507)
-    builder.add_activation(
-        name=node.name + "_elu",  # type: ignore
-        non_linearity="ELU",
-        params=alpha,
-        input_name=node.inputs[0],
-        output_name=node.inputs[0] + "_elu",
-    )
-    builder.add_elementwise(
-        name=node.name,
-        input_names=node.inputs[0] + "_elu",
-        output_name=node.outputs[0],
-        mode="MULTIPLY",
-        alpha=gamma,
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_prelu(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    if node.inputs[1] not in node.input_tensors:
-        return err.unsupported_op_configuration(
-            builder, node, graph, "Slope must be known!"
-        )
-
-    slope = node.input_tensors[node.inputs[1]]
-    builder.add_activation(
-        name=node.name,
-        non_linearity="PRELU",
-        params=slope,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_tanh(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    builder.add_activation(
-        name=node.name,
-        non_linearity="TANH",
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_pad(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    def _get_pad_params(builder, node, graph, err, params_dict, axis=None):
-
-        pads = node.attrs["pads"]
-        if not (len(pads) % 2 == 0 and len(pads) >= 2):
-            return err.unsupported_op_configuration(
-                builder,
-                node,
-                graph,
-                "pads attribute: {}."
-                "Length of pads must be a multiple of 2".format(str(pads)),
-            )
-        if len(pads) == 8:
-            az = pads[:2] + pads[4:6]
-            if az.count(0) != len(az):
-                return err.unsupported_op_configuration(
-                    builder,
-                    node,
-                    graph,
-                    "Paddings value {} not supported".format(pads,),
-                )
-            pads = pads[2:4] + pads[6:8]
-
-        if len(pads) == 6:
-            az = pads[:2] + pads[3:5]
-            if az.count(0) != len(az):
-                return err.unsupported_op_configuration(
-                    builder,
-                    node,
-                    graph,
-                    "Paddings value {} not supported".format(pads,),
-                )
-            pads = [pads[2], pads[5]]
-
-        pad_t, pad_b, pad_l, pad_r = 0, 0, 0, 0
-        if axis == "height":
-            pad_t, pad_b = pads
-        elif axis == "width":
-            pad_l, pad_r = pads
-        else:
-            pad_t, pad_l, pad_b, pad_r = pads
-        params_dict["pad_t"] = pad_t
-        params_dict["pad_b"] = pad_b
-        params_dict["pad_l"] = pad_l
-        params_dict["pad_r"] = pad_r
-
-    def _add_pad(input_names, output_names, **kwargs):
-        params_dict = kwargs["params_dict"]
-        node = kwargs["node"]
-        builder = kwargs["builder"]
-        builder.add_padding(
-            name=node.name,
-            left=params_dict["pad_l"],
-            right=params_dict["pad_r"],
-            top=params_dict["pad_t"],
-            bottom=params_dict["pad_b"],
-            value=params_dict["value"],
-            input_name=input_names[0],
-            output_name=output_names[0],
-            padding_type=params_dict["mode"],
-        )
-
-    params_dict = dict()
-    mode = node.attrs["mode"]
-    if mode == "reflect" or mode == b"reflect":
-        mode = "reflection"
-    elif mode == "edge" or mode == b"edge":
-        mode = "replication"
-    else:
-        mode = "constant"
-    params_dict["mode"] = mode
-    params_dict["value"] = node.attrs.get("value", 0.0)
-
-    _add_conv_like_op(_add_pad, _get_pad_params, params_dict, builder, node, graph, err)
-
-    # update map
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_slice(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-
-    if _is_no_op(builder, node, graph, err):
-        return
-
-    def _add_slice(input_names, output_names, **kwargs):
-        node = kwargs["node"]
-        builder = kwargs["builder"]
-        params_dict = kwargs["params_dict"]
-        builder.add_slice(
-            name=node.name + "_" + output_names[0],
-            input_name=input_names[0],
-            output_name=output_names[0],
-            axis=params_dict["axis"],
-            start_index=params_dict["start_index"],
-            end_index=params_dict["end_index"],
-            stride=1,
-        )
-
-    params_dict = dict()
-    starts = node.attrs["starts"]
-    ends = node.attrs["ends"]
-    axes = node.attrs.get("axes", range(len(starts)))
-
-    if node.inputs[0] in graph.shape_dict:
-        for ii, _ in enumerate(axes):
-            if ends[ii] > INT_MAX:
-                ends[ii] = graph.shape_dict[node.inputs[0]][ii]
-
-    if _is_input_shape_mapping_defined(node, graph):
-        mapp = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-        r = len(starts)
-        for i, ax in enumerate(axes):
-            params_dict["start_index"] = starts[i]
-            params_dict["end_index"] = ends[i]
-            if i == 0:
-                iname = node.inputs[0]
-            else:
-                iname = node.inputs[0] + str(i)
-            oname = node.inputs[0] + str(i + 1)
-            if i == r - 1:
-                oname = node.outputs[0]
-
-            if mapp[ax] == 2:
-                params_dict["axis"] = "channel"
-                _add_slice(
-                    [iname],
-                    [oname],
-                    node=node,
-                    builder=builder,
-                    params_dict=params_dict,
-                )
-            elif mapp[ax] == 3:
-                params_dict["axis"] = "height"
-                _add_slice(
-                    [iname],
-                    [oname],
-                    node=node,
-                    builder=builder,
-                    params_dict=params_dict,
-                )
-            elif mapp[ax] == 4:
-                params_dict["axis"] = "width"
-                _add_slice(
-                    [iname],
-                    [oname],
-                    node=node,
-                    builder=builder,
-                    params_dict=params_dict,
-                )
-            elif mapp[ax] == 0:
-                params_dict["axis"] = "channel"
-                _add_transpose_before_after(
-                    _add_slice,
-                    [iname],
-                    [oname],
-                    [1, 0, 2, 3],
-                    node=node,
-                    builder=builder,
-                    params_dict=params_dict,
-                )
-            else:
-                err.unsupported_op_configuration(
-                    builder, node, graph, "cannot slice along batch axis"
-                )
-    else:
-        params_dict["start_index"] = starts[0]
-        params_dict["end_index"] = ends[0]
-        input_shape = graph.shape_dict.get(node.inputs[0], None)
-        if len(axes) != 1:
-            return err.unsupported_op_configuration(
-                builder, node, graph, "Only single axis Slice is supported now"
-            )
-        if input_shape and len(input_shape) == 4 and len(axes) == 1:
-            axis = ["B", "channel", "height", "width"][axes[0]]
-        elif len(axes) == 1:
-            if axes[0] == 0:
-                axis = "channel"
-            elif axes[0] == 1:
-                axis = "height"
-            elif axes[0] == 2:
-                axis = "width"
-            else:
-                return err.unsupported_op_configuration(
-                    builder,
-                    node,
-                    graph,
-                    "Slice is supported only along H, W or C dimensions",
-                )
-        else:
-            return err.unsupported_op_configuration(
-                builder,
-                node,
-                graph,
-                "Slice is supported only along one axis for 3D or 4D Tensors",
-            )
-        params_dict["axis"] = axis
-        _add_slice(
-            node.inputs,
-            node.outputs,
-            builder=builder,
-            node=node,
-            params_dict=params_dict,
-        )
-
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_exp(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    builder.add_unary(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        mode="exp",
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_pow(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-
-    input2 = node.inputs[1]
-    is_supported = False
-    if input2 in node.input_tensors:
-        alpha = node.input_tensors[input2]
-        if len(alpha.shape) == 0:
-            is_supported = True
-
-    if not is_supported:
-        err.missing_initializer(
-            node, "Only mode supported is when the second input is a scalar constant"
-        )
-
-    builder.add_unary(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        mode="power",
-        alpha=float(alpha),
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_flatten(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    def _add_flatten(input_names, output_names, **kwargs):
-        kwargs["builder"].add_flatten(
-            name=kwargs["node"].name,
-            input_name=input_names[0],
-            output_name=output_names[0],
-            mode=0,
-        )
-
-    axis = node.attrs.get("axis", 1)
-    if node.inputs[0] in graph.onnx_coreml_shape_mapping:
-        mapp = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-        if (mapp[0] == 0 or mapp[0] == 1) and (axis == 0 or axis == 1):
-            _add_flatten(node.inputs, node.outputs, builder=builder, node=node)
-        elif mapp[0:2] == [0, 1] and axis == 2:
-            _add_flatten(node.inputs, node.outputs, builder=builder, node=node)
-        elif len(mapp) == 1 and axis == 1 and mapp[0] < 4:
-            _add_flatten(node.inputs, node.outputs, builder=builder, node=node)
-        else:
-            return err.unsupported_op_configuration(
-                builder, node, graph, "Flatten axis mode not supported"
-            )
-    else:
-        _add_flatten(node.inputs, node.outputs, builder=builder, node=node)
-
-    if node.inputs[0] in graph.onnx_coreml_shape_mapping:
-        mapp = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-        if len(mapp) == 1 and axis == 1 and mapp[0] < 4:
-            graph.onnx_coreml_shape_mapping[node.outputs[0]] = [mapp[0], mapp[0] + 1]
-        else:
-            graph.onnx_coreml_shape_mapping[node.outputs[0]] = [mapp[0], 2]
-
-
-def _convert_max(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    _convert_broadcast_op(builder, node, graph, err, "MAX")
-
-
-def _convert_min(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    _convert_broadcast_op(builder, node, graph, err, "MIN")
-
-
-def _convert_softsign(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    builder.add_activation(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        non_linearity="SOFTSIGN",
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_softplus(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    builder.add_activation(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        non_linearity="SOFTPLUS",
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_hardsigmoid(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    alpha = node.attrs.get("alpha", 0.2)
-    beta = node.attrs.get("beta", 0.5)
-    builder.add_activation(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        non_linearity="SIGMOID_HARD",
-        params=[alpha, beta],
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_neg(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    builder.add_elementwise(
-        name=node.name,
-        input_names=node.inputs,
-        output_name=node.outputs[0],
-        mode="MULTIPLY",
-        alpha=-1.0,
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_log(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    builder.add_unary(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        mode="log",
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_sqrt(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    builder.add_unary(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        mode="sqrt",
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_reciprocal(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    builder.add_unary(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        mode="inverse",
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_reorganize_data(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    mode = "SPACE_TO_DEPTH"
-    if node.op_type == "DepthToSpace":
-        mode = "DEPTH_TO_SPACE"
-    block_size = node.attrs.get("blocksize", 2)
-    builder.add_reorganize_data(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        mode=mode,
-        block_size=block_size,
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_upsample(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    if "scales" in node.attrs:
-        scales = node.attrs["scales"]
-        if len(scales) != 4 or scales[0] != 1.0 or scales[1] != 1.0:
-            err.unsupported_op_configuration(
-                builder,
-                node,
-                graph,
-                "Unsupported scales {} for upsample".format(scales),
-            )
-        height_scale = int(scales[2])
-        width_scale = int(scales[3])
-    elif len(node.input_tensors):
-        key = next(iter(node.input_tensors.keys()))
-        scales = node.input_tensors[key]
-        height_scale = int(scales[2])
-        width_scale = int(scales[3])
-    else:
-        if len(node.inputs) > 1:
-            return err.unsupported_op_configuration(
-                builder,
-                node,
-                graph,
-                "This ONNX upsample layer has 'scales' provided as an input. CoreML upsample requires 'scales' as an attribute of the layer.",
-            )
-        height_scale = int(node.attrs.get("height_scale", 1))
-        width_scale = int(node.attrs.get("width_scale", 1))
-    mode_convert = {
-        "nearest": "NN",
-        "linear": "BILINEAR",
-    }
-    mode = mode_convert[node.attrs["mode"].decode("UTF-8")]
-    builder.add_upsample(
-        name=node.name,
-        scaling_factor_h=height_scale,
-        scaling_factor_w=width_scale,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        mode=mode,
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_clip(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    # clip(x, a, b) = max(min(x, a), b) = -min(-min(x, a), -b)
-
-    if node.attrs.get("max") is None:
-        min_limit = node.attrs.get("min", float(-(2 ** 16) - 1))
-        builder.add_unary(
-            name=node.name,
-            input_name=node.inputs[0],
-            output_name=node.outputs[0],
-            mode="threshold",
-            alpha=min_limit,
-            shift=0,
-            scale=1.0,
-        )
-    elif node.attrs.get("min") is None:
-        max_limit = node.attrs.get("max", float(2 ** 16 - 1))
-        builder.add_unary(
-            name=node.name + "_min_minus_x_minus_b",
-            input_name=node.inputs[0],
-            output_name=node.inputs[0] + "_min_minus_x_minus_b",
-            mode="threshold",
-            alpha=-max_limit,
-            shift=0,
-            scale=-1.0,
-        )
-
-        builder.add_activation(
-            name=node.name,
-            non_linearity="LINEAR",
-            input_name=node.inputs[0] + "_min_minus_x_minus_b",
-            output_name=node.outputs[0],
-            params=[-1.0, 0],
-        )
-
-    else:
-        min_limit = node.attrs.get("min")
-        max_limit = node.attrs.get("max")
-        builder.add_unary(
-            name=node.name + "_min_x_a",
-            input_name=node.inputs[0],
-            output_name=node.inputs[0] + "_min_x_a",
-            mode="threshold",
-            alpha=min_limit,
-            shift=0,
-            scale=1.0,
-        )
-
-        builder.add_unary(
-            name=node.name + "_min_minus_x_minus_b",
-            input_name=node.inputs[0] + "_min_x_a",
-            output_name=node.inputs[0] + "_min_minus_x_minus_b",
-            mode="threshold",
-            alpha=-max_limit,
-            shift=0,
-            scale=-1.0,
-        )
-
-        builder.add_activation(
-            name=node.name,
-            non_linearity="LINEAR",
-            input_name=node.inputs[0] + "_min_minus_x_minus_b",
-            output_name=node.outputs[0],
-            params=[-1.0, 0],
-        )
-
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_mvn(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    builder.add_mvn(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        across_channels=node.attrs.get("across_channels", 0),
-        normalize_variance=node.attrs.get("normalize_variance", 1),
-        epsilon=1e-5,
-    )
-    _update_shape_mapping_unchanged(node, graph, err)
-
-
-def _convert_lstm(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    W_name = node.inputs[1]
-    R_name = node.inputs[2]
-    B = None
-    if len(node.inputs) > 3:
-        B_name = node.inputs[3]
-        B = node.input_tensors.get(B_name, None)
-    W = node.input_tensors.get(W_name, None)
-    R = node.input_tensors.get(R_name, None)
-    if W is None:
-        err.missing_initializer(
-            node, "Weight tensor: {} not found in the graph initializer".format(W_name,)
-        )
-    if R is None:
-        err.missing_initializer(
-            node, "Weight tensor: {} not found in the graph initializer".format(R_name,)
-        )
-
-    h = node.attrs["hidden_size"]
-    W_i, W_o, W_f, W_c = np.split(np.squeeze(W), 4)  # type: ignore
-    R_i, R_o, R_f, R_c = np.split(np.squeeze(R), 4)  # type: ignore
-    x = W_i.shape[1]
-    h = W_i.shape[0]
-    W_x = [W_i, W_f, W_o, W_c]
-    W_h = [R_i, R_f, R_o, R_c]
-    b = None
-    if B is not None:
-        b_Wi, b_Wo, b_Wf, b_Wc, b_Ri, b_Ro, b_Rf, b_Rc = np.split(np.squeeze(B), 8)  # type: ignore
-        b = [b_Wi + b_Ri, b_Wf + b_Rf, b_Wo + b_Ro, b_Wc + b_Rc]
-
-    input_h = node.inputs[5] if len(node.inputs) > 5 else node.inputs[0] + "_h_input"
-    input_c = node.inputs[6] if len(node.inputs) > 6 else node.inputs[0] + "_c_input"
-    output_h = (
-        node.outputs[1] if len(node.outputs) > 1 else node.outputs[0] + "_h_output"
-    )
-    output_c = (
-        node.outputs[2] if len(node.outputs) > 2 else node.outputs[0] + "_c_output"
-    )
-
-    graph.optional_inputs.append((input_h, (h)))
-    graph.optional_inputs.append((input_c, (h)))
-    graph.optional_outputs.append((output_h, (h)))
-    graph.optional_outputs.append((output_c, (h)))
-
-    builder.add_unilstm(
-        name=node.name,
-        W_h=W_h,
-        W_x=W_x,
-        b=b,
-        hidden_size=h,
-        input_size=x,
-        input_names=[node.inputs[0], input_h, input_c],
-        output_names=[node.outputs[0], output_h, output_c],
-        inner_activation="SIGMOID",
-        cell_state_update_activation="TANH",
-        output_activation="TANH",
-        peep=None,
-        output_all=True,
-        forget_bias=False,
-        coupled_input_forget_gate=False,
-        cell_clip_threshold=50000.0,
-        reverse_input=False,
-    )
-
-    if _is_input_shape_mapping_defined(node, graph):
-        graph.onnx_coreml_shape_mapping[
-            node.outputs[0]
-        ] = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-        graph.onnx_coreml_shape_mapping[
-            node.outputs[1]
-        ] = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-        graph.onnx_coreml_shape_mapping[
-            node.outputs[2]
-        ] = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-
-
-def _convert_custom(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    params = NeuralNetwork_pb2.CustomLayerParams()
-    params.className = node.op_type
-    params.description = "Custom layer that corresponds to the ONNX op {}".format(
-        node.op_type,
-    )
-
-    inputs_ = []
-    # skip the inputs that are initializers
-    for inp in node.inputs:
-        if inp not in node.input_tensors:
-            inputs_.append(inp)
-
-    builder.add_custom(
-        name=node.name,
-        input_names=inputs_,
-        output_names=node.outputs,
-        custom_proto_spec=params,
-    )
-    err.custom_layer_nodes.append(node)
-
-
-def _convert_identity(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    builder.add_activation(
-        name=node.name,
-        non_linearity="LINEAR",
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        params=[1.0, 0.0],
-    )
-    if _is_input_shape_mapping_defined(node, graph):
-        mapp = graph.onnx_coreml_shape_mapping[node.inputs[0]]
-        mapp_out = []
-        if node.op_type == "Squeeze":
-            axes = node.attrs.get("axes", None)
-            if axes is None:
-                if node.inputs[0] not in graph.shape_dict:
-                    return err.unsupported_op_configuration(
-                        builder, node, graph, "shape not known"
-                    )
-                else:
-                    ishape = graph.shape_dict[node.inputs[0]]
-                    if ishape.count(1) == len(ishape):
-                        mapp_out = [2]
-                    else:
-                        for i, d in enumerate(ishape):
-                            if d != 1:
-                                mapp_out.append(mapp[i])
-            else:
-                for i, a in enumerate(mapp):
-                    if i in axes:
-                        continue
-                    else:
-                        mapp_out.append(a)
-                if len(mapp_out) == 0:
-                    mapp_out = [2]
-        elif node.op_type == "Unsqueeze":
-            axes = node.attrs["axes"]
-            available_set = [0, 1, 2, 3, 4]
-            for d in mapp:
-                if d in available_set:
-                    available_set.remove(d)
-            if len(axes) > len(available_set):
-                return err.unsupported_op_configuration(
-                    builder,
-                    node,
-                    graph,
-                    "cannot unsqueeze to a dimension greater than 5",
-                )
-            mapp_out = [1] * (len(axes) + len(mapp))
-            mapp_ptr = 0
-            available_set_ptr = 0
-            for i in range(len(mapp_out)):
-                if i in axes:
-                    mapp_out[i] = available_set[available_set_ptr]
-                    available_set_ptr += 1
-                else:
-                    mapp_out[i] = mapp[mapp_ptr]
-                    mapp_ptr += 1
-        else:
-            raise ValueError("convert_identity incorrectly called")
-        graph.onnx_coreml_shape_mapping[node.outputs[0]] = mapp_out
-
-
-def _convert_const(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-
-    mapp = None
-    for input_ in node.inputs:
-        if input_ in graph.onnx_coreml_shape_mapping:
-            mapp = graph.onnx_coreml_shape_mapping[input_]
-
-    for name, value in node.input_tensors.items():
-        output_name = name
-        if name not in graph.constant_layers_added:
-            add_transpose_later = False
-            shape = value.shape
-            coreml_shape = [1, 1, 1]
-            if len(shape) == 0:
-                graph.onnx_coreml_shape_mapping[name] = [2]  # [C]
-            elif len(shape) == 3:
-                coreml_shape = list(shape)
-                graph.onnx_coreml_shape_mapping[name] = [2, 3, 4]  # [C,H,W]
-            elif len(shape) == 1:
-                coreml_shape = [shape[0], 1, 1]
-                graph.onnx_coreml_shape_mapping[name] = [2]  # [C]
-            elif len(shape) == 2:
-                coreml_shape = [1, shape[0], shape[1]]
-                if mapp is not None and (mapp == [1, 2] or mapp == [0, 2]):
-                    add_transpose_later = True
-                    transpose_dims = [2, 3, 0, 1]
-                    graph.onnx_coreml_shape_mapping[name] = [0, 2]  # [S,C]
-                else:
-                    graph.onnx_coreml_shape_mapping[name] = [3, 4]  # [H,W]
-            else:
-                return err.unsupported_op_configuration(
-                    builder,
-                    node,
-                    graph,
-                    "unable to translate constant array shape to CoreML shape",
-                )
-
-            if add_transpose_later:
-                output_name += "_pre_transpose"
-            builder.add_load_constant(
-                name=output_name,
-                output_name=output_name,
-                constant_value=value.flatten(),
-                shape=coreml_shape,
-            )
-            if add_transpose_later:
-                builder.add_permute(
-                    name=name,
-                    dim=transpose_dims,
-                    input_name=output_name,
-                    output_name=name,
-                )
-
-            graph.constant_layers_added[output_name] = True
-
-
-_ONNX_NODE_REGISTRY = {
-    "Abs": _convert_abs,
-    "Add": _convert_add,
-    "ArgMax": _convert_argmax,
-    "ArgMin": _convert_argmax,
-    "AveragePool": _convert_pool,
-    "BatchNormalization": _convert_bn,
-    "Clip": _convert_clip,
-    "Concat": _convert_concat,
-    "Conv": _convert_conv,
-    "ConvTranspose": _convert_conv,
-    "DepthToSpace": _convert_reorganize_data,
-    "Div": _convert_div,
-    "Elu": _convert_elu,
-    "Exp": _convert_exp,
-    "Flatten": _convert_flatten,  # Todo: handle more cases
-    "Gemm": _convert_gemm,
-    "GlobalAveragePool": _convert_pool,
-    "GlobalMaxPool": _convert_pool,
-    "HardSigmoid": _convert_hardsigmoid,
-    "InstanceNormalization": _convert_instancenorm,
-    "LeakyRelu": _convert_leaky_relu,
-    "Log": _convert_log,
-    "LogSoftmax": _convert_softmax,
-    "LRN": _convert_lrn,
-    "LSTM": _convert_lstm,
-    "MatMul": _convert_matmul,
-    "Max": _convert_max,
-    "MaxPool": _convert_pool,
-    "Mean": _convert_mean,
-    "MeanVarianceNormalization": _convert_mvn,
-    "Min": _convert_min,
-    "Mul": _convert_mul,
-    "Neg": _convert_neg,
-    "Pad": _convert_pad,
-    "Pow": _convert_pow,
-    "PRelu": _convert_prelu,
-    "Reciprocal": _convert_reciprocal,
-    "ReduceL1": _convert_reduce,
-    "ReduceL2": _convert_reduce,
-    "ReduceLogSum": _convert_reduce,
-    "ReduceMax": _convert_reduce,
-    "ReduceMean": _convert_reduce,
-    "ReduceMin": _convert_reduce,
-    "ReduceProd": _convert_reduce,
-    "ReduceSum": _convert_reduce,
-    "ReduceSumSquare": _convert_reduce,
-    "Relu": _convert_relu,
-    "Reshape": _convert_reshape,
-    "Selu": _convert_selu,
-    "Sigmoid": _convert_sigmoid,
-    "Sign": _convert_sign,
-    "Slice": _convert_slice,
-    "Softmax": _convert_softmax,  # Todo: handle more cases
-    "Softplus": _convert_softplus,
-    "Softsign": _convert_softsign,
-    "SpaceToDepth": _convert_reorganize_data,
-    "SpatialBN": _convert_bn,
-    "Split": _convert_split,
-    "Sqrt": _convert_sqrt,
-    "Squeeze": _convert_identity,
-    "Sub": _convert_sub,
-    "Sum": _convert_add,
-    "Tanh": _convert_tanh,
-    "ThresholdedRelu": _convert_thresholdedrelu,
-    "Transpose": _convert_transpose,
-    "Unsqueeze": _convert_identity,
-    "Upsample": _convert_upsample,
-}
-
-_SEQUENCE_LAYERS_REGISTRY = set(["LSTM"])
-
-_CONST_INPUT_ALLOWED_LAYERS = set(
-    ["Add", "Sub", "Sum", "Mul", "Concat", "Max", "Min", "Div", "Reciprocal"]
-)
-
-
-def _get_node_converter_fn(
-    builder, node, err
-):  # type: (NeuralNetworkBuilder, Node, ErrorHandling) -> Callable[[NeuralNetworkBuilder, Node, Graph, ErrorHandling], None]
-    """
-    Get the right converter function for ONNX node op_type
-    """
-    op_type = node.op_type
-    # Return custom conversion function if provided
-    # If both node type and node name custom function
-    # is provided, then use node name specific custom function, as
-    # type specific custom function is more generic than name specific
-    if node.name in err.custom_conversion_functions:
-        return err.custom_conversion_functions[node.name]
-    elif op_type in err.custom_conversion_functions:
-        return err.custom_conversion_functions[op_type]
-    elif op_type in _ONNX_NODE_REGISTRY:
-        return _ONNX_NODE_REGISTRY[op_type]
-    else:
-        return err.unsupported_op(node)
-
-
-def _add_const_inputs_if_required(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    if node.op_type in _CONST_INPUT_ALLOWED_LAYERS:
-        if len(node.input_tensors) > 0:
-            _convert_const(builder, node, graph, err)
-
-
-def _convert_node(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    converter_fn = _get_node_converter_fn(builder, node, err)
-    return converter_fn(builder, node, graph, err)
diff --git a/coremltools/converters/onnx/_operators_nd.py b/coremltools/converters/onnx/_operators_nd.py
deleted file mode 100644
index 5e40b3be2..000000000
--- a/coremltools/converters/onnx/_operators_nd.py
+++ /dev/null
@@ -1,2773 +0,0 @@
-
-import numpy as np
-import copy
-
-from typing import Sequence, Callable, List, Tuple, Optional, Text, Any
-from coremltools.models.neural_network import NeuralNetworkBuilder  # type: ignore
-from onnx import TensorProto
-from ._graph import Node, Graph
-from coremltools.proto import NeuralNetwork_pb2  # type: ignore
-from ._error_utils import ErrorHandling
-
-from ._operators import (
-    _convert_abs,
-    _convert_relu,
-    _convert_sqrt,
-    _convert_exp,
-    _convert_elu,
-    _convert_selu,
-    _convert_sigmoid,
-    _convert_sign,
-    _convert_prelu,
-    _convert_upsample,
-    _convert_softsign,
-    _convert_softplus,
-    _convert_log,
-    _convert_neg,
-    _convert_reciprocal,
-    _convert_hardsigmoid,
-    _convert_reorganize_data,
-    _add_pool,
-    _get_pool_params,
-    _add_conv,
-    _get_conv_params,
-    _convert_thresholdedrelu,
-    _convert_leaky_relu,
-    _convert_lrn,
-)
-
-from ._operators import _convert_pad as _convert_pad_5d
-
-INT_MAX = 2 ** 63 - 1
-
-
-## Helper functions
-def load_input_constants(builder, node, graph, err):
-    for i in range(len(node.inputs)):
-        if (
-            node.inputs[i] in node.input_tensors
-            and node.inputs[i] not in graph.constants_loaded
-        ):
-            value = node.input_tensors[node.inputs[i]]
-            builder.add_load_constant_nd(
-                name=node.name + "_load_constant_" + str(i),
-                output_name=node.inputs[i],
-                constant_value=value,
-                shape=[1] if value.shape == () else value.shape,
-            )
-            graph.constants_loaded.add(node.inputs[i])
-
-
-def _add_conv_like_op(
-    add_func, get_params_func, params_dict, builder, node, graph, err
-):
-    rank = builder._get_rank(node.inputs[0])
-    if rank == 4:
-        get_params_func(builder, node, graph, err, params_dict)
-        add_func(
-            node.inputs,
-            node.outputs,
-            params_dict=params_dict,
-            builder=builder,
-            node=node,
-            graph=graph,
-            err=err,
-        )
-    elif rank == 3:
-        axes = [0, 3]
-        # Make 5d tensor
-        expanded_node_output = node.name + "_" + node.inputs[0] + "_expanded"
-        builder.add_expand_dims(
-            name=node.name + "_ip_expand",
-            input_name=node.inputs[0],
-            output_name=expanded_node_output,
-            axes=axes,
-        )
-        node.inputs[0] = expanded_node_output
-        output_name = node.outputs[0]
-        node.outputs[0] = node.name + "_" + output_name + "_expanded"
-        # Add conversion op
-        get_params_func(builder, node, graph, err, params_dict, axis="width")
-        add_func(
-            node.inputs,
-            node.outputs,
-            params_dict=params_dict,
-            builder=builder,
-            node=node,
-            graph=graph,
-            err=err,
-        )
-        # Make 3d tensor back
-        builder.add_squeeze(
-            name=node.name + "_ip_squeeze_out",
-            input_name=node.outputs[0],
-            output_name=output_name,
-            axes=axes,
-        )
-    else:
-        return err.unsupported_op_configuration(
-            builder, node, graph, "provided number axes {} not supported".format(rank)
-        )
-
-
-def add_broadcastable_op_chain(builder, node, err, add_op_function):
-    """
-    Splits list of input into chain of operator with two inputs
-    where output of first node is fed into next one until the final input
-    is processed
-    Pass node:            Node to be converted
-         add_op_function: Conversion function to be used
-    """
-    total_nodes = len(node.inputs)
-
-    if total_nodes < 2:
-        # TODO: Skip or CopyProp + DeadCode elimination
-        builder.add_activation(
-            name=node.name,
-            non_linearity="LINEAR",
-            input_name=node.inputs[0],
-            output_name=node.outputs[0],
-            params=[1.0, 0.0],
-        )
-    elif total_nodes == 2:
-        add_op_function(
-            name=node.name, input_names=node.inputs, output_name=node.outputs[0]
-        )
-    else:
-        decorator = 0
-        out_name = node.outputs[0]
-        # Add broadcastable layer for first two inputs
-        add_op_function(
-            name=node.name,
-            input_names=[node.inputs[0], node.inputs[1]],
-            output_name=out_name + "_" + str(decorator),
-        )
-        # Continue chain of broadcastable layers
-        for i in range(2, total_nodes - 1):
-            add_op_function(
-                name=node.name,
-                input_names=[out_name + "_" + str(decorator), node.inputs[i]],
-                output_name=out_name + "_" + str(decorator + 1),
-            )
-            decorator += 1
-        # End chain of broadcastable layers with final output
-        add_op_function(
-            name=node.name + "_" + str(decorator),
-            input_names=[out_name + "_" + str(decorator), node.inputs[total_nodes - 1]],
-            output_name=out_name,
-        )
-
-
-def add_bn_with_expansion(
-    builder,
-    node,
-    err,
-    node_name,
-    input_name,
-    output_name,
-    channels,
-    scale,
-    bias,
-    mean=None,
-    var=None,
-    epsilon=None,
-    compute_mean_var=False,
-    instance_normalization=False,
-    axes_for_expansion=[],
-):
-    real_input_name = input_name
-    real_output_name = output_name
-
-    # Expand input if needed
-    if len(axes_for_expansion) != 0:
-        input_name = node_name + "_" + input_name + "_expanded"
-        output_name = output_name + "_expanded"
-        builder.add_expand_dims(
-            name=node_name + "_expand",
-            input_name=real_input_name,
-            output_name=input_name,
-            axes=axes_for_expansion,
-        )
-
-    builder.add_batchnorm(
-        name=node.name,
-        channels=channels,
-        gamma=scale,
-        beta=bias,
-        mean=mean,
-        variance=var,
-        input_name=input_name,
-        output_name=output_name,
-        compute_mean_var=compute_mean_var,
-        instance_normalization=instance_normalization,
-        epsilon=epsilon,
-    )
-
-    # Squeeze output if needed
-    if len(axes_for_expansion) != 0:
-        builder.add_squeeze(
-            name=node_name + "_squeeze",
-            input_name=output_name,
-            output_name=real_output_name,
-            axes=axes_for_expansion,
-        )
-
-
-# Helper function to convert RandomNormal, RandomUniform and it's variants
-def add_random(builder, node, graph, err, add_op_function):
-    # Ignoring attribute `dtype` as CoreML internally represents tensors into 'Float'
-    mean = node.attrs.get("mean", 0.0)
-    scale = node.attrs.get("scale", 1.0)
-    seed = node.attrs.get("seed", -1)
-    shape = node.attrs.get("shape", None)
-    if shape is None:
-        return err.unsupported_op_configuration(
-            builder, node, graph, "Shape not provided"
-        )
-    add_op_function(
-        name=node.name,
-        output_name=node.outputs[0],
-        output_shape=shape,
-        mean=mean,
-        stddev=scale,
-        seed=seed,
-    )
-
-
-## Converter functions
-
-
-def _convert_acos(builder, node, graph, err):
-    """
-    convert to CoreML Acos Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L3793
-    """
-    load_input_constants(builder, node, graph, err)
-    builder.add_acos(
-        name=node.name, input_name=node.inputs[0], output_name=node.outputs[0]
-    )
-
-
-def _convert_acosh(builder, node, graph, err):
-    """
-    convert to CoreML Acosh Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L3925
-    """
-    load_input_constants(builder, node, graph, err)
-    builder.add_acosh(
-        name=node.name, input_name=node.inputs[0], output_name=node.outputs[0]
-    )
-
-
-def _convert_add(builder, node, graph, err):
-    """
-    convert to CoreML Add Broadcastable Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4117
-    """
-    load_input_constants(builder, node, graph, err)
-    add_broadcastable_op_chain(builder, node, err, builder.add_add_broadcastable)
-
-
-def _convert_argmax(builder, node, graph, err):
-    """
-    convert to CoreML ArgMax Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4961
-    """
-    axis = node.attrs.get("axis", 0)
-    keepdims = node.attrs.get("keepdims", True)
-    builder.add_argmax(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        axis=axis,
-        keepdims=keepdims,
-    )
-
-
-def _convert_argmin(builder, node, graph, err):
-    """
-    convert to CoreML ArgMin Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4988
-    """
-    axis = node.attrs.get("axis", 0)
-    keepdims = node.attrs.get("keepdims", True)
-    builder.add_argmin(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        axis=axis,
-        keepdims=keepdims,
-    )
-
-
-def _convert_asin(builder, node, graph, err):
-    """
-    convert to CoreML Asin Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L3771
-    """
-    load_input_constants(builder, node, graph, err)
-    builder.add_asin(
-        name=node.name, input_name=node.inputs[0], output_name=node.outputs[0]
-    )
-
-
-def _convert_asinh(builder, node, graph, err):
-    """
-    convert to CoreML Asinh Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L3903
-    """
-    load_input_constants(builder, node, graph, err)
-    builder.add_asinh(
-        name=node.name, input_name=node.inputs[0], output_name=node.outputs[0]
-    )
-
-
-def _convert_atan(builder, node, graph, err):
-    """
-    convert to CoreML Atan Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L3815
-    """
-    load_input_constants(builder, node, graph, err)
-    builder.add_atan(
-        name=node.name, input_name=node.inputs[0], output_name=node.outputs[0]
-    )
-
-
-def _convert_atanh(builder, node, graph, err):
-    """
-    convert to CoreML Atanh Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L3947
-    """
-    load_input_constants(builder, node, graph, err)
-    builder.add_atanh(
-        name=node.name, input_name=node.inputs[0], output_name=node.outputs[0]
-    )
-
-
-def _convert_bn(builder, node, graph, err):
-    """
-    convert to CoreML BatchNorm Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L1633
-    """
-    if len(node.outputs) > 1:
-        return err.unsupported_op_configuration(
-            builder,
-            node,
-            graph,
-            "This converter only supports BatchNormalization with one output",
-        )
-
-    epsilon = node.attrs.get("epsilon", 1e-5)
-    scale_name = node.inputs[1]
-
-    if scale_name in node.input_tensors:
-        channels = node.input_tensors[scale_name].shape
-    elif scale_name in graph.shape_dict:
-        channels = graph.shape_dict[scale_name]
-    else:
-        err.unsupported_op_configuration(
-            builder, node, graph, "Input shape not available"
-        )
-
-    # TODO: Move error check under VERBOSE / DEBUG Mode
-    for i in range(2, len(node.inputs)):
-        ip_name = node.inputs[i]
-        if ip_name in node.input_tensors:
-            tensor_shape = node.input_tensors[ip_name].shape
-        else:
-            if ip_name not in graph.shape_dict:
-                return err.unsupported_op_configuration(
-                    builder, node, graph, "Input shape not available"
-                )
-            tensor_shape = graph.shape_dict[ip_name]
-        if tensor_shape != channels:
-            err.unsupported_op_configuration(
-                builder,
-                node,
-                graph,
-                "Shape mismatch between Scale, Bias, Mean and Variance",
-            )
-
-    scale = (
-        node.input_tensors[node.inputs[1]]
-        if node.inputs[1] in node.input_tensors
-        else np.ones(shape=channels, dtype=np.float32)
-    )
-    bias = (
-        node.input_tensors[node.inputs[2]]
-        if node.inputs[2] in node.input_tensors
-        else np.zeros(shape=channels, dtype=np.float32)
-    )
-    mean = (
-        node.input_tensors[node.inputs[3]]
-        if node.inputs[3] in node.input_tensors
-        else np.zeros(shape=channels, dtype=np.float32)
-    )
-    var = (
-        node.input_tensors[node.inputs[4]]
-        if node.inputs[4] in node.input_tensors
-        else np.ones(shape=channels, dtype=np.float32)
-    )
-
-    rank = builder._get_rank(node.inputs[0])
-    # ONNX converts B x C tensor into B x C x 1 hence
-    # Rank 2 BN is mapped to Rank 3 BN
-    if rank == 3:
-        # 1D Batch Norm
-        add_bn_with_expansion(
-            builder,
-            node,
-            err,
-            node.name,
-            node.inputs[0],
-            node.outputs[0],
-            channels[0],
-            scale,
-            bias,
-            mean,
-            var,
-            epsilon,
-            axes_for_expansion=[0, 3],
-        )
-    elif rank == 4:
-        # 2D Batch Norm
-        add_bn_with_expansion(
-            builder,
-            node,
-            err,
-            node.name,
-            node.inputs[0],
-            node.outputs[0],
-            channels[0],
-            scale,
-            bias,
-            mean,
-            var,
-            epsilon,
-            axes_for_expansion=[],
-        )
-    else:
-        # Unsupported 1D, 3D and above
-        err.unsupported_op_configuration(
-            builder, node, graph, "provided number axes {} not supported".format(rank)
-        )
-
-
-def _convert_cast(builder, node, graph, err):
-    """
-    Perform cast operation in CoreML
-        e.g. Casting from Float (assumed) to Int maps to Floor Layer
-             For Others, add copy layer
-    """
-    convert_to = node.attrs.get("to")
-    convert_to_int = set(
-        {
-            TensorProto.UINT8,
-            TensorProto.INT8,
-            TensorProto.UINT16,
-            TensorProto.INT32,
-            TensorProto.INT64,
-            TensorProto.UINT32,
-            TensorProto.UINT64,
-        }
-    )
-
-    ## TODO: Add support for conversion from STRING TO FLOAT
-    ## Currently, such input will error out in parsing
-    if convert_to in convert_to_int:
-        builder.add_floor(
-            name=node.name, input_name=node.inputs[0], output_name=node.outputs[0]
-        )
-    else:
-        load_input_constants(builder, node, graph, err)
-        builder.add_activation(
-            name=node.name,
-            non_linearity="LINEAR",
-            input_name=node.inputs[0],
-            output_name=node.outputs[0],
-            params=[1.0, 0.0],
-        )
-
-
-def _convert_ceil(builder, node, graph, err):
-    """
-    convert to CoreML Ceil Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L5018
-    """
-    builder.add_ceil(
-        name=node.name, input_name=node.inputs[0], output_name=node.outputs[0],
-    )
-
-
-def _convert_clip(builder, node, graph, err):
-    """
-    convert to CoreML Clip Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L5066
-    """
-    max_value = node.attrs.get("max", 3.4028234663852886e38)
-    min_value = node.attrs.get("min", -3.4028234663852886e38)
-    builder.add_clip(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        min_value=min_value,
-        max_value=max_value,
-    )
-
-
-def _convert_concat(builder, node, graph, err):
-    """
-    convert to CoreML ConcatND Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L3521
-    """
-    axis = node.attrs.get("axis")
-    load_input_constants(builder, node, graph, err)
-
-    # TODO: Adding Linear layer will change to
-    #       either: Skip the op right away
-    #       or:     Insert Linear and perform copy-propogation followed by dead code elimination
-    if len(node.inputs) == 1:
-        builder.add_activation(
-            name=node.name,
-            non_linearity="LINEAR",
-            input_name=node.inputs[0],
-            output_name=node.outputs[0],
-            params=[1.0, 0.0],
-        )
-    else:
-        builder.add_concat_nd(
-            name=node.name,
-            input_names=node.inputs,
-            output_name=node.outputs[0],
-            axis=axis,
-        )
-
-
-def _convert_constant(builder, node, graph, err):
-    """
-    convert to CoreML Load Constant ND Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L3596
-    """
-    value = node.attrs["value"]
-    # HACK: If Value is 0-Rank then make it 1-Rank
-    builder.add_load_constant_nd(
-        name=node.name,
-        output_name=node.outputs[0],
-        constant_value=value,
-        shape=[1] if value.shape == () else value.shape,
-    )
-    graph.constants_loaded(node.outputs[0])
-
-
-def _convert_constant_of_shape(builder, node, graph, err):
-    """
-    convert to CoreML Fill Static Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L3641
-    """
-    value = node.attrs.get("value", [0.0])
-    # if shape is known, create tensor of given shape
-    # otherwise create tensor at runtime
-    if node.inputs[0] in node.input_tensors:
-        output_shape = node.input_tensors[node.inputs[0]]
-        # add_fill_static requires shape to be more than rank-1
-        if len(output_shape.shape) == 1:
-            output_shape = output_shape.reshape(output_shape.shape[0], 1)
-        builder.add_fill_static(
-            name=node.name,
-            output_name=node.outputs[0],
-            output_shape=output_shape,
-            value=value[0],
-        )
-    else:
-        builder.add_fill_dynamic(
-            name=node.name,
-            input_name=node.inputs[0],
-            output_name=node.outputs[0],
-            value=value[0],
-        )
-
-
-def _convert_conv(builder, node, graph, err):
-    """
-    convert to CoreML Convolution Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L1418
-    """
-    params_dict = dict()
-    params_dict["is_deconv"] = False
-    if node.op_type.endswith("Transpose"):
-        params_dict["is_deconv"] = True
-    # get weights for convolution
-    weight_name = node.inputs[1]
-    W = None
-    if weight_name in node.input_tensors:
-        W = node.input_tensors[weight_name]
-        params_dict["w_shape"] = W.shape
-    else:
-        # W is provided as a input
-        # Make W compatible for CoreML Conv Layer
-        # W ONNX format: OC x KC x H x W
-        # Expected CoreML Format: H x W x KC x OC
-        W_name = node.inputs[1]
-        W_shape = graph.shape_dict[W_name]
-        W_rank = len(W_shape)
-
-        params_dict["w_shape"] = W_shape
-        if W_rank == 3:
-            expanded_node_name = node.name + "_" + W_name + "_expanded"
-            builder.add_expand_dims(
-                name=node.name + "_w_expand",
-                input_name=W_name,
-                output_name=expanded_node_name,
-                axes=[-2],
-            )
-            W_name = expanded_node_name
-
-        # Now Permute the W tensor
-        W_transpose_axes = [2, 3, 1, 0]
-        # If ConvTranpose then, Kernel and Output channels are shuffled
-        if params_dict["is_deconv"]:
-            W_transpose_axes = [2, 3, 0, 1]
-
-        builder.add_transpose(
-            name=node.name + "_w_transpose",
-            axes=W_transpose_axes,
-            input_name=W_name,
-            output_name=W_name + "_transposed",
-        )
-        W_name = W_name + "_transposed"
-        node.inputs[1] = W_name
-
-    params_dict["W"] = W
-    bias = None
-    if len(node.inputs) > 2:
-        bias = node.input_tensors[node.inputs[2]]
-    params_dict["bias"] = bias
-    params_dict["groups"] = node.attrs.get("group", 1)
-
-    _add_conv_like_op(
-        _add_conv, _get_conv_params, params_dict, builder, node, graph, err
-    )
-
-
-def _convert_cos(builder, node, graph, err):
-    """
-    convert to CoreML Cos Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L3727
-    """
-    load_input_constants(builder, node, graph, err)
-    builder.add_cos(
-        name=node.name, input_name=node.inputs[0], output_name=node.outputs[0]
-    )
-
-
-def _convert_cosh(builder, node, graph, err):
-    """
-    convert to CoreML Cosh Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L3859
-    """
-    load_input_constants(builder, node, graph, err)
-    builder.add_cosh(
-        name=node.name, input_name=node.inputs[0], output_name=node.outputs[0]
-    )
-
-
-def _convert_div(builder, node, graph, err):
-    """
-    convert to CoreML Divide Broadcastable Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4180
-    """
-    load_input_constants(builder, node, graph, err)
-    add_broadcastable_op_chain(builder, node, err, builder.add_divide_broadcastable)
-
-
-def _convert_equal(builder, node, graph, err):
-    """
-    convert to CoreML Equal Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L961
-    """
-    load_input_constants(builder, node, graph, err)
-    builder.add_equal(
-        name=node.name, input_names=node.inputs, output_name=node.outputs[0]
-    )
-
-
-def _convert_erf(builder, node, graph, err):
-    """
-    convert to CoreML Erf Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L5140
-    """
-    load_input_constants(builder, node, graph, err)
-    builder.add_erf(
-        name=node.name, input_name=node.inputs[0], output_name=node.outputs[0]
-    )
-
-
-def _convert_expand(builder, node, graph, err):
-    """
-    convert to CoreML Broadcast To Static/Dynamic Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4086
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4108
-    """
-    load_input_constants(builder, node, graph, err)
-    if node.inputs[1] in node.input_tensors:
-        output_shape = node.input_tensors[node.inputs[1]].astype(np.int64)
-        builder.add_broadcast_to_static(
-            name=node.name,
-            input_name=node.inputs[0],
-            output_name=node.outputs[0],
-            output_shape=output_shape,
-        )
-    else:
-        builder.add_broadcast_to_dynamic(
-            name=node.name, input_names=node.inputs, output_name=node.outputs[0],
-        )
-
-
-def _convert_flatten(builder, node, graph, err):
-    """
-    convert to CoreML Flatten Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4826
-    """
-    axis = node.attrs.get("axis", 1)
-    builder.add_flatten_to_2d(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        axis=axis,
-    )
-
-
-def _convert_floor(builder, node, graph, err):
-    """
-    convert to CoreML Floor Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L5040
-    """
-    builder.add_floor(
-        name=node.name, input_name=node.inputs[0], output_name=node.outputs[0]
-    )
-
-
-def _convert_gather(builder, node, graph, err):
-    """
-    convert to CoreML Gather Along Axis Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4296
-    """
-    axis = node.attrs.get("axis", 0)
-
-    if len(node.inputs) != 2:
-        err.unsupported_op_configuration(
-            builder, node, graph, "Error in ONNX model: Gather expects two inputs"
-        )
-
-    if (
-        node.inputs[0] in node.input_tensors
-        and node.inputs[0] not in graph.constants_loaded
-    ):
-        value = node.input_tensors[node.inputs[0]]
-        builder.add_load_constant_nd(
-            name=node.name + "_load_data",
-            output_name=node.inputs[0],
-            constant_value=value,
-            shape=[1] if value.shape == () else value.shape,
-        )
-        graph.constants_loaded.add(node.inputs[0])
-
-    if (
-        node.inputs[1] in node.input_tensors
-        and node.inputs[1] not in graph.constants_loaded
-    ):
-        value = node.input_tensors[node.inputs[1]]
-        builder.add_load_constant_nd(
-            name=node.name + "_load_indices",
-            output_name=node.inputs[1],
-            constant_value=value,
-            shape=[1] if value.shape == () else value.shape,
-        )
-        graph.constants_loaded.add(node.inputs[1])
-
-    builder.add_gather(
-        name=node.name,
-        input_names=[node.inputs[0], node.inputs[1]],
-        output_name=node.outputs[0],
-        axis=axis,
-    )
-
-
-def _convert_gemm(builder, node, graph, err):
-    """
-    convert to CoreML Tranpose (Optional) and Inner Product Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4180
-    """
-    # Read attributes
-    alpha = node.attrs.get("alpha", 1.0)
-    beta = node.attrs.get("beta", 1.0)
-    transA = node.attrs.get("transA", False)
-    transB = node.attrs.get("transB", False)
-
-    A = node.inputs[0]
-    if A in node.input_tensors:
-        A_tensor = node.input_tensors[A]
-        builder.add_load_constant_nd(
-            name=node.name + A + "_const",
-            output_name="const_" + A,
-            constant_value=A_tensor,
-            shape=A_tensor.shape,
-        )
-        A = "const_" + A
-
-    if alpha != 1.0:
-        builder.add_load_constant_nd(
-            name=node.name + "_load_alpha",
-            output_name="alpha_for_" + A,
-            constant_value=np.array([alpha]),
-            shape=[1],
-        )
-        builder.add_multiply_broadcastable(
-            name=node.name + "_alphaA",
-            input_names=[A, "alpha_for_" + A],
-            output_name=A + "_alphaA",
-        )
-        A = A + "_alphaA"
-
-    B = node.inputs[1]
-    C = node.inputs[2]
-    if B in node.input_tensors and C in node.input_tensors:
-        B = node.input_tensors[B]
-        C = node.input_tensors[C]
-
-        if transB:
-            B = B.transpose()
-
-        C = C.flatten()
-        builder.add_batched_mat_mul(
-            name=node.name,
-            input_names=[A],
-            output_name=node.outputs[0],
-            transpose_a=transA,
-            weight_matrix_rows=B.shape[0],
-            weight_matrix_columns=B.shape[1],
-            W=B,
-            bias=C,
-        )
-    else:
-        ## TODO: Test coverage when B and C are non-constant
-        ## Should C be of Rank-1? or it's okay to keep it that way?
-        if beta != 1.0:
-            builder.add_load_constant_nd(
-                name=node.name + "_load_beta",
-                output_name="beta_for_" + B,
-                constant_value=np.array([beta]),
-                shape=[1],
-            )
-            builder.add_multiply_broadcastable(
-                name=node.name + "_betaC",
-                input_names=[C, "beta_for_" + B],
-                output_name=C + "_betaC",
-            )
-            C = C + "_betaC"
-
-        builder.add_batched_mat_mul(
-            name=node.name,
-            input_names=[A, B],
-            output_name=node.outputs[0] + "_b_mat_mul",
-            transpose_a=transA,
-            transpose_b=transB,
-        )
-
-        builder.add_add_broadcastable(
-            name=node.name + "_add_bias",
-            input_names=[node.outputs[0] + "_b_mat_mul", C],
-            output_name=node.outputs[0],
-        )
-
-
-def _convert_greater(builder, node, graph, err):
-    """
-    convert to CoreML Greater than Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L853
-    """
-    load_input_constants(builder, node, graph, err)
-    builder.add_greater_than(
-        name=node.name, input_names=node.inputs, output_name=node.outputs[0],
-    )
-
-
-def _convert_gru(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    """
-    convert to CoreML GRU Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L3104
-    """
-
-    def get_weights(W, W_name, R, R_name, B):
-        """
-        Helper routine to return weights in CoreML LSTM required format
-        """
-        W = np.expand_dims(np.expand_dims(W, 3), 3)
-        R = np.expand_dims(np.expand_dims(R, 3), 3)
-
-        if W is None:
-            err.missing_initializer(
-                node,
-                "Weight tensor: {} not found in the graph initializer".format(W_name),
-            )
-        if R is None:
-            err.missing_initializer(
-                node,
-                "Weight tensor: {} not found in the graph initializer".format(R_name),
-            )
-
-        W_z, W_r, W_h = np.split(np.squeeze(W), 3)  # type: ignore
-        R_z, R_r, R_h = np.split(np.squeeze(R), 3)  # type: ignore
-
-        W_x = [W_z, W_r, W_h]
-        W_h = [R_z, R_r, R_h]
-        b = None
-        if B is not None:
-            b_Wz, b_Wr, b_Wh, b_Rz, b_Rr, b_Rh = np.split(np.squeeze(B), 6)  # type: ignore
-            b = [b_Wz + b_Rz, b_Wr + b_Rr, b_Wh + b_Rh]
-
-        return W_x, W_h, b
-
-    def expand_dim(node_name, input_name, output_name, axes):
-        builder.add_expand_dims(
-            name=node_name, input_name=input_name, output_name=output_name, axes=axes
-        )
-
-    # Read attributes
-    # activation alpha and beta
-    if "activation_alpha" in node.attrs or "activation_beta" in node.attrs:
-        err.unsupported_feature_warning(
-            node, "Activation parameter alpha and beta are currently not used"
-        )
-
-    inner_activation = "SIGMOID"
-    output_activation = "TANH"
-
-    if "activations" in node.attrs:
-        activations_list = node.attrs["activations"]
-
-        if len(activations_list) < 2:
-            err.unsupported_op_configuration(
-                builder,
-                node,
-                graph,
-                "Error in ONNX model: Less number of activations provided",
-            )
-
-        inner_activation = activations_list[0].upper()
-        output_activation = activations_list[1].upper()
-
-    # Extract direction from ONNX attribute
-    direction = node.attrs.get("direction", "forward")
-    if direction == "bidirectional":
-        return err.unsupported_op_configuration(
-            builder,
-            node,
-            graph,
-            "Bidirectional GRU not supported!! Please consider adding custom conversion function/layer",
-        )
-
-    hidden_size = node.attrs.get("hidden_size")
-
-    # Read inputs
-    W_name = node.inputs[1]
-    R_name = node.inputs[2]
-    B = None
-    if len(node.inputs) > 3:
-        B_name = node.inputs[3]
-        B = node.input_tensors.get(B_name, None)
-
-    if W_name not in node.input_tensors or R_name not in node.input_tensors:
-        return err.unsupported_op_configuration(
-            builder,
-            node,
-            graph,
-            "Input and Recursion weights must be known!! Please consider adding custom conversion function/layer",
-        )
-
-    W = node.input_tensors.get(W_name, None)
-    R = node.input_tensors.get(R_name, None)
-
-    # Get weights for forward direction
-    W_x, W_h, b = get_weights(W, W_name, R, R_name, B)
-
-    # shape of input
-    input_size = W_x[0].shape[1]
-
-    # Get input and output for hidden and cell
-    input_h = node.inputs[5] if len(node.inputs) > 5 else node.inputs[0] + "_h_input"
-    output_h = (
-        node.outputs[1] if len(node.outputs) > 1 else node.outputs[0] + "_h_output"
-    )
-    output_h_5d = output_h + "_5d"
-
-    if len(node.inputs) < 6:
-        # if input is not present in the network, load they as constant
-        if node.inputs[0] not in graph.shape_dict:
-            err.unsupported_op_configuration(
-                builder, node, graph, "Input shape not represented within Graph"
-            )
-
-        # Input is represented as [Seq Len, Batch Size, Input Size]
-        batch_size = graph.shape_dict[node.inputs[0]][1]
-        builder.add_load_constant_nd(
-            name=node.name + "_load_initial_h",
-            output_name=input_h,
-            constant_value=0.0,
-            shape=[1, batch_size, hidden_size],
-        )
-
-    # CoreML GRU expects 5-d tensor
-    # Expand dimensions of input to 5-d for compatibility
-    input_rank = builder._get_rank(node.inputs[0])
-    if input_rank == -1:
-        return err.unsupported_op_configuration(
-            builder, node, graph, "Rank unknown for input"
-        )
-
-    if input_rank < 5:
-        add_nodes = 5 - input_rank
-
-        # TODO: Add one expand instead of adding one after another for input, h
-        expand_dim(
-            node.name + "_expand_in_0",
-            node.inputs[0],
-            node.inputs[0] + "_expand_out_0",
-            [input_rank],
-        )
-        expand_dim(
-            node.name + "_expand_in_h_0",
-            input_h,
-            input_h + "_expand_out_h_0",
-            [input_rank],
-        )
-
-        for i in range(1, add_nodes):
-            i_str = str(i)
-            i_p_str = str(i - 1)
-            expand_dim(
-                node.name + "_expand_in_" + i_str,
-                node.inputs[0] + "_expand_out_" + i_p_str,
-                node.inputs[0] + "_expand_out_" + i_str,
-                [input_rank + i],
-            )
-            expand_dim(
-                node.name + "_expand_in_h_" + i_str,
-                input_h + "_expand_out_h_" + i_p_str,
-                input_h + "_expand_out_h_" + i_str,
-                [input_rank + i],
-            )
-
-    builder.add_gru(
-        name=node.name,
-        W_h=W_h,
-        W_x=W_x,
-        b=b,
-        hidden_size=hidden_size,
-        input_size=input_size,
-        input_names=[
-            node.inputs[0] + "_expand_out_" + str(add_nodes - 1),
-            input_h + "_expand_out_h_" + str(add_nodes - 1),
-        ],
-        output_names=[node.outputs[0] + "_5d_out", output_h_5d],
-        inner_activation=inner_activation,
-        activation=output_activation,
-        output_all=True,
-        reverse_input=(direction == "reverse"),
-    )
-
-    # CoreML output is [Seq Len, Batch Size, Num Dir * Hidden Size, 1, 1]
-    # Return output as [Seq Len, Num Dir, Batch Size, Hidden Size]
-    # Following steps:
-    #       a. Reshape and split hidden size for direction [Seq Len, Batch Size, Num Dir, Hidden Size, 1]
-    #       b. Squeeze last dimension [Seq Len, Batch Size, Num Dir, Hidden Size]
-    #       c. Permute to fix the order [Seq Len, Num Dir, Batch Size, Hidden Size, 1]
-    builder.add_rank_preserving_reshape(
-        name=node.name + "_reshape_",
-        input_name=node.outputs[0] + "_5d_out",
-        output_name=node.outputs[0] + "_5d_reshaped",
-        output_shape=[0, 0, 1, -1, 0],
-    )
-
-    builder.add_squeeze(
-        name=node.name + "_squeeze_out",
-        input_name=node.outputs[0] + "_5d_reshaped",
-        output_name=node.outputs[0] + "_4d",
-        axes=[-1],
-    )
-
-    builder.add_transpose(
-        name=node.name + "_transpose",
-        axes=[0, 2, 1, 3],
-        input_name=node.outputs[0] + "_4d",
-        output_name=node.outputs[0],
-    )
-
-    # Squeeze dimensions of output_h
-    builder.add_squeeze(
-        name=node.name + "_squeeze_out_h",
-        input_name=output_h_5d,
-        output_name=output_h,
-        axes=[-1, -2],
-    )
-
-
-def _convert_identity(builder, node, graph, err):
-    """
-    convert to CoreML Linear Activation Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L417
-    """
-    # TODO: Skip or CopyProp + DeadCode elimination
-    builder.add_activation(
-        name=node.name,
-        non_linearity="LINEAR",
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        params=[1.0, 0.0],
-    )
-
-
-def _convert_instancenorm(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    """
-    convert to CoreML BatchNorm Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L1633
-    """
-    epsilon = node.attrs.get("epsilon", 1e-5)
-    if (
-        node.inputs[1] not in node.input_tensors
-        or node.inputs[2] not in node.input_tensors
-    ):
-        return err.unsupported_op_configuration(
-            builder,
-            node,
-            graph,
-            "CoreML InstanceNorm requires Scale and Bias to be known",
-        )
-
-    scale = node.input_tensors[node.inputs[1]]
-    bias = node.input_tensors[node.inputs[2]]
-
-    rank = builder._get_rank(node.inputs[0])
-    # ONNX converts B x C tensor into B x C x 1 hence
-    # Rank 2 BN is mapped to Rank 3 BN
-    if rank == 3:
-        # 1D Batch Norm
-        add_bn_with_expansion(
-            builder,
-            node,
-            err,
-            node.name,
-            node.inputs[0],
-            node.outputs[0],
-            scale.shape[0],
-            scale,
-            bias,
-            epsilon=epsilon,
-            compute_mean_var=True,
-            instance_normalization=True,
-            axes_for_expansion=[0, 3],
-        )
-    elif rank == 4:
-        # 2D Batch Norm
-        add_bn_with_expansion(
-            builder,
-            node,
-            err,
-            node.name,
-            node.inputs[0],
-            node.outputs[0],
-            scale.shape[0],
-            scale,
-            bias,
-            epsilon=epsilon,
-            compute_mean_var=True,
-            instance_normalization=True,
-            axes_for_expansion=[],
-        )
-    else:
-        # Unsupported 1D, 3D and above
-        err.unsupported_op_configuration(
-            builder, node, graph, "provided number axes {} not supported".format(rank)
-        )
-
-
-def _convert_less(builder, node, graph, err):
-    """
-    convert to CoreML Less Than Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L907
-    """
-    load_input_constants(builder, node, graph, err)
-    builder.add_less_than(
-        name=node.name, input_names=node.inputs, output_name=node.outputs[0],
-    )
-
-
-def _convert_lstm(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    """
-    convert to CoreML Uni/Bi-Directional LSTM Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L3282
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L3348
-    """
-
-    def get_weights(W, W_name, R, R_name, B):
-        """
-        Helper routine to return weights in CoreML LSTM required format
-        """
-        W = np.expand_dims(np.expand_dims(W, 3), 3)
-        R = np.expand_dims(np.expand_dims(R, 3), 3)
-
-        if W is None:
-            err.missing_initializer(
-                node,
-                "Weight tensor: {} not found in the graph initializer".format(W_name),
-            )
-        if R is None:
-            err.missing_initializer(
-                node,
-                "Weight tensor: {} not found in the graph initializer".format(R_name),
-            )
-
-        W_i, W_o, W_f, W_c = np.split(np.squeeze(W), 4)  # type: ignore
-        R_i, R_o, R_f, R_c = np.split(np.squeeze(R), 4)  # type: ignore
-
-        W_x = [W_i, W_f, W_o, W_c]
-        W_h = [R_i, R_f, R_o, R_c]
-        b = None
-        if B is not None:
-            b_Wi, b_Wo, b_Wf, b_Wc, b_Ri, b_Ro, b_Rf, b_Rc = np.split(np.squeeze(B), 8)  # type: ignore
-            b = [b_Wi + b_Ri, b_Wf + b_Rf, b_Wo + b_Ro, b_Wc + b_Rc]
-
-        return W_x, W_h, b
-
-    def expand_dim(node_name, input_name, output_name, axes):
-        builder.add_expand_dims(
-            name=node_name, input_name=input_name, output_name=output_name, axes=axes
-        )
-
-    # Read attributes
-    # activation alpha and beta
-    if "activation_alpha" in node.attrs or "activation_beta" in node.attrs:
-        err.unsupported_feature_warning(
-            node, "Activation parameter alpha and beta are currently not used"
-        )
-
-    inner_activation = "SIGMOID"
-    cell_state_update_activation = "TANH"
-    output_activation = "TANH"
-
-    if "activations" in node.attrs:
-        activations_list = node.attrs["activations"]
-
-        if len(activations_list) < 3:
-            err.unsupported_op_configuration(
-                builder,
-                node,
-                graph,
-                "Error in ONNX model: Less number of activations provided",
-            )
-
-        if len(activations_list) == 6:
-            err.unsupported_feature_warning(
-                node, "Forward and backward pass will use same activations."
-            )
-
-        inner_activation = activations_list[0].upper()
-        cell_state_update_activation = activations_list[1].upper()
-        output_activation = activations_list[2].upper()
-
-    # Provide max Clip Value if not provided
-    clip_threshold = node.attrs.get("clip", 500000.0)
-
-    # Extract direction from ONNX attribute
-    direction = 1
-    if (
-        "direction" in node.attrs
-        and node.attrs["direction"].decode("utf-8") == "bidirectional"
-    ):
-        direction = 2
-
-    hidden_size = node.attrs.get("hidden_size")
-
-    input_forget = node.attrs.get("input_forget", 0) == 1
-
-    # Read inputs
-    W_name = node.inputs[1]
-    R_name = node.inputs[2]
-    B = None
-    if len(node.inputs) > 3:
-        B_name = node.inputs[3]
-        B = node.input_tensors.get(B_name, None)
-
-    W = node.input_tensors.get(W_name, None)
-    R = node.input_tensors.get(R_name, None)
-
-    W = np.split(W, direction)
-    R = np.split(R, direction)
-    if B is not None:
-        B = np.split(B, direction)
-    else:
-        B = [None, None]
-
-    # Get weights for forward direction
-    W_x, W_h, b = get_weights(W[0], W_name, R[0], R_name, B[0])
-
-    # shape of input
-    input_size = W_x[0].shape[1]
-
-    # Get input and output for hidden and cell
-    input_h = node.inputs[5] if len(node.inputs) > 5 else node.inputs[0] + "_h_input"
-    input_c = node.inputs[6] if len(node.inputs) > 6 else node.inputs[0] + "_c_input"
-    output_h = (
-        node.outputs[1] if len(node.outputs) > 1 else node.outputs[0] + "_h_output"
-    )
-    output_c = (
-        node.outputs[2] if len(node.outputs) > 2 else node.outputs[0] + "_c_output"
-    )
-    output_h_5d = output_h + "_5d"
-    output_c_5d = output_c + "_5d"
-
-    # if input is not present in the network, load they as constant
-    load_input_constants(builder, node, graph, err)
-
-    # Input is represented as [Seq Len, Batch Size, Input Size]
-    if len(node.inputs) < 6:
-        batch_size = graph.shape_dict[node.inputs[0]][1]
-        builder.add_load_constant_nd(
-            name=node.name + "_load_initial_h_and_c",
-            output_name=input_h,
-            constant_value=0.0,
-            shape=[direction, batch_size, hidden_size],
-        )
-        # OPTIMIZATION: let's reuse the intial weights
-        input_c = input_h
-
-    # Get tensors for peepholes
-    peepholes = node.inputs[7] if len(node.inputs) > 7 else None
-
-    # CoreML LSTM expects 5-d tensor
-    # Expand dimensions of input to 5-d for compatibility
-    rank = builder._get_rank(node.inputs[0])
-    if rank == -1:
-        return err.unsupported_op_configuration(
-            builder, node, graph, "Rank unknown for input"
-        )
-    if rank < 5:
-        add_nodes = 5 - rank
-        # TODO: Add one expand instead of adding one after another for input, h and c
-        expand_dim(
-            node.name + "_expand_in_0",
-            node.inputs[0],
-            node.inputs[0] + "_expand_out_0",
-            [rank],
-        )
-        expand_dim(
-            node.name + "_expand_in_h_0", input_h, input_h + "_expand_out_h_0", [rank]
-        )
-        expand_dim(
-            node.name + "_expand_in_c_0", input_c, input_c + "_expand_out_c_0", [rank]
-        )
-
-        for i in range(1, add_nodes):
-            i_str = str(i)
-            i_p_str = str(i - 1)
-            expand_dim(
-                node.name + "_expand_in_" + i_str,
-                node.inputs[0] + "_expand_out_" + i_p_str,
-                node.inputs[0] + "_expand_out_" + i_str,
-                [rank + i],
-            )
-            expand_dim(
-                node.name + "_expand_in_h_" + i_str,
-                input_h + "_expand_out_h_" + i_p_str,
-                input_h + "_expand_out_h_" + i_str,
-                [rank + i],
-            )
-            expand_dim(
-                node.name + "_expand_in_c_" + i_str,
-                input_c + "_expand_out_c_" + i_p_str,
-                input_c + "_expand_out_c_" + i_str,
-                [rank + i],
-            )
-
-    if direction == 1:
-        # Peephole from ONNX are of shape [Num Dir, 3 * hidden_size]
-        # Reshape into CoreML format of [input hs, forget hs, cell hs]
-        if peepholes is not None:
-            builder.add_reshape_static(
-                name=node.name + "_peephole_reshape",
-                input_name=peepholes,
-                output_name=peepholes + "_reshaped",
-                output_shape=[hidden_size, hidden_size, hidden_size],
-            )
-            peepholes = peepholes + "_reshaped"
-
-        builder.add_unilstm(
-            name=node.name,
-            W_h=W_h,
-            W_x=W_x,
-            b=b,
-            hidden_size=hidden_size,
-            input_size=input_size,
-            input_names=[
-                node.inputs[0] + "_expand_out_" + str(add_nodes - 1),
-                input_h + "_expand_out_h_" + str(add_nodes - 1),
-                input_c + "_expand_out_c_" + str(add_nodes - 1),
-            ],
-            output_names=[node.outputs[0] + "_5d_out", output_h_5d, output_c_5d],
-            inner_activation=inner_activation,
-            cell_state_update_activation=cell_state_update_activation,
-            output_activation=output_activation,
-            peep=peepholes,
-            output_all=True,
-            forget_bias=True,
-            coupled_input_forget_gate=input_forget,
-            cell_clip_threshold=clip_threshold,
-            reverse_input=False,
-        )
-    elif direction == 2:
-        if len(W) != 2 and len(R) != 2 and len(B) != 2:
-            err.unsupported_op_configuration(
-                builder,
-                node,
-                graph,
-                "Bi-Directional LSTM does not have weights for both the directions",
-            )
-
-        W_x_back, W_h_back, b_back = get_weights(W[1], W_name, R[1], R_name, B[1])
-
-        peephole_f = None
-        peephole_b = None
-        if peepholes is not None:
-            builder.add_reshape_static(
-                name=node.name + "_peephole_reshape",
-                input_name=peepholes,
-                output_name=peepholes + "_reshaped",
-                output_shape=[direction, hidden_size, hidden_size, hidden_size],
-            )
-
-            peepholes_f = peepholes + "_f"
-            peepholes_b = peepholes + "_b"
-
-            builder.add_split_nd(
-                name=node.name + "_peephole_split",
-                input_name=peepholes + "_reshaped",
-                output_names=[peepholes_f, peepholes_b],
-                axis=0,
-            )
-
-        # split input_h and input_c into two parts
-        builder.add_split_nd(
-            name=node.name + "_split_h",
-            input_name=input_h + "_expand_out_h_" + str(add_nodes - 1),
-            output_names=[input_h + "_f", input_h + "_b"],
-            axis=0,
-        )
-
-        # OPTIMIZATION: If input_h and input_c are same
-        # Avoid creating new split and instead reuse
-        if input_h != input_c:
-            builder.add_split_nd(
-                name=node.name + "_split_c",
-                input_name=input_c + "_expand_out_c_" + str(add_nodes - 1),
-                output_names=[input_c + "_f", input_c + "_b"],
-                axis=0,
-            )
-
-        builder.add_bidirlstm(
-            name=node.name,
-            W_h=W_h,
-            W_x=W_x,
-            b=b,
-            W_h_back=W_h_back,
-            W_x_back=W_x_back,
-            b_back=b_back,
-            hidden_size=hidden_size,
-            input_size=input_size,
-            input_names=[
-                node.inputs[0] + "_expand_out_" + str(add_nodes - 1),
-                input_h + "_f",
-                input_c + "_f",
-                input_h + "_b",
-                input_c + "_b",
-            ],
-            output_names=[
-                node.outputs[0] + "_5d_out",
-                output_h + "_f",
-                output_c + "_f",
-                output_h + "_b",
-                output_c + "_b",
-            ],
-            inner_activation=inner_activation,
-            cell_state_update_activation=cell_state_update_activation,
-            output_activation=output_activation,
-            output_all=True,
-            peep=peephole_f,
-            peep_back=peephole_b,
-            forget_bias=True,
-            coupled_input_forget_gate=input_forget,
-            cell_clip_threshold=clip_threshold,
-        )
-
-        # Combine output_h and output_c
-        builder.add_concat_nd(
-            name=node.name + "concat_output_h",
-            input_names=[output_h + "_f", output_h + "_b"],
-            output_name=output_h_5d,
-            axis=0,
-        )
-
-        builder.add_concat_nd(
-            name=node.name + "concat_output_c",
-            input_names=[output_c + "_f", output_c + "_b"],
-            output_name=output_c_5d,
-            axis=0,
-        )
-    else:
-        err.unsupported_op_configuration(
-            builder, node, graph, "Unsupported direction {} for LSTM".format(direction)
-        )
-
-    # CoreML output is [Seq Len, Batch Size, Num Dir * Hidden Size, 1, 1]
-    # Return output as [Seq Len, Num Dir, Batch Size, Hidden Size]
-    # Following steps:
-    #       a. Reshape and split hidden size for direction [Seq Len, Batch Size, Num Dir, Hidden Size, 1]
-    #       b. Squeeze last dimension [Seq Len, Batch Size, Num Dir, Hidden Size]
-    #       c. Permute to fix the order [Seq Len, Num Dir, Batch Size, Hidden Size, 1]
-    builder.add_rank_preserving_reshape(
-        name=node.name + "_reshape_",
-        input_name=node.outputs[0] + "_5d_out",
-        output_name=node.outputs[0] + "_5d_reshaped",
-        output_shape=[0, 0, direction, -1, 0],
-    )
-
-    builder.add_squeeze(
-        name=node.name + "_squeeze_out",
-        input_name=node.outputs[0] + "_5d_reshaped",
-        output_name=node.outputs[0] + "_4d",
-        axes=[-1],
-    )
-
-    builder.add_transpose(
-        name=node.name + "_transpose",
-        axes=[0, 2, 1, 3],
-        input_name=node.outputs[0] + "_4d",
-        output_name=node.outputs[0],
-    )
-
-    # Squeeze dimensions of output_h and output_c
-    builder.add_squeeze(
-        name=node.name + "_squeeze_out_h",
-        input_name=output_h_5d,
-        output_name=output_h,
-        axes=[-1, -2],
-    )
-    builder.add_squeeze(
-        name=node.name + "_squeeze_out_c",
-        input_name=output_c_5d,
-        output_name=output_c,
-        axes=[-1, -2],
-    )
-
-
-def _convert_logical(builder, node, graph, err):
-    """
-    convert to CoreML Logical And/Or/Xor/Not Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L1013
-    """
-    mode = node.op_type.upper()
-    builder.add_logical(
-        name=node.name, input_names=node.inputs, output_name=node.outputs[0], mode=mode
-    )
-
-
-def _convert_pad(builder, node, graph, err):
-    """
-    convert to CoreML Padding / ConstantPadding Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4397
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L1822
-    """
-    mode = node.attrs.get("mode", "constant")
-
-    try:
-        mode = mode.decode()
-    except (UnicodeDecodeError, AttributeError):
-        pass
-
-    if mode == "constant":
-        pads = node.attrs.get("pads", [])
-        value = node.attrs.get("value", 0.0)
-        # onnx padding spec: [x1_top, ..., xn_top, x1_bottom, ..., xn_bottom]
-        # coreml padding spec: [x1_top, x1_bottom, ..., xn_top, xn_bottom]
-        assert len(pads) % 2 == 0, 'even number of pads expected'
-        pads_coreml = [None] * len(pads)
-        pads_coreml[::2] = pads[:len(pads) // 2]
-        pads_coreml[1::2] = pads[len(pads) // 2:]
-
-        builder.add_constant_pad(
-            name=node.name,
-            input_names=node.inputs,
-            output_name=node.outputs[0],
-            value=value,
-            pad_to_given_output_size_mode=False,
-            pad_amounts=pads_coreml,
-        )
-    else:
-        _convert_pad_5d(builder, node, graph, err)
-
-
-def _convert_matmul(builder, node, graph, err):
-    """
-    convert to CoreML BatchedMatMul Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L3473
-    """
-    weight_name = node.inputs[1]
-    W = None
-    weight_as_layer_parameter = False
-    if weight_name in node.input_tensors:
-        W = node.input_tensors[weight_name]
-
-    if W is not None:
-        if len(W.shape) != 2:
-            # since weight as parameter in batchedMatMul layer must be rank 2
-            builder.add_load_constant_nd(
-                node.name + "_const_weight_input",
-                weight_name,
-                constant_value=W,
-                shape=W.shape,
-            )
-        else:
-            weight_as_layer_parameter = True
-
-    if weight_as_layer_parameter:
-        builder.add_batched_mat_mul(
-            name=node.name,
-            input_names=[node.inputs[0]],
-            output_name=node.outputs[0],
-            weight_matrix_rows=W.shape[0],
-            weight_matrix_columns=W.shape[1],
-            W=W,
-        )
-    else:
-        builder.add_batched_mat_mul(
-            name=node.name,
-            input_names=[node.inputs[0], weight_name],
-            output_name=node.outputs[0],
-        )
-
-
-def _convert_max(builder, node, graph, err):
-    """
-    convert to CoreML Max Broadcastable Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4126
-    """
-    load_input_constants(builder, node, graph, err)
-    add_broadcastable_op_chain(builder, node, err, builder.add_max_broadcastable)
-
-
-def _convert_mean(builder, node, graph, err):
-    """
-    convert to CoreML Add Broadcastable Layer and Divide BroadCastable Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4117
-    """
-    number_of_inputs = len(node.inputs)
-    output_name = node.outputs[0]
-    node.outputs[0] = node.outputs[0] + "_sum"
-
-    builder.add_load_constant_nd(
-        name=node.name + "_divider",
-        output_name=output_name + "_divider",
-        constant_value=np.array(number_of_inputs),
-        shape=[1],
-    )
-    add_broadcastable_op_chain(builder, node, err, builder.add_add_broadcastable)
-    builder.add_divide_broadcastable(
-        name=node.name + "_mean",
-        input_names=[node.outputs[0], output_name + "_divider"],
-        output_name=output_name,
-    )
-
-
-def _convert_pow(builder, node, graph, err):
-    """
-    convert to CoreML Pow Broadcastable Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L3969
-    """
-    load_input_constants(builder, node, graph, err)
-    add_broadcastable_op_chain(builder, node, err, builder.add_pow_broadcastable)
-
-
-def _convert_randomnormal(builder, node, graph, err):
-    """
-    convert to CoreML Random Normal Static Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4457
-    """
-    add_random(builder, node, graph, err, builder.add_random_normal_static)
-
-
-def _convert_randomnormallike(builder, node, graph, err):
-    """
-    convert to CoreML Random Normal Like Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4434
-    """
-    # Ignoring attribute `dtype` as CoreML internally represents tensors into 'Float'
-    mean = node.attributes.get("mean", 0.0)
-    scale = node.attributes.get("scale", 1.0)
-    seed = node.attributes.get("seed", -1)
-
-    builder.add_random_normal_like(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        mean=mean,
-        stddev=scale,
-        seed=seed,
-    )
-
-
-def _convert_randomuniform(builder, node, graph, err):
-    """
-    convert to CoreML Random Uniform Static Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4526
-    """
-    add_random(builder, node, graph, err, builder.random_uniform_static)
-
-
-def _convert_randomuniformlike(builder, node, graph, err):
-    """
-    convert to CoreML Random Normal Like Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4503
-    """
-    # Ignoring attribute `dtype` as CoreML internally represents tensors into 'Float'
-    mean = node.attributes.get("mean", 0.0)
-    scale = node.attributes.get("scale", 1.0)
-    seed = node.attributes.get("seed", -1)
-
-    builder.add_random_uniform_like(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        mean=mean,
-        stddev=scale,
-        seed=seed,
-    )
-
-
-def _convert_min(builder, node, graph, err):
-    """
-    convert to CoreML Min Broadcastable Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4135
-    """
-    load_input_constants(builder, node, graph, err)
-    add_broadcastable_op_chain(builder, node, err, builder.add_min_broadcastable)
-
-
-def _convert_mod(builder, node, graph, err):
-    """
-    convert to CoreML Mod Broadcastable Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4144
-    """
-    load_input_constants(builder, node, graph, err)
-    add_broadcastable_op_chain(builder, node, err, builder.add_mod_broadcastable)
-
-
-def _convert_mul(builder, node, graph, err):
-    """
-    convert to CoreML Multiply Broadcastable Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4171
-    """
-    load_input_constants(builder, node, graph, err)
-    add_broadcastable_op_chain(builder, node, err, builder.add_multiply_broadcastable)
-
-
-def _convert_nonzero(builder, node, graph, err):
-    """
-    convert to CoreML Where Non Zero Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4002
-    """
-    load_input_constants(builder, node, graph, err)
-    builder.add_where_nonzero(
-        name=node.name, input_name=node.inputs[0], output_name=node.outputs[0]
-    )
-
-
-def _convert_pool(builder, node, graph, err):
-    """
-    convert to CoreML Pooling Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L477
-    """
-    params_dict = dict()
-    params_dict["is_global"] = False
-    if node.op_type.startswith("Global"):
-        params_dict["is_global"] = True
-    if node.op_type.endswith("MaxPool"):
-        params_dict["layer_type"] = "MAX"
-    elif node.op_type.endswith("AveragePool"):
-        params_dict["layer_type"] = "AVERAGE"
-    else:
-        return err.unsupported_op_configuration(
-            builder, node, graph, "Unsupported pool type"
-        )
-
-    if len(node.outputs) == 2:
-        return err.unsupported_op_configuration(
-            builder, node, graph, "argmax with pool unsupported"
-        )
-
-    if "ceil_mode" in node.attrs and node.attrs["ceil_mode"] == 1:
-        return err.unsupported_op_configuration(
-            builder, node, graph, "ceil_mode=1 not supported"
-        )
-
-    if "dilations" in node.attrs:
-        return err.unsupported_op_configuration(
-            builder, node, graph, "dilations not supported"
-        )
-
-    _add_conv_like_op(
-        _add_pool, _get_pool_params, params_dict, builder, node, graph, err
-    )
-
-
-def _convert_reduce(builder, node, graph, err):
-    """
-    convert to CoreML ReduceSum Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4707
-    """
-    load_input_constants(builder, node, graph, err)
-
-    # read attributes
-    axes = node.attrs.get("axes", None)
-    reduce_all = False
-    if axes is None:
-        reduce_all = True
-    keepdims = node.attrs.get("keepdims", True)
-
-    # add respective operator
-    op_type = node.op_type
-    if op_type == "ReduceSum":
-        builder.add_reduce_sum(
-            name=node.name,
-            input_name=node.inputs[0],
-            output_name=node.outputs[0],
-            axes=axes,
-            keepdims=keepdims,
-            reduce_all=reduce_all,
-        )
-    elif op_type == "ReduceProd":
-        builder.add_reduce_prod(
-            name=node.name,
-            input_name=node.inputs[0],
-            output_name=node.outputs[0],
-            axes=axes,
-            keepdims=keepdims,
-            reduce_all=reduce_all,
-        )
-    elif op_type == "ReduceMean":
-        builder.add_reduce_mean(
-            name=node.name,
-            input_name=node.inputs[0],
-            output_name=node.outputs[0],
-            axes=axes,
-            keepdims=keepdims,
-            reduce_all=reduce_all,
-        )
-    elif op_type == "ReduceMax":
-        builder.add_reduce_max(
-            name=node.name,
-            input_name=node.inputs[0],
-            output_name=node.outputs[0],
-            axes=axes,
-            keepdims=keepdims,
-            reduce_all=reduce_all,
-        )
-    elif op_type == "ReduceMin":
-        builder.add_reduce_min(
-            name=node.name,
-            input_name=node.inputs[0],
-            output_name=node.outputs[0],
-            axes=axes,
-            keepdims=keepdims,
-            reduce_all=reduce_all,
-        )
-    elif op_type == "ReduceL2":
-        builder.add_reduce_l2(
-            name=node.name,
-            input_name=node.inputs[0],
-            output_name=node.outputs[0],
-            axes=axes,
-            keepdims=keepdims,
-            reduce_all=reduce_all,
-        )
-    elif op_type == "ReduceL1":
-        builder.add_reduce_l1(
-            name=node.name,
-            input_name=node.inputs[0],
-            output_name=node.outputs[0],
-            axes=axes,
-            keepdims=keepdims,
-            reduce_all=reduce_all,
-        )
-    elif op_type == "ReduceSumSquare":
-        builder.add_reduce_sumsquare(
-            name=node.name,
-            input_name=node.inputs[0],
-            output_name=node.outputs[0],
-            axes=axes,
-            keepdims=keepdims,
-            reduce_all=reduce_all,
-        )
-    elif op_type == "ReduceLogSum":
-        builder.add_reduce_logsum(
-            name=node.name,
-            input_name=node.inputs[0],
-            output_name=node.outputs[0],
-            axes=axes,
-            keepdims=keepdims,
-            reduce_all=reduce_all,
-        )
-    elif op_type == "ReduceLogSumExp":
-        builder.add_reduce_logsumexp(
-            name=node.name,
-            input_name=node.inputs[0],
-            output_name=node.outputs[0],
-            axes=axes,
-            keepdims=keepdims,
-            reduce_all=reduce_all,
-        )
-    else:
-        err.unsupported_op_configuration(
-            builder, node, graph, "Unsupported reduce operation: {}".format(op_type)
-        )
-
-
-def _convert_reshape(builder, node, graph, err):
-    """
-    convert to CoreML Reshape Static Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4844
-    """
-    shape_node = node.inputs[1]
-    if shape_node in node.input_tensors:
-        output_shape = node.input_tensors[shape_node].astype(np.int64)
-
-        # if rank is same, then call rank preserving reshape
-        if node.inputs[0] not in graph.shape_dict:
-            # If Input shape is not present and output shape is known
-            # add reshape static as
-            # TODO: ONNX should be able to infer the shape
-            builder.add_reshape_static(
-                name=node.name,
-                input_name=node.inputs[0],
-                output_name=node.outputs[0],
-                output_shape=output_shape,
-            )
-            return
-
-        len_of_input_shape = builder._get_rank(node.inputs[0])
-        if len(output_shape) == len_of_input_shape:
-            builder.add_rank_preserving_reshape(
-                name=node.name,
-                input_name=node.inputs[0],
-                output_name=node.outputs[0],
-                output_shape=output_shape,
-            )
-        else:
-            add_static_reshape = True
-            if len_of_input_shape > len(output_shape):
-                # Output rank is less than input rank
-                # Case when most of the dims size is unchanged
-                num_zeros = 0
-                num_neg_ones = 0
-                for i in output_shape:
-                    if i == 0:
-                        num_zeros += 1
-                    elif i == -1:
-                        num_neg_ones += 1
-
-                if num_neg_ones > 1:
-                    err.unsupported_op_configuration(
-                        builder,
-                        node,
-                        graph,
-                        "Error in ONNX model: At most one dimension of new shape can be -1, found {}".format(
-                            num_neg_ones
-                        ),
-                    )
-
-                if num_neg_ones + num_zeros == len(output_shape):
-                    # Rank of output is less than input
-                    # Make Rank equivalent for reshape and then squeeze
-                    add_static_reshape = False
-                    new_shape = []
-                    i = 0
-                    for i in range(len(output_shape)):
-                        new_shape.append(output_shape[i])
-                        if output_shape[i] == -1:
-                            break
-                    while i < len_of_input_shape - 1:
-                        new_shape.append(1)
-                        i += 1
-
-                    builder.add_rank_preserving_reshape(
-                        name=node.name + "_reshape_preserving",
-                        input_name=node.inputs[0],
-                        output_name=node.outputs[0] + "_reshape_dim_preserved",
-                        output_shape=new_shape,
-                    )
-
-                    squeeze_axes = list(
-                        range(len(output_shape) - len_of_input_shape, 0)
-                    )
-                    squeeze_axes.reverse()
-
-                    builder.add_squeeze(
-                        name=node.name,
-                        input_name=node.outputs[0] + "_reshape_dim_preserved",
-                        output_name=node.outputs[0],
-                        axes=squeeze_axes,
-                    )
-
-            if add_static_reshape:
-                builder.add_reshape_static(
-                    name=node.name,
-                    input_name=node.inputs[0],
-                    output_name=node.outputs[0],
-                    output_shape=output_shape,
-                )
-    else:
-        builder.add_reshape_dynamic(
-            name=node.name, input_names=node.inputs, output_name=node.outputs[0],
-        )
-
-
-def _convert_resize(builder, node, graph, err):
-    """
-    convert to CoreML Upsample or Resize Bilinear Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L2139
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L2178
-    """
-    mode = node.attrs.get("mode", "nearest")
-    if node.inputs[1] not in node.input_tensors:
-        return err.unsupported_op_configuration(
-            builder,
-            node,
-            graph,
-            "Scaling factor unknown!! CoreML does not support dynamic scaling for Resize",
-        )
-
-    mode = "NN" if mode == "nearest" else "BILINEAR"
-    scale = node.input_tensors[node.inputs[1]]
-
-    if scale.size == 0:
-        input_shape = graph.shape_dict[node.inputs[0]]
-        output_shape = graph.shape_dict[node.outputs[0]]
-        scale = (output_shape[2] // input_shape[2], output_shape[3] // input_shape[3])
-
-    builder.add_upsample(
-        name=node.name,
-        scaling_factor_h=scale[-2],
-        scaling_factor_w=scale[-1],
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        mode=mode,
-    )
-
-
-def _convert_reverse_sequence(builder, node, graph, err):
-    """
-    convert to CoreML Reverse Sequence Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L3577
-    """
-    batch_axis = node.attrs.get("batch_axis", 1)
-    time_axis = node.attrs.get("time_axis", 0)
-
-    output_name = node.outputs[0]
-    add_transpose = False
-    if batch_axis > time_axis:
-        output_name += "_before_reverse"
-        batch_axis, time_axis = time_axis, batch_axis
-        add_transpose = True
-
-    builder.add_reverse_sequence(
-        name=node.name,
-        input_names=node.inputs,
-        output_name=output_name,
-        batch_axis=batch_axis,
-        seq_axis=time_axis,
-    )
-
-    if add_transpose:
-        output_name_post = "_before_reverse"
-        rank = builder._get_rank(node.inputs[0])
-        if rank == -1:
-            return err.unsupported_op_configuration(
-                builder, node, graph, "Rank unknown for input"
-            )
-        axes = list(range(rank))
-        axes[batch_axis], axes[time_axis] = axes[time_axis], axes[batch_axis]
-        builder.add_transpose(
-            name=node.name + "_transpose",
-            axes=axes,
-            input_name=output_name,
-            output_name=node.outputs[0],
-        )
-
-
-def _convert_roialign(builder, node, graph, err):
-    """
-    convert to CoreML CropResize and Pooling Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L2239
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L1702
-    """
-
-    target_height = node.attrs.get("output_height", 1)
-    target_width = node.attrs.get("output_width", 1)
-    mode = node.attrs.get("mode", "AVERAGE").upper()
-    sampling_ratio = node.attrs.get("sampling_ratio", 0)
-    spatial_scale = node.attrs.get("sampling_scale", 1.0)
-
-    if node.inputs[2] in graph.inputs:
-        graph.inputs.remove(node.inputs[2])
-
-    builder.add_expand_dims(
-        name=node.name + "_expand_0",
-        input_name=node.inputs[0],
-        output_name=node.inputs[0] + "_expanded",
-        axes=[0],
-    )
-    node.inputs[0] += "_expanded"
-
-    builder.add_expand_dims(
-        name=node.name + "_expand_2",
-        input_name=node.inputs[2],
-        output_name=node.inputs[2] + "_expanded",
-        axes=[1],
-    )
-    node.inputs[2] += "_expanded"
-
-    builder.add_concat_nd(
-        name=node.name + "_concat_indices",
-        input_names=[node.inputs[2], node.inputs[1]],
-        output_name=node.inputs[1] + "_rois",
-        axis=1,
-    )
-    node.inputs[1] += "_rois"
-
-    builder.add_expand_dims(
-        name=node.name + "_expand_1",
-        input_name=node.inputs[1],
-        output_name=node.inputs[1] + "_expanded",
-        axes=[1, 3, 4],
-    )
-    node.inputs[1] += "_expanded"
-
-    builder.add_crop_resize(
-        name=node.name + "_crop_resize",
-        input_names=[node.inputs[0], node.inputs[1]],
-        output_name=node.outputs[0] + "_crop_resized",
-        target_height=target_height * sampling_ratio,
-        target_width=target_width * sampling_ratio,
-        mode="ROI_ALIGN_MODE",
-        box_indices_mode="CORNERS_WIDTH_FIRST",
-        spatial_scale=spatial_scale,
-    )
-
-    builder.add_squeeze(
-        name=node.name + "_squeeze",
-        input_name=node.outputs[0] + "_crop_resized",
-        output_name=node.outputs[0] + "_crop_resized_squeezed",
-        axes=[1],
-    )
-
-    builder.add_pooling(
-        name=node.name + "_pool",
-        height=sampling_ratio,
-        width=sampling_ratio,
-        layer_type=mode,
-        input_name=node.outputs[0] + "_crop_resized_squeezed",
-        output_name=node.outputs[0],
-        stride_height=sampling_ratio,
-        stride_width=sampling_ratio,
-        padding_type="VALID",
-    )
-
-
-def _convert_round(builder, node, graph, err):
-    """
-    convert to CoreML Round Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L5029
-    """
-    builder.add_round(
-        name=node.name, input_name=node.inputs[0], output_name=node.outputs[0]
-    )
-
-
-def _convert_scatter(builder, node, graph, err):
-    """
-    convert to CoreML Scatter Along Axis Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4308
-    """
-    axis = node.attrs.get("axis", 0)
-    builder.add_scatter_along_axis(
-        name=node.name, input_names=node.inputs, output_name=node.outputs[0], axis=axis
-    )
-
-
-def _convert_size(builder, node, graph, err):
-    """
-    convert to CoreML GetShape and ReduceProd Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L5131
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4722
-    """
-    builder.add_get_shape(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.inputs[0] + "_getshape",
-    )
-    builder.add_reduce_prod(
-        name=node.name + "_reduce_prod",
-        input_name=node.inputs[0] + "_getshape",
-        output_name=node.outputs[0],
-    )
-
-
-def _convert_slice_ir4v9(builder, node, graph, err):
-    """
-    convert to CoreML Slice Static Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L5082
-    """
-    if node.inputs[0] in graph.shape_dict:
-        data_shape = graph.shape_dict[node.inputs[0]]
-    else:
-        rank = builder._get_rank(node.inputs[0])
-        if rank == -1:
-            return err.unsupported_op_configuration(
-                builder, node, graph, "Input shape not available"
-            )
-        data_shape = [INT_MAX] * rank
-
-    len_of_data = len(data_shape)
-    begin_masks = [True] * len_of_data
-    end_masks = [True] * len_of_data
-
-    default_axes = list(range(len_of_data))
-    default_steps = [1] * len_of_data
-
-    ip_starts = node.attrs.get("starts")
-    ip_ends = node.attrs.get("ends")
-    axes = node.attrs.get("axes", default_axes)
-    steps = node.attrs.get("steps", default_steps)
-
-    starts = [0] * len_of_data
-    ends = [0] * len_of_data
-
-    for i in range(len(axes)):
-        current_axes = axes[i]
-        starts[current_axes] = ip_starts[i]
-        ends[current_axes] = ip_ends[i]
-        # n <= end <= INT_MAX implies end is -1, hence end_mask should be True
-        # otherwise end_mask should be False
-        if ends[current_axes] < data_shape[current_axes]:
-            # this means end is not -1
-            end_masks[current_axes] = False
-
-        if starts[current_axes] != 0:
-            begin_masks[current_axes] = False
-
-    builder.add_slice_static(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        begin_ids=starts,
-        end_ids=ends,
-        strides=steps,
-        begin_masks=begin_masks,
-        end_masks=end_masks,
-    )
-
-
-def _convert_slice(builder, node, graph, err):
-    """
-    convert to CoreML Slice Static Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L5082
-    """
-    if len(node.inputs) == 1:
-        return _convert_slice_ir4v9(builder, node, graph, err)
-
-    if node.inputs[0] not in graph.shape_dict:
-        err.unsupported_op_configuration(
-            builder, node, graph, "Input shape not available"
-        )
-
-    data_shape = graph.shape_dict[node.inputs[0]]
-    len_of_data = len(data_shape)
-    begin_masks = [True] * len_of_data
-    end_masks = [True] * len_of_data
-
-    default_axes = list(range(len_of_data))
-
-    add_static_slice_layer = False
-    if node.inputs[1] in node.input_tensors and node.inputs[2] in node.input_tensors:
-        if len(node.inputs) > 3:
-            if node.inputs[3] in node.input_tensors:
-                if len(node.inputs) > 4:
-                    if node.inputs[4] in node.input_tensors:
-                        add_static_slice_layer = True
-                else:
-                    add_static_slice_layer = True
-        else:
-            add_static_slice_layer = True
-
-    if add_static_slice_layer:
-        ip_starts = node.input_tensors[node.inputs[1]]
-        ip_ends = node.input_tensors[node.inputs[2]]
-        axes = (
-            node.input_tensors[node.inputs[3]] if len(node.inputs) > 3 else default_axes
-        )
-        ip_steps = node.input_tensors[node.inputs[4]] if len(node.inputs) > 4 else None
-
-        starts = [0] * len_of_data
-        ends = [0] * len_of_data
-        steps = [1] * len_of_data
-
-        for i in range(len(axes)):
-            current_axes = axes[i]
-            starts[current_axes] = ip_starts[i]
-            ends[current_axes] = ip_ends[i]
-            # n <= end <= INT_MAX implies end is -1, hence end_mask should be True
-            # otherwise end_mask should be False
-            if ends[current_axes] < data_shape[current_axes]:
-                # this means end is not -1
-                end_masks[current_axes] = False
-
-            if starts[current_axes] != 0:
-                begin_masks[current_axes] = False
-
-            if isinstance(ip_steps, list):
-                steps[current_axes] = ip_steps[i]
-
-        builder.add_slice_static(
-            name=node.name,
-            input_name=node.inputs[0],
-            output_name=node.outputs[0],
-            begin_ids=starts,
-            end_ids=ends,
-            strides=steps,
-            begin_masks=begin_masks,
-            end_masks=end_masks,
-        )
-    else:
-        err.unsupported_op_configuration(
-            builder,
-            node,
-            graph,
-            "CoreML does not support Dynamic Slice with unknown axes. Please provide Custom Function/Layer",
-        )
-
-
-def _convert_softmax_nd(builder, node, graph, err):
-    """
-    convert to CoreML SoftMax ND Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#3547
-    """
-    axis = node.attrs.get("axis", 1)
-    builder.add_softmax_nd(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0]
-        + ("_softmax" if node.op_type == "LogSoftmax" else ""),
-        axis=axis,
-    )
-    if node.op_type == "LogSoftmax":
-        builder.add_unary(
-            name=node.name + "_log",
-            input_name=node.outputs[0] + "_softmax",
-            output_name=node.outputs[0],
-            mode="log",
-        )
-
-
-def _convert_softmax(builder, node, graph, err):
-    """
-    convert to CoreML SoftMax ND Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#3547
-    """
-
-    def add_softmax(output_name, rank=-1, axis=-3):
-        softmax_axis = 3
-        axes = list(range(5 - rank))
-        if axis < 0:
-            axis = rank + axis
-        axis += len(axes)
-        softmax_output_name = output_name + "_expanded"
-
-        expanded_node = node.name + "_" + node.inputs[0] + "_expanded"
-        builder.add_expand_dims(
-            name=node.name + "_expand_dims",
-            input_name=node.inputs[0],
-            output_name=expanded_node,
-            axes=axes,
-        )
-        input_name = expanded_node
-        rank = 5
-
-        if axis != -3 and axis != rank - softmax_axis:
-            transpose_axes = list(range(rank))
-            transpose_axes[-3], transpose_axes[axis] = (
-                transpose_axes[axis],
-                transpose_axes[-3],
-            )
-
-            builder.add_transpose(
-                name=node.name + "_transpose",
-                axes=transpose_axes,
-                input_name=input_name,
-                output_name=input_name + "_transposed",
-            )
-            input_name += "_transposed"
-            softmax_output_name += "_transposed"
-
-        builder.add_softmax(
-            name=node.name, input_name=input_name, output_name=softmax_output_name
-        )
-
-        if axis != -3 and axis != rank - softmax_axis:
-            transpose_axes = list(range(rank))
-            transpose_axes[-3], transpose_axes[axis] = (
-                transpose_axes[axis],
-                transpose_axes[-3],
-            )
-
-            builder.add_transpose(
-                name=node.name + "_transpose_back",
-                axes=transpose_axes,
-                input_name=softmax_output_name,
-                output_name=softmax_output_name + "_transposed_back",
-            )
-            softmax_output_name += "_transposed_back"
-
-        builder.add_squeeze(
-            name=node.name + "_squeeze_dims",
-            input_name=softmax_output_name,
-            output_name=output_name,
-            axes=axes,
-        )
-
-    axis = node.attrs.get("axis", 1)
-    rank = builder._get_rank(node.inputs[0])
-    if rank == -1:
-        return _convert_softmax_nd(builder, node, graph, err)
-
-    if node.op_type == "LogSoftmax":
-        add_softmax(node.outputs[0] + "_softmax", rank=rank, axis=axis)
-        builder.add_unary(
-            name=node.name + "_log",
-            input_name=node.outputs[0] + "_softmax",
-            output_name=node.outputs[0],
-            mode="log",
-        )
-    else:
-        add_softmax(node.outputs[0], rank=rank, axis=axis)
-
-
-def _convert_split(builder, node, graph, err):
-    """
-    convert to CoreML Squeeze Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#5003
-    """
-    axis = node.attrs.get("axis", 0)
-    split = node.attrs.get("split", None)
-    num_splits = len(node.outputs) if split is None else 2
-
-    builder.add_split_nd(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_names=node.outputs,
-        axis=axis,
-        num_splits=num_splits,
-        split_sizes=split,
-    )
-
-
-def _convert_shape(builder, node, graph, err):
-    """
-    convert to CoreML GetShape Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L5131
-    """
-    builder.add_get_shape(
-        name=node.name, input_name=node.inputs[0], output_name=node.outputs[0]
-    )
-
-
-def _convert_squeeze(builder, node, graph, err):
-    """
-    convert to CoreML Squeeze Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4903
-    """
-    axes = node.attrs.get("axes", None)
-    builder.add_squeeze(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        axes=axes,
-    )
-
-
-def _convert_sub(builder, node, graph, err):
-    """
-    convert to CoreML Subtract Broadcastable Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4117
-    """
-    load_input_constants(builder, node, graph, err)
-    add_broadcastable_op_chain(builder, node, err, builder.add_subtract_broadcastable)
-
-
-def _convert_tanh(builder, node, graph, err):
-    """
-    convert to CoreML Tanh Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L3881
-    """
-    load_input_constants(builder, node, graph, err)
-    builder.add_tanh(
-        name=node.name, input_name=node.inputs[0], output_name=node.outputs[0]
-    )
-
-
-def _convert_tile(builder, node, graph, err):
-    """
-    convert to CoreML Tile Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L5117
-    """
-    load_input_constants(builder, node, graph, err)
-    if node.inputs[1] not in node.input_tensors:
-        err.unsupported_op_configuration(
-            builder,
-            node,
-            graph,
-            "CoreML Tile layer does not support dynamic 'reps'. 'reps' should be known statically",
-        )
-    builder.add_tile(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        reps=node.input_tensors[node.inputs[1]].astype(np.int32).tolist(),
-    )
-
-
-def _convert_topk(builder, node, graph, err):
-    """
-    convert to CoreML TopK Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L5190
-    """
-    load_input_constants(builder, node, graph, err)
-    axis = node.attrs.get("axis", -1)
-    bottom_k = node.attrs.get("largest", True) == False
-    k = node.attrs.get('k', 0)
-    # NOTE: Sorted order attribute is currently ignored in CoreML
-    sorted_order = node.attrs.get("sorted", True)
-    if "sorted" in node.attrs:
-        err.unsupported_feature_warning(
-            node, "Sorted Order attribute('sorted') is currently ignored in CoreML 3.0"
-        )
-
-    builder.add_topk(
-        name=node.name,
-        input_names=node.inputs,
-        output_names=node.outputs,
-        k=k,
-        axis=axis,
-        use_bottom_k=bottom_k,
-    )
-
-
-def _convert_transpose(builder, node, graph, err):
-    """
-    convert to CoreML Transpose Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L3426
-    """
-
-    axes = node.attrs.get("perm", [])
-    # If 'perm' not provided, the reverse the dimensions
-    if axes == []:
-        rank = builder._get_rank(node.inputs[0])
-        if rank == -1:
-            return err.unsupported_op_configuration(
-                builder, node, graph, "Rank unknown for input"
-            )
-        axes = list(range(-1, -(rank + 1), -1))
-
-    builder.add_transpose(
-        name=node.name,
-        axes=axes,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-    )
-
-
-def _convert_unsqueeze(builder, node, graph, err):
-    """
-    convert to CoreML ExpandDim Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L4810
-    """
-    axes = node.attrs.get("axes")
-    builder.add_expand_dims(
-        name=node.name,
-        input_name=node.inputs[0],
-        output_name=node.outputs[0],
-        axes=axes,
-    )
-
-
-def _convert_where(builder, node, graph, err):
-    """
-    convert to CoreML WhereBroadcastable Layer:
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L3742
-    """
-    load_input_constants(builder, node, graph, err)
-    builder.add_where_broadcastable(
-        name=node.name, input_names=node.inputs, output_name=node.outputs[0],
-    )
-
-
-_ONNX_NODE_REGISTRY_ND = {
-    "Abs": _convert_abs,
-    "Acos": _convert_acos,
-    "Acosh": _convert_acosh,
-    "Add": _convert_add,
-    "And": _convert_logical,
-    "ArgMax": _convert_argmax,
-    "ArgMin": _convert_argmin,
-    "Asin": _convert_asin,
-    "Asinh": _convert_asinh,
-    "Atan": _convert_atan,
-    "Atanh": _convert_atanh,
-    "AveragePool": _convert_pool,
-    "BatchNormalization": _convert_bn,
-    "Cast": _convert_cast,
-    "Ceil": _convert_ceil,
-    "Clip": _convert_clip,
-    "Concat": _convert_concat,
-    "Constant": _convert_constant,
-    "ConstantOfShape": _convert_constant_of_shape,
-    "Conv": _convert_conv,
-    "ConvTranspose": _convert_conv,
-    "Cos": _convert_cos,
-    "Cosh": _convert_cosh,
-    "DepthToSpace": _convert_reorganize_data,
-    "Div": _convert_div,
-    "Elu": _convert_elu,
-    "Equal": _convert_equal,
-    "Erf": _convert_erf,
-    "Exp": _convert_exp,
-    "Expand": _convert_expand,
-    "Flatten": _convert_flatten,
-    "Floor": _convert_floor,
-    "Gather": _convert_gather,
-    "Gemm": _convert_gemm,
-    "Greater": _convert_greater,
-    "GRU": _convert_gru,
-    "GlobalAveragePool": _convert_pool,
-    "GlobalMaxPool": _convert_pool,
-    "HardSigmoid": _convert_hardsigmoid,
-    "Identity": _convert_identity,
-    "InstanceNormalization": _convert_instancenorm,
-    "LeakyRelu": _convert_leaky_relu,
-    "Log": _convert_log,
-    "LogSoftmax": _convert_softmax,
-    "LRN": _convert_lrn,
-    "Less": _convert_less,
-    "LSTM": _convert_lstm,
-    "MatMul": _convert_matmul,
-    "Max": _convert_max,
-    "MaxPool": _convert_pool,
-    "Mean": _convert_mean,
-    "Min": _convert_min,
-    "Mod": _convert_mod,
-    "Mul": _convert_mul,
-    "Neg": _convert_neg,
-    "NonZero": _convert_nonzero,
-    "Not": _convert_logical,
-    "Or": _convert_logical,
-    "Pad": _convert_pad,
-    "Pow": _convert_pow,
-    "PRelu": _convert_prelu,
-    "RandomNormal": _convert_randomnormal,
-    "RandomNormalLike": _convert_randomnormallike,
-    "RandomUniform": _convert_randomuniform,
-    "RandomUniformLike": _convert_randomuniformlike,
-    "Reciprocal": _convert_reciprocal,
-    "ReduceL1": _convert_reduce,
-    "ReduceL2": _convert_reduce,
-    "ReduceLogSum": _convert_reduce,
-    "ReduceLogSumExp": _convert_reduce,
-    "ReduceMax": _convert_reduce,
-    "ReduceMean": _convert_reduce,
-    "ReduceMin": _convert_reduce,
-    "ReduceProd": _convert_reduce,
-    "ReduceSum": _convert_reduce,
-    "ReduceSumSquare": _convert_reduce,
-    "Relu": _convert_relu,
-    "Reshape": _convert_reshape,
-    "Resize": _convert_resize,
-    "ReverseSequence": _convert_reverse_sequence,
-    "RoiAlign": _convert_roialign,
-    "Round": _convert_round,
-    "Scatter": _convert_scatter,
-    "Selu": _convert_selu,
-    "Sigmoid": _convert_sigmoid,
-    "Sign": _convert_sign,
-    "Size": _convert_size,
-    "Slice": _convert_slice,
-    "Softmax": _convert_softmax,
-    "Softplus": _convert_softplus,
-    "Softsign": _convert_softsign,
-    "SpaceToDepth": _convert_reorganize_data,
-    "Split": _convert_split,
-    "Shape": _convert_shape,
-    "Sqrt": _convert_sqrt,
-    "Squeeze": _convert_squeeze,
-    "Sub": _convert_sub,
-    "Sum": _convert_add,
-    "Tanh": _convert_tanh,
-    "ThresholdedRelu": _convert_thresholdedrelu,
-    "Tile": _convert_tile,
-    "TopK": _convert_topk,
-    "Transpose": _convert_transpose,
-    "Unsqueeze": _convert_unsqueeze,
-    "Upsample": _convert_upsample,
-    "Xor": _convert_logical,
-    "Where": _convert_where,
-}
-
-
-def _get_node_converter_fn(
-    builder, node, err
-):  # type: (NeuralNetworkBuilder, Node, ErrorHandling) -> Callable[[NeuralNetworkBuilder, Node, Graph, ErrorHandling], None]
-    """
-    Get the right converter function for ONNX node op_type
-    """
-    op_type = node.op_type
-    # Return custom conversion function if provided
-    # If both node type and node name custom function
-    # is provided, then use node name specific custom function, as
-    # type specific custom function is more generic than name specific
-    if node.name in err.custom_conversion_functions:
-        return err.custom_conversion_functions[node.name]
-    elif op_type in err.custom_conversion_functions:
-        return err.custom_conversion_functions[op_type]
-    elif op_type in _ONNX_NODE_REGISTRY_ND:
-        return _ONNX_NODE_REGISTRY_ND[op_type]
-    else:
-        return err.unsupported_op(node)
-
-
-def _convert_node_nd(
-    builder, node, graph, err
-):  # type: (NeuralNetworkBuilder, Node, Graph, ErrorHandling) -> None
-    converter_fn = _get_node_converter_fn(builder, node, err)
-    return converter_fn(builder, node, graph, err)
diff --git a/coremltools/converters/onnx/_tests/_test_utils.py b/coremltools/converters/onnx/_tests/_test_utils.py
deleted file mode 100644
index 4fb21a879..000000000
--- a/coremltools/converters/onnx/_tests/_test_utils.py
+++ /dev/null
@@ -1,270 +0,0 @@
-
-import numpy as np
-import numpy.testing as npt  # type: ignore
-import numpy.random as npr
-from onnx import helper, ModelProto, ValueInfoProto, TensorProto, NodeProto
-from typing import Any, Sequence, Text, Tuple, Optional, Dict, List, TypeVar
-from coremltools.converters.onnx import convert
-from coremltools.converters.onnx._converter import SupportedVersion
-from coremltools._deps import _IS_MACOS
-import sys
-
-"""
-   dynamically generate random inputs,
-   use caffe2 backend for onnx and
-"""
-
-
-def _forward_onnx_model(
-    model,  # type: ModelProto
-    input_dict,  # type: Dict[Text, np._ArrayLike[Any]]
-    test_name="",  # type: Text
-):
-    # type: (...) -> np.ndarray[Any]
-
-    import caffe2.python.onnx.backend  # type: ignore
-
-    prepared_backend = caffe2.python.onnx.backend.prepare(model)
-    out = prepared_backend.run(input_dict)
-    out_dict = {}
-    out_names = [v.name for v in model.graph.output]
-    for out_name in out_names:
-        out_dict[out_name] = out[out_name]
-
-    result = [out[v.name] for v in model.graph.output]
-    output_shapes = [_shape_from_onnx_value_info(o) for o in model.graph.output]
-    for i, output in enumerate(result):
-        result[i] = output.reshape(output_shapes[i])
-    return np.array(result)
-
-
-def _onnx_create_model(
-    nodes,  # type: Sequence[NodeProto]
-    inputs,  # type: Sequence[Tuple[Text,Tuple[int, ...]]]
-    outputs,  # type: Sequence[Tuple[Text,Tuple[int, ...], int]]
-    initializer=[],  # type: Sequence[TensorProto]
-    value_info=[],  # type: Sequence[Tuple[Text,Tuple[int, ...], int]]
-):
-    # type: (...) -> ModelProto
-    initializer_inputs = [
-        helper.make_tensor_value_info(t.name, TensorProto.FLOAT, t.dims)
-        for t in initializer
-    ]
-
-    value_infos = [
-        helper.make_tensor_value_info(v_[0], v_[2], v_[1])
-        for v_ in value_info
-    ]
-
-    graph = helper.make_graph(
-        nodes=nodes,
-        name="test",
-        inputs=initializer_inputs
-        + [
-            helper.make_tensor_value_info(input_[0], TensorProto.FLOAT, input_[1])
-            for input_ in inputs
-        ],
-        outputs=[
-            helper.make_tensor_value_info(output_[0], output_[2], output_[1])
-            for output_ in outputs
-        ],
-        initializer=initializer,
-        value_info=value_infos,
-    )
-    onnx_model = helper.make_model(graph)
-    return onnx_model
-
-
-def _onnx_create_single_node_model(
-    op_type,  # type: Text
-    input_shapes,  # type: Sequence[Tuple[int, ...]]
-    output_shapes,  # type: Sequence[Tuple[int, ...]]
-    initializer=[],  # type: Sequence[TensorProto]
-    **kwargs  # type: Any
-):
-    # type: (...) -> ModelProto
-    inputs = [("input{}".format(i,), input_shapes[i]) for i in range(len(input_shapes))]
-    outputs = [
-        ("output{}".format(i,), output_shapes[i], TensorProto.FLOAT)
-        for i in range(len(output_shapes))
-    ]
-
-    node = helper.make_node(
-        op_type,
-        inputs=[i[0] for i in inputs] + [t.name for t in initializer],
-        outputs=[o[0] for o in outputs],
-        **kwargs
-    )
-    return _onnx_create_model([node], inputs, outputs, initializer)
-
-
-def _shape_from_onnx_value_info(
-    v,
-):  # type: (ValueInfoProto) -> Sequence[Tuple[int, ...]]
-    return tuple([d.dim_value for d in v.type.tensor_type.shape.dim])
-
-
-def _coreml_forward_model(
-    model,  # type: ModelProto
-    input_dict,  # type: Dict[Text, np._ArrayLike[Any]]
-    output_names,  # type: Sequence[Text]
-    minimum_ios_deployment_target="12",
-):
-    # type: (...) -> np.ndarray[Any]
-    if not SupportedVersion.is_nd_array_supported(minimum_ios_deployment_target):
-        for k, arr in input_dict.items():
-            if len(arr.shape) == 4:
-                input_dict[k] = arr[0]
-        for k, v in input_dict.items():
-            if len(v.shape) == 2 and v.shape[0] == 1:
-                input_dict[k] = v.flatten()
-    coreml_out = model.predict(input_dict, useCPUOnly=True)
-    return np.array([coreml_out[name] for name in output_names])
-
-
-def _coreml_forward_onnx_model(
-    model,  # type: ModelProto
-    input_dict,  # type: Dict[Text, np._ArrayLike[Any]]
-    onnx_coreml_input_shape_map={},  # type: Dict[Text, List[int,...]]
-    minimum_ios_deployment_target="12",
-):
-    # type: (...) -> np.ndarray[Any]
-    coreml_model = convert(
-        model,
-        onnx_coreml_input_shape_map=onnx_coreml_input_shape_map,
-        minimum_ios_deployment_target=minimum_ios_deployment_target,
-    )
-    output_names = [o.name for o in model.graph.output]
-    return _coreml_forward_model(
-        coreml_model,
-        input_dict,
-        output_names,
-        minimum_ios_deployment_target=minimum_ios_deployment_target,
-    )
-
-
-def _random_array(
-    shape, random_seed=10
-):  # type: (Tuple[int, ...], Any) -> np._ArrayLike[float]
-    if random_seed:
-        npr.seed(random_seed)  # type: ignore
-    return npr.ranf(shape).astype("float32")
-
-
-def _conv_pool_output_size(
-    input_shape,  # type: Sequence[int]
-    dilations,  # type: Sequence[int]
-    kernel_shape,  # type: Tuple[int, int]
-    pads,  # type: Sequence[int]
-    strides,  # type: Tuple[int, int]
-):
-    # type: (...) -> Tuple[int, int]
-    output_height = (
-        input_shape[2] + pads[0] + pads[2] - (dilations[0] * (kernel_shape[0] - 1) + 1)
-    ) / strides[0] + 1
-    output_width = (
-        input_shape[3] + pads[1] + pads[3] - (dilations[1] * (kernel_shape[1] - 1) + 1)
-    ) / strides[1] + 1
-
-    return (int(output_height), int(output_width))
-
-
-_T = TypeVar("_T")
-
-
-def _assert_outputs(
-    output1,  # type: np.ndarray[_T]
-    output2,  # type: np.ndarray[_T]
-    decimal=7,  # type: int
-):
-    # type: (...) -> None
-    npt.assert_equal(len(output1), len(output2))
-    for o1, o2 in zip(output1, output2):
-        npt.assert_almost_equal(o2.flatten(), o1.flatten(), decimal=decimal)
-
-
-def _prepare_inputs_for_onnx(
-    model,  # type: ModelProto
-    test_name="",  # type: Text
-    values=None,  # type: Optional[List[np._ArrayLike[Any]]]
-):
-    # type: (...) -> Dict[Text, np._ArrayLike[Any]]
-    graph = model.graph
-    initializer_names = {t.name for t in graph.initializer}
-    input_names = [i.name for i in graph.input if i.name not in initializer_names]
-    input_shapes = [
-        tuple([d.dim_value for d in i.type.tensor_type.shape.dim])
-        for i in graph.input
-        if i.name not in initializer_names
-    ]
-
-    if values is None:
-        inputs = [_random_array(shape) for shape in input_shapes]
-    else:
-        inputs = values
-    input_dict = dict(zip(input_names, inputs))
-    return input_dict
-
-
-def _test_onnx_model(
-    model,  # type: ModelProto
-    test_name="",  # type: Text
-    decimal=5,  # type: int
-    onnx_coreml_input_shape_map={},  # type: Dict[Text, List[int,...]]
-    coreml_input_shape={},  # type: Dict[Text, List[int,...]]
-    minimum_ios_deployment_target="12",
-):
-    # type: (...) -> None
-    if not test_name:
-        test_name = sys._getframe(1).f_code.co_name
-    W = _prepare_inputs_for_onnx(model, test_name=test_name)
-    c2_outputs = _forward_onnx_model(model, W, test_name=test_name)
-    coreml_input_dict = dict()
-    # Supported iOS Version
-    # New OS Version must be added at the end to maintain backward version index
-    supported_ios_version = ["11.2", "12", "13"]
-    IOS_13_VERSION = supported_ios_version.index("13")
-    for key, value in W.items():
-        if (
-            supported_ios_version.index(minimum_ios_deployment_target) < IOS_13_VERSION
-            and key in coreml_input_shape
-        ):
-            coreml_input_dict[key] = np.reshape(value, coreml_input_shape[key])
-        else:
-            coreml_input_dict[key] = value
-    if _IS_MACOS:
-        coreml_outputs = _coreml_forward_onnx_model(
-            model,
-            coreml_input_dict,
-            onnx_coreml_input_shape_map=onnx_coreml_input_shape_map,
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
-        _assert_outputs(c2_outputs, coreml_outputs, decimal=decimal)
-
-
-def _test_single_node(
-    op_type,  # type: Text
-    input_shapes,  # type: Sequence[Tuple[int, ...]]
-    output_shapes,  # type: Sequence[Tuple[int, ...]]
-    initializer=[],  # type: Sequence[TensorProto]
-    decimal=5,  # type: int
-    test_name="",  # type: Text
-    onnx_coreml_input_shape_map={},  # type: Dict[Text, List[int,...]]
-    coreml_input_shape={},  # type: Dict[Text, List[int,...]]
-    minimum_ios_deployment_target="12",
-    **kwargs  # type: Any
-):
-    # type: (...) -> None
-    model = _onnx_create_single_node_model(
-        op_type, input_shapes, output_shapes, initializer, **kwargs
-    )
-    if not test_name:
-        test_name = sys._getframe(1).f_code.co_name
-    _test_onnx_model(
-        model,
-        test_name=test_name,
-        decimal=decimal,
-        onnx_coreml_input_shape_map=onnx_coreml_input_shape_map,
-        coreml_input_shape=coreml_input_shape,
-        minimum_ios_deployment_target=minimum_ios_deployment_target,
-    )
diff --git a/coremltools/converters/onnx/_tests/test_convert.py b/coremltools/converters/onnx/_tests/test_convert.py
deleted file mode 100644
index 8ec56c827..000000000
--- a/coremltools/converters/onnx/_tests/test_convert.py
+++ /dev/null
@@ -1,115 +0,0 @@
-
-from coremltools._deps import _HAS_ONNX, MSG_ONNX_NOT_FOUND, _IS_MACOS
-import unittest
-import numpy as np
-import numpy.testing as npt  # type: ignore
-import numpy.random as npr
-
-from PIL import Image  # type: ignore
-
-if _HAS_ONNX:
-    import onnx
-    from onnx.numpy_helper import from_array
-    from coremltools.converters.onnx import convert
-    from ._test_utils import _onnx_create_single_node_model
-
-
-@unittest.skipUnless(_HAS_ONNX, MSG_ONNX_NOT_FOUND)
-class ConvertTest(unittest.TestCase):
-    def setUp(self):  # type: () -> None
-        self.img_arr = np.uint8(npr.rand(224, 224, 3) * 255)  # type: ignore
-        self.img = Image.fromarray(np.uint8(self.img_arr))  # type: ignore
-        self.img_arr = np.float32(self.img_arr)  # type: ignore
-        self.onnx_model = _onnx_create_single_node_model(
-            "Relu", [(3, 224, 224)], [(3, 224, 224)]
-        )
-        self.input_names = [i.name for i in self.onnx_model.graph.input]
-        self.output_names = [o.name for o in self.onnx_model.graph.output]
-
-    def test_convert_image_input(self):  # type: () -> None
-        coreml_model = convert(self.onnx_model, image_input_names=self.input_names)
-        spec = coreml_model.get_spec()
-        for input_ in spec.description.input:
-            self.assertEqual(input_.type.WhichOneof("Type"), "imageType")
-
-    def test_convert_image_output(self):  # type: () -> None
-        coreml_model = convert(self.onnx_model, image_output_names=self.output_names)
-        spec = coreml_model.get_spec()
-        for output in spec.description.output:
-            self.assertEqual(output.type.WhichOneof("Type"), "imageType")
-
-    def test_convert_image_input_preprocess(self):  # type: () -> None
-        bias = np.array([100, 90, 80])
-        coreml_model = convert(
-            self.onnx_model,
-            image_input_names=self.input_names,
-            preprocessing_args={
-                "is_bgr": True,
-                "blue_bias": bias[0],
-                "green_bias": bias[1],
-                "red_bias": bias[2],
-            },
-        )
-
-        if _IS_MACOS:
-            output = coreml_model.predict({self.input_names[0]: self.img})[
-                self.output_names[0]
-            ]
-
-            expected_output = self.img_arr[:, :, ::-1].transpose((2, 0, 1))
-            expected_output[0] = expected_output[0] + bias[0]
-            expected_output[1] = expected_output[1] + bias[1]
-            expected_output[2] = expected_output[2] + bias[2]
-            npt.assert_equal(output.flatten(), expected_output.flatten())
-
-    def test_convert_image_output_bgr(self):  # type: () -> None
-        coreml_model = convert(
-            self.onnx_model,
-            image_input_names=self.input_names,
-            image_output_names=self.output_names,
-            deprocessing_args={"is_bgr": True},
-        )
-
-        if _IS_MACOS:
-            output = coreml_model.predict({self.input_names[0]: self.img})[
-                self.output_names[0]
-            ]
-            output = np.array(output)[:, :, :3].transpose((2, 0, 1))
-            expected_output = self.img_arr[:, :, ::-1].transpose((2, 0, 1))
-            npt.assert_equal(output, expected_output)
-
-
-@unittest.skipUnless(_HAS_ONNX, MSG_ONNX_NOT_FOUND)
-class NodeConversionTest(unittest.TestCase):
-    def test_resize_node_without_scales(self):
-        input_shape = (1, 3, 192, 78)
-        output_shape = (1, 3, 384, 234)
-        roi = from_array(np.array([], dtype=int), name="roi")
-        scales = from_array(np.empty([], dtype=int), name="scales")
-        sizes = from_array(np.empty([], dtype=int), name="sizes")
-        onnx_model_to_test = _onnx_create_single_node_model(
-            "Resize",
-            [input_shape],
-            [output_shape],
-            initializer=[roi, scales, sizes],
-            coordinate_transformation_mode="pytorch_half_pixel",
-            cubic_coeff_a=-0.5,
-            mode="linear",
-            nearest_mode="floor"
-        )
-
-        coreml_model = convert(onnx_model_to_test,
-                               minimum_ios_deployment_target="13")
-
-        self.assertEqual(len(coreml_model.get_spec().neuralNetwork.layers), 1,
-                         msg="Wrong number of layers in converted network")
-
-        layer = coreml_model.get_spec().neuralNetwork.layers[0]
-        self.assertTrue(hasattr(layer, "upsample"),
-                        msg="Wrong resize conversion")
-        self.assertEqual(len(layer.upsample.scalingFactor), 2,
-                         msg="Wrong number of scaling factors")
-        self.assertSequenceEqual(layer.upsample.scalingFactor,
-                                 (output_shape[2] // input_shape[2],
-                                  output_shape[3] // input_shape[3]),
-                                  msg="Conversion produces wrong scaling factor")
diff --git a/coremltools/converters/onnx/_tests/test_custom_layers.py b/coremltools/converters/onnx/_tests/test_custom_layers.py
deleted file mode 100644
index 28d50ffff..000000000
--- a/coremltools/converters/onnx/_tests/test_custom_layers.py
+++ /dev/null
@@ -1,221 +0,0 @@
-
-import unittest
-
-from coremltools._deps import _HAS_ONNX, MSG_ONNX_NOT_FOUND
-
-if _HAS_ONNX:
-    import onnx
-    from ._test_utils import _onnx_create_model
-    from onnx import helper, ModelProto, TensorProto
-    from coremltools.converters.onnx import convert
-from coremltools.proto import NeuralNetwork_pb2  # type: ignore
-
-
-def _make_model_acos_exp_topk():  # type: (...) -> ModelProto
-    """
-  make a very simple model for testing: input->clip->exp->topk->2 outputs
-  """
-    inputs = [("input0", (10,), TensorProto.FLOAT), ("K", (1,), TensorProto.INT64)]
-    outputs = [
-        ("output_values", (3,), TensorProto.FLOAT),
-        ("output_indices", (3,), TensorProto.INT64),
-    ]
-    acos = helper.make_node("Acos", inputs=[inputs[0][0]], outputs=["acos_out"])
-    exp = helper.make_node("Exp", inputs=[acos.output[0]], outputs=["exp_out"])
-    topk = helper.make_node(
-        "TopK",
-        inputs=[exp.output[0], inputs[1][0]],
-        outputs=[outputs[0][0], outputs[1][0]],
-        axis=0,
-    )
-    return _onnx_create_model([acos, exp, topk], inputs, outputs)
-
-
-def _make_model_flatten_axis3():  # type: (...) -> ModelProto
-    """
-  make a simple model: 4-D input -> flatten (axis=3)-> output
-  """
-    inputs = [("input", (1, 3, 10, 20), TensorProto.FLOAT)]
-    outputs = [("output", (30, 20), TensorProto.FLOAT)]
-    flatten = helper.make_node(
-        "Flatten", inputs=[inputs[0][0]], outputs=[outputs[0][0]], axis=3
-    )
-    return _onnx_create_model([flatten], inputs, outputs)
-
-
-@unittest.skipUnless(_HAS_ONNX, MSG_ONNX_NOT_FOUND)
-class CustomLayerTest(unittest.TestCase):
-    def test_unsupported_ops(self):  # type: () -> None
-
-        onnx_model = _make_model_acos_exp_topk()
-        coreml_model = convert(onnx_model, add_custom_layers=True)
-
-        spec = coreml_model.get_spec()
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[0].custom)
-        self.assertIsNotNone(layers[2].custom)
-        self.assertEqual("Acos", layers[0].custom.className)
-        self.assertEqual("TopK", layers[2].custom.className)
-
-    def test_unsupported_ops_provide_functions(self):  # type: () -> None
-        def convert_acos(builder, node, graph, err):
-            params = NeuralNetwork_pb2.CustomLayerParams()
-            params.className = node.op_type
-            params.description = "Custom layer that corresponds to the ONNX op {}".format(
-                node.op_type,
-            )
-
-            builder.add_custom(
-                name=node.name,
-                input_names=node.inputs,
-                output_names=node.outputs,
-                custom_proto_spec=params,
-            )
-
-        def convert_topk(builder, node, graph, err):
-            params = NeuralNetwork_pb2.CustomLayerParams()
-            params.className = node.op_type
-            params.description = "Custom layer that corresponds to the ONNX op {}".format(
-                node.op_type,
-            )
-            params.parameters["axis"].intValue = node.attrs.get("axis", -1)
-
-            builder.add_custom(
-                name=node.name,
-                input_names=node.inputs,
-                output_names=node.outputs,
-                custom_proto_spec=params,
-            )
-
-        onnx_model = _make_model_acos_exp_topk()
-        coreml_model = convert(
-            model=onnx_model,
-            add_custom_layers=True,
-            custom_conversion_functions={"Acos": convert_acos, "TopK": convert_topk},
-        )
-
-        spec = coreml_model.get_spec()
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[0].custom)
-        self.assertIsNotNone(layers[2].custom)
-        self.assertEqual("Acos", layers[0].custom.className)
-        self.assertEqual("TopK", layers[2].custom.className)
-        self.assertEqual(0, layers[2].custom.parameters["axis"].intValue)
-
-    def test_node_name_type_custom_functions(self):  # type: () -> None
-        def convert_acos(builder, node, graph, err):
-            params = NeuralNetwork_pb2.CustomLayerParams()
-            params.className = node.op_type
-            params.description = "Custom layer that corresponds to the ONNX op {}".format(
-                node.op_type,
-            )
-
-            builder.add_custom(
-                name=node.name,
-                input_names=node.inputs,
-                output_names=node.outputs,
-                custom_proto_spec=params,
-            )
-
-        def convert_topk_generic(builder, node, graph, err):
-            params = NeuralNetwork_pb2.CustomLayerParams()
-            params.className = node.op_type
-            params.description = "Custom layer that corresponds to the ONNX op {}".format(
-                node.op_type,
-            )
-            params.parameters["axis"].intValue = node.attrs.get("axis", -1)
-            params.parameters["k"].intValue = node.attrs["k"]
-
-            builder.add_custom(
-                name=node.name,
-                input_names=node.inputs,
-                output_names=node.outputs,
-                custom_proto_spec=params,
-            )
-
-        def convert_topk_node_specific(builder, node, graph, err):
-            params = NeuralNetwork_pb2.CustomLayerParams()
-            params.className = node.op_type
-            params.description = "Custom layer that corresponds to the ONNX op {}".format(
-                node.op_type,
-            )
-            params.parameters["axis"].intValue = node.attrs.get("axis", -1)
-
-            builder.add_custom(
-                name=node.name,
-                input_names=node.inputs,
-                output_names=node.outputs,
-                custom_proto_spec=params,
-            )
-
-        onnx_model = _make_model_acos_exp_topk()
-        coreml_model = convert(
-            model=onnx_model,
-            add_custom_layers=True,
-            custom_conversion_functions={
-                "Acos": convert_acos,
-                "TopK": convert_topk_generic,
-                "output_values_output_indices": convert_topk_node_specific,
-            },
-        )
-
-        spec = coreml_model.get_spec()
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[0].custom)
-        self.assertIsNotNone(layers[2].custom)
-        self.assertEqual("Acos", layers[0].custom.className)
-        self.assertEqual("TopK", layers[2].custom.className)
-        self.assertEqual(0, layers[2].custom.parameters["axis"].intValue)
-
-    def test_unsupported_op_attribute(self):  # type: () -> None
-        onnx_model = _make_model_flatten_axis3()
-        coreml_model = convert(onnx_model, add_custom_layers=True)
-
-        spec = coreml_model.get_spec()
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[0].custom)
-        self.assertEqual("Flatten", layers[0].custom.className)
-
-    def test_unsupported_op_attribute_provide_functions(self):  # type: () -> None
-        def convert_flatten(builder, node, graph, err):
-            params = NeuralNetwork_pb2.CustomLayerParams()
-            params.className = node.op_type
-            params.description = "Custom layer that corresponds to the ONNX op {}".format(
-                node.op_type,
-            )
-            params.parameters["axis"].intValue = node.attrs["axis"]
-
-            builder.add_custom(
-                name=node.name,
-                input_names=node.inputs,
-                output_names=node.outputs,
-                custom_proto_spec=params,
-            )
-
-        def test_conversion(onnx_model, add_custom_layers=False):
-            coreml_model = convert(
-                onnx_model,
-                add_custom_layers=add_custom_layers,
-                custom_conversion_functions={"Flatten": convert_flatten},
-            )
-
-            spec = coreml_model.get_spec()
-            layers = spec.neuralNetwork.layers
-            self.assertIsNotNone(layers[0].custom)
-            self.assertEqual("Flatten", layers[0].custom.className)
-            self.assertEqual(3, layers[0].custom.parameters["axis"].intValue)
-
-        onnx_model = _make_model_flatten_axis3()
-        # Test with add_custom_layers True
-        convert(
-            onnx_model,
-            add_custom_layers=True,
-            custom_conversion_functions={"Flatten": convert_flatten},
-        )
-
-        # Test with add_custom_layers False
-        convert(
-            onnx_model,
-            add_custom_layers=False,
-            custom_conversion_functions={"Flatten": convert_flatten},
-        )
diff --git a/coremltools/converters/onnx/_tests/test_graph.py b/coremltools/converters/onnx/_tests/test_graph.py
deleted file mode 100644
index 457ea09c2..000000000
--- a/coremltools/converters/onnx/_tests/test_graph.py
+++ /dev/null
@@ -1,81 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
-import unittest
-from coremltools._deps import _HAS_ONNX, MSG_ONNX_NOT_FOUND
-
-if _HAS_ONNX:
-    import onnx
-    from onnx import helper, numpy_helper, TensorProto
-    from coremltools.converters.onnx._graph import Node, Graph
-    from ._test_utils import (
-        _onnx_create_single_node_model,
-        _onnx_create_model,
-        _conv_pool_output_size,
-        _random_array,
-    )
-
-
-@unittest.skipUnless(_HAS_ONNX, MSG_ONNX_NOT_FOUND)
-class NodeTest(unittest.TestCase):
-    def test_create_node(self):  # type: () -> None
-        model = _onnx_create_single_node_model(
-            "Elu", [(1, 3, 224, 224)], [(1, 3, 224, 224)], alpha=0.5
-        )
-        graph = model.graph
-        node = graph.node[0]
-        node_ = Node.from_onnx(node)
-        self.assertTrue(len(node_.inputs) == 1)
-        self.assertTrue(len(node_.outputs) == 1)
-        self.assertTrue(len(node_.attrs) == 1)
-        self.assertTrue(node_.attrs["alpha"] == 0.5)
-
-
-@unittest.skipUnless(_HAS_ONNX, MSG_ONNX_NOT_FOUND)
-class GraphTest(unittest.TestCase):
-    def test_create_graph(self):  # type: () -> None
-        kernel_shape = (3, 2)
-        strides = (2, 3)
-        pads = (4, 2, 4, 2)
-        dilations = (1, 2)
-        group = 1
-        weight = numpy_helper.from_array(_random_array((16, 3, 3, 2)), name="weight")
-
-        input_shape = (1, 3, 224, 224)
-        output_size = _conv_pool_output_size(
-            input_shape, dilations, kernel_shape, pads, strides
-        )
-
-        output_shape = (1, int(weight.dims[0]), output_size[0], output_size[1])
-
-        inputs = [("input0", input_shape)]
-        outputs = [("output0", output_shape, TensorProto.FLOAT)]
-
-        conv = helper.make_node(
-            "Conv",
-            inputs=[inputs[0][0], "weight"],
-            outputs=["conv_output"],
-            dilations=dilations,
-            group=group,
-            kernel_shape=kernel_shape,
-            pads=pads,
-            strides=strides,
-        )
-
-        relu = helper.make_node(
-            "Relu", inputs=[conv.output[0]], outputs=[outputs[0][0]]
-        )
-
-        model = _onnx_create_model([conv, relu], inputs, outputs, [weight])
-        graph_ = Graph.from_onnx(model.graph, onnx_ir_version=5)
-        self.assertTrue(len(graph_.inputs) == 1)
-        self.assertEqual(graph_.inputs[0][2], input_shape)
-        self.assertTrue(len(graph_.outputs) == 1)
-        self.assertEqual(graph_.outputs[0][2], output_shape)
-        self.assertTrue(len(graph_.nodes) == 2)
-        self.assertEqual(len(graph_.nodes[0].parents), 0)
-        self.assertEqual(len(graph_.nodes[1].parents), 1)
-        self.assertEqual(len(graph_.nodes[0].children), 1)
-        self.assertEqual(len(graph_.nodes[1].children), 0)
diff --git a/coremltools/converters/onnx/_tests/test_mlmodel_passes.py b/coremltools/converters/onnx/_tests/test_mlmodel_passes.py
deleted file mode 100644
index 9252061bc..000000000
--- a/coremltools/converters/onnx/_tests/test_mlmodel_passes.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import numpy as np
-import unittest
-import coremltools.models.datatypes as datatypes
-from coremltools.models import neural_network as neural_network
-from coremltools.converters.mil.backend.nn.passes.mlmodel_passes import (
-    remove_disconnected_layers,
-)
-
-
-class MLModelPassesTest(unittest.TestCase):
-    def test_load_constant_remove(self):
-        input_features = [("data", datatypes.Array(*(3, 4)))]
-        output_features = [("out", None)]
-        builder = neural_network.NeuralNetworkBuilder(
-            input_features, output_features, disable_rank5_shape_mapping=True
-        )
-        builder.add_activation("relu1", "RELU", "data", "relu1")
-        builder.add_load_constant_nd(
-            "const1", "c1", constant_value=np.ones((5,)), shape=(5,)
-        )
-        builder.add_activation("relu2", "RELU", "relu1", "out")
-        builder.add_load_constant_nd(
-            "const2", "c2", constant_value=np.ones((5,)), shape=(5,)
-        )
-        builder.add_load_constant_nd(
-            "const3", "c3", constant_value=np.ones((5,)), shape=(5,)
-        )
-        spec = builder.spec
-        np.testing.assert_equal(5, len(spec.neuralNetwork.layers))
-        remove_disconnected_layers(spec)
-        np.testing.assert_equal(2, len(spec.neuralNetwork.layers))
diff --git a/coremltools/converters/onnx/_tests/test_operators.py b/coremltools/converters/onnx/_tests/test_operators.py
deleted file mode 100644
index 79db4045b..000000000
--- a/coremltools/converters/onnx/_tests/test_operators.py
+++ /dev/null
@@ -1,419 +0,0 @@
-# Copyright (c) 2021, Apple Inc. All rights reserved.
-#
-# Use of this source code is governed by a BSD-3-clause license that can be
-# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-import unittest
-import numpy as np
-from coremltools._deps import _HAS_ONNX, MSG_ONNX_NOT_FOUND
-
-if _HAS_ONNX:
-    import onnx
-    from onnx.numpy_helper import from_array
-    from coremltools.converters.onnx import convert
-    from ._test_utils import (
-        _onnx_create_single_node_model,
-        _test_single_node,
-        _random_array,
-        _conv_pool_output_size,
-        _assert_outputs,
-    )
-
-from coremltools.models.utils import _macos_version
-
-MIN_MACOS_VERSION_10_15 = (10, 15)
-
-ONNX_SHAPE_INFERENCE_FAILS = True
-
-
-@unittest.skipUnless(_HAS_ONNX, MSG_ONNX_NOT_FOUND)
-class SingleOperatorTest(unittest.TestCase):
-    def test_conv(self):  # type: () -> None
-        kernel_shape = (3, 2)
-        strides = (2, 3)
-        pads = (4, 2, 4, 2)
-        dilations = (1, 2)
-        group = 1
-        weight = from_array(_random_array((16, 3, 3, 2)), name="weight")
-
-        input_shape = (1, 3, 224, 224)
-        output_size = _conv_pool_output_size(
-            input_shape, dilations, kernel_shape, pads, strides
-        )
-
-        output_shape = (1, int(weight.dims[0]), output_size[0], output_size[1])
-
-        _test_single_node(
-            "Conv",
-            [input_shape],
-            [output_shape],
-            initializer=[weight],
-            dilations=dilations,
-            group=group,
-            kernel_shape=kernel_shape,
-            pads=pads,
-            strides=strides,
-        )
-
-    def test_conv_transpose(self):  # type: () -> None
-        kernel_shape = (3, 3)
-        pads = (0, 0, 0, 0)
-        C_in = 3
-        C_out = 12
-        H_in, W_in = 30, 30
-        strides = (2, 2)
-
-        input_shape = (1, C_in, H_in, W_in)
-        weight = from_array(
-            _random_array((C_in, C_out, kernel_shape[0], kernel_shape[1])),
-            name="weight",
-        )
-
-        H_out = (H_in - 1) * strides[0] + kernel_shape[0] - pads[0] - pads[2]
-        W_out = (W_in - 1) * strides[1] + kernel_shape[1] - pads[1] - pads[3]
-        output_shape = (1, C_out, H_out, W_out)
-
-        _test_single_node(
-            "ConvTranspose",
-            [input_shape],
-            [output_shape],
-            initializer=[weight],
-            # Default values for other attributes: dilations=[1, 1], group=1
-            strides=strides,
-            kernel_shape=kernel_shape,
-            pads=pads,
-            output_padding=(0, 0),
-        )
-
-    def test_conv_without_pads(self):  # type: () -> None
-        kernel_shape = (3, 2)
-        strides = (2, 3)
-        dilations = (1, 2)
-        group = 1
-        weight = from_array(_random_array((16, 3, 3, 2)), name="weight")
-
-        input_shape = (1, 3, 224, 224)
-        output_size = _conv_pool_output_size(
-            input_shape, dilations, kernel_shape, [0, 0, 0, 0], strides
-        )
-
-        output_shape = (1, int(weight.dims[0]), output_size[0], output_size[1])
-        _test_single_node(
-            "Conv",
-            [input_shape],
-            [output_shape],
-            initializer=[weight],
-            dilations=dilations,
-            group=group,
-            kernel_shape=kernel_shape,
-            strides=strides,
-        )
-
-    def test_max_pool(self):  # type: () -> None
-        kernel_shape = (5, 3)
-        pads = (2, 1, 2, 1)
-        strides = (1, 2)
-
-        input_shape = (1, 3, 224, 224)
-
-        output_size = _conv_pool_output_size(
-            input_shape, [1, 1], kernel_shape, pads, strides
-        )
-
-        output_shape = (1, 3, output_size[0], output_size[1])
-
-        _test_single_node(
-            "MaxPool",
-            [input_shape],
-            [output_shape],
-            test_name="test_max_pool_1",
-            kernel_shape=kernel_shape,
-            pads=pads,
-            strides=strides,
-        )
-
-        output_size = _conv_pool_output_size(
-            input_shape, [1, 1], kernel_shape, [0, 0, 0, 0], strides
-        )
-        output_shape = (1, 3, output_size[0], output_size[1])
-        _test_single_node(
-            "MaxPool",
-            [input_shape],
-            [output_shape],
-            test_name="test_max_pool_2",
-            kernel_shape=kernel_shape,
-            strides=strides,
-        )
-
-    @unittest.skip("Skip due to internal CoreML CPU backend issue")
-    def test_avg_pool(self):  # type: () -> None
-        kernel_shape = (5, 3)
-        pads = (2, 1, 2, 1)
-        strides = (1, 2)
-
-        input_shape = (1, 3, 224, 224)
-        output_size = _conv_pool_output_size(
-            input_shape, (1, 1), kernel_shape, pads, strides
-        )
-        output_shape = (1, 3, output_size[0], output_size[1])
-        _test_single_node(
-            "AveragePool",
-            [input_shape],
-            [output_shape],
-            test_name="test_avg_pool_1",
-            kernel_shape=kernel_shape,
-            pads=pads,
-            strides=strides,
-        )
-
-        output_size = _conv_pool_output_size(
-            input_shape, (1, 1), kernel_shape, [0, 0, 0, 0], strides
-        )
-        output_shape = (1, 3, output_size[0], output_size[1])
-        _test_single_node(
-            "AveragePool",
-            [input_shape],
-            [output_shape],
-            test_name="test_avg_pool_2",
-            kernel_shape=kernel_shape,
-            strides=strides,
-        )
-
-    def test_bn(self):  # type: () -> None
-        scale = from_array(_random_array((3,)), name="scale")
-        bias = from_array(_random_array((3,)), name="bias")
-        mean = from_array(_random_array((3,)), name="mean")
-        var = from_array(_random_array((3,)), name="var")
-
-        epsilon = 1e-5
-        momentum = 0.001
-
-        op_types = ["BatchNormalization", "SpatialBN"]
-        for op_type in op_types:
-            _test_single_node(
-                "BatchNormalization",
-                [(1, 3, 224, 224)],
-                [(1, 3, 224, 224)],
-                initializer=[scale, bias, mean, var],
-                epsilon=epsilon,
-                momentum=momentum,
-            )
-
-            # epsilon by default
-            _test_single_node(
-                "BatchNormalization",
-                [(1, 3, 224, 224)],
-                [(1, 3, 224, 224)],
-                initializer=[scale, bias, mean, var],
-                # epsilon=epsilon,
-                momentum=momentum,
-            )
-
-    def test_gemm(self, minimum_ios_deployment_target="12"):  # type: () -> None
-        input_shape = (1, 2048)
-        output_shape = (1, 5)
-        W = from_array(_random_array((output_shape[1], input_shape[1])), name="weight")
-        b = from_array(_random_array((output_shape[1],)), name="bias")
-        _test_single_node(
-            "Gemm",
-            [input_shape],
-            [output_shape],
-            initializer=[W, b],
-            decimal=3,
-            transB=1,
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_gemm_ios13(self):
-        self.test_gemm(minimum_ios_deployment_target="13")
-
-    def test_gemm_transB_off(
-        self, minimum_ios_deployment_target="12"
-    ):  # type: () -> None
-        input_shape = (1, 2048)
-        output_shape = (1, 5)
-        W = from_array(_random_array((input_shape[1], output_shape[1])), name="weight")
-        b = from_array(_random_array((output_shape[1],)), name="bias")
-        _test_single_node(
-            "Gemm",
-            [input_shape],
-            [output_shape],
-            initializer=[W, b],
-            decimal=3,
-            transB=0,
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_gemm_transB_off_ios13(self):
-        self.test_gemm_transB_off(minimum_ios_deployment_target="13")
-
-    def test_lrn(self):  # type: () -> None
-        _test_single_node(
-            "LRN",
-            [(1, 3, 224, 224)],
-            [(1, 3, 224, 224)],
-            alpha=9.99e-5,
-            beta=0.75,
-            bias=1.0,
-            size=5,
-        )
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    @unittest.skip("Failing: wrong input type")
-    def test_split_axis_0_rank_3(
-        self, minimum_ios_deployment_target="12"
-    ):  # type: () -> None
-        _test_single_node(
-            "Split",
-            [(2, 1, 200)],
-            [(1, 1, 200), (1, 1, 200)],
-            axes=0,
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_concat(self, minimum_ios_deployment_target="13"):  # type: () -> None
-        _test_single_node(
-            "Concat",
-            [(1, 2, 200), (1, 2, 200)],
-            [(2, 2, 200)],
-            axis=0,
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    @unittest.skip("Failing: wrong input type")
-    def test_gather(self, minimum_ios_deployment_target="13"):  # type: () -> None
-        _test_single_node(
-            "Gather",
-            [(5, 4, 3), (3,)],
-            [(3, 4, 3)],
-            axis=0,
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    @unittest.skip("Failing: wrong input type")
-    def test_reshape_same_rank(
-        self, minimum_ios_deployment_target="13"
-    ):  # type: () -> None
-        _test_single_node(
-            "Reshape",
-            [(5, 4, 3), (3,)],
-            [(4, 5, 3)],
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    @unittest.skip("Failing: wrong input type")
-    def test_reshape_same_rank_infer_shape(
-        self, minimum_ios_deployment_target="13"
-    ):  # type: () -> None
-        _test_single_node(
-            "Reshape",
-            [(5, 4, 3), (3,)],
-            [(5, 2, 6)],
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
-
-    # TODO: add test_reshape_diff_rank_infer_shape where shape is Constant and known
-    # to test rank-4 into rank-3 reshape with shape inferencing
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    @unittest.skip("Failing: wrong input type")
-    def test_reshape_dynamic(
-        self, minimum_ios_deployment_target="13"
-    ):  # type: () -> None
-        _test_single_node(
-            "Reshape",
-            [(5, 4, 3, 2), (3,)],
-            [(2, 3, 20)],
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_squeeze(self, minimum_ios_deployment_target="13"):  # type: () -> None
-        _test_single_node(
-            "Squeeze",
-            [(5, 1, 3, 1, 1)],
-            [(5, 3)],
-            axes=[1, 3, 4],
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_transpose_default(
-        self, minimum_ios_deployment_target="13"
-    ):  # type: () -> None
-        _test_single_node(
-            "Transpose",
-            [(5, 3, 4, 6, 2)],
-            [(2, 6, 4, 3, 5)],
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
-
-    @unittest.skipIf(
-        ONNX_SHAPE_INFERENCE_FAILS,
-        "ONNX Shape inference fails to recongnize correct shape",
-    )
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_transpose_permute(
-        self, minimum_ios_deployment_target="13"
-    ):  # type: () -> None
-        _test_single_node(
-            "Transpose",
-            [(5, 3, 4, 6, 2)],
-            [(2, 3, 4, 6, 5)],
-            axes=[4, 1, 2, 3, 0],
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
-
-    @unittest.skipIf(
-        ONNX_SHAPE_INFERENCE_FAILS,
-        "ONNX Shape inference fails to recongnize correct shape",
-    )
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_unsqueeze(self, minimum_ios_deployment_target="13"):  # type: () -> None
-        _test_single_node(
-            "Unsqueeze",
-            [(5, 3, 4)],
-            [(1, 5, 1, 3, 4)],
-            axes=[0, 1],
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
diff --git a/coremltools/converters/onnx/_tests/test_pytorch_model.py b/coremltools/converters/onnx/_tests/test_pytorch_model.py
deleted file mode 100644
index 0501f53f3..000000000
--- a/coremltools/converters/onnx/_tests/test_pytorch_model.py
+++ /dev/null
@@ -1,1007 +0,0 @@
-
-import unittest
-from coremltools._deps import _HAS_ONNX, MSG_ONNX_NOT_FOUND, _IS_MACOS
-
-if _HAS_ONNX:
-    import onnx
-    from coremltools.converters.onnx import convert
-    from coremltools.converters.onnx._converter import SupportedVersion
-    from ._test_utils import _assert_outputs
-
-import numpy as np
-import torch  # type: ignore
-import torch.nn as nn  # type: ignore
-import torch.nn.functional as F
-import shutil
-import tempfile
-import os
-import pytest
-
-from coremltools.models.utils import _macos_version
-
-np.random.seed(10)
-torch.manual_seed(10)
-
-MIN_MACOS_VERSION_10_15 = (10, 15)
-
-DEBUG = False
-
-
-def _test_torch_model_single_io(
-    torch_model,
-    torch_input_shape,
-    coreml_input_shape,
-    minimum_ios_deployment_target="12",
-    decimal=4,
-    opset_version=9,
-):
-    # run torch model
-    torch_input = torch.rand(*torch_input_shape)
-    torch_out_raw = torch_model(torch_input)
-    if isinstance(torch_out_raw, tuple):
-        torch_out = torch_out_raw[0].detach().numpy()
-    else:
-        torch_out = torch_out_raw.detach().numpy()
-
-    # convert to onnx model
-    model_dir = tempfile.mkdtemp()
-    if DEBUG:
-        model_dir = "/tmp"
-    onnx_file = os.path.join(model_dir, "torch_model.onnx")
-    torch.onnx.export(torch_model, torch_input, onnx_file, opset_version=opset_version)
-    onnx_model = onnx.load(onnx_file)
-
-    # convert to coreml and run
-    coreml_model = convert(
-        onnx_model, minimum_ios_deployment_target=minimum_ios_deployment_target
-    )
-
-    output_name = [o.name for o in onnx_model.graph.output][0]
-    initializer_names = {t.name for t in onnx_model.graph.initializer}
-    input_name = [
-        i.name for i in onnx_model.graph.input if i.name not in initializer_names
-    ][0]
-    input_numpy = torch_input.detach().numpy()
-    if SupportedVersion.is_nd_array_supported(minimum_ios_deployment_target):
-        input_dict = {input_name: input_numpy}  # type: ignore
-    else:
-        input_dict = {input_name: np.reshape(input_numpy, coreml_input_shape)}  # type: ignore
-    if _IS_MACOS:
-        coreml_out = coreml_model.predict(input_dict, useCPUOnly=True)[output_name]
-        if DEBUG:
-            coreml_model.save(model_dir + "/torch_model.mlmodel")
-            print("coreml_out")
-            print(np.squeeze(coreml_out))
-            print("torch_out")
-            print(np.squeeze(torch_out))
-            print("coreml out shape ", coreml_out.shape)
-            print("torch out shape: ", torch_out.shape)
-
-        # compare
-        _assert_outputs([torch_out], [coreml_out], decimal=decimal)  # type: ignore
-
-        # delete onnx model
-        if not DEBUG:
-            if os.path.exists(model_dir):
-                shutil.rmtree(model_dir)
-
-
-@unittest.skipUnless(_HAS_ONNX, MSG_ONNX_NOT_FOUND)
-class OnnxModelTest(unittest.TestCase):
-    def test_functional_average_pool(self, minimum_ios_deployment_target="12"):
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-
-            def forward(self, x):
-                y = F.avg_pool2d(x, [15, 18], [15, 18])
-                return y
-
-        torch_model = Net()
-        torch_model.train(False)
-        if minimum_ios_deployment_target == "12":
-            coreml_shape = (1, 64, 64)
-        else:
-            coreml_shape = (1, 1, 64, 64)
-        _test_torch_model_single_io(
-            torch_model,
-            (1, 1, 64, 64),
-            coreml_shape,
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_functional_average_pool_disable_rank5_mapping(self):
-        self.test_functional_average_pool(minimum_ios_deployment_target="13")
-
-    def test_linear_no_bias(
-        self, minimum_ios_deployment_target="12"
-    ):  # type: () -> None
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-                self.simple_nn = nn.Sequential(
-                    nn.Linear(256, 128, bias=False), nn.ReLU()
-                )
-
-            def forward(self, x):
-                return self.simple_nn(x)
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (1, 256), (256), minimum_ios_deployment_target=minimum_ios_deployment_target)  # type: ignore
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_linear_no_bias_disable_rank5_mapping(self):
-        self.test_linear_no_bias(minimum_ios_deployment_target="13")
-
-    def test_linear_bias(self):  # type: () -> None
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-                self.simple_nn = nn.Sequential(
-                    nn.Linear(256, 128, bias=True), nn.ReLU()
-                )
-
-            def forward(self, x):
-                return self.simple_nn(x)
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (1, 256), (256))  # type: ignore
-
-    def test_dynamic_reshape(self):  # type: () -> None
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-                self.conv = nn.Conv2d(
-                    in_channels=3,
-                    out_channels=32,
-                    kernel_size=(3, 3),
-                    stride=1,
-                    padding=0,
-                    bias=True,
-                )
-
-            def forward(self, x):
-                x = self.conv(x)
-                x = x.view(x.size()[0], -1)
-                return x
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (1, 3, 100, 100), (3, 100, 100), "13")  # type: ignore
-
-    def test_const_initializer1(self):  # type: () -> None
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-                self.ones = torch.nn.Parameter(torch.ones(1,))
-
-            def forward(self, x):
-                y = x + self.ones
-                return y
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (1, 3), (3,))  # type: ignore
-
-    def test_const_initializer2(self):  # type: () -> None
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-
-            def forward(self, x):
-                y = x + torch.nn.Parameter(torch.ones(2, 3), requires_grad=False)
-                return y
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (1, 2, 3), (1, 2, 3))  # type: ignore
-
-    def test_conv2D_transpose(self):  # type: () -> None
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-                self.convT = torch.nn.ConvTranspose2d(
-                    1, 1, kernel_size=3, stride=2, output_padding=0, padding=3, groups=1
-                )
-
-            def forward(self, x):
-                y = self.convT(x)
-                return y
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (1, 1, 64, 64), (1, 64, 64))  # type: ignore
-
-    def test_conv2D_transpose_output_padding(self):  # type: () -> None
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-                self.convT = torch.nn.ConvTranspose2d(
-                    1, 1, kernel_size=3, stride=2, output_padding=1, padding=3, groups=1
-                )
-
-            def forward(self, x):
-                y = self.convT(x)
-                return y
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (1, 1, 64, 64), (1, 64, 64))  # type: ignore
-
-    def test_conv2D_transpose_groups(self):  # type: () -> None
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-                self.convT = torch.nn.ConvTranspose2d(
-                    4, 4, kernel_size=3, stride=2, output_padding=1, padding=1, groups=2
-                )
-
-            def forward(self, x):
-                y = self.convT(x)
-                return y
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (1, 4, 8, 8), (4, 8, 8))  # type: ignore
-
-    def test_conv2D_transpose_2(self):  # type: () -> None
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-                self.convT = torch.nn.ConvTranspose2d(
-                    1, 1, kernel_size=3, stride=3, output_padding=2, padding=1, groups=1
-                )
-
-            def forward(self, x):
-                y = self.convT(x)
-                return y
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (1, 1, 3, 3), (1, 3, 3))  # type: ignore
-
-    def test_pow(self):  # type: () -> None
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-
-            def forward(self, x):
-                y = x.pow(3)
-                return y
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (3, 2, 3), (3, 2, 3))  # type: ignore
-
-    @pytest.mark.skip(reason="")
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_lstm(self):  # type: () -> None
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-                self.lstm = nn.LSTM(input_size=256, hidden_size=64, num_layers=1)
-
-            def forward(self, x):
-                y = self.lstm(x)
-                return y
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (3, 1, 256), (3, 1, 256), minimum_ios_deployment_target="13")  # type: ignore
-
-    @pytest.mark.skip(reason="")
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_bidirlstm(self):  # type: () -> None
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-                self.lstm = nn.LSTM(
-                    input_size=256, hidden_size=64, num_layers=1, bidirectional=True
-                )
-
-            def forward(self, x):
-                y = self.lstm(x)
-                return y
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (3, 1, 256), (3, 1, 256), minimum_ios_deployment_target="13")  # type: ignore
-
-    @pytest.mark.skip(reason="")
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_gru(self):  # type: () -> None
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-                self.gru = nn.GRU(input_size=256, hidden_size=64, num_layers=1)
-
-            def forward(self, x):
-                y = self.gru(x)
-                return y
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (3, 1, 256), (3, 1, 256), minimum_ios_deployment_target="13", decimal=1)  # type: ignore
-
-    def test_1d_conv(self):
-        class Net(nn.Module):
-            def __init__(
-                self,
-                in_channels,
-                out_channels,
-                kernel_size,
-                stride=1,
-                dilation=1,
-                groups=1,
-                bias=True,
-            ):
-                super(Net, self).__init__()
-
-                self.conv = torch.nn.Conv1d(
-                    in_channels,
-                    out_channels,
-                    kernel_size=kernel_size,
-                    stride=stride,
-                    padding=0,
-                    dilation=dilation,
-                    groups=groups,
-                    bias=bias,
-                )
-
-                self.__padding = (kernel_size - 1) * dilation
-
-            def forward(self, x):
-                result = self.conv(x)
-                if self.__padding != 0:
-                    return result[:, :, : -self.__padding]
-                return result
-
-        B = 1
-        Cin = 5
-        Cout = 11
-        k = 3
-        Win = 15
-        torch_model = Net(Cin, Cout, k)  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (1, Cin, Win), (Cin, 1, Win))  # type: ignore
-
-    def test_conv1d_after_reshape(self):  # type: () -> None
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-                self.conv = torch.nn.Conv1d(
-                    in_channels=300,
-                    out_channels=32,
-                    kernel_size=3,
-                    stride=1,
-                    padding=0,
-                    bias=True,
-                )
-
-            def forward(self, x):
-                x = x.view(1, 300, 100)
-                x = self.conv(x)
-                return x
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (1, 3, 100, 100), (3, 100, 100))  # type: ignore
-
-    def test_conv2d_stride(self):
-        class TestModule(torch.nn.Module):
-            def __init__(self):
-                in_channels = 1
-                out_channels = 1
-                bsz = 1  # batch size
-                super(TestModule, self).__init__()
-                self.conv1 = torch.nn.Conv2d(
-                    in_channels, out_channels, kernel_size=(3, 4), stride=1
-                )
-                self.conv2 = torch.nn.Conv2d(
-                    in_channels,
-                    out_channels,
-                    kernel_size=(3, 5),
-                    stride=(2, 1),
-                    padding=(1, 2),
-                )
-
-            def forward(self, x):
-                return (self.conv2(x),)  # self.conv2(x)
-
-        torch_model = TestModule()  # type: ignore
-        torch_model.train(False)
-        H, W = 6, 3
-        _test_torch_model_single_io(torch_model, (1, 1, H, W), (1, H, W))  # type: ignore
-
-    def test_conv2d_dilation(self):
-        class TestModule(torch.nn.Module):
-            def __init__(self):
-                in_channels = 1
-                out_channels = 3
-                bsz = 1  # batch size
-                super(TestModule, self).__init__()
-                self.conv1 = torch.nn.Conv2d(
-                    in_channels, out_channels, kernel_size=(3, 4), stride=2, dilation=2
-                )
-
-            def forward(self, x):
-                return self.conv1(x)
-
-        torch_model = TestModule()  # type: ignore
-        torch_model.train(False)
-        H, W = 64, 64
-        _test_torch_model_single_io(torch_model, (1, 1, H, W), (1, H, W))  # type: ignore
-
-    def test_bachnorm_after_reshape(self):  # type: () -> None
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-                self.conv = torch.nn.Conv1d(
-                    in_channels=300,
-                    out_channels=32,
-                    kernel_size=3,
-                    stride=1,
-                    padding=0,
-                    bias=True,
-                )
-                self.bachnorm = nn.BatchNorm1d(32)
-
-            def forward(self, x):
-                x = x.view(1, 300, 100)
-                x = self.conv(x)
-                x = self.bachnorm(x)
-                return x
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (1, 3, 100, 100), (3, 100, 100))  # type: ignore
-
-    def test_res_connect_downsampling_after_reshape(self):  # type: () -> None
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-                self.conv = torch.nn.Conv1d(
-                    in_channels=300,
-                    out_channels=32,
-                    kernel_size=3,
-                    stride=1,
-                    padding=1,
-                    bias=True,
-                )
-                self.downsample = torch.nn.Conv1d(
-                    in_channels=300,
-                    out_channels=32,
-                    kernel_size=1,
-                    stride=1,
-                    padding=0,
-                    bias=True,
-                )
-
-            def forward(self, x):
-                x = x.view(1, 300, 100)
-                y = self.conv(x)
-                res = self.downsample(x)
-                return y + res
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (1, 3, 100, 100), (3, 100, 100))  # type: ignore
-
-    def test_fc_plus_convenet(self):  # type: () -> None
-        class Net(nn.Module):
-            def __init__(
-                self,
-                channel_size=1,
-                output_h=16,
-                output_w=16,
-                filter_num=32,
-                latent_size=16,
-            ):
-                super(Net, self).__init__()
-                self.channel_size = channel_size
-                self.output_h = output_h
-                self.output_w = output_w
-                self.filter_num = filter_num
-                self.latent_size = latent_size
-
-                self.fc3 = nn.Linear(latent_size, 128)
-                self.fc4 = nn.Linear(128, 256)
-
-                self.relu = nn.ReLU()
-
-                self.convt = nn.Sequential(
-                    nn.ConvTranspose2d(256, self.filter_num * 4, 4, 1),
-                    nn.BatchNorm2d(self.filter_num * 4),
-                    nn.ReLU(inplace=True),
-                    nn.ConvTranspose2d(self.filter_num * 4, self.filter_num * 2, 4, 1),
-                    nn.BatchNorm2d(self.filter_num * 2),
-                    nn.ReLU(inplace=True),
-                    nn.ConvTranspose2d(self.filter_num * 2, self.filter_num, 4, 1),
-                    nn.BatchNorm2d(self.filter_num),
-                    nn.ReLU(inplace=True),
-                    nn.ConvTranspose2d(self.filter_num, self.filter_num, 4, 1),
-                    nn.BatchNorm2d(self.filter_num),
-                    nn.ReLU(inplace=True),
-                    nn.ConvTranspose2d(self.filter_num, 1, 4, 1),
-                    nn.Sigmoid(),
-                )
-
-            def forward(self, z):
-                x = self.relu(self.fc3(z))
-                deconv_input = self.fc4(x)
-                deconv_input = deconv_input.view(-1, 256, 1, 1)
-                x = self.convt(deconv_input)
-                return x
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (1, 16), (1, 1, 16))  # type: ignore
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_conv1d_pool1d(self, minimum_ios_deployment_target="13"):
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-                self.conv1 = nn.Conv1d(
-                    in_channels=4, out_channels=32, kernel_size=3, stride=1, padding=1
-                )
-                self.conv2 = nn.Conv1d(
-                    in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1
-                )
-
-            def forward(self, x):
-                x = x.permute(0, 2, 1)
-                x = self.conv1(x)
-                x = F.relu(x)
-                x = F.max_pool1d(x, 2)
-                x = self.conv2(x)
-                x = F.relu(x)
-                return x
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(
-            torch_model,
-            (2, 10, 4),
-            (2, 10, 4),
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_slice(self, minimum_ios_deployment_target="13"):
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-
-            def forward(self, x):
-                x = x[:, :5] + x[:, 5:]
-                return x
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-
-        # opset <= 9
-        _test_torch_model_single_io(
-            torch_model,
-            (10, 10),
-            (10, 10),
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
-        # opset > 9
-        _test_torch_model_single_io(
-            torch_model,
-            (10, 10),
-            (10, 10),
-            opset_version=10,
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
-
-
-@unittest.skipUnless(_HAS_ONNX, MSG_ONNX_NOT_FOUND)
-class ReshapeTransposeTests(unittest.TestCase):
-    """
-    tests for models that have patterns like:
-    rank(4) ---> reshape (rank 6) ----> transpose (rank 6) ----> reshape(4)
-    """
-
-    @pytest.mark.xfail
-    # is pytorch to onnx conversion correct?
-    def test_pixel_shuffle_not_working(self):
-        """
-        (1, c, h, w) --> reshape ---> (1, sh, sw, c/(sh*sw), h, w)
-        --> transpose [0,1,4,2,5,3] ---> (1, sh, h, sw, w, c/(sh*sw))
-        --> reshape ---> (1, c/(s1*s2), sh*h, sw*w)
-        """
-
-        class Net(nn.Module):
-            def __init__(self, upscale_factor=3):
-                super(Net, self).__init__()
-                self.upscale_factor = upscale_factor
-                self.ps = nn.PixelShuffle(self.upscale_factor)
-
-            def forward(self, x):
-                return self.ps(x)
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (1, 18, 4, 5), (18, 4, 5))  # type: ignore
-
-    def test_pixel_shuffle_working(self):
-        """
-        (1, c, h, w) --> reshape ---> (1, c/(sh*sw), sh, sw, h, w)
-        --> transpose [0,1,4,2,5,3] ---> (1, sh, h, sw, w, c/(sh*sw))
-        --> reshape ---> (1, c/(sh*sw), sh*h, sw*w)
-        """
-
-        class Net(nn.Module):
-            def __init__(self, C=12, H=4, W=6, sh=3, sw=2):
-                super(Net, self).__init__()
-                self.C = C
-                self.H = H
-                self.W = W
-                self.sh = sh
-                self.sw = sw
-
-            def forward(self, x):
-                y1 = x.view(
-                    1, self.C // (self.sh * self.sw), self.sh, self.sw, self.H, self.W
-                ).contiguous()
-                y2 = y1.permute(0, 1, 4, 2, 5, 3).contiguous()
-                y3 = y2.view(
-                    1, self.C // (self.sh * self.sw), self.sh * self.H, self.sw * self.W
-                ).contiguous()
-                return y3
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (1, 12, 4, 6), (12, 4, 6))  # type: ignore
-
-    def test_reorganize_1(self):
-        """
-        (1, c, h, w) --> reshape ---> (1, c/(sh*sw), h, sh, w, sw)
-        --> transpose [0,3,5,1,2,4] ---> (1, sh, sw, c/(sh*sw), h, w)
-        --> reshape ---> (1, c*sh*sw, h/sh, w/sw)
-        """
-
-        class Net(nn.Module):
-            def __init__(self, C=12, H=4, W=6, sh=2, sw=3):
-                super(Net, self).__init__()
-                self.C = C
-                self.H = H
-                self.W = W
-                self.sh = sh
-                self.sw = sw
-
-            def forward(self, x):
-                y1 = x.view(
-                    1, self.C // (self.sh * self.sw), self.H, self.sh, self.W, self.sw
-                ).contiguous()
-                y2 = y1.permute(0, 3, 5, 1, 2, 4).contiguous()
-                y3 = y2.view(
-                    1,
-                    self.C * (self.sh * self.sw),
-                    self.H // self.sh,
-                    self.W // self.sw,
-                ).contiguous()
-                return y3
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (1, 12, 4, 6), (12, 4, 6))  # type: ignore
-
-    def test_reorganize_2(self):
-        """
-        (1, c, h, w) --> reshape ---> (1, c, h/sh, sh, w/sw, sw)
-        --> transpose [0,1,2,4,3,5] ---> (1, c, h/sh, w/sw, sh, sw)
-        --> reshape ---> (1, c*sh*sw, h/sh, w/sw)
-        """
-
-        class Net(nn.Module):
-            def __init__(self, C=12, H=4, W=6, sh=2, sw=3):
-                super(Net, self).__init__()
-                self.C = C
-                self.H = H
-                self.W = W
-                self.sh = sh
-                self.sw = sw
-
-            def forward(self, x):
-                y1 = x.view(
-                    1, self.C, self.H // self.sh, self.sh, self.W // self.sw, self.sw
-                ).contiguous()
-                y2 = y1.transpose(4, 3).contiguous()
-                y3 = y2.view(
-                    1,
-                    self.C * (self.sh * self.sw),
-                    self.H // self.sh,
-                    self.W // self.sw,
-                ).contiguous()
-                return y3
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (1, 12, 4, 6), (12, 4, 6))  # type: ignore
-
-
-@unittest.skipUnless(_HAS_ONNX, MSG_ONNX_NOT_FOUND)
-class UnaryOperationTests(unittest.TestCase):
-    """
-    Unary Operation Test cases
-    """
-
-    ## Sqrt tests
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_sqrt_tensor(self, minimum_ios_deployment_target="13"):
-        class Net(nn.Module):
-            def forward(self, x):
-                return torch.sqrt(x)
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (18, 4, 5), (18, 4, 5), minimum_ios_deployment_target=minimum_ios_deployment_target)  # type: ignore
-
-
-@unittest.skipUnless(_HAS_ONNX, MSG_ONNX_NOT_FOUND)
-class OperatorTests(unittest.TestCase):
-    """
-    Operator test for Operator
-    """
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_repeat(self, minimum_ios_deployment_target="13"):
-        class Net(nn.Module):
-            def forward(self, x):
-                return x.repeat([2, 3, 1])
-
-        torch_model = Net()
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (18, 4, 5), (18, 4, 5), minimum_ios_deployment_target=minimum_ios_deployment_target)  # type: ignore
-
-
-@unittest.skipUnless(_HAS_ONNX, MSG_ONNX_NOT_FOUND)
-class BinaryOperationTests(unittest.TestCase):
-    """
-    Binary Operation Test cases
-    """
-
-    ## Addition tests
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_add_same_shape(self, minimum_ios_deployment_target="13"):
-        class Net(nn.Module):
-            def forward(self, x):
-                return torch.add(x, y)
-
-        y = torch.rand((18, 4, 5))
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (18, 4, 5), (18, 4, 5), minimum_ios_deployment_target=minimum_ios_deployment_target)  # type: ignore
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_add_same_shape_multiple(self, minimum_ios_deployment_target="13"):
-        class Net(nn.Module):
-            def forward(self, x):
-                return x + y + y1 + y2 + y3
-
-        y = torch.rand((18, 4, 5))
-        y1 = torch.rand((4, 5))
-        y2 = torch.rand((18, 4, 5))
-        y3 = 7.234
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (18, 4, 5), (18, 4, 5), minimum_ios_deployment_target=minimum_ios_deployment_target)  # type: ignore
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_add_tensor_scalar(self, minimum_ios_deployment_target="13"):
-        class Net(nn.Module):
-            def forward(self, x):
-                return torch.add(x, y)
-
-        y = 5
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (18, 4, 5), (18, 4, 5), minimum_ios_deployment_target=minimum_ios_deployment_target)  # type: ignore
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_add_diff_shape(self, minimum_ios_deployment_target="13"):
-        class Net(nn.Module):
-            def forward(self, x):
-                return torch.add(x, y)
-
-        y = torch.rand((4, 5))
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (18, 4, 5), (18, 4, 5), minimum_ios_deployment_target=minimum_ios_deployment_target)  # type: ignore
-
-    ## Subtraction tests
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_sub_same_shape(self, minimum_ios_deployment_target="13"):
-        class Net(nn.Module):
-            def forward(self, x):
-                return torch.sub(x, y)
-
-        y = torch.rand((18, 4, 5))
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (18, 4, 5), (18, 4, 5), minimum_ios_deployment_target=minimum_ios_deployment_target)  # type: ignore
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_sub_same_shape_multiple(self, minimum_ios_deployment_target="13"):
-        class Net(nn.Module):
-            def forward(self, x):
-                return x - y - y1 - y2 - y3
-
-        y = torch.rand((18, 4, 5))
-        y1 = torch.rand((4, 5))
-        y2 = torch.rand((18, 4, 5))
-        y3 = 7.234
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (18, 4, 5), (18, 4, 5), minimum_ios_deployment_target=minimum_ios_deployment_target)  # type: ignore
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_sub_tensor_scalar(self, minimum_ios_deployment_target="13"):
-        class Net(nn.Module):
-            def forward(self, x):
-                return torch.sub(x, y)
-
-        y = 5
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (18, 4, 5), (18, 4, 5), minimum_ios_deployment_target=minimum_ios_deployment_target)  # type: ignore
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_sub_diff_shape(self, minimum_ios_deployment_target="13"):
-        class Net(nn.Module):
-            def forward(self, x):
-                return torch.sub(x, y)
-
-        y = torch.rand((4, 5))
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (18, 4, 5), (18, 4, 5), minimum_ios_deployment_target=minimum_ios_deployment_target)  # type: ignore
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_bianry_ops_mix_test(self, minimum_ios_deployment_target="13"):
-        class Net(nn.Module):
-            def forward(self, x):
-                return ((x * g + a) - d * (c + b) + (a * e - g) / e) / f
-
-        a = torch.rand((18, 4, 5))
-        b = torch.rand((4, 5))
-        c = torch.rand((18, 4, 5))
-        d = 7.234
-        e = torch.rand((5))
-        f = 8.234
-        g = 5
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(torch_model, (18, 4, 5), (18, 4, 5), minimum_ios_deployment_target=minimum_ios_deployment_target)  # type: ignore
-
-
-@unittest.skipUnless(_HAS_ONNX, MSG_ONNX_NOT_FOUND)
-class ReduceOperationTests(unittest.TestCase):
-    """
-    Reduction Operation Test cases
-    """
-
-    ## Reduction tests
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_reducesum(self, minimum_ios_deployment_target="13"):
-        class Net(nn.Module):
-            def forward(self, x):
-                return x.sum(dim=0)
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(
-            torch_model,
-            (18, 4, 5),
-            (4, 5),
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
-
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    def test_reducemean(self, minimum_ios_deployment_target="13"):
-        class Net(nn.Module):
-            def forward(self, x):
-                return x.mean(dim=1)
-
-        torch_model = Net()  # type: ignore
-        torch_model.train(False)
-        _test_torch_model_single_io(
-            torch_model,
-            (18, 4, 5),
-            (18, 5),
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
-
-
-@unittest.skipUnless(_HAS_ONNX, MSG_ONNX_NOT_FOUND)
-class TransformationTests(unittest.TestCase):
-    """
-    Test cases for validating transformations
-    """
-
-    # Upsample Test case
-    # Upsample with scalar factor is splited in Floor -> Cast -> Div -> Concat
-    # Hence, is a good measure to test Costant Propagation and removal transformation
-    @unittest.skipIf(
-        _macos_version() < MIN_MACOS_VERSION_10_15,
-        "macOS 10.15+ required. Skipping test.",
-    )
-    @pytest.mark.skip(reason="")
-    def test_cast_removal_transformation(self, minimum_ios_deployment_target="13"):
-        torch_model = nn.Upsample(scale_factor=2)
-        torch_model.train(False)
-        _test_torch_model_single_io(
-            torch_model,
-            (1, 18, 4, 5),
-            (1, 18, 8, 10),
-            minimum_ios_deployment_target=minimum_ios_deployment_target,
-        )
diff --git a/coremltools/converters/onnx/_tests/test_transformers.py b/coremltools/converters/onnx/_tests/test_transformers.py
deleted file mode 100644
index 698057488..000000000
--- a/coremltools/converters/onnx/_tests/test_transformers.py
+++ /dev/null
@@ -1,321 +0,0 @@
-
-import pytest
-
-onnx = pytest.importorskip("onnx")
-
-import unittest
-import numpy as np
-import numpy.testing as npt  # type: ignore
-
-from coremltools._deps import _HAS_ONNX, MSG_ONNX_NOT_FOUND
-
-if _HAS_ONNX:
-    import onnx
-    from onnx import helper, numpy_helper, TensorProto
-
-    from coremltools.converters.onnx import convert
-    from coremltools.converters.onnx._graph import Graph
-    from coremltools.converters.onnx._transformers import (
-        CastOpRemover,
-        ConvAddFuser,
-        DropoutRemover,
-        ImageScalerRemover,
-    )
-    from ._test_utils import (
-        _onnx_create_model,
-        _test_onnx_model,
-        _conv_pool_output_size,
-        _random_array,
-    )
-
-
-@unittest.skipUnless(_HAS_ONNX, MSG_ONNX_NOT_FOUND)
-class ConvAddFuserTest(unittest.TestCase):
-    def test_fuse_conv_without_bias(self):  # type: () -> None
-        kernel_shape = (3, 2)
-        strides = (2, 3)
-        pads = (4, 2, 4, 2)
-        dilations = (1, 2)
-        group = 1
-        weight = numpy_helper.from_array(_random_array((16, 3, 3, 2)), name="weight")
-
-        input_shape = (1, 3, 224, 224)
-        output_size = _conv_pool_output_size(
-            input_shape, dilations, kernel_shape, pads, strides
-        )
-
-        output_shape = (1, int(weight.dims[0]), output_size[0], output_size[1])
-
-        inputs = [("input0", input_shape)]
-        outputs = [("output0", output_shape, TensorProto.FLOAT)]
-
-        conv = helper.make_node(
-            "Conv",
-            inputs=[inputs[0][0], "weight"],
-            outputs=["conv_output"],
-            dilations=dilations,
-            group=group,
-            kernel_shape=kernel_shape,
-            pads=pads,
-            strides=strides,
-        )
-
-        b = _random_array((int(weight.dims[0]),))
-        bias = numpy_helper.from_array(b, name="bias")
-
-        add = helper.make_node(
-            "Add",
-            inputs=[conv.output[0], "bias"],
-            outputs=[outputs[0][0]],
-            broadcast=1,
-            axis=1,
-        )
-
-        model = _onnx_create_model([conv, add], inputs, outputs, [weight, bias])
-        graph_ = Graph.from_onnx(model.graph, onnx_ir_version=5)
-        fused_graph = graph_.transformed([ConvAddFuser()])
-
-        self.assertEqual(len(fused_graph.nodes), 1)
-        node = fused_graph.nodes[0]
-        self.assertEqual(len(node.inputs), 3)
-        npt.assert_equal(node.input_tensors[node.inputs[2]], b)
-        self.assertEqual(fused_graph.nodes[0].outputs[0], outputs[0][0])
-
-    def test_fuse_conv_with_bias(self):  # type: () -> None
-        kernel_shape = (3, 2)
-        strides = (2, 3)
-        pads = (4, 2, 4, 2)
-        dilations = (1, 2)
-        group = 1
-        weight = numpy_helper.from_array(_random_array((16, 3, 3, 2)), name="weight")
-        b = _random_array((int(weight.dims[0]),))
-        bias = numpy_helper.from_array(b, name="bias")
-
-        input_shape = (1, 3, 224, 224)
-        output_size = _conv_pool_output_size(
-            input_shape, dilations, kernel_shape, pads, strides
-        )
-
-        output_shape = (1, int(weight.dims[0]), output_size[0], output_size[1])
-
-        inputs = [("input0", input_shape)]
-        outputs = [("output0", output_shape, TensorProto.FLOAT)]
-
-        conv = helper.make_node(
-            "Conv",
-            inputs=[inputs[0][0], "weight", "bias"],
-            outputs=["conv_output"],
-            dilations=dilations,
-            group=group,
-            kernel_shape=kernel_shape,
-            pads=pads,
-            strides=strides,
-        )
-
-        add = helper.make_node(
-            "Add",
-            inputs=[conv.output[0], "bias"],
-            outputs=[outputs[0][0]],
-            broadcast=1,
-            axis=1,
-        )
-
-        model = _onnx_create_model([conv, add], inputs, outputs, [weight, bias])
-        graph_ = Graph.from_onnx(model.graph, onnx_ir_version=5)
-        fused_graph = graph_.transformed([ConvAddFuser()])
-
-        self.assertEqual(len(fused_graph.nodes), 1)
-        node = fused_graph.nodes[0]
-        self.assertEqual(len(node.inputs), 3)
-        npt.assert_equal(node.input_tensors[node.inputs[2]], b * 2)
-        self.assertEqual(fused_graph.nodes[0].outputs[0], outputs[0][0])
-
-
-@unittest.skipUnless(_HAS_ONNX, MSG_ONNX_NOT_FOUND)
-class NodeRemoverTests(unittest.TestCase):
-    def test_dropout_remover(self):  # type: () -> None
-        inputs = [("input", (1, 3, 50, 50))]
-        outputs = [("out", (1, 5, 50, 50), TensorProto.FLOAT)]
-        weight = numpy_helper.from_array(_random_array((5, 3, 1, 1)), name="weight")
-        conv = helper.make_node(
-            "Conv",
-            inputs=["input", "weight"],
-            outputs=["conv_output"],
-            kernel_shape=(1, 1),
-            strides=(1, 1),
-        )
-        drop = helper.make_node(
-            "Dropout", inputs=["conv_output"], outputs=["drop_output"],
-        )
-        exp = helper.make_node("Exp", inputs=["drop_output"], outputs=["out"])
-
-        onnx_model = _onnx_create_model([conv, drop, exp], inputs, outputs)
-
-        graph = Graph.from_onnx(onnx_model.graph, onnx_ir_version=5)
-        new_graph = graph.transformed([DropoutRemover()])
-        self.assertEqual(len(graph.nodes), 3)
-        self.assertEqual(len(new_graph.nodes), 2)
-        self.assertEqual(new_graph.nodes[0].inputs[0], "input")
-        self.assertEqual(new_graph.nodes[1].inputs[0], new_graph.nodes[0].outputs[0])
-        self.assertEqual(new_graph.nodes[1].outputs[0], "out")
-
-    def test_image_scaler_remover(self):  # type: () -> None
-        inputs = [("input", (1, 3, 50, 50))]
-        outputs = [("out", (1, 3, 50, 50), TensorProto.FLOAT)]
-
-        im_scaler = helper.make_node(
-            "ImageScaler",
-            inputs=["input"],
-            outputs=["scaler_out"],
-            bias=[10, -6, 20],
-            scale=3.0,
-        )
-
-        exp = helper.make_node("Exp", inputs=["scaler_out"], outputs=["out"])
-
-        onnx_model = _onnx_create_model([im_scaler, exp], inputs, outputs)
-
-        graph = Graph.from_onnx(onnx_model.graph, onnx_ir_version=5)
-        new_graph = graph.transformed([ImageScalerRemover()])
-        self.assertEqual(len(graph.nodes), 2)
-        self.assertEqual(len(new_graph.nodes), 1)
-        self.assertEqual(new_graph.nodes[0].inputs[0], "input")
-        self.assertEqual(new_graph.nodes[0].outputs[0], "out")
-
-        coreml_model = convert(onnx_model)
-        spec = coreml_model.get_spec()
-
-        self.assertEqual(spec.neuralNetwork.preprocessing[0].scaler.channelScale, 3.0)
-        self.assertEqual(spec.neuralNetwork.preprocessing[0].scaler.blueBias, 20.0)
-        self.assertEqual(spec.neuralNetwork.preprocessing[0].scaler.greenBias, -6.0)
-        self.assertEqual(spec.neuralNetwork.preprocessing[0].scaler.redBias, 10.0)
-
-    def test_multiple_image_scaler(self):  # type : () -> None
-        inputs = [("input_color", (1, 3, 10, 10)), ("input_gray", (1, 1, 10, 10))]
-        outputs = [("out", (1, 4, 10, 10), TensorProto.FLOAT)]
-
-        im_scaler1 = helper.make_node(
-            "ImageScaler",
-            inputs=["input_color"],
-            outputs=["scaler_out_1"],
-            bias=[10, -6, 20],
-            scale=3.0,
-        )
-
-        im_scaler2 = helper.make_node(
-            "ImageScaler",
-            inputs=["input_gray"],
-            outputs=["scaler_out_2"],
-            bias=[-13],
-            scale=5.0,
-        )
-
-        concat = helper.make_node(
-            "Concat", inputs=["scaler_out_1", "scaler_out_2"], outputs=["out"], axis=1
-        )
-
-        onnx_model = _onnx_create_model(
-            [im_scaler1, im_scaler2, concat], inputs, outputs
-        )
-
-        spec = convert(onnx_model).get_spec()
-        self.assertEqual(len(spec.neuralNetwork.layers), 1)
-        self.assertEqual(len(spec.neuralNetwork.preprocessing), 2)
-        self.assertEqual(spec.neuralNetwork.preprocessing[0].scaler.channelScale, 3.0)
-        self.assertEqual(spec.neuralNetwork.preprocessing[0].scaler.blueBias, 20.0)
-        self.assertEqual(spec.neuralNetwork.preprocessing[0].scaler.greenBias, -6.0)
-        self.assertEqual(spec.neuralNetwork.preprocessing[0].scaler.redBias, 10.0)
-        self.assertEqual(spec.neuralNetwork.preprocessing[1].scaler.channelScale, 5.0)
-        self.assertEqual(spec.neuralNetwork.preprocessing[1].scaler.grayBias, -13.0)
-
-    def test_cast_op_remover(self):
-        inputs = [("input", (1, 16, 224, 224))]
-        outputs = [("output", (1, 1, 224, 224), TensorProto.FLOAT)]
-
-        conv = helper.make_node(
-            "Conv",
-            name="Conv_1",
-            inputs=["input", "weight"],
-            outputs=["conv_output"],
-            kernel_shape=(3, 3),
-            strides=(1, 1),
-        )
-        cast_1 = helper.make_node(
-            "Cast",
-            name="Cast_1",
-            inputs=["conv_output"],
-            outputs=["cast_1_output"],
-            to=1
-        )
-        sigmoid = helper.make_node(
-            "Sigmoid",
-            name="Sigmoid_1",
-            inputs=["cast_1_output"],
-            outputs=["sigmoid_output"],
-        )
-        cast_2 = helper.make_node(
-            "Cast",
-            name="Cast_2",
-            inputs=["sigmoid_output"],
-            outputs=["output"],
-            to=1
-        )
-        value_info = [
-            ("input", (1, 16, 224, 224), TensorProto.FLOAT),
-            ("conv_output", (1, 1, 224, 224), TensorProto.FLOAT),
-            ("cast_1_output", (1, 1, 224, 224), TensorProto.FLOAT),
-            ("sigmoid_output", (1, 1, 224, 224), TensorProto.FLOAT),
-        ]
-        onnx_model = _onnx_create_model([conv, cast_1, sigmoid, cast_2], inputs, outputs, value_info=value_info)
-
-        graph = Graph.from_onnx(onnx_model.graph, onnx_ir_version=5)
-        new_graph = graph.transformed([CastOpRemover()])
-
-        # The last Cast operation should not be removed
-        self.assertEqual(len(graph.nodes), 4)
-        self.assertEqual(len(new_graph.nodes), 3)
-        self.assertEqual(new_graph.nodes[0].inputs[0], "input")
-        self.assertEqual(new_graph.nodes[1].inputs[0], new_graph.nodes[0].outputs[0])
-        self.assertEqual(new_graph.nodes[2].outputs[0], "output")
-
-
-@unittest.skipUnless(_HAS_ONNX, MSG_ONNX_NOT_FOUND)
-class PixelShuffleFuserTest(unittest.TestCase):
-    def test_pixel_shuffle(self):  # type: () -> None
-        scale_factor = 2
-        input_shape = (1, 8, 2, 2)
-        output_shape = (
-            input_shape[0],
-            int(input_shape[1] / (scale_factor ** 2)),
-            input_shape[2] * scale_factor,
-            input_shape[3] * scale_factor,
-        )
-
-        inputs = [("input0", input_shape)]
-        outputs = [("output0", output_shape, TensorProto.FLOAT)]
-
-        shape1 = [
-            output_shape[0],
-            output_shape[1],
-            scale_factor,
-            scale_factor,
-            input_shape[2],
-            input_shape[3],
-        ]
-
-        shape1 = numpy_helper.from_array(np.asarray(shape1), name="shape1")
-        shape2 = numpy_helper.from_array(np.asarray(list(output_shape)), name="shape2")
-
-        node_0 = helper.make_node(
-            "Reshape", inputs=[inputs[0][0], "shape1"], outputs=["node0"],
-        )
-        node_1 = helper.make_node(
-            "Transpose", inputs=["node0"], outputs=["node1"], perm=[0, 1, 4, 2, 5, 3]
-        )
-        node_2 = helper.make_node(
-            "Reshape", inputs=["node1", "shape2"], outputs=[outputs[0][0]],
-        )
-        model = _onnx_create_model(
-            [node_0, node_1, node_2], inputs, outputs, initializer=[shape1, shape2]
-        )
-        _test_onnx_model(model, decimal=7)
diff --git a/coremltools/converters/onnx/_transformers.py b/coremltools/converters/onnx/_transformers.py
deleted file mode 100644
index 048a03cb2..000000000
--- a/coremltools/converters/onnx/_transformers.py
+++ /dev/null
@@ -1,940 +0,0 @@
-
-from typing import Sequence, Text, Dict, List, Tuple
-import numpy as np
-
-from onnx import TensorProto
-
-from ._graph import Graph, Node
-
-
-def _get_fully_defined_shape(shape, blob_name, graph):
-    if not np.any(shape == -1):
-        return shape
-    if blob_name not in graph.shape_dict:
-        return shape
-    else:
-        return graph.shape_dict[blob_name]
-
-
-def _remove_single_input_output_node(node):
-    for child in node.children:
-        for i, child_input in enumerate(child.inputs):
-            if child_input == node.outputs[0]:
-                # Pass input to child
-                child.inputs[i] = node.inputs[0]
-                # If input tensor is known, pass down the input tensor value
-                if node.inputs[0] in node.input_tensors:
-                    child.input_tensors[node.inputs[0]] = node.input_tensors[
-                        node.inputs[0]
-                    ]
-                # Remove link as a parent from child node
-                child.parents.remove(node)
-                # Link current nodes parent and current child
-                for parent in node.parents:
-                    child.parents.append(parent)
-                    parent.children.append(child)
-                break
-
-    for parent in node.parents:
-        parent.children.remove(node)
-
-
-class NodesFuser(object):
-    """
-    An abstract helper for merging nodes
-    """
-
-    def __init__(
-        self, num_nodes,  # type: int
-    ):
-        # type: (...) -> None
-        assert num_nodes >= 2, "Algorithm only works if fusing multiple nodes"
-        self.num_nodes = num_nodes
-
-    def __call__(self, graph):  # type: (Graph) -> Graph
-        nodes = graph.nodes
-        merged_nodes = {}
-        for node in nodes:
-            nodes_window = []  # type: List[Node]
-            n = node
-            for _ in range(self.num_nodes - 1):
-                if len(n.parents) != 1:
-                    # We're only fusing nodes with single parents
-                    break
-                p = n.get_only_parent()
-                if len(p.children) != 1:
-                    # We can only fuse a node if its parent's
-                    # value isn't used by any other node.
-                    break
-                nodes_window.insert(0, n)
-                n = p
-            if len(nodes_window) > 0:
-                # add parent of chained nodes
-                first = nodes_window[0]
-                p = first.get_only_parent()
-                if len(p.children) == 1:
-                    nodes_window.insert(0, p)
-            if len(nodes_window) != self.num_nodes:
-                continue
-            if not self.is_eligible(graph, nodes_window):
-                continue
-            merged = self.merge(graph, nodes_window)
-            first, last = nodes_window[0], nodes_window[-1]
-            for parent in first.parents:
-                parent.children.remove(first)
-                if merged[0] not in parent.children:
-                    parent.add_child(merged[0])
-            for child in last.children:
-                child.parents.remove(last)
-                if merged[-1] not in child.parents:
-                    child.add_parent(merged[-1])
-            for n in nodes_window:
-                merged_nodes[n.name] = merged
-
-        transformed_nodes = []
-        added_merged = []  # type: List[Node]
-        for node in nodes:
-            if node.name in merged_nodes:
-                merged = merged_nodes[node.name]
-                if merged[0] not in added_merged:
-                    for n in merged:
-                        transformed_nodes.append(n)
-                    added_merged.append(merged[0])
-            else:
-                transformed_nodes.append(node)
-        return graph.create_graph(nodes=transformed_nodes)
-
-    def is_eligible(self, graph, nodes):  # type: (Graph, Sequence[Node]) -> bool
-        """Returns true if this subset of nodes is eligible for fusion."""
-        raise NotImplementedError("Must be implemented by subclass.")
-
-    def merge(self, graph, nodes):  # type: (Graph, Sequence[Node]) -> Sequence[Node]
-        """Merge nodes"""
-        nodes[0].outputs = nodes[-1].outputs
-        return [nodes[0]]
-
-
-class ConvAddFuser(NodesFuser):
-    """
-    Fuses Add layer into parent convolution layer.
-    """
-
-    def __init__(self):  # type: () -> None
-        super(ConvAddFuser, self).__init__(2)
-
-    def is_eligible(self, graph, nodes):  # type: (Graph, Sequence[Node]) -> bool
-        parent, child = nodes[0], nodes[1]
-        if parent.op_type != "Conv":
-            return False
-        if child.op_type != "Add":
-            return False
-        if "broadcast" not in child.attrs:
-            return False
-        if "axis" not in child.attrs:
-            return False
-        if parent.inputs[1] not in parent.input_tensors:
-            return False
-        if len(parent.inputs) > 2 and parent.inputs[2] not in parent.input_tensors:
-            return False
-        if child.inputs[1] not in child.input_tensors:
-            return False
-
-        broadcast = child.attrs["broadcast"]
-        if broadcast != 1:
-            return False
-
-        axis = child.attrs["axis"]
-        if axis != 1:
-            return False
-
-        return True
-
-    def merge(self, graph, nodes):  # type: (Graph, Sequence[Node]) -> Sequence[Node]
-        parent, child = nodes[0], nodes[1]
-        output_channels = parent.input_tensors[parent.inputs[1]].shape[0]
-        if len(parent.inputs) > 2:
-            bias_input_name = parent.inputs[2]
-            bias = parent.input_tensors[bias_input_name]
-        else:
-            bias_input_name = "{}_bias".format(parent.name,)
-            parent.inputs.append(bias_input_name)
-            bias = np.zeros((output_channels,), dtype=np.float32)
-            parent.input_tensors[bias_input_name] = bias
-        bias = bias + child.input_tensors[child.inputs[1]]
-        parent.input_tensors[bias_input_name] = bias
-        parent.outputs = child.outputs
-        parent.children.remove(child)
-        child.parents.remove(parent)
-        return [parent]
-
-
-class BNBroadcastedMulFuser(NodesFuser):
-    """
-    Fuses Mul into BatchNorm
-    """
-
-    def __init__(self):  # type: () -> None
-        super(BNBroadcastedMulFuser, self).__init__(2)
-
-    def is_eligible(self, graph, nodes):  # type: (Graph, Sequence[Node]) -> bool
-        parent, child = nodes[0], nodes[1]
-        if parent.op_type != "BatchNormalization":
-            return False
-        if child.op_type != "Mul":
-            return False
-        if len(child.inputs) != 2:
-            return False
-        if child.inputs[1] not in child.input_tensors:
-            return False
-        t = child.input_tensors[child.inputs[1]]
-        if len(np.squeeze(t).shape) != 1:
-            return False
-        if parent.inputs[1] not in parent.input_tensors:
-            return False
-        if parent.inputs[2] not in parent.input_tensors:
-            return False
-        return True
-
-    def merge(self, graph, nodes):  # type: (Graph, Sequence[Node]) -> Sequence[Node]
-        parent, child = nodes[0], nodes[1]
-        weight = parent.input_tensors[parent.inputs[1]]
-        bias = parent.input_tensors[parent.inputs[2]]
-        W = np.squeeze(child.input_tensors[child.inputs[1]])
-        parent.input_tensors[parent.inputs[1]] = np.multiply(weight, W)
-        parent.input_tensors[parent.inputs[2]] = np.multiply(bias, W)
-        parent.outputs = child.outputs
-        parent.children.remove(child)
-        child.parents.remove(parent)
-        return [parent]
-
-
-class BNBroadcastedAddFuser(NodesFuser):
-    """
-    Fuses Add into BatchNorm
-    """
-
-    def __init__(self):  # type: () -> None
-        super(BNBroadcastedAddFuser, self).__init__(2)
-
-    def is_eligible(self, graph, nodes):  # type: (Graph, Sequence[Node]) -> bool
-        parent, child = nodes[0], nodes[1]
-        if parent.op_type != "BatchNormalization":
-            return False
-        if child.op_type != "Add":
-            return False
-        if len(child.inputs) != 2:
-            return False
-        if child.inputs[1] not in child.input_tensors:
-            return False
-        t = child.input_tensors[child.inputs[1]]
-        if len(np.squeeze(t).shape) != 1:
-            return False
-        if parent.inputs[1] not in parent.input_tensors:
-            return False
-        if parent.inputs[2] not in parent.input_tensors:
-            return False
-        return True
-
-    def merge(self, graph, nodes):  # type: (Graph, Sequence[Node]) -> Sequence[Node]
-        parent, child = nodes[0], nodes[1]
-        bias = parent.input_tensors[parent.inputs[2]]
-        b = np.squeeze(child.input_tensors[child.inputs[1]])
-        parent.input_tensors[parent.inputs[2]] = bias + b
-        parent.outputs = child.outputs
-        parent.children.remove(child)
-        child.parents.remove(parent)
-        return [parent]
-
-
-class DropoutRemover(NodesFuser):
-    """
-    Removes Dropout layer
-    """
-
-    def __init__(self):  # type: () -> None
-        super(DropoutRemover, self).__init__(2)
-
-    def is_eligible(self, graph, nodes):  # type: (Graph, Sequence[Node]) -> bool
-        child = nodes[1]
-        return child.op_type == "Dropout"
-
-    def merge(self, graph, nodes):  # type: (Graph, Sequence[Node]) -> Sequence[Node]
-        parent, child = nodes[0], nodes[1]
-        parent.children.remove(child)
-        child.parents.remove(parent)
-        parent.outputs = [child.outputs[0]]
-        return [parent]
-
-
-class ReshapeInitTensorFuser(object):
-    """
-    Fuses Reshape operator if it is used only to reshape blob in
-    graph initializer. We can reshape here instead of runtime.
-    """
-
-    def __call__(self, graph):  # type: (Graph) -> Graph
-        nodes = graph.nodes
-        removed = []
-        for node in nodes:
-            if node.op_type != "Reshape":
-                continue
-            if not (len(node.input_tensors) == 2 or len(node.input_tensors) == 1):
-                continue
-            tensor_name = node.inputs[0]
-            if tensor_name not in node.input_tensors:
-                continue
-            if len(node.inputs) > 1:
-                shape_name = node.inputs[1]
-                if shape_name not in node.input_tensors:
-                    continue
-            is_non_constant_parent = False
-            if len(node.parents) > 0:
-                for parent in node.parents:
-                    if parent.op_type != "Constant":
-                        is_non_constant_parent = True
-                        break
-            if is_non_constant_parent:
-                continue
-
-            removed.append(node)
-            output_name = node.outputs[0]
-
-            tensor = node.input_tensors[tensor_name]
-            if "shape" in node.attrs:
-                shape = tuple(node.attrs["shape"])
-            else:
-                shape = node.input_tensors[shape_name]  # type: ignore
-
-            # ONNX spec supports setting dimension to '0', in which case
-            # it should be taken from old dimension.
-            # This isn't supported in numpy, so don't transform.
-            # TODO Should we support this case?
-            if any([s == 0 for s in shape]):
-                continue
-
-            reshaped_tensor = tensor.reshape(shape.astype(int))
-
-            for child in node.children:
-                child.parents.remove(node)
-                child.input_tensors[output_name] = reshaped_tensor
-
-        transformed_nodes = [node for node in nodes if node not in removed]
-        return graph.create_graph(nodes=transformed_nodes)
-
-
-class OutputRenamer(object):
-    """
-    Rename outputs according to mapping
-    """
-
-    def __init__(
-        self, mapping,  # type: Dict[Text, Text]
-    ):
-        # type: (...) -> None
-        self.mapping = mapping
-
-    def __call__(self, graph):  # type: (Graph) -> Graph
-        mapping = self.mapping.copy()
-        nodes = graph.nodes
-        for node in nodes:
-            for i in range(len(node.outputs)):
-                output = node.outputs[i]
-                if output not in mapping:
-                    continue
-                node.outputs[i] = mapping[output]
-                for child in node.children:
-                    for j in range(len(child.inputs)):
-                        input_ = child.inputs[j]
-                        if input_ != output:
-                            continue
-                        child.inputs[j] = mapping[output]
-                del mapping[output]
-                if len(mapping) == 0:
-                    break
-        return graph
-
-
-class ReshapeTransposeReshape_pattern1(NodesFuser):
-    """
-    Detects certain types of patterns of "reshape-> (rank 6) -> transpose (rank 6) -> reshape (rank 4)" that can be converted
-    """
-
-    def __init__(self):  # type: () -> None
-        super(ReshapeTransposeReshape_pattern1, self).__init__(3)
-        self.num_added = 0
-
-    def is_eligible(self, graph, nodes):  # type: (Graph, Sequence[Node]) -> bool
-        if not (
-            nodes[0].op_type == "Reshape"
-            and nodes[1].op_type == "Transpose"
-            and nodes[2].op_type == "Reshape"
-        ):
-            return False
-        if len(nodes[0].inputs) == 1 or len(nodes[2].inputs) == 1:
-            return False  # it's an old version of onnx Reshape op that had shape as an attribute
-        if nodes[0].inputs[1] not in nodes[0].input_tensors:
-            return False
-        if nodes[2].inputs[1] not in nodes[2].input_tensors:
-            return False
-
-        shape_1 = nodes[0].input_tensors[nodes[0].inputs[1]]
-        shape_final = nodes[2].input_tensors[nodes[2].inputs[1]]
-
-        shape_1 = _get_fully_defined_shape(shape_1, nodes[0].outputs[0], graph)
-        shape_final = _get_fully_defined_shape(shape_final, nodes[2].outputs[0], graph)
-
-        if len(shape_1) != 6 or shape_1[0] != 1 or len(shape_final) != 4:
-            return False
-
-        # check if coreml can convert this sequence using 1 transpose layer
-        perm = nodes[1].attrs.get("perm", [])
-        if len(perm) != 6:
-            return False
-        if perm[0] != 0:
-            return False
-
-        consecutive_indices = False
-        perm = perm[1:]
-        for i in range(1, 5):
-            if perm[i] - perm[i - 1] == 1:
-                consecutive_indices = True
-                break
-
-        if not consecutive_indices:
-            return False
-
-        return True
-
-    def get_unique_edge_name(self, graph, name):  # type: (Graph, Text) -> Text
-        self.num_added += 1
-        return graph.get_unique_edge_name(name + "_" + str(self.num_added))
-
-    def merge(self, graph, nodes):  # type: (Graph, Sequence[Node]) -> Sequence[Node]
-        """
-        In general, CoreML Reshape and Transpose layers don't support tensors with more
-        than 4 dimensions. However, certain patterns in onnx like
-            "reshape-> (rank 6) -> transpose (rank 6) -> reshape (rank 4)"
-        can be translated to CoreML as (i.e. without going to rank 6)
-            "reshape-> (rank 4) -> transpose (rank 4) -> reshape (rank 4)"
-        """
-        reshape_1 = nodes[0]
-        transpose_1 = nodes[1]
-        final_reshape = nodes[2]
-
-        shape_1 = reshape_1.input_tensors[reshape_1.inputs[1]]
-        shape_1 = _get_fully_defined_shape(shape_1, nodes[0].outputs[0], graph)
-        shape_1 = shape_1[1:]
-        perm = nodes[1].attrs.get("perm", [])
-        perm = perm[1:]
-        perm = [x - 1 for x in perm]
-        # now perm is length 5 list
-
-        new_perm = []
-        new_shape = [1, 1, 1, 1]
-        i = 0
-        found_consecutive_pair = False
-        while i < 5:
-            if not found_consecutive_pair and i < 4 and perm[i + 1] - perm[i] == 1:
-                new_perm.append(perm[i])
-                new_shape[perm[i]] = shape_1[perm[i]] * shape_1[perm[i + 1]]
-                i = i + 2
-                found_consecutive_pair = True
-                continue
-            else:
-                new_perm.append(perm[i] - 1)
-                new_shape[perm[i] - 1] = shape_1[perm[i]]
-            i += 1
-
-        reshape_1.input_tensors[reshape_1.inputs[1]] = np.asarray(new_shape)
-        transpose_1.attrs["perm"] = new_perm
-
-        return [reshape_1, transpose_1, final_reshape]
-
-
-class PixelShuffleFuser(NodesFuser):
-    def __init__(self):  # type: () -> None
-        super(PixelShuffleFuser, self).__init__(3)
-        self.num_added = 0
-
-    def is_eligible(self, graph, nodes):  # type: (Graph, Sequence[Node]) -> bool
-        if not (
-            nodes[0].op_type == "Reshape"
-            and nodes[1].op_type == "Transpose"
-            and nodes[2].op_type == "Reshape"
-        ):
-            return False
-        if len(nodes[0].inputs) == 1 or len(nodes[2].inputs) == 1:
-            return False  # it's an old version of onnx Reshape op that had shape as an attribute
-        if nodes[0].inputs[1] not in nodes[0].input_tensors:
-            return False
-        if nodes[2].inputs[1] not in nodes[2].input_tensors:
-            return False
-
-        shape_1 = nodes[0].input_tensors[nodes[0].inputs[1]]
-        shape_final = nodes[2].input_tensors[nodes[2].inputs[1]]
-
-        shape_1 = _get_fully_defined_shape(shape_1, nodes[0].outputs[0], graph)
-        shape_final = _get_fully_defined_shape(shape_final, nodes[2].outputs[0], graph)
-
-        if len(shape_1) != 6 or shape_1[0] != 1 or len(shape_final) != 4:
-            return False
-
-        if nodes[1].attrs.get("perm", []) != [0, 1, 4, 2, 5, 3]:
-            return False
-
-        return True
-
-    def get_unique_edge_name(self, graph, name):  # type: (Graph, Text) -> Text
-        self.num_added += 1
-        return graph.get_unique_edge_name(name + "_" + str(self.num_added))
-
-    def merge(self, graph, nodes):  # type: (Graph, Sequence[Node]) -> Sequence[Node]
-        """
-        Pixel shuffle is implemented using 3 operators:
-            - Reshape --> rank 6 (1, x1, x2, x3, x4, x5)
-            - Transpose(0, 1, 4, 2, 5, 3) --> (1, x1, x4, x2, x5, x3)
-            - Reshape ---> rank 4
-        CoreML Reshape and Transpose layers don't support tensors with more
-        than 4 dimensions. Thus we change above sequence of operators to the
-        following equivalent sequence:
-            - Reshape --> (x1, x2, x3, x4 * x5)
-            - Transpose(0, 3, 1, 2) --> (x1, x4 * x5, x2, x3)
-            - Reshape --> (x1 * x4, x5, x2, x3)
-            - Transpose(0, 2, 1, 3) --> (x1 * x4, x2, x5, x3)
-            - Reshape --> rank 4
-        """
-        reshape_1 = nodes[0]
-        transpose_1 = nodes[1]
-        final_reshape = nodes[2]
-
-        # first reshape
-        shape_1 = reshape_1.input_tensors[reshape_1.inputs[1]]
-        shape_1 = _get_fully_defined_shape(shape_1, nodes[0].outputs[0], graph)
-        x1 = shape_1[1]
-        x2 = shape_1[2]
-        x3 = shape_1[3]
-        x4 = shape_1[4]
-        x5 = shape_1[5]
-        reshape_1.input_tensors[reshape_1.inputs[1]] = np.asarray([x1, x2, x3, x4 * x5])
-
-        # first transpose
-        transpose_1.children = []
-        transpose_1.attrs["perm"] = [0, 3, 1, 2]
-
-        reshape_output_name = final_reshape.name + "_pixel_shuffle_reshape"
-        transpose_output_name = final_reshape.name + "_pixel_shuffle_transpose"
-
-        transpose_1.outputs = [self.get_unique_edge_name(graph, transpose_output_name)]
-
-        shape_name_second_reshape = self.get_unique_edge_name(
-            graph, reshape_output_name
-        )
-        output_name_second_reshape = self.get_unique_edge_name(
-            graph, reshape_output_name
-        )
-
-        # second reshape
-        reshape_2 = Node(
-            reshape_output_name,
-            "Reshape",
-            {},
-            [transpose_1.outputs[0], shape_name_second_reshape],
-            [output_name_second_reshape],
-        )
-        reshape_2.input_tensors[shape_name_second_reshape] = np.asarray(
-            [x1 * x4, x5, x2, x3]
-        )
-        transpose_1.add_child(reshape_2)
-
-        # second transpose
-        transpose_2 = Node(
-            transpose_output_name,
-            "Transpose",
-            {"perm": [0, 2, 1, 3]},
-            reshape_2.outputs,
-            [self.get_unique_edge_name(graph, transpose_output_name)],
-        )
-        reshape_2.add_child(transpose_2)
-
-        # third reshape
-        final_reshape.inputs = [transpose_2.outputs[0], nodes[2].inputs[1]]
-        final_reshape.parents = []
-        transpose_2.add_child(final_reshape)
-
-        return [reshape_1, transpose_1, reshape_2, transpose_2, final_reshape]
-
-
-class AddModelInputsOutputs(object):
-    """
-    Expose hidden states of recurrent layers as model inputs and outputs
-    """
-
-    def __call__(self, graph):  # type: (Graph) -> Graph
-        input_names = [str(input_[0]) for input_ in graph.inputs]
-        output_names = [str(output_[0]) for output_ in graph.outputs]
-        for node in graph.nodes:
-            if str(node.op_type) == "LSTM":
-                input_h = (
-                    node.inputs[5]
-                    if len(node.inputs) > 5
-                    else node.inputs[0] + "_h_input"
-                )
-                input_c = (
-                    node.inputs[6]
-                    if len(node.inputs) > 6
-                    else node.inputs[0] + "_c_input"
-                )
-                output_h = (
-                    node.outputs[1]
-                    if len(node.outputs) > 1
-                    else node.outputs[0] + "_h_output"
-                )
-                output_c = (
-                    node.outputs[2]
-                    if len(node.outputs) > 2
-                    else node.outputs[0] + "_c_output"
-                )
-                h = node.attrs["hidden_size"]
-                for input_ in [str(input_h), str(input_c)]:
-                    if input_ not in input_names:
-                        graph.inputs.append(tuple((input_, TensorProto.FLOAT, (h,))))  # type: ignore
-                    if input_ not in graph.blob_to_op_type:
-                        graph.blob_to_op_type[input_] = ["LSTM"]
-                for output_ in [str(output_h), str(output_c)]:
-                    if output_ not in output_names:
-                        graph.outputs.append(tuple((output_, TensorProto.FLOAT, (h,))))  # type: ignore
-                    graph.blob_from_op_type[output_] = "LSTM"
-        return graph
-
-
-class ConstantsToInitializers(object):
-    """
-    Takes onnx Constant nodes and puts the tensor into graph initializers instead.
-    """
-
-    def __call__(self, graph):  # type: (Graph) -> Graph
-        output_names = [str(output_[0]) for output_ in graph.outputs]
-        nodes_to_be_removed = []
-        for node in graph.nodes:
-            if node.op_type == "Constant" and (node.name not in output_names):
-                nodes_to_be_removed.append(node)
-                x = node.attrs["value"]
-                for child in node.children:
-                    child.input_tensors[node.outputs[0]] = x
-                    child.parents.remove(node)
-                graph.shape_dict[node.outputs[0]] = x.shape
-
-        transformed_nodes = []
-        for node in graph.nodes:
-            if node not in nodes_to_be_removed:
-                transformed_nodes.append(node)
-        return graph.create_graph(nodes=transformed_nodes)
-
-
-class ConstantFillToInitializers(object):
-    """
-    Takes onnx ConstantFill nodes and puts the tensor into graph initializers instead, for simple cases only.
-    """
-
-    def __call__(self, graph):  # type: (Graph) -> Graph
-        output_names = [str(output_[0]) for output_ in graph.outputs]
-        nodes_to_be_removed = []
-        for node in graph.nodes:
-            if (
-                node.op_type == "ConstantFill"
-                and (node.name not in output_names)
-                and node.attrs.get("input_as_shape", 0)
-                and node.inputs[0] in node.input_tensors
-                and node.attrs.get("extra_shape", None) is None
-            ):
-
-                s = node.input_tensors[node.inputs[0]]
-                x = np.ones(tuple(s.astype(int))) * node.attrs.get("value", 0.0)
-                nodes_to_be_removed.append(node)
-                for child in node.children:
-                    child.input_tensors[node.outputs[0]] = x
-                    child.parents.remove(node)
-                graph.shape_dict[node.outputs[0]] = x.shape
-
-        transformed_nodes = []
-        for node in graph.nodes:
-            if node not in nodes_to_be_removed:
-                transformed_nodes.append(node)
-        return graph.create_graph(nodes=transformed_nodes)
-
-
-class ShapeOpRemover(object):
-    """
-    remove shape op, if the input shape is fully known
-    """
-
-    def __call__(self, graph):  # type: (Graph) -> Graph
-        nodes_to_be_removed = []
-        output_names = [str(output_[0]) for output_ in graph.outputs]
-        for node in graph.nodes:
-            if (
-                node.op_type == "Shape"
-                and (node.name not in output_names)
-                and node.inputs[0] in graph.shape_dict
-            ):
-                x_tuple = graph.shape_dict[node.inputs[0]]  # type: Tuple[int, ...]
-                is_well_defined = True
-                for i in x_tuple:
-                    if not (isinstance(i, int) and i > 0):
-                        is_well_defined = False
-                        break
-                if is_well_defined:
-                    x = np.asarray(x_tuple, dtype=np.float32)
-                    nodes_to_be_removed.append(node)
-                    for child in node.children:
-                        child.input_tensors[node.outputs[0]] = x
-                        child.parents.remove(node)
-                    for parent in node.parents:
-                        parent.children.remove(node)
-                    graph.shape_dict[node.outputs[0]] = x.shape
-
-        transformed_nodes = []
-        for node in graph.nodes:
-            if node not in nodes_to_be_removed:
-                transformed_nodes.append(node)
-        return graph.create_graph(nodes=transformed_nodes)
-
-
-class CastOpRemover(object):
-    """
-    Remove Cast Op: onnx-coreml treats all tensor as Float and hence, Cast operator should be removed
-    """
-
-    def __call__(self, graph):  # type: (Graph) -> Graph
-        global cast_i
-        nodes_to_be_removed = []
-        output_names = [str(output_[0]) for output_ in graph.outputs]
-        for node in graph.nodes:
-            if (
-                node.op_type == "Cast"
-                and all(output not in output_names for output in node.outputs)
-                and node.inputs[0] in graph.shape_dict
-            ):
-                nodes_to_be_removed.append(node)
-                _remove_single_input_output_node(node)
-
-        transformed_nodes = []
-        for node in graph.nodes:
-            if node not in nodes_to_be_removed:
-                transformed_nodes.append(node)
-        return graph.create_graph(nodes=transformed_nodes)
-
-
-class PaddingOpRemover(object):
-    """
-    Remove Pad Op if all the pad values are 0
-    """
-
-    def __call__(self, graph):  # type: (Graph) -> Graph
-        global cast_i
-        nodes_to_be_removed = []
-        output_names = [str(output_[0]) for output_ in graph.outputs]
-        for node in graph.nodes:
-            if (
-                node.op_type == "Pad"
-                and (node.name not in output_names)
-                and node.inputs[0] in graph.shape_dict
-            ):
-                pads = node.attrs.get("pads", [])
-                if len(pads) > 0 and sum(pads) == 0:
-                    nodes_to_be_removed.append(node)
-                    _remove_single_input_output_node(node)
-
-        transformed_nodes = []
-        for node in graph.nodes:
-            if node not in nodes_to_be_removed:
-                transformed_nodes.append(node)
-        return graph.create_graph(nodes=transformed_nodes)
-
-
-class ImageScalerRemover(object):
-    """
-    Removes ImageScaler layer if connected to a model input and single parent child nodes
-    """
-
-    def __call__(self, graph):  # type: (Graph) -> Graph
-        input_names = [str(input_[0]) for input_ in graph.inputs]
-        nodes_to_be_removed = []
-        for node in graph.nodes:
-            if (
-                (node.op_type != "ImageScaler")
-                or (len(node.parents) != 0)
-                or (node.inputs[0] not in input_names)
-            ):
-                continue
-            nodes_to_be_removed.append(node.name)
-            for child in node.children:
-                for i, child_input in enumerate(child.inputs):
-                    if child_input == node.outputs[0]:
-                        child.inputs[i] = node.inputs[0]
-                        child.parents.remove(node)
-                        break
-
-        transformed_nodes = []
-        for node in graph.nodes:
-            if node.name not in nodes_to_be_removed:
-                transformed_nodes.append(node)
-        return graph.create_graph(nodes=transformed_nodes)
-
-
-class ConstantRemover(object):
-    """
-    Removes Op if its input is constant
-    Currently, Supports: Gather, Floor, Div, Mul, Slice, Transpose, Concat, Unsqueeze, Squeeze
-    """
-
-    def __call__(self, graph):  # type: (Graph) -> Graph
-        nodes_to_be_removed = []
-        for node in graph.nodes:
-            are_all_inputs_constant = True
-            for input_ in node.inputs:
-                if input_ not in node.input_tensors:
-                    are_all_inputs_constant = False
-                    break
-
-            transformation_performed = False
-            if len(node.parents) != 0 or are_all_inputs_constant == False:
-                continue
-            # TODO: Replace If -> ElIf with more general transformation block
-            if node.op_type == "Gather":
-                data = node.input_tensors[node.inputs[0]]
-                idx = node.input_tensors[node.inputs[1]]
-                axis = node.attrs.get("axis", 0)
-                output = np.take(data, idx, axis=axis)
-                transformation_performed = True
-            elif node.op_type == "Floor":
-                input = node.input_tensors[node.inputs[0]]
-                output = np.floor(input)
-                transformation_performed = True
-            elif node.op_type == "Div" or node.op_type == "Mul":
-                x = node.input_tensors[node.inputs[0]]
-                y = node.input_tensors[node.inputs[1]]
-                for child_node in node.children:
-                    # child_node.parents.remove(node)
-                    if node.op_type == "Div":
-                        output = x / y
-                    else:
-                        output = x * y
-                transformation_performed = True
-            elif node.op_type == "Slice":
-                x = node.input_tensors[node.inputs[0]]
-                ends = node.attrs["ends"]
-                starts = node.attrs["starts"]
-                axes = node.attrs.get("axes", range(len(starts)))
-                output = x
-                for i, a in enumerate(axes):
-                    s = starts[i]
-                    e = ends[i]
-                    n = x.shape[a]
-                    if s < 0:
-                        s += n
-                    if e < 0:
-                        e += n
-                    output = np.take(x, range(s, e), axis=a)  # type: ignore
-                transformation_performed = True
-            elif node.op_type == "Transpose":
-                x = node.input_tensors[node.inputs[0]]
-                perm = node.attrs.get("perm", None)
-                output = np.transpose(x, axes=perm)  # type: ignore
-                transformation_performed = True
-            elif node.op_type == "Concat":
-                x_arr = []
-                for input_ in node.inputs:
-                    x_arr.append(node.input_tensors[input_])
-                axis = node.attrs.get("axis", 0)
-                output = np.concatenate(x_arr, axis=axis)  # type: ignore
-                transformation_performed = True
-            elif node.op_type == "Unsqueeze" or node.op_type == "Squeeze":
-                x = node.input_tensors[node.inputs[0]]
-                if node.op_type == "Unsqueeze":
-                    axes = node.attrs["axes"]
-                    axes.sort()
-                    for axis in axes:
-                        output = np.expand_dims(x, axis=axis)  # type: ignore
-                else:
-                    axes = node.attrs.get("axes", None)
-                    output = np.squeeze(x, axis=tuple(axes))
-                transformation_performed = True
-            elif node.op_type == "Gemm":
-                alpha = node.attrs.get("alpha", 1.0)
-                beta = node.attrs.get("beta", 1.0)
-                transA = node.attrs.get("transA", False)
-                transB = node.attrs.get("transB", False)
-
-                A_tensor = node.input_tensors[node.inputs[0]]
-                B_tensor = node.input_tensors[node.inputs[1]]
-                C_tensor = node.input_tensors[node.inputs[2]]
-
-                A_tensor = np.transpose(A_tensor) if transA else A_tensor
-                B_tensor = np.transpose(B_tensor) if transB else B_tensor
-
-                output = alpha * np.dot(A_tensor, B_tensor) + beta * C_tensor
-                transformation_performed = True
-
-            if transformation_performed:
-                nodes_to_be_removed.append(node)
-                graph.shape_dict[node.outputs[0]] = output.shape
-                for child_node in node.children:
-                    child_node.parents.remove(node)
-                    child_node.input_tensors[node.outputs[0]] = output
-        transformed_nodes = []
-        for node in graph.nodes:
-            if node not in nodes_to_be_removed:
-                transformed_nodes.append(node)
-        return graph.create_graph(nodes=transformed_nodes)
-
-
-class DeadCodeElimination(object):
-    """
-    Removes nodes with unused outputs
-    """
-
-    def __call__(self, graph):  # type: (Graph) -> Graph
-        input_names = [str(input_[0]) for input_ in graph.inputs]
-        output_names = set([str(output_[0]) for output_ in graph.outputs])
-
-        nodes_to_be_removed = []
-        uses = {}
-
-        for _output in output_names:
-            uses[_output] = uses.get(_output, 0) + 1
-
-        for node in graph.nodes:
-            for _input in node.inputs:
-                uses[_input] = uses.get(_input, 0) + 1
-
-        for node in reversed(graph.nodes):
-            output_used = False
-            for _output in node.outputs:
-                if _output in uses:
-                    output_used = True
-                    break
-
-            if not output_used:
-                # Remove current node
-                for _input in node.inputs:
-                    uses[_input] -= 1
-                    if uses[_input] == 0:
-                        del uses[_input]
-                nodes_to_be_removed.append(node.name)
-                for parent in node.parents:
-                    parent.children.remove(node)
-
-        transformed_nodes = []
-        for node in graph.nodes:
-            if node.name not in nodes_to_be_removed:
-                transformed_nodes.append(node)
-
-        for _input in input_names:
-            if _input not in uses:
-                for i in range(len(graph.inputs)):
-                    if graph.inputs[i][0] is _input:
-                        graph.inputs.remove(graph.inputs[i])
-                        break
-
-        return graph.create_graph(nodes=transformed_nodes)
diff --git a/coremltools/models/__init__.py b/coremltools/models/__init__.py
index 63379af43..c0a753bf7 100644
--- a/coremltools/models/__init__.py
+++ b/coremltools/models/__init__.py
@@ -32,3 +32,4 @@
 )
 
 from . import neural_network
+from . import ml_program
diff --git a/coremltools/models/datatypes.py b/coremltools/models/datatypes.py
index 2ebd8dae6..b395b9da9 100644
--- a/coremltools/models/datatypes.py
+++ b/coremltools/models/datatypes.py
@@ -12,7 +12,7 @@
 from ..proto import Model_pb2
 
 
-class _DatatypeBase(object):
+class _DatatypeBase:
     def __init__(self, type_tag, full_tag, num_elements):
         self.type_tag, self.full_tag = type_tag, full_tag
         self.num_elements = num_elements
diff --git a/coremltools/converters/onnx/__init__.py b/coremltools/models/ml_program/__init__.py
similarity index 56%
rename from coremltools/converters/onnx/__init__.py
rename to coremltools/models/ml_program/__init__.py
index 2b5092ed2..9c0d8b44f 100644
--- a/coremltools/converters/onnx/__init__.py
+++ b/coremltools/models/ml_program/__init__.py
@@ -1,7 +1,6 @@
-# Copyright (c) 2018, Apple Inc. All rights reserved.
+# Copyright (c) 2022, Apple Inc. All rights reserved.
 #
 # Use of this source code is governed by a BSD-3-clause license that can be
 # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
-from coremltools._deps import _HAS_ONNX
-from ._converter import convert
+from . import compression_utils
\ No newline at end of file
diff --git a/coremltools/models/ml_program/compression_utils.py b/coremltools/models/ml_program/compression_utils.py
new file mode 100644
index 000000000..36e8c8e80
--- /dev/null
+++ b/coremltools/models/ml_program/compression_utils.py
@@ -0,0 +1,580 @@
+# Copyright (c) 2022, Apple Inc. All rights reserved.
+#
+# Use of this source code is governed by a BSD-3-clause license that can be
+# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+from coremltools.converters.mil.converter import mil_convert as _mil_convert
+from coremltools.converters.mil import Operation as _Operation
+from coremltools.converters.mil.frontend.milproto.load import load as _milproto_to_pymil
+from coremltools.converters.mil.mil.passes.compression_passes import (
+    WeightSparsifier as _WeightSparsifier,
+    WeightPalettizer as _WeightPalettizer,
+    WeightAffineQuantizer as _WeightAffineQuantizer,
+    WeightDecompressor as _WeightDecompressor,
+)
+from coremltools.converters.mil.mil.passes.quantization_passes import AbstractQuantizationPass as _AbstractQuantizationPass
+from coremltools import (
+    _SPECIFICATION_VERSION_IOS_16,
+    ComputeUnit as _ComputeUnit
+)
+
+_DEFAULT_MIN_WEIGHT_SIZE_TO_COMPRESS = 2048
+_DEFAULT_SPECIFICATION_VERSION_FOR_COMPRESSION = _SPECIFICATION_VERSION_IOS_16
+
+
+def _default_op_selector(const_op):
+    if not isinstance(const_op, _Operation) or const_op.op_type != "const":
+        raise ValueError("Input of the op_selector must be type of const Operation, got {}.".format(type(const_op)))
+    return const_op.val.val.size > _DEFAULT_MIN_WEIGHT_SIZE_TO_COMPRESS
+
+def _apply_graph_pass(mlmodel, graph_pass):
+    """Run the compression ``graph_pass`` on an mlprogram ``mlmodel`` and return the re-converted model.
+
+    Loads the spec into a pymil program first; raises ``TypeError`` for any non-``mlProgram`` model type.
+    """
+    model_spec = mlmodel.get_spec()
+    model_type = model_spec.WhichOneof("Type")
+    if model_type in ("neuralNetwork", "neuralNetworkClassifier", "neuralNetworkRegressor", "pipeline", "PipelineClassifier", "PipelineRegressor"):
+        msg = ("coremltools.compression_utils are meant to be used only with mlprogram typed coreml models. "
+              "This model has type {}. Please use coremltools.models.neural_network.quantization_utils.quantize_weights "
+              "instead to compress the weights of the model.")
+        raise TypeError(msg.format(model_type))
+    if model_type != "mlProgram":
+        raise TypeError("weight compression not applicable for model type {}".format(model_type))
+
+    assert isinstance(graph_pass, _AbstractQuantizationPass), "compression pass must be an AbstractQuantizationPass instance"
+
+    program_spec = model_spec.mlProgram
+    model_specification_version = model_spec.specificationVersion
+    prog = _milproto_to_pymil(
+        program_spec=program_spec,
+        specification_version=model_specification_version,
+        file_weights_dir=mlmodel.weights_dir,
+    )
+
+    prog.skip_all_passes = True
+
+    # apply compression graph pass
+    graph_pass.apply(prog)
+
+    # convert the pymil program back to mlmodel; never go below the minimum spec version used for compression
+    compress_model_specification_version = max(model_specification_version, _DEFAULT_SPECIFICATION_VERSION_FOR_COMPRESSION)
+    compressed_mlmodel = _mil_convert(
+        prog,
+        convert_to="mlprogram",
+        convert_from="milinternal",
+        specification_version=compress_model_specification_version,
+        compute_units=mlmodel.compute_unit,
+        model_description=model_spec.description,
+    )
+    return compressed_mlmodel
+
+def affine_quantize_weights(mlmodel, mode="linear_symmetric", op_selector=None):
+    """
+    Utility function to convert a float precision MLModel of type ``mlprogram`` that uses
+    float-precision weights into a compressed MLModel that uses 8-bit weights. This is
+    achieved by converting the float weight values that are stored in the ``const`` op
+    into the ``constexpr_affine_dequantize`` op.
+    
+    This function uses affine quantization on the float weights, providing up to 2x
+    savings in storage compared to float 16, or up to 4x savings compared to float 32.
+    All computation at runtime uses float precision; the precision of the intermediate
+    tensors and the compute precision of the ops are not altered.
+    
+    For each weight, this utility function converts the weight into the uint8 type using
+    either `Linear interpolation` (``"linear"`` mode) or `Linear symmetric
+    interpolation` (``"linear_symmetric"`` mode, the default).
+    
+    **Linear interpolation**
+    
+    Linear interpolation (``"linear"`` mode) maps the min/max of the float
+    range to the range [0, 255] using a zero point (also called quantization bias, or
+    offset) and a scale factor.
+    
+    ``"linear"`` mode uses the quantization formula ``w_r = s * (w_q - z)``, where:
+    
+        * ``w_r`` and  ``s`` are of type float.
+        * ``w_r`` represents the float precision weight.
+        * ``s`` represents the scale.
+        * ``w_q`` and ``z`` are of type uint8.
+        * ``w_q`` represents quantized weight.
+        * ``z`` represents the zero point.
+    
+    Quantized weights are computed as follows:
+    
+        * ``w_q = cast_to_uint8(w_r / s + cast_to_float(z))``
+        * Note: ``cast_to_uint8`` is the process of clipping the input to range [0, 255]
+          followed by rounding and casting to uint8.
+    
+    In ``"linear"`` mode, ``s, z`` are computed by mapping the original float range
+    ``[A, B]`` into the uint8 range [0, 255]. That is, you are solving the following
+    linear equations:
+    
+        * ``B = s * (255 - z)``
+        * ``A = s * (0 - z)``
+    
+    The equations result in the following:
+    
+        * ``s = (B - A) / 255``
+        * ``z = cast_to_uint8(-255 * A / (B - A))``
+    
+    When the rank of weight ``w`` is 1, then ``s`` and ``z`` are both scalars. When the
+    rank of the weight is greater than 1, then ``s`` and ``z`` are both vectors. In that
+    case, scales are computed "per channel", in which "channel" is the output dimension,
+    which corresponds to the first dimension for ops such as ``conv`` and ``linear``, and
+    the second dimension for the ``conv_transpose`` op.
+    
+    For ``"linear"`` mode, ``A = min(w_r), B = max(w_r)``.
+    
+    **Linear symmetric interpolation**
+    
+    With linear symmetric interpolation (``"linear_symmetric"`` mode, the default), rather than
+    mapping the exact min/max of the float range to the quantized range,
+    the function chooses the maximum absolute value between the min/max, which results in
+    a zero point value of 127. The floating-point range is symmetric with respect to zero,
+    and so is the quantized range.
+    
+    For ``"linear_symmetric"`` mode:
+    
+       * ``A = -R`` and ``B = R``, where ``R = max(abs(w_r))``.
+       * This function maps to the range [0, 254].
+       * The result is ``s=(B-A)/254`` --> ``s=2R/254`` --> ``s=R/127``.
+       * Solving for ``z``: ``z = (R/2R) * 254`` --> ``z=127``.
+
+    Parameters
+    ----------
+    mlmodel: MLModel
+        Model to be quantized. This MLModel should be of type ``mlprogram``.
+
+    mode: str
+        Mode for linear quantization:
+        
+        * ``"linear_symmetric"`` (default): Input data are quantized in the range
+          ``[-R, R]``, where ``R = max(abs(w_r))``.
+        * ``"linear"``: Input data are quantized in the range
+          ``[min(w_r), max(w_r)]``.
+
+    op_selector: callable 
+        This function takes a single parameter with type ``coremltools.converters.mil.Const``;
+        that is, a ``const`` operation. It returns a ``bool``: ``True`` to compress ``const_op``,
+        otherwise ``False``. See the following examples:
+        
+        * All constants in the network are compressed:
+          
+          .. sourcecode:: python
+
+              def op_selector(const_op):
+                    return True
+
+        * Only the constant with ``tensor.size > 2048`` is compressed:
+          
+          .. sourcecode:: python
+
+              def op_selector(const_op): 
+                    return const_op.val.val.size > 2048
+
+        * Compress the constant if it is the weight of a convolution layer
+          and ``tensor.size > 2048``:
+          
+          .. sourcecode:: python
+
+              def op_selector(const_op):
+                    return (const_op.val.val.size > 2048 
+                            and const_op.val.child_ops[0].op_type == "conv" 
+                            and const_op.val == const_op.val.child_ops[0].weight
+                            )
+
+        * When creating a custom ``op_selector`` function, the following attributes are helpful:
+        
+             * ``const_op.val.val``: The numpy array holding the value of the const.
+             * ``const_op.val.child_ops``: A list of ops into which this constant is feeding.
+             * ``const_op.val.child_ops[i].op_type``: The string corresponding to the op type
+               of the i-th child op.
+             * ``const_op.val.child_ops[i].name``: The string corresponding to the name of the
+               i-th child op.
+
+        * If ``op_selector`` is not provided, it will be set to the behavior in which
+          weights bigger than 2048 elements are compressed:
+          
+          .. sourcecode:: python
+
+              def op_selector(const_op):
+                    return const_op.val.val.size > 2048
+
+    Returns
+    -------
+    
+    model: MLModel
+        The quantized MLModel instance.
+
+    Examples
+    --------
+    
+        >>> import coremltools as ct
+        >>> model = ct.models.MLModel('my_model.mlpackage')
+        >>> compressed_model = ct.compression_utils.affine_quantize_weights(model, mode="linear_symmetric")
+
+    """
+    if op_selector is None:
+        op_selector = _default_op_selector
+    affine_weight_quantizer = _WeightAffineQuantizer(fake_compression=False, mode=mode, op_selector=op_selector)
+    return _apply_graph_pass(mlmodel, affine_weight_quantizer)
+
+
+def palettize_weights(mlmodel, nbits=None, mode="kmeans", op_selector=None, lut_function=None):
+    """
+    Utility function to convert a float precision MLModel of type ``mlprogram`` to a
+    compressed MLModel by reducing the overall number of weights using a lookup table
+    (LUT). A LUT contains a list of float values. An `nbit` LUT has 2\\ :sup:`nbits` entries.
+    
+    For example, a float weight vector such as ``{0.3, 0.3, 0.5, 0.5}`` can be compressed
+    using a 1-bit LUT: ``{0.3, 0.5}``. In this case the float vector can be replaced
+    with a 1-bit vector ``{0, 0, 1, 1}``.
+    
+    This function iterates over all the weights in the ``mlprogram``, discretizes its values,
+    and constructs the LUT according to the algorithm specified in ``mode``. The float
+    values are then converted to the `nbit` values, and the LUT is saved alongside each
+    weight. The ``const`` ops storing weight values are replaced by
+    ``constexpr_lut_to_dense`` ops.
+    
+    At runtime, the LUT and the `nbit` values are used to reconstruct the float weight
+    values, which are then used to perform the float operation the weight is feeding into.
+    
+    Consider the following example of ``"uniform"`` mode (a linear histogram):
+    
+        * ``nbits = 2``
+        * ``mode = "uniform"``
+        * ``weight = [0.11, 0.19, 0.3, 0.08, 0.0, 0.02]``
+        
+    The weight can be converted to a palette with indices ``[0, 1, 2, 3]`` (2 bits). The
+    indices are a byte array.
+    
+    The data range ``[0.0, 0.3]`` is divided into 4 partitions linearly, which is
+    ``[0.0, 0.1, 0.2, 0.3]``.
+    
+        * The LUT would be ``[0.0, 0.1, 0.2, 0.3]``.
+     
+        * The weight is rounded to ``[0.1, 0.2, 0.3, 0.1, 0.0, 0.0]``, and represented in
+          the palette as indices ``[01b, 10b, 11b, 01b, 00b, 00b]``.
+
+    Parameters
+    ----------
+    mlmodel: MLModel
+        Model to be converted by a LUT. This MLModel should be of type ``mlprogram``.
+
+    nbits: int
+        Number of bits per weight. Required for ``kmeans`` or ``uniform`` mode, but must
+        not be set for ``unique`` or ``custom`` mode. A LUT would have
+        2\\ :sup:`nbits` entries, where `nbits` can be ``{1, 2, 4, 6, 8}``.
+
+    mode: str
+        Determine how the LUT is constructed by specifying one of the following:
+        
+        * ``"kmeans"`` (default): The LUT is generated by `k-means clustering`, a method of vector
+          quantization that groups similar data points together to discover underlying
+          patterns by using a fixed number (`k`) of clusters in a dataset. A cluster
+          refers to a collection of data points aggregated together because of certain
+          similarities. `nbits` is required.
+
+        * ``"uniform"``: The LUT is generated by a linear histogram.
+        
+           - ``[v_min, v_min + scale, v_min + 2 * scale, ..., v_max]``
+           - Where the weight is in the range ``[v_min, v_max]``, and
+             ``scale = (v_max - v_min) / ((1 << nbits) - 1)``.
+           - ``nbits`` is required.
+           
+           A `histogram` is a representation of the distribution of a continuous variable,
+           in which the entire range of values is divided into a series of intervals (or
+           "bins") and the representation displays how many values fall into each bin.
+           Linear histograms have one bin at even intervals, such as one bin per integer.
+          
+        * ``"unique"``: The LUT is generated by unique values in the weights. The weights
+          are assumed to be on a discrete lattice but stored in a float data type. This
+          parameter identifies the weights and converts them into the palettized representation.
+          
+          Do not provide ``nbits`` for this mode. ``nbits`` is picked up automatically,
+          with the smallest possible value in ``{1, 2, 4, 6, 8}`` such that the
+          number of the unique values is ``<= (1 << nbits)``. If the weight has ``> 256``
+          unique values, the compression is skipped.
+          
+          For example:
+          
+          * If the weights are ``{0.1, 0.2, 0.3, 0.4}`` and ``nbits=2``, the weights are
+            converted to ``{00b, 01b, 10b, 11b}``, and the generated LUT is
+            ``[0.1, 0.2, 0.3, 0.4]``.
+          * If the weights are ``{0.1, 0.2, 0.3, 0.4}`` and ``nbits=1``, nothing happens
+            because the weights are not a 1-bit lattice.
+          * If the weights are ``{0.1, 0.2, 0.3, 0.4, 0.5}`` and ``nbits=2``, nothing
+            happens because the weights are not a 2-bit lattice.
+          
+        * ``"custom"``: The LUT and palettization parameters are calculated using a custom
+          function. If this mode is selected then ``lut_function`` must be provided.
+
+          Do not provide ``nbits`` for this mode. The user should customize ``nbits`` in the 
+          ``lut_function`` implementation.
+
+    op_selector: callable 
+        This function takes a single parameter with type ``coremltools.converters.mil.Operation``.
+        It returns a ``bool``: ``True`` to compress ``const_op``, otherwise ``False``.
+        See the following examples:
+        
+        * All constants in the network are compressed:
+          
+          .. sourcecode:: python
+
+              def op_selector(const_op):
+                    return True
+
+        * Only the constant with ``tensor.size > 2048`` is compressed:
+          
+          .. sourcecode:: python
+
+              def op_selector(const_op): 
+                    return const_op.val.val.size > 2048
+
+        * Compress the constant if it is the weight of a convolution layer
+          and ``tensor.size > 2048``:
+          
+          .. sourcecode:: python
+
+              def op_selector(const_op):
+                    return (const_op.val.val.size > 2048 
+                            and const_op.val.child_ops[0].op_type == "conv" 
+                            and const_op.val == const_op.val.child_ops[0].weight
+                            )
+
+        * When creating a custom ``op_selector`` function, the following attributes are helpful:
+
+             * ``const_op.val.val``: The numpy array holding the value of the const.
+             * ``const_op.val.child_ops``: A list of ops into which this constant is feeding.
+             * ``const_op.val.child_ops[i].op_type``: The string corresponding to the op type
+               of the i-th child op.
+             * ``const_op.val.child_ops[i].name``: The string corresponding to the name of the
+               i-th child op.
+
+        * If ``op_selector`` is not provided, it will be set to the behavior in which
+          weights bigger than 2048 elements are compressed:
+          
+          .. sourcecode:: python
+
+              def op_selector(const_op):
+                    return const_op.val.val.size > 2048
+  
+    lut_function: callable
+        A callable function which computes the weight palettization parameters. This must
+        be provided if the mode is set to ``"custom"``.
+
+        weight: np.ndarray
+            A float precision numpy array.
+
+        Returns: lut: list[float]
+            The lookup table.
+
+        indices: list[int]
+            A list of indices for each element.
+
+        The following is an example that extracts the ``top_k`` elements as the LUT. Given
+        that ``weight = [0.1, 0.5, 0.3, 0.3, 0.5, 0.6, 0.7]``, the ``lut_function``
+        produces ``lut = [0, 0.5, 0.6, 0.7], indices = [0, 1, 0, 0, 1, 2, 3]``.
+          
+        .. sourcecode:: python
+
+           def lut_function(weight):
+                # In this example, we assume elements in the weights >= 0
+                weight = weight.flatten()
+                nbits = 4
+
+                # Get the LUT, from extracting top k maximum unique elements in the weight to be the LUT
+                # Note that k = (1 << nbits) - 1, so we have the first element be 0
+                unique_elements = np.unique(weight)
+                k = (1 << nbits) - 1
+                top_k = np.partition(weight, -k)[-k:]
+                top_k = np.sort(top_k)
+                lut = [0.] + top_k.tolist()
+
+                # Compute the indices
+                mapping = {v: idx for idx, v in enumerate(lut)}
+                indices = [mapping[v] if v in mapping else 0 for v in weight]
+
+                return lut, indices
+
+    Returns
+    -------
+    model: MLModel
+        The palettized MLModel instance.
+
+    Examples
+    --------
+    
+    .. sourcecode:: python
+
+        >>> import coremltools as ct
+        >>> model = ct.models.MLModel('my_model.mlpackage')
+        >>> compressed_model = ct.compression_utils.palettize_weights(model, mode="kmeans", nbits=4)
+    
+    
+    """
+    if op_selector is None:
+        op_selector = _default_op_selector        
+    weight_palettizer = _WeightPalettizer(nbits=nbits, fake_compression=False, op_selector=op_selector, mode=mode, lut_function=lut_function)
+    return _apply_graph_pass(mlmodel, weight_palettizer)
+    
+
+def sparsify_weights(mlmodel, mode="threshold_based", threshold=1e-3, target_percentile=1.0, op_selector=None):
+    """
+    Utility function to convert a float precision MLModel of type ``mlprogram`` to a
+    compressed MLModel using sparse representation. The ``const`` ops storing weight
+    values are replaced by ``constexpr_sparse_to_dense`` ops.
+    
+    This function is useful if the model is trained with pruning techniques so that
+    a lot of weights have zero values. If a large percentage of weight values are zero,
+    a sparse representation is more efficient than a dense one (the default).
+    
+    The sparsified weights are stored in a bit mask. If the weight values are
+    ``{0, 0, 0, 0, 0, 0, 0, 56.3}``, its sparse representation contains a bit mask with
+    ones on locations where the value is non-zero: ``00000001b``. This is accompanied by
+    non-zero data, which is a size-1 vector of value ``{56.3}``.
+
+    For example, given the following:
+    
+        * ``weight = [0.3, 0, 0, 0.5, 0, 0]``
+        * ``non_zero_data, bit_mask = sparsify(weight)``
+    
+    The resulting sparse representation is:
+    
+        * ``non_zero_data = [0.3, 0.5]``
+        * ``bit_mask = "100100"``
+
+    Parameters
+    ----------
+    mlmodel: MLModel
+        Model to be sparsified. This MLModel should be of type ``mlprogram``.
+
+    mode: str
+        Determine the scheme to sparsify the model by specifying one of the following:
+        
+        * ``"threshold_based"`` (default): All the absolute weight values that are smaller
+          than ``threshold`` are changed to 0, and the tensor is stored in a sparse format.
+          For example, given the following:
+
+               * ``weight = [0.3, -0.2, -0.01, 0.05]``
+               * ``threshold = 0.03``
+
+          The sparsified weight would be ``[0.3, -0.2, 0, 0.05]``.
+
+        * ``"percentile_based"``: Sparsify the weight with a constant sparsity percentile,
+          which is ``target_percentile``. Where 
+          ``n = floor(size_of_weight_tensor * target_percentile)``, the ``n`` lowest
+          absolute weight values are changed to 0. For example, given the following:
+
+               * ``weight = [0.3, -0.2, -0.01, 0.05]``
+               * ``target_percentile = 0.75``
+
+          The sparsified weight would be ``[0.3, 0, 0, 0]``.
+
+    threshold: float
+        Required when ``mode = "threshold_based"``. The absolute threshold to sparsify the weight.
+
+    target_percentile: float
+        Required when ``mode = "percentile_based"``. The percentage of sparsity for
+        compression, which needs to be in the range [0, 1]. When 0, no sparsification
+        occurs. For 1, all weights become 0.
+
+    op_selector: callable 
+        This function takes a single parameter with type ``coremltools.converters.mil.Operation``.
+        It returns a ``bool``: ``True`` to compress ``const_op``, otherwise ``False``.
+        See the following examples:
+        
+        * All constants in the network are compressed:
+          
+          .. sourcecode:: python
+
+              def op_selector(const_op):
+                    return True
+
+        * Only the constant with ``tensor.size > 2048`` is compressed:
+          
+          .. sourcecode:: python
+
+              def op_selector(const_op): 
+                    return const_op.val.val.size > 2048
+
+        * Compress the constant if it is the weight of a convolution layer
+          and ``tensor.size > 2048``:
+          
+          .. sourcecode:: python
+
+              def op_selector(const_op):
+                    return (const_op.val.val.size > 2048 
+                            and const_op.val.child_ops[0].op_type == "conv" 
+                            and const_op.val == const_op.val.child_ops[0].weight
+                            )
+
+        * When creating a custom ``op_selector`` function, the following attributes are helpful:
+        
+             * ``const_op.val.val``: The numpy array holding the value of the const.
+             * ``const_op.val.child_ops``: A list of ops into which this constant is feeding.
+             * ``const_op.val.child_ops[i].op_type``: The string corresponding to the op type
+               of the i-th child op.
+             * ``const_op.val.child_ops[i].name``: The string corresponding to the name of the
+               i-th child op.
+
+        * If ``op_selector`` is not provided, it will be set to the behavior in which
+          weights bigger than 2048 elements are compressed:
+          
+          .. sourcecode:: python
+
+              def op_selector(const_op):
+                    return const_op.val.val.size > 2048
+  
+    Returns
+    -------
+    model: MLModel
+        The sparse MLModel instance.
+
+    Examples
+    --------
+    .. sourcecode:: python
+
+        >>> import coremltools as ct
+        >>> model = ct.models.MLModel('my_model.mlpackage')
+        >>> compressed_model = ct.compression_utils.sparsify_weights(model, mode="threshold_based", threshold=0.01)
+
+    """
+    if op_selector is None:
+        op_selector = _default_op_selector
+    weight_sparsifier = _WeightSparsifier(mode=mode, threshold=threshold, target_percentile=target_percentile, op_selector=op_selector)
+    return _apply_graph_pass(mlmodel, weight_sparsifier)
+
+def decompress_weights(mlmodel):
+    """
+    Utility function to convert weights that are sparse or palettized or affine quantized, back to the float format.
+    That is, convert any of the following three ops:
+    
+    (1) constexpr_affine_dequantize
+    (2) constexpr_lut_to_dense
+    (3) constexpr_sparse_to_dense
+    
+    to mb.const
+
+    Parameters
+    ----------
+    mlmodel: MLModel
+        Model which will be decompressed.
+
+    Returns
+    -------
+    model: MLModel
+        The MLModel with no constexpr ops included.
+
+    Examples
+    --------
+    .. sourcecode:: python
+
+        >>> import coremltools as ct
+        >>> model = ct.models.MLModel('my_compressed_model.mlpackage')
+        >>> decompressed_model = ct.compression_utils.decompress_weights(model)
+
+    """
+    weight_decompressor = _WeightDecompressor(op_selector=lambda op: True)
+    return _apply_graph_pass(mlmodel, weight_decompressor)
diff --git a/coremltools/models/model.py b/coremltools/models/model.py
index d9930db91..b656ed23a 100644
--- a/coremltools/models/model.py
+++ b/coremltools/models/model.py
@@ -4,13 +4,13 @@
 # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
 from copy import deepcopy as _deepcopy
+import numpy as _np
 import os as _os
 import shutil as _shutil
 import tempfile as _tempfile
 import warnings as _warnings
 import numpy as _numpy
 
-
 from ..proto import (
     Model_pb2 as _Model_pb2,
     MIL_pb2 as _MIL_pb2
@@ -91,7 +91,7 @@
 
 
 
-class _FeatureDescription(object):
+class _FeatureDescription:
     def __init__(self, fd_spec):
         self._fd_spec = fd_spec
 
@@ -174,7 +174,7 @@ def _try_get_weights_dir_path(mlpackage_path):
     return weights_dir
 
 
-class MLModel(object):
+class MLModel:
     """
     This class defines the minimal interface to a CoreML object in Python.
 
@@ -233,7 +233,6 @@ class MLModel(object):
     """
 
     def __init__(self, model,
-                 useCPUOnly=False,
                  is_temp_package=False,
                  mil_program=None,
                  skip_model_load=False,
@@ -257,14 +256,6 @@ def __init__(self, model,
             For non mlprogram model types, the model can be a path string (``.mlmodel``) or type ``Model_pb2``,
             i.e. a spec object.
 
-        useCPUOnly: bool
-            This parameter is deprecated and will be removed in 6.0. Use the ``compute_units``
-            parameter instead.
-
-            The ``compute_units`` parameter overrides any usages of this parameter.
-
-            Set to True to restrict loading of the model to only the CPU. Defaults to False.
-
         is_temp_package: bool
             Set to true if the input model package dir is temporary and can be
             deleted upon destruction of this class.
@@ -315,10 +306,6 @@ def __init__(self, model,
         >>> loaded_model = MLModel('my_model.mlmodel')
         >>> loaded_model = MLModel("my_model.mlpackage")
         """
-        if useCPUOnly:
-            _warnings.warn('The "useCPUOnly" parameter is deprecated and will be removed in 6.0. '
-                           'Use the "compute_units" parameter: "compute_units=coremotools.ComputeUnits.CPUOnly".')
-            compute_units = _ComputeUnit.CPU_ONLY
         if not isinstance(compute_units, _ComputeUnit):
             raise TypeError('"compute_units" parameter must be of type: coremltools.ComputeUnit')
         self.compute_unit = compute_units
@@ -332,6 +319,7 @@ def __init__(self, model,
         self._mil_program = mil_program
 
         if isinstance(model, str):
+            model = _os.path.abspath(_os.path.expanduser(_os.path.expandvars(model)))
             if _os.path.isdir(model):
                 self.is_package = True
                 self.package_path = model
@@ -476,7 +464,7 @@ def get_spec(self):
         return _deepcopy(self._spec)
 
 
-    def predict(self, data, useCPUOnly=False):
+    def predict(self, data):
         """
         Return predictions for the model.
 
@@ -487,14 +475,6 @@ def predict(self, data, useCPUOnly=False):
             the names of the input features.
             If value is array type, numpy.ndarray, tensorflow.Tensor and torch.Tensor are acceptable.
 
-        useCPUOnly: bool
-            This parameter is deprecated and will be removed in 6.0. Instead, use the ``compute_units``
-            parameter at load time or conversion time (that is, in
-            `coremltools.models.MLModel() <https://apple.github.io/coremltools/source/coremltools.models.html#module-coremltools.models.model>`_ or
-            `coremltools.convert() <https://apple.github.io/coremltools/source/coremltools.converters.mil.html#module-coremltools.converters._converters_entry>`_).
-
-            Set to True to restrict computation to use only the CPU. Defaults to False.
-
         Returns
         -------
         out: dict[str, value]
@@ -512,11 +492,6 @@ def predict(self, data, useCPUOnly=False):
         >>> data = {'array': tensorflow.Tensor([[1.0, 2.0], [3.0, 4.0]])}
         >>> predictions = model.predict(data)
         """
-        if useCPUOnly:
-            _warnings.warn('The "useCPUOnly" parameter is deprecated and will be removed in 6.0. '
-                           'Please use the "compute_units" parameter at load time or conversion time, '
-                           'i.e. in "coremltools.models.MLModel()" or "coremltools.convert()".')
-
         if self.is_package and _is_macos() and _macos_version() < (12, 0):
             raise Exception(
                 "predict() for .mlpackage is not supported in macOS version older than 12.0."
@@ -529,7 +504,9 @@ def predict(self, data, useCPUOnly=False):
             # return a more verbose error message
             self._verify_input_name_exists(data)
             self._convert_tensor_to_numpy(data)
-            return self.__proxy__.predict(data, useCPUOnly)
+            # TODO: remove the following call when this is fixed: rdar://92239209
+            self._update_float16_multiarray_input_to_float32(data)
+            return self.__proxy__.predict(data)
         else:
             if _macos_version() < (10, 13):
                 raise Exception(
@@ -623,6 +600,10 @@ def _verify_input_name_exists(self, input_dict):
                           "does not match any of the model input name(s), which are: {}"
                 raise KeyError(err_msg.format(given_input, ",".join(model_input_names)))
 
+    def _update_float16_multiarray_input_to_float32(self, input_data):
+        for k, v in input_data.items():
+            if isinstance(v, _np.ndarray) and v.dtype == _np.float16:
+                input_data[k] = v.astype(_np.float32)
 
     def _convert_tensor_to_numpy(self, input_dict):
         def convert(given_input):
@@ -647,3 +628,4 @@ def convert(given_input):
             if not given_input_name in model_input_to_types:
                 continue
             input_dict[given_input_name] = convert(given_input)
+
diff --git a/coremltools/models/nearest_neighbors/builder.py b/coremltools/models/nearest_neighbors/builder.py
index 9db54d9e3..d5ea188e9 100644
--- a/coremltools/models/nearest_neighbors/builder.py
+++ b/coremltools/models/nearest_neighbors/builder.py
@@ -10,7 +10,7 @@
 import coremltools
 
 
-class KNearestNeighborsClassifierBuilder(object):
+class KNearestNeighborsClassifierBuilder:
     """
     Construct a CoreML KNearestNeighborsClassifier specification.
 
@@ -97,8 +97,6 @@ def __init__(
         leaf_size
         	Leaf size for the kd-tree. Ignored if index type is ``'linear'``. Default = 30.
         """
-        super(KNearestNeighborsClassifierBuilder, self).__init__()
-
         self.spec = coremltools.proto.Model_pb2.Model()
         self.spec.specificationVersion = (
             coremltools._MINIMUM_NEAREST_NEIGHBORS_SPEC_VERSION
diff --git a/coremltools/models/neural_network/builder.py b/coremltools/models/neural_network/builder.py
index 63bed6310..a00d2a8b2 100644
--- a/coremltools/models/neural_network/builder.py
+++ b/coremltools/models/neural_network/builder.py
@@ -214,7 +214,7 @@ def _fill_tensor_fields(tensor_field, ranks=None, shapes=None):
                 tensor_field[i].dimValue.append(s)
 
 
-class NeuralNetworkBuilder(object):
+class NeuralNetworkBuilder:
     """
     Neural network builder class to construct Core ML models.
 
diff --git a/coremltools/models/neural_network/flexible_shape_utils.py b/coremltools/models/neural_network/flexible_shape_utils.py
index 20d0fd728..a93680874 100644
--- a/coremltools/models/neural_network/flexible_shape_utils.py
+++ b/coremltools/models/neural_network/flexible_shape_utils.py
@@ -21,7 +21,7 @@
 _CONSTRAINED_KEYS = [_CHANNEL_KEY, _HEIGHT_KEY, _WIDTH_KEY]
 
 
-class Shape(object):
+class Shape:
     def __init__(self, shape_value):
         if shape_value < 1:
             raise Exception("Invalid value. Size/Shape values must be > 0")
@@ -112,7 +112,7 @@ def height(self):
         return self._height.value
 
 
-class ShapeRange(object):
+class ShapeRange:
     def __init__(self, lowerBound, upperBound):
         unBounded = False
 
diff --git a/coremltools/models/neural_network/quantization_utils.py b/coremltools/models/neural_network/quantization_utils.py
index 85e514f8c..36351566d 100644
--- a/coremltools/models/neural_network/quantization_utils.py
+++ b/coremltools/models/neural_network/quantization_utils.py
@@ -7,13 +7,15 @@
 Utilities to compress Neural Network Models.
 Only available in coremltools 2.0b1 and onwards
 """
-
-import numpy as _np
 from sys import stdout as _stdout
 from os import listdir as _listdir
-from .optimization_utils import _optimize_nn
 
+import numpy as _np
+
+from .optimization_utils import _optimize_nn
+from coremltools import ComputeUnit as _ComputeUnit
 from coremltools.models import (
+    _LUT_BASED_QUANTIZATION,
     _SUPPORTED_QUANTIZATION_MODES,
     _QUANTIZATION_MODE_DEQUANTIZE,
     _QUANTIZATION_MODE_LOOKUP_TABLE_LINEAR,
@@ -21,7 +23,7 @@
     _QUANTIZATION_MODE_CUSTOM_LOOKUP_TABLE,
     _QUANTIZATION_MODE_LINEAR_QUANTIZATION,
     _QUANTIZATION_MODE_LINEAR_SYMMETRIC,
-    _LUT_BASED_QUANTIZATION,
+    MLModel as _MLModel,
 )
 
 from ..utils import _get_nn_layers, _wp_to_fp16wp, _get_model, _macos_version
@@ -33,8 +35,9 @@
 )
 
 
-class QuantizedLayerSelector(object):
-    """ This is the base class to implement custom selectors to skip certain
+class QuantizedLayerSelector:
+    """
+    This is the base class to implement custom selectors to skip certain
     layers during quantization. To implement a custom selector, create a class
     that inherits this class and override `do_quantize()` method.
 
@@ -45,10 +48,10 @@ class QuantizedLayerSelector(object):
 
         class MyLayerSelector(QuantizedLayerSelector):
             def __init__(self):
-                super(MyLayerSelector, self).__init__()
+                super().__init__()
 
             def do_quantize(self, layer, **kwargs):
-                ret = super(MyLayerSelector, self).do_quantize(layer)
+                ret = super().do_quantize(layer)
                 if not ret or layer.name == 'dense_2':
                     return False
                 return True
@@ -108,7 +111,7 @@ def __init__(
         minimum_conv_weight_count=4096,
     ):
 
-        super(AdvancedQuantizedLayerSelector, self).__init__()
+        super().__init__()
         self.skip_layer_types = skip_layer_types
 
         # Error checking
@@ -131,7 +134,7 @@ def __init__(
     def do_quantize(self, layer, weight_param=None):
         """ weight_param - should be name of the WeightParam field
         """
-        ret = super(AdvancedQuantizedLayerSelector, self).do_quantize(layer)
+        ret = super().do_quantize(layer)
         if not ret:
             return False
 
@@ -180,11 +183,11 @@ def do_quantize(self, layer, weight_param=None):
 
 class MatrixMultiplyLayerSelector(QuantizedLayerSelector):
     """
-        Layer selector object that allows users to select matrix multiplication layers
-        with one of the matrices being constant, based on some criterions like total
-        numbers of parameters/weights, number of input or output channels and/or layer
-        names. If any of the criterion is not valid, the corresponding layer is not
-        selected.
+    Layer selector object that allows users to select matrix multiplication layers
+    with one of the matrices being constant, based on some criterions like total
+    numbers of parameters/weights, number of input or output channels and/or layer
+    names. If any of the criterion is not valid, the corresponding layer is not
+    selected.
     """
 
     def __init__(
@@ -197,7 +200,7 @@ def __init__(
         include_layers_with_names=None,
     ):
 
-        super(MatrixMultiplyLayerSelector, self).__init__()
+        super().__init__()
 
         # weight count refers to number of parameters/weights and is equal to product of input & output channels
         self.minimum_weight_count = minimum_weight_count
@@ -219,9 +222,10 @@ def __init__(
             )
 
     def do_quantize(self, layer, weight_param=None):
-        """ weight_param - should be name of the WeightParam field
         """
-        ret = super(MatrixMultiplyLayerSelector, self).do_quantize(layer)
+        weight_param - should be name of the WeightParam field
+        """
+        ret = super().do_quantize(layer)
         if not ret:
             return False
 
@@ -690,13 +694,15 @@ def _dequantize_wp(wp, shape, axis=0):
 
 
 def _dequantize_nn_spec(spec):
-    """ Dequantize weights in NeuralNetwork type mlmodel specifications.
+    """
+    Dequantize weights in NeuralNetwork type mlmodel specifications.
     """
     _quantize_nn_spec(spec, None, _QUANTIZATION_MODE_DEQUANTIZE)
 
 
 def _quantize_nn_spec(nn_spec, nbits, qm, **kwargs):
-    """ Quantize weights in NeuralNetwork type mlmodel specifications.
+    """
+    Quantize weights in NeuralNetwork type mlmodel specifications.
     """
     selector = kwargs.get("selector", QuantizedLayerSelector())
 
@@ -1288,7 +1294,7 @@ def _characterize_qmodel_perf_with_data_dir(fpmodel, qspec, data_dir):
             )
         )
 
-    qmodel = _get_model(qspec)
+    qmodel = _get_model(qspec, compute_units=_ComputeUnit.CPU_ONLY)
     model_metrics = ModelMetrics(qspec)
 
     input_name = qspec.description.input[0].name
@@ -1304,11 +1310,13 @@ def _characterize_qmodel_perf_with_data_dir(fpmodel, qspec, data_dir):
 
     analyzed = 0
     tried = 0
+    if fpmodel.compute_unit != _ComputeUnit.CPU_ONLY:
+        fpmodel = _MLModel(fpmodel.get_spec(), compute_units=_ComputeUnit.CPU_ONLY)
     for image in test_image_paths:
         try:
             input = {input_name: _load_and_resize_image(image, input_size)}
-            fp_pred = fpmodel.predict(input, useCPUOnly=True)
-            q_pred = qmodel.predict(input, useCPUOnly=True)
+            fp_pred = fpmodel.predict(input)
+            q_pred = qmodel.predict(input)
             analyzed += 1
             model_metrics.add_metrics(fp_pred, q_pred)
 
@@ -1338,10 +1346,12 @@ def _characterize_quantized_model_perf(fpmodel, qspec, sample_data):
 
     analyzed = 0
     tried = 0
+    fpmodel = _MLModel(fpmodel.get_spec(), compute_units=_ComputeUnit.CPU_ONLY)
+    qmodel =  _MLModel(qmodel.get_spec(), compute_units=_ComputeUnit.CPU_ONLY)
     for data in sample_data:
         try:
-            fp_pred = fpmodel.predict(data, useCPUOnly=True)
-            q_pred = qmodel.predict(data, useCPUOnly=True)
+            fp_pred = fpmodel.predict(data)
+            q_pred = qmodel.predict(data)
             analyzed += 1
             model_metrics.add_metrics(fp_pred, q_pred)
 
diff --git a/coremltools/models/neural_network/update_optimizer_utils.py b/coremltools/models/neural_network/update_optimizer_utils.py
index e93c5ca9a..760946ea6 100644
--- a/coremltools/models/neural_network/update_optimizer_utils.py
+++ b/coremltools/models/neural_network/update_optimizer_utils.py
@@ -8,7 +8,7 @@
 """
 
 
-class AdamParams(object):
+class AdamParams:
     """
     Adam - A Method for Stochastic Optimization.
 
@@ -82,7 +82,7 @@ def eps(self):
         return self._eps
 
 
-class SgdParams(object):
+class SgdParams:
     """
     SGD - Stochastic Gradient Descent optimizer.
 
diff --git a/coremltools/models/pipeline.py b/coremltools/models/pipeline.py
index c8aad77a4..1f9105d7a 100644
--- a/coremltools/models/pipeline.py
+++ b/coremltools/models/pipeline.py
@@ -10,13 +10,15 @@
 from ..proto import Model_pb2 as _Model_pb2
 from . import _feature_management
 from . import model as _model
+from ._interface_management import (
+    set_classifier_interface_params,
+    set_regressor_interface_params,
+    set_training_features,
+    set_transform_interface_params,
+)
 
-from ._interface_management import set_regressor_interface_params
-from ._interface_management import set_classifier_interface_params
-from ._interface_management import set_transform_interface_params, set_training_features
 
-
-class Pipeline(object):
+class Pipeline:
     """
     A pipeline model that exposes a sequence of models as a single model,
     It requires a set of inputs, a sequence of other models and a set of outputs.
@@ -181,9 +183,7 @@ def add_model(self, spec):
             A protobuf spec or MLModel instance containing a model.
         """
 
-        super(PipelineRegressor, self)._validate_updatable_pipeline_on_add_model(
-            self.spec
-        )
+        super()._validate_updatable_pipeline_on_add_model(self.spec)
 
         if isinstance(spec, _model.MLModel):
             spec = spec._spec
@@ -193,7 +193,7 @@ def add_model(self, spec):
         step_spec.CopyFrom(spec)
 
     def make_updatable(self):
-        super(PipelineRegressor, self)._validate_sub_models_and_make_updatable(
+        super()._validate_sub_models_and_make_updatable(
             self.spec.pipelineRegressor.pipeline, self.spec
         )
 
@@ -281,9 +281,7 @@ def add_model(self, spec):
             A protobuf spec or MLModel instance containing a model.
         """
 
-        super(PipelineClassifier, self)._validate_updatable_pipeline_on_add_model(
-            self.spec
-        )
+        super()._validate_updatable_pipeline_on_add_model(self.spec)
 
         if isinstance(spec, _model.MLModel):
             spec = spec._spec
diff --git a/coremltools/models/tree_ensemble.py b/coremltools/models/tree_ensemble.py
index f4751a1d3..0446f0b11 100644
--- a/coremltools/models/tree_ensemble.py
+++ b/coremltools/models/tree_ensemble.py
@@ -6,17 +6,20 @@
 """
 Tree ensemble builder class to construct CoreML models.
 """
+import collections as _collections
+
 from .. import SPECIFICATION_VERSION as _SPECIFICATION_VERSION
 from ..proto import Model_pb2 as _Model_pb2
 from ..proto import TreeEnsemble_pb2 as _TreeEnsemble_pb2
 from ..proto import FeatureTypes_pb2 as _FeatureTypes_pb2
 
-from ._interface_management import set_regressor_interface_params
-from ._interface_management import set_classifier_interface_params
-import collections as _collections
+from ._interface_management import (
+    set_classifier_interface_params,
+    set_regressor_interface_params
+)
 
 
-class TreeEnsembleBase(object):
+class TreeEnsembleBase:
     """
     Base class for the tree ensemble builder class.  This should be instantiated
     either through the :py:class:`TreeEnsembleRegressor` or
@@ -329,7 +332,7 @@ def __init__(self, features, target):
         target:  (default = None)
            Name of the target feature predicted.
         """
-        super(TreeEnsembleRegressor, self).__init__()
+        super().__init__()
         spec = self.spec
         spec = set_regressor_interface_params(spec, features, target)
         self.tree_spec = spec.treeEnsembleRegressor
@@ -418,7 +421,7 @@ def __init__(self, features, class_labels, output_features):
             is a string, it specifies the predicted class label and the class
             scores is set to the default value of ``"classProbability"``.
         """
-        super(TreeEnsembleClassifier, self).__init__()
+        super().__init__()
         spec = self.spec
         spec = set_classifier_interface_params(
             spec, features, class_labels, "treeEnsembleClassifier", output_features
diff --git a/coremltools/proto/FeatureTypes_pb2.py b/coremltools/proto/FeatureTypes_pb2.py
index 2b70cb1a5..ef54f1120 100644
--- a/coremltools/proto/FeatureTypes_pb2.py
+++ b/coremltools/proto/FeatureTypes_pb2.py
@@ -19,7 +19,7 @@
   name='FeatureTypes.proto',
   package='CoreML.Specification',
   syntax='proto3',
-  serialized_pb=_b('\n\x12\x46\x65\x61tureTypes.proto\x12\x14\x43oreML.Specification\"\x12\n\x10Int64FeatureType\"\x13\n\x11\x44oubleFeatureType\"\x13\n\x11StringFeatureType\"3\n\tSizeRange\x12\x12\n\nlowerBound\x18\x01 \x01(\x04\x12\x12\n\nupperBound\x18\x02 \x01(\x03\"\xfe\x04\n\x10ImageFeatureType\x12\r\n\x05width\x18\x01 \x01(\x03\x12\x0e\n\x06height\x18\x02 \x01(\x03\x12V\n\x0f\x65numeratedSizes\x18\x15 \x01(\x0b\x32;.CoreML.Specification.ImageFeatureType.EnumeratedImageSizesH\x00\x12O\n\x0eimageSizeRange\x18\x1f \x01(\x0b\x32\x35.CoreML.Specification.ImageFeatureType.ImageSizeRangeH\x00\x12\x45\n\ncolorSpace\x18\x03 \x01(\x0e\x32\x31.CoreML.Specification.ImageFeatureType.ColorSpace\x1a*\n\tImageSize\x12\r\n\x05width\x18\x01 \x01(\x04\x12\x0e\n\x06height\x18\x02 \x01(\x04\x1aW\n\x14\x45numeratedImageSizes\x12?\n\x05sizes\x18\x01 \x03(\x0b\x32\x30.CoreML.Specification.ImageFeatureType.ImageSize\x1a{\n\x0eImageSizeRange\x12\x33\n\nwidthRange\x18\x01 \x01(\x0b\x32\x1f.CoreML.Specification.SizeRange\x12\x34\n\x0bheightRange\x18\x02 \x01(\x0b\x32\x1f.CoreML.Specification.SizeRange\"F\n\nColorSpace\x12\x17\n\x13INVALID_COLOR_SPACE\x10\x00\x12\r\n\tGRAYSCALE\x10\n\x12\x07\n\x03RGB\x10\x14\x12\x07\n\x03\x42GR\x10\x1e\x42\x11\n\x0fSizeFlexibility\"\x8e\x05\n\x10\x41rrayFeatureType\x12\r\n\x05shape\x18\x01 \x03(\x03\x12\x46\n\x08\x64\x61taType\x18\x02 \x01(\x0e\x32\x34.CoreML.Specification.ArrayFeatureType.ArrayDataType\x12S\n\x10\x65numeratedShapes\x18\x15 \x01(\x0b\x32\x37.CoreML.Specification.ArrayFeatureType.EnumeratedShapesH\x00\x12G\n\nshapeRange\x18\x1f \x01(\x0b\x32\x31.CoreML.Specification.ArrayFeatureType.ShapeRangeH\x00\x12\x19\n\x0fintDefaultValue\x18) \x01(\x05H\x01\x12\x1b\n\x11\x66loatDefaultValue\x18\x33 \x01(\x02H\x01\x12\x1c\n\x12\x64oubleDefaultValue\x18= \x01(\x01H\x01\x1a\x16\n\x05Shape\x12\r\n\x05shape\x18\x01 \x03(\x03\x1aP\n\x10\x45numeratedShapes\x12<\n\x06shapes\x18\x01 
\x03(\x0b\x32,.CoreML.Specification.ArrayFeatureType.Shape\x1a\x41\n\nShapeRange\x12\x33\n\nsizeRanges\x18\x01 \x03(\x0b\x32\x1f.CoreML.Specification.SizeRange\"V\n\rArrayDataType\x12\x1b\n\x17INVALID_ARRAY_DATA_TYPE\x10\x00\x12\r\n\x07\x46LOAT32\x10\xa0\x80\x04\x12\x0c\n\x06\x44OUBLE\x10\xc0\x80\x04\x12\x0b\n\x05INT32\x10\xa0\x80\x08\x42\x12\n\x10ShapeFlexibilityB\x16\n\x14\x64\x65\x66\x61ultOptionalValue\"\xa4\x01\n\x15\x44ictionaryFeatureType\x12>\n\x0cint64KeyType\x18\x01 \x01(\x0b\x32&.CoreML.Specification.Int64FeatureTypeH\x00\x12@\n\rstringKeyType\x18\x02 \x01(\x0b\x32\'.CoreML.Specification.StringFeatureTypeH\x00\x42\t\n\x07KeyType\"\xcd\x01\n\x13SequenceFeatureType\x12;\n\tint64Type\x18\x01 \x01(\x0b\x32&.CoreML.Specification.Int64FeatureTypeH\x00\x12=\n\nstringType\x18\x03 \x01(\x0b\x32\'.CoreML.Specification.StringFeatureTypeH\x00\x12\x32\n\tsizeRange\x18\x65 \x01(\x0b\x32\x1f.CoreML.Specification.SizeRangeB\x06\n\x04Type\"\xee\x03\n\x0b\x46\x65\x61tureType\x12;\n\tint64Type\x18\x01 \x01(\x0b\x32&.CoreML.Specification.Int64FeatureTypeH\x00\x12=\n\ndoubleType\x18\x02 \x01(\x0b\x32\'.CoreML.Specification.DoubleFeatureTypeH\x00\x12=\n\nstringType\x18\x03 \x01(\x0b\x32\'.CoreML.Specification.StringFeatureTypeH\x00\x12;\n\timageType\x18\x04 \x01(\x0b\x32&.CoreML.Specification.ImageFeatureTypeH\x00\x12@\n\x0emultiArrayType\x18\x05 \x01(\x0b\x32&.CoreML.Specification.ArrayFeatureTypeH\x00\x12\x45\n\x0e\x64ictionaryType\x18\x06 \x01(\x0b\x32+.CoreML.Specification.DictionaryFeatureTypeH\x00\x12\x41\n\x0csequenceType\x18\x07 \x01(\x0b\x32).CoreML.Specification.SequenceFeatureTypeH\x00\x12\x13\n\nisOptional\x18\xe8\x07 \x01(\x08\x42\x06\n\x04TypeB\x02H\x03\x62\x06proto3')
+  serialized_pb=_b('\n\x12\x46\x65\x61tureTypes.proto\x12\x14\x43oreML.Specification\"\x12\n\x10Int64FeatureType\"\x13\n\x11\x44oubleFeatureType\"\x13\n\x11StringFeatureType\"3\n\tSizeRange\x12\x12\n\nlowerBound\x18\x01 \x01(\x04\x12\x12\n\nupperBound\x18\x02 \x01(\x03\"\x95\x05\n\x10ImageFeatureType\x12\r\n\x05width\x18\x01 \x01(\x03\x12\x0e\n\x06height\x18\x02 \x01(\x03\x12V\n\x0f\x65numeratedSizes\x18\x15 \x01(\x0b\x32;.CoreML.Specification.ImageFeatureType.EnumeratedImageSizesH\x00\x12O\n\x0eimageSizeRange\x18\x1f \x01(\x0b\x32\x35.CoreML.Specification.ImageFeatureType.ImageSizeRangeH\x00\x12\x45\n\ncolorSpace\x18\x03 \x01(\x0e\x32\x31.CoreML.Specification.ImageFeatureType.ColorSpace\x1a*\n\tImageSize\x12\r\n\x05width\x18\x01 \x01(\x04\x12\x0e\n\x06height\x18\x02 \x01(\x04\x1aW\n\x14\x45numeratedImageSizes\x12?\n\x05sizes\x18\x01 \x03(\x0b\x32\x30.CoreML.Specification.ImageFeatureType.ImageSize\x1a{\n\x0eImageSizeRange\x12\x33\n\nwidthRange\x18\x01 \x01(\x0b\x32\x1f.CoreML.Specification.SizeRange\x12\x34\n\x0bheightRange\x18\x02 \x01(\x0b\x32\x1f.CoreML.Specification.SizeRange\"]\n\nColorSpace\x12\x17\n\x13INVALID_COLOR_SPACE\x10\x00\x12\r\n\tGRAYSCALE\x10\n\x12\x07\n\x03RGB\x10\x14\x12\x07\n\x03\x42GR\x10\x1e\x12\x15\n\x11GRAYSCALE_FLOAT16\x10(B\x11\n\x0fSizeFlexibility\"\x9d\x05\n\x10\x41rrayFeatureType\x12\r\n\x05shape\x18\x01 \x03(\x03\x12\x46\n\x08\x64\x61taType\x18\x02 \x01(\x0e\x32\x34.CoreML.Specification.ArrayFeatureType.ArrayDataType\x12S\n\x10\x65numeratedShapes\x18\x15 \x01(\x0b\x32\x37.CoreML.Specification.ArrayFeatureType.EnumeratedShapesH\x00\x12G\n\nshapeRange\x18\x1f \x01(\x0b\x32\x31.CoreML.Specification.ArrayFeatureType.ShapeRangeH\x00\x12\x19\n\x0fintDefaultValue\x18) \x01(\x05H\x01\x12\x1b\n\x11\x66loatDefaultValue\x18\x33 \x01(\x02H\x01\x12\x1c\n\x12\x64oubleDefaultValue\x18= \x01(\x01H\x01\x1a\x16\n\x05Shape\x12\r\n\x05shape\x18\x01 \x03(\x03\x1aP\n\x10\x45numeratedShapes\x12<\n\x06shapes\x18\x01 
\x03(\x0b\x32,.CoreML.Specification.ArrayFeatureType.Shape\x1a\x41\n\nShapeRange\x12\x33\n\nsizeRanges\x18\x01 \x03(\x0b\x32\x1f.CoreML.Specification.SizeRange\"e\n\rArrayDataType\x12\x1b\n\x17INVALID_ARRAY_DATA_TYPE\x10\x00\x12\r\n\x07\x46LOAT32\x10\xa0\x80\x04\x12\x0c\n\x06\x44OUBLE\x10\xc0\x80\x04\x12\x0b\n\x05INT32\x10\xa0\x80\x08\x12\r\n\x07\x46LOAT16\x10\x90\x80\x04\x42\x12\n\x10ShapeFlexibilityB\x16\n\x14\x64\x65\x66\x61ultOptionalValue\"\xa4\x01\n\x15\x44ictionaryFeatureType\x12>\n\x0cint64KeyType\x18\x01 \x01(\x0b\x32&.CoreML.Specification.Int64FeatureTypeH\x00\x12@\n\rstringKeyType\x18\x02 \x01(\x0b\x32\'.CoreML.Specification.StringFeatureTypeH\x00\x42\t\n\x07KeyType\"\xcd\x01\n\x13SequenceFeatureType\x12;\n\tint64Type\x18\x01 \x01(\x0b\x32&.CoreML.Specification.Int64FeatureTypeH\x00\x12=\n\nstringType\x18\x03 \x01(\x0b\x32\'.CoreML.Specification.StringFeatureTypeH\x00\x12\x32\n\tsizeRange\x18\x65 \x01(\x0b\x32\x1f.CoreML.Specification.SizeRangeB\x06\n\x04Type\"\xee\x03\n\x0b\x46\x65\x61tureType\x12;\n\tint64Type\x18\x01 \x01(\x0b\x32&.CoreML.Specification.Int64FeatureTypeH\x00\x12=\n\ndoubleType\x18\x02 \x01(\x0b\x32\'.CoreML.Specification.DoubleFeatureTypeH\x00\x12=\n\nstringType\x18\x03 \x01(\x0b\x32\'.CoreML.Specification.StringFeatureTypeH\x00\x12;\n\timageType\x18\x04 \x01(\x0b\x32&.CoreML.Specification.ImageFeatureTypeH\x00\x12@\n\x0emultiArrayType\x18\x05 \x01(\x0b\x32&.CoreML.Specification.ArrayFeatureTypeH\x00\x12\x45\n\x0e\x64ictionaryType\x18\x06 \x01(\x0b\x32+.CoreML.Specification.DictionaryFeatureTypeH\x00\x12\x41\n\x0csequenceType\x18\x07 \x01(\x0b\x32).CoreML.Specification.SequenceFeatureTypeH\x00\x12\x13\n\nisOptional\x18\xe8\x07 \x01(\x08\x42\x06\n\x04TypeB\x02H\x03\x62\x06proto3')
 )
 
 
@@ -46,11 +46,15 @@
       name='BGR', index=3, number=30,
       options=None,
       type=None),
+    _descriptor.EnumValueDescriptor(
+      name='GRAYSCALE_FLOAT16', index=4, number=40,
+      options=None,
+      type=None),
   ],
   containing_type=None,
   options=None,
   serialized_start=709,
-  serialized_end=779,
+  serialized_end=802,
 )
 _sym_db.RegisterEnumDescriptor(_IMAGEFEATURETYPE_COLORSPACE)
 
@@ -76,11 +80,15 @@
       name='INT32', index=3, number=131104,
       options=None,
       type=None),
+    _descriptor.EnumValueDescriptor(
+      name='FLOAT16', index=4, number=65552,
+      options=None,
+      type=None),
   ],
   containing_type=None,
   options=None,
-  serialized_start=1325,
-  serialized_end=1411,
+  serialized_start=1348,
+  serialized_end=1449,
 )
 _sym_db.RegisterEnumDescriptor(_ARRAYFEATURETYPE_ARRAYDATATYPE)
 
@@ -358,7 +366,7 @@
       index=0, containing_type=None, fields=[]),
   ],
   serialized_start=160,
-  serialized_end=798,
+  serialized_end=821,
 )
 
 
@@ -388,8 +396,8 @@
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=1152,
-  serialized_end=1174,
+  serialized_start=1175,
+  serialized_end=1197,
 )
 
 _ARRAYFEATURETYPE_ENUMERATEDSHAPES = _descriptor.Descriptor(
@@ -418,8 +426,8 @@
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=1176,
-  serialized_end=1256,
+  serialized_start=1199,
+  serialized_end=1279,
 )
 
 _ARRAYFEATURETYPE_SHAPERANGE = _descriptor.Descriptor(
@@ -448,8 +456,8 @@
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=1258,
-  serialized_end=1323,
+  serialized_start=1281,
+  serialized_end=1346,
 )
 
 _ARRAYFEATURETYPE = _descriptor.Descriptor(
@@ -527,8 +535,8 @@
       name='defaultOptionalValue', full_name='CoreML.Specification.ArrayFeatureType.defaultOptionalValue',
       index=1, containing_type=None, fields=[]),
   ],
-  serialized_start=801,
-  serialized_end=1455,
+  serialized_start=824,
+  serialized_end=1493,
 )
 
 
@@ -568,8 +576,8 @@
       name='KeyType', full_name='CoreML.Specification.DictionaryFeatureType.KeyType',
       index=0, containing_type=None, fields=[]),
   ],
-  serialized_start=1458,
-  serialized_end=1622,
+  serialized_start=1496,
+  serialized_end=1660,
 )
 
 
@@ -616,8 +624,8 @@
       name='Type', full_name='CoreML.Specification.SequenceFeatureType.Type',
       index=0, containing_type=None, fields=[]),
   ],
-  serialized_start=1625,
-  serialized_end=1830,
+  serialized_start=1663,
+  serialized_end=1868,
 )
 
 
@@ -699,8 +707,8 @@
       name='Type', full_name='CoreML.Specification.FeatureType.Type',
       index=0, containing_type=None, fields=[]),
   ],
-  serialized_start=1833,
-  serialized_end=2327,
+  serialized_start=1871,
+  serialized_end=2365,
 )
 
 _IMAGEFEATURETYPE_IMAGESIZE.containing_type = _IMAGEFEATURETYPE
diff --git a/coremltools/test/api/test_api_examples.py b/coremltools/test/api/test_api_examples.py
index f66ac7bb1..9177ca981 100644
--- a/coremltools/test/api/test_api_examples.py
+++ b/coremltools/test/api/test_api_examples.py
@@ -5,6 +5,7 @@
 
 import copy
 from io import BytesIO
+import itertools
 import numpy as np
 import os
 from os import getcwd, chdir
@@ -20,6 +21,7 @@
 from coremltools.converters.mil.frontend.torch.test.testing_utils import _copy_input_data
 from coremltools.converters.mil.mil import get_new_symbol, Program, Function
 from coremltools.converters.mil.testing_utils import get_op_types_in_program
+from coremltools.proto import FeatureTypes_pb2 as ft
 
 from coremltools._deps import (
     _HAS_TF_1,
@@ -61,12 +63,12 @@ def test_convert_from_frozen_graph(tmpdir):
             x = tf.placeholder(tf.float32, shape=(1, 2, 3), name="input")
             y = tf.nn.relu(x, name="output")
 
-        mlmodel = ct.convert(graph)
+        mlmodel = ct.convert(graph, compute_units=ct.ComputeUnit.CPU_ONLY)
 
         test_input = np.random.rand(1, 2, 3) - 0.5
         with tf.compat.v1.Session(graph=graph) as sess:
             expected_val = sess.run(y, feed_dict={x: test_input})
-        results = mlmodel.predict({"input": test_input}, useCPUOnly=True)
+        results = mlmodel.predict({"input": test_input})
         np.testing.assert_allclose(results["output"], expected_val)
 
     @staticmethod
@@ -114,9 +116,9 @@ def test_convert_from_frozen_graph_file(tmpdir):
         # (3) Not specify inputs at all. `inputs` is optional for TF. When
         # inputs is not specified, convert() infers inputs from Placeholder
         # nodes.
-        mlmodel = ct.convert(pb_path, outputs=["output"])
+        mlmodel = ct.convert(pb_path, outputs=["output"], compute_units=ct.ComputeUnit.CPU_ONLY)
 
-        results = mlmodel.predict({"input": test_input}, useCPUOnly=True)
+        results = mlmodel.predict({"input": test_input})
         np.testing.assert_allclose(results["output"], expected_val)
         mlmodel_path = os.path.join(save_path, "model.mlmodel")
         # Save the converted model
@@ -145,12 +147,12 @@ def test_convert_from_saved_model_dir(tmpdir):
 
         # SavedModel directory generated by TensorFlow 1.x
         # when converting from SavedModel dir, inputs / outputs are optional
-        mlmodel = ct.convert(save_path)
+        mlmodel = ct.convert(save_path, compute_units=ct.ComputeUnit.CPU_ONLY)
 
         # Need input output names to call mlmodel
         # x.name == 'Placeholder:0'. Strip out ':0'
         input_name = x.name.split(":")[0]
-        results = mlmodel.predict({input_name: test_input}, useCPUOnly=True)
+        results = mlmodel.predict({input_name: test_input})
         # y.name == 'Relu:0'. output_name == 'Relu'
         output_name = y.name.split(":")[0]
         np.testing.assert_allclose(results[output_name], expected_val)
@@ -470,11 +472,8 @@ def test_convert_torch_vision_mobilenet_v2(tmpdir):
         """
         if ct.utils._is_macos():
             results = mlmodel.predict({"input": example_input.numpy()})
-            expected = model(example_input)
-            np.testing.assert_allclose(
-                list(results.values())[0], expected.detach().numpy(),
-                atol=1e-8, rtol=1e-2)
-
+            assert isinstance(results, dict)
+            
     @staticmethod
     def test_int64_inputs():
 
@@ -535,8 +534,8 @@ def forward(self, x):
                 ],
             )
 
-        # Outputs must not be specified for PyTorch
-        with pytest.raises(ValueError, match=r"outputs must not be specified"):
+        # Outputs must be of type ct.ImageType or ct.TensorType
+        with pytest.raises(ValueError, match=r"must be a list of type ct.TensorType or ct.ImageType"):
             mlmodel = ct.convert(
                 traced_model,
                 inputs=[
@@ -600,7 +599,7 @@ class TestMILExamples:
     @staticmethod
     def test_tutorial():
         @mb.program(
-            input_specs=[mb.TensorSpec(shape=(1, 100, 100, 3)),]
+            input_specs=[mb.TensorSpec(shape=(1, 100, 100, 3))]
         )
         def prog(x):
             x = mb.relu(x=x, name="relu")
@@ -618,10 +617,11 @@ def prog(x):
         # running predict() is only supported on macOS
         if ct.utils._is_macos():
             prediction = mlmodel.predict(
-                {"x": np.random.rand(1, 100, 100, 3).astype(np.float32),}
+                {"x": np.random.rand(1, 100, 100, 3).astype(np.float32)}
             )
             assert len(prediction) == 1
 
+
 class TestInvalidInput:
     @staticmethod
     def test_rank0_inputs_mil():
@@ -1152,12 +1152,12 @@ def forward(self, x):
         mlmodel = ct.convert(
             traced_model,
             inputs=[tensor_input],
+            compute_units=ct.ComputeUnit.CPU_ONLY
         )
 
         if ct.utils._is_macos():
             result = mlmodel.predict(
                 {"input": example_input.detach().numpy().astype(np.float32)},
-                useCPUOnly=True,
             )
 
             # Verify outputs
@@ -1169,15 +1169,13 @@ def forward(self, x):
             # Test (1, 3, 56, 56) shape (can't verify numerical parity with Torch
             # which doesn't support enumerated shape)
             test_input_x = np.random.rand(*shapes[1]).astype(np.float32)
-            results = mlmodel.predict({
-                "input": test_input_x})
+            mlmodel.predict({"input": test_input_x})
 
             # Test with a wrong shape
             with pytest.raises(RuntimeError,
                     match=r"not compatible with the model\'s feature"):
                 test_input_x = np.random.rand(1, 3, 29, 29).astype(np.float32)
-                results = mlmodel.predict({
-                    "input": test_input_x})
+                mlmodel.predict({"input": test_input_x})
 
     @staticmethod
     @pytest.mark.skipif(not _HAS_TF_2, reason=MSG_TF2_NOT_FOUND)
@@ -1606,23 +1604,47 @@ def test_deepcopy_error_with_symbols_in_prog():
         mlmodel = ct.convert(prog, convert_to="mlprogram", compute_precision=ct.precision.FLOAT32)
         prog2 = mlmodel._get_mil_internal() # this will invoke a deepcopy on the prog
 
+
     @staticmethod
     @pytest.mark.skipif(not _HAS_TORCH or ct.utils._macos_version() < (12, 0),
                         reason=MSG_TORCH_NOT_FOUND)
-    def test_classifier():
+    @pytest.mark.parametrize(
+        "convert_to, provide_prob_output_argument",
+        itertools.product(
+            ["neuralnetwork", "mlprogram"],
+            [False, True],
+        )
+    )
+    def test_classifier(convert_to, provide_prob_output_argument):
         torch_model = torch.nn.ReLU().eval()
         traced_model = torch.jit.trace(torch_model, torch.rand(3,))
+        variable_name = "var_2"
+        class_label_name = "class_label"
+        classifier_config = ct.ClassifierConfig(
+                            class_labels=['a', 'b', 'c'],
+                            predicted_feature_name=class_label_name,
+                            predicted_probabilities_output=variable_name if provide_prob_output_argument else None,
+                            )
+
         model = ct.convert(
             traced_model,
             inputs=[ct.TensorType(shape=(3,))],
-            classifier_config = ct.ClassifierConfig(['a', 'b', 'c']),
-            convert_to='mlprogram'
+            classifier_config = classifier_config,
+            convert_to=convert_to,
         )
         spec = model.get_spec()
         input_name = spec.description.input[0].name
         out_dict = model.predict({input_name : np.array([1.0, 2.0, 3.0])})
-        assert 'classLabel' in out_dict
-        assert out_dict['classLabel'] == 'c'
+
+        assert class_label_name in out_dict
+        assert out_dict[class_label_name] == 'c'
+        if convert_to == "neuralnetwork":
+            assert variable_name in out_dict
+            assert isinstance(out_dict[variable_name], dict)
+        else:
+            output_dict_feature_name = class_label_name + "_probs"
+            assert output_dict_feature_name in out_dict
+            assert isinstance(out_dict[output_dict_feature_name], dict)
 
     @pytest.mark.skipif(not ct.utils._is_macos(), reason="Platform is not Mac OS")
     @pytest.mark.parametrize("skip_model_load", [True, False])
diff --git a/coremltools/test/api/test_api_visibilities.py b/coremltools/test/api/test_api_visibilities.py
index d8376f009..bd614e4fc 100644
--- a/coremltools/test/api/test_api_visibilities.py
+++ b/coremltools/test/api/test_api_visibilities.py
@@ -30,6 +30,8 @@ def test_top_level(self):
             "SPECIFICATION_VERSION",
             "Shape",
             "TensorType",
+            "colorlayout",
+            "compression_utils",
             "convert",
             "converters",
             "libcoremlpython",
@@ -65,13 +67,14 @@ def test_models(self):
         expected = [
             "MLModel",
             "datatypes",
+            "feature_vectorizer",
+            "ml_program",
             "model",
+            "nearest_neighbors",
             "neural_network",
             "pipeline",
             "tree_ensemble",
             "utils",
-            "nearest_neighbors",
-            "feature_vectorizer",
         ]
         _check_visible_modules(_get_visible_items(ct.models), expected)
 
@@ -141,43 +144,23 @@ def test_models_pipeline(self):
     def test_converters(self):
         expected = [
             "ClassifierConfig",
+            "ColorLayout",
             "EnumeratedShapes",
             "ImageType",
             "RangeDim",
             "Shape",
             "TensorType",
             "convert",
-            "keras",
             "libsvm",
             "mil",
-            "onnx",
             "sklearn",
             "xgboost",
         ]
         _check_visible_modules(_get_visible_items(ct.converters), expected)
 
-    @pytest.mark.skipif(
-        ct.utils._python_version() >= (3, 8, 0),
-        reason="Keras isn't compatible with Python 3.8+.",
-    )
-    @pytest.mark.xfail(
-         condition=not ct.utils._is_macos(),
-         reason="rdar://65138103 (Keras converter not exposed on Linux)",
-         run=False,
-     )
-    def test_converters_keras(self):
-        _check_visible_modules(_get_visible_items(ct.converters.keras), ["convert"])
-
     def test_converters_libsvm(self):
         _check_visible_modules(_get_visible_items(ct.converters.libsvm), ["convert"])
 
-    @pytest.mark.skipif(
-        ct.utils._python_version() >= (3, 8, 0),
-        reason="ONNX isn't compatible with Python 3.8+.",
-    )
-    def test_converters_onnx(self):
-        _check_visible_modules(_get_visible_items(ct.converters.onnx), ["convert"])
-
     def test_converters_sklearn(self):
         _check_visible_modules(_get_visible_items(ct.converters.sklearn), ["convert"])
 
@@ -201,6 +184,17 @@ def test_models_neural_network_quantization_utils(self):
             _get_visible_items(ct.models.neural_network.quantization_utils), expected
         )
 
+    def test_compression_utils(self):
+        # Names that are checked against the visible items of ``ct.compression_utils``.
+        expected_names = [
+            "affine_quantize_weights",
+            "palettize_weights",
+            "sparsify_weights",
+            "decompress_weights",
+        ]
+        visible_names = _get_visible_items(ct.compression_utils)
+        _check_visible_modules(visible_names, expected_names)
+
     def test_models_neural_network_flexible_shape_utils(self):
         expected = [
             "NeuralNetworkImageSize",
diff --git a/coremltools/test/blob/test_weights.py b/coremltools/test/blob/test_weights.py
index 96acb0b22..8c554688b 100644
--- a/coremltools/test/blob/test_weights.py
+++ b/coremltools/test/blob/test_weights.py
@@ -24,34 +24,46 @@ def tearDown(self):
         if os.path.exists(self.working_dir):
             shutil.rmtree(self.working_dir)
 
+    def test_weight_blob_int8(self):
+        # Write int8 data to a blob file, read it back at the returned offset, and compare.
+        expected = np.array([-5, -2, 0, 2, 5], dtype=np.int8)
+        blob_writer = BlobWriter(self.working_dir + "/net.wt")
+        offset = blob_writer.write_int8_data(expected)
+        blob_writer = None  # drop the writer reference before a reader opens the same file
+        blob_reader = BlobReader(self.working_dir + "/net.wt")
+        actual = np.array(blob_reader.read_int8_data(offset), np.int8)
+        np.testing.assert_equal(expected, actual)
+
     def test_weight_blob_uint8(self):
         writer = BlobWriter(self.working_dir + "/net.wt")
-        input_arr = np.array([1.0, 2, 3, 4, 5], dtype=np.uint8)
+        input_arr = np.array([1, 2, 3, 4, 5], dtype=np.uint8)
         offset = writer.write_uint8_data(input_arr)
         writer = None
 
         reader = BlobReader(self.working_dir + "/net.wt")
-        output_arr = np.array(reader.read_uint8_data(offset))
+        output_arr = np.array(reader.read_uint8_data(offset), np.uint8)
         np.testing.assert_almost_equal(input_arr, output_arr)
 
     def test_weight_blob_fp16(self):
         writer = BlobWriter(self.working_dir + "/net.wt")
-        input_arr = np.array([1.0, 2, 3, 4, 5], dtype=np.float16)
-        offset = writer.write_fp16_data(input_arr)
+        input_arr = np.array([2.3, 4.6, 7.9], dtype=np.float16)
+        input_arr_to_bytes_uint16 = np.frombuffer(input_arr.tobytes(), np.uint16)
+        offset = writer.write_fp16_data(input_arr_to_bytes_uint16)
         writer = None
 
         reader = BlobReader(self.working_dir + "/net.wt")
-        output_arr = np.array(reader.read_fp16_data(offset))
+        output_arr_uint16 = np.array(reader.read_fp16_data(offset), np.uint16)
+        output_arr = np.frombuffer(output_arr_uint16.tobytes(), np.float16)
         np.testing.assert_almost_equal(input_arr, output_arr)
 
     def test_weight_blob_fp32(self):
         writer = BlobWriter(self.working_dir + "/net.wt")
-        input_arr = np.array([1.0, 2, 3, 4, 5], dtype=np.float32)
+        input_arr = np.array([1.0, 2.4, 3.9, -4.8, 5.2], dtype=np.float32)
         offset = writer.write_float_data(input_arr)
         writer = None
 
         reader = BlobReader(self.working_dir + "/net.wt")
-        output_arr = np.array(reader.read_float_data(offset))
+        output_arr = np.array(reader.read_float_data(offset), np.float32)
         np.testing.assert_almost_equal(input_arr, output_arr)
 
 if __name__ == "__main__":
diff --git a/coremltools/test/ml_program/__init__.py b/coremltools/test/ml_program/__init__.py
new file mode 100644
index 000000000..9fcc9060a
--- /dev/null
+++ b/coremltools/test/ml_program/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) 2022, Apple Inc. All rights reserved.
+#
+# Use of this source code is governed by a BSD-3-clause license that can be
+# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
\ No newline at end of file
diff --git a/coremltools/test/ml_program/test_compression.py b/coremltools/test/ml_program/test_compression.py
new file mode 100644
index 000000000..29bbaa924
--- /dev/null
+++ b/coremltools/test/ml_program/test_compression.py
@@ -0,0 +1,410 @@
+# Copyright (c) 2022, Apple Inc. All rights reserved.
+#
+# Use of this source code is governed by a BSD-3-clause license that can be
+# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+import logging
+import pytest
+import numpy as np
+import torch
+
+import coremltools as ct
+
+from coremltools.converters.mil.testing_utils import get_op_types_in_program
+
+def create_unique_weight(weight, nbits):
+    """Build a float32 array shaped like ``weight`` with at most ``2 ** nbits`` distinct values."""
+    shape = weight.detach().numpy().shape
+    size = weight.detach().numpy().size
+
+    # Honor ``nbits`` so that e.g. nbits=2 yields the values 0..3 and nbits=6 yields 0..63.
+    unique_number = 1 << nbits
+    # One extra element per run guarantees at least ``size`` entries before truncation.
+    partition_len = size // unique_number + 1
+    values = np.repeat(np.arange(unique_number), partition_len)[:size]
+    return np.reshape(values.astype(np.float32), shape)
+
+def get_test_model_and_data(multi_layer=False):
+    inputs = [ct.TensorType(name="data", shape=(1, 64, 10, 10))]
+    torch_input_values = [torch.rand(*i.shape.to_list()) for i in inputs]
+    coreml_input_values = {
+        i.name: val.detach().numpy() for i, val in zip(inputs, torch_input_values)
+    }
+    if multi_layer:
+        class Model(torch.nn.Module):
+            def __init__(self):
+                super(Model, self).__init__()
+                self.conv_1 = torch.nn.Conv2d(in_channels=64, out_channels=32, kernel_size=2)
+                self.conv_2 = torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=2)
+
+            def forward(self, x):
+                conv_1 = self.conv_1(x)
+                conv_2 = self.conv_2(conv_1)
+                return conv_2
+
+        model = Model().eval()
+    else:
+        model = torch.nn.Conv2d(in_channels=64, out_channels=32, kernel_size=2)
+
+    return model, inputs, torch_input_values, coreml_input_values
+
+def verify_model_outputs(model, compressed_model, input_values):
+    """
+    Check a compressed model against the model it was derived from:
+
+    (1) The main-function outputs of both models agree in count, name, shape and dtype
+    (2) The compressed model and its decompressed version predict the same values
+    """
+
+    # Decompress up front: this also proves the compressed model can be decompressed
+    decompressed_model = ct.compression_utils.decompress_weights(compressed_model)
+
+    # Validate the output shape / type
+    ref_outputs = model._mil_program.functions["main"].outputs
+    outputs = compressed_model._mil_program.functions["main"].outputs
+
+    assert len(ref_outputs) == len(outputs)
+
+    for ref_var, var in zip(ref_outputs, outputs):
+        assert ref_var.name == var.name
+        assert ref_var.shape == var.shape
+        assert ref_var.dtype == var.dtype
+
+    # The numerical comparison below only runs when _macos_version() reports 13.0 or newer
+    if ct.utils._macos_version() < (13, 0):
+        return
+
+    output_dict = compressed_model.predict(input_values)
+    de_output_dict = decompressed_model.predict(input_values)
+    for k, v in de_output_dict.items():
+        assert k in output_dict
+        np.testing.assert_allclose(v, output_dict[k])
+
+class TestCompressionUtils:
+
+    @staticmethod
+    def test_op_selector():
+        model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data()
+        torchmodel = torch.jit.trace(model, torch_input_values)
+        mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram")
+        mlmodel_no_quantized = ct.compression_utils.affine_quantize_weights(mlmodel, mode="linear", op_selector=lambda const_op: const_op.val.val.size > 1e7)
+        expected_ops = ['cast', 'conv', 'cast']        
+        assert get_op_types_in_program(mlmodel_no_quantized._mil_program) == expected_ops
+
+    @staticmethod
+    def test_weight_decompression():
+        """
+        This test is doing the following steps
+
+        (1) compress a model with two conv layers into a compressed model with two different constexpr ops
+            
+            [Original model]:
+
+                     weight_1      weight_2
+                       |             |
+                       v             v
+            input -> conv_1 -----> conv_2 ---> output
+
+
+            [Compressed model]:
+
+                   weight_1_lut   weight_2_affine
+                       |               |
+                       v               v
+            input -> conv_1 ------>  conv_2 ---> output
+
+            , where weight_1_lut is a constexpr_lut_to_dense op and weight_2_affine is a constexpr_affine_dequantize op
+        
+        (2) decompress the compressed model
+
+            [Decompressed model]:
+
+                   weight_1_new   weight_2_new
+                       |               |
+                       v               v
+            input -> conv_1 ------>  conv_2 ---> output
+
+            , note that, weight_1_new is equivalent to weight_1_lut, and weight_2_new is equivalent to weight_2_affine
+        """
+        model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data(multi_layer=True)
+        torchmodel = torch.jit.trace(model, torch_input_values)
+        mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram")
+        
+        # we first compress the model
+        mlmodel = ct.compression_utils.palettize_weights(mlmodel, mode="kmeans", nbits=4, op_selector=lambda const_op: const_op.name == "conv_1_weight_to_fp16")
+        mlmodel = ct.compression_utils.affine_quantize_weights(mlmodel, mode="linear", op_selector=lambda const_op: const_op.name == "conv_2_weight_to_fp16")
+        expected_ops = ['constexpr_lut_to_dense', 'cast', 'conv', 'constexpr_affine_dequantize', 'conv', 'cast']
+        assert get_op_types_in_program(mlmodel._mil_program) == expected_ops
+
+        # decompress the model
+        decompressed_model = ct.compression_utils.decompress_weights(mlmodel)
+        assert get_op_types_in_program(decompressed_model._mil_program) == ['cast', 'conv', 'conv', 'cast']
+
+        if ct.utils._macos_version() < (13, 0):
+            return
+
+        # compare the numerical outputs
+        output_dict = mlmodel.predict(coreml_input_values)
+        de_output_dict = decompressed_model.predict(coreml_input_values)
+
+        for k, v in output_dict.items():
+            assert k in de_output_dict
+            np.testing.assert_allclose(v, de_output_dict[k])
+
+    @staticmethod
+    def test_compression_utils_error_handling():
+        """Invalid arguments to the compression utilities must raise ValueError with the expected message."""
+        model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data()
+        torchmodel = torch.jit.trace(model, torch_input_values)
+        mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram")
+        # NOTE: `match=` is a regex search; patterns containing backslashes are raw strings
+        # Test invalid mode for affine quantization
+        expected_err_str = "supported for weight affine quantization. Got mode invalid_mode."
+        with pytest.raises(ValueError, match=expected_err_str):
+            ct.compression_utils.affine_quantize_weights(mlmodel, mode="invalid_mode")
+
+        # Test invalid mode for weight sparsification
+        expected_err_str = "supported for weight sparsification. Got mode invalid_mode."
+        with pytest.raises(ValueError, match=expected_err_str):
+            ct.compression_utils.sparsify_weights(mlmodel, mode="invalid_mode")
+
+        # Test invalid threshold for weight sparsification
+        expected_err_str = r"Invalid value of threshold: \-1. Needs to be in \[0, inf\)"
+        with pytest.raises(ValueError, match=expected_err_str):
+            ct.compression_utils.sparsify_weights(mlmodel, mode="threshold_based", threshold=-1)
+
+        # Test invalid percentile for weight sparsification
+        expected_err_str = r"Invalid value of target_percentile: 1.2. Needs to be in \[0, 1\]"
+        with pytest.raises(ValueError, match=expected_err_str):
+            ct.compression_utils.sparsify_weights(mlmodel, mode="percentile_based", target_percentile=1.2)
+
+        # Test invalid mode for weight palettization
+        expected_err_str = "supported for weight palettization. Got mode invalid_mode."
+        with pytest.raises(ValueError, match=expected_err_str):
+            ct.compression_utils.palettize_weights(mlmodel, mode="invalid_mode")
+
+        # Test nbits must be provided for kmeans, uniform mode for weight palettization
+        expected_err_str = "nbits must be provided for mode"
+        with pytest.raises(ValueError, match=expected_err_str):
+            ct.compression_utils.palettize_weights(mlmodel, mode="kmeans")
+
+        with pytest.raises(ValueError, match=expected_err_str):
+            ct.compression_utils.palettize_weights(mlmodel, mode="uniform")
+
+        # Test nbits must not be provided for unique, custom mode for weight palettization
+        expected_err_str = "nbits must NOT be provided for mode"
+        with pytest.raises(ValueError, match=expected_err_str):
+            ct.compression_utils.palettize_weights(mlmodel, mode="unique", nbits=2)
+
+        with pytest.raises(ValueError, match=expected_err_str):
+            ct.compression_utils.palettize_weights(mlmodel, mode="custom", nbits=2)
+
+        # Test lut_function must be provided for custom mode, and must not be provided otherwise
+        expected_err_str = "lut_function must be None if mode is not custom, and that it cannot be None when the mode is custom."
+        with pytest.raises(ValueError, match=expected_err_str):
+            ct.compression_utils.palettize_weights(mlmodel, mode="custom")
+        with pytest.raises(ValueError, match=expected_err_str):
+            ct.compression_utils.palettize_weights(mlmodel, mode="unique", lut_function=lambda op: True)
+
+        # Test lut_function must be a function object
+        expected_err_str = "A function object must be provided as lut_function"
+        with pytest.raises(ValueError, match=expected_err_str):
+            ct.compression_utils.palettize_weights(mlmodel, mode="custom", lut_function=1)
+
+
+    @staticmethod
+    @pytest.mark.parametrize(
+        "mode",
+        ("linear", "linear_symmetric"),
+    )
+    def test_linear_quanitzation(mode):
+        model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data()
+        torchmodel = torch.jit.trace(model, torch_input_values)
+        mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram")
+
+        mlmodel_quantized = ct.compression_utils.affine_quantize_weights(mlmodel, mode=mode)
+
+        # validate parameters
+        expected_ops = ['constexpr_affine_dequantize', 'cast', 'conv', 'cast']
+        assert get_op_types_in_program(mlmodel_quantized._mil_program) == expected_ops
+
+        quanitze_op = mlmodel_quantized._mil_program.functions["main"].find_ops(op_type="constexpr_affine_dequantize")[0]
+        assert model.weight.detach().numpy().shape == quanitze_op.quantized_data.shape
+
+        verify_model_outputs(mlmodel, mlmodel_quantized, coreml_input_values)
+
+    @staticmethod
+    @pytest.mark.parametrize(
+        "threshold",
+        (0.0, 0.001, 1e2),
+    )
+    def test_weight_sparsify_threshold_based(threshold):
+        model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data()
+        with torch.no_grad():
+            model.weight[0][0][0][0] = 101
+        torchmodel = torch.jit.trace(model, torch_input_values)
+        mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram")
+        mlmodel_sparsified = ct.compression_utils.sparsify_weights(mlmodel, mode="threshold_based", threshold=threshold)
+
+        # validate parameters
+        expected_ops = ['constexpr_sparse_to_dense', 'cast', 'conv', 'cast']
+        assert get_op_types_in_program(mlmodel_sparsified._mil_program) == expected_ops
+
+        main_func = mlmodel_sparsified._mil_program.functions["main"]
+        sparse_to_dense_op = main_func.find_ops(op_type="constexpr_sparse_to_dense")[0]
+        non_sparse_data = sparse_to_dense_op.nonzero_data
+
+        if threshold != 1e2:
+            assert np.min(np.absolute(non_sparse_data.val)) >= threshold
+        else:
+            assert non_sparse_data.val.size == 1
+
+        assert sparse_to_dense_op.shape.val.tolist() == list(model.weight.detach().numpy().shape)
+
+        # validate the model
+        verify_model_outputs(mlmodel, mlmodel_sparsified, coreml_input_values)
+
+    @staticmethod
+    @pytest.mark.parametrize(
+        "percentile",
+        (0., 0.5, 1.0),
+    )
+    def test_weight_sparsify_percentile_based(percentile):
+        model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data()
+        torchmodel = torch.jit.trace(model, torch_input_values)
+        mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram")
+        mlmodel_sparsified = ct.compression_utils.sparsify_weights(mlmodel, mode="percentile_based", target_percentile=percentile)
+
+        # validate parameters        
+        expected_ops = ['constexpr_sparse_to_dense', 'cast', 'conv', 'cast']
+        assert get_op_types_in_program(mlmodel_sparsified._mil_program) == expected_ops
+
+        main_func = mlmodel_sparsified._mil_program.functions["main"]
+        sparse_to_dense_op = main_func.find_ops(op_type="constexpr_sparse_to_dense")[0]
+        non_sparse_data = sparse_to_dense_op.nonzero_data
+        weight = model.weight.detach().numpy()
+
+        if percentile == 0.:
+            assert non_sparse_data.val.size == weight.size - 1
+        elif percentile == 0.5:
+            assert non_sparse_data.val.size <= 0.51 * (weight.size) and non_sparse_data.val.size >= 0.49 * (weight.size)
+        else:
+            assert non_sparse_data.val.size == 0
+
+        assert sparse_to_dense_op.shape.val.tolist() == list(model.weight.detach().numpy().shape)
+
+        # validate the model
+        verify_model_outputs(mlmodel, mlmodel_sparsified, coreml_input_values)
+
+    @staticmethod
+    @pytest.mark.parametrize(
+        "mode",
+        ("uniform", "kmeans")
+    )
+    def test_weight_palettization(mode):
+        model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data()
+        torchmodel = torch.jit.trace(model, torch_input_values)
+        mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram")
+        mlmodel_palettized = ct.compression_utils.palettize_weights(mlmodel, nbits=4, mode=mode)
+
+        # validate parameters
+        expected_ops = ['constexpr_lut_to_dense', 'cast', 'conv', 'cast']
+        assert get_op_types_in_program(mlmodel_palettized._mil_program) == expected_ops
+
+        main_func = mlmodel_palettized._mil_program.functions["main"]
+        lut_to_dense_op = main_func.find_ops(op_type="constexpr_lut_to_dense")[0]
+
+        assert lut_to_dense_op.shape.val.tolist() == list(model.weight.detach().numpy().shape)
+    
+        # validate the model 
+        verify_model_outputs(mlmodel, mlmodel_palettized, coreml_input_values)
+
+    @staticmethod
+    def test_weight_palettization_unique_case_1():
+        # In this model, both conv weights can be palettized
+        model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data(multi_layer=True)
+
+        weight_1_unique = create_unique_weight(model.conv_1.weight, nbits=2)
+        weight_2_unique = create_unique_weight(model.conv_2.weight, nbits=6)
+        
+        with torch.no_grad():
+            model.conv_1.weight = torch.nn.Parameter(torch.Tensor(weight_1_unique))
+            model.conv_2.weight = torch.nn.Parameter(torch.Tensor(weight_2_unique))
+
+        torchmodel = torch.jit.trace(model, torch_input_values)
+        mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram")
+
+        # validate parameters
+        mlmodel_palettized = ct.compression_utils.palettize_weights(mlmodel, mode="unique")
+        expected_ops = ['constexpr_lut_to_dense', 'cast', 'conv', 'constexpr_lut_to_dense', 'conv', 'cast']
+        assert get_op_types_in_program(mlmodel_palettized._mil_program) == expected_ops
+
+        main_func = mlmodel_palettized._mil_program.functions["main"]
+        lut_to_dense_op_1 = main_func.find_ops(op_type="constexpr_lut_to_dense")[0]
+        lut_to_dense_op_2 = main_func.find_ops(op_type="constexpr_lut_to_dense")[1]
+
+        assert lut_to_dense_op_1.shape.val.tolist() == list(model.conv_1.weight.detach().numpy().shape)
+        assert lut_to_dense_op_2.shape.val.tolist() == list(model.conv_2.weight.detach().numpy().shape)
+
+        # validate the model 
+        verify_model_outputs(mlmodel, mlmodel_palettized, coreml_input_values)
+
+    @staticmethod
+    def test_weight_palettization_unique_case_2(caplog):
+        # In this model, only one conv weight can be palettized; the converter should warn the user that the other weight is skipped
+        model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data(multi_layer=True)
+
+        weight_1_unique = create_unique_weight(model.conv_1.weight, nbits=2)
+        
+        with torch.no_grad():
+            model.conv_1.weight = torch.nn.Parameter(torch.Tensor(weight_1_unique))
+
+        torchmodel = torch.jit.trace(model, torch_input_values)
+        mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram")
+
+        # validate parameters
+        # converter should warn the user that one weight is not compressed
+        mlmodel_palettized = ct.compression_utils.palettize_weights(mlmodel, mode="unique")
+        warning_msg = "weight value cannot be represented in an 8 bits palettization. Skipped."
+        assert any([warning_msg in rec.message for rec in caplog.records])
+
+        expected_ops = ['constexpr_lut_to_dense', 'cast', 'conv', 'conv', 'cast']
+        assert get_op_types_in_program(mlmodel_palettized._mil_program) == expected_ops
+
+        main_func = mlmodel_palettized._mil_program.functions["main"]
+        lut_to_dense_op_1 = main_func.find_ops(op_type="constexpr_lut_to_dense")[0]
+        assert lut_to_dense_op_1.shape.val.tolist() == list(model.conv_1.weight.detach().numpy().shape)
+
+        # validate the model 
+        verify_model_outputs(mlmodel, mlmodel_palettized, coreml_input_values)
+
+    @staticmethod
+    def test_weight_palettization_custom():
+        model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data()
+        torchmodel = torch.jit.trace(model, torch_input_values)
+        mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram")
+
+        def lut_function(weight):
+            # 4-bit palette: entry 0 holds 0.0, the other 15 entries hold the largest weights
+            nbits = 4
+            weight = weight.flatten()
+            k = (1 << nbits) - 1
+            top_k = np.partition(weight, -k)[-k:]
+            top_k = np.sort(top_k)  # np.sort returns a sorted copy, so the result must be kept
+            lut = np.array([0.] + top_k.tolist()).astype(weight.dtype)
+            mapping = {v: idx for idx, v in enumerate(lut)}
+            indices = np.array([mapping[v] if v in mapping else 0 for v in weight]).astype(np.uint8)
+            return lut, indices
+
+        mlmodel_palettized = ct.compression_utils.palettize_weights(mlmodel, mode="custom", lut_function=lut_function)
+
+        # validate parameters
+        expected_ops = ['constexpr_lut_to_dense', 'cast', 'conv', 'cast']
+        assert get_op_types_in_program(mlmodel_palettized._mil_program) == expected_ops
+
+        main_func = mlmodel_palettized._mil_program.functions["main"]
+        lut_to_dense_op = main_func.find_ops(op_type="constexpr_lut_to_dense")[0]
+
+        assert lut_to_dense_op.shape.val.tolist() == list(model.weight.detach().numpy().shape)
+
+        # validate the model
+        verify_model_outputs(mlmodel, mlmodel_palettized, coreml_input_values)
\ No newline at end of file
diff --git a/coremltools/test/modelpackage/test_mlmodel.py b/coremltools/test/modelpackage/test_mlmodel.py
index 8944e1b25..1cc210999 100644
--- a/coremltools/test/modelpackage/test_mlmodel.py
+++ b/coremltools/test/modelpackage/test_mlmodel.py
@@ -45,11 +45,9 @@ def forward(self, x):
                 dtype=example_input.numpy().dtype,
             )
         ],
-        compute_precision=ct.precision.FLOAT32
+        compute_precision=ct.precision.FLOAT32,
+        compute_units=ct.ComputeUnit.CPU_ONLY
     )
-    # `coremltools_internal.convert` returns
-    # `coremltools_internal.models.MLModel` for `mlprogram` and `neuralnetwork`
-    # backend
     assert isinstance(mlmodel, MLModel)
 
     # mlpackage_path is a model package
@@ -66,7 +64,6 @@ def forward(self, x):
 
     result = mlmodel2.predict(
         {"input": example_input.cpu().detach().numpy().astype(np.float32)},
-        useCPUOnly=True,
     )
 
     # Verify outputs
diff --git a/coremltools/test/modelpackage/test_modelpackage.py b/coremltools/test/modelpackage/test_modelpackage.py
index 83e361cbe..7b905ea66 100644
--- a/coremltools/test/modelpackage/test_modelpackage.py
+++ b/coremltools/test/modelpackage/test_modelpackage.py
@@ -10,7 +10,7 @@
 import tempfile
 
 import coremltools
-from coremltools import utils
+from coremltools import ComputeUnit, utils
 from coremltools.converters.mil import Builder as mb
 from coremltools.libmodelpackage import ModelPackage
 from coremltools.models import MLModel
@@ -254,7 +254,7 @@ def forward(self, x):
             assert os.path.exists(ModelPackage(package_path).getRootModel().path())
 
             # Read back the saved bundle and compile
-            mlmodel2 = MLModel(package_path, useCPUOnly=True)
+            mlmodel2 = MLModel(package_path, compute_units=ComputeUnit.CPU_ONLY)
 
             if utils._macos_version() >= (12, 0):
                 result = mlmodel2.predict(
diff --git a/coremltools/test/neural_network/test_keras.py b/coremltools/test/neural_network/test_keras.py
deleted file mode 100644
index dc986efeb..000000000
--- a/coremltools/test/neural_network/test_keras.py
+++ /dev/null
@@ -1,1137 +0,0 @@
-# Copyright (c) 2017, Apple Inc. All rights reserved.
-#
-# Use of this source code is governed by a BSD-3-clause license that can be
-# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-
-import unittest
-
-from coremltools._deps import _HAS_KERAS_TF
-from coremltools.proto import FeatureTypes_pb2
-import pytest
-
-if _HAS_KERAS_TF:
-    import tensorflow as tf
-    from keras.models import Sequential, Model
-    from coremltools.converters import keras
-
-
-@unittest.skipIf(not _HAS_KERAS_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras1
-class KerasSingleLayerTest(unittest.TestCase):
-    """
-    Unit test class for testing scikit-learn converter.
-    """
-
-    @classmethod
-    def setUpClass(self):
-        """
-        Set up the unit test by loading common utilities.
-        """
-
-    def test_dense(self):
-        """
-        Test the conversion of Dense layer.
-        """
-        from keras.layers import Dense
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(Dense(32, input_dim=16))
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertCountEqual(
-            self, input_names, [x.name for x in spec.description.input]
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertCountEqual(
-            self, output_names, [x.name for x in spec.description.output]
-        )
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.innerProduct)
-
-    def test_activations(self):
-        """
-        Test the conversion for a Dense + Activation('something')
-        """
-        from keras.layers import Dense, Activation
-
-        # Create a simple Keras model
-        keras_activation_options = [
-            "tanh",
-            "softplus",
-            "softsign",
-            "relu",
-            "sigmoid",
-            "hard_sigmoid",
-            "linear",
-        ]
-        coreml_activation_options = [
-            "tanh",
-            "softplus",
-            "softsign",
-            "ReLU",
-            "sigmoid",
-            "sigmoidHard",
-            "linear",
-        ]
-
-        for i, k_act in enumerate(keras_activation_options):
-            c_act = coreml_activation_options[i]
-            model = Sequential()
-            model.add(Dense(32, input_dim=16))
-            model.add(Activation(k_act))
-
-            input_names = ["input"]
-            output_names = ["output"]
-            spec = keras.convert(model, input_names, output_names).get_spec()
-            self.assertIsNotNone(spec)
-
-            # Test the model class
-            self.assertIsNotNone(spec.description)
-            self.assertTrue(spec.HasField("neuralNetwork"))
-
-            # Test the inputs and outputs
-            self.assertEqual(len(spec.description.input), len(input_names))
-            self.assertCountEqual(
-                self, input_names, [x.name for x in spec.description.input]
-            )
-            self.assertEqual(len(spec.description.output), len(output_names))
-            self.assertCountEqual(
-                self, output_names, [x.name for x in spec.description.output]
-            )
-
-            # Test the layer parameters.
-            layers = spec.neuralNetwork.layers
-            self.assertIsNotNone(layers[0].innerProduct)
-            self.assertIsNotNone(layers[1].activation)
-            self.assertTrue(layers[1].activation.HasField(c_act))
-
-    def test_activation_softmax(self):
-        """
-        Test the conversion for a Dense + Activation('softmax')
-        """
-        from keras.layers import Dense, Activation
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(Dense(32, input_dim=16))
-        model.add(Activation("softmax"))
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertCountEqual(
-            self, input_names, [x.name for x in spec.description.input]
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertCountEqual(
-            self, output_names, [x.name for x in spec.description.output]
-        )
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.innerProduct)
-        layer_1 = layers[1]
-        self.assertIsNotNone(layer_1.softmax)
-
-    def test_dropout(self):
-        """
-        Test the conversion for a Dense + Dropout
-        """
-        from keras.layers import Dense, Dropout
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(Dense(32, input_dim=16))
-        model.add(Dropout(0.5))
-        model.add(Dense(32, input_dim=16))
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertCountEqual(
-            self, input_names, [x.name for x in spec.description.input]
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertCountEqual(
-            self, output_names, [x.name for x in spec.description.output]
-        )
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.innerProduct)
-        self.assertEqual(len(layers), 2)
-
-    def test_convolution(self):
-        """
-        Test the conversion of 2D convolutional layer.
-        """
-        from keras.layers import Convolution2D
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(
-            Convolution2D(
-                input_shape=(64, 64, 3),
-                nb_filter=32,
-                nb_row=5,
-                nb_col=5,
-                init="glorot_uniform",
-                activation=None,
-                weights=None,
-                border_mode="valid",
-                subsample=(1, 1),
-                bias=True,
-            )
-        )
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertCountEqual(
-            self, input_names, [x.name for x in spec.description.input]
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertCountEqual(
-            self, output_names, [x.name for x in spec.description.output]
-        )
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.convolution)
-
-    def test_upsample(self):
-        """
-        Test the conversion of 2D convolutional layer + upsample
-        """
-        from keras.layers import Convolution2D, UpSampling2D
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(
-            Convolution2D(input_shape=(64, 64, 3), nb_filter=32, nb_row=5, nb_col=5)
-        )
-        model.add(UpSampling2D(size=(2, 2)))
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertCountEqual(
-            self, input_names, [x.name for x in spec.description.input]
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertCountEqual(
-            self, output_names, [x.name for x in spec.description.output]
-        )
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.convolution)
-        layer_1 = layers[1]
-        self.assertIsNotNone(layer_1.upsample)
-
-    def test_pooling(self):
-        """
-        Test the conversion of pooling layer.
-        """
-        from keras.layers import Convolution2D, MaxPooling2D
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(
-            Convolution2D(
-                input_shape=(64, 64, 3),
-                nb_filter=32,
-                nb_row=5,
-                nb_col=5,
-                init="glorot_uniform",
-                activation=None,
-                weights=None,
-                border_mode="valid",
-                subsample=(1, 1),
-                bias=True,
-            )
-        )
-        model.add(MaxPooling2D(pool_size=(2, 2)))
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertCountEqual(
-            self, input_names, [x.name for x in spec.description.input]
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertCountEqual(
-            self, output_names, [x.name for x in spec.description.output]
-        )
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.convolution)
-
-    def test_permute(self):
-        """
-        Test the conversion of pooling layer.
-        """
-        from keras.layers.core import Permute
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(Permute((3, 2, 1), input_shape=(10, 64, 3)))
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertCountEqual(
-            self, input_names, [x.name for x in spec.description.input]
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertCountEqual(
-            self, output_names, [x.name for x in spec.description.output]
-        )
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.permute)
-
-    def test_lstm(self):
-        """
-        Test the conversion of an LSTM layer.
-        """
-        from keras.layers import LSTM
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(LSTM(32, input_dim=24, input_length=10))
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-
-        print(spec)
-
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names) + 2)
-
-        self.assertEqual(32, spec.description.input[1].type.multiArrayType.shape[0])
-        self.assertEqual(32, spec.description.input[2].type.multiArrayType.shape[0])
-
-        self.assertEqual(len(spec.description.output), len(output_names) + 2)
-        self.assertEqual(output_names[0], spec.description.output[0].name)
-        self.assertEqual(32, spec.description.output[0].type.multiArrayType.shape[0])
-
-        self.assertEqual(32, spec.description.output[1].type.multiArrayType.shape[0])
-        self.assertEqual(32, spec.description.output[2].type.multiArrayType.shape[0])
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.uniDirectionalLSTM)
-        self.assertEqual(len(layer_0.input), 3)
-        self.assertEqual(len(layer_0.output), 3)
-
-    def test_simple_rnn(self):
-        """
-        Test the conversion of a simple RNN layer.
-        """
-        from keras.layers import SimpleRNN
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(SimpleRNN(32, input_dim=32, input_length=10))
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names) + 1)
-        self.assertEqual(input_names[0], spec.description.input[0].name)
-
-        self.assertEqual(32, spec.description.input[1].type.multiArrayType.shape[0])
-
-        self.assertEqual(len(spec.description.output), len(output_names) + 1)
-        self.assertEqual(output_names[0], spec.description.output[0].name)
-        self.assertEqual(32, spec.description.output[0].type.multiArrayType.shape[0])
-        self.assertEqual(32, spec.description.output[1].type.multiArrayType.shape[0])
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.simpleRecurrent)
-        self.assertEqual(len(layer_0.input), 2)
-        self.assertEqual(len(layer_0.output), 2)
-
-    def test_gru(self):
-        """
-        Test the conversion of a GRU layer.
-        """
-        from keras.layers import GRU
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(GRU(32, input_dim=32, input_length=10))
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names) + 1)
-        self.assertEqual(input_names[0], spec.description.input[0].name)
-
-        self.assertEqual(32, spec.description.input[1].type.multiArrayType.shape[0])
-
-        self.assertEqual(len(spec.description.output), len(output_names) + 1)
-        self.assertEqual(output_names[0], spec.description.output[0].name)
-        self.assertEqual(32, spec.description.output[0].type.multiArrayType.shape[0])
-        self.assertEqual(32, spec.description.output[1].type.multiArrayType.shape[0])
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.gru)
-        self.assertEqual(len(layer_0.input), 2)
-        self.assertEqual(len(layer_0.output), 2)
-
-    def test_bidir(self):
-        """
-        Test the conversion of a bidirectional layer
-        """
-        from keras.layers import LSTM
-        from keras.layers.wrappers import Bidirectional
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(
-            Bidirectional(LSTM(32, input_dim=32, input_length=10), input_shape=(10, 32))
-        )
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names) + 4)
-        self.assertEqual(input_names[0], spec.description.input[0].name)
-
-        self.assertEqual(32, spec.description.input[1].type.multiArrayType.shape[0])
-        self.assertEqual(32, spec.description.input[2].type.multiArrayType.shape[0])
-        self.assertEqual(32, spec.description.input[3].type.multiArrayType.shape[0])
-        self.assertEqual(32, spec.description.input[4].type.multiArrayType.shape[0])
-
-        self.assertEqual(len(spec.description.output), len(output_names) + 4)
-        self.assertEqual(output_names[0], spec.description.output[0].name)
-        self.assertEqual(64, spec.description.output[0].type.multiArrayType.shape[0])
-
-        self.assertEqual(32, spec.description.output[1].type.multiArrayType.shape[0])
-        self.assertEqual(32, spec.description.output[2].type.multiArrayType.shape[0])
-        self.assertEqual(32, spec.description.output[3].type.multiArrayType.shape[0])
-        self.assertEqual(32, spec.description.output[4].type.multiArrayType.shape[0])
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.biDirectionalLSTM)
-        self.assertEqual(len(layer_0.input), 5)
-        self.assertEqual(len(layer_0.output), 5)
-
-    def test_embedding(self):
-        from keras.layers import Embedding
-
-        model = Sequential()
-        num_inputs = 10
-        num_outputs = 3
-        model.add(Embedding(num_inputs, num_outputs, input_length=5))
-
-        input_names = ["input"]
-        output_names = ["output"]
-
-        spec = keras.convert(model, input_names, output_names).get_spec()
-
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.embedding)
-
-        self.assertEqual(layer_0.embedding.inputDim, num_inputs)
-        self.assertEqual(layer_0.embedding.outputChannels, num_outputs)
-
-        self.assertEqual(
-            len(layer_0.embedding.weights.floatValue), num_inputs * num_outputs
-        )
-
-    @unittest.skip
-    def test_sentiment_analysis(self):
-        """
-        Test the conversion for a Embedding + LSTM + Dense layer
-        """
-        from keras.layers import Dense, Embedding, LSTM
-
-        # Create a simple Keras model
-        max_features = 50
-        embedded_dim = 32
-        sequence_length = 10
-
-        model = Sequential()
-        # Embedding layer example:
-        # Embedding(1000, 64, input_length=10) input_dim=index(0~999), 64-dimensional vector, sequence length = 10
-        # If we have Dense/Flatten layer upstream, input_length, a.k.a sequence_length is required
-
-        model.add(Embedding(max_features, embedded_dim, input_length=sequence_length))
-        # output_dim = 32
-        model.add(LSTM(32))
-        model.add(Dense(1, activation="sigmoid"))
-
-        # Input/output
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertCountEqual(
-            self, input_names, [x.name for x in spec.description.input]
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertCountEqual(
-            self, output_names, [x.name for x in spec.description.output]
-        )
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[0].innerProduct)
-        self.assertIsNotNone(layers[1].recurrent)
-        self.assertIsNotNone(layers[2].innerProduct)
-
-    @unittest.skip
-    def test_conv1d_lstm(self):
-        from keras.layers import Convolution1D, LSTM, Dense
-
-        model = Sequential()
-        # input_shape = (time_step, dimensions)
-        model.add(Convolution1D(32, 3, border_mode="same", input_shape=(10, 8)))
-        # conv1d output shape = (None, 10, 32)
-        model.add(LSTM(24))
-        model.add(Dense(1, activation="sigmoid"))
-        print("model.layers[1].output_shape=", model.layers[1].output_shape)
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-
-        self.assertIsNotNone(spec)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertCountEqual(
-            self, input_names, [x.name for x in spec.description.input]
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertCountEqual(
-            self, output_names, [x.name for x in spec.description.output]
-        )
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[0].convolution)
-        self.assertIsNotNone(layers[1].simpleRecurrent)
-        self.assertIsNotNone(layers[2].innerProduct)
-
-    def test_batchnorm(self):
-        """
-        Test the conversion for a Convoultion2D + Batchnorm layer
-        """
-        from keras.layers import Convolution2D
-        from keras.layers.normalization import BatchNormalization
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(
-            Convolution2D(
-                input_shape=(64, 64, 3),
-                nb_filter=32,
-                nb_row=5,
-                nb_col=5,
-                init="glorot_uniform",
-                activation=None,
-                weights=None,
-                border_mode="valid",
-                subsample=(1, 1),
-                bias=True,
-            )
-        )
-        # epsilon in CoreML is currently fixed at 1e-5
-        model.add(BatchNormalization(epsilon=1e-5))
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertCountEqual(
-            self, input_names, [x.name for x in spec.description.input]
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertCountEqual(
-            self, output_names, [x.name for x in spec.description.output]
-        )
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[0].convolution)
-        self.assertIsNotNone(layers[1].batchnorm)
-
-    def test_repeat_vector(self):
-        from keras.layers import RepeatVector
-
-        model = Sequential()
-        model.add(RepeatVector(3, input_shape=(5,)))
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertCountEqual(
-            self, input_names, [x.name for x in spec.description.input]
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertCountEqual(
-            self, output_names, [x.name for x in spec.description.output]
-        )
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[0].sequenceRepeat)
-
-    @pytest.mark.xfail(raises=ValueError)
-    def test_unsupported_variational_deconv(self):
-        from keras.layers import Input, Lambda, Convolution2D, Flatten, Dense
-
-        x = Input(shape=(8, 8, 3))
-        conv_1 = Convolution2D(4, 2, 2, border_mode="same", activation="relu")(x)
-        flat = Flatten()(conv_1)
-        hidden = Dense(10, activation="relu")(flat)
-        z_mean = Dense(10)(hidden)
-        z_log_var = Dense(10)(hidden)
-
-        def sampling(args):
-            z_mean, z_log_var = args
-            return z_mean + z_log_var
-
-        z = Lambda(sampling, output_shape=(10,))([z_mean, z_log_var])
-        model = Model([x], [z])
-        spec = keras.convert(model, ["input"], ["output"]).get_spec()
-
-    def test_image_processing(self):
-        """
-        Test the image-processing parameters.
-        """
-        from keras.layers import Convolution2D
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(
-            Convolution2D(
-                input_shape=(64, 64, 3),
-                nb_filter=32,
-                nb_row=5,
-                nb_col=5,
-                init="glorot_uniform",
-                activation=None,
-                weights=None,
-                border_mode="valid",
-                subsample=(1, 1),
-                bias=True,
-            )
-        )
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(
-            model,
-            input_names,
-            output_names,
-            image_input_names=["input"],
-            red_bias=110.0,
-            blue_bias=117.0,
-            green_bias=120.0,
-            is_bgr=True,
-            image_scale=1.0,
-        ).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-        self.assertEqual(
-            spec.description.input[0].type.WhichOneof("Type"), "imageType"
-        )
-        self.assertEqual(
-            spec.description.input[0].type.imageType.colorSpace,
-            FeatureTypes_pb2.ImageFeatureType.ColorSpace.Value("BGR"),
-        )
-
-        # Test the layer parameters.
-        preprocessing = spec.neuralNetwork.preprocessing[0]
-        self.assertTrue(preprocessing.HasField("scaler"))
-        pr_0 = preprocessing.scaler
-        print("pr_0.channelScale = ", pr_0.channelScale)
-        print("pr_0.redBias = ", pr_0.redBias)
-        print("pr_0.blueBias = ", pr_0.blueBias)
-        print("pr_0.greenBias = ", pr_0.greenBias)
-        self.assertIsNotNone(pr_0.redBias)
-        self.assertIsNotNone(pr_0.greenBias)
-        self.assertIsNotNone(pr_0.blueBias)
-        self.assertIsNotNone(pr_0.channelScale)
-        self.assertEqual(pr_0.channelScale, 1.0)
-        self.assertEqual(pr_0.redBias, 110.0)
-        self.assertEqual(pr_0.blueBias, 117.0)
-        self.assertEqual(pr_0.greenBias, 120.0)
-
-        # Configuration 2: isbgr = False
-        spec = keras.convert(
-            model,
-            input_names,
-            output_names,
-            image_input_names=["input"],
-            red_bias=110.0,
-            blue_bias=117.0,
-            green_bias=120.0,
-            is_bgr=False,
-            image_scale=1.0,
-        ).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-        self.assertEqual(
-            spec.description.input[0].type.WhichOneof("Type"), "imageType"
-        )
-        self.assertEqual(
-            spec.description.input[0].type.imageType.colorSpace,
-            FeatureTypes_pb2.ImageFeatureType.ColorSpace.Value("RGB"),
-        )
-
-        # Test the layer parameters.
-        preprocessing = spec.neuralNetwork.preprocessing[0]
-        self.assertTrue(preprocessing.HasField("scaler"))
-        pr_0 = preprocessing.scaler
-        self.assertIsNotNone(pr_0.redBias)
-        self.assertIsNotNone(pr_0.greenBias)
-        self.assertIsNotNone(pr_0.blueBias)
-        self.assertIsNotNone(pr_0.channelScale)
-        self.assertEqual(pr_0.channelScale, 1.0)
-        self.assertEqual(pr_0.redBias, 110.0)
-        self.assertEqual(pr_0.blueBias, 117.0)
-        self.assertEqual(pr_0.greenBias, 120.0)
-
-        # Configuration 3: Defaults
-        spec = keras.convert(
-            model,
-            input_names,
-            output_names,
-            image_input_names=["input"],
-            is_bgr=False,
-            image_scale=1.0,
-        ).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-        self.assertEqual(
-            spec.description.input[0].type.WhichOneof("Type"), "imageType"
-        )
-        self.assertEqual(
-            spec.description.input[0].type.imageType.colorSpace,
-            FeatureTypes_pb2.ImageFeatureType.ColorSpace.Value("RGB"),
-        )
-
-        # Test the layer parameters.
-        preprocessing = spec.neuralNetwork.preprocessing[0]
-        self.assertTrue(preprocessing.HasField("scaler"))
-        pr_0 = preprocessing.scaler
-        self.assertIsNotNone(pr_0.redBias)
-        self.assertIsNotNone(pr_0.greenBias)
-        self.assertIsNotNone(pr_0.blueBias)
-        self.assertIsNotNone(pr_0.channelScale)
-        self.assertEqual(pr_0.channelScale, 1.0)
-        self.assertEqual(pr_0.redBias, 0.0)
-        self.assertEqual(pr_0.blueBias, 0.0)
-        self.assertEqual(pr_0.greenBias, 0.0)
-
-    def test_classifier_string_classes(self):
-        from keras.layers import Dense
-        from keras.layers import Activation
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(Dense(32, input_dim=16))
-        model.add(Activation("softmax"))
-        classes = ["c%s" % i for i in range(32)]
-
-        input_names = ["input"]
-        output_names = ["prob_output"]
-        expected_output_names = ["prob_output", "classLabel"]
-        spec = keras.convert(
-            model, input_names, output_names, class_labels=classes
-        ).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetworkClassifier"))
-        self.assertFalse(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertCountEqual(
-            self, input_names, [x.name for x in spec.description.input]
-        )
-        self.assertEqual(len(spec.description.output), len(expected_output_names))
-        self.assertEqual(
-            expected_output_names, [x.name for x in spec.description.output]
-        )
-
-        # Check the types
-        self.assertEqual(
-            spec.description.output[0].type.WhichOneof("Type"), "dictionaryType"
-        )
-        self.assertEqual(
-            spec.description.output[0].type.dictionaryType.WhichOneof("KeyType"),
-            "stringKeyType",
-        )
-        self.assertEqual(
-            spec.description.output[1].type.WhichOneof("Type"), "stringType"
-        )
-        self.assertTrue(spec.description.predictedFeatureName, "classLabel")
-        self.assertTrue(spec.description.predictedProbabilitiesName, "prob_output")
-
-        # Test the class parameters
-        self.assertEqual(
-            spec.WhichOneof("Type"),
-            "neuralNetworkClassifier",
-            "Expected a NN classifier model",
-        )
-        self.assertEqual(
-            spec.neuralNetworkClassifier.WhichOneof("ClassLabels"), "stringClassLabels"
-        )
-        class_from_proto = list(spec.neuralNetworkClassifier.stringClassLabels.vector)
-        self.assertCountEqual(self, classes, class_from_proto)
-
-    def test_classifier_file(self):
-        from keras.layers import Dense
-        from keras.layers import Activation
-        import os
-        import tempfile
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(Dense(32, input_dim=16))
-        model.add(Activation("softmax"))
-        classes = ["c%s" % i for i in range(32)]
-        classes_file = tempfile.mktemp()
-        with open(classes_file, "w") as f:
-            f.write("\n".join(classes))
-
-        input_names = ["input"]
-        output_names = ["prob_output"]
-        expected_output_names = ["prob_output", "classLabel"]
-        spec = keras.convert(
-            model, input_names, output_names, class_labels=classes
-        ).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetworkClassifier"))
-        self.assertFalse(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertCountEqual(
-            self, input_names, [x.name for x in spec.description.input]
-        )
-        self.assertEqual(len(spec.description.output), len(expected_output_names))
-        self.assertEqual(
-            expected_output_names, [x.name for x in spec.description.output]
-        )
-
-        # Check the types
-        self.assertEqual(
-            spec.description.output[0].type.WhichOneof("Type"), "dictionaryType"
-        )
-        self.assertEqual(
-            spec.description.output[0].type.dictionaryType.WhichOneof("KeyType"),
-            "stringKeyType",
-        )
-        self.assertEqual(
-            spec.description.output[1].type.WhichOneof("Type"), "stringType"
-        )
-        self.assertTrue(spec.description.predictedFeatureName, "classLabel")
-        self.assertTrue(spec.description.predictedProbabilitiesName, "prob_output")
-
-        # cleanup
-        os.remove(classes_file)
-
-    def test_classifier_integer_classes(self):
-        from keras.layers import Dense
-        from keras.layers import Activation
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(Dense(32, input_dim=16))
-        model.add(Activation("softmax"))
-        classes = list(range(32))
-
-        input_names = ["input"]
-        output_names = ["prob_output"]
-        expected_output_names = ["prob_output", "classLabel"]
-        spec = keras.convert(
-            model, input_names, output_names, class_labels=classes
-        ).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetworkClassifier"))
-        self.assertFalse(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertCountEqual(
-            self, input_names, [x.name for x in spec.description.input]
-        )
-        self.assertEqual(len(spec.description.output), len(expected_output_names))
-        self.assertEqual(
-            expected_output_names, [x.name for x in spec.description.output]
-        )
-
-        # Check the types
-        self.assertEqual(
-            spec.description.output[0].type.WhichOneof("Type"), "dictionaryType"
-        )
-        self.assertEqual(
-            spec.description.output[0].type.dictionaryType.WhichOneof("KeyType"),
-            "int64KeyType",
-        )
-        self.assertEqual(
-            spec.description.output[1].type.WhichOneof("Type"), "int64Type"
-        )
-        self.assertTrue(spec.description.predictedFeatureName, "classLabel")
-        self.assertTrue(spec.description.predictedProbabilitiesName, "prob_output")
-
-        # Test the class parameters
-        self.assertEqual(
-            spec.WhichOneof("Type"),
-            "neuralNetworkClassifier",
-            "Expected a NN classifier model",
-        )
-        self.assertEqual(
-            spec.neuralNetworkClassifier.WhichOneof("ClassLabels"), "int64ClassLabels"
-        )
-        class_from_proto = list(spec.neuralNetworkClassifier.int64ClassLabels.vector)
-        self.assertCountEqual(self, classes, class_from_proto)
-
-    def test_classifier_custom_class_name(self):
-        from keras.layers import Dense
-        from keras.layers import Activation
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(Dense(32, input_dim=16))
-        model.add(Activation("softmax"))
-        classes = ["c%s" % i for i in range(32)]
-
-        input_names = ["input"]
-        output_names = ["prob_output"]
-        expected_output_names = ["prob_output", "my_foo_bar_class_output"]
-        spec = keras.convert(
-            model,
-            input_names,
-            output_names,
-            class_labels=classes,
-            predicted_feature_name="my_foo_bar_class_output",
-        ).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetworkClassifier"))
-        self.assertFalse(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertCountEqual(
-            self, input_names, [x.name for x in spec.description.input]
-        )
-        self.assertEqual(len(spec.description.output), len(expected_output_names))
-        self.assertEqual(
-            expected_output_names, [x.name for x in spec.description.output]
-        )
-
-        # Check the types
-        self.assertEqual(
-            spec.description.output[0].type.WhichOneof("Type"), "dictionaryType"
-        )
-        self.assertEqual(
-            spec.description.output[0].type.dictionaryType.WhichOneof("KeyType"),
-            "stringKeyType",
-        )
-        self.assertEqual(
-            spec.description.output[1].type.WhichOneof("Type"), "stringType"
-        )
-        self.assertTrue(
-            spec.description.predictedFeatureName, "my_foo_bar_class_output"
-        )
-        self.assertTrue(spec.description.predictedProbabilitiesName, "prob_output")
-
-        # Test the class parameters
-        self.assertEqual(
-            spec.WhichOneof("Type"),
-            "neuralNetworkClassifier",
-            "Expected a NN classifier model",
-        )
-        self.assertEqual(
-            spec.neuralNetworkClassifier.WhichOneof("ClassLabels"), "stringClassLabels"
-        )
-        class_from_proto = list(spec.neuralNetworkClassifier.stringClassLabels.vector)
-        self.assertCountEqual(self, classes, class_from_proto)
-
-    def test_default_interface_names(self):
-        from keras.layers import Dense
-        from keras.layers import Activation
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(Dense(32, input_dim=16))
-        model.add(Activation("softmax"))
-
-        expected_input_names = ["input1"]
-        expected_output_names = ["output1"]
-        spec = keras.convert(model).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(expected_input_names))
-        self.assertCountEqual(
-            self, expected_input_names, [x.name for x in spec.description.input]
-        )
-        self.assertEqual(len(spec.description.output), len(expected_output_names))
-        self.assertEqual(
-            expected_output_names, [x.name for x in spec.description.output]
-        )
diff --git a/coremltools/test/neural_network/test_keras2.py b/coremltools/test/neural_network/test_keras2.py
deleted file mode 100644
index bbc99fcbe..000000000
--- a/coremltools/test/neural_network/test_keras2.py
+++ /dev/null
@@ -1,1594 +0,0 @@
-# Copyright (c) 2021, Apple Inc. All rights reserved.
-#
-# Use of this source code is governed by a BSD-3-clause license that can be
-# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-
-import unittest
-
-import pytest
-
-from coremltools._deps import _HAS_KERAS2_TF
-from coremltools.proto import (
-    FeatureTypes_pb2,
-    Model_pb2,
-    NeuralNetwork_pb2
-)
-
-if _HAS_KERAS2_TF:
-    import tensorflow as tf
-    from keras.models import Sequential, Model
-    from coremltools.converters import keras
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-class KerasSingleLayerTest(unittest.TestCase):
-    """
-    Unit test class for testing scikit-learn converter.
-    """
-
-    @classmethod
-    def setUpClass(self):
-        """
-        Set up the unit test by loading common utilities.
-        """
-
-    def test_dense(self):
-        """
-        Test the conversion of Dense layer.
-        """
-        from keras.layers import Dense
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(Dense(32, input_dim=16))
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertEqual(
-            sorted(input_names), sorted(map(lambda x: x.name, spec.description.input))
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertEqual(
-            sorted(output_names), sorted(map(lambda x: x.name, spec.description.output))
-        )
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.innerProduct)
-
-    def test_activations(self):
-        """
-        Test the conversion for a Dense + Activation('something')
-        """
-        from keras.layers import Dense, Activation
-
-        # Create a simple Keras model
-        keras_activation_options = [
-            "elu",
-            "tanh",
-            "softplus",
-            "softsign",
-            "relu",
-            "sigmoid",
-            "hard_sigmoid",
-            "linear",
-        ]
-        coreml_activation_options = [
-            "ELU",
-            "tanh",
-            "softplus",
-            "softsign",
-            "ReLU",
-            "sigmoid",
-            "sigmoidHard",
-            "linear",
-        ]
-
-        for i, k_act in enumerate(keras_activation_options):
-            c_act = coreml_activation_options[i]
-            model = Sequential()
-            model.add(Dense(32, input_dim=16))
-            model.add(Activation(k_act))
-
-            input_names = ["input"]
-            output_names = ["output"]
-            spec = keras.convert(model, input_names, output_names).get_spec()
-            self.assertIsNotNone(spec)
-
-            # Test the model class
-            self.assertIsNotNone(spec.description)
-            self.assertTrue(spec.HasField("neuralNetwork"))
-
-            # Test the inputs and outputs
-            self.assertEqual(len(spec.description.input), len(input_names))
-            self.assertEqual(
-                sorted(input_names),
-                sorted(map(lambda x: x.name, spec.description.input)),
-            )
-            self.assertEqual(len(spec.description.output), len(output_names))
-            self.assertEqual(
-                sorted(output_names),
-                sorted(map(lambda x: x.name, spec.description.output)),
-            )
-
-            # Test the layer parameters.
-            layers = spec.neuralNetwork.layers
-            self.assertIsNotNone(layers[0].innerProduct)
-            self.assertIsNotNone(layers[1].activation)
-            self.assertTrue(layers[1].activation.HasField(c_act))
-
-    def test_activation_softmax(self):
-        """
-        Test the conversion for a Dense + Activation('softmax')
-        """
-        from keras.layers import Dense, Activation
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(Dense(32, input_dim=16))
-        model.add(Activation("softmax"))
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertEqual(
-            sorted(input_names), sorted(map(lambda x: x.name, spec.description.input))
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertEqual(
-            sorted(output_names), sorted(map(lambda x: x.name, spec.description.output))
-        )
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.innerProduct)
-        layer_1 = layers[1]
-        self.assertIsNotNone(layer_1.softmax)
-
-    def test_dropout(self):
-        """
-        Test the conversion for a Dense + Dropout
-        """
-        from keras.layers import Dense, Dropout
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(Dense(32, input_shape=(16,)))
-        model.add(Dropout(0.5))
-        model.add(Dense(32, input_shape=(16,)))
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertEqual(
-            sorted(input_names), sorted(map(lambda x: x.name, spec.description.input))
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertEqual(
-            sorted(output_names), sorted(map(lambda x: x.name, spec.description.output))
-        )
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.innerProduct)
-        self.assertEqual(len(layers), 2)
-
-    def test_convolution(self, with_dilations=False):
-        """
-        Test the conversion of 2D convolutional layer.
-        """
-        from keras.layers import Conv2D
-
-        dilation_rate = [1, 1]
-        if with_dilations:
-            dilation_rate = [2, 2]
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=(64, 64, 3),
-                filters=32,
-                kernel_size=(5, 5),
-                activation=None,
-                padding="valid",
-                strides=(1, 1),
-                use_bias=True,
-                dilation_rate=dilation_rate,
-            )
-        )
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertEqual(
-            sorted(input_names), sorted(map(lambda x: x.name, spec.description.input))
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertEqual(
-            sorted(output_names), sorted(map(lambda x: x.name, spec.description.output))
-        )
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.convolution)
-        self.assertEqual(layer_0.convolution.dilationFactor, dilation_rate)
-
-    def test_convolution_dilated(self):
-        """
-        Test the conversion of 2D convolutional layer with dilated kernels
-        """
-        self.test_convolution(with_dilations=True)
-
-    def test_separable_convolution(self, with_dilations=False, activation=None):
-        """
-        Test the conversion of 2D depthwise separable convolutional layer.
-        """
-        from keras.layers import SeparableConv2D
-
-        dilation_rate = [1, 1]
-        if with_dilations:
-            dilation_rate = [2, 2]
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(
-            SeparableConv2D(
-                input_shape=(64, 64, 3),
-                filters=32,
-                kernel_size=(5, 5),
-                activation=activation,
-                padding="valid",
-                strides=(1, 1),
-                use_bias=True,
-                dilation_rate=dilation_rate,
-            )
-        )
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertEqual(
-            sorted(input_names), sorted(map(lambda x: x.name, spec.description.input))
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertEqual(
-            sorted(output_names), sorted(map(lambda x: x.name, spec.description.output))
-        )
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_depthwise, layer_pointwise = layers[0], layers[1]
-
-        self.assertIsNotNone(layer_depthwise.convolution)
-        self.assertIsNotNone(layer_pointwise.convolution)
-        self.assertEqual(layer_depthwise.convolution.dilationFactor, dilation_rate)
-        if activation is not None:
-            self.assertIsNotNone(layers[2].activation)
-            self.assertTrue(layers[2].activation.HasField("ELU"))
-
-    def test_separable_convolution_dilated(self):
-        """
-        Test the conversion of 2D depthwise separable convolutional layer with dilated kernels.
-        """
-        self.test_separable_convolution(with_dilations=True)
-
-    def test_separable_convolution_with_nonlinearity(self):
-        """
-        Test the conversion of 2D depthwise separable convolutional layer with nonlinearity.
-        """
-        self.test_separable_convolution(activation="elu")
-
-    def test_upsample(self):
-        """
-        Test the conversion of 2D convolutional layer + upsample
-        """
-        from keras.layers import Conv2D, UpSampling2D
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(Conv2D(input_shape=(64, 64, 3), filters=32, kernel_size=(5, 5)))
-        model.add(UpSampling2D(size=(2, 2)))
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertEqual(
-            sorted(input_names), sorted(map(lambda x: x.name, spec.description.input))
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertEqual(
-            sorted(output_names), sorted(map(lambda x: x.name, spec.description.output))
-        )
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.convolution)
-        layer_1 = layers[1]
-        self.assertIsNotNone(layer_1.upsample)
-        self.assertEqual(
-            layer_1.upsample.mode,
-            NeuralNetwork_pb2.UpsampleLayerParams.InterpolationMode.Value("NN"),
-        )
-
-        # Test if BILINEAR mode works as well
-        model = Sequential()
-        model.add(Conv2D(input_shape=(64, 64, 3), filters=32, kernel_size=(5, 5)))
-        try:
-            model.add(UpSampling2D(size=(2, 2), interpolation="bilinear"))
-        except TypeError:  # Early version of Keras, no support for 'interpolation'
-            return
-
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-        layers = spec.neuralNetwork.layers
-        layer_1 = layers[1]
-        self.assertIsNotNone(layer_1.upsample)
-        self.assertEqual(
-            layer_1.upsample.mode,
-            NeuralNetwork_pb2.UpsampleLayerParams.InterpolationMode.Value("BILINEAR"),
-        )
-
-    def test_pooling(self):
-        """
-        Test the conversion of pooling layer.
-        """
-        from keras.layers import Conv2D, MaxPooling2D
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=(64, 64, 3),
-                filters=32,
-                kernel_size=(5, 5),
-                strides=(1, 1),
-                activation=None,
-                padding="valid",
-                use_bias=True,
-            )
-        )
-        model.add(MaxPooling2D(pool_size=(2, 2)))
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertEqual(
-            sorted(input_names), sorted(map(lambda x: x.name, spec.description.input))
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertEqual(
-            sorted(output_names), sorted(map(lambda x: x.name, spec.description.output))
-        )
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[1].pooling)
-
-    def test_permute(self):
-        """
-        Test the conversion of pooling layer.
-        """
-        from keras.layers.core import Permute
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(Permute((3, 2, 1), input_shape=(10, 64, 3)))
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertEqual(
-            sorted(input_names), sorted(map(lambda x: x.name, spec.description.input))
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertEqual(
-            sorted(output_names), sorted(map(lambda x: x.name, spec.description.output))
-        )
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.permute)
-
-    def test_lstm(self):
-        """
-        Test the conversion of an LSTM layer.
-        """
-        from keras.layers import LSTM
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(LSTM(32, input_shape=(10, 24)))
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-
-        print(spec)
-
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names) + 2)
-
-        self.assertEqual(32, spec.description.input[1].type.multiArrayType.shape[0])
-        self.assertEqual(32, spec.description.input[2].type.multiArrayType.shape[0])
-
-        self.assertEqual(len(spec.description.output), len(output_names) + 2)
-        self.assertEqual(output_names[0], spec.description.output[0].name)
-        self.assertEqual(32, spec.description.output[0].type.multiArrayType.shape[0])
-
-        self.assertEqual(32, spec.description.output[1].type.multiArrayType.shape[0])
-        self.assertEqual(32, spec.description.output[2].type.multiArrayType.shape[0])
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.uniDirectionalLSTM)
-        self.assertEqual(len(layer_0.input), 3)
-        self.assertEqual(len(layer_0.output), 3)
-
-    def test_simple_rnn(self):
-        """
-        Test the conversion of a simple RNN layer.
-        """
-        from keras.layers import SimpleRNN
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(SimpleRNN(32, input_shape=(10, 32)))
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names) + 1)
-        self.assertEqual(input_names[0], spec.description.input[0].name)
-
-        self.assertEqual(32, spec.description.input[1].type.multiArrayType.shape[0])
-
-        self.assertEqual(len(spec.description.output), len(output_names) + 1)
-        self.assertEqual(output_names[0], spec.description.output[0].name)
-        self.assertEqual(32, spec.description.output[0].type.multiArrayType.shape[0])
-        self.assertEqual(32, spec.description.output[1].type.multiArrayType.shape[0])
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.simpleRecurrent)
-        self.assertEqual(len(layer_0.input), 2)
-        self.assertEqual(len(layer_0.output), 2)
-
-    def test_gru(self):
-        """
-        Test the conversion of a GRU layer.
-        """
-        from keras.layers import GRU
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(GRU(32, input_shape=(32, 10)))
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names) + 1)
-        self.assertEqual(input_names[0], spec.description.input[0].name)
-
-        self.assertEqual(32, spec.description.input[1].type.multiArrayType.shape[0])
-
-        self.assertEqual(len(spec.description.output), len(output_names) + 1)
-        self.assertEqual(output_names[0], spec.description.output[0].name)
-        self.assertEqual(32, spec.description.output[0].type.multiArrayType.shape[0])
-        self.assertEqual(32, spec.description.output[1].type.multiArrayType.shape[0])
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.gru)
-        self.assertEqual(len(layer_0.input), 2)
-        self.assertEqual(len(layer_0.output), 2)
-
-    def test_bidir(self):
-        """
-        Test the conversion of a bidirectional layer
-        """
-        from keras.layers import LSTM
-        from keras.layers.wrappers import Bidirectional
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(Bidirectional(LSTM(32, input_shape=(10, 32)), input_shape=(10, 32)))
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names) + 4)
-        self.assertEqual(input_names[0], spec.description.input[0].name)
-
-        self.assertEqual(32, spec.description.input[1].type.multiArrayType.shape[0])
-        self.assertEqual(32, spec.description.input[2].type.multiArrayType.shape[0])
-        self.assertEqual(32, spec.description.input[3].type.multiArrayType.shape[0])
-        self.assertEqual(32, spec.description.input[4].type.multiArrayType.shape[0])
-
-        self.assertEqual(len(spec.description.output), len(output_names) + 4)
-        self.assertEqual(output_names[0], spec.description.output[0].name)
-        self.assertEqual(64, spec.description.output[0].type.multiArrayType.shape[0])
-
-        self.assertEqual(32, spec.description.output[1].type.multiArrayType.shape[0])
-        self.assertEqual(32, spec.description.output[2].type.multiArrayType.shape[0])
-        self.assertEqual(32, spec.description.output[3].type.multiArrayType.shape[0])
-        self.assertEqual(32, spec.description.output[4].type.multiArrayType.shape[0])
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.biDirectionalLSTM)
-        self.assertEqual(len(layer_0.input), 5)
-        self.assertEqual(len(layer_0.output), 5)
-
-    def test_embedding(self):
-        from keras.layers import Embedding
-
-        model = Sequential()
-        num_inputs = 10
-        num_outputs = 3
-        model.add(Embedding(num_inputs, num_outputs, input_length=5))
-
-        input_names = ["input"]
-        output_names = ["output"]
-
-        spec = keras.convert(model, input_names, output_names).get_spec()
-
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        layer_0 = layers[0]
-        self.assertIsNotNone(layer_0.embedding)
-
-        self.assertEqual(layer_0.embedding.inputDim, num_inputs)
-        self.assertEqual(layer_0.embedding.outputChannels, num_outputs)
-
-        self.assertEqual(
-            len(layer_0.embedding.weights.floatValue), num_inputs * num_outputs
-        )
-
-    def test_sentiment_analysis(self):
-        """
-        Test the conversion for a Embedding + LSTM + Dense layer
-        """
-        from keras.layers import Dense, Embedding, LSTM
-
-        # Create a simple Keras model
-        max_features = 50
-        embedded_dim = 32
-        sequence_length = 10
-
-        model = Sequential()
-        # Embedding layer example:
-        # Embedding(1000, 64, input_length=10) input_dim=index(0~999), 64-dimensional vector, sequence length = 10
-        # If we have Dense/Flatten layer upstream, input_length, a.k.a sequence_length is required
-
-        model.add(Embedding(max_features, embedded_dim, input_length=sequence_length))
-        # output_dim = 32
-        model.add(LSTM(32))
-        model.add(Dense(1, activation="sigmoid"))
-
-        # Input/output
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        # We're giving state input and output so expect description to differ.
-        self.assertEqual(len(spec.description.input), len(input_names) + 2)
-        self.assertEqual(len(spec.description.output), len(output_names) + 2)
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[0].embedding)
-        self.assertIsNotNone(layers[1].uniDirectionalLSTM)
-        self.assertIsNotNone(layers[2].innerProduct)
-
-    def test_conv1d_lstm(self):
-        from keras.layers import Conv1D, LSTM, Dense
-
-        model = Sequential()
-        # input_shape = (time_step, dimensions)
-        model.add(Conv1D(32, 3, padding="same", input_shape=(10, 8)))
-        # conv1d output shape = (None, 10, 32)
-        model.add(LSTM(24))
-        model.add(Dense(1, activation="sigmoid"))
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-
-        self.assertIsNotNone(spec)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names) + 2)
-        self.assertEqual(len(spec.description.output), len(output_names) + 2)
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[0].convolution)
-        self.assertIsNotNone(layers[1].simpleRecurrent)
-        self.assertIsNotNone(layers[2].innerProduct)
-
-    def test_batchnorm(self):
-        """
-        Test the conversion for a Convoultion2D + Batchnorm layer
-        """
-        from keras.layers import Conv2D
-        from keras.layers.normalization import BatchNormalization
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=(64, 64, 3),
-                filters=32,
-                kernel_size=(5, 5),
-                strides=(1, 1),
-                activation=None,
-                padding="valid",
-                use_bias=True,
-            )
-        )
-        # epsilon in CoreML is currently fixed at 1e-5
-        model.add(BatchNormalization(epsilon=1e-5))
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertEqual(
-            sorted(input_names), sorted(map(lambda x: x.name, spec.description.input))
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertEqual(
-            sorted(output_names), sorted(map(lambda x: x.name, spec.description.output))
-        )
-
-        # Test the layer parameters.
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[0].convolution)
-        self.assertIsNotNone(layers[1].batchnorm)
-
-    def test_repeat_vector(self):
-        from keras.layers import RepeatVector
-
-        model = Sequential()
-        model.add(RepeatVector(3, input_shape=(5,)))
-
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertEqual(
-            sorted(input_names), sorted(map(lambda x: x.name, spec.description.input))
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertEqual(
-            sorted(output_names), sorted(map(lambda x: x.name, spec.description.output))
-        )
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[0].sequenceRepeat)
-
-    @pytest.mark.xfail(raises=ValueError)
-    def test_unsupported_variational_deconv(self):
-        from keras.layers import Input, Lambda, Conv2D, Flatten, Dense
-
-        x = Input(shape=(8, 8, 3))
-        conv_1 = Conv2D(4, (2, 2), padding="same", activation="relu")(x)
-        flat = Flatten()(conv_1)
-        hidden = Dense(10, activation="relu")(flat)
-        z_mean = Dense(10)(hidden)
-        z_log_var = Dense(10)(hidden)
-
-        def sampling(args):
-            z_mean, z_log_var = args
-            return z_mean + z_log_var
-
-        z = Lambda(sampling, output_shape=(10,))([z_mean, z_log_var])
-        model = Model([x], [z])
-        spec = keras.convert(model, ["input"], ["output"]).get_spec()
-
-    def test_image_processing(self):
-        """
-        Test the image-processing parameters.
-        """
-        from keras.layers import Conv2D
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=(64, 64, 3),
-                filters=32,
-                kernel_size=(5, 5),
-                activation=None,
-                padding="valid",
-                strides=(1, 1),
-                use_bias=True,
-            )
-        )
-        input_names = ["input"]
-        output_names = ["output"]
-        spec = keras.convert(
-            model,
-            input_names,
-            output_names,
-            image_input_names=["input"],
-            red_bias=110.0,
-            blue_bias=117.0,
-            green_bias=120.0,
-            is_bgr=True,
-            image_scale=1.0,
-        ).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-        self.assertEqual(
-            spec.description.input[0].type.WhichOneof("Type"), "imageType"
-        )
-        self.assertEqual(
-            spec.description.input[0].type.imageType.colorSpace,
-            FeatureTypes_pb2.ImageFeatureType.ColorSpace.Value("BGR"),
-        )
-
-        # Test the layer parameters.
-        preprocessing = spec.neuralNetwork.preprocessing[0]
-        self.assertTrue(preprocessing.HasField("scaler"))
-        pr_0 = preprocessing.scaler
-        print("pr_0.channelScale = ", pr_0.channelScale)
-        print("pr_0.redBias = ", pr_0.redBias)
-        print("pr_0.blueBias = ", pr_0.blueBias)
-        print("pr_0.greenBias = ", pr_0.greenBias)
-        self.assertIsNotNone(pr_0.redBias)
-        self.assertIsNotNone(pr_0.greenBias)
-        self.assertIsNotNone(pr_0.blueBias)
-        self.assertIsNotNone(pr_0.channelScale)
-        self.assertEqual(pr_0.channelScale, 1.0)
-        self.assertEqual(pr_0.redBias, 110.0)
-        self.assertEqual(pr_0.blueBias, 117.0)
-        self.assertEqual(pr_0.greenBias, 120.0)
-
-        # Configuration 2: isbgr = False
-        spec = keras.convert(
-            model,
-            input_names,
-            output_names,
-            image_input_names=["input"],
-            red_bias=110.0,
-            blue_bias=117.0,
-            green_bias=120.0,
-            is_bgr=False,
-            image_scale=1.0,
-        ).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-        self.assertEqual(
-            spec.description.input[0].type.WhichOneof("Type"), "imageType"
-        )
-        self.assertEqual(
-            spec.description.input[0].type.imageType.colorSpace,
-            FeatureTypes_pb2.ImageFeatureType.ColorSpace.Value("RGB"),
-        )
-
-        # Test the layer parameters.
-        preprocessing = spec.neuralNetwork.preprocessing[0]
-        self.assertTrue(preprocessing.HasField("scaler"))
-        pr_0 = preprocessing.scaler
-        self.assertIsNotNone(pr_0.redBias)
-        self.assertIsNotNone(pr_0.greenBias)
-        self.assertIsNotNone(pr_0.blueBias)
-        self.assertIsNotNone(pr_0.channelScale)
-        self.assertEqual(pr_0.channelScale, 1.0)
-        self.assertEqual(pr_0.redBias, 110.0)
-        self.assertEqual(pr_0.blueBias, 117.0)
-        self.assertEqual(pr_0.greenBias, 120.0)
-
-        # Configuration 3: Defaults
-        spec = keras.convert(
-            model,
-            input_names,
-            output_names,
-            image_input_names=["input"],
-            is_bgr=False,
-            image_scale=1.0,
-        ).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-        self.assertEqual(
-            spec.description.input[0].type.WhichOneof("Type"), "imageType"
-        )
-        self.assertEqual(
-            spec.description.input[0].type.imageType.colorSpace,
-            FeatureTypes_pb2.ImageFeatureType.ColorSpace.Value("RGB"),
-        )
-
-        # Test the layer parameters.
-        preprocessing = spec.neuralNetwork.preprocessing[0]
-        self.assertTrue(preprocessing.HasField("scaler"))
-        pr_0 = preprocessing.scaler
-        self.assertIsNotNone(pr_0.redBias)
-        self.assertIsNotNone(pr_0.greenBias)
-        self.assertIsNotNone(pr_0.blueBias)
-        self.assertIsNotNone(pr_0.channelScale)
-        self.assertEqual(pr_0.channelScale, 1.0)
-        self.assertEqual(pr_0.redBias, 0.0)
-        self.assertEqual(pr_0.blueBias, 0.0)
-        self.assertEqual(pr_0.greenBias, 0.0)
-
-    def test_classifier_string_classes(self):
-        from keras.layers import Dense
-        from keras.layers import Activation
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(Dense(32, input_shape=(16,)))
-        model.add(Activation("softmax"))
-        classes = ["c%s" % i for i in range(32)]
-
-        input_names = ["input"]
-        output_names = ["prob_output"]
-        expected_output_names = ["prob_output", "classLabel"]
-        spec = keras.convert(
-            model, input_names, output_names, class_labels=classes
-        ).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetworkClassifier"))
-        self.assertFalse(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertEqual(
-            sorted(input_names), sorted(map(lambda x: x.name, spec.description.input))
-        )
-        self.assertEqual(len(spec.description.output), len(expected_output_names))
-        self.assertEqual(
-            expected_output_names, list(map(lambda x: x.name, spec.description.output))
-        )
-
-        # Check the types
-        self.assertEqual(
-            spec.description.output[0].type.WhichOneof("Type"), "dictionaryType"
-        )
-        self.assertEqual(
-            spec.description.output[0].type.dictionaryType.WhichOneof("KeyType"),
-            "stringKeyType",
-        )
-        self.assertEqual(
-            spec.description.output[1].type.WhichOneof("Type"), "stringType"
-        )
-        self.assertTrue(spec.description.predictedFeatureName, "classLabel")
-        self.assertTrue(spec.description.predictedProbabilitiesName, "prob_output")
-
-        # Test the class parameters
-        self.assertEqual(
-            spec.WhichOneof("Type"),
-            "neuralNetworkClassifier",
-            "Expected a NN classifier model",
-        )
-        self.assertEqual(
-            spec.neuralNetworkClassifier.WhichOneof("ClassLabels"), "stringClassLabels"
-        )
-        class_from_proto = list(spec.neuralNetworkClassifier.stringClassLabels.vector)
-        self.assertEqual(sorted(classes), sorted(class_from_proto))
-
-    def test_classifier_file(self):
-        from keras.layers import Dense
-        from keras.layers import Activation
-        import os
-        import tempfile
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(Dense(32, input_shape=(16,)))
-        model.add(Activation("softmax"))
-        classes = ["c%s" % i for i in range(32)]
-        classes_file = tempfile.mktemp()
-        with open(classes_file, "w") as f:
-            f.write("\n".join(classes))
-
-        input_names = ["input"]
-        output_names = ["prob_output"]
-        expected_output_names = ["prob_output", "classLabel"]
-        spec = keras.convert(
-            model, input_names, output_names, class_labels=classes
-        ).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetworkClassifier"))
-        self.assertFalse(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertEqual(
-            sorted(input_names), sorted(map(lambda x: x.name, spec.description.input))
-        )
-        self.assertEqual(len(spec.description.output), len(expected_output_names))
-        self.assertEqual(
-            expected_output_names, list(map(lambda x: x.name, spec.description.output))
-        )
-
-        # Check the types
-        self.assertEqual(
-            spec.description.output[0].type.WhichOneof("Type"), "dictionaryType"
-        )
-        self.assertEqual(
-            spec.description.output[0].type.dictionaryType.WhichOneof("KeyType"),
-            "stringKeyType",
-        )
-        self.assertEqual(
-            spec.description.output[1].type.WhichOneof("Type"), "stringType"
-        )
-        self.assertTrue(spec.description.predictedFeatureName, "classLabel")
-        self.assertTrue(spec.description.predictedProbabilitiesName, "prob_output")
-
-        # cleanup
-        os.remove(classes_file)
-
-    def test_classifier_integer_classes(self):
-        from keras.layers import Dense
-        from keras.layers import Activation
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(Dense(32, input_shape=(16,)))
-        model.add(Activation("softmax"))
-        classes = list(range(32))
-
-        input_names = ["input"]
-        output_names = ["prob_output"]
-        expected_output_names = ["prob_output", "classLabel"]
-        spec = keras.convert(
-            model, input_names, output_names, class_labels=classes
-        ).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetworkClassifier"))
-        self.assertFalse(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertEqual(
-            sorted(input_names), sorted(map(lambda x: x.name, spec.description.input))
-        )
-        self.assertEqual(len(spec.description.output), len(expected_output_names))
-        self.assertEqual(
-            expected_output_names, list(map(lambda x: x.name, spec.description.output))
-        )
-
-        # Check the types
-        self.assertEqual(
-            spec.description.output[0].type.WhichOneof("Type"), "dictionaryType"
-        )
-        self.assertEqual(
-            spec.description.output[0].type.dictionaryType.WhichOneof("KeyType"),
-            "int64KeyType",
-        )
-        self.assertEqual(
-            spec.description.output[1].type.WhichOneof("Type"), "int64Type"
-        )
-        self.assertTrue(spec.description.predictedFeatureName, "classLabel")
-        self.assertTrue(spec.description.predictedProbabilitiesName, "prob_output")
-
-        # Test the class parameters
-        self.assertEqual(
-            spec.WhichOneof("Type"),
-            "neuralNetworkClassifier",
-            "Expected a NN classifier model",
-        )
-        self.assertEqual(
-            spec.neuralNetworkClassifier.WhichOneof("ClassLabels"), "int64ClassLabels"
-        )
-        class_from_proto = list(spec.neuralNetworkClassifier.int64ClassLabels.vector)
-        self.assertEqual(sorted(classes), sorted(class_from_proto))
-
-    def test_classifier_custom_class_name(self):
-        from keras.layers import Dense
-        from keras.layers import Activation
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(Dense(32, input_shape=(16,)))
-        model.add(Activation("softmax"))
-        classes = ["c%s" % i for i in range(32)]
-
-        input_names = ["input"]
-        output_names = ["prob_output"]
-        expected_output_names = ["prob_output", "my_foo_bar_class_output"]
-        spec = keras.convert(
-            model,
-            input_names,
-            output_names,
-            class_labels=classes,
-            predicted_feature_name="my_foo_bar_class_output",
-        ).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetworkClassifier"))
-        self.assertFalse(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertEqual(
-            sorted(input_names), sorted(map(lambda x: x.name, spec.description.input))
-        )
-        self.assertEqual(len(spec.description.output), len(expected_output_names))
-        self.assertEqual(
-            expected_output_names, list(map(lambda x: x.name, spec.description.output))
-        )
-
-        # Check the types
-        self.assertEqual(
-            spec.description.output[0].type.WhichOneof("Type"), "dictionaryType"
-        )
-        self.assertEqual(
-            spec.description.output[0].type.dictionaryType.WhichOneof("KeyType"),
-            "stringKeyType",
-        )
-        self.assertEqual(
-            spec.description.output[1].type.WhichOneof("Type"), "stringType"
-        )
-        self.assertTrue(
-            spec.description.predictedFeatureName, "my_foo_bar_class_output"
-        )
-        self.assertTrue(spec.description.predictedProbabilitiesName, "prob_output")
-
-        # Test the class parameters
-        self.assertEqual(
-            spec.WhichOneof("Type"),
-            "neuralNetworkClassifier",
-            "Expected a NN classifier model",
-        )
-        self.assertEqual(
-            spec.neuralNetworkClassifier.WhichOneof("ClassLabels"), "stringClassLabels"
-        )
-        class_from_proto = list(spec.neuralNetworkClassifier.stringClassLabels.vector)
-        self.assertEqual(sorted(classes), sorted(class_from_proto))
-
-    def test_default_interface_names(self):
-        from keras.layers import Dense
-        from keras.layers import Activation
-
-        # Create a simple Keras model
-        model = Sequential()
-        model.add(Dense(32, input_shape=(16,)))
-        model.add(Activation("softmax"))
-
-        expected_input_names = ["input1"]
-        expected_output_names = ["output1"]
-        spec = keras.convert(model).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(expected_input_names))
-        self.assertEqual(
-            sorted(expected_input_names),
-            sorted(map(lambda x: x.name, spec.description.input)),
-        )
-        self.assertEqual(len(spec.description.output), len(expected_output_names))
-        self.assertEqual(
-            sorted(expected_output_names),
-            sorted(map(lambda x: x.name, spec.description.output)),
-        )
-
-    def test_updatable_model_flag_off(self):
-        """
-        Test to ensure that when respect_trainable is off, then we will ignore
-        any 'trainable' layers of the original network.
-        """
-        import coremltools
-        from keras.layers import Dense
-        from keras.losses import categorical_crossentropy
-        from keras.optimizers import SGD
-
-        input = ["data"]
-        output = ["output"]
-        # First, set respect_trainable to False and then check to make sure the
-        # converted model is NOT updatable.
-        not_updatable = Sequential()
-        not_updatable.add(Dense(128, input_shape=(16,)))
-        # layer is updatable, but the flag during convert is false, so that bit
-        # must get dropped on the floor.
-        not_updatable.add(Dense(10, name="foo", activation="softmax", trainable=True))
-        not_updatable.compile(
-            loss=categorical_crossentropy, optimizer=SGD(lr=0.01), metrics=["accuracy"]
-        )
-        cml = coremltools.converters.keras.convert(
-            not_updatable, input, output, respect_trainable=False
-        )
-        spec = cml.get_spec()
-        self.assertFalse(spec.isUpdatable)
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[1].innerProduct)
-        self.assertTrue(layers[1].innerProduct)
-        self.assertFalse(layers[1].isUpdatable)
-
-    def test_updatable_model_flag_cce_sgd(self):
-        """
-        Test to ensure that respect_trainable is honored during convert of a
-        model with categorical cross entropy loss and SGD optimizer.
-        """
-        import coremltools
-        from keras.layers import Dense
-        from keras.losses import categorical_crossentropy
-        from keras.optimizers import SGD
-
-        input = ["data"]
-        output = ["output"]
-
-        # This should result in an updatable model.
-        updatable = Sequential()
-        updatable.add(Dense(128, input_shape=(16,)))
-        updatable.add(Dense(10, name="foo", activation="softmax", trainable=True))
-        updatable.compile(
-            loss=categorical_crossentropy, optimizer=SGD(lr=1.0), metrics=["accuracy"]
-        )
-        cml = coremltools.converters.keras.convert(
-            updatable, input, output, respect_trainable=True
-        )
-        spec = cml.get_spec()
-        self.assertTrue(spec.isUpdatable)
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[1].innerProduct)
-        self.assertTrue(layers[1].innerProduct)
-        self.assertTrue(layers[1].isUpdatable)
-        self.assertEqual(len(spec.neuralNetwork.updateParams.lossLayers), 1)
-        sgdopt = spec.neuralNetwork.updateParams.optimizer.sgdOptimizer
-        self.assertEqual(sgdopt.learningRate.defaultValue, 1.0)
-        self.assertEqual(sgdopt.miniBatchSize.defaultValue, 16)
-        self.assertEqual(sgdopt.momentum.defaultValue, 0.0)
-
-    def test_updatable_model_flag_functional(self):
-        """
-        Test to ensure that respect_trainable is honored during convert of a
-        Keras model defined via the Keras functional API.
-        """
-        import coremltools
-        from keras.layers import Dense, Input
-        from keras.losses import categorical_crossentropy
-        from keras.optimizers import SGD
-
-        input = ["data"]
-        output = ["output"]
-
-        # This should result in an updatable model.
-        inputs = Input(shape=(16,))
-        d1 = Dense(128)(inputs)
-        d2 = Dense(10, name="foo", activation="softmax", trainable=True)(d1)
-        kmodel = Model(inputs=inputs, outputs=d2)
-        kmodel.compile(
-            loss=categorical_crossentropy, optimizer=SGD(lr=1.0), metrics=["accuracy"]
-        )
-        cml = coremltools.converters.keras.convert(
-            kmodel, input, output, respect_trainable=True
-        )
-        spec = cml.get_spec()
-        self.assertTrue(spec.isUpdatable)
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[1].innerProduct)
-        self.assertTrue(layers[1].innerProduct)
-        self.assertTrue(layers[1].isUpdatable)
-        self.assertEqual(len(spec.neuralNetwork.updateParams.lossLayers), 1)
-        sgdopt = spec.neuralNetwork.updateParams.optimizer.sgdOptimizer
-        self.assertEqual(sgdopt.learningRate.defaultValue, 1.0)
-        self.assertEqual(sgdopt.miniBatchSize.defaultValue, 16)
-        self.assertEqual(sgdopt.momentum.defaultValue, 0.0)
-
-    def test_updatable_model_flag_mse_adam(self):
-        """
-        Test to ensure that respect_trainable is honored during convert of a
-        model with mean squared error loss and the Adam optimizer.
-        """
-        import coremltools
-        from keras.layers import Dense
-        from keras.losses import mean_squared_error
-        from keras.optimizers import Adam
-
-        input = ["data"]
-        output = ["output"]
-
-        # Again, this should give an updatable model.
-        updatable = Sequential()
-        updatable.add(Dense(128, input_shape=(16,)))
-        updatable.add(Dense(10, name="foo", activation="softmax", trainable=True))
-        updatable.compile(
-            loss=mean_squared_error,
-            optimizer=Adam(lr=1.0, beta_1=0.5, beta_2=0.75, epsilon=0.25),
-            metrics=["accuracy"],
-        )
-        cml = coremltools.converters.keras.convert(
-            updatable, input, output, respect_trainable=True
-        )
-        spec = cml.get_spec()
-        self.assertTrue(spec.isUpdatable)
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[1].innerProduct)
-        self.assertTrue(layers[1].innerProduct)
-        self.assertTrue(layers[1].isUpdatable)
-        self.assertEqual(len(spec.neuralNetwork.updateParams.lossLayers), 1)
-        adopt = spec.neuralNetwork.updateParams.optimizer.adamOptimizer
-        self.assertEqual(adopt.learningRate.defaultValue, 1.0)
-        self.assertEqual(adopt.beta1.defaultValue, 0.5)
-        self.assertEqual(adopt.beta2.defaultValue, 0.75)
-        self.assertEqual(adopt.eps.defaultValue, 0.25)
-
-    def test_updatable_model_flag_no_loss_optimizer(self):
-        """
-        Tests the 'respect_trainable' flag on models that have not been
-        compiled, and thus do not have a loss function or optimizer.
-        """
-        import coremltools
-        from keras.layers import Dense
-
-        updatable = Sequential()
-        updatable.add(Dense(128, input_shape=(16,)))
-        updatable.add(Dense(10, name="foo", activation="softmax", trainable=True))
-        input = ["data"]
-        output = ["output"]
-        cml = coremltools.converters.keras.convert(
-            updatable, input, output, respect_trainable=True
-        )
-        spec = cml.get_spec()
-        self.assertTrue(spec.isUpdatable)
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[1].innerProduct)
-        self.assertTrue(layers[1].innerProduct)
-        self.assertTrue(layers[1].isUpdatable)
-
-    def test_updatable_model_flag_mse_string_adam(self):
-        """
-        Tests the 'respect_trainable' flag when used along with string
-        for the loss(here mse), conversion is successful
-        """
-        import coremltools
-        from keras.layers import Dense
-        from keras.optimizers import Adam
-
-        updatable = Sequential()
-        updatable.add(Dense(128, input_shape=(16,)))
-        updatable.add(Dense(10, name="foo", activation="relu", trainable=True))
-        updatable.compile(
-            loss="mean_squared_error",
-            optimizer=Adam(lr=1.0, beta_1=0.5, beta_2=0.75, epsilon=0.25),
-            metrics=["accuracy"],
-        )
-        input = ["data"]
-        output = ["output"]
-        cml = coremltools.converters.keras.convert(
-            updatable, input, output, respect_trainable=True
-        )
-        spec = cml.get_spec()
-        self.assertTrue(spec.isUpdatable)
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[1].innerProduct)
-        self.assertTrue(layers[1].innerProduct)
-        self.assertTrue(layers[1].isUpdatable)
-
-        self.assertEqual(len(spec.neuralNetwork.updateParams.lossLayers), 1)
-        # check that mean squared error input name and output name is set
-        # check length is non-zero for mse
-        self.assertTrue(
-            len(
-                spec.neuralNetwork.updateParams.lossLayers[
-                    0
-                ].meanSquaredErrorLossLayer.input
-            )
-        )
-        self.assertTrue(
-            len(
-                spec.neuralNetwork.updateParams.lossLayers[
-                    0
-                ].meanSquaredErrorLossLayer.target
-            )
-        )
-        # check length is 0 for cce
-        self.assertFalse(
-            len(
-                spec.neuralNetwork.updateParams.lossLayers[
-                    0
-                ].categoricalCrossEntropyLossLayer.input
-            )
-        )
-        self.assertFalse(
-            len(
-                spec.neuralNetwork.updateParams.lossLayers[
-                    0
-                ].categoricalCrossEntropyLossLayer.target
-            )
-        )
-
-        adopt = spec.neuralNetwork.updateParams.optimizer.adamOptimizer
-        # verify default values
-        self.assertEqual(adopt.learningRate.defaultValue, 1.0)
-        self.assertEqual(adopt.beta1.defaultValue, 0.5)
-        self.assertEqual(adopt.beta2.defaultValue, 0.75)
-        self.assertEqual(adopt.eps.defaultValue, 0.25)
-
-    def test_updatable_model_flag_cce_string_sgd(self):
-        """
-        Tests the 'respect_trainable' flag when used along with string
-        for the loss(here cce), conversion is successful
-        """
-        import coremltools
-        from keras.layers import Dense
-        from keras.optimizers import SGD
-
-        updatable = Sequential()
-        updatable.add(Dense(128, input_shape=(16,)))
-        updatable.add(Dense(10, name="foo", activation="softmax", trainable=True))
-        updatable.compile(
-            loss="categorical_crossentropy", optimizer=SGD(lr=1.0), metrics=["accuracy"]
-        )
-        input = ["data"]
-        output = ["output"]
-        cml = coremltools.converters.keras.convert(
-            updatable, input, output, respect_trainable=True
-        )
-        spec = cml.get_spec()
-        self.assertTrue(spec.isUpdatable)
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[1].innerProduct)
-        self.assertTrue(layers[1].innerProduct)
-        self.assertTrue(layers[1].isUpdatable)
-        self.assertEqual(len(spec.neuralNetwork.updateParams.lossLayers), 1)
-
-        # check that cce input name and output name is set
-        # check length is non-zero for cce
-        self.assertTrue(
-            len(
-                spec.neuralNetwork.updateParams.lossLayers[
-                    0
-                ].categoricalCrossEntropyLossLayer.input
-            )
-        )
-        self.assertTrue(
-            len(
-                spec.neuralNetwork.updateParams.lossLayers[
-                    0
-                ].categoricalCrossEntropyLossLayer.target
-            )
-        )
-        # check length is 0 for mse
-        self.assertFalse(
-            len(
-                spec.neuralNetwork.updateParams.lossLayers[
-                    0
-                ].meanSquaredErrorLossLayer.input
-            )
-        )
-        self.assertFalse(
-            len(
-                spec.neuralNetwork.updateParams.lossLayers[
-                    0
-                ].meanSquaredErrorLossLayer.target
-            )
-        )
-
-        sgdopt = spec.neuralNetwork.updateParams.optimizer.sgdOptimizer
-        self.assertEqual(sgdopt.learningRate.defaultValue, 1.0)
-        self.assertEqual(sgdopt.miniBatchSize.defaultValue, 16)
-        self.assertEqual(sgdopt.momentum.defaultValue, 0.0)
-
-    def test_updatable_model_flag_cce_sgd_string(self):
-        """
-        Tests the 'respect_trainable' flag when used along with string
-        for the optimizer(keras internally creates an instance, here sgd),
-        conversion is successful
-        """
-        import coremltools
-        from keras.layers import Dense, Input
-        from keras.losses import categorical_crossentropy
-
-        input = ["data"]
-        output = ["output"]
-
-        # This should result in an updatable model.
-        inputs = Input(shape=(16,))
-        d1 = Dense(128)(inputs)
-        d2 = Dense(10, name="foo", activation="softmax", trainable=True)(d1)
-        kmodel = Model(inputs=inputs, outputs=d2)
-        kmodel.compile(
-            loss=categorical_crossentropy, optimizer="sgd", metrics=["accuracy"]
-        )
-        cml = coremltools.converters.keras.convert(
-            kmodel, input, output, respect_trainable=True
-        )
-        spec = cml.get_spec()
-        self.assertTrue(spec.isUpdatable)
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[1].innerProduct)
-        self.assertTrue(layers[1].innerProduct)
-        self.assertTrue(layers[1].isUpdatable)
-        self.assertEqual(len(spec.neuralNetwork.updateParams.lossLayers), 1)
-        sgdopt = spec.neuralNetwork.updateParams.optimizer.sgdOptimizer
-        # use almost equal for default verification with at least 5 decimal
-        # places of closeness
-        self.assertAlmostEqual(sgdopt.learningRate.defaultValue, 0.01, places=5)
-        self.assertEqual(sgdopt.miniBatchSize.defaultValue, 16)
-        self.assertEqual(sgdopt.momentum.defaultValue, 0.0)
-
-    def test_updatable_model_flag_cce_adam_string(self):
-        """
-        Tests the 'respect_trainable' flag when used along with string
-        for the optimizer(keras internally creates an instance, here adam),
-        conversion is successful
-        """
-        import coremltools
-        from keras.layers import Dense, Input
-        from keras.losses import categorical_crossentropy
-
-        input = ["data"]
-        output = ["output"]
-
-        # This should result in an updatable model.
-        inputs = Input(shape=(16,))
-        d1 = Dense(128)(inputs)
-        d2 = Dense(10, name="foo", activation="softmax", trainable=True)(d1)
-        kmodel = Model(inputs=inputs, outputs=d2)
-        kmodel.compile(
-            loss=categorical_crossentropy, optimizer="adam", metrics=["accuracy"]
-        )
-        cml = coremltools.converters.keras.convert(
-            kmodel, input, output, respect_trainable=True
-        )
-        spec = cml.get_spec()
-        self.assertTrue(spec.isUpdatable)
-        layers = spec.neuralNetwork.layers
-        self.assertIsNotNone(layers[1].innerProduct)
-        self.assertTrue(layers[1].innerProduct)
-        self.assertTrue(layers[1].isUpdatable)
-        self.assertEqual(len(spec.neuralNetwork.updateParams.lossLayers), 1)
-        adopt = spec.neuralNetwork.updateParams.optimizer.adamOptimizer
-        # use almost equal for default verification with at least 5 decimal
-        # places of closeness
-        self.assertAlmostEqual(adopt.learningRate.defaultValue, 0.001, places=5)
-        self.assertAlmostEqual(adopt.miniBatchSize.defaultValue, 16)
-        self.assertAlmostEqual(adopt.beta1.defaultValue, 0.90, places=5)
-        self.assertAlmostEqual(adopt.beta2.defaultValue, 0.999, places=5)
diff --git a/coremltools/test/neural_network/test_keras2_numeric.py b/coremltools/test/neural_network/test_keras2_numeric.py
deleted file mode 100644
index b5c4f9610..000000000
--- a/coremltools/test/neural_network/test_keras2_numeric.py
+++ /dev/null
@@ -1,3458 +0,0 @@
-# Copyright (c) 2021, Apple Inc. All rights reserved.
-#
-# Use of this source code is governed by a BSD-3-clause license that can be
-# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-
-import itertools
-import os
-import shutil
-import tempfile
-import unittest
-
-import numpy as np
-import pytest
-
-from coremltools._deps import _HAS_KERAS2_TF
-from coremltools.models import _MLMODEL_FULL_PRECISION, _MLMODEL_HALF_PRECISION
-from coremltools.models.utils import _macos_version, _is_macos
-
-if _HAS_KERAS2_TF:
-    import keras.backend
-    from keras.models import Sequential, Model
-    from keras.layers import (
-        Dense,
-        Activation,
-        Conv2D,
-        Conv1D,
-        Flatten,
-        BatchNormalization,
-        Conv2DTranspose,
-        SeparableConv2D,
-    )
-    from keras.layers import (
-        MaxPooling2D,
-        AveragePooling2D,
-        GlobalAveragePooling2D,
-        GlobalMaxPooling2D,
-    )
-    from keras.layers import (
-        MaxPooling1D,
-        AveragePooling1D,
-        GlobalAveragePooling1D,
-        GlobalMaxPooling1D,
-    )
-    from keras.layers import Embedding, Input, Permute, Reshape, RepeatVector, Dropout
-    from keras.layers import Add, Concatenate
-    from keras.layers import add, multiply, concatenate, dot, maximum, average
-    from keras.layers import ZeroPadding2D, UpSampling2D, Cropping2D
-    from keras.layers import ZeroPadding1D, UpSampling1D, Cropping1D
-    from keras.layers import SimpleRNN, LSTM, GRU
-    from keras.layers.core import SpatialDropout2D
-    from keras.layers.wrappers import Bidirectional, TimeDistributed
-    from distutils.version import StrictVersion as _StrictVersion
-
-    if keras.__version__ >= _StrictVersion("2.2.1"):
-        from keras.layers import DepthwiseConv2D, ReLU
-    elif keras.__version__ >= _StrictVersion("2.2.0"):
-        from keras.layers import DepthwiseConv2D
-        from keras_applications.mobilenet import relu6
-    else:
-        from keras.applications.mobilenet import DepthwiseConv2D, relu6
-
-
-def _keras_transpose(x, is_sequence=False):
-    if len(x.shape) == 5:
-        # Keras input shape = [Batch, Seq, Height, Width, Channels]
-        x = np.transpose(x, [1, 0, 4, 2, 3])
-    if len(x.shape) == 4:
-        # Keras input shape = [Batch, Height, Width, Channels]
-        x = np.transpose(x, [0, 3, 1, 2])
-        return np.expand_dims(x, axis=0)
-    elif len(x.shape) == 3:
-        # Keras input shape = [Batch, (Sequence) Length, Channels]
-        return np.transpose(x, [1, 0, 2])
-    elif len(x.shape) == 2:
-        if is_sequence:  # (N,S) --> (S,N,1,)
-            return x.reshape(x.shape[::-1] + (1,))
-        else:  # (N,C) --> (N,C,1,1)
-            return x.reshape((1,) + x.shape)  # Dense
-    elif len(x.shape) == 1:
-        if is_sequence:  # (S) --> (S,N,1,1,1)
-            return x.reshape((x.shape[0], 1, 1))
-        else:
-            return x
-    else:
-        return x
-
-
-def _get_coreml_model(
-    model,
-    input_names=["data"],
-    output_names=["output"],
-    input_name_shape_dict={},
-    model_precision=_MLMODEL_FULL_PRECISION,
-    use_float_arraytype=False,
-):
-    """
-    Get the coreml model from the Keras model.
-    """
-    # Convert the model
-    from coremltools.converters import keras as keras_converter
-
-    model = keras_converter.convert(
-        model,
-        input_names,
-        output_names,
-        input_name_shape_dict=input_name_shape_dict,
-        model_precision=model_precision,
-        use_float_arraytype=use_float_arraytype,
-    )
-    return model
-
-
-def _generate_data(input_shape, mode="random"):
-    """
-    Generate some random data according to a shape.
-    """
-    if mode == "zeros":
-        X = np.zeros(input_shape)
-    elif mode == "ones":
-        X = np.ones(input_shape)
-    elif mode == "linear":
-        X = np.array(range(np.product(input_shape))).reshape(input_shape)
-    elif mode == "random":
-        X = np.random.rand(*input_shape)
-    elif mode == "random_zero_mean":
-        X = np.random.rand(*input_shape) - 0.5
-    return X
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-class KerasNumericCorrectnessTest(unittest.TestCase):
-    """
-    Unit test class for testing the Keras converter.
-    """
-
-    def runTest(self):
-        pass
-
-    def _get_coreml_model_params_and_test_input(
-        self, model, mode, one_dim_seq_flags, input_name_shape_dict={}
-    ):
-        # Generate data
-        nb_inputs = len(model.inputs)
-        if nb_inputs > 1:
-            input_names = []
-            input_data = []
-            coreml_input = {}
-            for i in range(nb_inputs):
-                feature_name = "data_%s" % i
-                input_names.append(feature_name)
-                if feature_name in input_name_shape_dict:
-                    input_shape = [
-                        1 if a is None else a
-                        for a in input_name_shape_dict[feature_name]
-                    ]
-                else:
-                    input_shape = [1 if a is None else a for a in model.input_shape[i]]
-                X = _generate_data(input_shape, mode)
-                input_data.append(X)
-                if one_dim_seq_flags is None:
-                    coreml_input[feature_name] = _keras_transpose(X).astype("f").copy()
-                else:
-                    coreml_input[feature_name] = (
-                        _keras_transpose(X, one_dim_seq_flags[i]).astype("f").copy()
-                    )
-        else:
-            input_names = ["data"]
-            if "data" in input_name_shape_dict:
-                input_shape = [
-                    1 if a is None else a for a in input_name_shape_dict["data"]
-                ]
-            else:
-                input_shape = [1 if a is None else a for a in model.input_shape]
-
-            input_data = _generate_data(input_shape, mode)
-            if one_dim_seq_flags is None:
-                coreml_input = {"data": _keras_transpose(input_data).astype("f").copy()}
-            else:
-                coreml_input = {
-                    "data": _keras_transpose(input_data, one_dim_seq_flags[0])
-                    .astype("f")
-                    .copy()
-                }
-
-        output_names = ["output" + str(i) for i in range(len(model.outputs))]
-        return input_names, output_names, input_data, coreml_input
-
-    def _test_model(
-        self,
-        model,
-        input_name_shape_dict={},
-        num_samples=1,
-        mode="random",
-        delta=1e-2,
-        model_dir=None,
-        transpose_keras_result=True,
-        one_dim_seq_flags=None,
-        model_precision=_MLMODEL_FULL_PRECISION,
-    ):
-
-        # transpose_keras_result: if true, compare the transposed Keras result
-        # one_dim_seq_flags: a list of same length as the number of inputs in
-        # the model; if None, treat all 1D input (if any) as non-sequence
-        # if one_dim_seq_flags[i] is True, it means the ith input, with shape
-        # (X,) is in fact a sequence of length X.
-
-        # Get the CoreML model
-        use_tmp_folder = False
-        if model_dir is None:
-            use_tmp_folder = True
-            model_dir = tempfile.mkdtemp()
-
-        (
-            input_names,
-            output_names,
-            input_data,
-            coreml_input,
-        ) = self._get_coreml_model_params_and_test_input(
-            model, mode, one_dim_seq_flags, input_name_shape_dict
-        )
-
-        coreml_model = _get_coreml_model(
-            model,
-            input_names,
-            output_names,
-            input_name_shape_dict,
-            model_precision=model_precision,
-        )
-        try:
-            if not (_is_macos() and _macos_version() >= (10, 13)):
-                return
-
-            # Assuming coreml model output names are in the same order as
-            # Keras output list, put predictions into a list, sorted by output
-            # name
-            coreml_preds = coreml_model.predict(coreml_input)
-            c_preds = [coreml_preds[name] for name in output_names]
-
-            # Get Keras predictions
-            keras_preds = model.predict(input_data)
-            k_preds = keras_preds if type(keras_preds) is list else [keras_preds]
-
-            # Compare each output blob
-            for idx, k_pred in enumerate(k_preds):
-                if transpose_keras_result:
-                    kp = _keras_transpose(k_pred).flatten()
-                else:
-                    kp = k_pred.flatten()
-                cp = c_preds[idx].flatten()
-                # Compare predictions
-                self.assertEqual(len(kp), len(cp))
-                for i in range(len(kp)):
-                    max_den = max(1.0, kp[i], cp[i])
-                    self.assertAlmostEqual(
-                        kp[i] / max_den, cp[i] / max_den, delta=delta
-                    )
-        finally:
-            # Cleanup files - models on disk no longer useful
-            if use_tmp_folder and os.path.exists(model_dir):
-                shutil.rmtree(model_dir)
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-class KerasBasicNumericCorrectnessTest(KerasNumericCorrectnessTest):
-    def test_tiny_inner_product(self, model_precision=_MLMODEL_FULL_PRECISION):
-        np.random.seed(1988)
-
-        # Define a model
-        model = Sequential()
-        model.add(Dense(2, input_shape=(2,)))
-
-        # Test all zeros
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        self._test_model(model, mode="zeros", model_precision=model_precision)
-
-        # Test all ones
-        model.set_weights([np.ones(w.shape) for w in model.get_weights()])
-        self._test_model(model, mode="ones", model_precision=model_precision)
-
-        # Test random
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        self._test_model(model, model_precision=model_precision)
-
-    def test_tiny_inner_product_half_precision(self):
-        self.test_tiny_inner_product(model_precision=_MLMODEL_HALF_PRECISION)
-
-    def test_inner_product_random(self, model_precision=_MLMODEL_FULL_PRECISION):
-        np.random.seed(1988)
-
-        # Define a model
-        model = Sequential()
-        model.add(Dense(1000, input_shape=(100,)))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_inner_product_half_precision_random(self):
-        self.test_inner_product_random(model_precision=_MLMODEL_HALF_PRECISION)
-
-    def test_dense_softmax(self):
-        np.random.seed(1988)
-
-        # Define a model
-        model = Sequential()
-        model.add(Dense(32, input_shape=(32,), activation="softmax"))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_dense_elu(self):
-        np.random.seed(1988)
-
-        # Define a model
-        model = Sequential()
-        model.add(Dense(32, input_shape=(32,), activation="elu"))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_dense_selu(self):
-        np.random.seed(1988)
-
-        # Define a model
-        model = Sequential()
-        model.add(Dense(32, input_shape=(32,), activation="selu"))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_housenet_random(self):
-        np.random.seed(1988)
-        num_hidden = 2
-        num_features = 3
-
-        # Define a model
-        model = Sequential()
-        model.add(Dense(num_hidden, input_dim=num_features))
-        model.add(Activation("relu"))
-        model.add(Dense(1, input_dim=num_features))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_conv_ones(self, model_precision=_MLMODEL_FULL_PRECISION):
-        np.random.seed(1988)
-        input_dim = 10
-        input_shape = (input_dim, input_dim, 1)
-        num_kernels, kernel_height, kernel_width = 3, 5, 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=input_shape,
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.ones(w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_tiny_conv_ones_half_precision(self):
-        self.test_tiny_conv_ones(model_precision=_MLMODEL_HALF_PRECISION)
-
-    def test_tiny_conv_random(self, model_precision=_MLMODEL_FULL_PRECISION):
-        np.random.seed(1988)
-        input_dim = 10
-        input_shape = (input_dim, input_dim, 1)
-        num_kernels, kernel_height, kernel_width = 3, 5, 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=input_shape,
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model, model_precision=model_precision)
-
-    @unittest.skipUnless(
-        _is_macos() and _macos_version() >= (10, 14), "Only supported on MacOS 10.14+"
-    )
-    def test_tiny_conv_random_input_shape_dict(
-        self, model_precision=_MLMODEL_FULL_PRECISION
-    ):
-        np.random.seed(1988)
-        H, W, C = 10, 20, 5
-        input_shape = (None, H, W, C)
-        num_kernels, kernel_height, kernel_width = 3, 5, 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=(None, None, C),
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(
-            model,
-            input_name_shape_dict={"data": input_shape},
-            model_precision=model_precision,
-        )
-
-    def test_tiny_conv_random_half_precision(self):
-        self.test_tiny_conv_random(model_precision=_MLMODEL_HALF_PRECISION)
-
-    def test_tiny_conv_dilated(self, model_precision=_MLMODEL_FULL_PRECISION):
-        np.random.seed(1988)
-        input_dim = 10
-        input_shape = (input_dim, input_dim, 1)
-        num_kernels, kernel_height, kernel_width = 3, 5, 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=input_shape,
-                dilation_rate=(2, 2),
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_tiny_conv_dilated_half_precision(self):
-        return self.test_tiny_conv_dilated(model_precision=_MLMODEL_HALF_PRECISION)
-
-    def test_tiny_conv_dilated_rect_random(
-        self, model_precision=_MLMODEL_FULL_PRECISION
-    ):
-        np.random.seed(1988)
-        input_shape = (32, 20, 3)
-        num_kernels = 2
-        kernel_height = 3
-        kernel_width = 3
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=input_shape,
-                dilation_rate=(2, 2),
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_tiny_conv_dilated_rect_random_half_precision(self):
-        return self.test_tiny_conv_dilated_rect_random(
-            model_precision=_MLMODEL_HALF_PRECISION
-        )
-
-    def test_tiny_conv_pseudo_1d_x(self, model_precision=_MLMODEL_FULL_PRECISION):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 5
-        filter_length = 1  # 3
-        nb_filters = 1
-        # Define a model
-        model = Sequential()
-        model.add(
-            Conv2D(
-                nb_filters,
-                kernel_size=(1, filter_length),
-                input_shape=(1, input_length, input_dim),
-                padding="valid",
-            )
-        )
-        # Set some random weights
-        model.set_weights([np.ones(w.shape) for w in model.get_weights()])
-        self._test_model(model, mode="linear", model_precision=model_precision)
-
-    def test_tiny_conv_pseudo_1d_x_half_precision(self):
-        return self.test_tiny_conv_pseudo_1d_x(model_precision=_MLMODEL_HALF_PRECISION)
-
-    def test_tiny_conv1d_same_random(self):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 10
-        filter_length = 3
-        nb_filters = 4
-        model = Sequential()
-        model.add(
-            Conv1D(
-                nb_filters,
-                kernel_size=filter_length,
-                padding="same",
-                input_shape=(input_length, input_dim),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_conv1d_same_random_input_shape_dict(self):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 10
-        filter_length = 3
-        nb_filters = 4
-        model = Sequential()
-        model.add(
-            Conv1D(
-                nb_filters,
-                kernel_size=filter_length,
-                padding="same",
-                input_shape=(None, input_dim),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(
-            model, input_name_shape_dict={"data": (None, input_length, input_dim)}
-        )
-
-    def test_large_input_length_conv1d_same_random(
-        self, model_precision=_MLMODEL_FULL_PRECISION
-    ):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 80
-        filter_length = 3
-        nb_filters = 4
-        model = Sequential()
-        model.add(
-            Conv1D(
-                nb_filters,
-                kernel_size=filter_length,
-                padding="same",
-                input_shape=(input_length, input_dim),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_large_input_length_conv1d_same_random_half_precision(self):
-        return self.test_large_input_length_conv1d_same_random(
-            model_precision=_MLMODEL_HALF_PRECISION
-        )
-
-    def test_tiny_conv1d_valid_random(self):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 10
-        filter_length = 3
-        nb_filters = 4
-        model = Sequential()
-        model.add(
-            Conv1D(
-                nb_filters,
-                kernel_size=filter_length,
-                padding="valid",
-                input_shape=(input_length, input_dim),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_conv1d_dilated_random(self):
-        np.random.seed(1988)
-        input_shape = (20, 1)
-        num_kernels = 2
-        filter_length = 3
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Conv1D(
-                num_kernels,
-                kernel_size=filter_length,
-                padding="valid",
-                input_shape=input_shape,
-                dilation_rate=3,
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_conv_rect_kernel_x(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_shape = (input_dim, input_dim, 1)
-        num_kernels = 3
-        kernel_height = 1
-        kernel_width = 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=input_shape,
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-                padding="same",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_conv_rect_kernel_y(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_shape = (input_dim, input_dim, 1)
-        num_kernels = 3
-        kernel_height = 5
-        kernel_width = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=input_shape,
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-                padding="valid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_conv_rect_kernel_xy(self, model_precision=_MLMODEL_FULL_PRECISION):
-        np.random.seed(1988)
-        input_dim = 10
-        input_shape = (input_dim, input_dim, 1)
-        num_kernels = 3
-        kernel_height = 5
-        kernel_width = 3
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=input_shape,
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-                padding="valid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_tiny_conv_rect_kernel_xy_half_precision(self):
-        self.test_tiny_conv_rect_kernel_xy(model_precision=_MLMODEL_HALF_PRECISION)
-
-    def test_flatten(self):
-        model = Sequential()
-        model.add(Flatten(input_shape=(2, 2, 2)))
-        self._test_model(model, mode="linear")
-
-    def test_conv_dense(self, model_precision=_MLMODEL_FULL_PRECISION):
-        input_shape = (48, 48, 3)
-        model = Sequential()
-        model.add(Conv2D(32, (3, 3), activation="relu", input_shape=input_shape))
-        model.add(Flatten())
-        model.add(Dense(10, activation="softmax"))
-
-        # Get the coreml model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_conv_dense_half_precision(self):
-        return self.test_conv_dense(model_precision=_MLMODEL_HALF_PRECISION)
-
-    def test_conv_batchnorm_random(self, model_precision=_MLMODEL_FULL_PRECISION):
-        np.random.seed(1988)
-        input_dim = 10
-        input_shape = (input_dim, input_dim, 3)
-        num_kernels = 3
-        kernel_height = 5
-        kernel_width = 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=input_shape,
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-            )
-        )
-        model.add(BatchNormalization(epsilon=1e-5))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_conv_batchnorm_random_half_precision(self):
-        return self.test_conv_batchnorm_random(model_precision=_MLMODEL_HALF_PRECISION)
-
-    def test_conv_batchnorm_no_gamma_no_beta(
-        self, model_precision=_MLMODEL_FULL_PRECISION
-    ):
-        np.random.seed(1988)
-        input_dim = 10
-        input_shape = (input_dim, input_dim, 3)
-        num_kernels = 3
-        kernel_height = 5
-        kernel_width = 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=input_shape,
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-            )
-        )
-        model.add(BatchNormalization(center=False, scale=False, epsilon=1e-5))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_conv_batchnorm_no_gamma_no_beta_half_precision(self):
-        return self.test_conv_batchnorm_no_gamma_no_beta(
-            model_precision=_MLMODEL_HALF_PRECISION
-        )
-
-    def test_tiny_deconv_random(self):
-        # In Keras 2, deconvolution auto computes the output shape.
-        np.random.seed(1988)
-        input_dim = 13
-        input_shape = (input_dim, input_dim, 5)
-        num_kernels = 16
-        kernel_height = 3
-        kernel_width = 3
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Conv2DTranspose(
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-                input_shape=input_shape,
-                padding="valid",
-                use_bias=False,
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_deconv_random_same_padding(self):
-        np.random.seed(1988)
-        input_dim = 14
-        input_shape = (input_dim, input_dim, 3)
-        num_kernels = 16
-        kernel_height = 3
-        kernel_width = 3
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Conv2DTranspose(
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-                input_shape=input_shape,
-                padding="same",
-                strides=(2, 2),
-                use_bias=True,
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_depthwise_conv_same_pad(self):
-        np.random.seed(1988)
-        input_dim = 16
-        input_shape = (input_dim, input_dim, 3)
-        depth_multiplier = 1
-        kernel_height = 3
-        kernel_width = 3
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            DepthwiseConv2D(
-                depth_multiplier=depth_multiplier,
-                kernel_size=(kernel_height, kernel_width),
-                input_shape=input_shape,
-                padding="same",
-                strides=(1, 1),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_depthwise_conv_valid_pad(self):
-        np.random.seed(1988)
-        input_dim = 16
-        input_shape = (input_dim, input_dim, 3)
-        depth_multiplier = 1
-        kernel_height = 3
-        kernel_width = 3
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            DepthwiseConv2D(
-                depth_multiplier=depth_multiplier,
-                kernel_size=(kernel_height, kernel_width),
-                input_shape=input_shape,
-                padding="valid",
-                strides=(1, 1),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_depthwise_conv_same_pad_depth_multiplier(self):
-        np.random.seed(1988)
-        input_dim = 16
-        input_shape = (input_dim, input_dim, 3)
-        depth_multiplier = 4
-        kernel_height = 3
-        kernel_width = 3
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            DepthwiseConv2D(
-                depth_multiplier=depth_multiplier,
-                kernel_size=(kernel_height, kernel_width),
-                input_shape=input_shape,
-                padding="same",
-                strides=(1, 1),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_depthwise_conv_valid_pad_depth_multiplier(self):
-        np.random.seed(1988)
-        input_dim = 16
-        input_shape = (input_dim, input_dim, 3)
-        depth_multiplier = 2
-        kernel_height = 3
-        kernel_width = 3
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            DepthwiseConv2D(
-                depth_multiplier=depth_multiplier,
-                kernel_size=(kernel_height, kernel_width),
-                input_shape=input_shape,
-                padding="valid",
-                strides=(1, 1),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_separable_conv_valid(self):
-        np.random.seed(1988)
-        input_dim = 16
-        input_shape = (input_dim, input_dim, 3)
-        depth_multiplier = 1
-        kernel_height = 3
-        kernel_width = 3
-        num_kernels = 4
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            SeparableConv2D(
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-                padding="valid",
-                strides=(1, 1),
-                depth_multiplier=depth_multiplier,
-                input_shape=input_shape,
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_separable_conv_same_fancy(self):
-        np.random.seed(1988)
-        input_dim = 16
-        input_shape = (input_dim, input_dim, 3)
-        depth_multiplier = 1
-        kernel_height = 3
-        kernel_width = 3
-        num_kernels = 4
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            SeparableConv2D(
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-                padding="same",
-                strides=(2, 2),
-                activation="relu",
-                depth_multiplier=depth_multiplier,
-                input_shape=input_shape,
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_separable_conv_valid_depth_multiplier(self):
-        np.random.seed(1988)
-        input_dim = 16
-        input_shape = (input_dim, input_dim, 3)
-        depth_multiplier = 5
-        kernel_height = 3
-        kernel_width = 3
-        num_kernels = 40
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            SeparableConv2D(
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-                padding="valid",
-                strides=(1, 1),
-                depth_multiplier=depth_multiplier,
-                input_shape=input_shape,
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_separable_conv_same_fancy_depth_multiplier(
-        self, model_precision=_MLMODEL_FULL_PRECISION
-    ):
-
-        np.random.seed(1988)
-        input_dim = 16
-        input_shape = (input_dim, input_dim, 3)
-        depth_multiplier = 2
-        kernel_height = 3
-        kernel_width = 3
-        num_kernels = 40
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            SeparableConv2D(
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-                padding="same",
-                strides=(2, 2),
-                activation="relu",
-                depth_multiplier=depth_multiplier,
-                input_shape=input_shape,
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_tiny_separable_conv_same_fancy_depth_multiplier_half_precision(self):
-        return self.test_tiny_separable_conv_same_fancy_depth_multiplier(
-            model_precision=_MLMODEL_HALF_PRECISION
-        )
-
-    def test_tiny_separable_conv_dilated(self, model_precision=_MLMODEL_FULL_PRECISION):
-        np.random.seed(1988)
-        input_dim = 10
-        input_shape = (input_dim, input_dim, 1)
-        num_kernels, kernel_height, kernel_width = 3, 5, 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            SeparableConv2D(
-                input_shape=input_shape,
-                dilation_rate=(2, 2),
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_tiny_separable_conv_dilated_half_precision(self):
-        return self.test_tiny_separable_conv_dilated(
-            model_precision=_MLMODEL_HALF_PRECISION
-        )
-
-    def test_tiny_separable_conv_dilated_rect_random(
-        self, model_precision=_MLMODEL_FULL_PRECISION
-    ):
-        np.random.seed(1988)
-        input_shape = (32, 20, 3)
-        num_kernels = 2
-        kernel_height = 3
-        kernel_width = 3
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            SeparableConv2D(
-                input_shape=input_shape,
-                dilation_rate=(2, 2),
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_tiny_separable_conv_dilated_rect_random_half_precision(self):
-        return self.test_tiny_separable_conv_dilated_rect_random(
-            model_precision=_MLMODEL_HALF_PRECISION
-        )
-
-    def test_max_pooling_no_overlap(self):
-        # no_overlap: pool_size = strides
-        model = Sequential()
-        model.add(
-            MaxPooling2D(
-                input_shape=(16, 16, 3), pool_size=(2, 2), strides=None, padding="valid"
-            )
-        )
-        self._test_model(model)
-
-    def test_max_pooling_overlap_multiple(self):
-        # input shape is multiple of pool_size, strides != pool_size
-        model = Sequential()
-        model.add(
-            MaxPooling2D(
-                input_shape=(18, 18, 3),
-                pool_size=(3, 3),
-                strides=(2, 2),
-                padding="valid",
-            )
-        )
-        self._test_model(model)
-
-    def test_max_pooling_overlap_odd(self):
-        model = Sequential()
-        model.add(
-            MaxPooling2D(
-                input_shape=(16, 16, 3),
-                pool_size=(3, 3),
-                strides=(2, 2),
-                padding="valid",
-            )
-        )
-        self._test_model(model)
-
-    def test_max_pooling_overlap_same(self):
-        model = Sequential()
-        model.add(
-            MaxPooling2D(
-                input_shape=(16, 16, 3),
-                pool_size=(3, 3),
-                strides=(2, 2),
-                padding="same",
-            )
-        )
-        self._test_model(model)
-
-    def test_global_max_pooling(self):
-        model = Sequential()
-        model.add(GlobalMaxPooling2D(input_shape=(16, 16, 3)))
-        self._test_model(model)
-
-    def test_average_pooling_no_overlap(self):
-        # no_overlap: pool_size = strides
-        model = Sequential()
-        model.add(
-            AveragePooling2D(
-                input_shape=(16, 16, 3), pool_size=(2, 2), strides=None, padding="valid"
-            )
-        )
-        self._test_model(model, delta=1e-2)
-
-    def test_average_pooling_inception_config_1(self):
-        # no_overlap: pool_size = strides
-        model = Sequential()
-        model.add(
-            AveragePooling2D(
-                input_shape=(16, 16, 3),
-                pool_size=(3, 3),
-                strides=(1, 1),
-                padding="same",
-            )
-        )
-        self._test_model(model, delta=1e-2)
-
-    def test_global_average_pooling(self):
-        model = Sequential()
-        model.add(GlobalAveragePooling2D(input_shape=(16, 16, 3)))
-        self._test_model(model)
-
-    def test_max_pooling_1d(self):
-        model = Sequential()
-        model.add(MaxPooling1D(input_shape=(16, 3), pool_size=4))
-        self._test_model(model)
-
-    def test_global_max_pooling_1d(self):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 10
-        filter_length = 3
-        nb_filters = 4
-        model = Sequential()
-        model.add(
-            Conv1D(
-                nb_filters,
-                kernel_size=filter_length,
-                padding="same",
-                input_shape=(input_length, input_dim),
-            )
-        )
-        model.add(GlobalMaxPooling1D())
-        self._test_model(model)
-
-    def test_average_pooling_1d(self):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 10
-        filter_length = 3
-        nb_filters = 4
-        model = Sequential()
-        model.add(
-            Conv1D(
-                nb_filters,
-                kernel_size=filter_length,
-                padding="same",
-                input_shape=(input_length, input_dim),
-            )
-        )
-        model.add(AveragePooling1D(pool_size=2))
-        self._test_model(model)
-
-    def test_global_average_pooling_1d(self):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 10
-        filter_length = 3
-        nb_filters = 4
-        model = Sequential()
-        model.add(
-            Conv1D(
-                nb_filters,
-                kernel_size=filter_length,
-                padding="same",
-                input_shape=(input_length, input_dim),
-            )
-        )
-        model.add(GlobalAveragePooling1D())
-        self._test_model(model)
-
-    def test_tiny_conv_upsample_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_shape = (input_dim, input_dim, 1)
-        num_kernels = 3
-        kernel_height = 5
-        kernel_width = 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=input_shape,
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-            )
-        )
-        model.add(UpSampling2D(size=2))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_conv_upsample_1d_random(self):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 10
-        filter_length = 3
-        nb_filters = 4
-        model = Sequential()
-        model.add(
-            Conv1D(
-                nb_filters,
-                kernel_size=filter_length,
-                padding="same",
-                input_shape=(input_length, input_dim),
-            )
-        )
-        model.add(UpSampling1D(size=2))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_conv_crop_1d_random(self, model_precision=_MLMODEL_FULL_PRECISION):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 10
-        filter_length = 3
-        nb_filters = 4
-        model = Sequential()
-        model.add(
-            Conv1D(
-                nb_filters,
-                kernel_size=filter_length,
-                padding="same",
-                input_shape=(input_length, input_dim),
-            )
-        )
-        model.add(Cropping1D(cropping=2))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_tiny_conv_crop_1d_random_half_precision(self):
-        return self.test_tiny_conv_crop_1d_random(
-            model_precision=_MLMODEL_HALF_PRECISION
-        )
-
-    def test_tiny_conv_pad_1d_random(self, model_precision=_MLMODEL_FULL_PRECISION):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 10
-        filter_length = 3
-        nb_filters = 4
-        model = Sequential()
-        model.add(
-            Conv1D(
-                nb_filters,
-                kernel_size=filter_length,
-                padding="same",
-                input_shape=(input_length, input_dim),
-            )
-        )
-        model.add(ZeroPadding1D(padding=2))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_tiny_conv_pad_1d_random_half_precision(self):
-        return self.test_tiny_conv_pad_1d_random(
-            model_precision=_MLMODEL_HALF_PRECISION
-        )
-
-    def test_tiny_conv_causal_1d(self):
-        np.random.seed(1988)
-        model = Sequential()
-        model.add(Conv1D(1, 3, input_shape=(10, 1), use_bias=False, padding="causal"))
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        self._test_model(model)
-
-    def test_embedding(self, model_precision=_MLMODEL_FULL_PRECISION):
-        model = Sequential()
-        num_inputs = 10
-        num_outputs = 3
-        model.add(Embedding(num_inputs, num_outputs))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        self._test_model(model, model_precision=model_precision)
-
-    def test_embedding_half_precision(self):
-        return self.test_embedding(model_precision=_MLMODEL_HALF_PRECISION)
-
-    def test_embedding_seq(self, model_precision=_MLMODEL_FULL_PRECISION):
-        model = Sequential()
-        num_inputs = 10
-        num_outputs = 3
-        model.add(Embedding(num_inputs, num_outputs, input_length=7))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        self._test_model(
-            model, one_dim_seq_flags=[True], model_precision=model_precision
-        )
-
-    def test_embedding_seq_half_precision(self):
-        return self.test_embedding_seq(model_precision=_MLMODEL_HALF_PRECISION)
-
-    def test_tiny_no_sequence_simple_rnn_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_length = 1
-        num_channels = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(SimpleRNN(num_channels, input_shape=(input_length, input_dim)))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_sequence_simple_rnn_random(self):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 4
-        num_channels = 3
-
-        # Define a model
-        model = Sequential()
-        model.add(SimpleRNN(num_channels, input_shape=(input_length, input_dim)))
-
-        # Set some random weights
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_seq2seq_rnn_random(self):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 4
-        num_channels = 3
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            SimpleRNN(
-                num_channels,
-                input_shape=(input_length, input_dim),
-                return_sequences=True,
-            )
-        )
-
-        # Set some random weights
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_rnn_seq(self):
-        np.random.seed(1988)
-        input_dim = 11
-        input_length = 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            SimpleRNN(20, input_shape=(input_length, input_dim), return_sequences=False)
-        )
-
-        # Set some random weights
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_rnn_seq_backwards(self):
-        np.random.seed(1988)
-        input_dim = 11
-        input_length = 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            SimpleRNN(
-                20,
-                input_shape=(input_length, input_dim),
-                return_sequences=False,
-                go_backwards=True,
-            )
-        )
-
-        # Set some random weights
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_medium_no_sequence_simple_rnn_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_length = 1
-        num_channels = 10
-
-        # Define a model
-        model = Sequential()
-        model.add(SimpleRNN(num_channels, input_shape=(input_length, input_dim)))
-
-        # Set some random weights
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_no_sequence_lstm_zeros(self):
-        np.random.seed(1988)
-        input_dim = 1
-        input_length = 1
-        num_channels = 1
-
-        model = Sequential()
-        model.add(
-            LSTM(
-                num_channels,
-                input_shape=(input_length, input_dim),
-                implementation=1,
-                recurrent_activation="sigmoid",
-            )
-        )
-
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-        self._test_model(model, mode="zeros")
-
-    def test_tiny_no_sequence_lstm_ones(self):
-        np.random.seed(1988)
-        input_dim = 1
-        input_length = 1
-        num_channels = 1
-
-        model = Sequential()
-        model.add(
-            LSTM(
-                num_channels,
-                input_shape=(input_length, input_dim),
-                implementation=1,
-                recurrent_activation="sigmoid",
-            )
-        )
-
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-        self._test_model(model, mode="ones")
-
-    def test_small_no_sequence_lstm_zeros(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_length = 1
-        num_channels = 1
-
-        model = Sequential()
-        model.add(
-            LSTM(
-                num_channels,
-                input_shape=(input_length, input_dim),
-                implementation=2,
-                recurrent_activation="sigmoid",
-            )
-        )
-
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-        self._test_model(model, mode="zeros")
-
-    def test_small_no_sequence_lstm_ones(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_length = 1
-        num_channels = 1
-
-        model = Sequential()
-        model.add(
-            LSTM(
-                num_channels,
-                input_shape=(input_length, input_dim),
-                implementation=2,
-                recurrent_activation="sigmoid",
-            )
-        )
-
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-        self._test_model(model, mode="ones")
-
-    def test_lstm_seq(self):
-        np.random.seed(1988)
-        input_dim = 11
-        input_length = 5
-
-        model = Sequential()
-        model.add(
-            LSTM(20, input_shape=(input_length, input_dim), return_sequences=False)
-        )
-
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-        self._test_model(model)
-
-    def test_lstm_seq_backwards(self):
-        np.random.seed(1988)
-        input_dim = 11
-        input_length = 5
-
-        model = Sequential()
-        model.add(
-            LSTM(
-                20,
-                input_shape=(input_length, input_dim),
-                return_sequences=False,
-                go_backwards=True,
-            )
-        )
-
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-        self._test_model(model)
-
-    def test_medium_no_sequence_lstm_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_length = 1
-        num_channels = 10
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            LSTM(
-                num_channels,
-                input_shape=(input_length, input_dim),
-                recurrent_activation="sigmoid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_no_sequence_lstm_zeros_gpu(self):
-        np.random.seed(1988)
-        input_dim = 1
-        input_length = 1
-        num_channels = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            LSTM(
-                num_channels,
-                input_shape=(input_length, input_dim),
-                implementation=2,
-                recurrent_activation="sigmoid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_model(model, mode="zeros")
-
-    def test_small_no_sequence_lstm_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_length = 1
-        num_channels = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            LSTM(
-                num_channels,
-                input_shape=(input_length, input_dim),
-                implementation=2,
-                recurrent_activation="sigmoid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_tiny_no_sequence_gru_random(self, model_precision=_MLMODEL_FULL_PRECISION):
-        np.random.seed(1988)
-        input_dim = 1
-        input_length = 1
-        num_channels = 1
-        num_samples = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            GRU(
-                num_channels,
-                input_shape=(input_length, input_dim),
-                recurrent_activation="sigmoid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_tiny_no_sequence_gru_random_half_precision(self):
-        return self.test_tiny_no_sequence_gru_random(
-            model_precision=_MLMODEL_HALF_PRECISION
-        )
-
-    def test_small_no_sequence_gru_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_length = 1
-        num_channels = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            GRU(
-                num_channels,
-                input_shape=(input_length, input_dim),
-                recurrent_activation="sigmoid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_medium_no_sequence_gru_random(
-        self, model_precision=_MLMODEL_FULL_PRECISION
-    ):
-        np.random.seed(1988)
-        input_dim = 10
-        input_length = 1
-        num_channels = 10
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            GRU(
-                num_channels,
-                input_shape=(input_length, input_dim),
-                recurrent_activation="sigmoid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_medium_no_sequence_gru_random_half_precision(self):
-        return self.test_medium_no_sequence_gru_random(
-            model_precision=_MLMODEL_HALF_PRECISION
-        )
-
-    def test_gru_seq(self):
-        np.random.seed(1988)
-        input_dim = 11
-        input_length = 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            GRU(20, input_shape=(input_length, input_dim), return_sequences=False)
-        )
-
-        # Set some random weights
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_gru_seq_backwards(self, model_precision=_MLMODEL_FULL_PRECISION):
-        np.random.seed(1988)
-        input_dim = 11
-        input_length = 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            GRU(
-                20,
-                input_shape=(input_length, input_dim),
-                return_sequences=False,
-                go_backwards=True,
-            )
-        )
-
-        # Set some random weights
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_gru_seq_backwards_half_precision(self):
-        return self.test_gru_seq_backwards(model_precision=_MLMODEL_HALF_PRECISION)
-
-    def test_tiny_no_sequence_bidir_random(
-        self, model_precision=_MLMODEL_FULL_PRECISION
-    ):
-        np.random.seed(1988)
-        input_dim = 1
-        input_length = 1
-        num_channels = 1
-        num_samples = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Bidirectional(
-                LSTM(num_channels, implementation=1, recurrent_activation="sigmoid"),
-                input_shape=(input_length, input_dim),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_tiny_no_sequence_bidir_random_half_precision(self):
-        return self.test_tiny_no_sequence_bidir_random(
-            model_precision=_MLMODEL_HALF_PRECISION
-        )
-
-    def test_tiny_no_sequence_bidir_random_gpu(
-        self, model_precision=_MLMODEL_FULL_PRECISION
-    ):
-        np.random.seed(1988)
-        input_dim = 1
-        input_length = 1
-        num_channels = 1
-        num_samples = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Bidirectional(
-                LSTM(num_channels, implementation=2, recurrent_activation="sigmoid"),
-                input_shape=(input_length, input_dim),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_tiny_no_sequence_bidir_random_gpu_half_precision(self):
-        return self.test_tiny_no_sequence_bidir_random_gpu(
-            model_precision=_MLMODEL_HALF_PRECISION
-        )
-
-    def test_small_no_sequence_bidir_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_length = 1
-        num_channels = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Bidirectional(
-                LSTM(num_channels, implementation=2, recurrent_activation="sigmoid"),
-                input_shape=(input_length, input_dim),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_medium_no_sequence_bidir_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_length = 1
-        num_channels = 10
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Bidirectional(
-                LSTM(num_channels, implementation=2, recurrent_activation="sigmoid"),
-                input_shape=(input_length, input_dim),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_medium_bidir_random_return_seq_false(self):
-        np.random.seed(1988)
-        input_dim = 7
-        input_length = 5
-        num_channels = 10
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Bidirectional(
-                LSTM(
-                    num_channels,
-                    return_sequences=False,
-                    implementation=2,
-                    recurrent_activation="sigmoid",
-                ),
-                input_shape=(input_length, input_dim),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_medium_bidir_random_return_seq_true(self):
-        np.random.seed(1988)
-        input_dim = 7
-        input_length = 5
-        num_channels = 10
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Bidirectional(
-                LSTM(
-                    num_channels,
-                    return_sequences=True,
-                    implementation=2,
-                    recurrent_activation="sigmoid",
-                ),
-                input_shape=(input_length, input_dim),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_model(model)
-
-    def test_bilstm_merge_modes(self):
-        # issue 157
-
-        def get_model(input_dim, fc_size, rnn_size, output_dim, merge_mode):
-            input_data = Input(name="the_input", shape=(None, input_dim))
-            x = TimeDistributed(Dense(fc_size, name="fc1", activation="relu",))(
-                input_data
-            )
-            x = Bidirectional(
-                LSTM(
-                    rnn_size,
-                    return_sequences=True,
-                    activation="relu",
-                    kernel_initializer="he_normal",
-                ),
-                merge_mode=merge_mode,
-            )(x)
-            y_pred = TimeDistributed(
-                Dense(output_dim, name="y_pred", activation="softmax")
-            )(x)
-            model = Model([input_data], [y_pred])
-            model.set_weights(
-                [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-            )
-            return model
-
-        input_dim = 26
-        fc_size = 512
-        rnn_size = 512
-        output_dim = 29
-        for merge_mode in ["concat", "sum", "mul", "ave"]:
-            model = get_model(input_dim, fc_size, rnn_size, output_dim, merge_mode)
-            self._test_model(model)
-
-    def test_tiny_conv_elu_random(self):
-        np.random.seed(1988)
-
-        # Define a model
-        from keras.layers.advanced_activations import ELU
-
-        model = Sequential()
-        model.add(Conv2D(input_shape=(10, 10, 3), filters=3, kernel_size=(5, 5)))
-        model.add(ELU(alpha=0.8))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_model(model)
-
-    def test_tiny_conv_prelu_random(self, model_precision=_MLMODEL_FULL_PRECISION):
-        np.random.seed(1988)
-
-        # Define a model
-        from keras.layers.advanced_activations import PReLU
-
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=(10, 10, 3), filters=3, kernel_size=(5, 5), padding="same"
-            )
-        )
-        model.add(PReLU(shared_axes=[1, 2]))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_tiny_conv_prelu_random_half_precision(self):
-        return self.test_tiny_conv_prelu_random(model_precision=_MLMODEL_HALF_PRECISION)
-
-    def test_tiny_conv_leaky_relu_random(self):
-        np.random.seed(1988)
-
-        # Define a model
-        from keras.layers.advanced_activations import LeakyReLU
-
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=(10, 10, 3), filters=3, kernel_size=(5, 5), padding="same"
-            )
-        )
-        model.add(LeakyReLU(alpha=0.3))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_model(model)
-
-    def test_tiny_conv_thresholded_relu_random(self):
-        np.random.seed(1988)
-
-        # Define a model
-        from keras.layers.advanced_activations import ThresholdedReLU
-
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=(10, 10, 3), filters=3, kernel_size=(5, 5), padding="same"
-            )
-        )
-        model.add(ThresholdedReLU(theta=0.8))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_model(model)
-
-    def test_tiny_concat_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        num_channels = 6
-
-        # Define a model
-        input_tensor = Input(shape=(input_dim,))
-        x1 = Dense(num_channels)(input_tensor)
-        x2 = Dense(num_channels)(x1)
-        x3 = Dense(num_channels)(x1)
-        x4 = concatenate([x2, x3])
-        x5 = Dense(num_channels)(x4)
-
-        model = Model(inputs=[input_tensor], outputs=[x5])
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_model(model)
-
-    def test_tiny_concat_seq_random(self):
-        np.random.seed(1988)
-        max_features = 10
-        embedding_dims = 4
-        seq_len = 5
-        num_channels = 6
-
-        # Define a model
-        input_tensor = Input(shape=(seq_len,))
-        x1 = Embedding(max_features, embedding_dims)(input_tensor)
-        x2 = Embedding(max_features, embedding_dims)(input_tensor)
-        x3 = concatenate([x1, x2], axis=1)
-
-        model = Model(inputs=[input_tensor], outputs=[x3])
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_model(model, one_dim_seq_flags=[True])
-
-    def test_lstm_concat_dense_random(self):
-        np.random.seed(1988)
-        vocab_size = 1250
-        seq_length = 5
-        units = 32
-
-        # Define a model
-        input = Input(shape=(seq_length,))
-        pos = Input(shape=(seq_length, 1))
-        embedding = Embedding(vocab_size, 50, input_length=seq_length)(input)
-        concat = Concatenate(axis=2)([embedding, pos])
-        model = LSTM(units, return_sequences=True, stateful=False)(concat)
-        model = LSTM(units, return_sequences=False)(model)
-        model = Dense(100, activation="relu")(model)
-        model = Dense(vocab_size, activation="softmax")(model)
-
-        model = Model(inputs=[input, pos], outputs=model)
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_model(model, one_dim_seq_flags=[True, True])
-
-    def test_tiny_add_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        num_channels = 6
-
-        # Define a model
-        input_tensor = Input(shape=(input_dim,))
-        x1 = Dense(num_channels)(input_tensor)
-        x2 = Dense(num_channels)(x1)
-        x3 = Dense(num_channels)(x1)
-        x4 = add([x2, x3])
-        x5 = Dense(num_channels)(x4)
-
-        model = Model(inputs=[input_tensor], outputs=[x5])
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_model(model)
-
-    def test_tiny_mul_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        num_channels = 6
-
-        # Define a model
-        input_tensor = Input(shape=(input_dim,))
-        x1 = Dense(num_channels)(input_tensor)
-        x2 = Dense(num_channels)(x1)
-        x3 = Dense(num_channels)(x1)
-        x4 = multiply([x2, x3])
-        x5 = Dense(num_channels)(x4)
-
-        model = Model(inputs=[input_tensor], outputs=[x5])
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_model(model)
-
-    def test_tiny_cos_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        num_channels = 6
-
-        # Define a model
-        input_tensor = Input(shape=(input_dim,))
-        x1 = Dense(num_channels)(input_tensor)
-        x2 = Dense(num_channels)(x1)
-        x3 = Dense(num_channels)(x1)
-        x4 = dot([x2, x3], axes=-1, normalize=True)
-        x5 = Dense(num_channels)(x4)
-
-        model = Model(inputs=[input_tensor], outputs=[x5])
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_model(model)
-
-    def test_zeropad_simple(self):
-
-        input_shape = (48, 48, 3)
-        model = Sequential()
-        model.add(ZeroPadding2D((1, 1), input_shape=input_shape))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_model(model)
-
-    def test_zeropad_fancy(self):
-
-        input_shape = (48, 48, 3)
-        model = Sequential()
-        model.add(ZeroPadding2D(((2, 5), (3, 4)), input_shape=input_shape))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_model(model)
-
-    def test_crop_simple(self):
-
-        input_shape = (48, 48, 3)
-        model = Sequential()
-        model.add(Cropping2D(cropping=((2, 5), (2, 5)), input_shape=input_shape))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_model(model)
-
-    def test_tiny_permute(self):
-        # When input blob is 3D array (D1, D2, D3), Keras assumes the axes' meaning is
-        # (D1=H,D2=W,D3=C), while CoreML assumes (D1=C,D2=H,D3=W)
-        import itertools
-
-        for permute_order in list(itertools.permutations([1, 2, 3])):
-            model = Sequential()
-            model.add(Permute(permute_order, input_shape=(4, 3, 2)))
-            self._test_model(model, transpose_keras_result=True)
-
-    def test_reshape_3d(self):
-        model = Sequential()
-        model.add(Reshape((10, 1, 6), input_shape=(5, 4, 3)))
-        self._test_model(model, mode="linear")
-
-    def test_tiny_conv_dense_random(self):
-        np.random.seed(1988)
-        num_samples = 1
-        input_dim = 8
-        input_shape = (input_dim, input_dim, 3)
-        num_kernels = 2
-        kernel_height = 5
-        kernel_width = 5
-        hidden_dim = 4
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=input_shape,
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-            )
-        )
-        model.add(Dropout(0.5))
-        model.add(Flatten())
-        model.add(Dense(hidden_dim))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_model(model)
-
-    def test_tiny_conv_dropout_random(self):
-        np.random.seed(1988)
-        num_samples = 1
-        input_dim = 8
-        input_shape = (input_dim, input_dim, 3)
-        num_kernels = 2
-        kernel_height = 5
-        kernel_width = 5
-        hidden_dim = 4
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=input_shape,
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-            )
-        )
-        model.add(SpatialDropout2D(0.5))
-        model.add(Flatten())
-        model.add(Dense(hidden_dim))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_model(model)
-
-    def test_tiny_dense_tanh_fused_random(self):
-        np.random.seed(1988)
-        num_samples = 1
-        input_dim = 3
-        hidden_dim = 4
-
-        # Define a model
-        model = Sequential()
-        model.add(Dense(hidden_dim, input_shape=(input_dim,), activation="tanh"))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_model(model)
-
-    def test_tiny_conv_relu_fused_random(self):
-        np.random.seed(1988)
-        num_samples = 1
-        input_dim = 8
-        input_shape = (input_dim, input_dim, 3)
-        num_kernels = 2
-        kernel_height = 5
-        kernel_width = 5
-        hidden_dim = 4
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=input_shape,
-                activation="relu",
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_model(model)
-
-    def test_tiny_time_distrbuted(self):
-
-        # as the first layer in a model
-        model = Sequential()
-        model.add(TimeDistributed(Dense(8), input_shape=(10, 16)))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        self._test_model(model)
-
-    def test_tiny_sequence_lstm(self, model_precision=_MLMODEL_FULL_PRECISION):
-
-        np.random.seed(1988)
-        input_dim = 1
-        input_length = 2
-        num_channels = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            LSTM(
-                num_channels,
-                input_shape=(input_length, input_dim),
-                implementation=1,
-                recurrent_activation="sigmoid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights(
-            [(np.random.rand(*w.shape) - 0.5) * 0.2 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_model(model, delta=1e-4, model_precision=model_precision)
-
-    def test_tiny_sequence_lstm_half_precision(self):
-        return self.test_tiny_sequence_lstm(model_precision=_MLMODEL_HALF_PRECISION)
-
-    def test_tiny_spatial_bn(self):
-        np.random.seed(1988)
-        x_in = Input(shape=(7, 7, 2))
-        x = ZeroPadding2D(padding=(1, 1))(x_in)
-        x = BatchNormalization(axis=2)(x)
-        model = Model(x_in, x)
-
-        self._test_model(model, delta=1e-2)
-
-    def test_embedding_fixed_length(self):
-        sequence_length = 5
-        vocab_size = 10
-        embed_channels = 4
-
-        dense_units = sequence_length * embed_channels
-        model = Sequential()
-        model.add(Embedding(vocab_size, embed_channels, input_length=sequence_length))
-        model.add(Flatten())
-        model.add(Dense(dense_units))
-        model.add(Dense(20))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        self._test_model(model, one_dim_seq_flags=[True])
-
-    def test_conv1d_flatten(self, delta=1e-2):
-        model = Sequential()
-        model.add(AveragePooling1D(2, input_shape=(64, 9)))
-        model.add(Conv1D(16, 1, padding="same", activation="relu", use_bias=False))
-        model.add(MaxPooling1D(2))
-        model.add(Flatten())
-        model.add(Dense(units=7, activation="softmax", use_bias=False))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        self._test_model(model, delta=delta)
-
-    def test_dense_fused_act_in_td(self):
-        np.random.seed(1988)
-        x_in = Input(shape=(10, 2))
-        x = TimeDistributed(Dense(6, activation="softmax"))(x_in)
-        model = Model(inputs=[x_in], outputs=[x])
-
-        self._test_model(model, delta=1e-4)
-
-    def test_conv_batch_1d(self):
-        np.random.seed(1988)
-        vocabulary_size = 4
-        embedding_dimension = 6
-        input_length = 10
-
-        model = Sequential()
-        model.add(
-            Embedding(
-                vocabulary_size,
-                embedding_dimension,
-                input_length=input_length,
-                trainable=True,
-            )
-        )
-
-        model.add(Conv1D(5, 2))
-        model.add(BatchNormalization())
-        model.add(Activation("relu"))
-
-        model.add(MaxPooling1D(2))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        self._test_model(model, one_dim_seq_flags=[True])
-
-    def test_lstm_td(self):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 4
-        num_channels = 3
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            SimpleRNN(
-                num_channels,
-                return_sequences=True,
-                input_shape=(input_length, input_dim),
-            )
-        )
-        model.add(TimeDistributed(Dense(5)))
-
-        # Set some random weights
-        model.set_weights(
-            [np.random.rand(*w.shape) * 0.2 - 0.1 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_model(model)
-
-    # Making sure that giant channel sizes get handled correctly
-    def test_large_channel_gpu(self):
-
-        input_shape = (20, 20, 3)
-        num_channels = 2049
-        kernel_size = 3
-
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=input_shape,
-                filters=num_channels,
-                kernel_size=(kernel_size, kernel_size),
-            )
-        )
-
-        model.set_weights(
-            [(np.random.rand(*w.shape) - 0.5) * 0.2 for w in model.get_weights()]
-        )
-
-        self._test_model(model, delta=1e-2)
-
-    @pytest.mark.xfail(raises=Exception)
-    def test_large_batch_gpu(self):
-
-        batch_size = 2049
-        num_channels = 4
-        kernel_size = 3
-
-        model = Sequential()
-        model.add(
-            TimeDistributed(Dense(num_channels), input_shape=(batch_size, kernel_size))
-        )
-
-        model.set_weights(
-            [(np.random.rand(*w.shape) - 0.5) * 0.2 for w in model.get_weights()]
-        )
-
-        self._test_model(model, delta=1e-2)
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-class KerasTopologyCorrectnessTest(KerasNumericCorrectnessTest):
-    def test_dangling_merge_left(self):
-
-        x1 = Input(shape=(4,), name="input1")
-        x2 = Input(shape=(5,), name="input2")
-        y1 = Dense(6, name="dense")(x2)
-        z = concatenate([x1, y1])
-        model = Model(inputs=[x1, x2], outputs=[z])
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        self._test_model(model)
-
-    def test_dangling_merge_right(self):
-
-        x1 = Input(shape=(4,), name="input1")
-        x2 = Input(shape=(5,), name="input2")
-        y1 = Dense(6, name="dense")(x2)
-        z = concatenate([y1, x1])
-        model = Model(inputs=[x1, x2], outputs=[z])
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        self._test_model(model)
-
-    def test_shared_vision(self):
-        digit_input = Input(shape=(27, 27, 1))
-        x = Conv2D(64, (3, 3))(digit_input)
-        x = Conv2D(64, (3, 3))(x)
-        out = Flatten()(x)
-
-        vision_model = Model(inputs=[digit_input], outputs=[out])
-
-        # then define the tell-digits-apart model
-        digit_a = Input(shape=(27, 27, 1))
-        digit_b = Input(shape=(27, 27, 1))
-
-        # the vision model will be shared, weights and all
-        out_a = vision_model(digit_a)
-        out_b = vision_model(digit_b)
-
-        concatenated = concatenate([out_a, out_b])
-        out = Dense(1, activation="sigmoid")(concatenated)
-        model = Model(inputs=[digit_a, digit_b], outputs=out)
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        self._test_model(model)
-
-    def test_tiny_weight_sharing(self):
-        #     - Dense1 -----------
-        # x - |                   |- Merge
-        #     - Dense1 - Dense2 --
-
-        x = Input(shape=(3,))
-        dense = Dense(4)
-        y1 = dense(x)
-        y2 = dense(x)
-        y3 = Dense(4)(y2)
-        z = concatenate([y1, y3])
-        model = Model(inputs=[x], outputs=[z])
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        self._test_model(model, mode="random", delta=1e-2)
-
-    def test_tiny_multiple_outputs(self):
-        x = Input(shape=(3,))
-        y1 = Dense(4)(x)
-        y2 = Dense(5)(x)
-        model = Model([x], [y1, y2])
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        self._test_model(model, mode="random", delta=1e-2)
-
-    def test_intermediate_outputs_dense(self):
-        x = Input(shape=(3,))
-        y = Dense(4, name="intermediate_dense_y")(x)
-        z = Dense(5, name="intermediate_dense_z")(y)
-        model = Model([x], [y, z])
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        self._test_model(model, mode="random", delta=1e-2)
-
-    def test_intermediate_outputs_conv2d(self):
-        x = Input(shape=(8, 8, 3))
-        y = Conv2D(4, (3, 3), name="intermdiate_conv2d_1")(x)
-        z = Conv2D(5, (3, 3), name="intermdiate_conv2d_2")(y)
-        model = Model([x], [y, z])
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        self._test_model(model, mode="random", delta=1e-2)
-
-    def test_intermediate_outputs_conv2d_fused_act(self):
-        x = Input(shape=(8, 8, 3))
-        y = Conv2D(4, (3, 3), name="intermdiate_conv2d_1_fused", activation="relu")(x)
-        z = Conv2D(5, (3, 3), name="intermdiate_conv2d_2_fused", activation="relu")(y)
-        model = Model([x], [y, z])
-
-        model.set_weights([np.random.rand(*w.shape) - 0.5 for w in model.get_weights()])
-        self._test_model(model, mode="random", delta=1e-2)
-
-    def test_intermediate_outputs_conv1d(self):
-        x = Input(shape=(10, 3))
-        y = Conv1D(4, 3, name="intermdiate_conv1d_1")(x)
-        z = Conv1D(5, 3, name="intermdiate_conv1d_2")(y)
-        model = Model([x], [y, z])
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        self._test_model(model, mode="random", delta=1e-2)
-
-    def test_intermediate_outputs_conv1d_fused_act(self):
-        x = Input(shape=(10, 3))
-        y = Conv1D(4, 3, name="intermdiate_conv1d_1_fused", activation="relu")(x)
-        z = Conv1D(5, 3, name="intermdiate_conv1d_2_fused", activation="relu")(y)
-        model = Model([x], [y, z])
-        model.set_weights([np.random.rand(*w.shape) - 0.5 for w in model.get_weights()])
-        self._test_model(model, mode="random", delta=1e-2)
-
-    def test_intermediate_rcnn_1d(self):
-
-        x_in = Input(shape=(10, 2))
-        # Conv block 1
-        x = Conv1D(3, 3, padding="same", name="interm_rcnn_conv1")(x_in)
-        x = BatchNormalization(axis=-1, name="interm_rcnn_bn1")(x)
-        x = Activation("elu")(x)
-        x = MaxPooling1D(pool_size=2, name="interm_rcnn_pool1")(x)
-
-        out1 = x  # out1.shape = (5,3)
-        x = GRU(6, name="gru1")(x)
-        out2 = x
-        model = Model(x_in, [out1, out2])
-        # model = Model(x_in, [out2])
-        self._test_model(model, mode="random_zero_mean", delta=1e-2)
-
-    def test_tiny_mobilenet_arch(self, model_precision=_MLMODEL_FULL_PRECISION):
-        def ReLU6(x, name):
-            if keras.__version__ >= _StrictVersion("2.2.1"):
-                return ReLU(6.0, name=name)(x)
-            else:
-                return Activation(relu6, name=name)(x)
-
-        img_input = Input(shape=(32, 32, 3))
-        x = Conv2D(
-            4, (3, 3), padding="same", use_bias=False, strides=(2, 2), name="conv1"
-        )(img_input)
-        x = BatchNormalization(axis=-1, name="conv1_bn")(x)
-        x = ReLU6(x, name="conv1_relu")
-
-        x = DepthwiseConv2D(
-            (3, 3),
-            padding="same",
-            depth_multiplier=1,
-            strides=(1, 1),
-            use_bias=False,
-            name="conv_dw_1",
-        )(x)
-        x = BatchNormalization(axis=-1, name="conv_dw_1_bn")(x)
-        x = ReLU6(x, name="conv_dw_1_relu")
-
-        x = Conv2D(
-            8, (1, 1), padding="same", use_bias=False, strides=(1, 1), name="conv_pw_1"
-        )(x)
-        x = BatchNormalization(axis=-1, name="conv_pw_1_bn")(x)
-        x = ReLU6(x, name="conv_pw_1_relu")
-
-        x = DepthwiseConv2D(
-            (3, 3),
-            padding="same",
-            depth_multiplier=1,
-            strides=(2, 2),
-            use_bias=False,
-            name="conv_dw_2",
-        )(x)
-        x = BatchNormalization(axis=-1, name="conv_dw_2_bn")(x)
-        x = ReLU6(x, name="conv_dw_2_relu")
-
-        x = Conv2D(
-            8, (1, 1), padding="same", use_bias=False, strides=(2, 2), name="conv_pw_2"
-        )(x)
-        x = BatchNormalization(axis=-1, name="conv_pw_2_bn")(x)
-        x = ReLU6(x, name="conv_pw_2_relu")
-
-        model = Model(inputs=[img_input], outputs=[x])
-
-        self._test_model(model, delta=1e-2, model_precision=model_precision)
-
-    def test_tiny_mobilenet_arch_half_precision(self):
-        self.test_tiny_mobilenet_arch(model_precision=_MLMODEL_HALF_PRECISION)
-
-    def test_tiny_xception(self, model_precision=_MLMODEL_FULL_PRECISION):
-        img_input = Input(shape=(32, 32, 3))
-        x = Conv2D(2, (3, 3), strides=(2, 2), use_bias=False, name="block1_conv1")(
-            img_input
-        )
-        x = BatchNormalization(name="block1_conv1_bn")(x)
-        x = Activation("relu", name="block1_conv1_act")(x)
-        x = Conv2D(4, (3, 3), use_bias=False, name="block1_conv2")(x)
-        x = BatchNormalization(name="block1_conv2_bn")(x)
-        x = Activation("relu", name="block1_conv2_act")(x)
-
-        residual = Conv2D(8, (1, 1), strides=(2, 2), padding="same", use_bias=False)(x)
-        residual = BatchNormalization()(residual)
-
-        x = SeparableConv2D(
-            8, (3, 3), padding="same", use_bias=False, name="block2_sepconv1"
-        )(x)
-        x = BatchNormalization(name="block2_sepconv1_bn")(x)
-        x = Activation("relu", name="block2_sepconv2_act")(x)
-        x = SeparableConv2D(
-            8, (3, 3), padding="same", use_bias=False, name="block2_sepconv2"
-        )(x)
-        x = BatchNormalization(name="block2_sepconv2_bn")(x)
-
-        x = MaxPooling2D((3, 3), strides=(2, 2), padding="same", name="block2_pool")(x)
-        x = add([x, residual])
-
-        residual = Conv2D(16, (1, 1), strides=(2, 2), padding="same", use_bias=False)(x)
-        residual = BatchNormalization()(residual)
-
-        model = Model(inputs=[img_input], outputs=[residual])
-
-        self._test_model(model, delta=1e-2, model_precision=model_precision)
-
-    def test_tiny_xception_half_precision(self):
-        return self.test_tiny_xception(model_precision=_MLMODEL_HALF_PRECISION)
-
-    def test_nested_model_giving_output(self):
-        base_model = Sequential()
-        base_model.add(Conv2D(32, (1, 1), input_shape=(4, 4, 3)))
-
-        top_model = Sequential()
-        top_model.add(Flatten(input_shape=base_model.output_shape[1:]))
-        top_model.add(Dense(16, activation="relu"))
-        top_model.add(Dense(1, activation="sigmoid"))
-
-        model = Model(inputs=base_model.input, outputs=top_model(base_model.output))
-        self._test_model(model)
-
-    # similar to issue 269
-    def test_time_distributed_conv(self):
-        model = Sequential()
-        model.add(
-            TimeDistributed(
-                Conv2D(64, (3, 3), activation="relu"), input_shape=(1, 30, 30, 3)
-            )
-        )
-        model.add(TimeDistributed(MaxPooling2D((2, 2), strides=(1, 1))))
-        model.add(TimeDistributed(Conv2D(32, (4, 4), activation="relu")))
-        model.add(TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2))))
-        model.add(TimeDistributed(Conv2D(32, (4, 4), activation="relu")))
-        model.add(TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2))))
-        model.add(TimeDistributed(Flatten()))
-        model.add(Dropout(0.5))
-        model.add(LSTM(32, return_sequences=False, dropout=0.5))
-        model.add(Dense(10, activation="sigmoid"))
-        self._test_model(model)
-
-
-@pytest.mark.slow
-@pytest.mark.keras2
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-class KerasNumericCorrectnessStressTest(KerasNumericCorrectnessTest):
-    """
-    Unit test class for testing all combinations of a particular
-    layer.
-    """
-
-    def _run_test(
-        self,
-        model,
-        param,
-        model_dir=None,
-        delta=1e-2,
-        transpose_keras_result=True,
-        one_dim_seq_flags=None,
-        model_precision=_MLMODEL_FULL_PRECISION,
-    ):
-        """ Run a test on a particular model
-        """
-        use_tmp_folder = False
-        if model_dir is None:
-            use_tmp_folder = True
-            model_dir = tempfile.mkdtemp()
-        model_path = os.path.join(model_dir, "keras.mlmodel")
-
-        # Generate some random data
-        nb_inputs = len(model.inputs)
-        if nb_inputs > 1:
-            input_names = []
-            input_data = []
-            coreml_input = {}
-            for i in range(nb_inputs):
-                input_shape = [1 if a is None else a for a in model.input_shape[i]]
-                X = _generate_data(input_shape)
-                feature_name = "data_%s" % i
-                input_names.append(feature_name)
-                input_data.append(X)
-                if one_dim_seq_flags is None:
-                    coreml_input[feature_name] = _keras_transpose(X).astype("f")
-                else:
-                    coreml_input[feature_name] = _keras_transpose(
-                        X, one_dim_seq_flags[i]
-                    ).astype("f")
-        else:
-            input_shape = [1 if a is None else a for a in model.input_shape]
-            input_names = ["data"]
-            input_data = _generate_data(input_shape)
-            if one_dim_seq_flags is None:
-                coreml_input = {"data": _keras_transpose(input_data).astype("f")}
-            else:
-                coreml_input = {
-                    "data": _keras_transpose(input_data, one_dim_seq_flags[0]).astype(
-                        "f"
-                    )
-                }
-
-        # Make predictions
-        if transpose_keras_result:
-            keras_preds = _keras_transpose(model.predict(input_data)).flatten()
-        else:
-            keras_preds = model.predict(input_data).flatten()
-
-        # Get the model
-        coreml_model = _get_coreml_model(
-            model, input_names, ["output"], model_precision=model_precision
-        )
-        if _is_macos() and _macos_version() >= (10, 13):
-            # get prediction
-            coreml_preds = coreml_model.predict(coreml_input)["output"].flatten()
-
-            if use_tmp_folder:
-                shutil.rmtree(model_dir)
-            self.assertEqual(
-                len(coreml_preds),
-                len(keras_preds),
-                msg="Failed test case %s. Lengths wrong (%s vs %s)"
-                % (param, len(coreml_preds), len(keras_preds)),
-            )
-            for i in range(len(keras_preds)):
-                max_den = max(1.0, keras_preds[i], coreml_preds[i])
-                self.assertAlmostEqual(
-                    keras_preds[i] / max_den,
-                    coreml_preds[i] / max_den,
-                    delta=delta,
-                    msg="Failed test case %s. Predictions wrong (%s vs %s)"
-                    % (param, coreml_preds[i], keras_preds[i]),
-                )
-
-    @pytest.mark.slow
-    def test_activation_layer_params(self):
-        options = dict(
-            activation=[
-                "tanh",
-                "relu",
-                "sigmoid",
-                "softmax",
-                "softplus",
-                "softsign",
-                "hard_sigmoid",
-                "elu",
-            ]
-        )
-
-        # Define a function that tests a model
-        num_channels = 10
-        input_dim = 10
-
-        def build_model(x):
-            model = Sequential()
-            model.add(Dense(num_channels, input_dim=input_dim))
-            model.add(Activation(**dict(zip(options.keys(), x))))
-            return x, model
-
-        # Iterate through all combinations
-        product = itertools.product(*options.values())
-        args = [build_model(p) for p in product]
-
-        # Test the cases
-        print("Testing a total of %s cases. This could take a while" % len(args))
-        for param, model in args:
-            model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-            self._run_test(model, param)
-
-    @pytest.mark.slow
-    def test_dense_layer_params(self):
-        options = dict(
-            activation=[
-                "relu",
-                "softmax",
-                "tanh",
-                "sigmoid",
-                "softplus",
-                "softsign",
-                "elu",
-                "hard_sigmoid",
-            ],
-            use_bias=[True, False],
-        )
-        # Define a function that tests a model
-        input_shape = (10,)
-        num_channels = 10
-
-        def build_model(x):
-            kwargs = dict(zip(options.keys(), x))
-            model = Sequential()
-            model.add(Dense(num_channels, input_shape=input_shape, **kwargs))
-            return x, model
-
-        # Iterate through all combinations
-        product = itertools.product(*options.values())
-        args = [build_model(p) for p in product]
-
-        # Test the cases
-        print("Testing a total of %s cases. This could take a while" % len(args))
-        for param, model in args:
-            self._run_test(model, param)
-
-    @pytest.mark.slow
-    def test_upsample_layer_params(self):
-        options = dict(size=[(2, 2), (3, 3), (4, 4), (5, 5)])
-
-        np.random.seed(1988)
-        input_dim = 10
-        input_shape = (input_dim, input_dim, 1)
-        X = np.random.rand(1, *input_shape)
-
-        # Define a function that tests a model
-        def build_model(x):
-            kwargs = dict(zip(options.keys(), x))
-            model = Sequential()
-            model.add(Conv2D(filters=5, kernel_size=(7, 7), input_shape=input_shape))
-            model.add(UpSampling2D(**kwargs))
-            return x, model
-
-        # Iterate through all combinations
-        product = itertools.product(*options.values())
-        args = [build_model(p) for p in product]
-
-        # Test the cases
-        print("Testing a total of %s cases. This could take a while" % len(args))
-        for param, model in args:
-            self._run_test(model, param)
-
-    @pytest.mark.slow
-    def test_conv_layer_params(self, model_precision=_MLMODEL_FULL_PRECISION):
-        options = dict(
-            activation=[
-                "relu",
-                "tanh",
-                "sigmoid",
-            ],  # keras does not support softmax on 4-D
-            use_bias=[True, False],
-            padding=["same", "valid"],
-            filters=[1, 3, 5],
-            kernel_size=[[5, 5]],  # fails when sizes are different
-        )
-
-        # Define a function that tests a model
-        input_shape = (10, 10, 1)
-
-        def build_model(x):
-            kwargs = dict(zip(options.keys(), x))
-            model = Sequential()
-            model.add(Conv2D(input_shape=input_shape, **kwargs))
-            return x, model
-
-        # Iterate through all combinations
-        product = itertools.product(*options.values())
-        args = [build_model(p) for p in product]
-
-        # Test the cases
-        print("Testing a total of %s cases. This could take a while" % len(args))
-        for param, model in args:
-            self._run_test(model, param, model_precision=model_precision)
-
-    @pytest.mark.keras2
-    def test_conv_layer_params_half_precision(self):
-        return self.test_conv_layer_params(model_precision=_MLMODEL_HALF_PRECISION)
-
-    @pytest.mark.slow
-    def test_dense_elementwise_params(self):
-        options = dict(modes=[add, multiply, concatenate, average, maximum])
-
-        def build_model(mode):
-            x1 = Input(shape=(3,))
-            x2 = Input(shape=(3,))
-            y1 = Dense(4)(x1)
-            y2 = Dense(4)(x2)
-            z = mode([y1, y2])
-            model = Model([x1, x2], z)
-            return mode, model
-
-        product = itertools.product(*options.values())
-        args = [build_model(p[0]) for p in product]
-        print("Testing a total of %s cases. This could take a while" % len(args))
-        for param, model in args:
-            self._run_test(model, param)
-
-    def test_vgg_16_tiny(self):
-
-        input_shape = (48, 48, 3)
-        model = Sequential()
-        model.add(ZeroPadding2D((1, 1), input_shape=input_shape))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(MaxPooling2D((2, 2), strides=(2, 2)))
-
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(MaxPooling2D((2, 2), strides=(2, 2)))
-
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(MaxPooling2D((2, 2), strides=(2, 2)))
-
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(MaxPooling2D((2, 2), strides=(2, 2)))
-
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(MaxPooling2D((2, 2), strides=(2, 2)))
-
-        model.add(Flatten())
-        model.add(Dense(32, activation="relu"))
-        model.add(Dropout(0.5))
-        model.add(Dense(32, activation="relu"))
-        model.add(Dropout(0.5))
-        model.add(Dense(1000))  # activation='softmax'))
-
-        # Set some random weights
-        model.set_weights(
-            [(np.random.rand(*w.shape) - 0.5) * 0.2 for w in model.get_weights()]
-        )
-
-        # Get the coreml model
-        self._test_model(model)
-
-    def test_vgg_16_tiny_no_pooling(self):
-
-        input_shape = (48, 48, 3)
-        model = Sequential()
-        model.add(ZeroPadding2D((1, 1), input_shape=input_shape))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(MaxPooling2D((2, 2), strides=(2, 2)))
-
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(MaxPooling2D((2, 2), strides=(2, 2)))
-
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(MaxPooling2D((2, 2), strides=(2, 2)))
-
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(MaxPooling2D((2, 2), strides=(2, 2)))
-
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(MaxPooling2D((2, 2), strides=(2, 2)))
-
-        model.add(Flatten())
-        model.add(Dense(32, activation="relu"))
-        # model.add(Dropout(0.5))
-        model.add(Dense(32, activation="relu"))
-        # model.add(Dropout(0.5))
-        model.add(Dense(1000))  # activation='softmax'))
-
-        # Set some random weights
-        model.set_weights(
-            [(np.random.rand(*w.shape) - 0.5) * 0.2 for w in model.get_weights()]
-        )
-
-        # Get the coreml model
-        self._test_model(model)
-
-    def test_vgg_16_tiny_no_pooling_no_padding(
-        self, model_precision=_MLMODEL_FULL_PRECISION
-    ):
-
-        input_shape = (48, 48, 3)
-        model = Sequential()
-        model.add(Conv2D(32, (3, 3), activation="relu", input_shape=input_shape))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-        model.add(Conv2D(32, (3, 3), activation="relu"))
-
-        model.add(Flatten())
-        model.add(Dense(32, activation="relu"))
-        model.add(Dropout(0.5))
-        model.add(Dense(32, activation="relu"))
-        model.add(Dropout(0.5))
-        model.add(Dense(1000, activation="softmax"))
-
-        # Get the coreml model
-        self._test_model(model, model_precision=model_precision)
-
-    def test_vgg_16_tiny_no_pooling_no_padding_half_precision(self):
-        return self.test_vgg_16_tiny_no_pooling_no_padding(
-            model_precision=_MLMODEL_HALF_PRECISION
-        )
-
-    def test_imdb_fasttext_first_2(self):
-
-        max_features = 10
-        max_len = 6
-        embedding_dims = 4
-        pool_length = 2
-
-        model = Sequential()
-        model.add(Embedding(max_features, embedding_dims, input_length=max_len))
-        # we add a AveragePooling1D, which will average the embeddings
-        # of all words in the document
-        model.add(AveragePooling1D(pool_size=pool_length))
-
-        self._test_model(model, one_dim_seq_flags=[True])
-
-    def test_tiny_mcrnn_td(self):
-
-        model = Sequential()
-        model.add(Conv2D(3, (1, 1), input_shape=(2, 4, 4), padding="same"))
-        model.add(AveragePooling2D(pool_size=(2, 2)))
-        model.add(Reshape((2, 3)))
-        model.add(TimeDistributed(Dense(5)))
-
-        self._test_model(model)
-
-    def test_tiny_mcrnn_recurrent(self):
-
-        model = Sequential()
-        model.add(Conv2D(3, (1, 1), input_shape=(2, 4, 4), padding="same"))
-        model.add(AveragePooling2D(pool_size=(2, 2)))
-        model.add(Reshape((2, 3)))
-        model.add(LSTM(5, recurrent_activation="sigmoid"))
-
-        self._test_model(model)
-
-    def test_tiny_mcrnn_music_tagger(self):
-
-        x_in = Input(shape=(4, 6, 1))
-        x = ZeroPadding2D(padding=(0, 1))(x_in)
-        x = BatchNormalization(axis=2, name="bn_0_freq")(x)
-        # Conv block 1
-        x = Conv2D(2, (3, 3), padding="same", name="conv1")(x)
-        x = BatchNormalization(axis=3, name="bn1")(x)
-        x = Activation("elu")(x)
-        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name="pool1")(x)
-        # Conv block 2
-        x = Conv2D(4, (3, 3), padding="same", name="conv2")(x)
-        x = BatchNormalization(axis=3, name="bn2")(x)
-        x = Activation("elu")(x)
-        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name="pool2")(x)
-
-        # Should get you (1,1,2,4)
-        x = Reshape((2, 4))(x)
-        x = GRU(32, return_sequences=True, name="gru1")(x)
-        x = GRU(32, return_sequences=False, name="gru2")(x)
-
-        # Create model.
-        model = Model(x_in, x)
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        self._test_model(model, mode="random_zero_mean", delta=1e-2)
-
-    def test_tiny_apple_manual(self):
-        model = Sequential()
-        model.add(LSTM(3, input_shape=(4, 5), recurrent_activation="sigmoid"))
-        model.add(Dense(5))
-        model.add(Activation("softmax"))
-
-        self._test_model(model)
-
-    def test_tiny_image_captioning_image_branch(self):
-        img_input_1 = Input(shape=(16, 16, 3))
-        x = Conv2D(2, (3, 3))(img_input_1)
-        x = Flatten()(x)
-        img_model = Model(inputs=[img_input_1], outputs=[x])
-
-        img_input = Input(shape=(16, 16, 3))
-        x = img_model(img_input)
-        x = Dense(8, name="cap_dense")(x)
-        x = Reshape((1, 8), name="cap_reshape")(x)
-        image_branch = Model(inputs=[img_input], outputs=[x])
-        self._test_model(image_branch)
-
-    def test_tiny_image_captioning_feature_merge(self):
-
-        img_input_1 = Input(shape=(16, 16, 3))
-        x = Conv2D(2, (3, 3))(img_input_1)
-        x = Flatten()(x)
-        img_model = Model([img_input_1], [x])
-
-        img_input = Input(shape=(16, 16, 3))
-        x = img_model(img_input)
-        x = Dense(8, name="cap_dense")(x)
-        x = Reshape((1, 8), name="cap_reshape")(x)
-
-        sentence_input = Input(shape=(5,))  # max_length = 5
-        y = Embedding(8, 8, name="cap_embedding")(sentence_input)
-        z = concatenate([x, y], axis=1, name="cap_merge")
-
-        combined_model = Model(inputs=[img_input, sentence_input], outputs=[z])
-        self._test_model(combined_model, one_dim_seq_flags=[False, True])
-
-    def test_tiny_image_captioning(self):
-        # use a conv layer as a image feature branch
-        img_input_1 = Input(shape=(16, 16, 3))
-        x = Conv2D(2, (3, 3))(img_input_1)
-        x = Flatten()(x)
-        img_model = Model(inputs=[img_input_1], outputs=[x])
-
-        img_input = Input(shape=(16, 16, 3))
-        x = img_model(img_input)
-        x = Dense(8, name="cap_dense")(x)
-        x = Reshape((1, 8), name="cap_reshape")(x)
-
-        sentence_input = Input(shape=(5,))  # max_length = 5
-        y = Embedding(8, 8, name="cap_embedding")(sentence_input)
-        z = concatenate([x, y], axis=1, name="cap_merge")
-        z = LSTM(4, return_sequences=True, name="cap_lstm")(z)
-        z = TimeDistributed(Dense(8), name="cap_timedistributed")(z)
-
-        combined_model = Model(inputs=[img_input, sentence_input], outputs=[z])
-        self._test_model(combined_model, one_dim_seq_flags=[False, True])
-
-    def test_tiny_babi_rnn(self):
-        vocab_size = 10
-        embed_hidden_size = 8
-        story_maxlen = 5
-        query_maxlen = 5
-
-        input_tensor_1 = Input(shape=(story_maxlen,))
-        x1 = Embedding(vocab_size, embed_hidden_size)(input_tensor_1)
-        x1 = Dropout(0.3)(x1)
-
-        input_tensor_2 = Input(shape=(query_maxlen,))
-        x2 = Embedding(vocab_size, embed_hidden_size)(input_tensor_2)
-        x2 = Dropout(0.3)(x2)
-        x2 = LSTM(embed_hidden_size, return_sequences=False)(x2)
-        x2 = RepeatVector(story_maxlen)(x2)
-
-        x3 = add([x1, x2])
-        x3 = LSTM(embed_hidden_size, return_sequences=False)(x3)
-        x3 = Dropout(0.3)(x3)
-        x3 = Dense(vocab_size, activation="softmax")(x3)
-
-        model = Model(inputs=[input_tensor_1, input_tensor_2], outputs=[x3])
-
-        self._test_model(model, one_dim_seq_flags=[True, True])
-
-    def test_clickbait_cnn(self, model_precision=_MLMODEL_FULL_PRECISION):
-        # from: https://github.com/saurabhmathur96/clickbait-detector
-        vocabulary_size = 500
-        embedding_dimension = 30
-        input_length = 20
-
-        model = Sequential()
-        model.add(
-            Embedding(
-                vocabulary_size,
-                embedding_dimension,
-                input_length=input_length,
-                trainable=True,
-            )
-        )
-
-        model.add(Conv1D(32, 2))
-        model.add(BatchNormalization())
-        model.add(Activation("relu"))
-
-        model.add(Conv1D(32, 2))
-        model.add(BatchNormalization())
-        model.add(Activation("relu"))
-
-        model.add(Conv1D(32, 2))
-        model.add(BatchNormalization())
-        model.add(Activation("relu"))
-
-        model.add(MaxPooling1D(17))
-        model.add(Flatten())
-
-        model.add(Dense(1, use_bias=True))
-        model.add(BatchNormalization())
-        model.add(Activation("sigmoid"))
-
-        self._test_model(
-            model, one_dim_seq_flags=[True], model_precision=model_precision
-        )
-
-    def test_clickbait_cnn_half_precision(self):
-        return self.test_clickbait_cnn(model_precision=_MLMODEL_HALF_PRECISION)
-
-    def test_model_with_duplicated_edges(self):
-        # Create a simple model
-        inputs = Input(shape=(20, 20))
-        activation = Activation("relu")(inputs)
-        cropping = Cropping1D(cropping=(1, 1))(activation)
-        conv1d = Conv1D(20, 3, padding="valid")(activation)
-        ouputs = Add()([conv1d, cropping])
-
-        model = Model(inputs, ouputs)
-        self._test_model(model)
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-class KerasBasicConversionTest(KerasNumericCorrectnessTest):
-    def test_float_arraytype_flag(self):
-        np.random.seed(1988)
-        # Define a model
-        model = Sequential()
-        model.add(Dense(1000, input_shape=(100,)))
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        # Convert model
-        from coremltools.converters import keras as keras_converter
-
-        coreml_model = keras_converter.convert(model, use_float_arraytype=True)
-        spec = coreml_model.get_spec()
-        from coremltools.proto import Model_pb2 as _Model_pb2
-
-        self.assertEqual(
-            spec.description.input[0].type.multiArrayType.dataType,
-            _Model_pb2.ArrayFeatureType.FLOAT32,
-        )
-        self.assertEqual(
-            spec.description.output[0].type.multiArrayType.dataType,
-            _Model_pb2.ArrayFeatureType.FLOAT32,
-        )
-
-
-if __name__ == "__main__":
-    unittest.main()
-    # suite = unittest.TestSuite()
-    # suite.addTest(KerasBasicNumericCorrectnessTest("test_lstm_concat_dense_random"))
-    # unittest.TextTestRunner().run(suite)
diff --git a/coremltools/test/neural_network/test_keras_nonseq.py b/coremltools/test/neural_network/test_keras_nonseq.py
deleted file mode 100644
index 73ad048e1..000000000
--- a/coremltools/test/neural_network/test_keras_nonseq.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# Copyright (c) 2017, Apple Inc. All rights reserved.
-#
-# Use of this source code is governed by a BSD-3-clause license that can be
-# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-
-import unittest
-import pytest
-from coremltools._deps import _HAS_KERAS_TF
-
-if _HAS_KERAS_TF:
-    from keras.models import Model
-    from keras.layers import Dense, Input, merge
-    from coremltools.converters import keras
-
-
-@unittest.skipIf(not _HAS_KERAS_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras1
-class KerasNonSequentialModelTest(unittest.TestCase):
-    """
-    Unit test class for testing non-sequential Keras models.
-    """
-
-    @classmethod
-    def setUpClass(self):
-        """
-        Set up the unit test by loading common utilities.
-        """
-
-    def test_simple_merge(self):
-        """
-        Test the following Keras model
-               |- dense-|
-        dense -|        |- merge - dense
-               |- dense-|
-        """
-        input_tensor = Input(shape=(3,))
-        x1 = Dense(4)(input_tensor)
-        x2 = Dense(5)(x1)
-        x3 = Dense(6)(x1)
-        x4 = merge([x2, x3], mode="concat")
-        x5 = Dense(7)(x4)
-
-        model = Model(input=[input_tensor], output=[x5])
-        input_names = ["data"]
-        output_names = ["output"]
-
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertEqual(
-            sorted(input_names), sorted(map(lambda x: x.name, spec.description.input))
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertEqual(
-            sorted(output_names), sorted(map(lambda x: x.name, spec.description.output))
-        )
-
-    def test_merge_add(self):
-        """
-        Test the following Keras model
-               |- dense-|
-        dense -|        |- merge - dense
-               |- dense-|
-        """
-        input_tensor = Input(shape=(3,))
-        x1 = Dense(4)(input_tensor)
-        x2 = Dense(5)(x1)
-        x3 = Dense(5)(x1)
-        x4 = merge([x2, x3], mode="sum")
-        x5 = Dense(7)(x4)
-
-        model = Model(input=[input_tensor], output=[x5])
-        input_names = ["data"]
-        output_names = ["output"]
-
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertEqual(
-            sorted(input_names), sorted(map(lambda x: x.name, spec.description.input))
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertEqual(
-            sorted(output_names), sorted(map(lambda x: x.name, spec.description.output))
-        )
-
-    def test_merge_multiply(self):
-        """
-        Test the following Keras model
-               |- dense-|
-        dense -|        |- merge - dense
-               |- dense-|
-        """
-        input_tensor = Input(shape=(3,))
-        x1 = Dense(4)(input_tensor)
-        x2 = Dense(5)(x1)
-        x3 = Dense(5)(x1)
-        x4 = merge([x2, x3], mode="mul")
-        x5 = Dense(7)(x4)
-
-        model = Model(input=[input_tensor], output=[x5])
-        input_names = ["data"]
-        output_names = ["output"]
-
-        spec = keras.convert(model, input_names, output_names).get_spec()
-        self.assertIsNotNone(spec)
-
-        # Test the model class
-        self.assertIsNotNone(spec.description)
-        self.assertTrue(spec.HasField("neuralNetwork"))
-
-        # Test the inputs and outputs
-        self.assertEqual(len(spec.description.input), len(input_names))
-        self.assertEqual(
-            sorted(input_names), sorted(map(lambda x: x.name, spec.description.input))
-        )
-        self.assertEqual(len(spec.description.output), len(output_names))
-        self.assertEqual(
-            sorted(output_names), sorted(map(lambda x: x.name, spec.description.output))
-        )
diff --git a/coremltools/test/neural_network/test_keras_numeric.py b/coremltools/test/neural_network/test_keras_numeric.py
deleted file mode 100644
index 51c5d2d9f..000000000
--- a/coremltools/test/neural_network/test_keras_numeric.py
+++ /dev/null
@@ -1,3137 +0,0 @@
-# Copyright (c) 2017, Apple Inc. All rights reserved.
-#
-# Use of this source code is governed by a BSD-3-clause license that can be
-# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-
-import itertools
-import os
-import shutil
-import tempfile
-import unittest
-
-import numpy as np
-import pytest
-
-from coremltools._deps import _HAS_KERAS_TF
-from coremltools.models.utils import _macos_version, _is_macos
-
-if _HAS_KERAS_TF:
-    from keras.models import Sequential, Model
-    from keras.layers import (
-        Dense,
-        Activation,
-        Convolution2D,
-        AtrousConvolution2D,
-        LSTM,
-        ZeroPadding2D,
-        Deconvolution2D,
-        Permute,
-        Convolution1D,
-        AtrousConvolution1D,
-        MaxPooling2D,
-        AveragePooling2D,
-        Flatten,
-        Dropout,
-        UpSampling2D,
-        merge,
-        Merge,
-        Input,
-        GRU,
-        GlobalMaxPooling2D,
-        GlobalMaxPooling1D,
-        GlobalAveragePooling2D,
-        GlobalAveragePooling1D,
-        Cropping1D,
-        Cropping2D,
-        Reshape,
-        AveragePooling1D,
-        MaxPooling1D,
-        RepeatVector,
-        ELU,
-        SimpleRNN,
-        BatchNormalization,
-        Embedding,
-        ZeroPadding1D,
-        UpSampling1D,
-    )
-    from keras.layers.wrappers import Bidirectional, TimeDistributed
-
-
-def _keras_transpose(x, is_sequence=False):
-    if len(x.shape) == 4:
-        # Keras input shape = [Batch, Height, Width, Channels]
-        x = np.transpose(x, [0, 3, 1, 2])
-        return np.expand_dims(x, axis=0)
-    elif len(x.shape) == 3:
-        # Keras input shape = [Batch, (Sequence) Length, Channels]
-        return np.transpose(x, [1, 0, 2])
-    elif len(x.shape) == 2:
-        if is_sequence:  # (N,S) --> (S,N,1,)
-            return x.reshape(x.shape[::-1] + (1,))
-        else:  # (N,C) --> (N,C,1,1)
-            return x.reshape((1,) + x.shape)  # Dense
-    elif len(x.shape) == 1:
-        if is_sequence:  # (S) --> (S,N,1,1,1)
-            return x.reshape((x.shape[0], 1, 1))
-        else:
-            return x
-    else:
-        return x
-
-
-def _get_coreml_model(model, model_path, input_names, output_names):
-    """
-    Get the coreml model from the Keras model.
-    """
-    # Convert the model
-    from coremltools.converters import keras as keras_converter
-
-    model = keras_converter.convert(model, input_names, output_names)
-    return model
-
-
-def _generate_data(input_shape, mode="random"):
-    """
-    Generate some random data according to a shape.
-    """
-    if mode == "zeros":
-        X = np.zeros(input_shape)
-    elif mode == "ones":
-        X = np.ones(input_shape)
-    elif mode == "linear":
-        X = np.array(range(np.product(input_shape))).reshape(input_shape)
-    elif mode == "random":
-        X = np.random.rand(*input_shape)
-    elif mode == "random_zero_mean":
-        X = np.random.rand(*input_shape) - 0.5
-    return X
-
-
-def conv2d_bn(
-    x, nb_filter, nb_row, nb_col, border_mode="same", subsample=(1, 1), name=None
-):
-    """
-    Utility function to apply conv + BN.
-    """
-    if name is not None:
-        bn_name = name + "_bn"
-        conv_name = name + "_conv"
-    else:
-        bn_name = None
-        conv_name = None
-    bn_axis = 3
-    x = Convolution2D(
-        nb_filter,
-        nb_row,
-        nb_col,
-        subsample=subsample,
-        activation="relu",
-        border_mode=border_mode,
-        name=conv_name,
-    )(x)
-    x = BatchNormalization(axis=bn_axis, name=bn_name)(x)
-    return x
-
-
-@unittest.skipIf(not _HAS_KERAS_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras1
-class KerasNumericCorrectnessTest(unittest.TestCase):
-    """
-    Unit test class for testing the Keras converter.
-    """
-
-    def _test_keras_model(
-        self,
-        model,
-        num_samples=1,
-        mode="random",
-        input_blob="data",
-        output_blob="output",
-        delta=1e-2,
-        model_dir=None,
-        transpose_keras_result=True,
-        one_dim_seq_flags=None,
-    ):
-
-        # transpose_keras_result: if true, compare the transposed Keras result
-        # one_dim_seq_flags: a list of same length as the number of inputs in
-        # the model; if None, treat all 1D input (if any) as non-sequence
-        # if one_dim_seq_flags[i] is True, it means the ith input, with shape
-        # (X,) is in fact a sequence of length X.
-
-        # Get the CoreML model
-        use_tmp_folder = False
-        if model_dir is None:
-            use_tmp_folder = True
-            model_dir = tempfile.mkdtemp()
-        model_path = os.path.join(model_dir, "keras.mlmodel")
-
-        # Generate data
-        nb_inputs = len(model.inputs)
-        if nb_inputs > 1:
-            input_names = []
-            input_data = []
-            coreml_input = {}
-            for i in range(nb_inputs):
-                input_shape = [1 if a is None else a for a in model.input_shape[i]]
-                X = _generate_data(input_shape, mode)
-                feature_name = "data_%s" % i
-                input_names.append(feature_name)
-                input_data.append(X)
-                if one_dim_seq_flags is None:
-                    coreml_input[feature_name] = _keras_transpose(X).astype("f").copy()
-                else:
-                    coreml_input[feature_name] = (
-                        _keras_transpose(X, one_dim_seq_flags[i]).astype("f").copy()
-                    )
-        else:
-            input_shape = [1 if a is None else a for a in model.input_shape]
-            input_names = ["data"]
-            input_data = _generate_data(input_shape, mode)
-            if one_dim_seq_flags is None:
-                coreml_input = {"data": _keras_transpose(input_data).astype("f").copy()}
-            else:
-                coreml_input = {
-                    "data": _keras_transpose(input_data, one_dim_seq_flags[0])
-                    .astype("f")
-                    .copy()
-                }
-
-        # Compile the model
-        output_names = ["output" + str(i) for i in range(len(model.outputs))]
-        coreml_model = _get_coreml_model(model, model_path, input_names, output_names)
-
-        if _is_macos() and _macos_version() >= (10, 13):
-            # Assuming coreml model output names are in the same order as Keras
-            # Output list, put predictions into a list, sorted by output name
-            coreml_preds = coreml_model.predict(coreml_input)
-            c_preds = [coreml_preds[name] for name in output_names]
-
-            # Run Keras predictions
-            keras_preds = model.predict(input_data)
-            k_preds = keras_preds if type(keras_preds) is list else [keras_preds]
-
-            # Compare each output blob
-            for idx, k_pred in enumerate(k_preds):
-                if transpose_keras_result:
-                    kp = _keras_transpose(k_pred).flatten()
-                else:
-                    kp = k_pred.flatten()
-                cp = c_preds[idx].flatten()
-                # Compare predictions
-                self.assertEqual(len(kp), len(cp))
-                for i in range(len(kp)):
-                    max_den = max(1.0, kp[i], cp[i])
-                    self.assertAlmostEqual(
-                        kp[i] / max_den, cp[i] / max_den, delta=delta
-                    )
-
-        # Cleanup files - models on disk no longer useful
-        if use_tmp_folder and os.path.exists(model_dir):
-            shutil.rmtree(model_dir)
-
-
-@unittest.skipIf(not _HAS_KERAS_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras1
-class KerasBasicNumericCorrectnessTest(KerasNumericCorrectnessTest):
-    def test_tiny_inner_product_zero_input(self):
-        np.random.seed(1988)
-        input_dim = 2
-        num_channels = 2
-
-        # Define a model
-        model = Sequential()
-        model.add(Dense(num_channels, input_dim=input_dim))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, mode="zeros")
-
-    def test_tiny_inner_product_ones(self):
-        np.random.seed(1988)
-        input_dim = 2
-        num_channels = 2
-
-        # Define a model
-        model = Sequential()
-        model.add(Dense(num_channels, input_dim=input_dim))
-
-        # Set some random weights
-        model.set_weights([np.ones(w.shape) for w in model.get_weights()])
-
-        # test the keras model
-        self._test_keras_model(model, mode="ones")
-
-    def test_tiny_inner_product_random(self):
-        np.random.seed(1988)
-        input_dim = 2
-        num_channels = 2
-
-        # Define a model
-        model = Sequential()
-        model.add(Dense(num_channels, input_dim=input_dim))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model)
-
-    def test_inner_product_random(self):
-        np.random.seed(1988)
-        input_dim = 100
-        input_shape = (input_dim,)
-        num_channels = 100
-
-        # Define a model
-        model = Sequential()
-        model.add(Dense(num_channels, input_dim=input_dim))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model)
-
-    def test_tiny_conv_ones(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_shape = (input_dim, input_dim, 1)
-        num_kernels = 3
-        kernel_height = 5
-        kernel_width = 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Convolution2D(
-                input_shape=input_shape,
-                nb_filter=num_kernels,
-                nb_row=kernel_height,
-                nb_col=kernel_width,
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.ones(w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model)
-
-    def test_tiny_conv_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_shape = (input_dim, input_dim, 1)
-        num_kernels = 3
-        kernel_height = 5
-        kernel_width = 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            AtrousConvolution2D(
-                num_kernels, kernel_height, kernel_width, input_shape=input_shape
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model)
-
-    def test_tiny_atrous_conv_random(self):
-        np.random.seed(1988)
-        input_dim = 8
-        input_shape = (input_dim, input_dim, 1)
-        num_kernels = 2
-        kernel_height = 3
-        kernel_width = 3
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            AtrousConvolution2D(
-                nb_filter=num_kernels,
-                nb_row=kernel_height,
-                nb_col=kernel_width,
-                input_shape=input_shape,
-                atrous_rate=(2, 2),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model)
-
-    def test_tiny_atrous_conv_rect_random(self):
-        np.random.seed(1988)
-        input_shape = (32, 20, 1)
-        num_kernels = 2
-        kernel_height = 3
-        kernel_width = 3
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            AtrousConvolution2D(
-                nb_filter=num_kernels,
-                nb_row=kernel_height,
-                nb_col=kernel_width,
-                input_shape=input_shape,
-                atrous_rate=(3, 3),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model)
-
-    def test_tiny_conv_rect_kernel_x(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_shape = (input_dim, input_dim, 1)
-        num_kernels = 3
-        kernel_height = 1
-        kernel_width = 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Convolution2D(
-                input_shape=input_shape,
-                nb_filter=num_kernels,
-                nb_row=kernel_height,
-                nb_col=kernel_width,
-                border_mode="same",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model)
-
-    def test_tiny_conv_rect_kernel_y(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_shape = (input_dim, input_dim, 1)
-        num_kernels = 3
-        kernel_height = 5
-        kernel_width = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Convolution2D(
-                input_shape=input_shape,
-                nb_filter=num_kernels,
-                nb_row=kernel_height,
-                nb_col=kernel_width,
-                border_mode="valid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model)
-
-    def test_tiny_conv_rect_kernel_xy(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_shape = (input_dim, input_dim, 1)
-        num_kernels = 3
-        kernel_height = 5
-        kernel_width = 3
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Convolution2D(
-                input_shape=input_shape,
-                nb_filter=num_kernels,
-                nb_row=kernel_height,
-                nb_col=kernel_width,
-                border_mode="valid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model)
-
-    def test_tiny_conv_pseudo_1d_x(self):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 5
-        filter_length = 1  # 3
-        nb_filters = 1
-        # Define a model
-        model = Sequential()
-        model.add(
-            Convolution2D(
-                nb_filters,
-                1,
-                filter_length,
-                input_shape=(1, input_length, input_dim),
-                border_mode="valid",
-            )
-        )
-        # Set some random weights
-        model.set_weights([np.ones(w.shape) for w in model.get_weights()])
-        self._test_keras_model(model, mode="linear")
-
-    def test_tiny_conv1d_same_random(self):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 10
-        filter_length = 3
-        nb_filters = 4
-        model = Sequential()
-        model.add(
-            Convolution1D(
-                nb_filters,
-                filter_length,
-                border_mode="same",
-                input_shape=(input_length, input_dim),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model)
-
-    def test_tiny_conv1d_valid_random(self):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 10
-        filter_length = 3
-        nb_filters = 4
-        model = Sequential()
-        model.add(
-            Convolution1D(
-                nb_filters,
-                filter_length,
-                border_mode="valid",
-                input_shape=(input_length, input_dim),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model)
-
-    def test_tiny_atrous_conv1d_random(self):
-        np.random.seed(1988)
-        input_dim = 8
-        input_shape = (input_dim, 1)
-        num_kernels = 2
-        kernel_length = 3
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            AtrousConvolution1D(
-                nb_filter=num_kernels,
-                filter_length=kernel_length,
-                input_shape=input_shape,
-                atrous_rate=2,
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model)
-
-    def test_tiny_deconv_random(self):
-        np.random.seed(1988)
-        input_dim = 13
-        output_dim = 28
-        input_shape = (input_dim, input_dim, 3)
-        num_kernels = 16
-        kernel_height = 3
-        kernel_width = 3
-        output_shape = (None, output_dim, output_dim, num_kernels)
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Deconvolution2D(
-                num_kernels,
-                kernel_width,
-                kernel_height,
-                input_shape=input_shape,
-                output_shape=output_shape,
-                border_mode="valid",
-                subsample=(2, 2),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model)
-
-    def test_tiny_deconv_random_same_padding(self):
-        np.random.seed(1988)
-        input_dim = 14
-        output_dim = 28
-        input_shape = (input_dim, input_dim, 3)
-        num_kernels = 16
-        kernel_height = 3
-        kernel_width = 3
-        output_shape = (None, output_dim, output_dim, num_kernels)
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Deconvolution2D(
-                num_kernels,
-                kernel_width,
-                kernel_height,
-                input_shape=input_shape,
-                output_shape=output_shape,
-                border_mode="same",
-                subsample=(2, 2),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model)
-
-    def test_tiny_conv_upsample_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_shape = (input_dim, input_dim, 1)
-        num_kernels = 3
-        kernel_height = 5
-        kernel_width = 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Convolution2D(
-                input_shape=input_shape,
-                nb_filter=num_kernels,
-                nb_row=kernel_height,
-                nb_col=kernel_width,
-            )
-        )
-        model.add(UpSampling2D(size=(2, 2)))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model)
-
-    def test_housenet_random(self):
-        np.random.seed(1988)
-        num_hidden = 2
-        num_features = 3
-
-        # Define a model
-        model = Sequential()
-        model.add(Dense(num_hidden, input_dim=num_features))
-        model.add(Activation("relu"))
-        model.add(Dense(1, input_dim=num_features))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model)
-
-    def test_tiny_no_sequence_lstm_zeros(self):
-        np.random.seed(1988)
-        input_dim = 1
-        input_length = 1
-        num_channels = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            LSTM(
-                num_channels,
-                input_dim=input_dim,
-                input_length=input_length,
-                consume_less="cpu",
-                inner_activation="sigmoid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(
-            model, mode="zeros", input_blob="data", output_blob="output"
-        )
-
-    def test_tiny_no_sequence_lstm_zeros_gpu(self):
-        np.random.seed(1988)
-        input_dim = 1
-        input_length = 1
-        num_channels = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            LSTM(
-                num_channels,
-                input_dim=input_dim,
-                input_length=input_length,
-                consume_less="gpu",
-                inner_activation="sigmoid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(
-            model, mode="zeros", input_blob="data", output_blob="output"
-        )
-
-    def test_tiny_no_sequence_lstm_ones(self):
-        np.random.seed(1988)
-        input_dim = 1
-        input_length = 1
-        num_channels = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            LSTM(
-                num_channels,
-                input_dim=input_dim,
-                input_length=input_length,
-                consume_less="cpu",
-                inner_activation="sigmoid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(
-            model, mode="ones", input_blob="data", output_blob="output"
-        )
-
-    def test_small_no_sequence_lstm_zeros(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_length = 1
-        num_channels = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            LSTM(
-                num_channels,
-                input_dim=input_dim,
-                input_length=input_length,
-                consume_less="gpu",
-                inner_activation="sigmoid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(
-            model, mode="zeros", input_blob="data", output_blob="output"
-        )
-
-    def test_small_no_sequence_lstm_ones(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_length = 1
-        num_channels = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            LSTM(
-                num_channels,
-                input_dim=input_dim,
-                input_length=input_length,
-                consume_less="gpu",
-                inner_activation="sigmoid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(
-            model, mode="ones", input_blob="data", output_blob="output"
-        )
-
-    def test_tiny_no_sequence_simple_rnn_random(self):
-        np.random.seed(1988)
-        input_dim = 1
-        input_length = 1
-        num_channels = 1
-        num_samples = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            SimpleRNN(num_channels, input_dim=input_dim, input_length=input_length)
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, input_blob="data", output_blob="output")
-
-    def test_tiny_no_sequence_gru_random(self):
-        np.random.seed(1988)
-        input_dim = 1
-        input_length = 1
-        num_channels = 1
-        num_samples = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            GRU(
-                num_channels,
-                input_dim=input_dim,
-                input_length=input_length,
-                inner_activation="sigmoid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, input_blob="data", output_blob="output")
-
-    def test_tiny_no_sequence_bidir_random(self):
-        np.random.seed(1988)
-        input_dim = 1
-        input_length = 1
-        num_channels = 1
-        num_samples = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Bidirectional(
-                LSTM(
-                    num_channels,
-                    input_dim=input_dim,
-                    input_length=input_length,
-                    consume_less="cpu",
-                    inner_activation="sigmoid",
-                ),
-                input_shape=(input_length, input_dim),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, input_blob="data", output_blob="output")
-
-    def test_tiny_no_sequence_bidir_random_gpu(self):
-        np.random.seed(1988)
-        input_dim = 1
-        input_length = 1
-        num_channels = 1
-        num_samples = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Bidirectional(
-                LSTM(
-                    num_channels,
-                    input_dim=input_dim,
-                    input_length=input_length,
-                    consume_less="gpu",
-                    inner_activation="sigmoid",
-                ),
-                input_shape=(input_length, input_dim),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, input_blob="data", output_blob="output")
-
-    def test_small_no_sequence_lstm_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_length = 1
-        num_channels = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            LSTM(
-                num_channels,
-                input_dim=input_dim,
-                input_length=input_length,
-                consume_less="gpu",
-                inner_activation="sigmoid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, input_blob="data", output_blob="output")
-
-    def test_small_no_sequence_gru_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_length = 1
-        num_channels = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            GRU(
-                num_channels,
-                input_dim=input_dim,
-                input_length=input_length,
-                inner_activation="sigmoid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, input_blob="data", output_blob="output")
-
-    def test_small_no_sequence_bidir_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_length = 1
-        num_channels = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Bidirectional(
-                LSTM(
-                    num_channels,
-                    input_dim=input_dim,
-                    input_length=input_length,
-                    consume_less="gpu",
-                    inner_activation="sigmoid",
-                ),
-                input_shape=(input_length, input_dim),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, input_blob="data", output_blob="output")
-
-    def test_small_no_sequence_simple_rnn_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_length = 1
-        num_channels = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            SimpleRNN(
-                num_channels,
-                input_dim=input_dim,
-                input_length=input_length,
-                consume_less="gpu",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, input_blob="data", output_blob="output")
-
-    def test_medium_no_sequence_lstm_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_length = 1
-        num_channels = 10
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            LSTM(
-                num_channels,
-                input_dim=input_dim,
-                input_length=input_length,
-                inner_activation="sigmoid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, input_blob="data", output_blob="output")
-
-    def test_medium_no_sequence_bidir_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_length = 1
-        num_channels = 10
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Bidirectional(
-                LSTM(
-                    num_channels,
-                    input_dim=input_dim,
-                    input_length=input_length,
-                    consume_less="gpu",
-                    inner_activation="sigmoid",
-                ),
-                input_shape=(input_length, input_dim),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, input_blob="data", output_blob="output")
-
-    def test_medium_bidir_random_return_seq_false(self):
-        np.random.seed(1988)
-        input_dim = 7
-        input_length = 5
-        num_channels = 10
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Bidirectional(
-                LSTM(
-                    num_channels,
-                    input_dim=input_dim,
-                    input_length=input_length,
-                    return_sequences=False,
-                    consume_less="gpu",
-                    inner_activation="sigmoid",
-                ),
-                input_shape=(input_length, input_dim),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, input_blob="data", output_blob="output")
-
-    def test_medium_bidir_random_return_seq_true(self):
-        np.random.seed(1988)
-        input_dim = 7
-        input_length = 5
-        num_channels = 10
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Bidirectional(
-                LSTM(
-                    num_channels,
-                    input_dim=input_dim,
-                    input_length=input_length,
-                    return_sequences=True,
-                    consume_less="gpu",
-                    inner_activation="sigmoid",
-                ),
-                input_shape=(input_length, input_dim),
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, input_blob="data", output_blob="output")
-
-    def test_lstm_seq(self):
-        np.random.seed(1988)
-
-        input_dim = 11
-        input_length = 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            LSTM(
-                20,
-                input_dim=input_dim,
-                input_length=input_length,
-                return_sequences=False,
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, input_blob="data", output_blob="output")
-
-    def test_lstm_seq_dense(self):
-        np.random.seed(1988)
-
-        input_dim = 5
-        num_hidden = 12
-        num_classes = 6
-        input_length = 3
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            LSTM(
-                num_hidden,
-                input_dim=input_dim,
-                input_length=input_length,
-                return_sequences=False,
-            )
-        )
-        model.add(Dense(num_classes, activation="softmax"))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, input_blob="data", output_blob="output")
-
-    def test_lstm_seq_backwards(self):
-        np.random.seed(1988)
-
-        input_dim = 11
-        input_length = 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            LSTM(
-                20,
-                input_dim=input_dim,
-                input_length=input_length,
-                return_sequences=False,
-                go_backwards=True,
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, input_blob="data", output_blob="output")
-
-    def test_rnn_seq(self):
-        np.random.seed(1988)
-
-        input_dim = 11
-        input_length = 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            SimpleRNN(
-                20,
-                input_dim=input_dim,
-                input_length=input_length,
-                return_sequences=False,
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, input_blob="data", output_blob="output")
-
-    def test_rnn_seq_backwards(self):
-        np.random.seed(1988)
-
-        input_dim = 11
-        input_length = 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            SimpleRNN(
-                20,
-                input_dim=input_dim,
-                input_length=input_length,
-                return_sequences=False,
-                go_backwards=True,
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, input_blob="data", output_blob="output")
-
-    def test_gru_seq(self):
-        np.random.seed(1988)
-
-        input_dim = 11
-        input_length = 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            GRU(
-                20,
-                input_dim=input_dim,
-                input_length=input_length,
-                return_sequences=False,
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, input_blob="data", output_blob="output")
-
-    def test_gru_seq_backwards(self):
-        np.random.seed(1988)
-
-        input_dim = 11
-        input_length = 5
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            GRU(
-                20,
-                input_dim=input_dim,
-                input_length=input_length,
-                return_sequences=False,
-                go_backwards=True,
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, input_blob="data", output_blob="output")
-
-    def test_medium_no_sequence_simple_rnn_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_length = 1
-        num_channels = 10
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            SimpleRNN(num_channels, input_dim=input_dim, input_length=input_length)
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model, input_blob="data", output_blob="output")
-
-    def test_medium_no_sequence_gru_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_length = 1
-        num_channels = 10
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            GRU(
-                num_channels,
-                input_dim=input_dim,
-                input_length=input_length,
-                inner_activation="sigmoid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-
-    def test_medium_conv_batchnorm_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        input_shape = (input_dim, input_dim, 3)
-        num_kernels = 3
-        kernel_height = 5
-        kernel_width = 5
-        data_mean = 2
-        data_var = 1
-
-        # Define a model
-        from keras.layers.normalization import BatchNormalization
-
-        model = Sequential()
-        model.add(
-            Convolution2D(
-                input_shape=input_shape,
-                nb_filter=num_kernels,
-                nb_row=kernel_height,
-                nb_col=kernel_width,
-            )
-        )
-        model.add(BatchNormalization(epsilon=1e-5))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_tiny_conv_elu_random(self):
-        np.random.seed(1988)
-
-        # Define a model
-        from keras.layers.advanced_activations import ELU
-
-        model = Sequential()
-        model.add(
-            Convolution2D(input_shape=(10, 10, 3), nb_filter=3, nb_row=5, nb_col=5)
-        )
-        model.add(ELU(alpha=0.8))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_tiny_conv_prelu_random(self):
-        np.random.seed(1988)
-
-        # Define a model
-        from keras.layers.advanced_activations import PReLU
-
-        model = Sequential()
-        model.add(
-            Convolution2D(
-                input_shape=(10, 10, 3),
-                nb_filter=3,
-                nb_row=5,
-                nb_col=5,
-                border_mode="same",
-            )
-        )
-        model.add(PReLU(shared_axes=[1, 2]))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_tiny_conv_leaky_relu_random(self):
-        np.random.seed(1988)
-
-        # Define a model
-        from keras.layers.advanced_activations import LeakyReLU
-
-        model = Sequential()
-        model.add(
-            Convolution2D(
-                input_shape=(10, 10, 3),
-                nb_filter=3,
-                nb_row=5,
-                nb_col=5,
-                border_mode="same",
-            )
-        )
-        model.add(LeakyReLU(alpha=0.3))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_tiny_parametric_softplus_random(self):
-        input_shape = (8, 8, 3)  # (10,10,3)
-        # Define a model
-        from keras.layers.advanced_activations import ParametricSoftplus
-
-        model = Sequential()
-        model.add(ParametricSoftplus(input_shape=input_shape))
-
-        alpha_per_channel = np.random.rand(3)
-        beta_per_channel = np.random.rand(3)
-        alphas = (
-            np.repeat(alpha_per_channel, input_shape[0] * input_shape[1])
-            .reshape(input_shape[::-1])
-            .transpose((2, 1, 0))
-        )
-        betas = (
-            np.repeat(beta_per_channel, input_shape[0] * input_shape[1])
-            .reshape(input_shape[::-1])
-            .transpose((2, 1, 0))
-        )
-
-        model.layers[0].set_weights([alphas, betas])
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_tiny_conv_parametric_softplus_random(self):
-        np.random.seed(1988)
-        input_shape = (8, 8, 3)  # (10,10,3)
-        nb_filters = 2
-        output_shape = (8, 8, 2)
-        # Define a model
-        from keras.layers.advanced_activations import ParametricSoftplus
-
-        model = Sequential()
-        model.add(
-            Convolution2D(
-                input_shape=(8, 8, 3),
-                nb_filter=nb_filters,
-                nb_row=3,
-                nb_col=3,
-                border_mode="same",
-            )
-        )
-        model.add(ParametricSoftplus())
-
-        # CoreML only takes 1-param per channel, so weights are set differently
-        # model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        alpha_per_channel = np.random.rand(nb_filters)
-        beta_per_channel = np.random.rand(nb_filters)
-        alphas = (
-            np.repeat(alpha_per_channel, output_shape[0] * output_shape[1])
-            .reshape(output_shape[::-1])
-            .transpose((2, 1, 0))
-        )
-        betas = (
-            np.repeat(beta_per_channel, output_shape[0] * output_shape[1])
-            .reshape(output_shape[::-1])
-            .transpose((2, 1, 0))
-        )
-        model.layers[1].set_weights([alphas, betas])
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_tiny_dense_parametric_softplus_random(self):
-        np.random.seed(1988)
-
-        # Define a model
-        from keras.layers.advanced_activations import ParametricSoftplus
-
-        model = Sequential()
-        model.add(Dense(10, input_shape=(4,)))
-        model.add(ParametricSoftplus())
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_tiny_conv_thresholded_relu_random(self):
-        np.random.seed(1988)
-
-        # Define a model
-        from keras.layers.advanced_activations import ThresholdedReLU
-
-        model = Sequential()
-        model.add(
-            Convolution2D(
-                input_shape=(10, 10, 3),
-                nb_filter=3,
-                nb_row=5,
-                nb_col=5,
-                border_mode="same",
-            )
-        )
-        model.add(ThresholdedReLU(theta=0.8))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_tiny_concat_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        num_channels = 6
-
-        # Define a model
-        input_tensor = Input(shape=(input_dim,))
-        x1 = Dense(num_channels)(input_tensor)
-        x2 = Dense(num_channels)(x1)
-        x3 = Dense(num_channels)(x1)
-        x4 = merge([x2, x3], mode="concat")
-        x5 = Dense(num_channels)(x4)
-
-        model = Model(input=[input_tensor], output=[x5])
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_tiny_concat_seq_random(self):
-        np.random.seed(1988)
-        max_features = 10
-        embedding_dims = 4
-        seq_len = 5
-        num_channels = 6
-
-        # Define a model
-        input_tensor = Input(shape=(seq_len,))
-        x1 = Embedding(max_features, embedding_dims)(input_tensor)
-        x2 = Embedding(max_features, embedding_dims)(input_tensor)
-        x3 = merge([x1, x2], mode="concat", concat_axis=1)
-
-        model = Model(input=[input_tensor], output=[x3])
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_keras_model(model, one_dim_seq_flags=[True])
-
-    def test_tiny_add_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        num_channels = 6
-
-        # Define a model
-        input_tensor = Input(shape=(input_dim,))
-        x1 = Dense(num_channels)(input_tensor)
-        x2 = Dense(num_channels)(x1)
-        x3 = Dense(num_channels)(x1)
-        x4 = merge([x2, x3], mode="sum")
-        x5 = Dense(num_channels)(x4)
-
-        model = Model(input=[input_tensor], output=[x5])
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_tiny_mul_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        num_channels = 6
-
-        # Define a model
-        input_tensor = Input(shape=(input_dim,))
-        x1 = Dense(num_channels)(input_tensor)
-        x2 = Dense(num_channels)(x1)
-        x3 = Dense(num_channels)(x1)
-        x4 = merge([x2, x3], mode="mul")
-        x5 = Dense(num_channels)(x4)
-
-        model = Model(input=[input_tensor], output=[x5])
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_tiny_cos_random(self):
-        np.random.seed(1988)
-        input_dim = 10
-        num_channels = 6
-
-        # Define a model
-        input_tensor = Input(shape=(input_dim,))
-        x1 = Dense(num_channels)(input_tensor)
-        x2 = Dense(num_channels)(x1)
-        x3 = Dense(num_channels)(x1)
-        x4 = merge([x2, x3], mode="cos")
-        x5 = Dense(num_channels)(x4)
-
-        model = Model(input=[input_tensor], output=[x5])
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_zeropad_simple(self):
-        input_shape = (48, 48, 3)
-        model = Sequential()
-        model.add(ZeroPadding2D((1, 1), input_shape=input_shape))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_zeropad_fancy(self):
-        input_shape = (48, 48, 3)
-        model = Sequential()
-        model.add(ZeroPadding2D((2, 5), input_shape=input_shape))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_crop_simple(self):
-        input_shape = (48, 48, 3)
-        model = Sequential()
-        model.add(Cropping2D(cropping=((2, 5), (2, 5)), input_shape=input_shape))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_tiny_permute(self):
-        model = Sequential()
-        model.add(Permute((3, 2, 1), input_shape=(4, 3, 2)))
-
-        # When input blob is 3D array (D1, D2, D3), Keras assumes the axes' meaning is
-        # (D1=H,D2=W,D3=C), while CoreML assumes (D1=C,D2=H,D3=W). However,
-        # it's unclear after permutation, what the axes' meaning is for the output blob.
-        # Since permutation done on (H,W,C) blobs usually is usually followed by
-        # recurrent layers / Dense, we choose that the ouput axis order of CoreML is
-        # the same as Keras after permutation.
-        self._test_keras_model(model, transpose_keras_result=False)
-
-    def test_max_pooling_no_overlap(self):
-        # no_overlap: pool_size = strides
-        model = Sequential()
-        model.add(
-            MaxPooling2D(
-                input_shape=(16, 16, 3),
-                pool_size=(2, 2),
-                strides=None,
-                border_mode="valid",
-            )
-        )
-        self._test_keras_model(model)
-
-    def test_max_pooling_overlap_multiple(self):
-        # input shape is multiple of pool_size, strides != pool_size
-        model = Sequential()
-        model.add(
-            MaxPooling2D(
-                input_shape=(18, 18, 3),
-                pool_size=(3, 3),
-                strides=(2, 2),
-                border_mode="valid",
-            )
-        )
-        self._test_keras_model(model)
-
-    def test_max_pooling_overlap_odd(self):
-        model = Sequential()
-        model.add(
-            MaxPooling2D(
-                input_shape=(16, 16, 3),
-                pool_size=(3, 3),
-                strides=(2, 2),
-                border_mode="valid",
-            )
-        )
-        self._test_keras_model(model)
-
-    def test_max_pooling_overlap_same(self):
-        model = Sequential()
-        model.add(
-            MaxPooling2D(
-                input_shape=(16, 16, 3),
-                pool_size=(3, 3),
-                strides=(2, 2),
-                border_mode="same",
-            )
-        )
-        self._test_keras_model(model)
-
-    def test_global_max_pooling(self):
-        model = Sequential()
-        model.add(GlobalMaxPooling2D(input_shape=(16, 16, 3)))
-        self._test_keras_model(model)
-
-    def test_max_pooling_1d(self):
-        model = Sequential()
-        model.add(MaxPooling1D(input_shape=(16, 3), pool_length=4))
-        self._test_keras_model(model)
-
-    def test_global_max_pooling_1d(self):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 10
-        filter_length = 3
-        nb_filters = 4
-        model = Sequential()
-        model.add(
-            Convolution1D(
-                nb_filters,
-                filter_length,
-                border_mode="same",
-                input_shape=(input_length, input_dim),
-            )
-        )
-        model.add(GlobalMaxPooling1D())
-        self._test_keras_model(model)
-
-    def test_average_pooling_no_overlap(self):
-        # no_overlap: pool_size = strides
-        model = Sequential()
-        model.add(
-            AveragePooling2D(
-                input_shape=(16, 16, 3),
-                pool_size=(2, 2),
-                strides=None,
-                border_mode="valid",
-            )
-        )
-        self._test_keras_model(model, delta=1e-2)
-
-    def test_average_pooling_inception_config_1(self):
-        # no_overlap: pool_size = strides
-        model = Sequential()
-        model.add(
-            AveragePooling2D(
-                input_shape=(16, 16, 3),
-                pool_size=(3, 3),
-                strides=(1, 1),
-                border_mode="same",
-            )
-        )
-        self._test_keras_model(model, delta=1e-2)
-
-    def test_global_average_pooling(self):
-        model = Sequential()
-        model.add(GlobalAveragePooling2D(input_shape=(16, 16, 3)))
-        self._test_keras_model(model)
-
-    def test_average_pooling_1d(self):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 10
-        filter_length = 3
-        nb_filters = 4
-        model = Sequential()
-        model.add(
-            Convolution1D(
-                nb_filters,
-                filter_length,
-                border_mode="same",
-                input_shape=(input_length, input_dim),
-            )
-        )
-        model.add(AveragePooling1D(pool_length=2))
-        self._test_keras_model(model)
-
-    def test_global_average_pooling_1d(self):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 10
-        filter_length = 3
-        nb_filters = 4
-        model = Sequential()
-        model.add(
-            Convolution1D(
-                nb_filters,
-                filter_length,
-                border_mode="same",
-                input_shape=(input_length, input_dim),
-            )
-        )
-        model.add(GlobalAveragePooling1D())
-        self._test_keras_model(model)
-
-    def test_tiny_conv_dense_random(self):
-        np.random.seed(1988)
-        num_samples = 1
-        input_dim = 8
-        input_shape = (input_dim, input_dim, 3)
-        num_kernels = 2
-        kernel_height = 5
-        kernel_width = 5
-        hidden_dim = 4
-
-        # Define a model
-        from keras.layers import Flatten
-
-        model = Sequential()
-        model.add(
-            Convolution2D(
-                input_shape=input_shape,
-                nb_filter=num_kernels,
-                nb_row=kernel_height,
-                nb_col=kernel_width,
-            )
-        )
-        model.add(Flatten())
-        model.add(Dense(hidden_dim))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_tiny_dense_tanh_fused_random(self):
-        np.random.seed(1988)
-        num_samples = 1
-        input_dim = 3
-        hidden_dim = 4
-
-        # Define a model
-        model = Sequential()
-        model.add(Dense(hidden_dim, input_shape=(input_dim,), activation="tanh"))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_tiny_conv_relu_fused_random(self):
-        np.random.seed(1988)
-        num_samples = 1
-        input_dim = 8
-        input_shape = (input_dim, input_dim, 3)
-        num_kernels = 2
-        kernel_height = 5
-        kernel_width = 5
-        hidden_dim = 4
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Convolution2D(
-                input_shape=input_shape,
-                nb_filter=num_kernels,
-                nb_row=kernel_height,
-                nb_col=kernel_width,
-                activation="relu",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_flatten(self):
-        model = Sequential()
-        model.add(Flatten(input_shape=(2, 2, 2)))
-        self._test_keras_model(model, mode="linear")
-
-    def test_reshape_3d(self):
-        model = Sequential()
-        model.add(Reshape((10, 1, 6), input_shape=(5, 4, 3)))
-        self._test_keras_model(model, mode="linear")
-
-    def test_embedding(self):
-        model = Sequential()
-        num_inputs = 10
-        num_outputs = 3
-        model.add(Embedding(num_inputs, num_outputs))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        self._test_keras_model(model)
-
-    def test_embedding_seq(self):
-        model = Sequential()
-        num_inputs = 10
-        num_outputs = 3
-        model.add(Embedding(num_inputs, num_outputs, input_length=7))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        self._test_keras_model(model, one_dim_seq_flags=[True])
-
-    def test_tiny_time_distrbuted(self):
-        # as the first layer in a model
-        model = Sequential()
-        model.add(TimeDistributed(Dense(8), input_shape=(10, 16)))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        self._test_keras_model(model)
-
-    def test_tiny_sequence_lstm(self):
-        np.random.seed(1988)
-        input_dim = 1
-        input_length = 2
-        num_channels = 1
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            LSTM(
-                num_channels,
-                input_dim=input_dim,
-                input_length=input_length,
-                consume_less="cpu",
-                inner_activation="sigmoid",
-            )
-        )
-
-        # Set some random weights
-        model.set_weights(
-            [(np.random.rand(*w.shape) - 0.5) / 5.0 for w in model.get_weights()]
-        )
-
-        # Test the keras model
-        self._test_keras_model(
-            model, input_blob="data", output_blob="output", delta=1e-4
-        )
-
-    def test_tiny_spatial_bn(self):
-        np.random.seed(1988)
-        x_in = Input(shape=(7, 7, 2))
-        x = ZeroPadding2D(padding=(1, 1))(x_in)
-        x = BatchNormalization(axis=2)(x)
-        model = Model(x_in, x)
-
-        self._test_keras_model(
-            model, input_blob="data", output_blob="output", delta=1e-2
-        )
-
-    def test_dense_fused_act_in_td(self):
-        np.random.seed(1988)
-        x_in = Input(shape=(10, 2))
-        x = TimeDistributed(Dense(6, activation="softmax"))(x_in)
-        model = Model(x_in, x)
-
-        self._test_keras_model(
-            model, input_blob="data", output_blob="output", delta=1e-2
-        )
-
-    def test_tiny_conv_upsample_1d_random(self):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 10
-        filter_length = 3
-        nb_filters = 4
-        model = Sequential()
-        model.add(
-            Convolution1D(
-                nb_filters,
-                filter_length=filter_length,
-                border_mode="same",
-                input_shape=(input_length, input_dim),
-            )
-        )
-        model.add(UpSampling1D(length=2))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model)
-
-    def test_tiny_conv_crop_1d_random(self):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 10
-        filter_length = 3
-        nb_filters = 4
-        model = Sequential()
-        model.add(
-            Convolution1D(
-                nb_filters,
-                filter_length=filter_length,
-                border_mode="same",
-                input_shape=(input_length, input_dim),
-            )
-        )
-        model.add(Cropping1D(cropping=(2, 2)))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model)
-
-    def test_tiny_conv_pad_1d_random(self):
-        np.random.seed(1988)
-        input_dim = 2
-        input_length = 10
-        filter_length = 3
-        nb_filters = 4
-        model = Sequential()
-        model.add(
-            Convolution1D(
-                nb_filters,
-                filter_length=filter_length,
-                border_mode="same",
-                input_shape=(input_length, input_dim),
-            )
-        )
-        model.add(ZeroPadding1D(padding=(2, 2)))
-
-        # Set some random weights
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model)
-
-    def test_conv_batch_1d(self):
-        vocabulary_size = 4
-        embedding_dimension = 6
-        input_length = 10
-
-        model = Sequential()
-        model.add(
-            Embedding(
-                vocabulary_size,
-                embedding_dimension,
-                input_length=input_length,
-                trainable=True,
-            )
-        )
-
-        model.add(Convolution1D(5, 2))
-        model.add(BatchNormalization())
-        model.add(Activation("relu"))
-
-        model.add(MaxPooling1D(2))
-
-        self._test_keras_model(model, one_dim_seq_flags=[True])
-
-    # Making sure that giant channel sizes get handled correctly
-    def test_large_channel_gpu(self):
-        input_shape = (20, 20, 3)
-        num_channels = 2049
-        kernel_size = 3
-
-        model = Sequential()
-        model.add(
-            Convolution2D(
-                input_shape=input_shape,
-                nb_filter=num_channels,
-                nb_row=kernel_size,
-                nb_col=kernel_size,
-            )
-        )
-
-        model.set_weights(
-            [(np.random.rand(*w.shape) - 0.5) / 5.0 for w in model.get_weights()]
-        )
-
-        self._test_keras_model(
-            model, input_blob="data", output_blob="output", delta=1e-2
-        )
-
-    @pytest.mark.xfail(raises=Exception)
-    def test_large_batch_gpu(self):
-        batch_size = 2049
-        num_channels = 4
-        kernel_size = 3
-
-        model = Sequential()
-        model.add(
-            TimeDistributed(Dense(num_channels), input_shape=(batch_size, kernel_size))
-        )
-
-        model.set_weights(
-            [(np.random.rand(*w.shape) - 0.5) / 5.0 for w in model.get_weights()]
-        )
-
-        self._test_keras_model(
-            model, input_blob="data", output_blob="output", delta=1e-2
-        )
-
-
-@unittest.skipIf(not _HAS_KERAS_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras1
-class KerasTopologyCorrectnessTest(KerasNumericCorrectnessTest):
-    def test_tiny_sequential_merge(self):
-        np.random.seed(1988)
-
-        # Define a model
-        model1 = Sequential()
-        model1.add(Dense(4, input_dim=3))
-        model1.add(Dense(4))
-        model2 = Sequential()
-        model2.add(Dense(4, input_dim=3))
-        model2.add(Dense(4))
-        model3 = Sequential()
-        model3.add(Merge([model1, model2], mode="concat"))
-
-        # Set some random weights
-        model3.set_weights([np.random.rand(*w.shape) for w in model3.get_weights()])
-
-        # Test the keras model
-        self._test_keras_model(model3)
-
-    def test_dangling_merge_left(self):
-        x1 = Input(shape=(4,), name="input1")
-        x2 = Input(shape=(5,), name="input2")
-        y1 = Dense(6, name="dense")(x2)
-        z = merge([x1, y1], mode="concat")
-        model = Model([x1, x2], [z])
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        self._test_keras_model(model)
-
-    def test_dangling_merge_right(self):
-        x1 = Input(shape=(4,), name="input1")
-        x2 = Input(shape=(5,), name="input2")
-        y1 = Dense(6, name="dense")(x2)
-        z = merge([y1, x1], mode="concat")
-        model = Model([x1, x2], [z])
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        self._test_keras_model(model)
-
-    def test_shared_vision(self):
-        digit_input = Input(shape=(27, 27, 1))
-        x = Convolution2D(64, 3, 3)(digit_input)
-        x = Convolution2D(64, 3, 3)(x)
-        out = Flatten()(x)
-
-        vision_model = Model(digit_input, out)
-
-        # then define the tell-digits-apart model
-        digit_a = Input(shape=(27, 27, 1))
-        digit_b = Input(shape=(27, 27, 1))
-
-        # the vision model will be shared, weights and all
-        out_a = vision_model(digit_a)
-        out_b = vision_model(digit_b)
-
-        concatenated = merge([out_a, out_b], mode="concat")
-        out = Dense(1, activation="sigmoid")(concatenated)
-        model = Model([digit_a, digit_b], out)
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        self._test_keras_model(model)
-
-    def test_tiny_weight_sharing(self):
-        #     - Dense1 -----------
-        # x - |                   |- Merge
-        #     - Dense1 - Dense2 --
-
-        x = Input(shape=(3,))
-        dense = Dense(4)
-        y1 = dense(x)
-        y2 = dense(x)
-        y3 = Dense(4)(y2)
-        z = merge([y1, y3], mode="concat")
-        model = Model(x, z)
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        self._test_keras_model(model, mode="random", delta=1e-2)
-
-
-@unittest.skipIf(not _HAS_KERAS_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras1
-class KerasInceptionCorrectnessTest(KerasNumericCorrectnessTest):
-    def test_inception_conv_stage(self):
-
-        input_shape = (299, 299, 3)
-        img_input = Input(shape=input_shape)
-        channel_axis = 3
-        inputs = img_input
-
-        x = conv2d_bn(img_input, 32, 3, 3, subsample=(2, 2), border_mode="valid")
-        x = conv2d_bn(x, 32, 3, 3, border_mode="valid")
-        x = conv2d_bn(x, 64, 3, 3)
-        x = MaxPooling2D((3, 3), strides=(2, 2))(x)
-
-        x = conv2d_bn(x, 80, 1, 1, border_mode="valid")
-        x = conv2d_bn(x, 192, 3, 3, border_mode="valid")
-        x = MaxPooling2D((3, 3), strides=(2, 2))(x)
-
-        model = Model(inputs, x, name="inception_v3")
-
-        # Set some random weights
-        # use small weights for numerical correctness
-        model.set_weights(
-            [np.random.rand(*w.shape) * 1e-3 for w in model.get_weights()]
-        )
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_inception_first_branch(self):
-
-        input_shape = (299, 299, 3)
-        img_input = Input(shape=input_shape)
-
-        channel_axis = 3
-        inputs = img_input
-
-        x = conv2d_bn(img_input, 32, 3, 3, subsample=(2, 2), border_mode="valid")
-        x = conv2d_bn(x, 32, 3, 3, border_mode="valid")
-        x = conv2d_bn(x, 64, 3, 3)
-        x = MaxPooling2D((3, 3), strides=(2, 2))(x)
-
-        x = conv2d_bn(x, 80, 1, 1, border_mode="valid")
-        x = conv2d_bn(x, 192, 3, 3, border_mode="valid")
-        x = MaxPooling2D((3, 3), strides=(2, 2))(x)
-
-        # mixed 0, 1, 2: 35 x 35 x 256
-        for i in range(3):
-            branch1x1 = conv2d_bn(x, 64, 1, 1)
-
-            branch5x5 = conv2d_bn(x, 48, 1, 1)
-            branch5x5 = conv2d_bn(branch5x5, 64, 5, 5)
-
-            branch3x3dbl = conv2d_bn(x, 64, 1, 1)
-            branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
-            branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
-
-            branch_pool = AveragePooling2D((3, 3), strides=(1, 1), border_mode="same")(
-                x
-            )
-            branch_pool = conv2d_bn(branch_pool, 32, 1, 1)
-            x = merge(
-                [branch1x1, branch5x5, branch3x3dbl, branch_pool],
-                mode="concat",
-                concat_axis=channel_axis,
-                name="mixed" + str(i),
-            )
-
-        model = Model(inputs, x, name="inception_v3")
-
-        # Set some random weights
-        # use small weights for numerical correctness
-        model.set_weights(
-            [np.random.rand(*w.shape) * 1e-3 for w in model.get_weights()]
-        )
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_inception_second_branch(self):
-
-        input_shape = (299, 299, 3)
-        img_input = Input(shape=input_shape)
-
-        channel_axis = 3
-        inputs = img_input
-
-        x = conv2d_bn(img_input, 32, 3, 3, subsample=(2, 2), border_mode="valid")
-        x = conv2d_bn(x, 32, 3, 3, border_mode="valid")
-        x = conv2d_bn(x, 64, 3, 3)
-        x = MaxPooling2D((3, 3), strides=(2, 2))(x)
-
-        x = conv2d_bn(x, 80, 1, 1, border_mode="valid")
-        x = conv2d_bn(x, 192, 3, 3, border_mode="valid")
-        x = MaxPooling2D((3, 3), strides=(2, 2))(x)
-
-        # mixed 0, 1, 2: 35 x 35 x 256
-        for i in range(3):
-            branch1x1 = conv2d_bn(x, 64, 1, 1)
-
-            branch5x5 = conv2d_bn(x, 48, 1, 1)
-            branch5x5 = conv2d_bn(branch5x5, 64, 5, 5)
-
-            branch3x3dbl = conv2d_bn(x, 64, 1, 1)
-            branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
-            branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
-
-            branch_pool = AveragePooling2D((3, 3), strides=(1, 1), border_mode="same")(
-                x
-            )
-            branch_pool = conv2d_bn(branch_pool, 32, 1, 1)
-            x = merge(
-                [branch1x1, branch5x5, branch3x3dbl, branch_pool],
-                mode="concat",
-                concat_axis=channel_axis,
-                name="mixed" + str(i),
-            )
-
-        # mixed 3: 17 x 17 x 768
-        branch3x3 = conv2d_bn(x, 384, 3, 3, subsample=(2, 2), border_mode="valid")
-
-        branch3x3dbl = conv2d_bn(x, 64, 1, 1)
-        branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
-        branch3x3dbl = conv2d_bn(
-            branch3x3dbl, 96, 3, 3, subsample=(2, 2), border_mode="valid"
-        )
-
-        branch_pool = MaxPooling2D((3, 3), strides=(2, 2))(x)
-        x = merge(
-            [branch3x3, branch3x3dbl, branch_pool],
-            mode="concat",
-            concat_axis=channel_axis,
-            name="mixed3",
-        )
-
-        # mixed 4: 17 x 17 x 768
-        branch1x1 = conv2d_bn(x, 192, 1, 1)
-
-        branch7x7 = conv2d_bn(x, 128, 1, 1)
-        branch7x7 = conv2d_bn(branch7x7, 128, 1, 7)
-        branch7x7 = conv2d_bn(branch7x7, 192, 7, 1)
-
-        branch7x7dbl = conv2d_bn(x, 128, 1, 1)
-        branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1)
-        branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 1, 7)
-        branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1)
-        branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7)
-
-        branch_pool = AveragePooling2D((3, 3), strides=(1, 1), border_mode="same")(x)
-        branch_pool = conv2d_bn(branch_pool, 192, 1, 1)
-        x = merge(
-            [branch1x1, branch7x7, branch7x7dbl, branch_pool],
-            mode="concat",
-            concat_axis=channel_axis,
-            name="mixed4",
-        )
-
-        # mixed 5, 6: 17 x 17 x 768
-        for i in range(2):
-            branch1x1 = conv2d_bn(x, 192, 1, 1)
-
-            branch7x7 = conv2d_bn(x, 160, 1, 1)
-            branch7x7 = conv2d_bn(branch7x7, 160, 1, 7)
-            branch7x7 = conv2d_bn(branch7x7, 192, 7, 1)
-
-            branch7x7dbl = conv2d_bn(x, 160, 1, 1)
-            branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1)
-            branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 1, 7)
-            branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1)
-            branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7)
-
-            branch_pool = AveragePooling2D((3, 3), strides=(1, 1), border_mode="same")(
-                x
-            )
-            branch_pool = conv2d_bn(branch_pool, 192, 1, 1)
-            x = merge(
-                [branch1x1, branch7x7, branch7x7dbl, branch_pool],
-                mode="concat",
-                concat_axis=channel_axis,
-                name="mixed" + str(5 + i),
-            )
-
-        # mixed 7: 17 x 17 x 768
-        branch1x1 = conv2d_bn(x, 192, 1, 1)
-
-        branch7x7 = conv2d_bn(x, 192, 1, 1)
-        branch7x7 = conv2d_bn(branch7x7, 192, 1, 7)
-        branch7x7 = conv2d_bn(branch7x7, 192, 7, 1)
-
-        branch7x7dbl = conv2d_bn(x, 160, 1, 1)
-        branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1)
-        branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7)
-        branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1)
-        branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7)
-
-        branch_pool = AveragePooling2D((3, 3), strides=(1, 1), border_mode="same")(x)
-        branch_pool = conv2d_bn(branch_pool, 192, 1, 1)
-        x = merge(
-            [branch1x1, branch7x7, branch7x7dbl, branch_pool],
-            mode="concat",
-            concat_axis=channel_axis,
-            name="mixed7",
-        )
-
-        model = Model(inputs, x, name="inception_v3")
-
-        # Set some random weights
-        # use small weights for numerical correctness
-        model.set_weights(
-            [np.random.rand(*w.shape) * 1e-3 for w in model.get_weights()]
-        )
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_inception_no_top(self):
-
-        input_shape = (299, 299, 3)
-        img_input = Input(shape=input_shape)
-        channel_axis = 3
-        inputs = img_input
-
-        x = conv2d_bn(img_input, 32, 3, 3, subsample=(2, 2), border_mode="valid")
-        x = conv2d_bn(x, 32, 3, 3, border_mode="valid")
-        x = conv2d_bn(x, 64, 3, 3)
-        x = MaxPooling2D((3, 3), strides=(2, 2))(x)
-
-        x = conv2d_bn(x, 80, 1, 1, border_mode="valid")
-        x = conv2d_bn(x, 192, 3, 3, border_mode="valid")
-        x = MaxPooling2D((3, 3), strides=(2, 2))(x)
-
-        # mixed 0, 1, 2: 35 x 35 x 256
-        for i in range(3):
-            branch1x1 = conv2d_bn(x, 64, 1, 1)
-
-            branch5x5 = conv2d_bn(x, 48, 1, 1)
-            branch5x5 = conv2d_bn(branch5x5, 64, 5, 5)
-
-            branch3x3dbl = conv2d_bn(x, 64, 1, 1)
-            branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
-            branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
-
-            branch_pool = AveragePooling2D((3, 3), strides=(1, 1), border_mode="same")(
-                x
-            )
-            branch_pool = conv2d_bn(branch_pool, 32, 1, 1)
-            x = merge(
-                [branch1x1, branch5x5, branch3x3dbl, branch_pool],
-                mode="concat",
-                concat_axis=channel_axis,
-                name="mixed" + str(i),
-            )
-
-        # mixed 3: 17 x 17 x 768
-        branch3x3 = conv2d_bn(x, 384, 3, 3, subsample=(2, 2), border_mode="valid")
-
-        branch3x3dbl = conv2d_bn(x, 64, 1, 1)
-        branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
-        branch3x3dbl = conv2d_bn(
-            branch3x3dbl, 96, 3, 3, subsample=(2, 2), border_mode="valid"
-        )
-
-        branch_pool = MaxPooling2D((3, 3), strides=(2, 2))(x)
-        x = merge(
-            [branch3x3, branch3x3dbl, branch_pool],
-            mode="concat",
-            concat_axis=channel_axis,
-            name="mixed3",
-        )
-
-        # mixed 4: 17 x 17 x 768
-        branch1x1 = conv2d_bn(x, 192, 1, 1)
-
-        branch7x7 = conv2d_bn(x, 128, 1, 1)
-        branch7x7 = conv2d_bn(branch7x7, 128, 1, 7)
-        branch7x7 = conv2d_bn(branch7x7, 192, 7, 1)
-
-        branch7x7dbl = conv2d_bn(x, 128, 1, 1)
-        branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1)
-        branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 1, 7)
-        branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1)
-        branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7)
-
-        branch_pool = AveragePooling2D((3, 3), strides=(1, 1), border_mode="same")(x)
-        branch_pool = conv2d_bn(branch_pool, 192, 1, 1)
-        x = merge(
-            [branch1x1, branch7x7, branch7x7dbl, branch_pool],
-            mode="concat",
-            concat_axis=channel_axis,
-            name="mixed4",
-        )
-
-        # mixed 5, 6: 17 x 17 x 768
-        for i in range(2):
-            branch1x1 = conv2d_bn(x, 192, 1, 1)
-
-            branch7x7 = conv2d_bn(x, 160, 1, 1)
-            branch7x7 = conv2d_bn(branch7x7, 160, 1, 7)
-            branch7x7 = conv2d_bn(branch7x7, 192, 7, 1)
-
-            branch7x7dbl = conv2d_bn(x, 160, 1, 1)
-            branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1)
-            branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 1, 7)
-            branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1)
-            branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7)
-
-            branch_pool = AveragePooling2D((3, 3), strides=(1, 1), border_mode="same")(
-                x
-            )
-            branch_pool = conv2d_bn(branch_pool, 192, 1, 1)
-            x = merge(
-                [branch1x1, branch7x7, branch7x7dbl, branch_pool],
-                mode="concat",
-                concat_axis=channel_axis,
-                name="mixed" + str(5 + i),
-            )
-
-        # mixed 7: 17 x 17 x 768
-        branch1x1 = conv2d_bn(x, 192, 1, 1)
-
-        branch7x7 = conv2d_bn(x, 192, 1, 1)
-        branch7x7 = conv2d_bn(branch7x7, 192, 1, 7)
-        branch7x7 = conv2d_bn(branch7x7, 192, 7, 1)
-
-        branch7x7dbl = conv2d_bn(x, 160, 1, 1)
-        branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1)
-        branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7)
-        branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1)
-        branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7)
-
-        branch_pool = AveragePooling2D((3, 3), strides=(1, 1), border_mode="same")(x)
-        branch_pool = conv2d_bn(branch_pool, 192, 1, 1)
-        x = merge(
-            [branch1x1, branch7x7, branch7x7dbl, branch_pool],
-            mode="concat",
-            concat_axis=channel_axis,
-            name="mixed7",
-        )
-
-        # mixed 8: 8 x 8 x 1280
-        branch3x3 = conv2d_bn(x, 192, 1, 1)
-        branch3x3 = conv2d_bn(
-            branch3x3, 320, 3, 3, subsample=(2, 2), border_mode="valid"
-        )
-
-        branch7x7x3 = conv2d_bn(x, 192, 1, 1)
-        branch7x7x3 = conv2d_bn(branch7x7x3, 192, 1, 7)
-        branch7x7x3 = conv2d_bn(branch7x7x3, 192, 7, 1)
-        branch7x7x3 = conv2d_bn(
-            branch7x7x3, 192, 3, 3, subsample=(2, 2), border_mode="valid"
-        )
-
-        branch_pool = AveragePooling2D((3, 3), strides=(2, 2))(x)
-        x = merge(
-            [branch3x3, branch7x7x3, branch_pool],
-            mode="concat",
-            concat_axis=channel_axis,
-            name="mixed8",
-        )
-
-        # mixed 9: 8 x 8 x 2048
-        for i in range(2):
-            branch1x1 = conv2d_bn(x, 320, 1, 1)
-
-            branch3x3 = conv2d_bn(x, 384, 1, 1)
-            branch3x3_1 = conv2d_bn(branch3x3, 384, 1, 3)
-            branch3x3_2 = conv2d_bn(branch3x3, 384, 3, 1)
-            branch3x3 = merge(
-                [branch3x3_1, branch3x3_2],
-                mode="concat",
-                concat_axis=channel_axis,
-                name="mixed9_" + str(i),
-            )
-
-            branch3x3dbl = conv2d_bn(x, 448, 1, 1)
-            branch3x3dbl = conv2d_bn(branch3x3dbl, 384, 3, 3)
-            branch3x3dbl_1 = conv2d_bn(branch3x3dbl, 384, 1, 3)
-            branch3x3dbl_2 = conv2d_bn(branch3x3dbl, 384, 3, 1)
-            branch3x3dbl = merge(
-                [branch3x3dbl_1, branch3x3dbl_2],
-                mode="concat",
-                concat_axis=channel_axis,
-            )
-
-            branch_pool = AveragePooling2D((3, 3), strides=(1, 1), border_mode="same")(
-                x
-            )
-            branch_pool = conv2d_bn(branch_pool, 192, 1, 1)
-            x = merge(
-                [branch1x1, branch3x3, branch3x3dbl, branch_pool],
-                mode="concat",
-                concat_axis=channel_axis,
-                name="mixed" + str(9 + i),
-            )
-
-        model = Model(inputs, x, name="inception_v3")
-
-        # Set some random weights
-        # use small weights for numerical correctness
-        model.set_weights(
-            [np.random.rand(*w.shape) * 1e-3 for w in model.get_weights()]
-        )
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-
-@unittest.skipIf(not _HAS_KERAS_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras1
-@pytest.mark.slow
-class KerasNumericCorrectnessStressTest(KerasNumericCorrectnessTest):
-    """
-    Unit test class for testing all combinations of a particular
-    layer.
-    """
-
-    def _run_test(
-        self,
-        model,
-        param,
-        model_dir=None,
-        delta=1e-2,
-        transpose_keras_result=True,
-        one_dim_seq_flags=None,
-    ):
-        """ Run a test on a particular model
-        """
-        use_tmp_folder = False
-        if model_dir is None:
-            use_tmp_folder = True
-            model_dir = tempfile.mkdtemp()
-        model_path = os.path.join(model_dir, "keras.mlmodel")
-
-        # Generate some random data
-        nb_inputs = len(model.inputs)
-        if nb_inputs > 1:
-            input_names = []
-            input_data = []
-            coreml_input = {}
-            for i in range(nb_inputs):
-                input_shape = [1 if a is None else a for a in model.input_shape[i]]
-                X = _generate_data(input_shape)
-                feature_name = "data_%s" % i
-                input_names.append(feature_name)
-                input_data.append(X)
-                if one_dim_seq_flags is None:
-                    coreml_input[feature_name] = _keras_transpose(X).astype("f")
-                else:
-                    coreml_input[feature_name] = _keras_transpose(
-                        X, one_dim_seq_flags[i]
-                    ).astype("f")
-        else:
-            input_shape = [1 if a is None else a for a in model.input_shape]
-            input_names = ["data"]
-            input_data = _generate_data(input_shape)
-            if one_dim_seq_flags is None:
-                coreml_input = {"data": _keras_transpose(input_data).astype("f")}
-            else:
-                coreml_input = {
-                    "data": _keras_transpose(input_data, one_dim_seq_flags[0]).astype(
-                        "f"
-                    )
-                }
-
-        # Make predictions
-        if transpose_keras_result:
-            keras_preds = _keras_transpose(model.predict(input_data)).flatten()
-        else:
-            keras_preds = model.predict(input_data).flatten()
-
-        # Get the model
-        coreml_model = _get_coreml_model(model, model_path, input_names, ["output"])
-        if _is_macos() and _macos_version() >= (10, 13):
-            # get prediction
-            coreml_preds = coreml_model.predict(coreml_input)["output"].flatten()
-
-            if use_tmp_folder:
-                shutil.rmtree(model_dir)
-            self.assertEqual(
-                len(coreml_preds),
-                len(keras_preds),
-                msg="Failed test case %s. Lengths wrong (%s vs %s)"
-                % (param, len(coreml_preds), len(keras_preds)),
-            )
-            for i in range(len(keras_preds)):
-                max_den = max(1.0, keras_preds[i], coreml_preds[i])
-                self.assertAlmostEqual(
-                    keras_preds[i] / max_den,
-                    coreml_preds[i] / max_den,
-                    delta=delta,
-                    msg="Failed test case %s. Predictions wrong (%s vs %s)"
-                    % (param, coreml_preds[i], keras_preds[i]),
-                )
-
-    @pytest.mark.slow
-    def test_activation_layer_params(self):
-        options = dict(
-            activation=["tanh", "relu", "sigmoid", "softmax", "softplus", "softsign"]
-        )
-
-        # Define a function that tests a model
-        num_channels = 10
-        input_dim = 10
-
-        def build_model(x):
-            model = Sequential()
-            model.add(Dense(num_channels, input_dim=input_dim))
-            model.add(Activation(**dict(zip(options.keys(), x))))
-            return x, model
-
-        # Iterate through all combinations
-        product = itertools.product(*options.values())
-        args = [build_model(p) for p in product]
-
-        # Test the cases
-        print("Testing a total of %s cases. This could take a while" % len(args))
-        for param, model in args:
-            model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-            self._run_test(model, param)
-
-    @pytest.mark.slow
-    def test_dense_layer_params(self):
-        options = dict(
-            activation=["relu", "softmax", "tanh", "sigmoid"], bias=[True, False],
-        )
-
-        # Define a function that tests a model
-        input_dim = 10
-        num_channels = 10
-
-        def build_model(x):
-            kwargs = dict(zip(options.keys(), x))
-            model = Sequential()
-            model.add(Dense(num_channels, input_dim=input_dim, **kwargs))
-            return x, model
-
-        # Iterate through all combinations
-        product = itertools.product(*options.values())
-        args = [build_model(p) for p in product]
-
-        # Test the cases
-        print("Testing a total of %s cases. This could take a while" % len(args))
-        for param, model in args:
-            self._run_test(model, param)
-
-    @pytest.mark.slow
-    def test_upsample_layer_params(self):
-        options = dict(size=[(2, 2), (3, 3), (4, 4), (5, 5)])
-
-        np.random.seed(1988)
-        input_dim = 10
-        input_shape = (input_dim, input_dim, 1)
-        X = np.random.rand(1, *input_shape)
-
-        # Define a function that tests a model
-        def build_model(x):
-            kwargs = dict(zip(options.keys(), x))
-            model = Sequential()
-            model.add(
-                Convolution2D(input_shape=input_shape, nb_row=7, nb_col=7, nb_filter=5)
-            )
-            model.add(UpSampling2D(**kwargs))
-            return x, model
-
-        # Iterate through all combinations
-        product = itertools.product(*options.values())
-        args = [build_model(p) for p in product]
-
-        # Test the cases
-        print("Testing a total of %s cases. This could take a while" % len(args))
-        for param, model in args:
-            self._run_test(model, param)
-
-    @pytest.mark.slow
-    def test_conv_layer_params(self):
-        options = dict(
-            activation=[
-                "relu",
-                "tanh",
-                "sigmoid",
-            ],  # keas does not support softmax on 4-D
-            bias=[True, False],
-            border_mode=["same", "valid"],
-            nb_filter=[1, 3, 5],
-            nb_row=[5],  # fails when sizes are different
-            nb_col=[5],
-        )
-
-        # Define a function that tests a model
-        input_shape = (10, 10, 1)
-
-        def build_model(x):
-            kwargs = dict(zip(options.keys(), x))
-            model = Sequential()
-            model.add(Convolution2D(input_shape=input_shape, **kwargs))
-            return x, model
-
-        # Iterate through all combinations
-        product = itertools.product(*options.values())
-        args = [build_model(p) for p in product]
-
-        # Test the cases
-        print("Testing a total of %s cases. This could take a while" % len(args))
-        for param, model in args:
-            self._run_test(model, param)
-
-    @pytest.mark.slow
-    def test_dense_elementwise_params(self):
-        options = dict(modes=["sum", "mul", "concat", "ave", "cos", "dot", "max"])
-
-        def build_model(mode):
-            x1 = Input(shape=(3,))
-            x2 = Input(shape=(3,))
-            y1 = Dense(4)(x1)
-            y2 = Dense(4)(x2)
-            z = merge([y1, y2], mode=mode)
-            model = Model([x1, x2], z)
-            return mode, model
-
-        product = itertools.product(*options.values())
-        args = [build_model(p[0]) for p in product]
-        print("Testing a total of %s cases. This could take a while" % len(args))
-        for param, model in args:
-            self._run_test(model, param)
-
-    def test_vgg_16_tiny(self):
-
-        input_shape = (48, 48, 3)
-        model = Sequential()
-        model.add(ZeroPadding2D((1, 1), input_shape=input_shape))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(MaxPooling2D((2, 2), strides=(2, 2)))
-
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(MaxPooling2D((2, 2), strides=(2, 2)))
-
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(MaxPooling2D((2, 2), strides=(2, 2)))
-
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(MaxPooling2D((2, 2), strides=(2, 2)))
-
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(MaxPooling2D((2, 2), strides=(2, 2)))
-
-        model.add(Flatten())
-        model.add(Dense(32, activation="relu"))
-        # model.add(Dropout(0.5))
-        model.add(Dense(32, activation="relu"))
-        # model.add(Dropout(0.5))
-        model.add(Dense(1000))  # activation='softmax'))
-
-        # Set some random weights
-        model.set_weights(
-            [(np.random.rand(*w.shape) - 0.5) / 5.0 for w in model.get_weights()]
-        )
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_vgg_16_tiny_no_pooling(self):
-
-        input_shape = (48, 48, 3)
-        model = Sequential()
-        model.add(ZeroPadding2D((1, 1), input_shape=input_shape))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(MaxPooling2D((2, 2), strides=(2, 2)))
-
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(MaxPooling2D((2, 2), strides=(2, 2)))
-
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(MaxPooling2D((2, 2), strides=(2, 2)))
-
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(MaxPooling2D((2, 2), strides=(2, 2)))
-
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(ZeroPadding2D((1, 1)))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(MaxPooling2D((2, 2), strides=(2, 2)))
-
-        model.add(Flatten())
-        model.add(Dense(32, activation="relu"))
-        # model.add(Dropout(0.5))
-        model.add(Dense(32, activation="relu"))
-        # model.add(Dropout(0.5))
-        model.add(Dense(1000))  # activation='softmax'))
-
-        # Set some random weights
-        model.set_weights(
-            [(np.random.rand(*w.shape) - 0.5) / 5.0 for w in model.get_weights()]
-        )
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_vgg_16_tiny_no_pooling_no_padding(self):
-
-        input_shape = (48, 48, 3)
-        model = Sequential()
-        model.add(Convolution2D(32, 3, 3, activation="relu", input_shape=input_shape))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-        model.add(Convolution2D(32, 3, 3, activation="relu"))
-
-        model.add(Flatten())
-        model.add(Dense(32, activation="relu"))
-        model.add(Dropout(0.5))
-        model.add(Dense(32, activation="relu"))
-        model.add(Dropout(0.5))
-        model.add(Dense(1000, activation="softmax"))
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_vgg_16_tiny_only_conv_dense(self):
-
-        input_shape = (48, 48, 3)
-        model = Sequential()
-        model.add(Convolution2D(32, 3, 3, activation="relu", input_shape=input_shape))
-        model.add(Flatten())
-        model.add(Dense(10, activation="softmax"))
-
-        # Get the coreml model
-        self._test_keras_model(model)
-
-    def test_imdb_fasttext_first_2(self):
-
-        max_features = 10
-        max_len = 6
-        embedding_dims = 4
-        pool_length = 2
-
-        model = Sequential()
-        model.add(Embedding(max_features, embedding_dims, input_length=max_len))
-        # we add a AveragePooling1D, which will average the embeddings
-        # of all words in the document
-        model.add(AveragePooling1D(pool_length=pool_length))
-
-        self._test_keras_model(model, one_dim_seq_flags=[True])
-
-    def test_tiny_mcrnn_td(self):
-
-        model = Sequential()
-        model.add(Convolution2D(3, 1, 1, input_shape=(2, 4, 4), border_mode="same"))
-        model.add(AveragePooling2D(pool_size=(2, 2)))
-        model.add(Reshape((2, 3)))
-        model.add(TimeDistributed(Dense(5)))
-
-        self._test_keras_model(model)
-
-    def test_tiny_mcrnn_recurrent(self):
-
-        model = Sequential()
-        model.add(Convolution2D(3, 1, 1, input_shape=(2, 4, 4), border_mode="same"))
-        model.add(AveragePooling2D(pool_size=(2, 2)))
-        model.add(Reshape((2, 3)))
-        model.add(LSTM(5, inner_activation="sigmoid"))
-
-        self._test_keras_model(model)
-
-    def test_tiny_mcrnn_music_tagger(self):
-
-        x_in = Input(shape=(4, 6, 1))
-        x = ZeroPadding2D(padding=(0, 1))(x_in)
-        x = BatchNormalization(axis=2, name="bn_0_freq")(x)
-        # Conv block 1
-        x = Convolution2D(2, 3, 3, border_mode="same", name="conv1")(x)
-        x = BatchNormalization(axis=3, mode=0, name="bn1")(x)
-        x = ELU()(x)
-        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name="pool1")(x)
-        # Conv block 2
-        x = Convolution2D(4, 3, 3, border_mode="same", name="conv2")(x)
-        x = BatchNormalization(axis=3, mode=0, name="bn2")(x)
-        x = ELU()(x)
-        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name="pool2")(x)
-
-        # Should get you (1,1,2,4)
-        x = Reshape((2, 4))(x)
-        x = GRU(32, return_sequences=True, name="gru1")(x)
-        x = GRU(32, return_sequences=False, name="gru2")(x)
-
-        # Create model.
-        model = Model(x_in, x)
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        self._test_keras_model(model, mode="random_zero_mean", delta=1e-2)
-
-    def test_tiny_apple_manual(self):
-        model = Sequential()
-        model.add(LSTM(3, input_shape=(4, 5), inner_activation="sigmoid"))
-        model.add(Dense(5))
-        model.add(Activation("softmax"))
-
-        self._test_keras_model(model)
-
-    def test_tiny_image_captioning_image_branch(self):
-        img_input_1 = Input(shape=(16, 16, 3))
-        x = Convolution2D(2, 3, 3)(img_input_1)
-        x = Flatten()(x)
-        img_model = Model([img_input_1], [x])
-
-        img_input = Input(shape=(16, 16, 3))
-        x = img_model(img_input)
-        x = Dense(8, name="cap_dense")(x)
-        x = Reshape((1, 8), name="cap_reshape")(x)
-        image_branch = Model([img_input], [x])
-        self._test_keras_model(image_branch)
-
-    def test_tiny_image_captioning_feature_merge(self):
-
-        img_input_1 = Input(shape=(16, 16, 3))
-        x = Convolution2D(2, 3, 3)(img_input_1)
-        x = Flatten()(x)
-        img_model = Model([img_input_1], [x])
-
-        img_input = Input(shape=(16, 16, 3))
-        x = img_model(img_input)
-        x = Dense(8, name="cap_dense")(x)
-        x = Reshape((1, 8), name="cap_reshape")(x)
-
-        sentence_input = Input(shape=(5,))  # max_length = 5
-        y = Embedding(8, 8, name="cap_embedding")(sentence_input)
-        z = merge([x, y], mode="concat", concat_axis=1, name="cap_merge")
-
-        combined_model = Model([img_input, sentence_input], [z])
-        self._test_keras_model(combined_model, one_dim_seq_flags=[False, True])
-
-    def test_tiny_image_captioning(self):
-        # use a conv layer as a image feature branch
-        img_input_1 = Input(shape=(16, 16, 3))
-        x = Convolution2D(2, 3, 3)(img_input_1)
-        x = Flatten()(x)
-        img_model = Model([img_input_1], [x])
-
-        img_input = Input(shape=(16, 16, 3))
-        x = img_model(img_input)
-        x = Dense(8, name="cap_dense")(x)
-        x = Reshape((1, 8), name="cap_reshape")(x)
-
-        sentence_input = Input(shape=(5,))  # max_length = 5
-        y = Embedding(8, 8, name="cap_embedding")(sentence_input)
-        z = merge([x, y], mode="concat", concat_axis=1, name="cap_merge")
-        z = LSTM(4, return_sequences=True, name="cap_lstm")(z)
-        z = TimeDistributed(Dense(8), name="cap_timedistributed")(z)
-
-        combined_model = Model([img_input, sentence_input], [z])
-        self._test_keras_model(combined_model, one_dim_seq_flags=[False, True])
-
-    def test_tiny_babi_rnn(self):
-        vocab_size = 10
-        embed_hidden_size = 8
-        story_maxlen = 5
-        query_maxlen = 5
-
-        sentrnn = Sequential()
-        sentrnn.add(Embedding(vocab_size, embed_hidden_size, input_length=story_maxlen))
-        sentrnn.add(Dropout(0.3))
-
-        qrnn = Sequential()
-        qrnn.add(Embedding(vocab_size, embed_hidden_size, input_length=query_maxlen))
-        qrnn.add(Dropout(0.3))
-        qrnn.add(LSTM(embed_hidden_size, return_sequences=False))
-        qrnn.add(RepeatVector(story_maxlen))
-
-        model = Sequential()
-        model.add(Merge([sentrnn, qrnn], mode="sum"))
-        model.add(LSTM(embed_hidden_size, return_sequences=False))
-        model.add(Dropout(0.3))
-        model.add(Dense(vocab_size, activation="softmax"))
-
-        self._test_keras_model(model, one_dim_seq_flags=[True, True])
-
-    def test_clickbait_cnn(self):
-        # from: https://github.com/saurabhmathur96/clickbait-detector
-        vocabulary_size = 500
-        embedding_dimension = 30
-        input_length = 20
-
-        model = Sequential()
-        model.add(
-            Embedding(
-                vocabulary_size,
-                embedding_dimension,
-                input_length=input_length,
-                trainable=True,
-            )
-        )
-
-        model.add(Convolution1D(32, 2))
-        model.add(BatchNormalization())
-        model.add(Activation("relu"))
-
-        model.add(Convolution1D(32, 2))
-        model.add(BatchNormalization())
-        model.add(Activation("relu"))
-
-        model.add(Convolution1D(32, 2))
-        model.add(BatchNormalization())
-        model.add(Activation("relu"))
-
-        model.add(MaxPooling1D(17))
-        model.add(Flatten())
-
-        model.add(Dense(1, bias=True))
-        model.add(BatchNormalization())
-        model.add(Activation("sigmoid"))
-
-        self._test_keras_model(model, one_dim_seq_flags=[True])
diff --git a/coremltools/test/neural_network/test_model.py b/coremltools/test/neural_network/test_model.py
index c6ff6b821..dd8bab8ca 100644
--- a/coremltools/test/neural_network/test_model.py
+++ b/coremltools/test/neural_network/test_model.py
@@ -10,6 +10,7 @@
 import tempfile
 import unittest
 
+from coremltools import ComputeUnit
 from coremltools._deps import _HAS_TORCH
 from coremltools.converters.mil import Builder as mb
 from coremltools.models.utils import (
@@ -330,7 +331,7 @@ def test_multiarray_to_image_input_util(self):
         )
         builder.add_activation("linear", "LINEAR", "data", "out")
         spec = builder.spec
-        mlmodel = MLModel(spec)
+        mlmodel = MLModel(spec, compute_units=ComputeUnit.CPU_ONLY)
         mlmodel = make_image_input(
             mlmodel,
             "data",
@@ -343,7 +344,7 @@ def test_multiarray_to_image_input_util(self):
         x = np.array([4, 2, 5], dtype=np.uint8)
         x = np.reshape(x, (H, W, C))
         pil_img = PIL.Image.fromarray(x)
-        y = mlmodel.predict({"data": pil_img}, useCPUOnly=True)["out"]
+        y = mlmodel.predict({"data": pil_img})["out"]
         self.assertEqual(y.shape, (C, H, W))
         np.testing.assert_almost_equal(y.flatten(), [35.0, 14.0, 47.5])
 
@@ -360,7 +361,7 @@ def test_multiarray_to_image_input_util_transpose_elimination(self):
         builder.add_transpose("transpose", [2, 0, 1], "data", "transpose")
         builder.add_activation("linear", "LINEAR", "transpose", "out")
         spec = builder.spec
-        mlmodel = MLModel(spec)
+        mlmodel = MLModel(spec, compute_units=ComputeUnit.CPU_ONLY)
         mlmodel = make_image_input(
             mlmodel,
             "data",
@@ -373,7 +374,7 @@ def test_multiarray_to_image_input_util_transpose_elimination(self):
         x = np.array([4, 2, 5], dtype=np.uint8)
         x = np.reshape(x, (H, W, C))
         pil_img = PIL.Image.fromarray(x)
-        y = mlmodel.predict({"data": pil_img}, useCPUOnly=True)["out"]
+        y = mlmodel.predict({"data": pil_img})["out"]
         self.assertEqual(y.shape, (H, W, C))
         np.testing.assert_almost_equal(y.flatten(), [35.0, 14.0, 47.5])
 
@@ -389,7 +390,7 @@ def test_multiarray_to_image_input_util_HWC_format(self):
         )
         builder.add_activation("linear", "LINEAR", "data", "out")
         spec = builder.spec
-        mlmodel = MLModel(spec)
+        mlmodel = MLModel(spec, compute_units=ComputeUnit.CPU_ONLY)
         mlmodel = make_image_input(
             mlmodel,
             "data",
@@ -402,7 +403,7 @@ def test_multiarray_to_image_input_util_HWC_format(self):
         x = np.array([4, 2, 5], dtype=np.uint8)
         x = np.reshape(x, (H, W, C))
         pil_img = PIL.Image.fromarray(x)
-        y = mlmodel.predict({"data": pil_img}, useCPUOnly=True)["out"]
+        y = mlmodel.predict({"data": pil_img})["out"]
         self.assertEqual(y.shape, (H, W, C))
         np.testing.assert_almost_equal(y.flatten(), [35.0, 14.0, 47.5])
 
@@ -417,14 +418,14 @@ def test_nn_classifier_util(self):
         )
         builder.add_activation("linear", "LINEAR", "data", "out")
         spec = builder.spec
-        mlmodel = MLModel(spec)
+        mlmodel = MLModel(spec, compute_units=ComputeUnit.CPU_ONLY)
         mlmodel = make_nn_classifier(
             mlmodel,
             class_labels=["a", "b", "c"],
             predicted_feature_name="out_confidence",
             predicted_probabilities_output="out",
         )
-        out_dict = mlmodel.predict({"data": np.array([4.0, 5.5, 6.0])}, useCPUOnly=True)
+        out_dict = mlmodel.predict({"data": np.array([4.0, 5.5, 6.0])})
         self.assertEqual(out_dict["out_confidence"], "c")
         self.assertEqual(
             mlmodel.get_spec().WhichOneof("Type"), "neuralNetworkClassifier"
@@ -441,7 +442,7 @@ def test_nn_classifier_util_file(self):
         )
         builder.add_activation("linear", "LINEAR", "data", "out")
         spec = builder.spec
-        mlmodel = MLModel(spec)
+        mlmodel = MLModel(spec, compute_units=ComputeUnit.CPU_ONLY)
 
         class_labels = ["a", "b", "c"]
         with tempfile.NamedTemporaryFile(mode="w", suffix=".txt") as f:
@@ -453,7 +454,7 @@ def test_nn_classifier_util_file(self):
                 predicted_feature_name="out_confidence",
                 predicted_probabilities_output="out",
             )
-        out_dict = mlmodel.predict({"data": np.array([4.0, 5.5, 6.0])}, useCPUOnly=True)
+        out_dict = mlmodel.predict({"data": np.array([4.0, 5.5, 6.0])})
         self.assertEqual(out_dict["out_confidence"], "c")
         self.assertEqual(
             mlmodel.get_spec().WhichOneof("Type"), "neuralNetworkClassifier"
@@ -478,9 +479,9 @@ def test_rename_output_nn_classifier(self):
         # rename output
         spec = mlmodel.get_spec()
         rename_feature(spec, "out", "new_out_name")
-        mlmodel = MLModel(spec)
+        mlmodel = MLModel(spec, compute_units=ComputeUnit.CPU_ONLY)
 
-        out_dict = mlmodel.predict({"data": np.array([4.0, 5.5, 6.0])}, useCPUOnly=True)
+        out_dict = mlmodel.predict({"data": np.array([4.0, 5.5, 6.0])})
         self.assertEqual(out_dict["classLabel"], "c")
         self.assertTrue("new_out_name" in out_dict)
         self.assertTrue(isinstance(out_dict["new_out_name"], dict))
@@ -501,11 +502,12 @@ def test_rename_image_input(self):
         # rename the input
         spec = mlmodel.get_spec()
         rename_feature(spec, "data", "new_input_name")
-        mlmodel = MLModel(spec)
+        mlmodel = MLModel(spec, compute_units=ComputeUnit.CPU_ONLY)
+
         # test
         x = np.array([4, 5, 6], dtype=np.uint8).reshape(1, 1, 3)
         pil_img = PIL.Image.fromarray(x)
-        out = mlmodel.predict({"new_input_name": pil_img}, useCPUOnly=True)['out']
+        out = mlmodel.predict({"new_input_name": pil_img})['out']
         np.testing.assert_equal(out, np.array([8.0, 10.0, 12.0]).reshape(3, 1, 1))
 
     @unittest.skipUnless(
diff --git a/coremltools/test/neural_network/test_multiple_images_preprocessing.py b/coremltools/test/neural_network/test_multiple_images_preprocessing.py
deleted file mode 100644
index 5c9879500..000000000
--- a/coremltools/test/neural_network/test_multiple_images_preprocessing.py
+++ /dev/null
@@ -1,191 +0,0 @@
-# Copyright (c) 2021, Apple Inc. All rights reserved.
-#
-# Use of this source code is governed by a BSD-3-clause license that can be
-# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-import json
-import os
-import shutil
-import subprocess
-import tarfile
-import tempfile
-import unittest
-from subprocess import Popen
-
-import PIL.Image
-import numpy as np
-import pytest
-
-import coremltools
-from coremltools._deps import _HAS_KERAS2_TF
-from coremltools.models.utils import _macos_version, _is_macos
-
-if _HAS_KERAS2_TF:
-    import keras
-    from keras.models import Sequential, Model
-    from keras.layers import Activation, GlobalMaxPooling2D, Input
-
-FOLDER_NAME = "multiple_images_preprocessing"
-
-
-def extract_tarfile(input_filename, dest_dir):
-    with tarfile.open(input_filename, "r:gz") as tar:
-        tar.extractall(dest_dir)
-
-
-def load_mlmodel(model_path):
-    load_args = [" /usr/local/bin/coremltest", "load", "-modelPath", model_path]
-    print("Loading {}".format(model_path))
-    process = Popen(
-        (" ").join(load_args),
-        stdin=subprocess.PIPE,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-        shell=True,
-    )
-    stdout, err = process.communicate()
-
-    if not err:
-        return True
-    else:
-        print(" The error is {}".format(err.decode()))
-        return False
-
-
-def compare_models(keras_preds, coreml_preds):
-    max_relative_error = 0
-    for i in range(len(coreml_preds)):
-        max_den = max(1.0, np.abs(keras_preds[i]), np.abs(coreml_preds[i]))
-        relative_error = np.abs(keras_preds[i] / max_den - coreml_preds[i] / max_den)
-        if relative_error > max_relative_error:
-            max_relative_error = relative_error
-
-    print("maximum relative error: ", max_relative_error)
-    return max_relative_error
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-class ManyImagesKeras(unittest.TestCase):
-    def test_keras_1_image_bias(self):
-        # define Keras model and get prediction
-        input_shape = (100, 50, 3)
-        model = Sequential()
-        model.add(Activation("linear", input_shape=input_shape))
-
-        data = np.ones(input_shape)
-        keras_input = np.ones(input_shape)
-        data[:, :, 0] = 128.0
-        data[:, :, 1] = 27.0
-        data[:, :, 2] = 200.0
-        red_bias = -12.0
-        green_bias = -20
-        blue_bias = -4
-        keras_input[:, :, 0] = data[:, :, 0] + red_bias
-        keras_input[:, :, 1] = data[:, :, 1] + green_bias
-        keras_input[:, :, 2] = data[:, :, 2] + blue_bias
-
-        keras_preds = model.predict(np.expand_dims(keras_input, axis=0))
-        keras_preds = np.transpose(keras_preds, [0, 3, 1, 2]).flatten()
-
-        # convert to coreml and get predictions
-        model_dir = tempfile.mkdtemp()
-        model_path = os.path.join(model_dir, "keras.mlmodel")
-        from coremltools.converters import keras as keras_converter
-
-        coreml_model = keras_converter.convert(
-            model,
-            input_names=["data"],
-            output_names=["output"],
-            image_input_names=["data"],
-            red_bias=red_bias,
-            green_bias=green_bias,
-            blue_bias=blue_bias,
-        )
-
-        if _is_macos() and _macos_version() >= (10, 13):
-            coreml_input_dict = dict()
-            coreml_input_dict["data"] = PIL.Image.fromarray(data.astype(np.uint8))
-            coreml_preds = coreml_model.predict(coreml_input_dict)["output"].flatten()
-
-            self.assertEqual(len(keras_preds), len(coreml_preds))
-            max_relative_error = compare_models(keras_preds, coreml_preds)
-            self.assertAlmostEqual(max(max_relative_error, 0.001), 0.001, delta=1e-6)
-
-        if os.path.exists(model_dir):
-            shutil.rmtree(model_dir)
-
-    def test_keras_2_image_bias(self):
-        # define Keras model and get prediction
-        input_shape1 = (100, 60, 3)
-        input_shape2 = (23, 45, 3)
-
-        data1 = Input(shape=input_shape1)
-        data2 = Input(shape=input_shape2)
-        a_pool = GlobalMaxPooling2D()(data1)
-        b_pool = GlobalMaxPooling2D()(data2)
-        output = keras.layers.add([a_pool, b_pool])
-        model = Model(inputs=[data1, data2], outputs=output)
-
-        data1 = np.ones(input_shape1)
-        data2 = np.ones(input_shape2)
-        keras_input1 = np.ones(input_shape1)
-        keras_input2 = np.ones(input_shape2)
-
-        data1[:, :, 0] = 100.0
-        data1[:, :, 1] = 79.0
-        data1[:, :, 2] = 194.0
-
-        data2[:, :, 0] = 130.0
-        data2[:, :, 1] = 91.0
-        data2[:, :, 2] = 11.0
-
-        red_bias1 = -88.0
-        green_bias1 = -2
-        blue_bias1 = -40
-
-        red_bias2 = -100.0
-        green_bias2 = -29
-        blue_bias2 = -15
-
-        keras_input1[:, :, 0] = data1[:, :, 2] + blue_bias1
-        keras_input1[:, :, 1] = data1[:, :, 1] + green_bias1
-        keras_input1[:, :, 2] = data1[:, :, 0] + red_bias1
-
-        keras_input2[:, :, 0] = data2[:, :, 0] + red_bias2
-        keras_input2[:, :, 1] = data2[:, :, 1] + green_bias2
-        keras_input2[:, :, 2] = data2[:, :, 2] + blue_bias2
-
-        keras_preds = model.predict(
-            [np.expand_dims(keras_input1, axis=0), np.expand_dims(keras_input2, axis=0)]
-        )
-        keras_preds = keras_preds.flatten()
-
-        # convert to coreml and get predictions
-        model_dir = tempfile.mkdtemp()
-        model_path = os.path.join(model_dir, "keras.mlmodel")
-        from coremltools.converters import keras as keras_converter
-
-        coreml_model = keras_converter.convert(
-            model,
-            input_names=["data1", "data2"],
-            output_names=["output"],
-            image_input_names=["data1", "data2"],
-            red_bias={"data1": red_bias1, "data2": red_bias2},
-            green_bias={"data1": green_bias1, "data2": green_bias2},
-            blue_bias={"data1": blue_bias1, "data2": blue_bias2},
-            is_bgr={"data1": True, "data2": False},
-        )
-
-        if _is_macos() and _macos_version() >= (10, 13):
-            coreml_input_dict = dict()
-            coreml_input_dict["data1"] = PIL.Image.fromarray(data1.astype(np.uint8))
-            coreml_input_dict["data2"] = PIL.Image.fromarray(data2.astype(np.uint8))
-            coreml_preds = coreml_model.predict(coreml_input_dict)["output"].flatten()
-
-            # compare
-            self.assertEqual(len(keras_preds), len(coreml_preds))
-            max_relative_error = compare_models(keras_preds, coreml_preds)
-            self.assertAlmostEqual(max(max_relative_error, 0.001), 0.001, delta=1e-6)
-
-        if os.path.exists(model_dir):
-            shutil.rmtree(model_dir)
diff --git a/coremltools/test/neural_network/test_neural_networks.py b/coremltools/test/neural_network/test_neural_networks.py
index c65d25ca4..518080902 100644
--- a/coremltools/test/neural_network/test_neural_networks.py
+++ b/coremltools/test/neural_network/test_neural_networks.py
@@ -12,7 +12,6 @@
 import pytest
 
 import coremltools
-from coremltools._deps import _HAS_KERAS_TF, MSG_KERAS1_NOT_FOUND
 from coremltools._deps import _HAS_TF, MSG_TF1_NOT_FOUND
 from coremltools.models.utils import (
     _get_custom_layer_names,
@@ -22,11 +21,6 @@
 )
 from coremltools.proto import Model_pb2
 
-if _HAS_KERAS_TF:
-    from keras.models import Sequential
-    from keras.layers import Dense, LSTM
-    from coremltools.converters import keras as keras_converter
-
 if _HAS_TF:
     import tensorflow as tf
     from tensorflow.python.platform import gfile
@@ -34,155 +28,6 @@
 
     tf.compat.v1.disable_eager_execution()
 
-
-@unittest.skipIf(not _HAS_KERAS_TF, MSG_KERAS1_NOT_FOUND)
-@pytest.mark.keras1
-class KerasBasicNumericCorrectnessTest(unittest.TestCase):
-    def test_classifier(self):
-        np.random.seed(1988)
-
-        print("running test classifier")
-
-        input_dim = 5
-        num_hidden = 12
-        num_classes = 6
-        input_length = 3
-
-        model = Sequential()
-        model.add(
-            LSTM(
-                num_hidden,
-                input_dim=input_dim,
-                input_length=input_length,
-                return_sequences=False,
-            )
-        )
-        model.add(Dense(num_classes, activation="softmax"))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        input_names = ["input"]
-        output_names = ["zzzz"]
-        class_labels = ["a", "b", "c", "d", "e", "f"]
-        predicted_feature_name = "pf"
-        coremlmodel = keras_converter.convert(
-            model,
-            input_names,
-            output_names,
-            class_labels=class_labels,
-            predicted_feature_name=predicted_feature_name,
-            predicted_probabilities_output=output_names[0],
-        )
-
-        if _is_macos() and _macos_version() >= (10, 13):
-            inputs = np.random.rand(input_dim)
-            outputs = coremlmodel.predict({"input": inputs})
-            # this checks that the dictionary got the right name and type
-            self.assertEqual(type(outputs[output_names[0]]), type({"a": 0.5}))
-
-    def test_classifier_no_name(self):
-        np.random.seed(1988)
-
-        input_dim = 5
-        num_hidden = 12
-        num_classes = 6
-        input_length = 3
-
-        model = Sequential()
-        model.add(
-            LSTM(
-                num_hidden,
-                input_dim=input_dim,
-                input_length=input_length,
-                return_sequences=False,
-            )
-        )
-        model.add(Dense(num_classes, activation="softmax"))
-
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        input_names = ["input"]
-        output_names = ["zzzz"]
-        class_labels = ["a", "b", "c", "d", "e", "f"]
-        predicted_feature_name = "pf"
-        coremlmodel = keras_converter.convert(
-            model,
-            input_names,
-            output_names,
-            class_labels=class_labels,
-            predicted_feature_name=predicted_feature_name,
-        )
-
-        if _is_macos() and _macos_version() >= (10, 13):
-            inputs = np.random.rand(input_dim)
-            outputs = coremlmodel.predict({"input": inputs})
-            # this checks that the dictionary got the right name and type
-            self.assertEqual(type(outputs[output_names[0]]), type({"a": 0.5}))
-
-    def test_internal_layer(self):
-
-        np.random.seed(1988)
-
-        input_dim = 5
-        num_channels1 = 10
-        num_channels2 = 7
-        num_channels3 = 5
-
-        w1 = (np.random.rand(input_dim, num_channels1) - 0.5) / 5.0
-        w2 = (np.random.rand(num_channels1, num_channels2) - 0.5) / 5.0
-        w3 = (np.random.rand(num_channels2, num_channels3) - 0.5) / 5.0
-
-        b1 = (np.random.rand(num_channels1,) - 0.5) / 5.0
-        b2 = (np.random.rand(num_channels2,) - 0.5) / 5.0
-        b3 = (np.random.rand(num_channels3,) - 0.5) / 5.0
-
-        model = Sequential()
-        model.add(Dense(num_channels1, input_dim=input_dim))
-        model.add(Dense(num_channels2, name="middle_layer"))
-        model.add(Dense(num_channels3))
-
-        model.set_weights([w1, b1, w2, b2, w3, b3])
-
-        input_names = ["input"]
-        output_names = ["output"]
-        coreml1 = keras_converter.convert(model, input_names, output_names)
-
-        # adjust the output parameters of coreml1 to include the intermediate layer
-        spec = coreml1.get_spec()
-        coremlNewOutputs = spec.description.output.add()
-        coremlNewOutputs.name = "middle_layer_output"
-        coremlNewParams = coremlNewOutputs.type.multiArrayType
-        coremlNewParams.dataType = coremltools.proto.FeatureTypes_pb2.ArrayFeatureType.ArrayDataType.Value(
-            "DOUBLE"
-        )
-        coremlNewParams.shape.extend([num_channels2])
-
-        coremlfinal = coremltools.models.MLModel(spec)
-
-        # generate a second model which
-        model2 = Sequential()
-        model2.add(Dense(num_channels1, input_dim=input_dim))
-        model2.add(Dense(num_channels2))
-        model2.set_weights([w1, b1, w2, b2])
-
-        coreml2 = keras_converter.convert(model2, input_names, ["output2"])
-
-        if _is_macos() and _macos_version() >= (10, 13):
-            # generate input data
-            inputs = np.random.rand(input_dim)
-
-            fullOutputs = coremlfinal.predict({"input": inputs})
-
-            partialOutput = coreml2.predict({"input": inputs})
-
-            for i in range(0, num_channels2):
-                self.assertAlmostEqual(
-                    fullOutputs["middle_layer_output"][i],
-                    partialOutput["output2"][i],
-                    2,
-                )
-
-
 class CustomLayerUtilsTest(unittest.TestCase):
     @classmethod
     def setUpClass(self):
diff --git a/coremltools/test/neural_network/test_nn_builder.py b/coremltools/test/neural_network/test_nn_builder.py
index f53db22e8..a7bd6c482 100644
--- a/coremltools/test/neural_network/test_nn_builder.py
+++ b/coremltools/test/neural_network/test_nn_builder.py
@@ -9,6 +9,8 @@
 import pytest
 
 import coremltools
+from coremltools import ComputeUnit
+from coremltools.converters.mil.mil.types.type_mapping import np_val_to_py_type
 from coremltools.models import datatypes, MLModel
 from coremltools.models.neural_network import NeuralNetworkBuilder
 from coremltools.models.neural_network.quantization_utils import (
@@ -16,7 +18,7 @@
     quantize_weights,
 )
 from coremltools.models.utils import _macos_version, _is_macos
-from coremltools.converters.mil.backend.nn.op_mapping import to_py_type
+
 
 MIN_MACOS_VERSION_REQUIRED = (10, 13)
 LAYERS_10_14_MACOS_VERSION = (10, 14)
@@ -169,7 +171,7 @@ def build_quant_conv_layer(
             quant_bias=quant_bias,
             quant_lut=quant_lut,
         )
-        return MLModel(builder.spec)
+        return MLModel(builder.spec, compute_units=ComputeUnit.CPU_ONLY)
 
     def test_linear_quant_convolution_8bit(self):
         W = np.ones((2, 2, 1, 2), dtype=np.uint8)
@@ -183,7 +185,7 @@ def test_linear_quant_convolution_8bit(self):
         )
         data = np.ones((1, 2, 2))
         data_dict = {"data": data}
-        out = mlmodel.predict(data_dict, useCPUOnly=True)["out"]
+        out = mlmodel.predict(data_dict)["out"]
         expected_out = np.reshape(np.array([8, 24]), (2, 1, 1))
         self.assertTrue(np.allclose(out, expected_out))
 
@@ -199,7 +201,7 @@ def test_linear_quant_convolution_8bit_vector_scalebias(self):
         )
         data = np.ones((1, 2, 2))
         data_dict = {"data": data}
-        out = mlmodel.predict(data_dict, useCPUOnly=True)["out"]
+        out = mlmodel.predict(data_dict)["out"]
         expected_out = np.reshape(np.array([8, 44]), (2, 1, 1))
         self.assertTrue(np.allclose(out, expected_out))
 
@@ -215,7 +217,7 @@ def test_linear_quant_convolution_8bit_float_scale_and_bias(self):
         )
         data = np.ones((1, 2, 2))
         data_dict = {"data": data}
-        out = mlmodel.predict(data_dict, useCPUOnly=True)["out"]
+        out = mlmodel.predict(data_dict)["out"]
         # Output should be equal to: (scale*(1+248+248+248)+(4*bias))
         expected_out = np.reshape(np.array([-4477]), (1, 1, 1, 1, 1))
         self.assertTrue(np.allclose(out, expected_out))
@@ -230,7 +232,7 @@ def test_lut_quant_convolution_2bit(self):
         )
         data = np.ones((1, 2, 2))
         data_dict = {"data": data}
-        out = mlmodel.predict(data_dict, useCPUOnly=True)["out"]
+        out = mlmodel.predict(data_dict)["out"]
         expected_out = np.reshape(np.array([40, -12]), (2, 1, 1))
         self.assertTrue(np.allclose(out, expected_out))
 
@@ -323,12 +325,12 @@ def test_linear_quant_batchedmatmul_5bit(self):
             quant_scale=quant_scale.flatten(),
             quant_bias=quant_bias.flatten(),
         )
-        mlmodel = MLModel(builder.spec)
+        mlmodel = MLModel(builder.spec, compute_units=ComputeUnit.CPU_ONLY)
         data = np.zeros((2, 2), dtype=np.float32)
         data[0, :] = [5, 6]
         data[1, :] = [10, 12]
         data_dict = {"data": data}
-        out = mlmodel.predict(data_dict, useCPUOnly=True)["out"]
+        out = mlmodel.predict(data_dict)["out"]
         expected_out = np.matmul(data, W_unquantized) + bias
         self.assertTrue(out.shape == expected_out.shape)
         self.assertTrue(np.allclose(out.flatten(), expected_out.flatten()))
@@ -352,7 +354,7 @@ def test_linear_quant_batchedmatmul_8bit(self):
             W=W,
             bias=bias,
         )
-        mlmodel = MLModel(builder.spec)
+        mlmodel = MLModel(builder.spec, compute_units=ComputeUnit.CPU_ONLY)
         q_mlmodel = quantize_weights(mlmodel, 8)
         q_spec = q_mlmodel.get_spec()
         q_layer = q_spec.neuralNetwork.layers[0].batchedMatmul
@@ -362,7 +364,7 @@ def test_linear_quant_batchedmatmul_8bit(self):
 
         data = np.random.rand(2, 32)
         data_dict = {"data": data}
-        out = q_mlmodel.predict(data_dict, useCPUOnly=True)["out"]
+        out = q_mlmodel.predict(data_dict)["out"]
         expected_out = np.matmul(data, W) + bias
         self.assertTrue(out.shape == expected_out.shape)
         self.assertTrue(np.allclose(out.flatten(), expected_out.flatten(), atol=0.1))
@@ -396,10 +398,10 @@ def test_lut_quant_embedding_nd_2bit(self):
             quant_lut=quant_lut,
         )
 
-        mlmodel = MLModel(builder.spec)
+        mlmodel = MLModel(builder.spec, compute_units=ComputeUnit.CPU_ONLY)
         data = np.reshape(np.array([2.0, 2.0, 1.0, 0.0]), (4, 1))
         data_dict = {"data": data}
-        out = mlmodel.predict(data_dict, useCPUOnly=True)["out"]
+        out = mlmodel.predict(data_dict)["out"]
         expected_out = np.zeros((4, embed_size), dtype=np.float32)
         expected_out[0, :] = [quant_lut[W[0, 2]], quant_lut[W[1, 2]]] + bias
         expected_out[1, :] = [quant_lut[W[0, 2]], quant_lut[W[1, 2]]] + bias
@@ -441,14 +443,14 @@ def test_linear_quant_embedding_7bit(self):
             is_quantized_weight=True,
             quantization_type="linear",
             nbits=7,
-            quant_scale=to_py_type(quant_scale),
-            quant_bias=to_py_type(quant_bias),
+            quant_scale=np_val_to_py_type(quant_scale),
+            quant_bias=np_val_to_py_type(quant_bias),
         )
 
-        mlmodel = MLModel(builder.spec)
+        mlmodel = MLModel(builder.spec, compute_units=ComputeUnit.CPU_ONLY)
         data = np.reshape(np.array([2.0, 2.0, 1.0, 0.0]), (4, 1, 1, 1))
         data_dict = {"data": data}
-        out = mlmodel.predict(data_dict, useCPUOnly=True)["out"]
+        out = mlmodel.predict(data_dict)["out"]
         self.assertTrue(out.shape == (4, embed_size, 1, 1))
         expected_out = np.zeros((4, embed_size), dtype=np.float32)
         expected_out[0, :] = W_unquantized[:, 2].flatten()
diff --git a/coremltools/test/neural_network/test_numpy_nn_layers.py b/coremltools/test/neural_network/test_numpy_nn_layers.py
index e84db9939..37428ef3c 100644
--- a/coremltools/test/neural_network/test_numpy_nn_layers.py
+++ b/coremltools/test/neural_network/test_numpy_nn_layers.py
@@ -11,24 +11,24 @@
 import tempfile
 import unittest
 import uuid
-import pytest
-from packaging import version
 
 import numpy as np
-from coremltools._deps import _HAS_TF, MSG_TF1_NOT_FOUND
+import pytest
 
+from coremltools._deps import _HAS_TF, MSG_TF1_NOT_FOUND
 if _HAS_TF:
     import tensorflow as tf
 import torch
 
 import coremltools
+from coremltools import ComputeUnit
 import coremltools.models.datatypes as datatypes
 from coremltools.converters.mil.mil.ops.defs._utils import aggregated_pad
-from coremltools.models import _MLMODEL_FULL_PRECISION, _MLMODEL_HALF_PRECISION
-from coremltools.models import neural_network as neural_network
+from coremltools.models import _MLMODEL_FULL_PRECISION, _MLMODEL_HALF_PRECISION, neural_network
 from coremltools.models.neural_network import flexible_shape_utils
 from coremltools.models.utils import _macos_version, _is_macos
 
+
 np.random.seed(10)
 
 MIN_MACOS_VERSION_REQUIRED = (10, 13)
@@ -125,8 +125,13 @@ def get_moment(data, k):
         if isinstance(model, str):
             model = coremltools.models.MLModel(model)
 
-        model = coremltools.models.MLModel(model, useCPUOnly=use_cpu_only)
-        prediction = model.predict(inputs, useCPUOnly=use_cpu_only)
+        if use_cpu_only:
+            compute_unit=ComputeUnit.CPU_ONLY
+        else:
+            compute_unit=ComputeUnit.ALL
+
+        model = coremltools.models.MLModel(model, compute_units=compute_unit)
+        prediction = model.predict(inputs)
 
         for output_name in expected:
             np_preds = expected[output_name]
@@ -157,11 +162,15 @@ def _test_model(
         SNR=30,
     ):
 
+        if useCPUOnly:
+            compute_unit=ComputeUnit.CPU_ONLY
+        else:
+            compute_unit=ComputeUnit.ALL
+
         model_dir = None
         # if we're given a path to a model
         if isinstance(model, str):
-            model = coremltools.models.MLModel(model)
-
+            model = coremltools.models.MLModel(model, compute_units=compute_unit)
         # If we're passed in a specification, save out the model
         # and then load it back up
         elif isinstance(model, coremltools.proto.Model_pb2.Model):
@@ -169,14 +178,14 @@ def _test_model(
             model_name = str(uuid.uuid4()) + ".mlmodel"
             model_path = os.path.join(model_dir, model_name)
             coremltools.utils.save_spec(model, model_path)
-            model = coremltools.models.MLModel(model, useCPUOnly=useCPUOnly)
+            model = coremltools.models.MLModel(model, compute_units=compute_unit)
 
         # If we want to test the half precision case
         if model_precision == _MLMODEL_HALF_PRECISION:
             model = coremltools.utils._convert_neural_network_weights_to_fp16(model)
 
         try:
-            prediction = model.predict(input, useCPUOnly=useCPUOnly)
+            prediction = model.predict(input)
             for output_name in expected:
                 if self.__class__.__name__ == "SimpleTest":
                     self._test_shape_equality(
@@ -1745,6 +1754,7 @@ def test_elementwise_binary_cpu(self, cpu_only=True):
                 expected = {"output": func(a, b, dtype=np.float32)}
                 self._test_model(builder.spec, input, expected, useCPUOnly=cpu_only)
 
+    @pytest.mark.xfail(reason="rdar://93912621")
     def test_elementwise_binary_gpu(self):
         self.test_elementwise_binary_cpu(cpu_only=False)
 
@@ -3075,6 +3085,7 @@ def _nms_TF(
                                         iou_threshold = (
                                             np.percentile(iou_matrix, iou_thresh) + 0.01
                                         )
+                                    iou_threshold = np.maximum(iou_threshold, 1e-8)
 
                                     number_of_test += 1
 
diff --git a/coremltools/test/neural_network/test_quantization.py b/coremltools/test/neural_network/test_quantization.py
index 7e5fcb02a..c76c15dd3 100644
--- a/coremltools/test/neural_network/test_quantization.py
+++ b/coremltools/test/neural_network/test_quantization.py
@@ -5,510 +5,22 @@
 """Module containing unit tests for verifying various quantization."""
 
 import numpy as np
-import os
 import pytest
-import shutil
-import tempfile
 import unittest
 
 import coremltools
-from coremltools import ComputeUnit as _ComputeUnit
+from coremltools import ComputeUnit
 from coremltools.models import (
     neural_network,
-    _MLMODEL_FULL_PRECISION,
     _QUANTIZATION_MODE_LINEAR_QUANTIZATION,
-    _QUANTIZATION_MODE_LOOKUP_TABLE_KMEANS,
-    _QUANTIZATION_MODE_CUSTOM_LOOKUP_TABLE,
 )
 import coremltools.models.datatypes as datatypes
-import coremltools.models.neural_network.quantization_utils as quantization_utils
+from coremltools.models.neural_network import quantization_utils
 from coremltools.models.neural_network.quantization_utils import (
     activate_int8_int8_matrix_multiplications,
     MatrixMultiplyLayerSelector,
     _quantize_spec_weights,
 )
-from coremltools._deps import _HAS_KERAS2_TF
-
-@unittest.skipIf(
-    not coremltools.utils._is_macos() or coremltools.utils._macos_version() < (10, 14),
-    "Missing macOS 10.14+. Skipping tests.",
-)
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-class QuantizationNumericalCorrectnessTests(unittest.TestCase):
-    def runTest(self):
-        pass
-
-    def setUp(self):
-        self.qbits = 8  # n-bit quantization for tests
-        self.qmode = _QUANTIZATION_MODE_LINEAR_QUANTIZATION
-        self.custom_lut = None
-        from .test_keras2_numeric import KerasBasicNumericCorrectnessTest
-
-        self.keras_tester = KerasBasicNumericCorrectnessTest()
-        self.keras_tester._test_model = self._test_model
-
-    def _run_quantized_test(self, input_, full_precision_model, quantized_model, delta):
-        # Output from both models should be the same
-        full_output = full_precision_model.predict(input_)
-        quantized_output = quantized_model.predict(input_)
-        self.assertEqual(full_output.keys(), quantized_output.keys())
-
-        for key in full_output.keys():
-            full_output_flatten = full_output[key].flatten()
-            quantized_output_flatten = quantized_output[key].flatten()
-
-            self.assertTrue(len(full_output_flatten) == len(quantized_output_flatten))
-
-            norm_factor = np.maximum(full_output_flatten, quantized_output_flatten)
-            norm_factor = np.maximum(norm_factor, 1.0)
-            f_out = full_output_flatten / norm_factor
-            q_out = quantized_output_flatten / norm_factor
-
-            for idx, full_value in enumerate(f_out):
-                quantized_value = q_out[idx]
-                self.assertAlmostEqual(full_value, quantized_value, delta=delta)
-
-    def _test_model(
-        self,
-        model,
-        num_samples=1,
-        mode="random",
-        delta=1e-2,
-        model_dir=None,
-        transpose_keras_result=True,
-        one_dim_seq_flags=None,
-        model_precision=_MLMODEL_FULL_PRECISION,
-    ):
-        # Get the model path
-        use_tmp_folder = False
-        if model_dir is None:
-            use_tmp_folder = True
-            model_dir = tempfile.mkdtemp()
-
-        # Get converted coreml model and sample input
-        (
-            input_names,
-            output_names,
-            _,
-            coreml_input,
-        ) = self.keras_tester._get_coreml_model_params_and_test_input(
-            model, mode, one_dim_seq_flags
-        )
-        from .test_keras2_numeric import _get_coreml_model
-
-        coreml_model = _get_coreml_model(
-            model, input_names, output_names, model_precision=model_precision
-        )
-
-        # Now we quantize the model and dequantize it. We then use this model
-        # as our full precision model since quantizing this model again will
-        # result in 0 quantization error.
-
-        coreml_spec = coreml_model.get_spec()
-        quantization_utils._quantize_spec_weights(
-            spec=coreml_spec,
-            nbits=self.qbits,
-            quantization_mode=self.qmode,
-            lut_function=self.custom_lut,
-        )
-
-        # De-quantize model
-        quantization_utils._dequantize_nn_spec(spec=coreml_spec.neuralNetwork)
-        full_precision_model_spec = coreml_spec
-
-        # Quantize model from another copy
-        quantized_model_spec = quantization_utils._quantize_spec_weights(
-            spec=coreml_model.get_spec(),
-            nbits=self.qbits,
-            quantization_mode=self.qmode,
-            lut_function=self.custom_lut,
-        )
-
-        full_precision_model = coremltools.models.MLModel(full_precision_model_spec)
-        quantized_model = coremltools.models.MLModel(quantized_model_spec)
-        self._run_quantized_test(
-            coreml_input, full_precision_model, quantized_model, delta
-        )
-
-        # Clean up after ourselves
-        if use_tmp_folder and os.path.exists(model_dir):
-            shutil.rmtree(model_dir)
-
-    def test_quantized_tiny_inner_product(self):
-        self.keras_tester.test_tiny_inner_product()
-
-    def test_quantized_conv_batchnorm_random(self):
-        self.keras_tester.test_conv_batchnorm_random()
-
-    def test_quantized_conv_batchnorm_no_gamma_no_beta(self):
-        self.keras_tester.test_conv_batchnorm_no_gamma_no_beta()
-
-    def test_quantized_tiny_deconv_random(self):
-        self.keras_tester.test_tiny_deconv_random()
-
-    def test_quantized_tiny_deconv_random_same_padding(self):
-        self.keras_tester.test_tiny_deconv_random_same_padding()
-
-    def test_quantized_tiny_depthwise_conv_valid_pad(self):
-        self.keras_tester.test_tiny_depthwise_conv_valid_pad()
-
-    def test_quantized_tiny_separable_conv_valid_depth_multiplier(self):
-        self.keras_tester.test_tiny_separable_conv_valid_depth_multiplier()
-
-    def test_quantized_max_pooling_no_overlap(self):
-        self.keras_tester.test_max_pooling_no_overlap()
-
-    def test_quantized_dense_softmax(self):
-        self.keras_tester.test_dense_softmax()
-
-    def test_quantized_housenet_random(self):
-        self.keras_tester.test_housenet_random()
-
-    def test_quantized_large_input_length_conv1d_same_random(self):
-        self.keras_tester.test_large_input_length_conv1d_same_random()
-
-    def test_quantized_conv_dense(self):
-        pytest.xfail(reason="rdar://87349588 ([Rome gitlab CI] Re-enable time out tests)")
-        self.keras_tester.test_conv_dense()
-
-    def test_quantized_tiny_conv_crop_1d_random(self):
-        self.keras_tester.test_tiny_conv_crop_1d_random()
-
-    def test_quantized_embedding(self):
-        self.keras_tester.test_embedding()
-
-    def test_quantized_tiny_conv_elu_random(self):
-        self.keras_tester.test_tiny_conv_elu_random()
-
-    def test_quantized_tiny_concat_random(self):
-        self.keras_tester.test_tiny_concat_random()
-
-    def test_quantized_tiny_dense_tanh_fused_random(self):
-        self.keras_tester.test_tiny_dense_tanh_fused_random()
-
-    def test_quantized_conv1d_flatten(self):
-        # Softmax after quantization appears to have a bigger error margin
-        self.keras_tester.test_conv1d_flatten(delta=2e-2)
-
-    def test_quantized_tiny_conv_dropout_random(self):
-        self.keras_tester.test_tiny_conv_dropout_random()
-
-    def test_quantized_tiny_mul_random(self):
-        self.keras_tester.test_tiny_mul_random()
-
-    def test_quantized_tiny_conv_thresholded_relu_random(self):
-        self.keras_tester.test_tiny_conv_thresholded_relu_random()
-
-    def test_quantized_tiny_seq2seq_rnn_random(self):
-        self.keras_tester.test_tiny_seq2seq_rnn_random()
-
-    def test_quantized_rnn_seq(self):
-        self.keras_tester.test_rnn_seq()
-
-    def test_quantized_medium_no_sequence_simple_rnn_random(self):
-        self.keras_tester.test_medium_no_sequence_simple_rnn_random()
-
-    def test_quantized_tiny_no_sequence_lstm_zeros(self):
-        self.keras_tester.test_tiny_no_sequence_lstm_zeros()
-
-    def test_quantized_tiny_no_sequence_lstm_ones(self):
-        self.keras_tester.test_tiny_no_sequence_lstm_ones()
-
-    def test_quantized_lstm_seq(self):
-        self.keras_tester.test_lstm_seq()
-
-    def test_quantized_medium_no_sequence_lstm_random(self):
-        self.keras_tester.test_medium_no_sequence_lstm_random()
-
-    def test_quantized_tiny_no_sequence_gru_random(self):
-        self.keras_tester.test_tiny_no_sequence_gru_random()
-
-    def test_quantized_gru_seq_backwards(self):
-        self.keras_tester.test_gru_seq_backwards()
-
-    def test_quantized_tiny_no_sequence_bidir_random(self):
-        self.keras_tester.test_tiny_no_sequence_bidir_random()
-
-    def test_quantized_tiny_no_sequence_bidir_random_gpu(self):
-        self.keras_tester.test_tiny_no_sequence_bidir_random_gpu()
-
-    def test_quantized_small_no_sequence_bidir_random(self):
-        self.keras_tester.test_small_no_sequence_bidir_random()
-
-    def test_quantized_medium_no_sequence_bidir_random(self):
-        self.keras_tester.test_medium_no_sequence_bidir_random()
-
-    def test_quantized_medium_bidir_random_return_seq_false(self):
-        self.keras_tester.test_medium_bidir_random_return_seq_false()
-
-    def test_quantized_tiny_sequence_lstm(self):
-        self.keras_tester.test_tiny_sequence_lstm()
-
-    def test_quantized__lstm_td(self):
-        self.keras_tester.test_lstm_td()
-
-    def test_quantized_large_channel_gpu(self):
-        self.keras_tester.test_large_channel_gpu()
-
-    def test_quantized_tiny_seq2seq_rnn_random(self):
-        self.keras_tester.test_tiny_seq2seq_rnn_random()
-
-    def test_quantized_lstm_seq_backwards(self):
-        self.keras_tester.test_lstm_seq_backwards()
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-@pytest.mark.slow
-class SevenBitQuantizationNumericalCorrectnessTests(
-    QuantizationNumericalCorrectnessTests
-):
-    def setUp(self):
-        super(SevenBitQuantizationNumericalCorrectnessTests, self).setUp()
-        self.qbits = 7
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-@pytest.mark.slow
-class SixBitQuantizationNumericalCorrectnessTests(
-    QuantizationNumericalCorrectnessTests
-):
-    def setUp(self):
-        super(SixBitQuantizationNumericalCorrectnessTests, self).setUp()
-        self.qbits = 6
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-@pytest.mark.slow
-class FiveBitQuantizationNumericalCorrectnessTests(
-    QuantizationNumericalCorrectnessTests
-):
-    def setUp(self):
-        super(FiveBitQuantizationNumericalCorrectnessTests, self).setUp()
-        self.qbits = 5
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-class FourBitQuantizationNumericalCorrectnessTests(
-    QuantizationNumericalCorrectnessTests
-):
-    def setUp(self):
-        super(FourBitQuantizationNumericalCorrectnessTests, self).setUp()
-        self.qbits = 4
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-@pytest.mark.slow
-class ThreeBitQuantizationNumericalCorrectnessTests(
-    QuantizationNumericalCorrectnessTests
-):
-    def setUp(self):
-        super(ThreeBitQuantizationNumericalCorrectnessTests, self).setUp()
-        self.qbits = 3
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-@pytest.mark.slow
-class TwoBitQuantizationNumericalCorrectnessTests(
-    QuantizationNumericalCorrectnessTests
-):
-    def setUp(self):
-        super(TwoBitQuantizationNumericalCorrectnessTests, self).setUp()
-        self.qbits = 2
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-class OneBitQuantizationNumericalCorrectnessTests(
-    QuantizationNumericalCorrectnessTests
-):
-    def setUp(self):
-        super(OneBitQuantizationNumericalCorrectnessTests, self).setUp()
-        self.qbits = 1
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-class LUTQuantizationNumericalCorrectnessTests(QuantizationNumericalCorrectnessTests):
-    def setUp(self):
-        super(LUTQuantizationNumericalCorrectnessTests, self).setUp()
-        self.qbits = 8
-        self.qmode = _QUANTIZATION_MODE_LOOKUP_TABLE_KMEANS
-
-    def test_quantized_custom_lut(self):
-        pass
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-@pytest.mark.slow
-class LUTSevenBitQuantizationNumericalCorrectnessTests(
-    QuantizationNumericalCorrectnessTests
-):
-    def setUp(self):
-        super(LUTSevenBitQuantizationNumericalCorrectnessTests, self).setUp()
-        self.qbits = 7
-        self.qmode = _QUANTIZATION_MODE_LOOKUP_TABLE_KMEANS
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-@pytest.mark.slow
-class LUTSixBitQuantizationNumericalCorrectnessTests(
-    QuantizationNumericalCorrectnessTests
-):
-    def setUp(self):
-        super(LUTSixBitQuantizationNumericalCorrectnessTests, self).setUp()
-        self.qbits = 6
-        self.qmode = _QUANTIZATION_MODE_LOOKUP_TABLE_KMEANS
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-@pytest.mark.slow
-class LUTFiveBitQuantizationNumericalCorrectnessTests(
-    QuantizationNumericalCorrectnessTests
-):
-    def setUp(self):
-        super(LUTFiveBitQuantizationNumericalCorrectnessTests, self).setUp()
-        self.qbits = 5
-        self.qmode = _QUANTIZATION_MODE_LOOKUP_TABLE_KMEANS
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-class LUTFourBitQuantizationNumericalCorrectnessTests(
-    QuantizationNumericalCorrectnessTests
-):
-    def setUp(self):
-        super(LUTFourBitQuantizationNumericalCorrectnessTests, self).setUp()
-        self.qbits = 4
-        self.qmode = _QUANTIZATION_MODE_LOOKUP_TABLE_KMEANS
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-@pytest.mark.slow
-class LUTThreeBitQuantizationNumericalCorrectnessTests(
-    QuantizationNumericalCorrectnessTests
-):
-    def setUp(self):
-        super(LUTThreeBitQuantizationNumericalCorrectnessTests, self).setUp()
-        self.qbits = 3
-        self.qmode = _QUANTIZATION_MODE_LOOKUP_TABLE_KMEANS
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-@pytest.mark.slow
-class LUTTwoBitQuantizationNumericalCorrectnessTests(
-    QuantizationNumericalCorrectnessTests
-):
-    def setUp(self):
-        super(LUTTwoBitQuantizationNumericalCorrectnessTests, self).setUp()
-        self.qbits = 2
-        self.qmode = _QUANTIZATION_MODE_LOOKUP_TABLE_KMEANS
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-class LUTOneBitQuantizationNumericalCorrectnessTests(
-    QuantizationNumericalCorrectnessTests
-):
-    def setUp(self):
-        super(LUTOneBitQuantizationNumericalCorrectnessTests, self).setUp()
-        self.qbits = 1
-        self.qmode = _QUANTIZATION_MODE_LOOKUP_TABLE_KMEANS
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-class LUTCustomQuantizationNumericalCorrectnessTests(
-    QuantizationNumericalCorrectnessTests
-):
-    def setUp(self):
-        super(LUTCustomQuantizationNumericalCorrectnessTests, self).setUp()
-        self.qbits = 8
-        self.qmode = _QUANTIZATION_MODE_CUSTOM_LOOKUP_TABLE
-        self.custom_lut = quantization_utils._get_linear_lookup_table_and_weight
-
-
-from coremltools.converters import keras as keras_converter
-
-
-@unittest.skipIf(
-    not coremltools.utils._is_macos() or coremltools.utils._macos_version() < (10, 14),
-    "Missing macOS 10.14+. Skipping tests.",
-)
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras. Skipping tests.")
-@pytest.mark.keras2
-class AdvancedQuantizationNumericalCorrectnessTests(unittest.TestCase):
-    """ Quantization tests for advanced settings
-    """
-
-    def test_8bit_symmetric_and_skips(self):
-        from keras.models import Sequential
-        from keras.layers import Conv2D
-
-        def stable_rel_error(x, ref):
-            err = x - ref
-            denom = np.maximum(np.abs(ref), np.ones_like(ref))
-            return np.abs(err) / denom
-
-        np.random.seed(1988)
-        input_dim = 16
-        num_kernels, kernel_height, kernel_width, input_channels = 64, 3, 3, 32
-
-        # Define a model
-        model = Sequential()
-        model.add(
-            Conv2D(
-                input_shape=(input_dim, input_dim, input_channels),
-                filters=num_kernels,
-                kernel_size=(kernel_height, kernel_width),
-            )
-        )
-
-        # Set some random weights
-        weight, bias = model.layers[0].get_weights()
-        num_filters = weight.shape[-1]
-        filter_shape = weight.shape[:-1]
-
-        new_weight = np.stack(
-            [4.0 * np.random.rand(*filter_shape) - 2 for i in range(num_filters)],
-            axis=-1,
-        )
-        model.layers[0].set_weights([new_weight, bias])
-
-        mlmodel = keras_converter.convert(model, ["data"], ["output_0"])
-        selector = quantization_utils.AdvancedQuantizedLayerSelector(
-            skip_layer_types=["batchnorm", "bias", "depthwiseConv"],
-            minimum_conv_kernel_channels=4,
-            minimum_conv_weight_count=4096,
-        )
-
-        q_mlmodel = quantization_utils.quantize_weights(mlmodel, 8, selector=selector)
-
-        input_shape = (1, 1, input_channels, input_dim, input_dim)
-        input_val = 2 * np.random.rand(*input_shape) - 1
-
-        coreml_input = {"data": input_val}
-        coreml_output = mlmodel.predict(coreml_input)
-        q_coreml_output = q_mlmodel.predict(coreml_input)
-
-        val = coreml_output["output_0"]
-        q_val = q_coreml_output["output_0"]
-        rel_err = stable_rel_error(q_val, val)
-        max_rel_err, mean_rel_err = np.max(rel_err), np.mean(rel_err)
-        self.assertTrue(max_rel_err < 0.25)
-        self.assertTrue(max_rel_err > 0.01)
-        self.assertTrue(mean_rel_err < 0.02)
 
 
 @unittest.skipIf(
@@ -553,8 +65,8 @@ def compare(self, specification_modified=True):
         x = np.random.rand(*self.input_shape)
 
         def _get_preds(spec):
-            mlmodel = coremltools.models.MLModel(spec)
-            return mlmodel.predict({"data": x}, useCPUOnly=True)["output"]
+            mlmodel = coremltools.models.MLModel(spec, compute_units=ComputeUnit.CPU_ONLY)
+            return mlmodel.predict({"data": x})["output"]
 
         preds = _get_preds(self.builder.spec)
         self.assertEqual(self.builder.spec.specificationVersion, 4)
@@ -1002,7 +514,7 @@ def test_batched_matmul_1bit_weight_quantized(self):
 class TestQuantizeWeightsAPI:
     @staticmethod
     @pytest.mark.parametrize(
-        "compute_units", [_ComputeUnit.ALL, _ComputeUnit.CPU_AND_GPU, _ComputeUnit.CPU_ONLY])
+        "compute_units", [ComputeUnit.ALL, ComputeUnit.CPU_AND_GPU, ComputeUnit.CPU_ONLY])
     def test_embeddingND_quantize(compute_units):
         input_features = [("data", datatypes.Array(10, 1))]
         output_features = [("output", None)]
diff --git a/coremltools/test/neural_network/test_recurrent_stress_tests.py b/coremltools/test/neural_network/test_recurrent_stress_tests.py
deleted file mode 100644
index 50e629e2e..000000000
--- a/coremltools/test/neural_network/test_recurrent_stress_tests.py
+++ /dev/null
@@ -1,1951 +0,0 @@
-#  Copyright (c) 2021, Apple Inc. All rights reserved.
-#
-#  Use of this source code is governed by a BSD-3-clause license that can be
-#  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-
-from copy import copy
-import itertools
-import numpy as np
-import pytest
-import unittest
-
-from coremltools._deps import _HAS_KERAS2_TF, _HAS_KERAS_TF
-from coremltools.models.utils import _macos_version, _is_macos
-
-
-np.random.seed(1377)
-
-if _HAS_KERAS2_TF or _HAS_KERAS_TF:
-    import keras
-    from keras.models import Sequential
-    from keras.layers import LSTM, GRU, SimpleRNN, RepeatVector
-    from keras.layers.wrappers import Bidirectional
-    import keras.backend as K
-    from coremltools.converters import keras as keras_converter
-
-"""
-=============================
-Utility Functions
-=============================
-"""
-
-
-def get_recurrent_activation_name_from_keras(activation):
-    if activation == keras.activations.sigmoid:
-        activation_str = "SIGMOID"
-    elif activation == keras.activations.hard_sigmoid:
-        activation_str = "SIGMOID_HARD"
-    elif activation == keras.activations.tanh:
-        activation_str = "TANH"
-    elif activation == keras.activations.relu:
-        activation_str = "RELU"
-    elif activation == keras.activations.linear:
-        activation_str = "LINEAR"
-    else:
-        raise NotImplementedError(
-            "activation %s not supported for Recurrent layer." % activation
-        )
-    return activation_str
-
-
-def linear(x, alpha=1, beta=0):
-    return alpha * x + beta
-
-
-def relu(x):
-    return np.maximum(0, x)
-
-
-def sigmoid(x):
-    return 1.0 / (1 + np.exp(-x))
-
-
-def hard_sigmoid(x, alpha=0.2, beta=0.5):
-    return np.minimum(np.maximum(alpha * x + beta, 0), 1)
-
-
-def tanh(x):
-    return np.tanh(x)
-
-
-def apply_act(x, option):
-    if option == "TANH":
-        return tanh(x)
-    elif option == "RELU":
-        return relu(x)
-    elif option == "SIGMOID":
-        return sigmoid(x)
-    elif option == "SIGMOID_HARD":
-        return hard_sigmoid(x)
-    elif option == "LINEAR":
-        return linear(x)
-
-
-def clip(x, threshold=50.0):
-    return np.maximum(np.minimum(x, threshold), -threshold)
-
-
-def valid_params(params):
-    """Checks if this combination of parameters is allowed by Keras"""
-    return not (params["input_dims"][1] == 1 and params["unroll"])
-
-
-def _compute_SNR(x, y):
-    x = x.flatten()
-    y = y.flatten()
-    noise = x - y
-    noise_var = np.sum(noise ** 2) / len(noise) + 1e-7
-    signal_energy = np.sum(y ** 2) / len(y)
-    signal_energy2 = np.sum(x ** 2) / len(x)
-    if signal_energy < 1e-5 and signal_energy2 < 1e-5:
-        return 50, 50, 0
-    max_signal_energy = np.amax(y ** 2)
-    SNR = 10 * np.log10(signal_energy / noise_var)
-    PSNR = 10 * np.log10(max_signal_energy / noise_var)
-    return SNR, PSNR, signal_energy
-
-
-"""
-=============================
-Numpy implementations
-=============================
-"""
-
-
-def get_numpy_prediction_gru(model, X):
-    X = X[0, :, :]
-    seq_len, input_size = X.shape
-    keras_layer = model.layers[0]
-    return_seq = keras_layer.return_sequences
-    if keras_layer.go_backwards:
-        X = X[::-1, :]
-
-    if _HAS_KERAS2_TF:
-        hidden_size = keras_layer.units
-
-        keras_W_h = keras_layer.get_weights()[1].T
-        R_z = keras_W_h[0 * hidden_size :][:hidden_size]
-        R_r = keras_W_h[1 * hidden_size :][:hidden_size]
-        R_o = keras_W_h[2 * hidden_size :][:hidden_size]
-
-        keras_W_x = keras_layer.get_weights()[0].T
-        W_z = keras_W_x[0 * hidden_size :][:hidden_size]
-        W_r = keras_W_x[1 * hidden_size :][:hidden_size]
-        W_o = keras_W_x[2 * hidden_size :][:hidden_size]
-
-        keras_b = keras_layer.get_weights()[2]
-        b_z = keras_b[0 * hidden_size :][:hidden_size]
-        b_r = keras_b[1 * hidden_size :][:hidden_size]
-        b_o = keras_b[2 * hidden_size :][:hidden_size]
-
-        inner_activation_str = get_recurrent_activation_name_from_keras(
-            keras_layer.recurrent_activation
-        )
-        activation_str = get_recurrent_activation_name_from_keras(
-            keras_layer.activation
-        )
-
-    else:
-        hidden_size = keras_layer.output_dim
-
-        W_z = keras_layer.get_weights()[0].T
-        W_r = keras_layer.get_weights()[3].T
-        W_o = keras_layer.get_weights()[6].T
-
-        R_z = keras_layer.get_weights()[1].T
-        R_r = keras_layer.get_weights()[4].T
-        R_o = keras_layer.get_weights()[7].T
-
-        b_z = keras_layer.get_weights()[2]
-        b_r = keras_layer.get_weights()[5]
-        b_o = keras_layer.get_weights()[8]
-
-        inner_activation_str = get_recurrent_activation_name_from_keras(
-            keras_layer.inner_activation
-        )
-        activation_str = get_recurrent_activation_name_from_keras(
-            keras_layer.activation
-        )
-
-    h = np.zeros((hidden_size))
-    c = np.zeros((hidden_size))
-    np_out = np.zeros((seq_len, hidden_size))
-    for k in range(seq_len):
-        x = X[k, :]
-        z = apply_act(clip(np.dot(W_z, x) + np.dot(R_z, h) + b_z), inner_activation_str)
-        r = apply_act(clip(np.dot(W_r, x) + np.dot(R_r, h) + b_r), inner_activation_str)
-        c = clip(h * r)
-        o = apply_act(clip(np.dot(W_o, x) + np.dot(R_o, c) + b_o), activation_str)
-        h = (1 - z) * o + z * h
-        np_out[k, :] = h
-
-    if return_seq:
-        np_out_final = np_out
-    else:
-        np_out_final = np_out[-1, :]
-    return np_out_final
-
-
-def get_numpy_prediction_unilstm(model, X):
-    X = X[0, :, :]
-    seq_len, input_size = X.shape
-    keras_layer = model.layers[0]
-    return_seq = keras_layer.return_sequences
-    if keras_layer.go_backwards:
-        X = X[::-1, :]
-
-    if _HAS_KERAS2_TF:
-        hidden_size = keras_layer.units
-
-        keras_W_h = keras_layer.get_weights()[1].T
-        R_i = keras_W_h[0 * hidden_size :][:hidden_size]
-        R_f = keras_W_h[1 * hidden_size :][:hidden_size]
-        R_o = keras_W_h[3 * hidden_size :][:hidden_size]
-        R_g = keras_W_h[2 * hidden_size :][:hidden_size]
-
-        keras_W_x = keras_layer.get_weights()[0].T
-        W_i = keras_W_x[0 * hidden_size :][:hidden_size]
-        W_f = keras_W_x[1 * hidden_size :][:hidden_size]
-        W_o = keras_W_x[3 * hidden_size :][:hidden_size]
-        W_g = keras_W_x[2 * hidden_size :][:hidden_size]
-
-        keras_b = keras_layer.get_weights()[2]
-        b_i = keras_b[0 * hidden_size :][:hidden_size]
-        b_f = keras_b[1 * hidden_size :][:hidden_size]
-        b_o = keras_b[3 * hidden_size :][:hidden_size]
-        b_g = keras_b[2 * hidden_size :][:hidden_size]
-
-        inner_activation_str = get_recurrent_activation_name_from_keras(
-            keras_layer.recurrent_activation
-        )
-        activation_str = get_recurrent_activation_name_from_keras(
-            keras_layer.activation
-        )
-
-    else:
-        hidden_size = keras_layer.output_dim
-
-        R_i = keras_layer.get_weights()[1].T
-        R_f = keras_layer.get_weights()[7].T
-        R_o = keras_layer.get_weights()[10].T
-        R_g = keras_layer.get_weights()[4].T
-
-        W_i = keras_layer.get_weights()[0].T
-        W_f = keras_layer.get_weights()[6].T
-        W_o = keras_layer.get_weights()[9].T
-        W_g = keras_layer.get_weights()[3].T
-
-        b_i = keras_layer.get_weights()[2]
-        b_f = keras_layer.get_weights()[8]
-        b_o = keras_layer.get_weights()[11]
-        b_g = keras_layer.get_weights()[5]
-
-        inner_activation_str = get_recurrent_activation_name_from_keras(
-            keras_layer.inner_activation
-        )
-        activation_str = get_recurrent_activation_name_from_keras(
-            keras_layer.activation
-        )
-
-    h = np.zeros((hidden_size))
-    c = np.zeros((hidden_size))
-    np_out = np.zeros((seq_len, hidden_size))
-    for k in range(seq_len):
-        x = X[k, :]
-        i = apply_act(clip(np.dot(W_i, x) + np.dot(R_i, h) + b_i), inner_activation_str)
-        f = apply_act(clip(np.dot(W_f, x) + np.dot(R_f, h) + b_f), inner_activation_str)
-        g = apply_act(clip(np.dot(W_g, x) + np.dot(R_g, h) + b_g), activation_str)
-        c = c * f + i * g
-        c = clip(c, 50000.0)
-        o = apply_act(clip(np.dot(W_o, x) + np.dot(R_o, h) + b_o), inner_activation_str)
-        h = o * apply_act(c, activation_str)
-        np_out[k, :] = h
-    if return_seq:
-        np_out_final = np_out
-    else:
-        np_out_final = np_out[-1, :]
-    return np_out_final
-
-
-def get_numpy_prediction_bilstm_batched(model, X):
-    batch, _, _ = X.shape
-    out = []
-    for i in range(batch):
-        out.append(
-            get_numpy_prediction_bilstm(model, np.expand_dims(X[i, :, :], axis=0))
-        )
-    return np.stack(out, axis=0)
-
-
-def get_numpy_prediction_bilstm(model, X):
-    X = X[0, :, :]
-    seq_len, input_size = X.shape
-    keras_layer = model.layers[0]
-    return_seq = keras_layer.return_sequences
-
-    if _HAS_KERAS2_TF:
-        hidden_size = keras_layer.forward_layer.units
-
-        keras_W_h = keras_layer.forward_layer.get_weights()[1].T
-        R_i = keras_W_h[0 * hidden_size :][:hidden_size]
-        R_f = keras_W_h[1 * hidden_size :][:hidden_size]
-        R_o = keras_W_h[3 * hidden_size :][:hidden_size]
-        R_g = keras_W_h[2 * hidden_size :][:hidden_size]
-
-        keras_W_x = keras_layer.forward_layer.get_weights()[0].T
-        W_i = keras_W_x[0 * hidden_size :][:hidden_size]
-        W_f = keras_W_x[1 * hidden_size :][:hidden_size]
-        W_o = keras_W_x[3 * hidden_size :][:hidden_size]
-        W_g = keras_W_x[2 * hidden_size :][:hidden_size]
-
-        keras_b = keras_layer.forward_layer.get_weights()[2]
-        b_i = keras_b[0 * hidden_size :][:hidden_size]
-        b_f = keras_b[1 * hidden_size :][:hidden_size]
-        b_o = keras_b[3 * hidden_size :][:hidden_size]
-        b_g = keras_b[2 * hidden_size :][:hidden_size]
-
-        keras_W_h = keras_layer.backward_layer.get_weights()[1].T
-        R_i_back = keras_W_h[0 * hidden_size :][:hidden_size]
-        R_f_back = keras_W_h[1 * hidden_size :][:hidden_size]
-        R_o_back = keras_W_h[3 * hidden_size :][:hidden_size]
-        R_g_back = keras_W_h[2 * hidden_size :][:hidden_size]
-
-        keras_W_x = keras_layer.backward_layer.get_weights()[0].T
-        W_i_back = keras_W_x[0 * hidden_size :][:hidden_size]
-        W_f_back = keras_W_x[1 * hidden_size :][:hidden_size]
-        W_o_back = keras_W_x[3 * hidden_size :][:hidden_size]
-        W_g_back = keras_W_x[2 * hidden_size :][:hidden_size]
-
-        keras_b = keras_layer.backward_layer.get_weights()[2]
-        b_i_back = keras_b[0 * hidden_size :][:hidden_size]
-        b_f_back = keras_b[1 * hidden_size :][:hidden_size]
-        b_o_back = keras_b[3 * hidden_size :][:hidden_size]
-        b_g_back = keras_b[2 * hidden_size :][:hidden_size]
-
-        inner_activation_str = get_recurrent_activation_name_from_keras(
-            keras_layer.forward_layer.recurrent_activation
-        )
-        activation_str = get_recurrent_activation_name_from_keras(
-            keras_layer.forward_layer.activation
-        )
-
-    else:
-        hidden_size = keras_layer.forward_layer.output_dim
-
-        R_i = keras_layer.get_weights()[1].T
-        R_f = keras_layer.get_weights()[7].T
-        R_o = keras_layer.get_weights()[10].T
-        R_g = keras_layer.get_weights()[4].T
-
-        W_i = keras_layer.get_weights()[0].T
-        W_f = keras_layer.get_weights()[6].T
-        W_o = keras_layer.get_weights()[9].T
-        W_g = keras_layer.get_weights()[3].T
-
-        b_i = keras_layer.get_weights()[2]
-        b_f = keras_layer.get_weights()[8]
-        b_o = keras_layer.get_weights()[11]
-        b_g = keras_layer.get_weights()[5]
-
-        R_i_back = keras_layer.backward_layer.get_weights()[1].T
-        R_f_back = keras_layer.backward_layer.get_weights()[7].T
-        R_o_back = keras_layer.backward_layer.get_weights()[10].T
-        R_g_back = keras_layer.backward_layer.get_weights()[4].T
-
-        W_i_back = keras_layer.backward_layer.get_weights()[0].T
-        W_f_back = keras_layer.backward_layer.get_weights()[6].T
-        W_o_back = keras_layer.backward_layer.get_weights()[9].T
-        W_g_back = keras_layer.backward_layer.get_weights()[3].T
-
-        b_i_back = keras_layer.backward_layer.get_weights()[2]
-        b_f_back = keras_layer.backward_layer.get_weights()[8]
-        b_o_back = keras_layer.backward_layer.get_weights()[11]
-        b_g_back = keras_layer.backward_layer.get_weights()[5]
-
-        inner_activation_str = get_recurrent_activation_name_from_keras(
-            keras_layer.forward_layer.inner_activation
-        )
-        activation_str = get_recurrent_activation_name_from_keras(
-            keras_layer.forward_layer.activation
-        )
-
-    h = np.zeros((hidden_size))
-    c = np.zeros((hidden_size))
-    np_out_forward = np.zeros((seq_len, hidden_size))
-    for k in range(seq_len):
-        x = X[k, :]
-        i = apply_act(clip(np.dot(W_i, x) + np.dot(R_i, h) + b_i), inner_activation_str)
-        f = apply_act(clip(np.dot(W_f, x) + np.dot(R_f, h) + b_f), inner_activation_str)
-        g = apply_act(clip(np.dot(W_g, x) + np.dot(R_g, h) + b_g), activation_str)
-        c = c * f + i * g
-        c = clip(c, 50000.0)
-        o = apply_act(clip(np.dot(W_o, x) + np.dot(R_o, h) + b_o), inner_activation_str)
-        h = o * apply_act(c, activation_str)
-        np_out_forward[k, :] = h
-
-    h = np.zeros((hidden_size))
-    c = np.zeros((hidden_size))
-    np_out_backward = np.zeros((seq_len, hidden_size))
-    for k in range(seq_len):
-        x = X[seq_len - k - 1, :]
-        i = apply_act(
-            clip(np.dot(W_i_back, x) + np.dot(R_i_back, h) + b_i_back),
-            inner_activation_str,
-        )
-        f = apply_act(
-            clip(np.dot(W_f_back, x) + np.dot(R_f_back, h) + b_f_back),
-            inner_activation_str,
-        )
-        g = apply_act(
-            clip(np.dot(W_g_back, x) + np.dot(R_g_back, h) + b_g_back), activation_str
-        )
-        c = c * f + i * g
-        c = clip(c, 50000.0)
-        o = apply_act(
-            clip(np.dot(W_o_back, x) + np.dot(R_o_back, h) + b_o_back),
-            inner_activation_str,
-        )
-        h = o * apply_act(c, activation_str)
-        np_out_backward[k, :] = h
-
-    if return_seq:
-        np_out_final = np.zeros((seq_len, 2 * hidden_size))
-        for k in range(seq_len):
-            np_out_final[k, :hidden_size] = np_out_forward[k, :]
-            np_out_final[k, hidden_size:] = np_out_backward[seq_len - k - 1, :]
-    else:
-        np_out_final = np.zeros((2 * hidden_size))
-        np_out_final[:hidden_size] = np_out_forward[-1, :]
-        np_out_final[hidden_size:] = np_out_backward[-1, :]
-    return np_out_final
-
-
-"""
-=============================
-Nosetest Functions
-=============================
-"""
-
-
-def get_mlkit_model_from_path(model):
-    from coremltools.converters import keras as keras_converter
-
-    model = keras_converter.convert(model, ["data"], ["output"])
-    return model
-
-
-def generate_input(dim0, dim1, dim2):
-    input_data = np.random.rand(dim0, dim1, dim2).astype(
-        "f"
-    )  # astype() should be removed
-    return input_data
-
-
-def simple_model_eval(params, model):
-    mlkitmodel = get_mlkit_model_from_path(model)
-    # New test case takes in 2D input as opposed to uniform 3d input across all other tests
-    if len(params[0]["input_dims"]) == 3:
-        input_data = generate_input(
-            params[0]["input_dims"][0],
-            params[0]["input_dims"][1],
-            params[0]["input_dims"][2],
-        )
-        keras_preds = model.predict(input_data).flatten()
-    elif len(params[0]["input_dims"]) == 2:
-        input_data = np.squeeze(
-            np.random.rand(params[0]["input_dims"][0], params[0]["input_dims"][1])
-        )
-        keras_preds = model.predict(
-            input_data.reshape((params[0]["input_dims"][0], params[0]["input_dims"][1]))
-        ).flatten()
-    if len(params[0]["input_dims"]) == 3:
-        input_data = np.transpose(input_data, [1, 0, 2])
-    if _is_macos() and _macos_version() >= (10, 13):
-        coreml_preds = mlkitmodel.predict({"data": input_data})["output"].flatten()
-        if K.tensorflow_backend._SESSION:
-            import tensorflow as tf
-
-            tf.reset_default_graph()
-            K.tensorflow_backend._SESSION.close()
-            K.tensorflow_backend._SESSION = None
-
-        max_denominator = np.maximum(
-            np.maximum(np.abs(coreml_preds), np.abs(keras_preds)), 1.0
-        )
-        relative_error = coreml_preds / max_denominator - keras_preds / max_denominator
-        return relative_error, keras_preds, coreml_preds
-    else:
-        return [], None, None
-
-
-class SimpleTestCase(unittest.TestCase):
-    """
-    Test Simple test cases to make sure layers work under basic params. Also, template for testing
-    different failing test cases from stress tests
-    """
-
-    def _test_simple_rnn(self, keras_major_version):
-        params = (
-            dict(
-                input_dims=[1, 2, 100],
-                go_backwards=False,
-                activation="tanh",
-                stateful=False,
-                unroll=False,
-                return_sequences=True,
-                output_dim=4,  # Passes for < 3
-            ),
-        )
-        model = Sequential()
-        if keras_major_version == 2:
-            model.add(
-                SimpleRNN(
-                    units=params[0]["output_dim"],
-                    input_shape=(
-                        params[0]["input_dims"][1],
-                        params[0]["input_dims"][2],
-                    ),
-                    activation=params[0]["activation"],
-                    return_sequences=params[0]["return_sequences"],
-                    go_backwards=params[0]["go_backwards"],
-                    unroll=True,
-                )
-            )
-        else:
-            model.add(
-                SimpleRNN(
-                    output_dim=params[0]["output_dim"],
-                    input_length=params[0]["input_dims"][1],
-                    input_dim=params[0]["input_dims"][2],
-                    activation=params[0]["activation"],
-                    return_sequences=params[0]["return_sequences"],
-                    go_backwards=params[0]["go_backwards"],
-                    unroll=True,
-                )
-            )
-        relative_error, keras_preds, coreml_preds = simple_model_eval(params, model)
-        for i in range(len(relative_error)):
-            self.assertLessEqual(relative_error[i], 0.01)
-
-    def _test_simple_lstm(self, keras_major_version):
-        params = (
-            dict(
-                input_dims=[1, 3, 5],
-                go_backwards=True,
-                activation="linear",
-                stateful=False,
-                unroll=False,
-                return_sequences=False,
-                output_dim=3,
-                inner_activation="linear",
-            ),
-        )
-        model = Sequential()
-        if keras_major_version == 2:
-            model.add(
-                LSTM(
-                    units=params[0]["output_dim"],
-                    input_shape=(
-                        params[0]["input_dims"][1],
-                        params[0]["input_dims"][2],
-                    ),
-                    activation=params[0]["activation"],
-                    return_sequences=params[0]["return_sequences"],
-                    go_backwards=params[0]["go_backwards"],
-                    unroll=True,
-                    recurrent_activation="linear",
-                )
-            )
-        else:
-            model.add(
-                LSTM(
-                    output_dim=params[0]["output_dim"],
-                    input_length=params[0]["input_dims"][1],
-                    input_dim=params[0]["input_dims"][2],
-                    activation=params[0]["activation"],
-                    return_sequences=params[0]["return_sequences"],
-                    go_backwards=params[0]["go_backwards"],
-                    unroll=True,
-                    inner_activation="linear",
-                )
-            )
-        relative_error, keras_preds, coreml_preds = simple_model_eval(params, model)
-        for i in range(len(relative_error)):
-            self.assertLessEqual(relative_error[i], 0.01)
-
-    def _test_simple_gru(self, keras_major_version):
-        params = (
-            dict(
-                input_dims=[1, 4, 8],
-                go_backwards=False,
-                activation="tanh",
-                stateful=False,
-                unroll=False,
-                return_sequences=False,
-                output_dim=4,
-            ),
-        )
-        model = Sequential()
-        if keras_major_version == 2:
-            model.add(
-                GRU(
-                    units=params[0]["output_dim"],
-                    input_shape=(
-                        params[0]["input_dims"][1],
-                        params[0]["input_dims"][2],
-                    ),
-                    activation=params[0]["activation"],
-                    recurrent_activation="sigmoid",
-                    return_sequences=params[0]["return_sequences"],
-                    go_backwards=params[0]["go_backwards"],
-                    unroll=True,
-                )
-            )
-        else:
-            model.add(
-                GRU(
-                    output_dim=params[0]["output_dim"],
-                    input_length=params[0]["input_dims"][1],
-                    input_dim=params[0]["input_dims"][2],
-                    activation=params[0]["activation"],
-                    inner_activation="sigmoid",
-                    return_sequences=params[0]["return_sequences"],
-                    go_backwards=params[0]["go_backwards"],
-                    unroll=True,
-                )
-            )
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-        relative_error, keras_preds, coreml_preds = simple_model_eval(params, model)
-        for i in range(len(relative_error)):
-            self.assertLessEqual(relative_error[i], 0.01)
-
-    @unittest.skipIf(not _HAS_KERAS_TF, "Missing keras 1. Skipping test.")
-    @pytest.mark.keras1
-    def test_keras1_simple_rnn(self):
-        self._test_simple_rnn(keras_major_version=1)
-
-    @unittest.skipIf(not _HAS_KERAS_TF, "Missing keras 1. Skipping test.")
-    @pytest.mark.keras1
-    def test_keras1_simple_lstm(self):
-        self._test_simple_lstm(keras_major_version=1)
-
-    @unittest.skipIf(not _HAS_KERAS_TF, "Missing keras 1. Skipping test.")
-    @pytest.mark.keras1
-    def test_keras1_simple_gru(self):
-        self._test_simple_gru(keras_major_version=1)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    def test_keras2_simple_rnn(self):
-        self._test_simple_rnn(keras_major_version=2)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    def test_keras2_simple_lstm(self):
-        self._test_simple_lstm(keras_major_version=2)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    def test_keras2_simple_gru(self):
-        self._test_simple_gru(keras_major_version=2)
-
-
-class RecurrentLayerTest(unittest.TestCase):
-    """
-    Base class for recurrent layer tests. Masking param not included here
-    """
-
-    def setUp(self):
-        self.params_dict = dict(
-            input_dims=[[1, 5, 10], [1, 1, 1], [1, 2, 5]],
-            output_dim=[1, 5, 10],
-            stateful=[False],
-            go_backwards=[False, True],
-            unroll=[True],
-            return_sequences=[False, True],
-            activation=["tanh", "linear", "sigmoid", "hard_sigmoid", "relu"],
-        )
-        self.base_layer_params = list(itertools.product(*self.params_dict.values()))
-
-
-class RNNLayer(RecurrentLayerTest):
-    """
-    Class for testing single RNN layer
-    """
-
-    def setUp(self):
-        super(RNNLayer, self).setUp()
-        self.simple_rnn_params_dict = self.params_dict
-        self.rnn_layer_params = list(
-            itertools.product(self.simple_rnn_params_dict.values())
-        )
-
-    def _test_rnn_layer(self, keras_major_version, limit=None):
-        i = 0
-        numerical_err_models = []
-        shape_err_models = []
-        numerical_failiure = 0
-        params = list(itertools.product(self.base_layer_params, self.rnn_layer_params))
-        np.random.shuffle(params)
-        params = [
-            param
-            for param in params
-            if valid_params(dict(zip(self.params_dict.keys(), param[0])))
-        ]
-        for base_params, rnn_params in params[:limit]:
-            base_params = dict(zip(self.params_dict.keys(), base_params))
-            rnn_params = dict(zip(self.simple_rnn_params_dict.keys(), rnn_params))
-            model = Sequential()
-            unroll = base_params["unroll"]
-            if base_params["input_dims"][1] == 1 and unroll is True:
-                unroll = False
-            if keras_major_version == 2:
-                model.add(
-                    SimpleRNN(
-                        base_params["output_dim"],
-                        input_shape=base_params["input_dims"][1:],
-                        activation=base_params["activation"],
-                        return_sequences=base_params["return_sequences"],
-                        go_backwards=base_params["go_backwards"],
-                        unroll=unroll,
-                    )
-                )
-            else:
-                model.add(
-                    SimpleRNN(
-                        base_params["output_dim"],
-                        input_length=base_params["input_dims"][1],
-                        input_dim=base_params["input_dims"][2],
-                        activation=base_params["activation"],
-                        return_sequences=base_params["return_sequences"],
-                        go_backwards=base_params["go_backwards"],
-                        unroll=unroll,
-                    )
-                )
-            mlkitmodel = get_mlkit_model_from_path(model)
-            input_data = generate_input(
-                base_params["input_dims"][0],
-                base_params["input_dims"][1],
-                base_params["input_dims"][2],
-            )
-            keras_preds = model.predict(input_data).flatten()
-            if K.tensorflow_backend._SESSION:
-                import tensorflow as tf
-
-                tf.reset_default_graph()
-                K.tensorflow_backend._SESSION.close()
-                K.tensorflow_backend._SESSION = None
-            input_data = np.transpose(input_data, [1, 0, 2])
-            if _is_macos() and _macos_version() >= (10, 13):
-                coreml_preds = mlkitmodel.predict({"data": input_data})[
-                    "output"
-                ].flatten()
-                try:
-                    self.assertEqual(coreml_preds.shape, keras_preds.shape)
-                except AssertionError:
-                    print(
-                        "Shape error:\nbase_params: {}\nkeras_preds.shape: {}\ncoreml_preds.shape: {}".format(
-                            base_params, keras_preds.shape, coreml_preds.shape
-                        )
-                    )
-                    shape_err_models.append(base_params)
-                    i += 1
-                    continue
-                try:
-                    max_denominator = np.maximum(
-                        np.maximum(np.abs(coreml_preds), np.abs(keras_preds)), 1.0
-                    )
-                    relative_error = (
-                        coreml_preds / max_denominator - keras_preds / max_denominator
-                    )
-                    for i in range(len(relative_error)):
-                        self.assertLessEqual(relative_error[i], 0.01)
-                except AssertionError:
-                    print(
-                        "Assertion error:\nbase_params: {}\nkeras_preds: {}\ncoreml_preds: {}".format(
-                            base_params, keras_preds, coreml_preds
-                        )
-                    )
-                    numerical_failiure += 1
-                    numerical_err_models.append(base_params)
-            i += 1
-
-        self.assertEqual(
-            shape_err_models, [], msg="Shape error models {}".format(shape_err_models)
-        )
-        self.assertEqual(
-            numerical_err_models,
-            [],
-            msg="Numerical error models {}\n"
-            "Total numerical failiures: {}/{}\n".format(
-                numerical_err_models, numerical_failiure, i
-            ),
-        )
-
-    @unittest.skipIf(not _HAS_KERAS_TF, "Missing keras 1. Skipping test.")
-    @pytest.mark.keras1
-    @pytest.mark.slow
-    def test_kers1_rnn_layer_stress(self):
-        self._test_rnn_layer(keras_major_version=1)
-
-    @unittest.skipIf(not _HAS_KERAS_TF, "Missing keras 1. Skipping test.")
-    @pytest.mark.keras1
-    def test_keras1_rnn_layer(self):
-        self._test_rnn_layer(keras_major_version=1, limit=10)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    @pytest.mark.slow
-    def test_keras2_rnn_layer_stress(self):
-        self._test_rnn_layer(keras_major_version=2)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    def test_keras2_rnn_layer(self):
-        self._test_rnn_layer(keras_major_version=2, limit=10)
-
-
-class LSTMLayer(RecurrentLayerTest):
-    """
-    Class for testing single RNN layer
-    """
-
-    def setUp(self):
-        super(LSTMLayer, self).setUp()
-        self.lstm_params_dict = dict(
-            inner_activation=["tanh", "linear", "sigmoid", "hard_sigmoid", "relu"],
-            bidirectional=[False],
-        )
-        self.lstm_layer_params = list(
-            itertools.product(*self.lstm_params_dict.values())
-        )
-
-    def _test_bilstm_layer(self, batched=False):
-        if not batched:
-            params_dict = dict(
-                input_dims=[[1, 5, 10], [1, 2, 5]],
-                output_dim=[1, 5, 10],
-                activation=["tanh", "linear", "sigmoid", "hard_sigmoid", "relu"],
-                inner_activation=["tanh", "linear", "sigmoid", "hard_sigmoid", "relu"],
-                return_sequences=[True, False],
-            )
-        else:
-            params_dict = dict(
-                input_dims=[[3, 5, 10], [6, 2, 5]],
-                output_dim=[1, 5, 10],
-                activation=["tanh", "linear", "sigmoid", "hard_sigmoid", "relu"],
-                inner_activation=["tanh", "linear", "sigmoid", "hard_sigmoid", "relu"],
-                return_sequences=[True, False],
-            )
-
-        params = list(itertools.product(*params_dict.values()))
-        ii = 0
-        i = 0
-        numerical_err_models = []
-        shape_err_models = []
-        numerical_failiure = 0
-        for param in params:
-            ii += 1
-            # print('-------------- %d / %d ------------------- ' % (ii, len(params)))
-            param = dict(zip(params_dict.keys(), param))
-
-            if param["activation"] == "linear":
-                keras_act = None
-            else:
-                keras_act = param["activation"]
-
-            if param["inner_activation"] == "linear":
-                keras_inner_act = None
-            else:
-                keras_inner_act = param["inner_activation"]
-
-            model = Sequential()
-            model.add(
-                Bidirectional(
-                    LSTM(
-                        param["output_dim"],
-                        activation=keras_act,
-                        recurrent_activation=keras_inner_act,
-                        return_sequences=param["return_sequences"],
-                        go_backwards=False,
-                        unroll=False,
-                    ),
-                    input_shape=(param["input_dims"][1], param["input_dims"][2]),
-                )
-            )
-            mlmodel = get_mlkit_model_from_path(model)
-
-            Batch = param["input_dims"][0]
-            Seq = param["input_dims"][1]
-            h = param["output_dim"]
-            input_size = param["input_dims"][2]
-
-            input_data = generate_input(Batch, Seq, input_size)
-
-            activations_to_test_with_numpy = {"linear", "relu"}
-            if (
-                param["activation"] in activations_to_test_with_numpy
-                or param["inner_activation"] in activations_to_test_with_numpy
-            ):
-                keras_preds = get_numpy_prediction_bilstm_batched(
-                    model, input_data
-                )  # (Batch, Seq, h)
-            else:
-                keras_preds = model.predict(input_data)  # (Batch, Seq, h)
-
-            if _is_macos() and _macos_version() >= (10, 13):
-                input_data = np.transpose(input_data, [1, 0, 2])
-                input_dict = {}
-                input_dict["data"] = input_data
-                input_dict["bidirectional_1_h_in"] = np.zeros(
-                    (1, Batch, h), dtype=np.float
-                )
-                input_dict["bidirectional_1_c_in"] = np.zeros(
-                    (1, Batch, h), dtype=np.float
-                )
-                input_dict["bidirectional_1_h_in_rev"] = np.zeros(
-                    (1, Batch, h), dtype=np.float
-                )
-                input_dict["bidirectional_1_c_in_rev"] = np.zeros(
-                    (1, Batch, h), dtype=np.float
-                )
-                coreml_preds = mlmodel.predict(input_dict)[
-                    "output"
-                ]  # (Seq, Batch, h, .. )
-                if param["return_sequences"]:
-                    coreml_preds = np.reshape(coreml_preds, [Seq, Batch, 2 * h])
-                else:
-                    coreml_preds = np.reshape(coreml_preds, [1, Batch, 2 * h])
-                    keras_preds = np.expand_dims(keras_preds, axis=1)
-                coreml_preds = np.transpose(coreml_preds, [1, 0, 2])
-
-                if K.tensorflow_backend._SESSION:
-                    import tensorflow as tf
-
-                    tf.reset_default_graph()
-                    K.tensorflow_backend._SESSION.close()
-                    K.tensorflow_backend._SESSION = None
-
-                try:
-                    self.assertEqual(coreml_preds.shape, keras_preds.shape)
-                except AssertionError:
-                    print(
-                        "Shape error:\n param: {}\n\n keras_preds.shape: {}\n\n coreml_preds.shape: {}".format(
-                            param, keras_preds.shape, coreml_preds.shape
-                        )
-                    )
-                    shape_err_models.append(param)
-                    i += 1
-                    continue
-                max_denominator = np.maximum(
-                    np.maximum(
-                        np.abs(coreml_preds.flatten()), np.abs(keras_preds.flatten())
-                    ),
-                    1.0,
-                )
-                relative_error = (
-                    coreml_preds.flatten() / max_denominator
-                    - keras_preds.flatten() / max_denominator
-                )
-                max_relative_error = np.amax(relative_error)
-                try:
-                    self.assertLessEqual(max_relative_error, 0.01)
-                except AssertionError:
-                    snr, psnr, signal_energy = _compute_SNR(keras_preds, coreml_preds)
-                    print("-*" * 80)
-                    print("Assertion error. \n param : {} \n".format(param))
-                    print(
-                        "max error = %.4f, snr = %.1f, psnr = %.1f, energy = %.6f"
-                        % (max_relative_error, snr, psnr, signal_energy)
-                    )
-                    print(
-                        "keras preds shape: {}, coreml preds shape = {}".format(
-                            str(keras_preds.shape), str(coreml_preds.shape)
-                        )
-                    )
-                    # for b in range(Batch):
-                    #     snr, psnr, signal_energy = _compute_SNR(keras_preds[b, :, :], coreml_preds[b, :, :])
-                    #     print('snr = %.1f, psnr = %.1f, energy = %.6f' % (snr, psnr, signal_energy))
-                    #     print('batch id = {}, keras_preds = \n{} '.format(b, keras_preds[b, :, :]))
-                    #     print('batch id = {}, coreml_preds = \n{} '.format(b, coreml_preds[b, :, :]))
-                    print("-*" * 80)
-
-                    numerical_failiure += 1
-                    numerical_err_models.append(param)
-                    continue
-
-            i += 1
-
-        self.assertEqual(
-            shape_err_models, [], msg="Shape error models {}".format(shape_err_models)
-        )
-        self.assertEqual(
-            numerical_err_models,
-            [],
-            msg="Numerical error models {}".format(numerical_err_models),
-        )
-
-    def _test_batched_lstm_layer(self):
-        params_dict = dict(
-            input_dims=[[3, 5, 10], [6, 2, 5]],
-            output_dim=[1, 5, 10],
-            activation=["tanh", "linear", "sigmoid", "hard_sigmoid", "relu"],
-            inner_activation=["tanh", "linear", "sigmoid", "hard_sigmoid", "relu"],
-            return_sequences=[True, False],
-        )
-        params = list(itertools.product(*params_dict.values()))
-        ii = 0
-        i = 0
-        numerical_err_models = []
-        shape_err_models = []
-        numerical_failiure = 0
-        for param in params:
-            ii += 1
-            # print('-------------- %d / %d ------------------- ' % (ii, len(params)))
-            param = dict(zip(params_dict.keys(), param))
-
-            if param["activation"] == "linear":
-                keras_act = None
-            else:
-                keras_act = param["activation"]
-
-            if param["inner_activation"] == "linear":
-                keras_inner_act = None
-            else:
-                keras_inner_act = param["inner_activation"]
-
-            model = Sequential()
-            model.add(
-                LSTM(
-                    param["output_dim"],
-                    input_shape=(param["input_dims"][1], param["input_dims"][2]),
-                    activation=keras_act,
-                    recurrent_activation=keras_inner_act,
-                    return_sequences=param["return_sequences"],
-                    go_backwards=False,
-                    unroll=False,
-                )
-            )
-
-            mlmodel = get_mlkit_model_from_path(model)
-
-            Batch = param["input_dims"][0]
-            Seq = param["input_dims"][1]
-            h = param["output_dim"]
-            input_size = param["input_dims"][2]
-
-            input_data = generate_input(Batch, Seq, input_size)
-
-            keras_preds = model.predict(input_data)  # (Batch, Seq, h)
-
-            if _is_macos() and _macos_version() >= (10, 13):
-                input_data = np.transpose(input_data, [1, 0, 2])
-                input_dict = {}
-                input_dict["data"] = input_data
-                input_dict["lstm_1_h_in"] = np.zeros((1, Batch, h), dtype=np.float)
-                input_dict["lstm_1_c_in"] = np.zeros((1, Batch, h), dtype=np.float)
-                coreml_preds = mlmodel.predict(input_dict)[
-                    "output"
-                ]  # (Seq, Batch, h, .. )
-                if param["return_sequences"]:
-                    coreml_preds = np.reshape(coreml_preds, [Seq, Batch, h])
-                else:
-                    coreml_preds = np.reshape(coreml_preds, [1, Batch, h])
-                    keras_preds = np.expand_dims(keras_preds, axis=1)
-                coreml_preds = np.transpose(coreml_preds, [1, 0, 2])
-
-                if K.tensorflow_backend._SESSION:
-                    import tensorflow as tf
-
-                    tf.reset_default_graph()
-                    K.tensorflow_backend._SESSION.close()
-                    K.tensorflow_backend._SESSION = None
-
-                try:
-                    self.assertEqual(coreml_preds.shape, keras_preds.shape)
-                except AssertionError:
-                    print(
-                        "Shape error:\n param: {}\n\n keras_preds.shape: {}\n\n coreml_preds.shape: {}".format(
-                            param, keras_preds.shape, coreml_preds.shape
-                        )
-                    )
-                    shape_err_models.append(param)
-                    i += 1
-                    continue
-                try:
-                    max_denominator = np.maximum(
-                        np.maximum(
-                            np.abs(coreml_preds.flatten()),
-                            np.abs(keras_preds.flatten()),
-                        ),
-                        1.0,
-                    )
-                    relative_error = (
-                        coreml_preds.flatten() / max_denominator
-                        - keras_preds.flatten() / max_denominator
-                    )
-                    max_relative_error = np.amax(relative_error)
-                    self.assertLessEqual(max_relative_error, 0.01)
-                except AssertionError:
-                    snr, psnr, signal_energy = _compute_SNR(keras_preds, coreml_preds)
-                    print("-*" * 80)
-                    print("Assertion error. \n param : {} \n".format(param))
-                    print(
-                        "max error = %.4f, snr = %.1f, psnr = %.1f, energy = %.6f"
-                        % (max_relative_error, snr, psnr, signal_energy)
-                    )
-                    print(
-                        "keras preds shape: {}, coreml preds shape = {}".format(
-                            str(keras_preds.shape), str(coreml_preds.shape)
-                        )
-                    )
-                    # for b in range(Batch):
-                    #     snr, psnr, signal_energy = _compute_SNR(keras_preds[b, :, :], coreml_preds[b, :, :])
-                    #     print('snr = %.1f, psnr = %.1f, energy = %.6f' % (snr, psnr, signal_energy))
-                    #     print('batch id = {}, keras_preds = \n{} '.format(b, keras_preds[b, :, :]))
-                    #     print('batch id = {}, coreml_preds = \n{} '.format(b, coreml_preds[b, :, :]))
-                    print("-*" * 80)
-
-                    numerical_failiure += 1
-                    numerical_err_models.append(param)
-                    continue
-
-            i += 1
-
-        self.assertEqual(
-            shape_err_models, [], msg="Shape error models {}".format(shape_err_models)
-        )
-        self.assertEqual(
-            numerical_err_models,
-            [],
-            msg="Numerical error models {}".format(numerical_err_models),
-        )
-
-    def _test_lstm_layer(self, keras_major_version, limit=None):
-        numerical_err_models = []
-        shape_err_models = []
-        numerical_failiure = 0
-        params = list(itertools.product(self.base_layer_params, self.lstm_layer_params))
-        np.random.shuffle(params)
-
-        params = [
-            param
-            for param in params
-            if valid_params(dict(zip(self.params_dict.keys(), param[0])))
-        ]
-        ctr = 0
-        for base_params, lstm_params in params[:limit]:
-            ctr += 1
-            # print('--------------- Testing %d/%d ---------------' %(ctr, len(params)))
-            base_params = dict(zip(self.params_dict.keys(), base_params))
-            lstm_params = dict(zip(self.lstm_params_dict.keys(), lstm_params))
-            model = Sequential()
-            unroll = base_params["unroll"]
-            if base_params["input_dims"][1] == 1 and unroll is True:
-                unroll = False
-            if lstm_params["bidirectional"] is True:
-                if keras_major_version == 2:
-                    model.add(
-                        Bidirectional(
-                            LSTM(
-                                base_params["output_dim"],
-                                activation=base_params["activation"],
-                                recurrent_activation=lstm_params["inner_activation"],
-                                return_sequences=base_params["return_sequences"],
-                                go_backwards=False,
-                                unroll=unroll,
-                            ),
-                            input_shape=(
-                                base_params["input_dims"][1],
-                                base_params["input_dims"][2],
-                            ),
-                        )
-                    )
-                else:
-                    model.add(
-                        Bidirectional(
-                            LSTM(
-                                base_params["output_dim"],
-                                activation=base_params["activation"],
-                                inner_activation=lstm_params["inner_activation"],
-                                return_sequences=base_params["return_sequences"],
-                                go_backwards=False,
-                                unroll=unroll,
-                            ),
-                            input_shape=(
-                                base_params["input_dims"][1],
-                                base_params["input_dims"][2],
-                            ),
-                        )
-                    )
-            else:
-                if keras_major_version == 2:
-                    model.add(
-                        LSTM(
-                            base_params["output_dim"],
-                            input_shape=(
-                                base_params["input_dims"][1],
-                                base_params["input_dims"][2],
-                            ),
-                            activation=base_params["activation"],
-                            recurrent_activation=lstm_params["inner_activation"],
-                            return_sequences=base_params["return_sequences"],
-                            go_backwards=base_params["go_backwards"],
-                            unroll=unroll,
-                        )
-                    )
-                else:
-                    model.add(
-                        LSTM(
-                            base_params["output_dim"],
-                            input_shape=(
-                                base_params["input_dims"][1],
-                                base_params["input_dims"][2],
-                            ),
-                            activation=base_params["activation"],
-                            inner_activation=lstm_params["inner_activation"],
-                            return_sequences=base_params["return_sequences"],
-                            go_backwards=base_params["go_backwards"],
-                            unroll=unroll,
-                        )
-                    )
-            mlkitmodel = get_mlkit_model_from_path(model)
-            input_data = generate_input(
-                base_params["input_dims"][0],
-                base_params["input_dims"][1],
-                base_params["input_dims"][2],
-            )
-
-            activations_to_test_with_numpy = {"linear", "relu"}
-            if (
-                base_params["activation"] in activations_to_test_with_numpy
-                or lstm_params["inner_activation"] in activations_to_test_with_numpy
-            ):
-                if lstm_params["bidirectional"]:
-                    keras_preds = get_numpy_prediction_bilstm(
-                        model, input_data
-                    ).flatten()
-                else:
-                    keras_preds = get_numpy_prediction_unilstm(
-                        model, input_data
-                    ).flatten()
-            else:
-                keras_preds = model.predict(input_data).flatten()
-
-            if _is_macos() and _macos_version() >= (10, 13):
-                input_data = np.transpose(input_data, [1, 0, 2])
-                coreml_preds = mlkitmodel.predict({"data": input_data})[
-                    "output"
-                ].flatten()
-
-                if K.tensorflow_backend._SESSION:
-                    import tensorflow as tf
-
-                    tf.reset_default_graph()
-                    K.tensorflow_backend._SESSION.close()
-                    K.tensorflow_backend._SESSION = None
-
-                try:
-                    self.assertEqual(coreml_preds.shape, keras_preds.shape)
-                except AssertionError:
-                    print(
-                        "Shape error:\n base_params: {}\n\n lstm_params: {}\n\n keras_preds.shape: {}\n\n coreml_preds.shape: {}".format(
-                            base_params,
-                            lstm_params,
-                            keras_preds.shape,
-                            coreml_preds.shape,
-                        )
-                    )
-                    shape_err_models.append(base_params)
-                    continue
-
-                max_denominator = np.maximum(
-                    np.maximum(np.abs(coreml_preds), np.abs(keras_preds)), 1.0
-                )
-                try:
-                    relative_error = (
-                        coreml_preds / max_denominator - keras_preds / max_denominator
-                    )
-                    for i in range(len(relative_error)):
-                        self.assertLessEqual(relative_error[i], 0.01)
-                except AssertionError:
-                    print(
-                        "Assertion error:\n base_params: {}\n lstm_params: {}\n\n keras_preds: {}\n\n coreml_preds: {}\n\n\n keras_preds: {}\n\n\n coreml_preds: {}\n".format(
-                            base_params,
-                            lstm_params,
-                            keras_preds / max_denominator,
-                            coreml_preds / max_denominator,
-                            keras_preds,
-                            coreml_preds,
-                        )
-                    )
-                    numerical_failiure += 1
-                    numerical_err_models.append(base_params)
-
-        self.assertEqual(
-            shape_err_models, [], msg="Shape error models {}".format(shape_err_models)
-        )
-        self.assertEqual(
-            numerical_err_models,
-            [],
-            msg="Numerical error models {}".format(numerical_err_models),
-        )
-
-    @unittest.skipIf(not _HAS_KERAS_TF, "Missing keras 1. Skipping test.")
-    @pytest.mark.keras1
-    @pytest.mark.slow
-    def test_keras_lstm_layer_stress(self):
-        self._test_lstm_layer(keras_major_version=1)
-
-    @unittest.skipIf(not _HAS_KERAS_TF, "Missing keras 1. Skipping test.")
-    @pytest.mark.keras1
-    def test_keras_lstm_layer(self):
-        self._test_lstm_layer(keras_major_version=1, limit=10)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    @pytest.mark.slow
-    def test_keras2_lstm_layer_stress(self):
-        self._test_lstm_layer(keras_major_version=2)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    def test_keras2_lstm_layer(self):
-        self._test_lstm_layer(keras_major_version=2, limit=10)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    @pytest.mark.slow
-    def test_keras2_bilstm_layer(self):
-        self._test_bilstm_layer()
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    @pytest.mark.slow
-    def test_keras2_bilstm_layer_batched(self):
-        self._test_bilstm_layer(batched=True)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    @pytest.mark.slow
-    def test_keras2_lstm_layer_batched(self):
-        self._test_batched_lstm_layer()
-
-
-class GRULayer(RecurrentLayerTest):
-    """
-    Class for testing GRU layer
-    """
-
-    def setUp(self):
-        super(GRULayer, self).setUp()
-        self.gru_params_dict = dict(
-            inner_activation=["tanh", "linear", "sigmoid", "hard_sigmoid", "relu"]
-        )
-        self.gru_layer_params = list(itertools.product(*self.gru_params_dict.values()))
-
-    def _test_gru_layer(self, keras_major_version, limit=None):
-        i = 0
-        numerical_err_models = []
-        shape_err_models = []
-        numerical_failiure = 0
-        params = list(itertools.product(self.base_layer_params, self.gru_layer_params))
-        np.random.shuffle(params)
-        params = [
-            param
-            for param in params
-            if valid_params(dict(zip(self.params_dict.keys(), param[0])))
-        ]
-        for base_params, gru_params in params[:limit]:
-            base_params = dict(zip(self.params_dict.keys(), base_params))
-            gru_params = dict(zip(self.gru_params_dict.keys(), gru_params))
-            model = Sequential()
-            unroll = base_params["unroll"]
-            if base_params["input_dims"][1] == 1 and unroll is True:
-                unroll = False
-            if keras_major_version == 2:
-                model.add(
-                    GRU(
-                        base_params["output_dim"],
-                        input_shape=(
-                            base_params["input_dims"][1],
-                            base_params["input_dims"][2],
-                        ),
-                        activation=base_params["activation"],
-                        recurrent_activation=gru_params["inner_activation"],
-                        return_sequences=base_params["return_sequences"],
-                        go_backwards=base_params["go_backwards"],
-                        unroll=unroll,
-                    )
-                )
-            else:
-                model.add(
-                    GRU(
-                        base_params["output_dim"],
-                        input_length=base_params["input_dims"][1],
-                        input_dim=base_params["input_dims"][2],
-                        activation=base_params["activation"],
-                        inner_activation=gru_params["inner_activation"],
-                        return_sequences=base_params["return_sequences"],
-                        go_backwards=base_params["go_backwards"],
-                        unroll=unroll,
-                    )
-                )
-            model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-            mlkitmodel = get_mlkit_model_from_path(model)
-            input_data = generate_input(
-                base_params["input_dims"][0],
-                base_params["input_dims"][1],
-                base_params["input_dims"][2],
-            )
-
-            activations_to_test_with_numpy = {"linear", "relu"}
-            if (
-                base_params["activation"] in activations_to_test_with_numpy
-                or gru_params["inner_activation"] in activations_to_test_with_numpy
-            ):
-                keras_preds = get_numpy_prediction_gru(model, input_data).flatten()
-            else:
-                keras_preds = model.predict(input_data).flatten()
-
-            if _is_macos() and _macos_version() >= (10, 13):
-                input_data = np.transpose(input_data, [1, 0, 2])
-                coreml_preds = mlkitmodel.predict({"data": input_data})[
-                    "output"
-                ].flatten()
-                if K.tensorflow_backend._SESSION:
-                    import tensorflow as tf
-
-                    tf.reset_default_graph()
-                    K.tensorflow_backend._SESSION.close()
-                    K.tensorflow_backend._SESSION = None
-                try:
-                    self.assertEqual(coreml_preds.shape, keras_preds.shape)
-                except AssertionError:
-                    print(
-                        "Shape error:\nbase_params: {}\n gru_params: {}\nkeras_preds.shape: {}\ncoreml_preds.shape: {}".format(
-                            base_params,
-                            gru_params,
-                            keras_preds.shape,
-                            coreml_preds.shape,
-                        )
-                    )
-                    shape_err_models.append(base_params)
-                    i += 1
-                    continue
-
-                max_denominator = np.maximum(
-                    np.maximum(np.abs(coreml_preds), np.abs(keras_preds)), 1.0
-                )
-                try:
-                    relative_error = (
-                        coreml_preds / max_denominator - keras_preds / max_denominator
-                    )
-                    for i in range(len(relative_error)):
-                        self.assertLessEqual(relative_error[i], 0.01)
-                except AssertionError:
-                    print(
-                        "===============Assertion error:\n base_params: {}\n gru_params: {}\n\n keras_preds: {}\n\n coreml_preds: {}\n\n\n keras_preds: {}\n\n\n coreml_preds: {}\n".format(
-                            base_params,
-                            gru_params,
-                            keras_preds / max_denominator,
-                            coreml_preds / max_denominator,
-                            keras_preds,
-                            coreml_preds,
-                        )
-                    )
-                    numerical_failiure += 1
-                    numerical_err_models.append(base_params)
-            i += 1
-
-        self.assertEqual(
-            shape_err_models, [], msg="Shape error models {}".format(shape_err_models)
-        )
-        self.assertEqual(
-            numerical_err_models,
-            [],
-            msg="Numerical error models {}".format(numerical_err_models),
-        )
-
-    @unittest.skipIf(not _HAS_KERAS_TF, "Missing keras 1. Skipping test.")
-    @pytest.mark.keras1
-    @pytest.mark.slow
-    def test_keras1_test_gru_layer_stress(self):
-        self._test_gru_layer(keras_major_version=1)
-
-    @unittest.skipIf(not _HAS_KERAS_TF, "Missing keras 1. Skipping test.")
-    @pytest.mark.keras1
-    def test_keras1_test_gru_layer(self):
-        self._test_gru_layer(keras_major_version=1, limit=10)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    @pytest.mark.slow
-    def test_keras2_test_gru_layer_stress(self):
-        self._test_gru_layer(keras_major_version=2)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    def test_keras2_test_gru_layer(self):
-        self._test_gru_layer(keras_major_version=2, limit=10)
-
-
-class LSTMStacked(unittest.TestCase):
-    """
-    Class for testing LSTMStacked
-    """
-
-    def setUp(self):
-        self.params_dict = dict(
-            input_dims=[[1, 1, 1], [1, 2, 5], [1, 5, 10]],
-            output_dim=[1, 5, 10, 20],
-            stateful=[False],
-            go_backwards=[False],
-            unroll=[True],
-            return_sequences=[True],
-            top_return_sequences=[True, False],
-            activation=["tanh", "sigmoid", "hard_sigmoid"],
-            number_of_layers=[1, 2, 3],
-        )
-        self.base_layer_params = list(itertools.product(*self.params_dict.values()))
-
-    def _test_lstm_stacked(self, keras_major_version, limit=None):
-        numerical_err_models = []
-        shape_err_models = []
-        numerical_failiure = 0
-        params = copy(self.base_layer_params)
-        np.random.shuffle(params)
-        i = 0
-        params = [
-            param
-            for param in params
-            if valid_params(dict(zip(self.params_dict.keys(), param)))
-        ]
-        for base_params in params[:limit]:
-            base_params = dict(zip(self.params_dict.keys(), base_params))
-            model = Sequential()
-            unroll = base_params["unroll"]
-            if base_params["input_dims"][1] == 1 and unroll is True:
-                unroll = False
-            settings = dict(
-                activation=base_params["activation"],
-                return_sequences=True,
-                go_backwards=base_params["go_backwards"],
-                unroll=unroll,
-            )
-            if keras_major_version == 2:
-                model.add(
-                    LSTM(
-                        base_params["output_dim"],
-                        input_shape=base_params["input_dims"][1:],
-                        recurrent_activation="sigmoid",
-                        **settings
-                    )
-                )
-                for idx in range(0, base_params["number_of_layers"]):
-                    model.add(
-                        LSTM(
-                            base_params["output_dim"],
-                            input_shape=(
-                                base_params["input_dims"][1],
-                                base_params["output_dim"],
-                            ),
-                            return_sequences=True,
-                            activation="tanh",
-                            recurrent_activation="sigmoid",
-                        )
-                    )
-                model.add(
-                    LSTM(
-                        10,
-                        input_shape=(
-                            base_params["input_dims"][1],
-                            base_params["output_dim"],
-                        ),
-                        return_sequences=base_params["top_return_sequences"],
-                        activation="sigmoid",
-                    )
-                )
-
-            else:
-                model.add(
-                    LSTM(
-                        output_dim=base_params["output_dim"],
-                        input_length=base_params["input_dims"][1],
-                        input_dim=base_params["input_dims"][2],
-                        inner_activation="sigmoid",
-                        **settings
-                    )
-                )
-                for idx in range(0, base_params["number_of_layers"]):
-                    model.add(
-                        LSTM(
-                            output_dim=base_params["output_dim"],
-                            return_sequences=True,
-                            activation="tanh",
-                            inner_activation="sigmoid",
-                        )
-                    )
-                model.add(
-                    LSTM(
-                        output_dim=10,
-                        return_sequences=base_params["top_return_sequences"],
-                        activation="sigmoid",
-                    )
-                )
-            mlkitmodel = get_mlkit_model_from_path(model)
-            input_data = generate_input(
-                base_params["input_dims"][0],
-                base_params["input_dims"][1],
-                base_params["input_dims"][2],
-            )
-            if _is_macos() and _macos_version() >= (10, 13):
-                keras_preds = model.predict(input_data).flatten()
-                input_data = np.transpose(input_data, [1, 0, 2])
-                coreml_preds = mlkitmodel.predict({"data": input_data})[
-                    "output"
-                ].flatten()
-                import tensorflow as tf
-
-                tf.reset_default_graph()
-                K.tensorflow_backend._SESSION.close()
-                K.tensorflow_backend._SESSION = None
-                try:
-                    self.assertEqual(coreml_preds.shape, keras_preds.shape)
-                except AssertionError:
-                    print(
-                        "Shape error:\nbase_params: {}\nkeras_preds.shape: {}\ncoreml_preds.shape: {}".format(
-                            base_params, keras_preds.shape, coreml_preds.shape
-                        )
-                    )
-                    shape_err_models.append(base_params)
-                    i += 1
-                    continue
-                try:
-                    max_denominator = np.maximum(
-                        np.maximum(np.abs(coreml_preds), np.abs(keras_preds)), 1.0
-                    )
-                    relative_error = (
-                        coreml_preds / max_denominator - keras_preds / max_denominator
-                    )
-                    for i in range(len(relative_error)):
-                        self.assertLessEqual(relative_error[i], 0.01)
-                except AssertionError:
-                    print(
-                        "Assertion error:\nbase_params: {}\nkeras_preds: {}\ncoreml_preds: {}".format(
-                            base_params, keras_preds, coreml_preds
-                        )
-                    )
-                    numerical_failiure += 1
-                    numerical_err_models.append(base_params)
-            i += 1
-        self.assertEqual(
-            shape_err_models, [], msg="Shape error models {}".format(shape_err_models)
-        )
-        self.assertEqual(
-            numerical_err_models,
-            [],
-            msg="Numerical error models {}".format(numerical_err_models),
-        )
-
-    @unittest.skipIf(not _HAS_KERAS_TF, "Missing keras 1. Skipping test.")
-    @pytest.mark.keras1
-    @pytest.mark.slow
-    def test_keras1_lstm_stacked_stress(self):
-        self._test_lstm_stacked(keras_major_version=1)
-
-    @unittest.skipIf(not _HAS_KERAS_TF, "Missing keras 1. Skipping test.")
-    @pytest.mark.keras1
-    def test_keras1_lstm_stacked(self):
-        self._test_lstm_stacked(keras_major_version=1, limit=10)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    @pytest.mark.slow
-    def test_keras2_lstm_stacked_stress(self):
-        self._test_lstm_stacked(keras_major_version=2)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    def test_keras2_lstm_stacked(self):
-        self._test_lstm_stacked(keras_major_version=2, limit=10)
-
-
-class DifferentIOModelsTypes(unittest.TestCase):
-    """
-    Class for testing different I/O combinations for LSTMS
-    """
-
-    def _test_one_to_many(self, keras_major_version):
-        params = (
-            dict(
-                input_dims=[1, 10],
-                activation="tanh",
-                return_sequences=False,
-                output_dim=3,
-            ),
-        )
-        number_of_times = 4
-        model = Sequential()
-        model.add(RepeatVector(number_of_times, input_shape=(10,)))
-
-        if keras_major_version == 2:
-            model.add(
-                LSTM(
-                    params[0]["output_dim"],
-                    input_shape=params[0]["input_dims"],
-                    activation=params[0]["activation"],
-                    recurrent_activation="sigmoid",
-                    return_sequences=True,
-                )
-            )
-        else:
-            model.add(
-                LSTM(
-                    output_dim=params[0]["output_dim"],
-                    activation=params[0]["activation"],
-                    inner_activation="sigmoid",
-                    return_sequences=True,
-                )
-            )
-        relative_error, keras_preds, coreml_preds = simple_model_eval(params, model)
-        # print relative_error, '\n', keras_preds, '\n', coreml_preds, '\n'
-        for i in range(len(relative_error)):
-            self.assertLessEqual(relative_error[i], 0.01)
-
-    def _test_many_to_one(self, keras_major_version):
-        params = (
-            dict(
-                input_dims=[1, 10, 5],
-                go_backwards=False,
-                activation="tanh",  # fails with hard_sigmoid
-                stateful=False,
-                unroll=False,
-                return_sequences=False,
-                output_dim=1,
-            ),
-        )
-        model = Sequential()
-        if keras_major_version == 2:
-            model.add(
-                LSTM(
-                    params[0]["output_dim"],
-                    input_shape=params[0]["input_dims"][1:],
-                    activation=params[0]["activation"],
-                    recurrent_activation="sigmoid",
-                )
-            )
-        else:
-            model.add(
-                LSTM(
-                    output_dim=params[0]["output_dim"],
-                    input_shape=params[0]["input_dims"][1:],
-                    activation=params[0]["activation"],
-                    inner_activation="sigmoid",
-                )
-            )
-        relative_error, keras_preds, coreml_preds = simple_model_eval(params, model)
-        # print relative_error, '\n', keras_preds, '\n', coreml_preds, '\n'
-        for i in range(len(relative_error)):
-            self.assertLessEqual(relative_error[i], 0.01)
-
-    def _test_many_to_many(self, keras_major_version):
-        params = (
-            dict(
-                input_dims=[1, 10, 5],
-                go_backwards=False,
-                activation="tanh",  # fails with hard_sigmoid
-                stateful=False,
-                unroll=False,
-                return_sequences=True,
-                output_dim=1,
-            ),
-        )
-        model = Sequential()
-        if keras_major_version == 2:
-            model.add(
-                LSTM(
-                    params[0]["output_dim"],
-                    input_shape=params[0]["input_dims"][1:],
-                    activation=params[0]["activation"],
-                    recurrent_activation="sigmoid",
-                    return_sequences=True,
-                )
-            )
-        else:
-            model.add(
-                LSTM(
-                    output_dim=params[0]["output_dim"],
-                    input_shape=params[0]["input_dims"][1:],
-                    activation=params[0]["activation"],
-                    inner_activation="sigmoid",
-                    return_sequences=True,
-                )
-            )
-        relative_error, keras_preds, coreml_preds = simple_model_eval(params, model)
-        # print relative_error, '\n', keras_preds, '\n', coreml_preds, '\n'
-        for i in range(len(relative_error)):
-            self.assertLessEqual(relative_error[i], 0.01)
-
-    @unittest.skipIf(not _HAS_KERAS_TF, "Missing keras 1. Skipping test.")
-    @pytest.mark.keras1
-    def test_keras1_test_one_to_many(self):
-        self._test_one_to_many(keras_major_version=1)
-
-    @unittest.skipIf(not _HAS_KERAS_TF, "Missing keras 1. Skipping test.")
-    @pytest.mark.keras1
-    def test_keras1_test_many_to_one(self):
-        self._test_many_to_one(keras_major_version=1)
-
-    @unittest.skipIf(not _HAS_KERAS_TF, "Missing keras 1. Skipping test.")
-    @pytest.mark.keras1
-    def test_keras1_many_to_many(self):
-        self._test_many_to_many(keras_major_version=1)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    def test_keras2_test_one_to_many(self):
-        self._test_one_to_many(keras_major_version=2)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    def test_keras2_test_many_to_one(self):
-        self._test_many_to_one(keras_major_version=2)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    def test_keras2_many_to_many(self):
-        self._test_many_to_many(keras_major_version=2)
-
-
-@unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-@pytest.mark.keras2
-class InitialStateRecurrentModels(unittest.TestCase):
-    """
-    This test class sets initial states to the recurrent nodes and then test
-    """
-
-    @unittest.skip("failing - TODO re-enable when it passes consistently")
-    def test_initial_state_GRU(self):
-        data = np.random.rand(1, 1, 2)
-
-        model = keras.models.Sequential()
-        model.add(
-            keras.layers.GRU(
-                5, input_shape=(1, 2), batch_input_shape=[1, 1, 2], stateful=True
-            )
-        )
-        model.get_layer(index=1).reset_states()
-
-        coreml_model = keras_converter.convert(
-            model=model, input_names="data", output_names="output"
-        )
-        if _is_macos() and _macos_version() >= (10, 13):
-            keras_output_1 = model.predict(data)
-            coreml_full_output_1 = coreml_model.predict({"data": data})
-            coreml_output_1 = coreml_full_output_1["output"]
-            coreml_output_1 = np.expand_dims(coreml_output_1, 1)
-
-            np.testing.assert_array_almost_equal(coreml_output_1.T, keras_output_1)
-
-        hidden_state = np.random.rand(1, 5)
-        model.get_layer(index=1).reset_states(hidden_state)
-        coreml_model = keras_converter.convert(
-            model=model, input_names="data", output_names="output"
-        )
-        spec = coreml_model.get_spec()
-        if _is_macos() and _macos_version() >= (10, 13):
-            keras_output_2 = model.predict(data)
-            coreml_full_output_2 = coreml_model.predict(
-                {"data": data, spec.description.input[1].name: hidden_state[0]}
-            )
-            coreml_output_2 = coreml_full_output_2["output"]
-            coreml_output_2 = np.expand_dims(coreml_output_2, 1)
-            np.testing.assert_array_almost_equal(coreml_output_2.T, keras_output_2)
-
-    def test_initial_state_SimpleRNN(self):
-        data = np.random.rand(1, 1, 2)
-        model = keras.models.Sequential()
-        model.add(
-            keras.layers.SimpleRNN(
-                5, input_shape=(1, 2), batch_input_shape=[1, 1, 2], stateful=True
-            )
-        )
-        model.get_layer(index=1).reset_states()
-        coreml_model = keras_converter.convert(
-            model=model, input_names="data", output_names="output"
-        )
-        if _is_macos() and _macos_version() >= (10, 13):
-            keras_output_1 = model.predict(data)
-            coreml_full_output_1 = coreml_model.predict({"data": data})
-            coreml_output_1 = coreml_full_output_1["output"]
-            coreml_output_1 = np.expand_dims(coreml_output_1, 1)
-            np.testing.assert_array_almost_equal(coreml_output_1.T, keras_output_1)
-
-        hidden_state = np.random.rand(1, 5)
-        model.get_layer(index=1).reset_states(hidden_state)
-        coreml_model = keras_converter.convert(
-            model=model, input_names="data", output_names="output"
-        )
-        spec = coreml_model.get_spec()
-        if _is_macos() and _macos_version() >= (10, 13):
-            keras_output_2 = model.predict(data)
-            coreml_full_output_2 = coreml_model.predict(
-                {"data": data, spec.description.input[1].name: hidden_state[0]}
-            )
-            coreml_output_2 = coreml_full_output_2["output"]
-            coreml_output_2 = np.expand_dims(coreml_output_2, 1)
-            np.testing.assert_array_almost_equal(coreml_output_2.T, keras_output_2)
-
-    def test_initial_state_LSTM(self):
-        data = np.random.rand(1, 1, 2)
-
-        model = keras.models.Sequential()
-        model.add(
-            keras.layers.LSTM(
-                5, input_shape=(1, 2), batch_input_shape=[1, 1, 2], stateful=True
-            )
-        )
-        model.get_layer(index=1).reset_states()
-
-        if _is_macos() and _macos_version() >= (10, 13):
-            coreml_model = keras_converter.convert(
-                model=model, input_names="data", output_names="output"
-            )
-
-            keras_output_1 = model.predict(data)
-            coreml_full_output_1 = coreml_model.predict({"data": data})
-            coreml_output_1 = coreml_full_output_1["output"]
-            coreml_output_1 = np.expand_dims(coreml_output_1, 1)
-
-            np.testing.assert_array_almost_equal(coreml_output_1.T, keras_output_1)
-
-        hidden_state = (np.random.rand(1, 5), np.random.rand(1, 5))
-        model.get_layer(index=1).reset_states(hidden_state)
-
-        coreml_model = keras_converter.convert(
-            model=model, input_names="data", output_names="output"
-        )
-        spec = coreml_model.get_spec()
-
-        if _is_macos() and _macos_version() >= (10, 13):
-            keras_output_2 = model.predict(data)
-            coreml_full_output_2 = coreml_model.predict(
-                {
-                    "data": data,
-                    spec.description.input[1].name: hidden_state[0][0],
-                    spec.description.input[2].name: hidden_state[1][0],
-                }
-            )
-            coreml_output_2 = coreml_full_output_2["output"]
-            coreml_output_2 = np.expand_dims(coreml_output_2, 1)
-
-            np.testing.assert_array_almost_equal(coreml_output_2.T, keras_output_2)
-
-
-if __name__ == "__main__":
-    # unittest.main()
-    ## To run a specific test:
-    suite = unittest.TestSuite()
-    suite.addTest(LSTMLayer("test_keras2_bilstm_layer"))
-    unittest.TextTestRunner().run(suite)
diff --git a/coremltools/test/neural_network/test_simple_nn_inference.py b/coremltools/test/neural_network/test_simple_nn_inference.py
index 01ddd7c1e..3eaf7cbfc 100644
--- a/coremltools/test/neural_network/test_simple_nn_inference.py
+++ b/coremltools/test/neural_network/test_simple_nn_inference.py
@@ -8,7 +8,7 @@
 import numpy as np
 
 import coremltools
-from coremltools import utils
+from coremltools import ComputeUnit, utils
 from coremltools.models import neural_network as neural_network
 import coremltools.models.datatypes as datatypes
 
@@ -39,9 +39,9 @@ def test_lrn_model(tmpdir):
         coremltools.models.utils.save_spec(builder.spec, model_path)
 
         try:
-            model = coremltools.models.MLModel(model_path)
+            model = coremltools.models.MLModel(model_path, compute_units=ComputeUnit.CPU_ONLY)
             if utils._macos_version() >= (10, 13):
-                out = model.predict(input, useCPUOnly=True)
+                out = model.predict(input)
         except RuntimeError as e:
             print(e)
             assert str(e) == "Error compiling model: \"The file couldn’t be saved.\"."
diff --git a/coremltools/test/neural_network/test_simple_recurrent_single_layer.py b/coremltools/test/neural_network/test_simple_recurrent_single_layer.py
deleted file mode 100644
index d2396d656..000000000
--- a/coremltools/test/neural_network/test_simple_recurrent_single_layer.py
+++ /dev/null
@@ -1,486 +0,0 @@
-# Copyright (c) 2017, Apple Inc. All rights reserved.
-#
-# Use of this source code is governed by a BSD-3-clause license that can be
-# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-
-import unittest
-import numpy as np
-import os
-import shutil
-import tempfile
-import itertools
-import coremltools
-from coremltools._deps import _HAS_KERAS_TF, _HAS_KERAS2_TF
-from coremltools.models.utils import _macos_version, _is_macos
-import pytest
-
-if _HAS_KERAS_TF or _HAS_KERAS2_TF:
-    from keras.models import Sequential
-    from keras.layers import LSTM, GRU, SimpleRNN
-    from coremltools.converters import keras as keras_converter
-
-
-def _get_mlkit_model_from_path(model, model_path):
-    from coremltools.converters import keras as keras_converter
-
-    print("converting")
-    model = keras_converter.convert(model, ["data"], ["output"])
-    return model
-
-
-def generate_input(dim0, dim1, dim2):
-    input_data = np.random.rand(dim0, dim1, dim2).astype(
-        "f"
-    )  # astype() should be removed
-    return input_data
-
-
-def valid_params(params):
-    """Checks if this combination of parameters is allowed by Keras"""
-    return not (params["input_dims"][1] == 1 and params["unroll"])
-
-
-class RecurrentLayerTest(unittest.TestCase):
-    """
-    Base class for recurrent layer tests. Masking param not included here
-    """
-
-    def setUp(self):
-        self.params_dict = dict(
-            input_dims=[[1, 1, 1], [1, 1, 5], [1, 1, 10]],  # [1, x > 1, y] not added
-            output_dim=[1, 5, 10, 20],
-            stateful=[False, True],
-            go_backwards=[False],  # True],
-            unroll=[False, True],
-            return_sequences=[False, True],
-            activation=["sigmoid", "tanh", "hard_sigmoid", "linear"],
-        )
-        self.base_layer_params = list(itertools.product(*self.params_dict.values()))
-
-
-class SimpleRNNLayer(RecurrentLayerTest):
-    """
-    Class for testing single RNN layer
-    """
-
-    def setUp(self):
-        super(SimpleRNNLayer, self).setUp()
-        self.simple_rnn_params_dict = dict(
-            dropout=[{"dropout_W": 0.0, "dropout_U": 0.0}],
-            regularizer=[
-                {"W_regularizer": None, "U_regularizer": None, "b_regularizer": None}
-            ],
-        )
-        self.rnn_layer_params = list(
-            itertools.product(*self.simple_rnn_params_dict.values())
-        )
-
-    def _test_rnn_layer(self, keras_major_version, limit=None):
-        i = 0
-        layer_name = str(SimpleRNN).split(".")[3].split("'>")[0]
-        numerical_err_models = []
-        shape_err_models = []
-        params = list(itertools.product(self.base_layer_params, self.rnn_layer_params))
-        np.random.shuffle(params)
-        params = [
-            param
-            for param in params
-            if valid_params(dict(zip(self.params_dict.keys(), param[0])))
-        ]
-        for base_params, rnn_params in params[:limit]:
-            base_params = dict(zip(self.params_dict.keys(), base_params))
-            rnn_params = dict(zip(self.simple_rnn_params_dict.keys(), rnn_params))
-            input_data = generate_input(
-                base_params["input_dims"][0],
-                base_params["input_dims"][1],
-                base_params["input_dims"][2],
-            )
-            model = Sequential()
-            settings = dict(
-                activation=base_params["activation"],
-                return_sequences=base_params["return_sequences"],
-                go_backwards=base_params["go_backwards"],
-                unroll=base_params["unroll"],
-            )
-            if keras_major_version == 2:
-                model.add(
-                    SimpleRNN(
-                        base_params["output_dim"],
-                        input_shape=base_params["input_dims"][1:],
-                        dropout=rnn_params["dropout"]["dropout_U"],
-                        recurrent_dropout=rnn_params["dropout"]["dropout_W"],
-                        kernel_regularizer=rnn_params["regularizer"]["W_regularizer"],
-                        recurrent_regularizer=rnn_params["regularizer"][
-                            "U_regularizer"
-                        ],
-                        bias_regularizer=rnn_params["regularizer"]["b_regularizer"],
-                        **settings
-                    )
-                )
-            else:
-                model.add(
-                    SimpleRNN(
-                        base_params["output_dim"],
-                        input_length=base_params["input_dims"][1],
-                        input_dim=base_params["input_dims"][2],
-                        dropout_U=rnn_params["dropout"]["dropout_U"],
-                        dropout_W=rnn_params["dropout"]["dropout_W"],
-                        W_regularizer=rnn_params["regularizer"]["W_regularizer"],
-                        U_regularizer=rnn_params["regularizer"]["U_regularizer"],
-                        b_regularizer=rnn_params["regularizer"]["b_regularizer"],
-                        **settings
-                    )
-                )
-            model_dir = tempfile.mkdtemp()
-            keras_model_path = os.path.join(model_dir, "keras.h5")
-            coreml_model_path = os.path.join(model_dir, "keras.mlmodel")
-            model.save_weights(keras_model_path)
-            mlkitmodel = _get_mlkit_model_from_path(model, coreml_model_path)
-            if _is_macos() and _macos_version() >= (10, 13):
-                keras_preds = model.predict(input_data).flatten()
-                input_data = np.transpose(input_data, [1, 0, 2])
-                coreml_preds = mlkitmodel.predict({"data": input_data})[
-                    "output"
-                ].flatten()
-                try:
-                    self.assertEqual(coreml_preds.shape, keras_preds.shape)
-                except AssertionError:
-                    print(
-                        "Shape error:\nbase_params: {}\nkeras_preds.shape: {}\ncoreml_preds.shape: {}".format(
-                            base_params, keras_preds.shape, coreml_preds.shape
-                        )
-                    )
-                    shape_err_models.append(base_params)
-                    shutil.rmtree(model_dir)
-                    i += 1
-                    continue
-                try:
-                    for idx in range(0, len(coreml_preds)):
-                        relative_error = (
-                            coreml_preds[idx] - keras_preds[idx]
-                        ) / coreml_preds[idx]
-                        self.assertAlmostEqual(relative_error, 0, places=2)
-                except AssertionError:
-                    print(
-                        "Assertion error:\nbase_params: {}\nkeras_preds: {}\ncoreml_preds: {}".format(
-                            base_params, keras_preds, coreml_preds
-                        )
-                    )
-                    numerical_err_models.append(base_params)
-            shutil.rmtree(model_dir)
-            i += 1
-
-        self.assertEqual(
-            shape_err_models, [], msg="Shape error models {}".format(shape_err_models)
-        )
-        self.assertEqual(
-            numerical_err_models,
-            [],
-            msg="Numerical error models {}".format(numerical_err_models),
-        )
-
-    @unittest.skipIf(not _HAS_KERAS_TF, "Missing keras 1. Skipping test.")
-    @pytest.mark.keras1
-    @pytest.mark.slow
-    def test_keras1_rnn_layer_stress(self):
-        self._test_rnn_layer(keras_major_version=1)
-
-    @unittest.skipIf(not _HAS_KERAS_TF, "Missing keras 1. Skipping test.")
-    @pytest.mark.keras1
-    def test_keras1_rnn_layer(self):
-        self._test_rnn_layer(keras_major_version=1, limit=10)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    @pytest.mark.slow
-    def test_keras2_rnn_layer_stress(self):
-        self._test_rnn_layer(keras_major_version=2)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    def test_keras2_rnn_layer(self):
-        self._test_rnn_layer(keras_major_version=2, limit=10)
-
-
-class LSTMLayer(RecurrentLayerTest):
-    """
-    Class for testing single RNN layer
-    """
-
-    def setUp(self):
-        super(LSTMLayer, self).setUp()
-        self.lstm_params_dict = dict(
-            dropout=[{"dropout_W": 0.0, "dropout_U": 0.0}],
-            regularizer=[
-                {"W_regularizer": None, "U_regularizer": None, "b_regularizer": None}
-            ],
-        )
-        self.lstm_layer_params = list(
-            itertools.product(*self.lstm_params_dict.values())
-        )
-
-    def _test_lstm_layer(self, keras_major_version, limit=None):
-        i = 0
-        numerical_err_models = []
-        shape_err_models = []
-        params = list(itertools.product(self.base_layer_params, self.lstm_layer_params))
-        np.random.shuffle(params)
-        params = [
-            param
-            for param in params
-            if valid_params(dict(zip(self.params_dict.keys(), param[0])))
-        ]
-        for base_params, lstm_params in params[:limit]:
-            base_params = dict(zip(self.params_dict.keys(), base_params))
-            lstm_params = dict(zip(self.lstm_params_dict.keys(), lstm_params))
-            input_data = generate_input(
-                base_params["input_dims"][0],
-                base_params["input_dims"][1],
-                base_params["input_dims"][2],
-            )
-            model = Sequential()
-            settings = dict(
-                activation=base_params["activation"],
-                return_sequences=base_params["return_sequences"],
-                go_backwards=base_params["go_backwards"],
-                unroll=base_params["unroll"],
-            )
-            if keras_major_version == 2:
-                model.add(
-                    LSTM(
-                        base_params["output_dim"],
-                        input_shape=base_params["input_dims"][1:],
-                        recurrent_dropout=lstm_params["dropout"]["dropout_U"],
-                        dropout=lstm_params["dropout"]["dropout_W"],
-                        kernel_regularizer=lstm_params["regularizer"]["W_regularizer"],
-                        recurrent_regularizer=lstm_params["regularizer"][
-                            "U_regularizer"
-                        ],
-                        bias_regularizer=lstm_params["regularizer"]["b_regularizer"],
-                        **settings
-                    )
-                )
-            else:
-                model.add(
-                    LSTM(
-                        base_params["output_dim"],
-                        input_length=base_params["input_dims"][1],
-                        input_dim=base_params["input_dims"][2],
-                        dropout_U=lstm_params["dropout"]["dropout_U"],
-                        dropout_W=lstm_params["dropout"]["dropout_W"],
-                        W_regularizer=lstm_params["regularizer"]["W_regularizer"],
-                        U_regularizer=lstm_params["regularizer"]["U_regularizer"],
-                        b_regularizer=lstm_params["regularizer"]["b_regularizer"],
-                        **settings
-                    )
-                )
-            model_dir = tempfile.mkdtemp()
-            keras_model_path = os.path.join(model_dir, "keras.h5")
-            coreml_model_path = os.path.join(model_dir, "keras.mlmodel")
-            model.save_weights(keras_model_path)
-            mlkitmodel = _get_mlkit_model_from_path(model, coreml_model_path)
-            if _is_macos() and _macos_version() >= (10, 13):
-                keras_preds = model.predict(input_data).flatten()
-                input_data = np.transpose(input_data, [1, 0, 2])
-                coreml_preds = mlkitmodel.predict({"data": input_data})[
-                    "output"
-                ].flatten()
-                try:
-                    self.assertEqual(coreml_preds.shape, keras_preds.shape)
-                except AssertionError:
-                    print(
-                        "Shape error:\nbase_params: {}\nkeras_preds.shape: {}\ncoreml_preds.shape: {}".format(
-                            base_params, keras_preds.shape, coreml_preds.shape
-                        )
-                    )
-                    shape_err_models.append(base_params)
-                    shutil.rmtree(model_dir)
-                    i += 1
-                    continue
-                try:
-                    for idx in range(0, len(coreml_preds)):
-                        relative_error = (
-                            coreml_preds[idx] - keras_preds[idx]
-                        ) / coreml_preds[idx]
-                        self.assertAlmostEqual(relative_error, 0, places=2)
-                except AssertionError:
-                    print(
-                        "Assertion error:\nbase_params: {}\nkeras_preds: {}\ncoreml_preds: {}".format(
-                            base_params, keras_preds, coreml_preds
-                        )
-                    )
-                    numerical_err_models.append(base_params)
-            shutil.rmtree(model_dir)
-            i += 1
-
-        self.assertEqual(
-            shape_err_models, [], msg="Shape error models {}".format(shape_err_models)
-        )
-        self.assertEqual(
-            numerical_err_models,
-            [],
-            msg="Numerical error models {}".format(numerical_err_models),
-        )
-
-    @unittest.skipIf(not _HAS_KERAS_TF, "Missing keras 1. Skipping test.")
-    @pytest.mark.keras1
-    @pytest.mark.slow
-    def test_keras1_lstm_layer_stress(self):
-        self._test_lstm_layer(keras_major_version=1)
-
-    @unittest.skipIf(not _HAS_KERAS_TF, "Missing keras 1. Skipping test.")
-    @pytest.mark.keras1
-    def test_keras1_lstm_layer(self):
-        self._test_lstm_layer(keras_major_version=1, limit=10)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    @pytest.mark.slow
-    def test_keras2_lstm_layer_stress(self):
-        self._test_lstm_layer(keras_major_version=2)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    def test_keras2_lstm_layer(self):
-        self._test_lstm_layer(keras_major_version=2, limit=10)
-
-
-class GRULayer(RecurrentLayerTest):
-    """
-    Class for testing GRU layer
-    """
-
-    def setUp(self):
-        super(GRULayer, self).setUp()
-        self.gru_params_dict = dict(
-            dropout=[{"dropout_W": 0.0, "dropout_U": 0.0}],
-            regularizer=[
-                {"W_regularizer": None, "U_regularizer": None, "b_regularizer": None}
-            ],
-        )
-        self.gru_layer_params = list(itertools.product(*self.gru_params_dict.values()))
-
-    def _test_gru_layer(self, keras_major_version, limit=None):
-        i = 0
-        numerical_err_models = []
-        shape_err_models = []
-        params = list(itertools.product(self.base_layer_params, self.gru_layer_params))
-        np.random.shuffle(params)
-        params = [
-            param
-            for param in params
-            if valid_params(dict(zip(self.params_dict.keys(), param[0])))
-        ]
-        for base_params, gru_params in params[:limit]:
-            base_params = dict(zip(self.params_dict.keys(), base_params))
-            gru_params = dict(zip(self.gru_params_dict.keys(), gru_params))
-            input_data = generate_input(
-                base_params["input_dims"][0],
-                base_params["input_dims"][1],
-                base_params["input_dims"][2],
-            )
-            model = Sequential()
-            settings = dict(
-                activation=base_params["activation"],
-                return_sequences=base_params["return_sequences"],
-                go_backwards=base_params["go_backwards"],
-                unroll=base_params["unroll"],
-            )
-            if keras_major_version == 2:
-                model.add(
-                    GRU(
-                        base_params["output_dim"],
-                        input_shape=base_params["input_dims"][1:],
-                        recurrent_dropout=gru_params["dropout"]["dropout_U"],
-                        dropout=gru_params["dropout"]["dropout_W"],
-                        kernel_regularizer=gru_params["regularizer"]["W_regularizer"],
-                        recurrent_regularizer=gru_params["regularizer"][
-                            "U_regularizer"
-                        ],
-                        bias_regularizer=gru_params["regularizer"]["b_regularizer"],
-                        **settings
-                    )
-                )
-            else:
-                model.add(
-                    GRU(
-                        base_params["output_dim"],
-                        input_length=base_params["input_dims"][1],
-                        input_dim=base_params["input_dims"][2],
-                        dropout_U=gru_params["dropout"]["dropout_U"],
-                        dropout_W=gru_params["dropout"]["dropout_W"],
-                        W_regularizer=gru_params["regularizer"]["W_regularizer"],
-                        U_regularizer=gru_params["regularizer"]["U_regularizer"],
-                        b_regularizer=gru_params["regularizer"]["b_regularizer"],
-                        **settings
-                    )
-                )
-            model_dir = tempfile.mkdtemp()
-            keras_model_path = os.path.join(model_dir, "keras.h5")
-            coreml_model_path = os.path.join(model_dir, "keras.mlmodel")
-            model.save_weights(keras_model_path)
-            mlkitmodel = _get_mlkit_model_from_path(model, coreml_model_path)
-            if _is_macos() and _macos_version() >= (10, 13):
-                keras_preds = model.predict(input_data).flatten()
-                input_data = np.transpose(input_data, [1, 0, 2])
-                coreml_preds = mlkitmodel.predict({"data": input_data})[
-                    "output"
-                ].flatten()
-                try:
-                    self.assertEqual(coreml_preds.shape, keras_preds.shape)
-                except AssertionError:
-                    print(
-                        "Shape error:\nbase_params: {}\nkeras_preds.shape: {}\ncoreml_preds.shape: {}".format(
-                            base_params, keras_preds.shape, coreml_preds.shape
-                        )
-                    )
-                    shape_err_models.append(base_params)
-                    shutil.rmtree(model_dir)
-                    i += 1
-                    continue
-                try:
-                    for idx in range(0, len(coreml_preds)):
-                        relative_error = (
-                            coreml_preds[idx] - keras_preds[idx]
-                        ) / coreml_preds[idx]
-                        self.assertAlmostEqual(relative_error, 0, places=2)
-                except AssertionError:
-                    print(
-                        "Assertion error:\nbase_params: {}\nkeras_preds: {}\ncoreml_preds: {}".format(
-                            base_params, keras_preds, coreml_preds
-                        )
-                    )
-                    numerical_err_models.append(base_params)
-            shutil.rmtree(model_dir)
-            i += 1
-
-        self.assertEqual(
-            shape_err_models, [], msg="Shape error models {}".format(shape_err_models)
-        )
-        self.assertEqual(
-            numerical_err_models,
-            [],
-            msg="Numerical error models {}".format(numerical_err_models),
-        )
-
-    @unittest.skipIf(not _HAS_KERAS_TF, "Missing keras 1. Skipping test.")
-    @pytest.mark.keras1
-    @pytest.mark.slow
-    def test_keras1_gru_layer_stress(self):
-        self._test_gru_layer(keras_major_version=1)
-
-    @unittest.skipIf(not _HAS_KERAS_TF, "Missing keras 1. Skipping test.")
-    @pytest.mark.keras1
-    def test_keras1_gru_layer(self):
-        self._test_gru_layer(keras_major_version=1, limit=10)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    @pytest.mark.slow
-    def test_keras2_gru_layer_stress(self):
-        self._test_gru_layer(keras_major_version=2)
-
-    @unittest.skipIf(not _HAS_KERAS2_TF, "Missing keras 2. Skipping test.")
-    @pytest.mark.keras2
-    def test_keras2_gru_layer(self):
-        self._test_gru_layer(keras_major_version=2, limit=10)
diff --git a/coremltools/test/neural_network/test_tf_numeric.py b/coremltools/test/neural_network/test_tf_numeric.py
index a94f60fbf..0e200571e 100644
--- a/coremltools/test/neural_network/test_tf_numeric.py
+++ b/coremltools/test/neural_network/test_tf_numeric.py
@@ -1,14 +1,22 @@
+# Copyright (c) 2017, Apple Inc. All rights reserved.
+#
+# Use of this source code is governed by a BSD-3-clause license that can be
+# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+import itertools
 import unittest
+
 import numpy as np
+
+from coremltools import ComputeUnit
+from coremltools._deps import _HAS_TF, MSG_TF1_NOT_FOUND
+from coremltools.models import MLModel, neural_network
 import coremltools.models.datatypes as datatypes
-from coremltools.models import neural_network as neural_network
-from coremltools.models import MLModel
 from coremltools.models.utils import _is_macos, _macos_version
-from coremltools._deps import _HAS_TF, MSG_TF1_NOT_FOUND
 
 if _HAS_TF:
     import tensorflow as tf
-import itertools
+
 
 np.random.seed(10)
 np.set_printoptions(precision=4, suppress=True)
@@ -55,9 +63,8 @@ def _test_model(
         coreml_model,
         snr_thresh=15,
         psnr_thresh=30,
-        cpu_only=False,
     ):
-        coreml_out_dict = coreml_model.predict(input_dict, useCPUOnly=cpu_only)
+        coreml_out_dict = coreml_model.predict(input_dict)
         for out_ in list(ref_output_dict.keys()):
             ref_out = ref_output_dict[out_].flatten()
             coreml_out = coreml_out_dict[out_].flatten()
@@ -69,9 +76,6 @@ def _test_model(
 
 @unittest.skipUnless(_is_macos(), "Only supported for MacOS platform.")
 class StressTest(CorrectnessTest):
-    def runTest(self):
-        pass
-
     def test_data_reorganize(self, cpu_only=False):
         def get_coreml_model_reorganize(X, params):
             eval = True
@@ -90,7 +94,12 @@ def get_coreml_model_reorganize(X, params):
                     mode=params["mode"],
                     block_size=params["block_size"],
                 )
-                mlmodel = MLModel(builder.spec)
+
+                if cpu_only:
+                    compute_unit=ComputeUnit.CPU_ONLY
+                else:
+                    compute_unit=ComputeUnit.ALL
+                mlmodel = MLModel(builder.spec, compute_units=compute_unit)
             except RuntimeError as e:
                 print(e)
                 eval = False
@@ -156,9 +165,7 @@ def get_tf_predictions_reorganize(X, params):
             else:
                 input_dict = {"data": np.expand_dims(X, axis=0)}
                 ref_output_dict = {"output": tf_preds[0, :, :, :]}
-                self._test_model(
-                    input_dict, ref_output_dict, coreml_model, cpu_only=cpu_only
-                )
+                self._test_model(input_dict, ref_output_dict, coreml_model)
 
         self.assertEqual(failed_tests_compile, [])
 
@@ -210,7 +217,11 @@ def get_coreml_model_depthwise(X, params, w):
                     output_name="output",
                 )
 
-                mlmodel = MLModel(builder.spec)
+                if cpu_only:
+                    compute_unit=ComputeUnit.CPU_ONLY
+                else:
+                    compute_unit=ComputeUnit.ALL
+                mlmodel = MLModel(builder.spec, compute_units=compute_unit)
             except RuntimeError as e:
                 print(e)
                 eval = False
@@ -286,9 +297,7 @@ def get_tf_predictions_depthwise(X, params, w):
             else:
                 input_dict = {"data": np.expand_dims(X, axis=0)}
                 ref_output_dict = {"output": tf_preds[0, :, :, :]}
-                self._test_model(
-                    input_dict, ref_output_dict, coreml_model, cpu_only=cpu_only
-                )
+                self._test_model(input_dict, ref_output_dict, coreml_model)
 
         self.assertEqual(failed_tests_compile, [])
 
@@ -319,7 +328,13 @@ def get_coreml_model_resize_bilinear(X, params):
                     target_width=params["Wnew"],
                     mode=mode,
                 )
-                mlmodel = MLModel(builder.spec)
+
+                if cpu_only:
+                    compute_unit=ComputeUnit.CPU_ONLY
+                else:
+                    compute_unit=ComputeUnit.ALL
+
+                mlmodel = MLModel(builder.spec, compute_units=compute_unit)
             except RuntimeError as e:
                 print(e)
                 eval = False
@@ -385,9 +400,7 @@ def get_tf_predictions_resize_bilinear(X, params):
             else:
                 input_dict = {"data": np.expand_dims(X, axis=0)}
                 ref_output_dict = {"output": np.expand_dims(tf_preds, axis=0)}
-                self._test_model(
-                    input_dict, ref_output_dict, coreml_model, cpu_only=cpu_only
-                )
+                self._test_model(input_dict, ref_output_dict, coreml_model)
 
         self.assertEqual(failed_tests_compile, [])
 
@@ -432,7 +445,12 @@ def get_coreml_model_crop_resize(params):
                     box_indices_mode="CORNERS_HEIGHT_FIRST",
                     spatial_scale=1.0,
                 )
-                mlmodel = MLModel(builder.spec)
+
+                if cpu_only:
+                    compute_unit=ComputeUnit.CPU_ONLY
+                else:
+                    compute_unit=ComputeUnit.ALL
+                mlmodel = MLModel(builder.spec, compute_units=compute_unit)
             except RuntimeError as e:
                 print(e)
                 eval = False
@@ -509,9 +527,7 @@ def get_tf_predictions_crop_resize(X, boxes, box_ind, params):
                         box_ind.astype(np.float32), (n_roi, 1, 1, 1, 1)
                     )
                 ref_output_dict = {"output": np.expand_dims(tf_preds, axis=0)}
-                self._test_model(
-                    input_dict, ref_output_dict, coreml_model, cpu_only=cpu_only
-                )
+                self._test_model(input_dict, ref_output_dict, coreml_model)
 
         self.assertEqual(failed_tests_compile, [])
 
diff --git a/coremltools/test/pipeline/test_model_updatable.py b/coremltools/test/pipeline/test_model_updatable.py
index 925f31786..4e24a67a5 100644
--- a/coremltools/test/pipeline/test_model_updatable.py
+++ b/coremltools/test/pipeline/test_model_updatable.py
@@ -807,81 +807,3 @@ def test_shuffle_on_by_default(self):
             builder.nn_spec.updateParams.shuffle.defaultValue,
             "Shuffle not turned on by default for updatable models",
         )
-
-    @pytest.mark.skipif(
-        coremltools.utils._python_version() >= (3, 8, 0),
-        reason="Keras isn't compatible with Python 3.8+.",
-    )
-    def test_make_updatable_with_unilstm(self):
-        from keras.models import Sequential
-        from keras.layers import Dense, LSTM
-        from coremltools.converters import keras as keras_converter
-        import numpy as np
-
-        input_dim = 5
-        num_hidden = 12
-        num_classes = 6
-        input_length = 3
-
-        model = Sequential()
-        model.add(
-            LSTM(
-                num_hidden,
-                input_dim=input_dim,
-                input_length=input_length,
-                return_sequences=False,
-            )
-        )
-        model.add(Dense(num_classes, activation="softmax"))
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        input_names = ["input"]
-        output_names = ["zzzz"]
-        class_labels = ["a", "b", "c", "d", "e", "f"]
-        predicted_feature_name = "pf"
-        coremlmodel = keras_converter.convert(
-            model,
-            input_names,
-            output_names,
-            class_labels=class_labels,
-            predicted_feature_name=predicted_feature_name,
-            predicted_probabilities_output=output_names[0],
-        )
-        spec = coremlmodel.get_spec()
-        builder = NeuralNetworkBuilder(spec=spec)
-        # we could be able to make "dense_1" updatable without any issue
-        builder.make_updatable([spec.neuralNetworkClassifier.layers[1].name])
-
-    @pytest.mark.skipif(
-        coremltools.utils._python_version() >= (3, 8, 0),
-        reason="Keras isn't compatible with Python 3.8+.",
-    )
-    def test_make_updatable_with_bilstm(self):
-        from keras.models import Sequential
-        from keras.layers import Dense, LSTM
-        from coremltools.converters import keras as keras_converter
-        from keras.layers.wrappers import Bidirectional
-        import numpy as np
-
-        num_classes = 6
-        model = Sequential()
-        model.add(Bidirectional(LSTM(32, input_shape=(10, 32)), input_shape=(10, 32)))
-        model.add(Dense(num_classes, activation="softmax"))
-        model.set_weights([np.random.rand(*w.shape) for w in model.get_weights()])
-
-        input_names = ["input"]
-        output_names = ["zzzz"]
-        class_labels = ["a", "b", "c", "d", "e", "f"]
-        predicted_feature_name = "pf"
-        coremlmodel = keras_converter.convert(
-            model,
-            input_names,
-            output_names,
-            class_labels=class_labels,
-            predicted_feature_name=predicted_feature_name,
-            predicted_probabilities_output=output_names[0],
-        )
-        spec = coremlmodel.get_spec()
-        builder = NeuralNetworkBuilder(spec=spec)
-        # we could be able to make "dense_1" updatable without any issue
-        builder.make_updatable([spec.neuralNetworkClassifier.layers[1].name])
diff --git a/coremltools/test/sklearn_tests/test_io_types.py b/coremltools/test/sklearn_tests/test_io_types.py
index b6e257738..62f99d779 100644
--- a/coremltools/test/sklearn_tests/test_io_types.py
+++ b/coremltools/test/sklearn_tests/test_io_types.py
@@ -3,17 +3,18 @@
 # Use of this source code is governed by a BSD-3-clause license that can be
 # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import pytest
+import unittest
+
 import coremltools
 from coremltools._deps import (
-    _HAS_KERAS2_TF,
-    MSG_KERAS2_NOT_FOUND,
     _HAS_SKLEARN,
     MSG_SKLEARN_NOT_FOUND,
 )
 from coremltools.models.utils import _macos_version, _is_macos
 
-if _HAS_KERAS2_TF:
-    import keras
+import numpy as np
+import PIL.Image
 if _HAS_SKLEARN:
     import sklearn
     from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
@@ -21,10 +22,6 @@
     from sklearn.linear_model import LinearRegression
     from sklearn.svm import SVC, SVR
     from sklearn.datasets import load_boston
-import unittest
-import numpy as np
-import pytest
-import PIL.Image
 
 
 def create_model(spec):
@@ -250,94 +247,6 @@ def test_linear_regressor(self):
             except RuntimeError:
                 print("{} not supported. ".format(dtype))
 
-    @unittest.skipIf(not _HAS_KERAS2_TF, MSG_KERAS2_NOT_FOUND)
-    @pytest.mark.keras2
-    def test_keras_dense_model(self):
-        model = keras.models.Sequential()
-        model.add(
-            keras.layers.Dense(
-                3,
-                activation="sigmoid",
-                kernel_initializer="random_uniform",
-                bias_initializer="random_uniform",
-                input_dim=3,
-            )
-        )
-        for key, dtype in self.number_data_type.items():
-            try:
-                input_data = np.random.rand(1, 3).astype(key)
-                keras_out = model.predict(input_data)
-                coreml_model = coremltools.converters.keras.convert(
-                    model, input_names=["data"], output_names=["target"]
-                )
-                spec = coreml_model.get_spec()
-                spec.description.output[
-                    0
-                ].type.multiArrayType.dataType = coremltools.proto.FeatureTypes_pb2.ArrayFeatureType.ArrayDataType.Value(
-                    self._feature_data_type(dtype)
-                )
-                spec.description.input[
-                    0
-                ].type.multiArrayType.dataType = coremltools.proto.FeatureTypes_pb2.ArrayFeatureType.ArrayDataType.Value(
-                    self._feature_data_type(dtype)
-                )
-                coreml_model = coremltools.models.MLModel(spec)
-                coreml_out = coreml_model.predict(
-                    {"data": np.expand_dims(input_data, 0)}
-                )["target"]
-                self.assertEqual(dtype, coreml_out.dtype)
-                if dtype != np.int32:
-                    for idx in range(0, len(keras_out)):
-                        self.assertAlmostEqual(
-                            keras_out[0][idx],
-                            coreml_out[idx],
-                            msg="{}\n{} != {}".format(dtype, keras_out, coreml_out),
-                            places=2,
-                        )
-            except KeyError:
-                print("{} not supported. ".format(dtype))
-
-    def test_keras_embedding_model(self):
-
-        model = keras.models.Sequential()
-        model.add(keras.layers.Embedding(100, 3, input_length=5, input_dtype="float32"))
-        for key, dtype in self.number_data_type.items():
-            try:
-                input_data = np.random.randint(0, 100, size=(1, 5)).astype(key)
-                keras_out = np.squeeze(model.predict(input_data)).flatten()
-                coreml_model = coremltools.converters.keras.convert(
-                    model, input_names=["data"], output_names=["target"]
-                )
-
-                spec = coreml_model.get_spec()
-                spec.description.output[
-                    0
-                ].type.multiArrayType.dataType = coremltools.proto.FeatureTypes_pb2.ArrayFeatureType.ArrayDataType.Value(
-                    self._feature_data_type(dtype)
-                )
-                spec.description.input[
-                    0
-                ].type.multiArrayType.dataType = coremltools.proto.FeatureTypes_pb2.ArrayFeatureType.ArrayDataType.Value(
-                    self._feature_data_type(dtype)
-                )
-                coreml_model = coremltools.models.MLModel(spec)
-                coreml_out = np.squeeze(
-                    coreml_model.predict({"data": np.expand_dims(input_data, 0).T})[
-                        "target"
-                    ]
-                ).flatten()
-                self.assertEqual(dtype, coreml_out.dtype)
-                if dtype != np.int32:
-                    for idx in range(0, len(keras_out)):
-                        self.assertAlmostEqual(
-                            keras_out[idx],
-                            coreml_out[idx],
-                            msg="{}\n{} != {}".format(dtype, keras_out, coreml_out),
-                            places=2,
-                        )
-            except KeyError:
-                print("{} not supported. ".format(dtype))
-
     def test_image_output_rgb(self):
         input_shape = (3, 10, 20)
         input_features = [("data", coremltools.models.datatypes.Array(*input_shape))]
diff --git a/coremltools/version.py b/coremltools/version.py
index 8bfede6a4..55bb29f4c 100644
--- a/coremltools/version.py
+++ b/coremltools/version.py
@@ -4,4 +4,4 @@
 # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
 
-__version__ = "5.2.0"  # VERSION_STRING
+__version__ = "6.0b1"  # VERSION_STRING
diff --git a/deps/protobuf/src/google/protobuf/repeated_field.h b/deps/protobuf/src/google/protobuf/repeated_field.h
index 074319e82..264ac91bf 100644
--- a/deps/protobuf/src/google/protobuf/repeated_field.h
+++ b/deps/protobuf/src/google/protobuf/repeated_field.h
@@ -46,11 +46,7 @@
 #ifndef GOOGLE_PROTOBUF_REPEATED_FIELD_H__
 #define GOOGLE_PROTOBUF_REPEATED_FIELD_H__
 
-#ifdef _MSC_VER
-// This is required for min/max on VS2013 only.
 #include <algorithm>
-#endif
-
 #include <iterator>
 #include <limits>
 #include <string>
diff --git a/docs/documentation.md b/docs/documentation.md
index 5186b1ca2..ef943df9a 100644
--- a/docs/documentation.md
+++ b/docs/documentation.md
@@ -184,11 +184,9 @@ You need to do this only once. Follow these steps:
 
 	```
 	coremltools.converters.rst
-	coremltools.converters.keras.rst
 	coremltools.converters.libsvm.rst
 	coremltools.converters.mil.rst
 	coremltools.converters.mil.mil.ops.defs.rst
-	coremltools.converters.onnx.rst
 	coremltools.converters.sklearn.rst
 	coremltools.converters.xgboost.rst
 	coremltools.models.neural_network.rst
@@ -202,12 +200,10 @@ You need to do this only once. Follow these steps:
 3. Delete all other `.rst` files in your `source` folder. You should now have the following files in your `source` folder, along with any additional modules you may have added:
 
 	```
-	coremltools.converters.keras.rst
 	coremltools.converters.libsvm.rst
 	coremltools.converters.mil.input_types.rst
 	coremltools.converters.mil.mil.ops.defs.rst
 	coremltools.converters.mil.rst
-	coremltools.converters.onnx.rst
 	coremltools.converters.rst
 	coremltools.converters.sklearn.rst
 	coremltools.converters.xgboost.rst
diff --git a/docs/source/coremltools.converters.keras.rst b/docs/source/coremltools.converters.keras.rst
deleted file mode 100644
index 8d445397f..000000000
--- a/docs/source/coremltools.converters.keras.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-Keras
----------------
-
-.. automodule:: coremltools.converters.keras._keras_converter
-   :members:
-
-.. automodule:: coremltools.converters.keras._keras2_converter
-   :members:
-
diff --git a/docs/source/coremltools.converters.onnx.rst b/docs/source/coremltools.converters.onnx.rst
deleted file mode 100644
index ab94815e9..000000000
--- a/docs/source/coremltools.converters.onnx.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-ONNX
-----
-
-.. automodule:: coremltools.converters.onnx._converter
-   :members:
diff --git a/docs/source/coremltools.converters.rst b/docs/source/coremltools.converters.rst
index f898f35d3..e14d275f4 100644
--- a/docs/source/coremltools.converters.rst
+++ b/docs/source/coremltools.converters.rst
@@ -5,9 +5,7 @@ Converters
    :maxdepth: 1
 
    coremltools.converters.convert
-   coremltools.converters.keras
    coremltools.converters.libsvm
-   coremltools.converters.onnx
    coremltools.converters.sklearn
    coremltools.converters.xgboost
 
diff --git a/docs/source/coremltools.models.rst b/docs/source/coremltools.models.rst
index eafe3e5a2..e08c095ed 100644
--- a/docs/source/coremltools.models.rst
+++ b/docs/source/coremltools.models.rst
@@ -7,6 +7,12 @@ MLModel
 .. automodule:: coremltools.models.model
    :members:
 
+compression\_utils 
+---------------------------------------------------
+
+.. automodule:: coremltools.models.ml_program.compression_utils
+   :members:
+
 
 array\_feature\_extractor 
 ---------------------------------------------------
@@ -57,4 +63,3 @@ utils
 
 .. automodule:: coremltools.models.utils
    :members:
-
diff --git a/milstoragepython/MilStorage.cpp b/milstoragepython/MilStorage.cpp
index 612ef50fe..453c859de 100644
--- a/milstoragepython/MilStorage.cpp
+++ b/milstoragepython/MilStorage.cpp
@@ -45,6 +45,10 @@ namespace {
 // These methods are needed in addition to the above template methods
 // because pybind does not allow us to expose template methods to
 // Python with gcc on Linux.
+u_int64_t MilStoragePythonWriter::write_int8_data(const std::vector<int8_t>& data) {
+    return writeData<int8_t>(*m_writer, data);
+}
+
 u_int64_t MilStoragePythonWriter::write_uint8_data(const std::vector<uint8_t>& data) {
     return writeData<uint8_t>(*m_writer, data);
 }
@@ -90,6 +94,10 @@ namespace {
 // These methods are needed in addition to the above template methods
 // because pybind does not allow us to expose template methods to
 // Python with gcc on Linux.
+const std::vector<int8_t> MilStoragePythonReader::read_int8_data(uint64_t offset) {
+    return readData<int8_t>(*m_reader, offset);
+}
+
 const std::vector<uint8_t> MilStoragePythonReader::read_uint8_data(uint64_t offset) {
     return readData<uint8_t>(*m_reader, offset);
 }
diff --git a/milstoragepython/MilStorage.hpp b/milstoragepython/MilStorage.hpp
index e86419994..88ef435da 100644
--- a/milstoragepython/MilStorage.hpp
+++ b/milstoragepython/MilStorage.hpp
@@ -30,6 +30,7 @@ namespace CoreML {
             MilStoragePythonWriter(const std::string& filePath, bool truncateFile);
             ~MilStoragePythonWriter();
 
+            u_int64_t write_int8_data(const std::vector<int8_t>& data);
             u_int64_t write_uint8_data(const std::vector<uint8_t>& data);
             u_int64_t write_fp16_data(const std::vector<uint16_t>& data);
             u_int64_t write_float_data(const std::vector<float>& data);
@@ -48,6 +49,7 @@ namespace CoreML {
             MilStoragePythonReader(std::string filePath);
             ~MilStoragePythonReader();
 
+            const std::vector<int8_t> read_int8_data(uint64_t offset);
             const std::vector<uint8_t> read_uint8_data(uint64_t offset);
             const std::vector<uint16_t> read_fp16_data(uint64_t offset);
             const std::vector<float> read_float_data(uint64_t offset);
diff --git a/milstoragepython/MilStoragePython.cpp b/milstoragepython/MilStoragePython.cpp
index 6739a9682..673e566c4 100644
--- a/milstoragepython/MilStoragePython.cpp
+++ b/milstoragepython/MilStoragePython.cpp
@@ -33,12 +33,14 @@ PYBIND11_PLUGIN(libmilstoragepython) {
 
     py::class_<MilStoragePythonWriter> blobStorageWriter(m, "_BlobStorageWriter");
     blobStorageWriter.def(py::init<const std::string&, bool>(), py::arg("file_name"), py::arg("truncate_file") = true)
+      .def("write_int8_data", &MilStoragePythonWriter::write_int8_data)
       .def("write_uint8_data", &MilStoragePythonWriter::write_uint8_data)
       .def("write_fp16_data", &MilStoragePythonWriter::write_fp16_data)
       .def("write_float_data", &MilStoragePythonWriter::write_float_data);
 
     py::class_<MilStoragePythonReader> blobStorageReader(m, "_BlobStorageReader");
     blobStorageReader.def(py::init<std::string>())
+      .def("read_int8_data", &MilStoragePythonReader::read_int8_data)
       .def("read_uint8_data", &MilStoragePythonReader::read_uint8_data)
       .def("read_fp16_data", &MilStoragePythonReader::read_fp16_data)
       .def("read_float_data", &MilStoragePythonReader::read_float_data);
diff --git a/mlmodel/build/format/FeatureTypes.pb.cc b/mlmodel/build/format/FeatureTypes.pb.cc
index aba83e033..b5a9ea1cb 100644
--- a/mlmodel/build/format/FeatureTypes.pb.cc
+++ b/mlmodel/build/format/FeatureTypes.pb.cc
@@ -176,6 +176,7 @@ bool ImageFeatureType_ColorSpace_IsValid(int value) {
     case 10:
     case 20:
     case 30:
+    case 40:
       return true;
     default:
       return false;
@@ -187,6 +188,7 @@ const ImageFeatureType_ColorSpace ImageFeatureType::INVALID_COLOR_SPACE;
 const ImageFeatureType_ColorSpace ImageFeatureType::GRAYSCALE;
 const ImageFeatureType_ColorSpace ImageFeatureType::RGB;
 const ImageFeatureType_ColorSpace ImageFeatureType::BGR;
+const ImageFeatureType_ColorSpace ImageFeatureType::GRAYSCALE_FLOAT16;
 const ImageFeatureType_ColorSpace ImageFeatureType::ColorSpace_MIN;
 const ImageFeatureType_ColorSpace ImageFeatureType::ColorSpace_MAX;
 const int ImageFeatureType::ColorSpace_ARRAYSIZE;
@@ -194,6 +196,7 @@ const int ImageFeatureType::ColorSpace_ARRAYSIZE;
 bool ArrayFeatureType_ArrayDataType_IsValid(int value) {
   switch (value) {
     case 0:
+    case 65552:
     case 65568:
     case 65600:
     case 131104:
@@ -208,6 +211,7 @@ const ArrayFeatureType_ArrayDataType ArrayFeatureType::INVALID_ARRAY_DATA_TYPE;
 const ArrayFeatureType_ArrayDataType ArrayFeatureType::FLOAT32;
 const ArrayFeatureType_ArrayDataType ArrayFeatureType::DOUBLE;
 const ArrayFeatureType_ArrayDataType ArrayFeatureType::INT32;
+const ArrayFeatureType_ArrayDataType ArrayFeatureType::FLOAT16;
 const ArrayFeatureType_ArrayDataType ArrayFeatureType::ArrayDataType_MIN;
 const ArrayFeatureType_ArrayDataType ArrayFeatureType::ArrayDataType_MAX;
 const int ArrayFeatureType::ArrayDataType_ARRAYSIZE;
diff --git a/mlmodel/build/format/FeatureTypes.pb.h b/mlmodel/build/format/FeatureTypes.pb.h
index 7eaf7d6c5..afb8c9943 100644
--- a/mlmodel/build/format/FeatureTypes.pb.h
+++ b/mlmodel/build/format/FeatureTypes.pb.h
@@ -102,12 +102,13 @@ enum ImageFeatureType_ColorSpace {
   ImageFeatureType_ColorSpace_GRAYSCALE = 10,
   ImageFeatureType_ColorSpace_RGB = 20,
   ImageFeatureType_ColorSpace_BGR = 30,
+  ImageFeatureType_ColorSpace_GRAYSCALE_FLOAT16 = 40,
   ImageFeatureType_ColorSpace_ImageFeatureType_ColorSpace_INT_MIN_SENTINEL_DO_NOT_USE_ = ::google::protobuf::kint32min,
   ImageFeatureType_ColorSpace_ImageFeatureType_ColorSpace_INT_MAX_SENTINEL_DO_NOT_USE_ = ::google::protobuf::kint32max
 };
 bool ImageFeatureType_ColorSpace_IsValid(int value);
 const ImageFeatureType_ColorSpace ImageFeatureType_ColorSpace_ColorSpace_MIN = ImageFeatureType_ColorSpace_INVALID_COLOR_SPACE;
-const ImageFeatureType_ColorSpace ImageFeatureType_ColorSpace_ColorSpace_MAX = ImageFeatureType_ColorSpace_BGR;
+const ImageFeatureType_ColorSpace ImageFeatureType_ColorSpace_ColorSpace_MAX = ImageFeatureType_ColorSpace_GRAYSCALE_FLOAT16;
 const int ImageFeatureType_ColorSpace_ColorSpace_ARRAYSIZE = ImageFeatureType_ColorSpace_ColorSpace_MAX + 1;
 
 enum ArrayFeatureType_ArrayDataType {
@@ -115,6 +116,7 @@ enum ArrayFeatureType_ArrayDataType {
   ArrayFeatureType_ArrayDataType_FLOAT32 = 65568,
   ArrayFeatureType_ArrayDataType_DOUBLE = 65600,
   ArrayFeatureType_ArrayDataType_INT32 = 131104,
+  ArrayFeatureType_ArrayDataType_FLOAT16 = 65552,
   ArrayFeatureType_ArrayDataType_ArrayFeatureType_ArrayDataType_INT_MIN_SENTINEL_DO_NOT_USE_ = ::google::protobuf::kint32min,
   ArrayFeatureType_ArrayDataType_ArrayFeatureType_ArrayDataType_INT_MAX_SENTINEL_DO_NOT_USE_ = ::google::protobuf::kint32max
 };
@@ -762,6 +764,8 @@ class ImageFeatureType : public ::google::protobuf::MessageLite /* @@protoc_inse
     ImageFeatureType_ColorSpace_RGB;
   static const ColorSpace BGR =
     ImageFeatureType_ColorSpace_BGR;
+  static const ColorSpace GRAYSCALE_FLOAT16 =
+    ImageFeatureType_ColorSpace_GRAYSCALE_FLOAT16;
   static inline bool ColorSpace_IsValid(int value) {
     return ImageFeatureType_ColorSpace_IsValid(value);
   }
@@ -1175,6 +1179,8 @@ class ArrayFeatureType : public ::google::protobuf::MessageLite /* @@protoc_inse
     ArrayFeatureType_ArrayDataType_DOUBLE;
   static const ArrayDataType INT32 =
     ArrayFeatureType_ArrayDataType_INT32;
+  static const ArrayDataType FLOAT16 =
+    ArrayFeatureType_ArrayDataType_FLOAT16;
   static inline bool ArrayDataType_IsValid(int value) {
     return ArrayFeatureType_ArrayDataType_IsValid(value);
   }
diff --git a/mlmodel/build/format/FeatureTypes_enums.h b/mlmodel/build/format/FeatureTypes_enums.h
index 5ec789c67..3095f492d 100644
--- a/mlmodel/build/format/FeatureTypes_enums.h
+++ b/mlmodel/build/format/FeatureTypes_enums.h
@@ -5,6 +5,7 @@ enum MLColorSpace: int {
     MLColorSpaceGRAYSCALE = 10,
     MLColorSpaceRGB = 20,
     MLColorSpaceBGR = 30,
+    MLColorSpaceGRAYSCALE_FLOAT16 = 40,
 };
 
 enum MLImageFeatureTypeSizeFlexibility: int {
@@ -31,6 +32,7 @@ enum MLArrayDataType: int {
     MLArrayDataTypeFLOAT32 = 65568,
     MLArrayDataTypeDOUBLE = 65600,
     MLArrayDataTypeINT32 = 131104,
+    MLArrayDataTypeFLOAT16 = 65552,
 };
 
 enum MLArrayFeatureTypeShapeFlexibility: int {
diff --git a/mlmodel/format/FeatureTypes.proto b/mlmodel/format/FeatureTypes.proto
index 8711ac7de..375e0af8b 100644
--- a/mlmodel/format/FeatureTypes.proto
+++ b/mlmodel/format/FeatureTypes.proto
@@ -36,9 +36,10 @@ message ImageFeatureType {
     // Assumes raw (decompressed) format
     enum ColorSpace {
         INVALID_COLOR_SPACE = 0;
-        GRAYSCALE = 10; //  8 bits per pixel
-        RGB = 20;       // 32 bits per pixel: RGBA with A channel ignored
-        BGR = 30;       // 32 bits per pixel: BGRA with A channel ignored
+        GRAYSCALE = 10;         //  8 bits per pixel
+        RGB = 20;               // 32 bits per pixel: RGBA with A channel ignored
+        BGR = 30;               // 32 bits per pixel: BGRA with A channel ignored
+        GRAYSCALE_FLOAT16 = 40; // 16 bits float per pixel
     }
 
     message ImageSize {
@@ -108,6 +109,7 @@ message ArrayFeatureType {
         FLOAT32 = 65568; // 0x10000 | 32
         DOUBLE = 65600;  // 0x10000 | 64
         INT32 = 131104;  // 0x20000 | 32
+        FLOAT16 = 65552; // 0x10000 | 16
     }
 
     // The required or default shape
diff --git a/mlmodel/format/Model.proto b/mlmodel/format/Model.proto
index ae1518153..bc3b9bf57 100644
--- a/mlmodel/format/Model.proto
+++ b/mlmodel/format/Model.proto
@@ -251,6 +251,9 @@ message SerializedModel {
  * - Core ML Audio Feature Print
  * - new type of model: mlprogram (MILSpec.Program)
  *
+ * 7 : iOS 16, macOS 13, tvOS 16, watchOS 9 (Core ML 6)
+ * - FLOAT16 array data type
+ * - GRAYSCALE_FLOAT16 image color space.
  */
 message Model {
     int32 specificationVersion = 1;
diff --git a/mlmodel/src/DataType.cpp b/mlmodel/src/DataType.cpp
index 742f2eb73..3e9647876 100644
--- a/mlmodel/src/DataType.cpp
+++ b/mlmodel/src/DataType.cpp
@@ -154,6 +154,8 @@ FeatureType FeatureType::T() { return FeatureType(U); }
                 return "Double";
             case Specification::ArrayFeatureType_ArrayDataType_FLOAT32:
                 return "Float32";
+            case Specification::ArrayFeatureType_ArrayDataType_FLOAT16:
+                return "Float16";
             case Specification::ArrayFeatureType_ArrayDataType_INVALID_ARRAY_DATA_TYPE:
             case Specification::ArrayFeatureType_ArrayDataType_ArrayFeatureType_ArrayDataType_INT_MAX_SENTINEL_DO_NOT_USE_:
             case Specification::ArrayFeatureType_ArrayDataType_ArrayFeatureType_ArrayDataType_INT_MIN_SENTINEL_DO_NOT_USE_:
@@ -180,6 +182,8 @@ FeatureType FeatureType::T() { return FeatureType(U); }
                 return "RGB";
             case Specification::ImageFeatureType_ColorSpace_GRAYSCALE:
                 return "Grayscale";
+            case Specification::ImageFeatureType_ColorSpace_GRAYSCALE_FLOAT16:
+                return "Grayscale16Half";
             case Specification::ImageFeatureType_ColorSpace_ImageFeatureType_ColorSpace_INT_MAX_SENTINEL_DO_NOT_USE_:
             case Specification::ImageFeatureType_ColorSpace_ImageFeatureType_ColorSpace_INT_MIN_SENTINEL_DO_NOT_USE_:
             case Specification::ImageFeatureType_ColorSpace_INVALID_COLOR_SPACE:
@@ -462,7 +466,8 @@ FeatureType FeatureType::T() { return FeatureType(U); }
                 dict["width"] = std::to_string(defaultSize[0]);
                 dict["height"] = std::to_string(defaultSize[1]);
                 dict["colorspace"] = colorSpaceToString(params.colorspace());
-                dict["isColor"] = params.colorspace() == Specification::ImageFeatureType_ColorSpace_GRAYSCALE ? "0" : "1";
+                dict["isColor"] = (params.colorspace() == Specification::ImageFeatureType_ColorSpace_GRAYSCALE ||
+                                   params.colorspace() == Specification::ImageFeatureType_ColorSpace_GRAYSCALE_FLOAT16) ? "0" : "1";
                 dict["hasSizeFlexibility"] = params.SizeFlexibility_case() != Specification::ImageFeatureType::SIZEFLEXIBILITY_NOT_SET ? "1" : "0";
                 switch (params.SizeFlexibility_case()) {
                     case Specification::ImageFeatureType::kEnumeratedSizes: {
diff --git a/mlmodel/src/Globals.hpp b/mlmodel/src/Globals.hpp
index 86401a705..0954a4dbf 100644
--- a/mlmodel/src/Globals.hpp
+++ b/mlmodel/src/Globals.hpp
@@ -54,7 +54,12 @@ namespace CoreML {
     // - Sound Print of Audio Feature Print
     static const int32_t MLMODEL_SPECIFICATION_VERSION_IOS15 = 6;
 
-    static const int32_t MLMODEL_SPECIFICATION_VERSION_NEWEST = MLMODEL_SPECIFICATION_VERSION_IOS15;
+    // version 7:
+    // - FLOAT16 array data type
+    // - GRAYSCALE_FLOAT16 image color space.
+    static const int32_t MLMODEL_SPECIFICATION_VERSION_IOS16 = 7;
+
+    static const int32_t MLMODEL_SPECIFICATION_VERSION_NEWEST = MLMODEL_SPECIFICATION_VERSION_IOS16;
 
 }
 
diff --git a/mlmodel/src/LayerShapeConstraints.cpp b/mlmodel/src/LayerShapeConstraints.cpp
index 6d019d18a..608ce4429 100644
--- a/mlmodel/src/LayerShapeConstraints.cpp
+++ b/mlmodel/src/LayerShapeConstraints.cpp
@@ -568,7 +568,8 @@ void ShapeConstraint::updateConstraint(const Specification::FeatureType& type) {
     if (type.Type_case() == Specification::FeatureType::kImageType) {
 
         // Handle the number of channels first
-        if (type.imagetype().colorspace() == Specification::ImageFeatureType_ColorSpace_GRAYSCALE)
+        if (type.imagetype().colorspace() == Specification::ImageFeatureType_ColorSpace_GRAYSCALE ||
+            type.imagetype().colorspace() == Specification::ImageFeatureType_ColorSpace_GRAYSCALE_FLOAT16 )
             setChannel(1);
         else {
             setChannel(3);
diff --git a/mlmodel/src/TreeEnsembleCommon.cpp b/mlmodel/src/TreeEnsembleCommon.cpp
index 377386b6c..bd36a9933 100644
--- a/mlmodel/src/TreeEnsembleCommon.cpp
+++ b/mlmodel/src/TreeEnsembleCommon.cpp
@@ -244,7 +244,10 @@ namespace CoreML { namespace TreeEnsembles {
                     // Set up the false child node.
                     {
                         auto false_child_node = _get_node( {n->tree_id, n->false_child_node_id} );
-
+                        if (nullptr == false_child_node) {
+                            continue; // Press on for further validation. Will trigger fatality in null check below at "This indicates there are logic errors above fooling us up; abort."
+                        }
+                        
                         if(false_child_node == n) {
                             std::ostringstream ss;
                             ss << "False child and parent have same ID (TreeID=" << n->tree_id
@@ -269,7 +272,10 @@ namespace CoreML { namespace TreeEnsembles {
                     // Set up the true child node.
                     {
                         auto true_child_node = _get_node( {n->tree_id, n->true_child_node_id} );
-
+                        if (nullptr == true_child_node) {
+                            continue; // Press on for further validation. Will trigger fatality in null check below at "This indicates there are logic errors above fooling us up; abort."
+                        }
+                        
                         if(true_child_node == n) {
                             std::ostringstream ss;
                             ss << "True child and parent have same ID (TreeID=" << n->tree_id
diff --git a/mlmodel/src/Utils.cpp b/mlmodel/src/Utils.cpp
index 82dd35f0e..04501a89d 100644
--- a/mlmodel/src/Utils.cpp
+++ b/mlmodel/src/Utils.cpp
@@ -122,6 +122,10 @@ void CoreML::downgradeSpecificationVersion(Specification::Model *pModel) {
         pModel->set_specificationversion(MLMODEL_SPECIFICATION_VERSION_NEWEST);
     }
 
+    if (pModel->specificationversion() == MLMODEL_SPECIFICATION_VERSION_IOS16 && !hasIOS16Features(*pModel)) {
+        pModel->set_specificationversion(MLMODEL_SPECIFICATION_VERSION_IOS15);
+    }
+
     if (pModel->specificationversion() == MLMODEL_SPECIFICATION_VERSION_IOS15 && !hasIOS15Features(*pModel)) {
         pModel->set_specificationversion(MLMODEL_SPECIFICATION_VERSION_IOS14);
     }
@@ -304,6 +308,59 @@ bool CoreML::hasFlexibleShapes(const Specification::Model& model) {
     return false;
 }
 
+// Reports whether any model input or output is a multiarray whose dataType is FLOAT16.
+bool CoreML::hasFloat16MultiArray(const Specification::Model& model) {
+    const auto float16 = Specification::ArrayFeatureType_ArrayDataType_FLOAT16;
+    for (const auto& feature : model.description().input()) {
+        if (feature.type().Type_case() == Specification::FeatureType::kMultiArrayType &&
+            feature.type().multiarraytype().datatype() == float16) {
+            return true;
+        }
+    }
+
+    for (const auto& feature : model.description().output()) {
+        if (feature.type().Type_case() == Specification::FeatureType::kMultiArrayType &&
+            feature.type().multiarraytype().datatype() == float16) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+// Reports whether an mlProgram model's "main" function declares the "CoreML6" opset.
+bool CoreML::hasCoreML6Opsets(const Specification::Model& model) {
+    if (model.Type_case() != Specification::Model::kMlProgram) {
+        return false;
+    }
+    const auto& functions = model.mlprogram().functions();
+    const auto entry = functions.find("main");
+    if (entry == functions.end()) {
+        return false;
+    }
+    return entry->second.opset() == "CoreML6";
+}
+
+// Reports whether any image-typed model input or output uses the GRAYSCALE_FLOAT16 color space.
+bool CoreML::hasGrayscaleFloat16Image(const Specification::Model& model) {
+    for (const auto& input: model.description().input()) {
+        if (input.type().Type_case() == Specification::FeatureType::kImageType &&
+            input.type().imagetype().colorspace() == Specification::ImageFeatureType_ColorSpace_GRAYSCALE_FLOAT16) {
+            return true;
+        }
+    }
+
+    for (const auto& output: model.description().output()) {
+        // Outputs must be matched on kImageType too: a multiarray feature never carries an image color space.
+        if (output.type().Type_case() == Specification::FeatureType::kImageType &&
+            output.type().imagetype().colorspace() == Specification::ImageFeatureType_ColorSpace_GRAYSCALE_FLOAT16) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
 bool CoreML::hasIOS11_2Features(const Specification::Model& model) {
     bool result = false;
     switch (model.Type_case()) {
@@ -568,6 +625,20 @@ bool CoreML::hasIOS15Features(const Specification::Model& model) {
     return false;
 }
 
+// Reports whether the model uses any feature that requires specification
+// version 7 (MLMODEL_SPECIFICATION_VERSION_IOS16).
+bool CoreML::hasIOS16Features(const Specification::Model& model) {
+    // Features introduced with iOS 16:
+    //  - FLOAT16 array data type
+    //  - GRAYSCALE_FLOAT16 image color space
+    //  - CoreML6 opsets for mlProgram models
+    //
+    // The checks short-circuit, so later ones run only when earlier ones fail.
+    return hasFloat16MultiArray(model) ||
+           hasGrayscaleFloat16Image(model) ||
+           hasCoreML6Opsets(model);
+}
+
 bool CoreML::hasCustomModel(const Specification::Model& model) {
     return (model.Type_case() == Specification::Model::kCustomModel);
 }
diff --git a/mlmodel/src/Utils.hpp b/mlmodel/src/Utils.hpp
index 5c20e6b34..0a992b44d 100644
--- a/mlmodel/src/Utils.hpp
+++ b/mlmodel/src/Utils.hpp
@@ -110,6 +110,7 @@ namespace CoreML {
     bool hasIOS13Features(const Specification::Model& model);
     bool hasIOS14Features(const Specification::Model& model);
     bool hasIOS15Features(const Specification::Model& model);
+    bool hasIOS16Features(const Specification::Model& model);
 
     typedef std::pair<std::string,std::string> StringPair;
     // Returns a vector of pairs of strings, one pair per custom layer instance
@@ -143,6 +144,9 @@ namespace CoreML {
     bool hasIOS14NeuralNetworkFeatures(const Specification::Model& model);
     bool hasDefaultValueForOptionalInputs(const Specification::Model& model);
     bool hasFloat32InputsOrOutputsForNonmaxSuppression(const Specification::Model& model);
+    bool hasFloat16MultiArray(const Specification::Model& model);
+    bool hasGrayscaleFloat16Image(const Specification::Model& model);
+    bool hasCoreML6Opsets(const Specification::Model& model);
 
     bool hasModelOrSubModelProperty(const Specification::Model& model, const std::function<bool(const Specification::Model&)> &boolFunc);
 
diff --git a/mlmodel/src/Validation/BayesianProbitRegressionValidator.cpp b/mlmodel/src/Validation/BayesianProbitRegressionValidator.cpp
index a82ed1933..9393347ee 100644
--- a/mlmodel/src/Validation/BayesianProbitRegressionValidator.cpp
+++ b/mlmodel/src/Validation/BayesianProbitRegressionValidator.cpp
@@ -24,6 +24,8 @@ namespace CoreML {
                 return "MLArrayDataTypeDOUBLE";
             case MLArrayDataTypeINT32:
                 return "MLArrayDataTypeINT32";
+            case MLArrayDataTypeFLOAT16:
+                return "MLArrayDataTypeFLOAT16";
         }
     }
 
diff --git a/mlmodel/src/Validation/InterfaceValidators.cpp b/mlmodel/src/Validation/InterfaceValidators.cpp
index a0e877f9c..61d948836 100644
--- a/mlmodel/src/Validation/InterfaceValidators.cpp
+++ b/mlmodel/src/Validation/InterfaceValidators.cpp
@@ -157,11 +157,20 @@ namespace CoreML {
                     case Specification::ArrayFeatureType_ArrayDataType_DOUBLE:
                     case Specification::ArrayFeatureType_ArrayDataType_FLOAT32:
                     case Specification::ArrayFeatureType_ArrayDataType_INT32:
+                        break;
+                    case Specification::ArrayFeatureType_ArrayDataType_FLOAT16:
+                        if (modelVersion < MLMODEL_SPECIFICATION_VERSION_IOS16) {
+                            return Result(ResultType::INVALID_MODEL_INTERFACE,
+                                          "Description of multiarray feature '" + desc.name() +
+                                          "' has FLOAT16 dataType, which is only valid in specification version >= " += std::to_string(MLMODEL_SPECIFICATION_VERSION_IOS16)+
+                                          ". This model has version " + std::to_string(modelVersion));
+                        }
+
                         break;
                     default:
                         return Result(ResultType::INVALID_MODEL_INTERFACE,
                                       "Description of multiarray feature '" + desc.name() + "' has an invalid or unspecified dataType. "
-                                      "It must be specified as DOUBLE, FLOAT32 or INT32");
+                                      "It must be specified as DOUBLE, FLOAT32, FLOAT16 or INT32");
                 }
 
                 switch (type.multiarraytype().defaultOptionalValue_case()) {
@@ -173,7 +182,8 @@ namespace CoreML {
                         }
                         break;
                     case CoreML::Specification::ArrayFeatureType::kFloatDefaultValue:
-                        if (type.multiarraytype().datatype() != Specification::ArrayFeatureType_ArrayDataType_FLOAT32){
+                        if (type.multiarraytype().datatype() != Specification::ArrayFeatureType_ArrayDataType_FLOAT32 &&
+                            type.multiarraytype().datatype() != Specification::ArrayFeatureType_ArrayDataType_FLOAT16){
                             return Result(ResultType::INVALID_MODEL_INTERFACE,
                                           "Description of multiarray feature '" + desc.name() + "' has mistmatch"
                                           " between dataType and the type of default optional value.");
@@ -308,6 +318,14 @@ namespace CoreML {
                     case Specification::ImageFeatureType_ColorSpace_RGB:
                     case Specification::ImageFeatureType_ColorSpace_BGR:
                         break;
+                    case Specification::ImageFeatureType_ColorSpace_GRAYSCALE_FLOAT16:
+                        if (modelVersion < MLMODEL_SPECIFICATION_VERSION_IOS16) {
+                            return Result(ResultType::INVALID_MODEL_INTERFACE,
+                                          "Description of image feature '" + desc.name() +
+                                          "' has GRAYSCALE_FLOAT16 colorspace, which is only valid in specification version >= " += std::to_string(MLMODEL_SPECIFICATION_VERSION_IOS16)+
+                                          ". This model has version " + std::to_string(modelVersion));
+                        }
+                        break;
                     default:
                         return Result(ResultType::INVALID_MODEL_INTERFACE,
                                       "Description of image feature '" + desc.name() +
diff --git a/mlmodel/src/Validation/NeuralNetwork/NeuralNetworkShapes.cpp b/mlmodel/src/Validation/NeuralNetwork/NeuralNetworkShapes.cpp
index 58be617d5..0883c37f5 100644
--- a/mlmodel/src/Validation/NeuralNetwork/NeuralNetworkShapes.cpp
+++ b/mlmodel/src/Validation/NeuralNetwork/NeuralNetworkShapes.cpp
@@ -1931,7 +1931,9 @@ NeuralNetworkShaper::NeuralNetworkShaper(const Specification::ModelDescription&
                 // sequence constraint here is unbounded
                 // batch is unbounded
                 // other three read from the constraint as is -- later to be updated with flexibility
-                if (desc.type().imagetype().colorspace() == Specification::ImageFeatureType_ColorSpace_GRAYSCALE)
+                auto colorspace = desc.type().imagetype().colorspace();
+                if (colorspace == Specification::ImageFeatureType_ColorSpace_GRAYSCALE ||
+                    colorspace == Specification::ImageFeatureType_ColorSpace_GRAYSCALE_FLOAT16)
                     constraint.setChannel(1);
                 else {
                     constraint.setChannel(3);
diff --git a/mlmodel/src/Validation/SoundAnalysisPreprocessingValidator.cpp b/mlmodel/src/Validation/SoundAnalysisPreprocessingValidator.cpp
index 1c3949ef3..39a4fa784 100644
--- a/mlmodel/src/Validation/SoundAnalysisPreprocessingValidator.cpp
+++ b/mlmodel/src/Validation/SoundAnalysisPreprocessingValidator.cpp
@@ -27,6 +27,8 @@ namespace CoreML {
                 return "MLArrayDataTypeDOUBLE";
             case MLArrayDataTypeINT32:
                 return "MLArrayDataTypeINT32";
+            case MLArrayDataTypeFLOAT16:
+                return "MLArrayDataTypeFLOAT16";
         }
     }
 
diff --git a/mlmodel/tests/InterfaceTests.cpp b/mlmodel/tests/InterfaceTests.cpp
index ff3593304..ebee58a26 100644
--- a/mlmodel/tests/InterfaceTests.cpp
+++ b/mlmodel/tests/InterfaceTests.cpp
@@ -144,6 +144,8 @@ int testFeatureDescriptions() {
     ML_ASSERT_GOOD(validateFeatureDescription(*feature,true));
     feature->mutable_type()->mutable_imagetype()->set_colorspace(::CoreML::Specification::ImageFeatureType_ColorSpace_GRAYSCALE);
     ML_ASSERT_GOOD(validateFeatureDescription(*feature,true));
+    feature->mutable_type()->mutable_imagetype()->set_colorspace(::CoreML::Specification::ImageFeatureType_ColorSpace_GRAYSCALE_FLOAT16);
+    ML_ASSERT_GOOD(validateFeatureDescription(*feature, MLMODEL_SPECIFICATION_VERSION_IOS16, true));
     feature->mutable_type()->mutable_imagetype()->set_colorspace(::CoreML::Specification::ImageFeatureType_ColorSpace_INVALID_COLOR_SPACE);
     ML_ASSERT_BAD(validateFeatureDescription(*feature,true));
 
diff --git a/reqs/build.pip b/reqs/build.pip
index 54a4425ea..74fe98c7d 100644
--- a/reqs/build.pip
+++ b/reqs/build.pip
@@ -1,5 +1,9 @@
 numpy<1.20; platform_machine != "arm64"
-protobuf
+
+# rdar://93977023
+protobuf<=3.20.1; python_version < "3.7"
+protobuf==3.20.1; python_version >= "3.7"
+
 pytest
 six
 sympy
diff --git a/reqs/docs.pip b/reqs/docs.pip
index 769f9e57b..9ef7061e2 100644
--- a/reqs/docs.pip
+++ b/reqs/docs.pip
@@ -10,7 +10,11 @@ idna
 imagesize
 numpy
 numpydoc
-protobuf
+
+# rdar://93977023
+protobuf<=3.20.1; python_version < "3.7"
+protobuf==3.20.1; python_version >= "3.7"
+
 pytz
 six
 snowballstemmer
diff --git a/reqs/test.pip b/reqs/test.pip
index c5dbd1c60..425baa6fc 100644
--- a/reqs/test.pip
+++ b/reqs/test.pip
@@ -1,12 +1,10 @@
 boto3==1.14.8
 configparser
-Keras==2.1.6; python_version < "3.8"
-h5py==2.10.0
+h5py==2.10.0; platform_machine != "arm64"
 future
 numpy>1.18.5
 libsvm; python_version >= "3.6"
 olefile==0.44
-onnx==1.6.0; python_version <= "3.7"
 pandas
 parameterized==0.8.1
 pillow
@@ -14,16 +12,13 @@ pytest; python_version < '3.7'
 pytest==5.3.4; python_version >= '3.7'
 pytest-cov
 pytest-sugar
-scikit-learn==0.19.2; python_version <= '3.7'
-scikit-learn; python_version > '3.7'
+scikit-learn==0.19.2
 scipy > 1.4
 six
 sympy > 1.6
-tensorflow==1.14.0; python_version < '3.8'
-torch==1.5.0; python_version == '3.5'
-torch==1.10.2; python_version > '3.5'
-torchvision==0.6.1; python_version == '3.5'
-torchvision==0.11.3; python_version > '3.5'
+tensorflow==1.15.0; python_version < '3.8'
+torch==1.11.0
+torchvision==0.12.0
 xgboost==1.4.2
 mock
 wrapt
diff --git a/reqs/test_tf2.pip b/reqs/test_tf2.pip
index 08efa4647..9f3811145 100644
--- a/reqs/test_tf2.pip
+++ b/reqs/test_tf2.pip
@@ -1,10 +1,8 @@
-tensorflow==2.1.0; python_version <= '2.7'
+gast==0.5.0
+tensorflow==2.8.0
+tensorflow-estimator==2.8.0
+keras==2.8.0
 
-tensorflow==2.6.2; python_version >= '3.5'
-tensorflow-estimator==2.6.0; python_version >= '3.5'
-keras==2.6.0; python_version >= '3.5'
-
-tensorflow-addons==0.7.1; python_version <= '2.7'
-tensorflow-addons==0.12.1; python_version >= '3.5'
-tensorflow-hub==0.9.0
+tensorflow-addons==0.16.1
+tensorflow-hub==0.12.0
 transformers==2.10.0; python_version > '3.6'
diff --git a/scripts/test.sh b/scripts/test.sh
index d2415d2d3..f151d8b60 100755
--- a/scripts/test.sh
+++ b/scripts/test.sh
@@ -77,10 +77,10 @@ echo
 
 if [[ $WHEEL_PATH == "" ]]; then
     cd ..
-    $PIP_EXECUTABLE install -e ${COREMLTOOLS_NAME} --upgrade
+    $PIP_EXECUTABLE install -e ${COREMLTOOLS_NAME}  --upgrade --no-deps
     cd ${COREMLTOOLS_NAME}
 else
-    $PIP_EXECUTABLE install $~WHEEL_PATH --upgrade --force-reinstall
+    $PIP_EXECUTABLE install $~WHEEL_PATH --upgrade --no-deps --force-reinstall
 fi
 
 # Install dependencies if specified
diff --git a/setup.py b/setup.py
index f0529b6d2..625f975a2 100755
--- a/setup.py
+++ b/setup.py
@@ -70,7 +70,7 @@
     },
     install_requires=[
         "numpy >= 1.14.5",
-        "protobuf >= 3.1.0",
+        "protobuf >= 3.1.0, <= 3.20.1",
         "sympy",
         "tqdm",
         "packaging",
@@ -80,7 +80,6 @@
         "Intended Audience :: Developers",
         "Operating System :: MacOS :: MacOS X",
         "Operating System :: POSIX :: Linux",
-        "Programming Language :: Python :: 3.5",
         "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",

From c58abbde9211f15829a395dfa7a1017a247d6fcb Mon Sep 17 00:00:00 2001
From: Toby Roseman <troseman@apple.com>
Date: Tue, 7 Jun 2022 12:54:45 -0700
Subject: [PATCH 17/54] Add 6.0b1 install instructions to README.md (#1510)

---
 README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/README.md b/README.md
index 2b925c00e..183b0b892 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,14 @@ To install coremltools, use the following command:
 pip install coremltools
 ```
 
+## Version 6
+
+To install [the first beta of coremltools 6.0](https://github.com/apple/coremltools/releases/tag/6.0b1) use the following command:
+
+```shell
+pip install coremltools==6.0b1
+```
+
 
 ## Core ML
 

From df41d900768ce12177bc296819fc358af5da9ca7 Mon Sep 17 00:00:00 2001
From: Arjun Sharda <77706434+ajsharda17@users.noreply.github.com>
Date: Tue, 7 Jun 2022 15:24:48 -0500
Subject: [PATCH 18/54] Update README.md (#1511)

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 183b0b892..becd1ff50 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 [Core ML Tools](https://coremltools.readme.io/docs)
 =======================
 
-Use *coremltools* to convert machine learning models from third-party libraries to the Core ML format. The Python package contains the supporting tools for converting models from training libraries such as the following:
+Use *coremltools* to convert machine learning models from third-party libraries to the Core ML format. This Python package contains the supporting tools for converting models from training libraries such as the following:
 
 * [TensorFlow 1.x](https://www.tensorflow.org/versions/r1.15/api_docs/python/tf)
 * [TensorFlow 2.x](https://www.tensorflow.org/api_docs)
@@ -30,7 +30,7 @@ The coremltools 5 package offers several performance improvements over previous
 * [Core ML model package](https://coremltools.readme.io/docs/new-in-coremltools#save-a-core-ml-model-package): A new model container format that separates the model into components and offers more flexible metadata editing and better source control.
 * [ML program](https://coremltools.readme.io/docs/ml-programs): A new model type that represents computation as programmatic instructions, offers more control over the precision of its intermediate tensors and better performance. 
 
-To install coremltools, use the following command:
+To install coremltools, use the following command in your terminal:
 
 ```shell
 pip install coremltools
@@ -38,7 +38,7 @@ pip install coremltools
 
 ## Version 6
 
-To install [the first beta of coremltools 6.0](https://github.com/apple/coremltools/releases/tag/6.0b1) use the following command:
+To install [the first beta of coremltools 6.0](https://github.com/apple/coremltools/releases/tag/6.0b1) use the following command in your terminal:
 
 ```shell
 pip install coremltools==6.0b1

From 573f103de6b0df1a34fe6f80a3453753d2660b7e Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Wed, 8 Jun 2022 12:02:56 +0200
Subject: [PATCH 19/54] remove logicaland op and alias new logical_and op

---
 .../converters/mil/frontend/torch/ops.py       | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 75546452c..d3145bcda 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -4108,7 +4108,7 @@ def is_floating_point(context, node):
     is_float = types.is_float(inputs[0].dtype)
     context.add(mb.const(val=is_float, name=node.name))
 
-@register_torch_op()
+@register_torch_op(torch_alias=["__and_", '__and__'])
 def logical_and(context, node):
     inputs = _get_inputs(context, node, expected=2)
     x, y = inputs
@@ -4545,7 +4545,7 @@ def scatter_add(context, node):
     inputs = _get_inputs(context, node)
     _scatter(context, inputs, 'add', node.name)
 
-@register_torch_op()
+@register_torch_op
 def roi_align(context, node):
     """
     https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L2239
@@ -4600,12 +4600,12 @@ def roi_align(context, node):
 
     context.add(x, torch_name=node.outputs[0])
 
-@register_torch_op()
+@register_torch_op
 def numel(context, node):
     inputs = _get_inputs(context, node, expected=1)
     context.add(mb.reduce_prod(x=inputs[0], name=node.name), torch_name=node.outputs[0])
 
-@register_torch_op()
+@register_torch_op
 def nms(context, node):
     inputs = _get_inputs(context, node)
     boxes = inputs[0]
@@ -4685,7 +4685,7 @@ def repeat_interleave(context, node):
 
     context.add(reshape, node.name)
 
-@register_torch_op(override=True)
+@register_torch_op
 def narrow(context, node):
     data, dim, start, length = _get_inputs(context, node, expected=4)
     data_shape = mb.shape(x=data).val
@@ -4695,11 +4695,3 @@ def narrow(context, node):
     end[dim.val] = start.val+length.val
     out = mb.slice_by_index(x=data, begin=begin, end=end)
     context.add(out, torch_name=node.name)
-
-@register_torch_op(torch_alias=["__and_", '__and__'])
-def logicaland(context, node):
-    inputs = _get_inputs(context, node, expected=2)
-    x, y = inputs
-    x = mb.cast(x=x, dtype="bool")
-    y = mb.cast(x=y, dtype="bool")
-    context.add(mb.logical_and(x=x, y=y, name=node.name))

From 8834011f7eb988dfb7f184a3de955023b91364a8 Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Wed, 8 Jun 2022 12:30:36 +0200
Subject: [PATCH 20/54] consistent use of double quotes

---
 coremltools/converters/mil/frontend/torch/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index d3145bcda..39088740c 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -4108,7 +4108,7 @@ def is_floating_point(context, node):
     is_float = types.is_float(inputs[0].dtype)
     context.add(mb.const(val=is_float, name=node.name))
 
-@register_torch_op(torch_alias=["__and_", '__and__'])
+@register_torch_op(torch_alias=["__and_", "__and__"])
 def logical_and(context, node):
     inputs = _get_inputs(context, node, expected=2)
     x, y = inputs

From 12b3cc1080e24f864662d8592d7d6b00ae83f950 Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Wed, 8 Jun 2022 12:41:32 +0200
Subject: [PATCH 21/54] remove link to crop and resize layer in NN

---
 coremltools/converters/mil/frontend/torch/ops.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 39088740c..a1f91ac7d 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -4547,9 +4547,6 @@ def scatter_add(context, node):
 
 @register_torch_op
 def roi_align(context, node):
-    """
-    https://github.com/apple/coremltools/blob/655b3be5cc0d42c3c4fa49f0f0e4a93a26b3e492/mlmodel/format/NeuralNetwork.proto#L2239
-    """
     inputs = _get_inputs(context, node)
 
     x = context[node.inputs[0]]

From bdcfe40b26b40897d049b306655698ea2faec645 Mon Sep 17 00:00:00 2001
From: Tony Bove <70229264+tonybove-apple@users.noreply.github.com>
Date: Wed, 8 Jun 2022 11:36:36 -1000
Subject: [PATCH 22/54] Docs for v6 with layer_norm fix (#1514)

---
 coremltools/converters/mil/mil/ops/defs/normalization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/coremltools/converters/mil/mil/ops/defs/normalization.py b/coremltools/converters/mil/mil/ops/defs/normalization.py
index de6174580..7c7a46d5e 100644
--- a/coremltools/converters/mil/mil/ops/defs/normalization.py
+++ b/coremltools/converters/mil/mil/ops/defs/normalization.py
@@ -229,13 +229,13 @@ class layer_norm(Operation):
         * Dimensions to perform layer normalization.
         * Default is ``None`` (all dimensions).
 
-    gamma: const tensor<[K], T> (Optional)
+    gamma: const tensor<\*?, T> (Optional)
         * if provided, the shape must be be ``x.shape[axes]``. For instance, if
           input ``x`` with shape ``(3,4,5,6)`` and ``axes = [2,3]``, gamma must have
           shape ``(5,6)``.
         * Default is all ones.
 
-    beta: const tensor<[K], T> (Optional)
+    beta: const tensor<\*?, T> (Optional)
         * Same shape as gamma.
         * Default is all zeros.
 

From 4508f196d5f0f00a2b01b27522a0d857cd13b235 Mon Sep 17 00:00:00 2001
From: Arjun Sharda <77706434+ajsharda17@users.noreply.github.com>
Date: Wed, 8 Jun 2022 16:53:11 -0500
Subject: [PATCH 23/54] Update ---bug-report.md (#1513)

---
 .github/ISSUE_TEMPLATE/---bug-report.md | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/---bug-report.md b/.github/ISSUE_TEMPLATE/---bug-report.md
index 7ca71fdf8..e0c2ba750 100644
--- a/.github/ISSUE_TEMPLATE/---bug-report.md
+++ b/.github/ISSUE_TEMPLATE/---bug-report.md
@@ -7,10 +7,9 @@ assignees: ''
 
 ---
 
-## 🐞Describe the bug
-
-- Only create an issue here for bugs in the coremltools Python package. If this is a bug with the Core ML Framework or Xcode, please submit your bug here: https://developer.apple.com/bug-reporting/
-- A clear and brief description of the bug.
+## 🐞Describing the bug
+- Make sure to only create an issue here for bugs in the coremltools Python package. If this is a bug with the Core ML Framework or Xcode, please submit your bug here: https://developer.apple.com/bug-reporting/
+- Provide a clear and concise description of the bug.
 
 ## Stack Trace
 - If applicable, please paste the complete stack trace.
@@ -20,13 +19,12 @@ assignees: ''
 ```
 # Paste Python code snippet here, complete with any required import statements.
 ```
-- If the model conversion succeeds, but there is a numerical mismatch in predictions, please include code used for comparisons.
+- If the model conversion succeeds, but there is a numerical mismatch in predictions, please include the code used for comparisons.
 
 ## System environment (please complete the following information):
  - coremltools version:
  - OS (e.g. MacOS version or Linux type):
- - Any other relevant version information:
-     - e.g. PyTorch or TensorFlow version
+ - Any other relevant version information (e.g. PyTorch or TensorFlow version):
 
 ## Additional context
-- Add any other context about the problem here.
+- Add anything else about the problem here that you want to share.

From 79441783d613d5138cf7c20ba99a9ecb6b116781 Mon Sep 17 00:00:00 2001
From: Henry Tao <55294647+jakesabathia2@users.noreply.github.com>
Date: Thu, 9 Jun 2022 08:54:37 -0700
Subject: [PATCH 24/54] Fix a bug when destructing coreml model (#1515)

Fix a bug when destructing coreml model (#1515)
---
 coremltools/models/model.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/coremltools/models/model.py b/coremltools/models/model.py
index b656ed23a..16ead447a 100644
--- a/coremltools/models/model.py
+++ b/coremltools/models/model.py
@@ -363,6 +363,8 @@ def __del__(self):
         # Cleanup temporary package upon destruction
         if hasattr(self, 'is_package') and self.is_package \
            and hasattr(self, 'is_temp_package') and self.is_temp_package:
+            if _shutil is None:
+                import shutil as _shutil
             _shutil.rmtree(self.package_path)
 
     @property

From 33564509fe2f03ca257c491f36794b9328825bec Mon Sep 17 00:00:00 2001
From: Tony Bove <70229264+tonybove-apple@users.noreply.github.com>
Date: Thu, 9 Jun 2022 13:11:20 -1000
Subject: [PATCH 25/54] Formatting fixes and compression submenu (#1518)

---
 coremltools/converters/_converters_entry.py   | 282 ++++++++++--------
 coremltools/converters/mil/input_types.py     |  39 ++-
 docs/source/coremltools.models.ml_program.rst |  21 ++
 docs/source/coremltools.models.rst            |  10 +-
 4 files changed, 198 insertions(+), 154 deletions(-)
 create mode 100644 docs/source/coremltools.models.ml_program.rst

diff --git a/coremltools/converters/_converters_entry.py b/coremltools/converters/_converters_entry.py
index 7277bc0bb..1169bc2d2 100644
--- a/coremltools/converters/_converters_entry.py
+++ b/coremltools/converters/_converters_entry.py
@@ -66,18 +66,17 @@ def convert(
     debug=False,
 ):
     """
-    Convert a TensorFlow or PyTorch model to the Core ML model format as either a neural network or an ML program. 
-    To learn about the differences, see
-    `ML Programs <https://coremltools.readme.io/docs/ml-programs>`_.
-    Some parameters and requirements differ by TensorFlow and PyTorch frameworks.
-
+    Convert a TensorFlow or PyTorch model to the Core ML model format as either
+    a neural network or an `ML program <https://coremltools.readme.io/docs/ml-programs>`_.
+    Some parameters and requirements differ for TensorFlow and PyTorch
+    conversions.
 
     Parameters
     ----------
-    
+
     model :
         TensorFlow 1, TensorFlow 2, or PyTorch model in one of the following
-        formats.
+        formats:
 
         * TensorFlow versions 1.x
         
@@ -107,79 +106,94 @@ def convert(
 
     inputs : list of ``TensorType`` or ``ImageType``
 
-        * If "dtype" is specified in ``TensorType`` / ``ImageType``,
-          it will be applied to the input of the converted model.
-
-          e.g.: The following code snippet will produce a CoreML model with Float16 typed inputs.
-          >>> import coremltools as ct
-          >>> mlmodel = ct.convert(keras_model,
-          >>>                      inputs=[ct.TensorType(dtype=np.float16)],
-          >>>                      minimum_deployment_target=ct.target.macOS13)
-
-          e.g.: The following code snippet will produce a CoreML model with Grayscale 16 bit input image type
-          >>> import coremltools as ct
-          >>> # H : image height, W: image width
-          >>> mlmodel = ct.convert(torch_model,
-          >>>                      inputs=[ct.ImageType(shape=(1, 1, H, W),
-          >>>                              color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)],
-          >>>                      minimum_deployment_target=ct.target.macOS13)
-
-        * TensorFlow 1 and 2 (including tf.keras)
-
+        * If you specify ``dtype`` with ``TensorType`` or ``ImageType``, it will
+          be applied to the input of the converted model. For example, the
+          following code snippet will produce a Core ML model with float 16 typed
+          inputs.
+          
+          .. sourcecode:: python
+
+              import coremltools as ct
+              mlmodel = ct.convert(keras_model,
+                                   inputs=[ct.TensorType(dtype=np.float16)],
+                                   minimum_deployment_target=ct.target.macOS13)
+
+        * The following code snippet will produce a Core ML model with the
+          ``GRAYSCALE_FLOAT16`` input image type:
+          
+          .. sourcecode:: python
+
+              import coremltools as ct
+              # H : image height, W: image width
+              mlmodel = ct.convert(torch_model,
+                               inputs=[ct.ImageType(shape=(1, 1, H, W),
+                                       color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)],
+                               minimum_deployment_target=ct.target.macOS13)
+
+        * TensorFlow 1 and 2 (including tf.keras):
             - The ``inputs`` parameter is optional. If not provided, the inputs
-              are placeholder nodes in the model (if the model is frozen graph)
+              are placeholder nodes in the model (if the model is a frozen graph)
               or function inputs (if the model is a ``tf.function``).
             - If ``inputs`` is provided, it must be a flat list.
             - The ``inputs`` must correspond to all or some of the placeholder nodes
               in the TF model.
-            - If ``name`` is specified in ``TensorType`` and ``ImageType``, it
+            - If ``name`` is specified with ``TensorType`` and ``ImageType``, it
               must correspond to a placeholder op in the TF graph. The input names
-              in the converted CoreML model can later be modifed using ``ct.utils.rename_feature`` API.
-            - If ``dtype`` is not specified, it defaults to the dtype of the inputs in the TF model.
-
-        * PyTorch
+              in the converted Core ML model can later be modified using the
+              ``ct.utils.rename_feature`` API.
+            - If ``dtype`` is not specified, it defaults to the ``dtype`` of the
+              inputs in the TF model.
 
+        * PyTorch:
             - The ``inputs`` parameter is required.
-            - Number of elements in the ``inputs`` must match the number of inputs of the pytorch model.
+            - Number of elements in ``inputs`` must match the number of inputs
+              of the PyTorch model.
             - ``inputs`` may be a nested list or tuple.
             - ``TensorType`` and ``ImageType`` must have the ``shape`` specified.
-            - If ``name`` argument is specified in ``TensorType`` / ``ImageType``, the converted
-                CoreML model will have inputs with the same name.
-            - If ``dtype`` is missing, it defaults to float32
+            - If the ``name`` argument is specified with ``TensorType`` or
+              ``ImageType``, the converted Core ML model will have inputs with
+              the same name.
+            - If ``dtype`` is missing, it defaults to float 32.
 
     outputs : list of ``TensorType`` or ``ImageType`` (optional)
 
-        * If "dtype" is specified in ``TensorType`` / ``ImageType``,
-          it will be applied to the output of the converted model.
+        * If you specify ``dtype`` with ``TensorType`` or ``ImageType``,
+          it will be applied to the output of the converted model. For example,
+          to produce float 16 typed inputs and outputs:
+          
+          .. sourcecode:: python
 
-          e.g.: to produce float 16 typed inputs and outputs:
-          >>> import coremltools as ct
-          >>> mlmodel = ct.convert(keras_model,
-          >>>                      inputs=[ct.TensorType(dtype=np.float16)],
-          >>>                      outputs=[ct.TensorType(dtype=np.float16)],
-          >>>                      minimum_deployment_target=ct.target.macOS13)
+              import coremltools as ct
+              mlmodel = ct.convert(keras_model,
+                                   inputs=[ct.TensorType(dtype=np.float16)],
+                                   outputs=[ct.TensorType(dtype=np.float16)],
+                                   minimum_deployment_target=ct.target.macOS13)
 
-          e.g.: to produce Image inputs and outputs:
-          >>> import coremltools as ct
-          >>> # H: image height, W: image width
-          >>> mlmodel = ct.convert(torch_model,
-          >>>                      inputs=[ct.ImageType(shape=(1, 3, H, W), color_layout=ct.colorlayout.RGB)],
-          >>>                      outputs=[ct.ImageType(color_layout=ct.colorlayout.RGB)],
-          >>>                      minimum_deployment_target=ct.target.macOS13)
+        * To produce image inputs and outputs:
+          
+          .. sourcecode:: python
 
-        * TensorFlow 1 and 2 (including tf.keras)
+              import coremltools as ct
+              # H: image height, W: image width
+              mlmodel = ct.convert(torch_model,
+                                   inputs=[ct.ImageType(shape=(1, 3, H, W), color_layout=ct.colorlayout.RGB)],
+                                   outputs=[ct.ImageType(color_layout=ct.colorlayout.RGB)],
+                                   minimum_deployment_target=ct.target.macOS13)
 
-            - If ``outputs`` is not specified, the converter infers outputs from the
-              sink nodes in the graph.
-            - If specified, the ``name`` in ``TensorType`` / ``ImageType`` must correspond
-              to a node in the TF graph. In this case, the model will be converted up to
-              that node.
+        * TensorFlow 1 and 2 (including tf.keras):
 
-        * PyTorch
+            - If ``outputs`` is not specified, the converter infers outputs from 
+              the sink nodes in the graph.
+            - If specified, the ``name`` with ``TensorType`` or ``ImageType``
+              must correspond to a node in the TF graph. In this case, the model
+              will be converted up to that node.
 
-            - If specified, the length of the list must match the number of outputs returned by the
-              torch model
-            - If ``name`` is specified it is applied to the output names of the converted coreml model.
+        * PyTorch:
+
+            - If specified, the length of the list must match the number of
+              outputs returned by the PyTorch model.
+            - If ``name`` is specified, it is applied to the output names of the
+              converted Core ML model.
 
     classifier_config : ClassifierConfig class (optional)
         The configuration if the MLModel is intended to be a classifier.
@@ -187,8 +201,8 @@ def convert(
     minimum_deployment_target : coremltools.target enumeration (optional)
         A member of the ``coremltools.target`` enum.
         The value of this parameter determines the type of the model
-        representation produced by the converter. To learn about the differences between
-        neural networks and ML programs, see
+        representation produced by the converter. To learn about the differences
+        between neural networks and ML programs, see
         `ML Programs <https://coremltools.readme.io/docs/ml-programs>`_.
 
         - The converter produces a neural network (``neuralnetwork``) if:
@@ -226,8 +240,8 @@ def convert(
         - ``'neuralnetwork'``: Returns an MLModel (``coremltools.models.MLModel``)
           containing a NeuralNetwork proto, which is the original Core ML format.
           The model saved from this returned object is executable either on
-          iOS13/macOS10.15/watchOS6/tvOS13 and above, or on
-          iOS14/macOS11/watchOS7/tvOS14 and above, depending on the layers used
+          iOS13/macOS10.15/watchOS6/tvOS13 and newer, or on
+          iOS14/macOS11/watchOS7/tvOS14 and newer, depending on the layers used
           in the model.
         - ``'mlprogram'`` : Returns an MLModel (``coremltools.models.MLModel``)
           containing a MILSpec.Program proto, which is the Core ML program format.
@@ -247,80 +261,83 @@ def convert(
 
     compute_precision : coremltools.precision enumeration or ct.transform.FP16ComputePrecision() (optional)
 
-        Use this argument to control the storage precision of the tensors in the mlprogram.
-
-        Must be one of the following.
-
-        - ``coremltools.precision.FLOAT16`` enum
-            - In this case the following transform is applied to produce a float16 typed program,
-                i.e. a program where all the intermediate float tensors have type float16
-                (for ops that support that type).
-
-              ::
-                 coremltools.transform.FP16ComputePrecision(op_selector=
+        Use this argument to control the storage precision of the tensors in the
+        ML program. Must be one of the following.
+        
+        - ``coremltools.precision.FLOAT16`` enum: The following transform is
+          applied to produce a float 16 program; that is, a program in which all
+          the intermediate float tensors are of type float 16 (for ops that
+          support that type).
+          ::
+              coremltools.transform.FP16ComputePrecision(op_selector=
                                                          lambda op:True)
 
-              The above transform itertes over all the ops. For each op,
-              it looks at its inputs and outputs, and if they are of type float32, ``cast``
-              ops are injected to convert those tensors (aka vars) to float16 type.
+          The above transform iterates through all the ops, looking at each op's
+          inputs and outputs. If they are of type float 32, ``cast``
+          ops are injected to convert those tensors (also known as `vars`) to
+          type float 16.
 
-        - ``coremltools.precision.FLOAT32`` enum
-            - No transform is applied. The original float32 tensor dtype in
-              the source model is preserved. Opt into this option if the default converted model
-              is displaying numerical precision issues.
+        - ``coremltools.precision.FLOAT32`` enum: No transform is applied.
+          
+          The original float32 tensor dtype in the source model is preserved.
+          Opt into this option if the default converted model is displaying
+          numerical precision issues.
 
         - ``coremltools.transform.FP16ComputePrecision(op_selector=...)``
-            - Use this option to control which tensors are cast to float16.
-              Before casting the inputs/outputs of any op from float32 to float16,
-              the op_selector function is invoked on the op object. This function
-              must return a boolean value. By default its set to return True for every op,
-              however, this can be customized.
-            - For example:
-              ::
-                 coremltools.transform.FP16ComputePrecision(op_selector=
+          
+          Use this option to control which tensors are cast to float 16.
+          Before casting the inputs/outputs of any op from float32 to float 16,
+          the op_selector function is invoked on the op object. This function
+          must return a boolean value. By default it returns ``True`` for every op,
+          but you can customize this.
+          
+          For example:
+          ::
+             coremltools.transform.FP16ComputePrecision(op_selector=
                                          lambda op: op.op_type != "linear")
 
-              The above casts all the float32 tensors to be float16, except
-              the input/output tensors to any ``linear`` op. See more examples
-              on this below.
+          The above casts all the float32 tensors to be float 16, except
+          the input/output tensors to any ``linear`` op. See more examples
+          below.
 
-        - ``None``
-            - This is the default.
-            - When ``convert_to="mlprogram"``, compute_precision parameter
+        - ``None``: The default
+            - When ``convert_to="mlprogram"``, the ``compute_precision`` parameter
               defaults to ``coremltools.precision.FLOAT16``.
-            - When ``convert_to="neuralnetwork"``, compute_precision parameter
+            - When ``convert_to="neuralnetwork"``, the ``compute_precision`` parameter
               needs to be ``None`` and has no meaning.
-
-            e.g.: Customize the float16 precision transform to prevent from casting all the "real_div"
-                  ops in the program to float16 precision:
-
-            >>> def skip_real_div_ops(op):
-            >>>     if op.op_type == "real_div":
-            >>>         return False
-            >>>     return True
-            >>>
-            >>> model = ct.convert(source_model,
-            >>>                    compute_precision=ct.transform.FP16ComputePrecision(op_selector=skip_real_div_ops),
-            >>>                    minimum_deployment_target=ct.target.iOS15
-            >>>                    )
+            - For example, you can customize the float 16 precision transform to prevent
+              casting all the ``real_div`` ops in the program to float 16
+              precision:
+              
+              .. sourcecode:: python
+
+                  def skip_real_div_ops(op):
+                       if op.op_type == "real_div":
+                           return False
+                       return True
+                  
+                  model = ct.convert(source_model,
+                                     compute_precision=ct.transform.FP16ComputePrecision(op_selector=skip_real_div_ops),
+                                     minimum_deployment_target=ct.target.iOS15
+                                    )
 
     skip_model_load : bool
-        Set to True to prevent coremltools from calling into the Core ML framework
+        Set to ``True`` to prevent coremltools from calling into the Core ML framework
         to compile and load the model, post-conversion. In that case, the returned
         model object cannot be used to make a prediction, but can be used to save
-        via ``"model.save()"``. This flag may be used to convert to a newer model type
-        on an older Mac, which if done without turning this flag on, may raise a
-        runtime warning.
+        with ``model.save()``. This flag may be used to convert to a newer model type
+        on an older Mac, which may raise a runtime warning if done without
+        turning this flag on.
         
-        Example: Use this flag to suppress runtime warning when converting to
-        ML program model type on a macOS 11, since ML program
-        can only be compiled and loaded from macOS12+.
+        Example: Use this flag to suppress a runtime warning when converting to an
+        ML program model on macOS 11, since an ML program can only be compiled and
+        loaded from macOS12+.
         
-        Defaults to False.
+        Defaults to ``False``.
 
     compute_units: coremltools.ComputeUnit
     
-        An enum with three possible values.
+        An enum with the following possible values.
         
             - ``coremltools.ComputeUnit.ALL``: Use all compute units available, including the
               neural engine.
@@ -331,21 +348,26 @@ def convert(
     package_dir : str
         Post conversion, the model is saved at a temporary location and
         loaded to form the MLModel object ready for prediction.
-        If package_dir is provided, model will be saved at this location instead of creating a temporary directory.
-        - if not None, must be a path to a directory with extension .mlpackage
+        
+        * If ``package_dir`` is provided, the model will be saved at this location
+          rather than creating a temporary directory.
+        * If not ``None``, this must be a path to a directory with the extension
+          ``.mlpackage``.
 
     debug : bool
-        This flag should generally be False except for debugging purposes
-        Setting this flag to True:
-         - For Torch conversion, it will print the list of supported and unsupported ops
-           found in the model if conversion fails due to an unsupported op.
-         - For Tensorflow conversion, it will cause to display extra logging and visualizations
+        This flag should generally be ``False`` except for debugging purposes.
+        Setting this flag to ``True`` produces the following behavior:
+          - For Torch conversion, it will print the list of supported and
+            unsupported ops found in the model if conversion fails due to an
+            unsupported op.
+          - For TensorFlow conversion, it will display extra logging
+            and visualizations.
 
     Returns
     -------
     
     model : ``coremltools.models.MLModel`` or ``coremltools.converters.mil.Program``
-        A Core ML MLModel object or MIL Program object (see ``convert_to``).
+        A Core ML MLModel object or MIL program object (see ``convert_to``).
 
     Examples
     --------
@@ -363,7 +385,7 @@ def convert(
         >>> results = mlmodel.predict({"input": test_input})
         >>> print(results['output'])
 
-    TensorFlow 2 (``model`` is tf.Keras model path):
+    TensorFlow 2 (``model`` is a tf.Keras model path):
 
         >>> x = tf.keras.Input(shape=(32,), name='input')
         >>> y = tf.keras.layers.Dense(16, activation='softmax')(x)
@@ -388,7 +410,7 @@ def convert(
         >>> results = mlmodel.predict({"input": example_input.numpy()})
         >>> print(results['1651']) # 1651 is the node name given by PyTorch's JIT
 
-    See `neural-network-conversion <https://coremltools.readme.io/docs/neural-network-conversion>`_ for
+    See `Conversion Options <https://coremltools.readme.io/docs/neural-network-conversion>`_ for
     more advanced options.
     """
     _check_deployment_target(minimum_deployment_target)
diff --git a/coremltools/converters/mil/input_types.py b/coremltools/converters/mil/input_types.py
index c26a767af..6e1022454 100644
--- a/coremltools/converters/mil/input_types.py
+++ b/coremltools/converters/mil/input_types.py
@@ -89,7 +89,7 @@ def __init__(
         channel_first=None,
     ):
         """
-        Configuration class used for image inputs in CoreML.
+        Configuration class used for image inputs in Core ML.
 
         Parameters
         ----------
@@ -97,32 +97,31 @@ def __init__(
             The scaling factor for all values in the image channels.
 
         bias: float or list of float
-            If ``color_layout`` is ``'G'``, bias would be a ``float``.
+            * If ``color_layout`` is ``ct.colorlayout.GRAYSCALE`` or
+              ``ct.colorlayout.GRAYSCALE_FLOAT16``, bias would be a ``float``.
+            * If ``color_layout`` is ``ct.colorlayout.RGB`` or ``ct.colorlayout.BGR``,
+              bias would be a list of ``float``.
 
-            If `color_layout` is ``'RGB'`` or ``'BGR'``, bias would be a list of ``float``.
-
-        color_layout: string or of type ct.colorlayout enumeration
-            Color layout of the image.
-
-            Valid values:
-            enumeration (recommended):
-                * ct.colorlayout.RGB
-                * ct.colorlayout.BGR
-                * ct.colorlayout.GRAYSCALE
-                * ct.colorlayout.GRAYSCALE_FLOAT16
+        color_layout: string or enumeration of type ``ct.colorlayout``
+            Color layout of the image. Valid values are as follows:
+            
+            Enumeration (recommended):
+                * ``ct.colorlayout.RGB``
+                * ``ct.colorlayout.BGR``
+                * ``ct.colorlayout.GRAYSCALE``
+                * ``ct.colorlayout.GRAYSCALE_FLOAT16``
 
-            string values (older way to specify):
-                * ``'G'``: Grayscale (maps to ct.colorlayout.GRAYSCALE)
-                * ``'RGB'``: [Red, Green, Blue] (maps to ct.colorlayout.BGR)
-                * ``'BGR'``: [Blue, Green, Red] (maps to ct.colorlayout.RGB)
+            String values (older way to specify):
+                * ``'G'``: Grayscale (maps to ``ct.colorlayout.GRAYSCALE``)
+                * ``'RGB'``: [Red, Green, Blue] (maps to ``ct.colorlayout.RGB``)
+                * ``'BGR'``: [Blue, Green, Red] (maps to ``ct.colorlayout.BGR``)
 
         channel_first: (bool) or None
             Set to ``True`` if input format is channel first.
 
             Default format:
-                For TensorFlow: channel last (``channel_first=False``).
-
-                For PyTorch: channel first (``channel_first=True``).
+                * For TensorFlow: channel last (``channel_first=False``).
+                * For PyTorch: channel first (``channel_first=True``).
         """
         super(ImageType, self).__init__(name, shape)
         self.scale = scale
diff --git a/docs/source/coremltools.models.ml_program.rst b/docs/source/coremltools.models.ml_program.rst
new file mode 100644
index 000000000..bf3deda3f
--- /dev/null
+++ b/docs/source/coremltools.models.ml_program.rst
@@ -0,0 +1,21 @@
+
+affine_quantize_weights
+----------------------------------------------------------------
+
+.. automodule:: coremltools.models.ml_program.compression_utils
+
+   .. autofunction:: affine_quantize_weights
+
+palettize_weights
+----------------------------------------------------------------
+
+.. automodule:: coremltools.models.ml_program.compression_utils
+
+   .. autofunction:: palettize_weights
+
+sparsify_weights
+----------------------------------------------------------------
+
+.. automodule:: coremltools.models.ml_program.compression_utils
+
+   .. autofunction:: sparsify_weights
diff --git a/docs/source/coremltools.models.rst b/docs/source/coremltools.models.rst
index e08c095ed..16a92dc75 100644
--- a/docs/source/coremltools.models.rst
+++ b/docs/source/coremltools.models.rst
@@ -7,11 +7,13 @@ MLModel
 .. automodule:: coremltools.models.model
    :members:
 
-compression\_utils 
----------------------------------------------------
+compression\_utils
+-------------------------------------------------
 
-.. automodule:: coremltools.models.ml_program.compression_utils
-   :members:
+.. toctree::
+   :maxdepth: 1
+   
+   coremltools.models.ml_program
 
 
 array\_feature\_extractor 

From 203b555e09963bb62d89730599ee9d9c9da394ce Mon Sep 17 00:00:00 2001
From: Arjun Sharda <77706434+ajsharda17@users.noreply.github.com>
Date: Mon, 13 Jun 2022 12:33:45 -0500
Subject: [PATCH 26/54] Update CONTRIBUTING.md (#1521)

---
 CONTRIBUTING.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 31d5e6911..2ff1ec285 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -2,7 +2,7 @@
 Contribution Guidelines
 =======================
 
-The Core ML `.mlmodel` file format is a publicly documented specification. The Core ML Tools source code is 100% open source under the [BSD license](https://github.com/apple/coremltools/blob/master/LICENSE.txt). We welcome all contributions and ideas to grow the product. We ask that you follow the [contributing guidelines and code of conduct](#guidelines-and-code-of-conduct)  in this document, which are typical of open source communities.
+The Core ML `.mlmodel` file format is a publicly documented specification. The Core ML Tools source code is 100% open source under the [BSD-3 Clause license](https://github.com/apple/coremltools/blob/master/LICENSE.txt). We welcome all contributions and ideas to grow the product. We ask that you follow the [contributing guidelines and code of conduct](#guidelines-and-code-of-conduct), which are typical of open source communities.
 
 There are many ways to contribute to coremltools. [**Use these templates**](https://github.com/apple/coremltools/issues/new/choose) to report issues, make feature requests, or ask questions. We welcome even minor improvements to code, testing, and documentation, as well as requests for new features and enhancements. Don’t hesitate to do the following:
 
@@ -25,7 +25,7 @@ To ensure that issues and pull requests can be addressed quickly, please do the
 
 * Check [open issues](https://github.com/apple/coremltools/issues) and [current pull requests](https://github.com/apple/coremltools/pulls) in the repository to see if your issue, feature request, or question already exists or has already been addressed.
 * Fill in the appropriate [template](https://github.com/apple/coremltools/issues/new/choose) with as much detail as possible as well as code snippets, so that we are able to reproduce the issue.
-* Promptly reply to any requests or questions posed by others within the community on your issue or pull request.
+* Promptly reply to any requests or questions asked by others within the community on your issue or pull request.
 
 ## Resources
 

From 01983e652a1fc75ed068dcfecde49b967faf0779 Mon Sep 17 00:00:00 2001
From: fukatani <nannyakannya@gmail.com>
Date: Tue, 14 Jun 2022 03:23:15 +0900
Subject: [PATCH 27/54] Add torch AdaptiveAvgPool2d test. (#1502)

Co-authored-by: Toby Roseman <troseman@apple.com>
---
 .../mil/frontend/torch/test/test_torch_ops.py | 34 +++++++++++++++++--
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
index 11535f371..7b68fedf0 100644
--- a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
+++ b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
@@ -1451,7 +1451,7 @@ class TestAdaptiveMaxPool(TorchBaseTest):
     @pytest.mark.parametrize(
         "output_size, magnification, delta, depth, backend",
         itertools.product(
-            [(1,1), (3,2)],
+            [(1, 1), (3, 2)],
             [1, 2, 7],
             [0, 11],
             [1, 2, 3],
@@ -1466,15 +1466,43 @@ def test_adaptive_max_pool2d(
         # since coremltools reproduces PyTorch's kernel sizes and
         # offsets for adaptive pooling layers only when input_size is
         # a multiple of output_size, we expect failures otherwise
-        if not (input_size[0] % output_size[0]  == 0 and input_size[1] % output_size[1] == 0):
+        if not (input_size[0] % output_size[0] == 0 and input_size[1] % output_size[1] == 0):
             pytest.xfail("Test should fail because input_size is not a multiple of output_size")
         n = 1
-        in_shape = (n,depth) + input_size
+        in_shape = (n, depth) + input_size
         model = nn.AdaptiveMaxPool2d(
             output_size
         )
         self.run_compare_torch(in_shape, model, backend=backend)
 
+class TestAdaptiveAvgPool(TorchBaseTest):
+    @pytest.mark.parametrize(
+        "output_size, magnification, delta, depth, backend",
+        itertools.product(
+            [(1, 1), (3, 2)],
+            [1, 2, 7],
+            [0, 11],
+            [1, 2, 3],
+            backends,
+        ),
+    )
+    def test_adaptive_avg_pool2d(
+            self, output_size, magnification, delta, depth, backend
+    ):
+        # input_size = output_size * magnification + delta
+        input_size = (delta + magnification * output_size[0], delta + magnification * output_size[1])
+        # since coremltools reproduces PyTorch's kernel sizes and
+        # offsets for adaptive pooling layers only when input_size is
+        # a multiple of output_size, we expect failures otherwise
+        if not (input_size[0] % output_size[0] == 0 and input_size[1] % output_size[1] == 0):
+            pytest.xfail("Test should fail because input_size is not a multiple of output_size")
+        n = 1
+        in_shape = (n, depth) + input_size
+        model = nn.AdaptiveAvgPool2d(
+            output_size
+        )
+        self.run_compare_torch(in_shape, model, backend=backend)
+
 class TestMaxPool(TorchBaseTest):
 
     @pytest.mark.parametrize(

From f1819954ccd5d481093740e7dba9b7cfa42e5f30 Mon Sep 17 00:00:00 2001
From: Arjun Sharda <77706434+ajsharda17@users.noreply.github.com>
Date: Mon, 13 Jun 2022 14:14:36 -0500
Subject: [PATCH 28/54] Update BUILDING.md (#1523)

Co-authored-by: Toby Roseman <troseman@apple.com>
---
 BUILDING.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/BUILDING.md b/BUILDING.md
index f4bd213f2..3c4aeaccf 100644
--- a/BUILDING.md
+++ b/BUILDING.md
@@ -1,7 +1,7 @@
 Building from Source
 ====================
 
-This page describes building Core ML Tools (coremltools) from the source repository.
+This page describes how to build Core ML Tools (coremltools) from the source repository.
 
 ## Requirements
 
@@ -24,7 +24,7 @@ Follow these steps:
 	
 3. Run the [test.sh](scripts/test.sh) script to test the build.
 
-**Under the hood**: If an Anaconda or Miniconda environment doesn't already exists or is not up-to-date, the `build.sh` script automatically runs the [`env_create.sh`](scripts/env_create.sh) script to create the environment. It then uses [`env_activate.sh`](scripts/env_activate.sh) to activate the environment and set up the appropriate version of Python. The new environment is located at `<repo root>/coremltoos/envs` and is named after the `py` parameter. For example, a development environment with py 3.7 is named `coremltools-dev-py37`.
+**Under the hood**: If an Anaconda or Miniconda environment doesn't already exist or is not up-to-date, the `build.sh` script automatically runs the [`env_create.sh`](scripts/env_create.sh) script to create the environment. It then uses [`env_activate.sh`](scripts/env_activate.sh) to activate the environment and set up the appropriate version of Python. The new environment is located at `<repo root>/coremltools/envs` and is named after the `py` parameter. For example, a development environment with py 3.7 is named `coremltools-dev-py37`.
 
 
 ## Build targets
@@ -53,8 +53,8 @@ For more information, see the following:
 
 * Core ML Tools [README](README.md) file for this repository
 * [Release Notes](https://github.com/apple/coremltools/releases/) for the current release and previous releases
-* [Guides and examples](https://coremltools.readme.io/) with installation and troubleshooting
+* [Guides and examples](https://coremltools.readme.io/) with installation and troubleshooting help
 * [API Reference](https://apple.github.io/coremltools/index.html)
 * [Core ML Specification](https://apple.github.io/coremltools/mlmodel/index.html)
-* [Contribution Guidelines](CONTRIBUTING.md) for reporting issues and making requests
+* [Contribution Guidelines](CONTRIBUTING.md) for reporting issues and making pull requests
 

From 9715d0782b4f6d08c5125b4b9f1f7270fa952042 Mon Sep 17 00:00:00 2001
From: Arjun Sharda <77706434+ajsharda17@users.noreply.github.com>
Date: Mon, 13 Jun 2022 14:44:53 -0500
Subject: [PATCH 29/54] Update ---feature-request.md (change of wording mostly)
 (#1524)

* Update ---feature-request.md

* Update ---feature-request.md
---
 .github/ISSUE_TEMPLATE/---feature-request.md | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/---feature-request.md b/.github/ISSUE_TEMPLATE/---feature-request.md
index ba27df4b4..4db066cc7 100644
--- a/.github/ISSUE_TEMPLATE/---feature-request.md
+++ b/.github/ISSUE_TEMPLATE/---feature-request.md
@@ -8,16 +8,15 @@ assignees: ''
 ---
 
 ## 🌱 Describe your Feature Request
-- A clear and concise description of the feature request.
+- Please provide a clear and concise description of your feature request.
 - If this is a feature request for the Core ML Framework or Xcode, please instead submit your feature request using the Feedback Assistant for Developers:
 https://developer.apple.com/bug-reporting/
 
-## Use cases
-- Please describe the use cases.
-- Please provide examples.
+## How can this feature be used?
+Please provide some examples where this feature can be used.
 
 ## Describe alternatives you've considered
-A clear and concise description of any alternative solutions or features you've considered.
+Tell us some alternatives that you have considered instead of this feature.
 
 ## Additional context
-Add any other context or screenshots about the feature request here.
+Do you have anything else to say?

From d13735d0764ebfb3723fd7cad439ce8aee86a841 Mon Sep 17 00:00:00 2001
From: fukatani <nannyakannya@gmail.com>
Date: Fri, 17 Jun 2022 03:39:41 +0900
Subject: [PATCH 30/54] Torch eq and ne ops supports bool type. (#1501)

* Torch eq and ne ops supports bool type.

* Addressed review comment
---
 .../converters/mil/frontend/torch/ops.py      | 16 +++++++++--
 .../mil/frontend/torch/test/test_torch_ops.py | 28 +++++++++++++++++++
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index a1f91ac7d..5e28148d3 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -472,14 +472,26 @@ def listconstruct(context, node):
 @register_torch_op
 def eq(context, node):
     inputs = _get_inputs(context, node, expected=2)
-    equal_to = mb.equal(x=inputs[0], y=inputs[1], name=node.name)
+    x = inputs[0]
+    y = inputs[1]
+    if is_bool(x.dtype):
+        x = mb.cast(x=x, dtype='int32')
+    if is_bool(y.dtype):
+        y = mb.cast(x=y, dtype='int32')
+    equal_to = mb.equal(x=x, y=y, name=node.name)
     context.add(equal_to)
 
 
 @register_torch_op
 def ne(context, node):
     inputs = _get_inputs(context, node, expected=2)
-    equal_to = mb.not_equal(x=inputs[0], y=inputs[1], name=node.name)
+    x = inputs[0]
+    y = inputs[1]
+    if is_bool(x.dtype):
+        x = mb.cast(x=x, dtype='int32')
+    if is_bool(y.dtype):
+        y = mb.cast(x=y, dtype='int32')
+    equal_to = mb.not_equal(x=x, y=y, name=node.name)
     context.add(equal_to)
 
 
diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
index 7b68fedf0..95e6690b6 100644
--- a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
+++ b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
@@ -3370,6 +3370,34 @@ def forward(self, x):
         )
 
 
+class TestBitWiseLogical(TorchBaseTest):
+    @pytest.mark.parametrize(
+        "backend, x_y, op_string",
+        itertools.product(
+            backends,
+            [
+                ([True, False, True, False], [True, True, False, False]),
+                ([[True, False], [True, False]], [[True, True], [False, False]]),
+                ([[True, False], [True, False]], [[1, 0], [2, 1]]),
+                ([-1.5, 0.0, 1.0, 0.0], [0.1, 2.5, 0.0, 0.0]),
+                ([2, 0, -1, 0, 5], [1, 1, 0, 0, -5]),
+            ],
+            [
+                "eq",
+                "ne",
+            ],
+        ),
+    )
+    def test_bitwise_logical(self, backend, x_y, op_string):
+        if not contains_op(torch, op_string):
+            return
+        op_func = getattr(torch, op_string)
+        model = ModuleWrapper(function=op_func)
+        x = torch.tensor(x_y[0])
+        y = torch.tensor(x_y[1])
+        self.run_compare_torch([x, y], model, backend=backend, input_as_shape=False)
+
+
 class TestLogicalAnd(TorchBaseTest):
     @pytest.mark.parametrize(
         "backend, x_y",

From 5d842ecb37df311c0b884949ab3aa62c3ac2685a Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Tue, 5 Jul 2022 15:42:59 +0200
Subject: [PATCH 31/54] accept incoming changes

---
 .../converters/mil/frontend/torch/ops.py      | 55 -------------------
 1 file changed, 55 deletions(-)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 750cca489..4bb3feff1 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -2379,7 +2379,6 @@ def upsample_linear1d(context, node):
     output_size = inputs[1]
     align_corners = bool(inputs[2].val)
     scale = inputs[3]
-<<<<<<< HEAD
 
     scale_factor = None
 
@@ -2432,60 +2431,6 @@ def upsample_linear1d(context, node):
     x = mb.squeeze(x=x, axes=[3], name=node.name)
     context.add(x)
 
-=======
-
-    scale_factor = None
-
-    if scale is not None and scale.val is not None and scale.shape == (1,):
-        # Get the scale factor from provided inputs
-        # This happens when recompute_scale_factor = False
-        scale_factor = scale.val[0]
-
-        # Currently, we are not supporting recompute_scale_factor = False, align_corners = False with float output size
-        _, _, h = x.shape
-        if not is_symbolic(h):
-            # For the static input shape, we can compute the output size beforehand, and check if it is a float value
-            output_size = h * scale_factor
-            is_float = _is_float_value(output_size)
-        else:
-            # For the dynamic input shape, we check if the scale factor itself is float
-            is_float = _is_float_value(scale_factor)
-
-        if is_float and not align_corners:
-            msg = "recompute_scale_factor = False, align_corners = False with float output size is " + \
-                                            "not supported for the upsample op {}".format(node.name)
-            raise NotImplementedError(msg)
-
-    elif isinstance(output_size, list):
-        # When the input shape is dynamic and recompute_scale_factor = True,
-        # we need to trace the graph to find the scale factor.
-        x = mb.expand_dims(x=x, axes=[3])
-        x = mb.torch_upsample_bilinear(
-            x=x,
-            output_height=output_size[0],
-            output_width=1.,
-            align_corners=align_corners,
-        )
-        x = mb.squeeze(x=x, axes=[3], name=node.name)
-        context.add(x)
-        return
-
-    elif output_size.val is not None:
-        # Infer the scale factor from the provided output size
-        scale_factor = _get_scales_from_output_size(output_size, x.shape)
-
-    # Expand the input to a 4d tensor, and use MIL's upsample_bilinear op
-    x = mb.expand_dims(x=x, axes=[3])
-    x = mb.upsample_bilinear(
-        x=x,
-        scale_factor_height=scale_factor,
-        scale_factor_width=1.,
-        align_corners=align_corners,
-    )
-    x = mb.squeeze(x=x, axes=[3], name=node.name)
-    context.add(x)
-
->>>>>>> 12662ddcce4b486494b55d7d25ac0dac3835abb7
 @register_torch_op
 def upsample_bilinear2d(context, node):
     inputs = _get_inputs(context, node)

From 4353c4cdd6a73b5413992da79d67fa0a6dc212b9 Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Wed, 6 Jul 2022 14:11:56 +0200
Subject: [PATCH 32/54] Add tests for numel and narrow

---
 .../mil/frontend/torch/test/test_torch_ops.py | 69 +++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
index 95e6690b6..b89489414 100644
--- a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
+++ b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
@@ -4564,3 +4564,72 @@ def forward(self, x):
             backend=backend,
             converter_input_type=converter_input_type,
         )
+
+# class TestRoiAlign(TorchBaseTest):
+#     @pytest.mark.parametrize(
+#         "shapes, backend",
+#         itertools.product(
+#             [(1,), (1, 2)],
+#             backends
+#         ),
+#     )
+#     def test_roi_align(self, shapes, backend):
+#         class TestModel(nn.Module):
+#             def __init__(self):
+#                 super(TestModel, self).__init__()
+
+#             def forward(self, a):
+#                 return torch.roi_align
+#         self.run_compare_torch(shapes, TestModel().eval(), backend=backend)
+
+
+class TestNumel(TorchBaseTest):
+    @pytest.mark.parametrize(
+        "shapes, backend",
+        itertools.product(
+            [
+                [(2,1)],
+                [(5,1,4,1)],
+                [(1,)],
+            ],
+            backends
+        ),
+    )
+    def test_numel(self, shapes, backend):
+        class Model(nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                v = torch.numel(x)
+                return torch.tensor(v)
+
+        model = Model()
+        self.run_compare_torch(shapes, model, backend=backend)
+
+
+class TestNarrow(TorchBaseTest):
+    @pytest.mark.parametrize(
+        "shapes, dim_start_length, backend",
+        itertools.product(
+            [
+                [(3, 3)],
+            ],
+            [
+                (0, 0, 2)
+            ]
+            ,
+            backends
+        ),
+    )
+    def test_narrow(self, shapes, dim_start_length, backend):
+        dim, start, length = dim_start_length
+        class Model(nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                return torch.narrow(x, dim, start, length)
+
+        model = Model()
+        self.run_compare_torch(shapes, model, backend=backend)

From ed2f33e4d1749e31053f23bc1f4c929617c62c13 Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Wed, 6 Jul 2022 20:09:30 +0200
Subject: [PATCH 33/54] Add tests for torch.op.nms

---
 .../converters/mil/frontend/torch/ops.py      |  4 +--
 .../mil/frontend/torch/test/test_torch_ops.py | 31 +++++++++++++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 4bb3feff1..1927a32d5 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -4618,7 +4618,7 @@ def nms(context, node):
     inputs = _get_inputs(context, node)
     boxes = inputs[0]
 
-    num_boxes = boxes.shape[1]
+    num_boxes = boxes.shape[0]
     max_boxes = num_boxes  # we set the max_boxes just to be # input boxes
 
     scores = inputs[1]
@@ -4656,7 +4656,7 @@ def repeat_interleave(context, node):
     perm = [] + [axis for axis in range(x.rank) if axis not in []]
 
     x = mb.transpose(x=x, perm=perm)  # torch.transpose(x, 0, 1)
-    x = mb.tile(x=x, reps=reps.val[0], name=node.name)  # torch.repeat(x, size)
+    x = mb.tile(x=x, reps=reps, name=node.name)  # torch.repeat(x, size)
     x = mb.reshape(x=x, shape=(-1, x.shape[0]))  # x.view(-1, 2)
     x = mb.transpose(x=x, perm=(-1, 0))  # torch.transpose(x, 0, 1)
     dims = list(x.shape)
diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
index b89489414..ebead3ef9 100644
--- a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
+++ b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
@@ -10,6 +10,8 @@
 import pytest
 import torch.nn as nn
 
+import torchvision
+
 from .testing_utils import (
     contains_op,
     generate_input_data,
@@ -4633,3 +4635,32 @@ def forward(self, x):
 
         model = Model()
         self.run_compare_torch(shapes, model, backend=backend)
+
+
+class TestNonMaximalSuppression(TorchBaseTest):
+    @pytest.mark.parametrize(
+        "shapes, scores, iou_threshold, backend",
+        itertools.product(
+            [
+                [(2, 4)],
+            ],
+            [
+                (2,)
+            ],
+            [1]
+            ,
+            backends
+        ),
+    )
+    def test_non_maximal_supression(self, shapes, scores, iou_threshold, backend):
+        # scores = torch.rand((2,))
+        scores = torch.rand(scores)
+        class Model(nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                return torchvision.ops.nms(x, scores, iou_threshold=0.7)
+
+        model = Model()
+        self.run_compare_torch(shapes, model, backend=backend)

From bf5de6bfe6963d78fba68f471dc83be84f3d8c6e Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Wed, 6 Jul 2022 20:13:07 +0200
Subject: [PATCH 34/54] tidy up

---
 .../mil/frontend/torch/test/test_torch_ops.py     | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
index ebead3ef9..0ed026166 100644
--- a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
+++ b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
@@ -4639,21 +4639,14 @@ def forward(self, x):
 
 class TestNonMaximalSuppression(TorchBaseTest):
     @pytest.mark.parametrize(
-        "shapes, scores, iou_threshold, backend",
+        "shapes, scores, backend",
         itertools.product(
-            [
-                [(2, 4)],
-            ],
-            [
-                (2,)
-            ],
-            [1]
-            ,
+            [[(2, 4)]],
+            [(2,)],
             backends
         ),
     )
-    def test_non_maximal_supression(self, shapes, scores, iou_threshold, backend):
-        # scores = torch.rand((2,))
+    def test_non_maximal_supression(self, shapes, scores, backend):
         scores = torch.rand(scores)
         class Model(nn.Module):
             def __init__(self):

From b2e8153cf024ede722bd56fb807c0bac094f41be Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Sat, 3 Sep 2022 10:15:55 +0200
Subject: [PATCH 35/54] tidy up

---
 .../converters/mil/frontend/torch/ops.py      | 49 -------------------
 1 file changed, 49 deletions(-)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 1927a32d5..3a52cda73 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -4566,7 +4566,6 @@ def roi_align(context, node):
         raise ValueError(
             '"CropResize" op: expected input rank 4, got {}'.format(x.rank)
         )
-    Hin, Win = input_shape[1:3]
 
     const_box_info = True
     if context[node.inputs[1]].val is None or context[node.inputs[2]].val is None:
@@ -4645,54 +4644,6 @@ def nms(context, node):
         x = mb.squeeze(x=x, axes=[0], name=node.name)
     context.add(x, torch_name=node.name)
 
-@register_torch_op
-def repeat_interleave(context, node):
-    inputs = _get_inputs(context, node)
-
-    x = inputs[0]
-    reps = inputs[1]
-    dim = inputs[2] if inputs[2] else 0
-
-    perm = [] + [axis for axis in range(x.rank) if axis not in []]
-
-    x = mb.transpose(x=x, perm=perm)  # torch.transpose(x, 0, 1)
-    x = mb.tile(x=x, reps=reps, name=node.name)  # torch.repeat(x, size)
-    x = mb.reshape(x=x, shape=(-1, x.shape[0]))  # x.view(-1, 2)
-    x = mb.transpose(x=x, perm=(-1, 0))  # torch.transpose(x, 0, 1)
-    dims = list(x.shape)
-
-    # Implementation of flatten
-    total = 1
-    start_val = dim
-    end_val = -1
-    start = len(dims) + start_val if start_val < 0 else start_val
-    end = len(dims) + end_val if end_val < 0 else end_val
-
-    if start > len(dims) or end > len(dims) or start < 0 or end < 0:
-        raise ValueError(
-            "Invalid start and end. (start, end) == ({}, {})".format(start, end_val)
-        )
-    if start > end:
-        raise ValueError(
-            "Start must be before end. (start, end) == ({}, {})".format(start, end_val)
-        )
-    x_shape = mb.shape(x=x)
-
-    shape1 = mb.slice_by_index(x=x_shape, begin=[0], end=[start])
-    shape2 = mb.slice_by_index(x=x_shape, begin=[end + 1], end=[len(dims)])
-
-    flatten_dim = -1
-    if not any_symbolic(x.shape):
-        flatten_dim = 1
-        for dim in dims[start: end + 1]:
-            flatten_dim *= dim
-
-    shape = mb.concat(values=(shape1, [flatten_dim], shape2), axis=0)
-    shape = mb.cast(x=shape, dtype="int32")
-    reshape = mb.reshape(x=x, shape=shape, name=node.name)
-
-    context.add(reshape, node.name)
-
 @register_torch_op
 def narrow(context, node):
     data, dim, start, length = _get_inputs(context, node, expected=4)

From 20da0e2d2275fafcfed94ed85ecee41616ad9d9b Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Tue, 7 Jun 2022 12:04:26 +0200
Subject: [PATCH 36/54] handle-split-op-when-num-splits-1

---
 .../converters/mil/frontend/torch/ops.py      | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index dba015ff1..e0e652577 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -3386,6 +3386,18 @@ def _slice(context, node):
     context.add(res)
 
 
+def _num_splits_and_sizes(split_sizes):
+    if split_sizes.sym_val is not None:
+        return len(split_sizes.sym_val), split_sizes.sym_val
+
+    if any_symbolic(split_sizes.shape):
+        raise ValueError("Unable to determine number of splits")
+
+    num_splits = len(split_sizes.shape)
+    sizes = [get_new_symbol() for _ in range(num_splits)]
+    return num_splits, sizes
+
+
 @register_torch_op(torch_alias=["split_with_sizes"])
 def split(context, node):
     inputs = _get_inputs(context, node, expected=3)
@@ -3413,6 +3425,14 @@ def split(context, node):
         else:
             partial_size = mb.mul(x=tmp, y=remainder)
             split_sizes = mb.concat(values=[whole_sizes, partial_size], axis=0)
+    
+
+    num_splits, sizes = _num_splits_and_sizes(split_sizes=split_sizes)
+    if num_splits == 1:
+        out = mb.identity(x=x, name=node.name)
+        context.add(out, node.name)
+        return
+
     res = mb.split(x=x, split_sizes=split_sizes, axis=dim, name=node.name)
     context.add(res, torch_name=node.name)
 

From ca4cd926646e5aface50a09db3620552be46cbfd Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Tue, 7 Jun 2022 12:08:20 +0200
Subject: [PATCH 37/54] handle when unpacked tuple contains only single value

---
 coremltools/converters/mil/frontend/torch/ops.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index e0e652577..d1874cca0 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -2586,6 +2586,10 @@ def upsample_nearest2d(context, node):
 def tupleunpack(context, node):
     inputs = _get_inputs(context, node, expected=1)
     values = inputs[0]
+
+    if len(node.outputs) == 1:
+        values = [values]
+
     # Node input could have been turned into constant array in @tupleconstruct
     if not isinstance(values, tuple) and not isinstance(values, list):
         values = values.val

From c80a3a78b4107ef90cb20f0c275a63bcc0b1f5bd Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Tue, 7 Jun 2022 12:28:23 +0200
Subject: [PATCH 38/54] handle broadcasting indices for torch index op

---
 coremltools/converters/mil/frontend/torch/ops.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index d1874cca0..93c50d1e6 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -3089,8 +3089,11 @@ def index(context, node):
     # For multiple index axes case, we now assume that all the index have equal shape
     for index in valid_indices:
         if not is_compatible_symbolic_vector(index.shape, valid_indices[0].shape):
-            raise NotImplementedError("Broadcasable tensor index not supported.")
-
+            broadcast_inputs = _broadcast_tensors([valid_indices[0], index])
+            index = broadcast_inputs[1]
+            valid_indices[0] = broadcast_inputs[0]
+            valid_indices.append(index)
+    
     # First stack the index together
     indices_rank = valid_indices[0].rank
     indices = mb.stack(values=valid_indices, axis=indices_rank)

From 8631d1ba03c6f3a3b13745816d963e0322a60066 Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Tue, 7 Jun 2022 12:29:53 +0200
Subject: [PATCH 39/54] patch torch clamp op to handle int dtype

---
 coremltools/converters/mil/frontend/torch/ops.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 93c50d1e6..1e2bcd240 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -3939,8 +3939,20 @@ def ceil(context, node):
 @register_torch_op
 def clamp(context, node):
     inputs = _get_inputs(context, node, expected=3)
-    min_val = inputs[1] if inputs[1] else _np.finfo(_np.float32).min
-    max_val = inputs[2] if inputs[2] else _np.finfo(_np.float32).max
+    if not inputs[1]:
+        min_val = _np.finfo(_np.float32).min
+    else:
+        min_val = inputs[1]
+        if types.builtin_to_string(min_val.dtype).startswith('int'):
+            min_val = mb.cast(x=min_val, dtype='fp32')
+
+    if not inputs[2]:
+        max_val = _np.finfo(_np.float32).max
+    else:
+        max_val = inputs[2]
+        if types.builtin_to_string(max_val.dtype).startswith('int'):
+            max_val = mb.cast(x=max_val, dtype='fp32')
+
     context.add(mb.clip(x=inputs[0], alpha=min_val, beta=max_val, name=node.name))
 
 @register_torch_op

From 2f05538e7ebe79acf524c2a7d0d6cb591f8fb5c9 Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Tue, 7 Jun 2022 13:06:34 +0200
Subject: [PATCH 40/54] return copy of input tensor if no dtype is given

---
 coremltools/converters/mil/frontend/torch/ops.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 1e2bcd240..85887c1a2 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -3497,6 +3497,13 @@ def to(context, node):
             "Received invalid arguments for PyTorch conversion of op {}".format(node)
         )
 
+    # We have to handle the case where the dtype is not set, this should be inferred from the Tensor dtype
+    # see, https://pytorch.org/docs/stable/generated/torch.Tensor.to.html?highlight=#torch.Tensor.to
+    if dtype is None:
+        out = mb.identity(x=_input, name=node.name)
+        context.add(out, node.name)
+        return = 0 # TODO: infer from Tensor (spoiler in this case we care about its f32 => 6)
+
     torch_dtype = NUM_TO_TORCH_DTYPE[dtype]
     if isinstance(_input, Var) and _input.val is not None:
         _input = _input.val

From ed02c4d87f4c7822af5b89b89b49068682763552 Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Wed, 8 Jun 2022 11:39:19 +0200
Subject: [PATCH 41/54] remove accidental typo

---
 coremltools/converters/mil/frontend/torch/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 85887c1a2..7054e5f27 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -3502,7 +3502,7 @@ def to(context, node):
     if dtype is None:
         out = mb.identity(x=_input, name=node.name)
         context.add(out, node.name)
-        return = 0 # TODO: infer from Tensor (spoiler in this case we care about its f32 => 6)
+        return
 
     torch_dtype = NUM_TO_TORCH_DTYPE[dtype]
     if isinstance(_input, Var) and _input.val is not None:

From ec550ca1aafbfa3af43f2f04cd9c8bb8783cee77 Mon Sep 17 00:00:00 2001
From: Tony Bove <70229264+tonybove-apple@users.noreply.github.com>
Date: Wed, 8 Jun 2022 11:36:36 -1000
Subject: [PATCH 42/54] Docs for v6 with layer_norm fix (#1514)

---
 coremltools/converters/mil/mil/ops/defs/normalization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/coremltools/converters/mil/mil/ops/defs/normalization.py b/coremltools/converters/mil/mil/ops/defs/normalization.py
index de6174580..7c7a46d5e 100644
--- a/coremltools/converters/mil/mil/ops/defs/normalization.py
+++ b/coremltools/converters/mil/mil/ops/defs/normalization.py
@@ -229,13 +229,13 @@ class layer_norm(Operation):
         * Dimensions to perform layer normalization.
         * Default is ``None`` (all dimensions).
 
-    gamma: const tensor<[K], T> (Optional)
+    gamma: const tensor<\*?, T>, T> (Optional)
         * if provided, the shape must be be ``x.shape[axes]``. For instance, if
           input ``x`` with shape ``(3,4,5,6)`` and ``axes = [2,3]``, gamma must have
           shape ``(5,6)``.
         * Default is all ones.
 
-    beta: const tensor<[K], T> (Optional)
+    beta: const tensor<\*?, T>, T> (Optional)
         * Same shape as gamma.
         * Default is all zeros.
 

From 78ab5fd16d7b313ee60baf612c83bfb4997e17fe Mon Sep 17 00:00:00 2001
From: Arjun Sharda <77706434+ajsharda17@users.noreply.github.com>
Date: Wed, 8 Jun 2022 16:53:11 -0500
Subject: [PATCH 43/54] Update ---bug-report.md (#1513)

---
 .github/ISSUE_TEMPLATE/---bug-report.md | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/---bug-report.md b/.github/ISSUE_TEMPLATE/---bug-report.md
index 7ca71fdf8..e0c2ba750 100644
--- a/.github/ISSUE_TEMPLATE/---bug-report.md
+++ b/.github/ISSUE_TEMPLATE/---bug-report.md
@@ -7,10 +7,9 @@ assignees: ''
 
 ---
 
-## 🐞Describe the bug
-
-- Only create an issue here for bugs in the coremltools Python package. If this is a bug with the Core ML Framework or Xcode, please submit your bug here: https://developer.apple.com/bug-reporting/
-- A clear and brief description of the bug.
+## 🐞Describing the bug
+- Make sure to only create an issue here for bugs in the coremltools Python package. If this is a bug with the Core ML Framework or Xcode, please submit your bug here: https://developer.apple.com/bug-reporting/
+- Provide a clear and consise description of the bug.
 
 ## Stack Trace
 - If applicable, please paste the complete stack trace.
@@ -20,13 +19,12 @@ assignees: ''
 ```
 # Paste Python code snippet here, complete with any required import statements.
 ```
-- If the model conversion succeeds, but there is a numerical mismatch in predictions, please include code used for comparisons.
+- If the model conversion succeeds, but there is a numerical mismatch in predictions, please include the code used for comparisons.
 
 ## System environment (please complete the following information):
  - coremltools version:
  - OS (e.g. MacOS version or Linux type):
- - Any other relevant version information:
-     - e.g. PyTorch or TensorFlow version
+ - Any other relevant version information (e.g. PyTorch or TensorFlow version):
 
 ## Additional context
-- Add any other context about the problem here.
+- Add anything else about the problem here that you want to share.

From f8e177684a7530447e228d6fe2ab344bdf7d2dca Mon Sep 17 00:00:00 2001
From: Henry Tao <55294647+jakesabathia2@users.noreply.github.com>
Date: Thu, 9 Jun 2022 08:54:37 -0700
Subject: [PATCH 44/54] Fix a bug when destructing coreml model (#1515)

Fix a bug when destructing coreml model (#1515)
---
 coremltools/models/model.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/coremltools/models/model.py b/coremltools/models/model.py
index b656ed23a..16ead447a 100644
--- a/coremltools/models/model.py
+++ b/coremltools/models/model.py
@@ -363,6 +363,8 @@ def __del__(self):
         # Cleanup temporary package upon destruction
         if hasattr(self, 'is_package') and self.is_package \
            and hasattr(self, 'is_temp_package') and self.is_temp_package:
+            if _shutil is None:
+                import shutil as _shutil
             _shutil.rmtree(self.package_path)
 
     @property

From d96b7d69f73ed07f1005099e831a2e8864de485e Mon Sep 17 00:00:00 2001
From: Tony Bove <70229264+tonybove-apple@users.noreply.github.com>
Date: Thu, 9 Jun 2022 13:11:20 -1000
Subject: [PATCH 45/54] Formatting fixes and compression submenu (#1518)

---
 coremltools/converters/_converters_entry.py   | 282 ++++++++++--------
 coremltools/converters/mil/input_types.py     |  39 ++-
 docs/source/coremltools.models.ml_program.rst |  21 ++
 docs/source/coremltools.models.rst            |  10 +-
 4 files changed, 198 insertions(+), 154 deletions(-)
 create mode 100644 docs/source/coremltools.models.ml_program.rst

diff --git a/coremltools/converters/_converters_entry.py b/coremltools/converters/_converters_entry.py
index 7277bc0bb..1169bc2d2 100644
--- a/coremltools/converters/_converters_entry.py
+++ b/coremltools/converters/_converters_entry.py
@@ -66,18 +66,17 @@ def convert(
     debug=False,
 ):
     """
-    Convert a TensorFlow or PyTorch model to the Core ML model format as either a neural network or an ML program. 
-    To learn about the differences, see
-    `ML Programs <https://coremltools.readme.io/docs/ml-programs>`_.
-    Some parameters and requirements differ by TensorFlow and PyTorch frameworks.
-
+    Convert a TensorFlow or PyTorch model to the Core ML model format as either
+    a neural network or an `ML program <https://coremltools.readme.io/docs/ml-programs>`_.
+    Some parameters and requirements differ for TensorFlow and PyTorch
+    conversions.
 
     Parameters
     ----------
-    
+
     model :
         TensorFlow 1, TensorFlow 2, or PyTorch model in one of the following
-        formats.
+        formats:
 
         * TensorFlow versions 1.x
         
@@ -107,79 +106,94 @@ def convert(
 
     inputs : list of ``TensorType`` or ``ImageType``
 
-        * If "dtype" is specified in ``TensorType`` / ``ImageType``,
-          it will be applied to the input of the converted model.
-
-          e.g.: The following code snippet will produce a CoreML model with Float16 typed inputs.
-          >>> import coremltools as ct
-          >>> mlmodel = ct.convert(keras_model,
-          >>>                      inputs=[ct.TensorType(dtype=np.float16)],
-          >>>                      minimum_deployment_target=ct.target.macOS13)
-
-          e.g.: The following code snippet will produce a CoreML model with Grayscale 16 bit input image type
-          >>> import coremltools as ct
-          >>> # H : image height, W: image width
-          >>> mlmodel = ct.convert(torch_model,
-          >>>                      inputs=[ct.ImageType(shape=(1, 1, H, W),
-          >>>                              color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)],
-          >>>                      minimum_deployment_target=ct.target.macOS13)
-
-        * TensorFlow 1 and 2 (including tf.keras)
-
+        * If you specify ``dtype`` with ``TensorType`` or ``ImageType``, it will
+          be applied to the input of the converted model. For example, the
+          following code snippet will produce a Core ML model with float 16 typed
+          inputs.
+          
+          .. sourcecode:: python
+
+              import coremltools as ct
+              mlmodel = ct.convert(keras_model,
+                                   inputs=[ct.TensorType(dtype=np.float16)],
+                                   minimum_deployment_target=ct.target.macOS13)
+
+        * The following code snippet will produce a Core ML model with the
+          ``GRAYSCALE_FLOAT16`` input image type:
+          
+          .. sourcecode:: python
+
+              import coremltools as ct
+              # H : image height, W: image width
+              mlmodel = ct.convert(torch_model,
+                               inputs=[ct.ImageType(shape=(1, 1, H, W),
+                                       color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)],
+                               minimum_deployment_target=ct.target.macOS13)
+
+        * TensorFlow 1 and 2 (including tf.keras):
             - The ``inputs`` parameter is optional. If not provided, the inputs
-              are placeholder nodes in the model (if the model is frozen graph)
+              are placeholder nodes in the model (if the model is a frozen graph)
               or function inputs (if the model is a ``tf.function``).
             - If ``inputs`` is provided, it must be a flat list.
             - The ``inputs`` must correspond to all or some of the placeholder nodes
               in the TF model.
-            - If ``name`` is specified in ``TensorType`` and ``ImageType``, it
+            - If ``name`` is specified with ``TensorType`` and ``ImageType``, it
               must correspond to a placeholder op in the TF graph. The input names
-              in the converted CoreML model can later be modifed using ``ct.utils.rename_feature`` API.
-            - If ``dtype`` is not specified, it defaults to the dtype of the inputs in the TF model.
-
-        * PyTorch
+              in the converted Core ML model can later be modifed using the
+              ``ct.utils.rename_feature`` API.
+            - If ``dtype`` is not specified, it defaults to the ``dtype`` of the
+              inputs in the TF model.
 
+        * PyTorch:
             - The ``inputs`` parameter is required.
-            - Number of elements in the ``inputs`` must match the number of inputs of the pytorch model.
+            - Number of elements in ``inputs`` must match the number of inputs
+              of the PyTorch model.
             - ``inputs`` may be a nested list or tuple.
             - ``TensorType`` and ``ImageType`` must have the ``shape`` specified.
-            - If ``name`` argument is specified in ``TensorType`` / ``ImageType``, the converted
-                CoreML model will have inputs with the same name.
-            - If ``dtype`` is missing, it defaults to float32
+            - If the ``name`` argument is specified with ``TensorType`` or
+              ``ImageType``, the converted Core ML model will have inputs with
+              the same name.
+            - If ``dtype`` is missing, it defaults to float 32.
 
     outputs : list of ``TensorType`` or ``ImageType`` (optional)
 
-        * If "dtype" is specified in ``TensorType`` / ``ImageType``,
-          it will be applied to the output of the converted model.
+        * If you specify ``dtype`` with ``TensorType`` or ``ImageType``,
+          it will be applied to the output of the converted model. For example,
+          to produce float 16 typed inputs and outputs:
+          
+          .. sourcecode:: python
 
-          e.g.: to produce float 16 typed inputs and outputs:
-          >>> import coremltools as ct
-          >>> mlmodel = ct.convert(keras_model,
-          >>>                      inputs=[ct.TensorType(dtype=np.float16)],
-          >>>                      outputs=[ct.TensorType(dtype=np.float16)],
-          >>>                      minimum_deployment_target=ct.target.macOS13)
+              import coremltools as ct
+              mlmodel = ct.convert(keras_model,
+                                   inputs=[ct.TensorType(dtype=np.float16)],
+                                   outputs=[ct.TensorType(dtype=np.float16)],
+                                   minimum_deployment_target=ct.target.macOS13)
 
-          e.g.: to produce Image inputs and outputs:
-          >>> import coremltools as ct
-          >>> # H: image height, W: image width
-          >>> mlmodel = ct.convert(torch_model,
-          >>>                      inputs=[ct.ImageType(shape=(1, 3, H, W), color_layout=ct.colorlayout.RGB)],
-          >>>                      outputs=[ct.ImageType(color_layout=ct.colorlayout.RGB)],
-          >>>                      minimum_deployment_target=ct.target.macOS13)
+        * To produce image inputs and outputs:
+          
+          .. sourcecode:: python
 
-        * TensorFlow 1 and 2 (including tf.keras)
+              import coremltools as ct
+              # H: image height, W: image width
+              mlmodel = ct.convert(torch_model,
+                                   inputs=[ct.ImageType(shape=(1, 3, H, W), color_layout=ct.colorlayout.RGB)],
+                                   outputs=[ct.ImageType(color_layout=ct.colorlayout.RGB)],
+                                   minimum_deployment_target=ct.target.macOS13)
 
-            - If ``outputs`` is not specified, the converter infers outputs from the
-              sink nodes in the graph.
-            - If specified, the ``name`` in ``TensorType`` / ``ImageType`` must correspond
-              to a node in the TF graph. In this case, the model will be converted up to
-              that node.
+        * TensorFlow 1 and 2 (including tf.keras):
 
-        * PyTorch
+            - If ``outputs`` is not specified, the converter infers outputs from 
+              the sink nodes in the graph.
+            - If specified, the ``name`` with ``TensorType`` or ``ImageType``
+              must correspond to a node in the TF graph. In this case, the model
+              will be converted up to that node.
 
-            - If specified, the length of the list must match the number of outputs returned by the
-              torch model
-            - If ``name`` is specified it is applied to the output names of the converted coreml model.
+        * PyTorch:
+
+            - If specified, the length of the list must match the number of
+              outputs returned by the PyTorch model.
+            - If ``name`` is specified, it is applied to the output names of the
+              converted Core ML model.
 
     classifier_config : ClassifierConfig class (optional)
         The configuration if the MLModel is intended to be a classifier.
@@ -187,8 +201,8 @@ def convert(
     minimum_deployment_target : coremltools.target enumeration (optional)
         A member of the ``coremltools.target`` enum.
         The value of this parameter determines the type of the model
-        representation produced by the converter. To learn about the differences between
-        neural networks and ML programs, see
+        representation produced by the converter. To learn about the differences
+        between neural networks and ML programs, see
         `ML Programs <https://coremltools.readme.io/docs/ml-programs>`_.
 
         - The converter produces a neural network (``neuralnetwork``) if:
@@ -226,8 +240,8 @@ def convert(
         - ``'neuralnetwork'``: Returns an MLModel (``coremltools.models.MLModel``)
           containing a NeuralNetwork proto, which is the original Core ML format.
           The model saved from this returned object is executable either on
-          iOS13/macOS10.15/watchOS6/tvOS13 and above, or on
-          iOS14/macOS11/watchOS7/tvOS14 and above, depending on the layers used
+          iOS13/macOS10.15/watchOS6/tvOS13 and newer, or on
+          iOS14/macOS11/watchOS7/tvOS14 and newer, depending on the layers used
           in the model.
         - ``'mlprogram'`` : Returns an MLModel (``coremltools.models.MLModel``)
           containing a MILSpec.Program proto, which is the Core ML program format.
@@ -247,80 +261,83 @@ def convert(
 
     compute_precision : coremltools.precision enumeration or ct.transform.FP16ComputePrecision() (optional)
 
-        Use this argument to control the storage precision of the tensors in the mlprogram.
-
-        Must be one of the following.
-
-        - ``coremltools.precision.FLOAT16`` enum
-            - In this case the following transform is applied to produce a float16 typed program,
-                i.e. a program where all the intermediate float tensors have type float16
-                (for ops that support that type).
-
-              ::
-                 coremltools.transform.FP16ComputePrecision(op_selector=
+        Use this argument to control the storage precision of the tensors in the
+        ML program. Must be one of the following.
+        
+        - ``coremltools.precision.FLOAT16`` enum: The following transform is
+          applied to produce a float 16 program; that is, a program in which all
+          the intermediate float tensors are of type float 16 (for ops that
+          support that type).
+          ::
+              coremltools.transform.FP16ComputePrecision(op_selector=
                                                          lambda op:True)
 
-              The above transform itertes over all the ops. For each op,
-              it looks at its inputs and outputs, and if they are of type float32, ``cast``
-              ops are injected to convert those tensors (aka vars) to float16 type.
+          The above transform iterates through all the ops, looking at each op's
+          inputs and outputs. If they are of type float 32, ``cast``
+          ops are injected to convert those tensors (also known as `vars`) to
+          type float 16.
 
-        - ``coremltools.precision.FLOAT32`` enum
-            - No transform is applied. The original float32 tensor dtype in
-              the source model is preserved. Opt into this option if the default converted model
-              is displaying numerical precision issues.
+        - ``coremltools.precision.FLOAT32`` enum: No transform is applied.
+          
+          The original float32 tensor dtype in the source model is preserved.
+          Opt into this option if the default converted model is displaying
+          numerical precision issues.
 
         - ``coremltools.transform.FP16ComputePrecision(op_selector=...)``
-            - Use this option to control which tensors are cast to float16.
-              Before casting the inputs/outputs of any op from float32 to float16,
-              the op_selector function is invoked on the op object. This function
-              must return a boolean value. By default its set to return True for every op,
-              however, this can be customized.
-            - For example:
-              ::
-                 coremltools.transform.FP16ComputePrecision(op_selector=
+          
+          Use this option to control which tensors are cast to float 16.
+          Before casting the inputs/outputs of any op from float32 to float 16,
+          the op_selector function is invoked on the op object. This function
+          must return a boolean value. By default it returns ``True`` for every op,
+          but you can customize this.
+          
+          For example:
+          ::
+             coremltools.transform.FP16ComputePrecision(op_selector=
                                          lambda op: op.op_type != "linear")
 
-              The above casts all the float32 tensors to be float16, except
-              the input/output tensors to any ``linear`` op. See more examples
-              on this below.
+          The above casts all the float32 tensors to be float 16, except
+          the input/output tensors to any ``linear`` op. See more examples
+          below.
 
-        - ``None``
-            - This is the default.
-            - When ``convert_to="mlprogram"``, compute_precision parameter
+        - ``None``: The default
+            - When ``convert_to="mlprogram"``, the ``compute_precision`` parameter
               defaults to ``coremltools.precision.FLOAT16``.
-            - When ``convert_to="neuralnetwork"``, compute_precision parameter
+            - When ``convert_to="neuralnetwork"``, the ``compute_precision`` parameter
               needs to be ``None`` and has no meaning.
-
-            e.g.: Customize the float16 precision transform to prevent from casting all the "real_div"
-                  ops in the program to float16 precision:
-
-            >>> def skip_real_div_ops(op):
-            >>>     if op.op_type == "real_div":
-            >>>         return False
-            >>>     return True
-            >>>
-            >>> model = ct.convert(source_model,
-            >>>                    compute_precision=ct.transform.FP16ComputePrecision(op_selector=skip_real_div_ops),
-            >>>                    minimum_deployment_target=ct.target.iOS15
-            >>>                    )
+            - For example, you can customize the float 16 precision transform to prevent
+              casting all the ``real_div`` ops in the program to float 16
+              precision:
+              
+              .. sourcecode:: python
+
+                  def skip_real_div_ops(op):
+                       if op.op_type == "real_div":
+                           return False
+                       return True
+                  
+                  model = ct.convert(source_model,
+                                     compute_precision=ct.transform.FP16ComputePrecision(op_selector=skip_real_div_ops),
+                                     minimum_deployment_target=ct.target.iOS15
+                                    )
 
     skip_model_load : bool
-        Set to True to prevent coremltools from calling into the Core ML framework
+        Set to ``True`` to prevent coremltools from calling into the Core ML framework
         to compile and load the model, post-conversion. In that case, the returned
         model object cannot be used to make a prediction, but can be used to save
-        via ``"model.save()"``. This flag may be used to convert to a newer model type
-        on an older Mac, which if done without turning this flag on, may raise a
-        runtime warning.
+        with ``model.save()``. This flag may be used to convert to a newer model type
+        on an older Mac, which may raise a runtime warning if done without
+        turning this flag on.
         
-        Example: Use this flag to suppress runtime warning when converting to
-        ML program model type on a macOS 11, since ML program
-        can only be compiled and loaded from macOS12+.
+        Example: Use this flag to suppress a runtime warning when converting to an
+        ML program model on macOS 11, since an ML program can only be compiled and
+        loaded from macOS12+.
         
-        Defaults to False.
+        Defaults to ``False``.
 
     compute_units: coremltools.ComputeUnit
     
-        An enum with three possible values.
+        An enum with the following possible values.
         
             - ``coremltools.ComputeUnit.ALL``: Use all compute units available, including the
               neural engine.
@@ -331,21 +348,26 @@ def convert(
     package_dir : str
         Post conversion, the model is saved at a temporary location and
         loaded to form the MLModel object ready for prediction.
-        If package_dir is provided, model will be saved at this location instead of creating a temporary directory.
-        - if not None, must be a path to a directory with extension .mlpackage
+        
+        * If ``package_dir`` is provided, the model is saved at this location
+          instead of in a temporary directory.
+        * If not ``None``, this must be a path to a directory with the extension
+          ``.mlpackage``.
 
     debug : bool
-        This flag should generally be False except for debugging purposes
-        Setting this flag to True:
-         - For Torch conversion, it will print the list of supported and unsupported ops
-           found in the model if conversion fails due to an unsupported op.
-         - For Tensorflow conversion, it will cause to display extra logging and visualizations
+        This flag should generally be ``False`` except for debugging purposes.
+        Setting this flag to ``True`` produces the following behavior:
+          - For Torch conversion, it will print the list of supported and
+            unsupported ops found in the model if conversion fails due to an
+            unsupported op.
+          - For TensorFlow conversion, it will display extra logging
+            and visualizations.
 
     Returns
     -------
     
     model : ``coremltools.models.MLModel`` or ``coremltools.converters.mil.Program``
-        A Core ML MLModel object or MIL Program object (see ``convert_to``).
+        A Core ML MLModel object or MIL program object (see ``convert_to``).
 
     Examples
     --------
@@ -363,7 +385,7 @@ def convert(
         >>> results = mlmodel.predict({"input": test_input})
         >>> print(results['output'])
 
-    TensorFlow 2 (``model`` is tf.Keras model path):
+    TensorFlow 2 (``model`` is a tf.Keras model path):
 
         >>> x = tf.keras.Input(shape=(32,), name='input')
         >>> y = tf.keras.layers.Dense(16, activation='softmax')(x)
@@ -388,7 +410,7 @@ def convert(
         >>> results = mlmodel.predict({"input": example_input.numpy()})
         >>> print(results['1651']) # 1651 is the node name given by PyTorch's JIT
 
-    See `neural-network-conversion <https://coremltools.readme.io/docs/neural-network-conversion>`_ for
+    See `Conversion Options <https://coremltools.readme.io/docs/neural-network-conversion>`_ for
     more advanced options.
     """
     _check_deployment_target(minimum_deployment_target)
diff --git a/coremltools/converters/mil/input_types.py b/coremltools/converters/mil/input_types.py
index c26a767af..6e1022454 100644
--- a/coremltools/converters/mil/input_types.py
+++ b/coremltools/converters/mil/input_types.py
@@ -89,7 +89,7 @@ def __init__(
         channel_first=None,
     ):
         """
-        Configuration class used for image inputs in CoreML.
+        Configuration class used for image inputs in Core ML.
 
         Parameters
         ----------
@@ -97,32 +97,31 @@ def __init__(
             The scaling factor for all values in the image channels.
 
         bias: float or list of float
-            If ``color_layout`` is ``'G'``, bias would be a ``float``.
+            * If ``color_layout`` is ``ct.colorlayout.GRAYSCALE`` or
+              ``ct.colorlayout.GRAYSCALE_FLOAT16``, bias would be a ``float``.
+            * If ``color_layout`` is ``ct.colorlayout.RGB`` or ``ct.colorlayout.BGR``,
+              bias would be a list of ``float``.
 
-            If `color_layout` is ``'RGB'`` or ``'BGR'``, bias would be a list of ``float``.
-
-        color_layout: string or of type ct.colorlayout enumeration
-            Color layout of the image.
-
-            Valid values:
-            enumeration (recommended):
-                * ct.colorlayout.RGB
-                * ct.colorlayout.BGR
-                * ct.colorlayout.GRAYSCALE
-                * ct.colorlayout.GRAYSCALE_FLOAT16
+        color_layout: string or enumeration of type ``ct.colorlayout``
+            Color layout of the image. Valid values are as follows:
+            
+            Enumeration (recommended):
+                * ``ct.colorlayout.RGB``
+                * ``ct.colorlayout.BGR``
+                * ``ct.colorlayout.GRAYSCALE``
+                * ``ct.colorlayout.GRAYSCALE_FLOAT16``
 
-            string values (older way to specify):
-                * ``'G'``: Grayscale (maps to ct.colorlayout.GRAYSCALE)
-                * ``'RGB'``: [Red, Green, Blue] (maps to ct.colorlayout.BGR)
-                * ``'BGR'``: [Blue, Green, Red] (maps to ct.colorlayout.RGB)
+            String values (older way to specify):
+                * ``'G'``: Grayscale (maps to ``ct.colorlayout.GRAYSCALE``)
+                * ``'RGB'``: [Red, Green, Blue] (maps to ``ct.colorlayout.RGB``)
+                * ``'BGR'``: [Blue, Green, Red] (maps to ``ct.colorlayout.BGR``)
 
         channel_first: (bool) or None
             Set to ``True`` if input format is channel first.
 
             Default format:
-                For TensorFlow: channel last (``channel_first=False``).
-
-                For PyTorch: channel first (``channel_first=True``).
+                * For TensorFlow: channel last (``channel_first=False``).
+                * For PyTorch: channel first (``channel_first=True``).
         """
         super(ImageType, self).__init__(name, shape)
         self.scale = scale
diff --git a/docs/source/coremltools.models.ml_program.rst b/docs/source/coremltools.models.ml_program.rst
new file mode 100644
index 000000000..bf3deda3f
--- /dev/null
+++ b/docs/source/coremltools.models.ml_program.rst
@@ -0,0 +1,21 @@
+
+affine_quantize_weights
+----------------------------------------------------------------
+
+.. automodule:: coremltools.models.ml_program.compression_utils
+
+   .. autofunction:: affine_quantize_weights
+
+palettize_weights
+----------------------------------------------------------------
+
+.. automodule:: coremltools.models.ml_program.compression_utils
+
+   .. autofunction:: palettize_weights
+
+sparsify_weights
+----------------------------------------------------------------
+
+.. automodule:: coremltools.models.ml_program.compression_utils
+
+   .. autofunction:: sparsify_weights
diff --git a/docs/source/coremltools.models.rst b/docs/source/coremltools.models.rst
index e08c095ed..16a92dc75 100644
--- a/docs/source/coremltools.models.rst
+++ b/docs/source/coremltools.models.rst
@@ -7,11 +7,13 @@ MLModel
 .. automodule:: coremltools.models.model
    :members:
 
-compression\_utils 
----------------------------------------------------
+compression\_utils
+-------------------------------------------------
 
-.. automodule:: coremltools.models.ml_program.compression_utils
-   :members:
+.. toctree::
+   :maxdepth: 1
+   
+   coremltools.models.ml_program
 
 
 array\_feature\_extractor 

From c082d4c8efafe10c0026daaef5adc9d7690cf22c Mon Sep 17 00:00:00 2001
From: Arjun Sharda <77706434+ajsharda17@users.noreply.github.com>
Date: Mon, 13 Jun 2022 12:33:45 -0500
Subject: [PATCH 46/54] Update CONTRIBUTING.md (#1521)

---
 CONTRIBUTING.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 31d5e6911..2ff1ec285 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -2,7 +2,7 @@
 Contribution Guidelines
 =======================
 
-The Core ML `.mlmodel` file format is a publicly documented specification. The Core ML Tools source code is 100% open source under the [BSD license](https://github.com/apple/coremltools/blob/master/LICENSE.txt). We welcome all contributions and ideas to grow the product. We ask that you follow the [contributing guidelines and code of conduct](#guidelines-and-code-of-conduct)  in this document, which are typical of open source communities.
+The Core ML `.mlmodel` file format is a publicly documented specification. The Core ML Tools source code is 100% open source under the [BSD-3 Clause license](https://github.com/apple/coremltools/blob/master/LICENSE.txt). We welcome all contributions and ideas to grow the product. We ask that you follow the [contributing guidelines and code of conduct](#guidelines-and-code-of-conduct), which are typical of open source communities.
 
 There are many ways to contribute to coremltools. [**Use these templates**](https://github.com/apple/coremltools/issues/new/choose) to report issues, make feature requests, or ask questions. We welcome even minor improvements to code, testing, and documentation, as well as requests for new features and enhancements. Don’t hesitate to do the following:
 
@@ -25,7 +25,7 @@ To ensure that issues and pull requests can be addressed quickly, please do the
 
 * Check [open issues](https://github.com/apple/coremltools/issues) and [current pull requests](https://github.com/apple/coremltools/pulls) in the repository to see if your issue, feature request, or question already exists or has already been addressed.
 * Fill in the appropriate [template](https://github.com/apple/coremltools/issues/new/choose) with as much detail as possible as well as code snippets, so that we are able to reproduce the issue.
-* Promptly reply to any requests or questions posed by others within the community on your issue or pull request.
+* Promptly reply to any requests or questions asked by others within the community on your issue or pull request.
 
 ## Resources
 

From 108f5da36cc265a78a59001d2c871b9d1fa47495 Mon Sep 17 00:00:00 2001
From: fukatani <nannyakannya@gmail.com>
Date: Tue, 14 Jun 2022 03:23:15 +0900
Subject: [PATCH 47/54] Add torch AdaptiveAvgPool2d test. (#1502)

Co-authored-by: Toby Roseman <troseman@apple.com>
---
 .../mil/frontend/torch/test/test_torch_ops.py | 34 +++++++++++++++++--
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
index 11535f371..7b68fedf0 100644
--- a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
+++ b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
@@ -1451,7 +1451,7 @@ class TestAdaptiveMaxPool(TorchBaseTest):
     @pytest.mark.parametrize(
         "output_size, magnification, delta, depth, backend",
         itertools.product(
-            [(1,1), (3,2)],
+            [(1, 1), (3, 2)],
             [1, 2, 7],
             [0, 11],
             [1, 2, 3],
@@ -1466,15 +1466,43 @@ def test_adaptive_max_pool2d(
         # since coremltools reproduces PyTorch's kernel sizes and
         # offsets for adaptive pooling layers only when input_size is
         # a multiple of output_size, we expect failures otherwise
-        if not (input_size[0] % output_size[0]  == 0 and input_size[1] % output_size[1] == 0):
+        if not (input_size[0] % output_size[0] == 0 and input_size[1] % output_size[1] == 0):
             pytest.xfail("Test should fail because input_size is not a multiple of output_size")
         n = 1
-        in_shape = (n,depth) + input_size
+        in_shape = (n, depth) + input_size
         model = nn.AdaptiveMaxPool2d(
             output_size
         )
         self.run_compare_torch(in_shape, model, backend=backend)
 
+class TestAdaptiveAvgPool(TorchBaseTest):
+    @pytest.mark.parametrize(
+        "output_size, magnification, delta, depth, backend",
+        itertools.product(
+            [(1, 1), (3, 2)],
+            [1, 2, 7],
+            [0, 11],
+            [1, 2, 3],
+            backends,
+        ),
+    )
+    def test_adaptive_avg_pool2d(
+            self, output_size, magnification, delta, depth, backend
+    ):
+        # input_size = output_size * magnification + delta
+        input_size = (delta + magnification * output_size[0], delta + magnification * output_size[1])
+        # since coremltools reproduces PyTorch's kernel sizes and
+        # offsets for adaptive pooling layers only when input_size is
+        # a multiple of output_size, we expect failures otherwise
+        if not (input_size[0] % output_size[0] == 0 and input_size[1] % output_size[1] == 0):
+            pytest.xfail("Test should fail because input_size is not a multiple of output_size")
+        n = 1
+        in_shape = (n, depth) + input_size
+        model = nn.AdaptiveAvgPool2d(
+            output_size
+        )
+        self.run_compare_torch(in_shape, model, backend=backend)
+
 class TestMaxPool(TorchBaseTest):
 
     @pytest.mark.parametrize(

From 47debd332483f1351a31153b2b017125d9e49b94 Mon Sep 17 00:00:00 2001
From: Arjun Sharda <77706434+ajsharda17@users.noreply.github.com>
Date: Mon, 13 Jun 2022 14:14:36 -0500
Subject: [PATCH 48/54] Update BUILDING.md (#1523)

Co-authored-by: Toby Roseman <troseman@apple.com>
---
 BUILDING.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/BUILDING.md b/BUILDING.md
index f4bd213f2..3c4aeaccf 100644
--- a/BUILDING.md
+++ b/BUILDING.md
@@ -1,7 +1,7 @@
 Building from Source
 ====================
 
-This page describes building Core ML Tools (coremltools) from the source repository.
+This page describes how to build Core ML Tools (coremltools) from the source repository.
 
 ## Requirements
 
@@ -24,7 +24,7 @@ Follow these steps:
 	
 3. Run the [test.sh](scripts/test.sh) script to test the build.
 
-**Under the hood**: If an Anaconda or Miniconda environment doesn't already exists or is not up-to-date, the `build.sh` script automatically runs the [`env_create.sh`](scripts/env_create.sh) script to create the environment. It then uses [`env_activate.sh`](scripts/env_activate.sh) to activate the environment and set up the appropriate version of Python. The new environment is located at `<repo root>/coremltoos/envs` and is named after the `py` parameter. For example, a development environment with py 3.7 is named `coremltools-dev-py37`.
+**Under the hood**: If an Anaconda or Miniconda environment doesn't already exist or is not up-to-date, the `build.sh` script automatically runs the [`env_create.sh`](scripts/env_create.sh) script to create the environment. It then uses [`env_activate.sh`](scripts/env_activate.sh) to activate the environment and set up the appropriate version of Python. The new environment is located at `<repo root>/coremltools/envs` and is named after the `py` parameter. For example, a development environment with py 3.7 is named `coremltools-dev-py37`.
 
 
 ## Build targets
@@ -53,8 +53,8 @@ For more information, see the following:
 
 * Core ML Tools [README](README.md) file for this repository
 * [Release Notes](https://github.com/apple/coremltools/releases/) for the current release and previous releases
-* [Guides and examples](https://coremltools.readme.io/) with installation and troubleshooting
+* [Guides and examples](https://coremltools.readme.io/) with installation and troubleshooting help
 * [API Reference](https://apple.github.io/coremltools/index.html)
 * [Core ML Specification](https://apple.github.io/coremltools/mlmodel/index.html)
-* [Contribution Guidelines](CONTRIBUTING.md) for reporting issues and making requests
+* [Contribution Guidelines](CONTRIBUTING.md) for reporting issues and making pull requests
 

From e1aaf57c3631765a2e846c9d9b91da8c1af74331 Mon Sep 17 00:00:00 2001
From: Arjun Sharda <77706434+ajsharda17@users.noreply.github.com>
Date: Mon, 13 Jun 2022 14:44:53 -0500
Subject: [PATCH 49/54] Update ---feature-request.md (change of wording mostly)
 (#1524)

* Update ---feature-request.md

* Update ---feature-request.md
---
 .github/ISSUE_TEMPLATE/---feature-request.md | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/---feature-request.md b/.github/ISSUE_TEMPLATE/---feature-request.md
index ba27df4b4..4db066cc7 100644
--- a/.github/ISSUE_TEMPLATE/---feature-request.md
+++ b/.github/ISSUE_TEMPLATE/---feature-request.md
@@ -8,16 +8,15 @@ assignees: ''
 ---
 
 ## 🌱 Describe your Feature Request
-- A clear and concise description of the feature request.
+- Please provide a clear and concise description of your feature request.
 - If this is a feature request for the Core ML Framework or Xcode, please instead submit your feature request using the Feedback Assistant for Developers:
 https://developer.apple.com/bug-reporting/
 
-## Use cases
-- Please describe the use cases.
-- Please provide examples.
+## How can this feature be used?
+Please provide some examples where this feature can be used.
 
 ## Describe alternatives you've considered
-A clear and concise description of any alternative solutions or features you've considered.
+Tell us some alternatives that you have considered instead of this feature.
 
 ## Additional context
-Add any other context or screenshots about the feature request here.
+Do you have anything else to say?

From 1f29b6a00761e27951aef210c5863c841592e5b9 Mon Sep 17 00:00:00 2001
From: fukatani <nannyakannya@gmail.com>
Date: Fri, 17 Jun 2022 03:39:41 +0900
Subject: [PATCH 50/54] Torch eq and ne ops supports bool type. (#1501)

* Torch eq and ne ops supports bool type.

* Addressed review comment
---
 .../converters/mil/frontend/torch/ops.py      | 16 +++++++++--
 .../mil/frontend/torch/test/test_torch_ops.py | 28 +++++++++++++++++++
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py
index 7054e5f27..64132b460 100644
--- a/coremltools/converters/mil/frontend/torch/ops.py
+++ b/coremltools/converters/mil/frontend/torch/ops.py
@@ -472,14 +472,26 @@ def listconstruct(context, node):
 @register_torch_op
 def eq(context, node):
     inputs = _get_inputs(context, node, expected=2)
-    equal_to = mb.equal(x=inputs[0], y=inputs[1], name=node.name)
+    x = inputs[0]
+    y = inputs[1]
+    if is_bool(x.dtype):
+        x = mb.cast(x=x, dtype='int32')
+    if is_bool(y.dtype):
+        y = mb.cast(x=y, dtype='int32')
+    equal_to = mb.equal(x=x, y=y, name=node.name)
     context.add(equal_to)
 
 
 @register_torch_op
 def ne(context, node):
     inputs = _get_inputs(context, node, expected=2)
-    equal_to = mb.not_equal(x=inputs[0], y=inputs[1], name=node.name)
+    x = inputs[0]
+    y = inputs[1]
+    if is_bool(x.dtype):
+        x = mb.cast(x=x, dtype='int32')
+    if is_bool(y.dtype):
+        y = mb.cast(x=y, dtype='int32')
+    equal_to = mb.not_equal(x=x, y=y, name=node.name)
     context.add(equal_to)
 
 
diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
index 7b68fedf0..95e6690b6 100644
--- a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
+++ b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
@@ -3370,6 +3370,34 @@ def forward(self, x):
         )
 
 
+class TestBitWiseLogical(TorchBaseTest):
+    @pytest.mark.parametrize(
+        "backend, x_y, op_string",
+        itertools.product(
+            backends,
+            [
+                ([True, False, True, False], [True, True, False, False]),
+                ([[True, False], [True, False]], [[True, True], [False, False]]),
+                ([[True, False], [True, False]], [[1, 0], [2, 1]]),
+                ([-1.5, 0.0, 1.0, 0.0], [0.1, 2.5, 0.0, 0.0]),
+                ([2, 0, -1, 0, 5], [1, 1, 0, 0, -5]),
+            ],
+            [
+                "eq",
+                "ne",
+            ],
+        ),
+    )
+    def test_bitwise_logical(self, backend, x_y, op_string):
+        if not contains_op(torch, op_string):
+            return
+        op_func = getattr(torch, op_string)
+        model = ModuleWrapper(function=op_func)
+        x = torch.tensor(x_y[0])
+        y = torch.tensor(x_y[1])
+        self.run_compare_torch([x, y], model, backend=backend, input_as_shape=False)
+
+
 class TestLogicalAnd(TorchBaseTest):
     @pytest.mark.parametrize(
         "backend, x_y",

From f25a68415b9f075b7c18e44708c696c339526cd2 Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Wed, 6 Jul 2022 14:11:56 +0200
Subject: [PATCH 51/54] Add tests for numel and narrow

---
 .../mil/frontend/torch/test/test_torch_ops.py | 69 +++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
index 95e6690b6..b89489414 100644
--- a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
+++ b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
@@ -4564,3 +4564,72 @@ def forward(self, x):
             backend=backend,
             converter_input_type=converter_input_type,
         )
+
+# class TestRoiAlign(TorchBaseTest):
+#     @pytest.mark.parametrize(
+#         "shapes, backend",
+#         itertools.product(
+#             [(1,), (1, 2)],
+#             backends
+#         ),
+#     )
+#     def test_roi_align(self, shapes, backend):
+#         class TestModel(nn.Module):
+#             def __init__(self):
+#                 super(TestModel, self).__init__()
+
+#             def forward(self, a):
+#                 return torch.roi_align
+#         self.run_compare_torch(shapes, TestModel().eval(), backend=backend)
+
+
+class TestNumel(TorchBaseTest):
+    @pytest.mark.parametrize(
+        "shapes, backend",
+        itertools.product(
+            [
+                [(2,1)],
+                [(5,1,4,1)],
+                [(1,)],
+            ],
+            backends
+        ),
+    )
+    def test_numel(self, shapes, backend):
+        class Model(nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                v = torch.numel(x)
+                return torch.tensor(v)
+
+        model = Model()
+        self.run_compare_torch(shapes, model, backend=backend)
+
+
+class TestNarrow(TorchBaseTest):
+    @pytest.mark.parametrize(
+        "shapes, dim_start_length, backend",
+        itertools.product(
+            [
+                [(3, 3)],
+            ],
+            [
+                (0, 0, 2)
+            ]
+            ,
+            backends
+        ),
+    )
+    def test_narrow(self, shapes, dim_start_length, backend):
+        dim, start, length = dim_start_length
+        class Model(nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                return torch.narrow(x, dim, start, length)
+
+        model = Model()
+        self.run_compare_torch(shapes, model, backend=backend)

From f2f795b1301fe503d3e31bd71a441abb7403d4a9 Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Wed, 6 Jul 2022 20:09:30 +0200
Subject: [PATCH 52/54] Add tests for torch.op.nms

---
 .../mil/frontend/torch/test/test_torch_ops.py | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
index b89489414..ebead3ef9 100644
--- a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
+++ b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
@@ -10,6 +10,8 @@
 import pytest
 import torch.nn as nn
 
+import torchvision
+
 from .testing_utils import (
     contains_op,
     generate_input_data,
@@ -4633,3 +4635,32 @@ def forward(self, x):
 
         model = Model()
         self.run_compare_torch(shapes, model, backend=backend)
+
+
+class TestNonMaximalSuppression(TorchBaseTest):
+    @pytest.mark.parametrize(
+        "shapes, scores, iou_threshold, backend",
+        itertools.product(
+            [
+                [(2, 4)],
+            ],
+            [
+                (2,)
+            ],
+            [1]
+            ,
+            backends
+        ),
+    )
+    def test_non_maximal_supression(self, shapes, scores, iou_threshold, backend):
+        # scores = torch.rand((2,))
+        scores = torch.rand(scores)
+        class Model(nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                return torchvision.ops.nms(x, scores, iou_threshold=0.7)
+
+        model = Model()
+        self.run_compare_torch(shapes, model, backend=backend)

From 9be029f970ac41e7a20a6d0f738dae0aa603c32a Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Wed, 6 Jul 2022 20:13:07 +0200
Subject: [PATCH 53/54] tidy up

---
 .../mil/frontend/torch/test/test_torch_ops.py     | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
index ebead3ef9..0ed026166 100644
--- a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
+++ b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
@@ -4639,21 +4639,14 @@ def forward(self, x):
 
 class TestNonMaximalSuppression(TorchBaseTest):
     @pytest.mark.parametrize(
-        "shapes, scores, iou_threshold, backend",
+        "shapes, scores, backend",
         itertools.product(
-            [
-                [(2, 4)],
-            ],
-            [
-                (2,)
-            ],
-            [1]
-            ,
+            [[(2, 4)]],
+            [(2,)],
             backends
         ),
     )
-    def test_non_maximal_supression(self, shapes, scores, iou_threshold, backend):
-        # scores = torch.rand((2,))
+    def test_non_maximal_supression(self, shapes, scores, backend):
         scores = torch.rand(scores)
         class Model(nn.Module):
             def __init__(self):

From 9e842a2389f8f59dcc0f2ee28391d9b415c3ecfc Mon Sep 17 00:00:00 2001
From: Duncan Buck <dncnbuck@gmail.com>
Date: Sat, 3 Sep 2022 10:44:07 +0200
Subject: [PATCH 54/54] Remove commented-out RoiAlign test; fix tuple spacing

---
 .../mil/frontend/torch/test/test_torch_ops.py | 22 ++-----------------
 1 file changed, 2 insertions(+), 20 deletions(-)

diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
index 0ed026166..8ebb78982 100644
--- a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
+++ b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py
@@ -4567,31 +4567,13 @@ def forward(self, x):
             converter_input_type=converter_input_type,
         )
 
-# class TestRoiAlign(TorchBaseTest):
-#     @pytest.mark.parametrize(
-#         "shapes, backend",
-#         itertools.product(
-#             [(1,), (1, 2)],
-#             backends
-#         ),
-#     )
-#     def test_roi_align(self, shapes, backend):
-#         class TestModel(nn.Module):
-#             def __init__(self):
-#                 super(TestModel, self).__init__()
-
-#             def forward(self, a):
-#                 return torch.roi_align
-#         self.run_compare_torch(shapes, TestModel().eval(), backend=backend)
-
-
 class TestNumel(TorchBaseTest):
     @pytest.mark.parametrize(
         "shapes, backend",
         itertools.product(
             [
-                [(2,1)],
-                [(5,1,4,1)],
+                [(2, 1)],
+                [(5, 1, 4, 1)],
                 [(1,)],
             ],
             backends